limits-of-tom/compare_tom_cpa.ipynb
2024-06-11 15:36:55 +02:00

230 KiB

In [2]:
import pickle 
import matplotlib.pyplot as plt
from sklearn.metrics import f1_score
import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import pearsonr
import numpy as np
In [3]:
# task status
tom6_fname = "XXX"
# task knowledge
tom7_fname = "XXX"
# task intention
tom8_fname = "XXX"
# own plan
exp2_5_fname = "XXX"
exp2_6_fname = "XXX"
exp2_0_fname = "XXX"
# partner plan 
exp3_7_fname = "XXX"
exp3_0_fname = "XXX"
In [4]:
with open(tom6_fname, 'rb') as f: 
    tom6 = pickle.load(f)
with open(tom7_fname, 'rb') as f: 
    tom7 = pickle.load(f)
with open(tom8_fname, 'rb') as f: 
    tom8 = pickle.load(f)
In [5]:
with open(exp2_5_fname, 'rb') as f:
    exp2_5 = pickle.load(f)
with open(exp2_6_fname, 'rb') as f:
    exp2_6 = pickle.load(f)
with open(exp2_0_fname, 'rb') as f:
    exp2_0 = pickle.load(f)
with open(exp3_0_fname, 'rb') as f:
    exp3_0 = pickle.load(f)
with open(exp3_7_fname, 'rb') as f:
    exp3_7 = pickle.load(f)
In [6]:
tom6 = [item[0] for item in tom6 if item and item[0]]
tom7 = [item[0] for item in tom7 if item and item[0]]
tom8 = [item[0] for item in tom8 if item and item[0]]
In [7]:
# compute per-game f1 scores
tom6_f1 = []
tom6_game_paths = []
for game in tom6: 
    pred = game[0]
    gt = game[1]
    game_path = game[2]
    f1 = f1_score(gt, pred, average="weighted")
    tom6_f1.append(f1)
    tom6_game_paths.append(game_path)

tom7_f1 = []
tom7_game_paths = []
for game in tom7: 
    pred = game[0]
    gt = game[1]
    game_path = game[2]
    f1 = f1_score(gt, pred, average="weighted")
    tom7_f1.append(f1)
    tom7_game_paths.append(game_path)

tom8_f1 = []
tom8_game_paths = []
for game in tom8: 
    pred = game[0]
    gt = game[1]
    game_path = game[2]
    f1 = f1_score(gt, pred, average="weighted")
    tom8_f1.append(f1)
    tom8_game_paths.append(game_path)
In [8]:
tom6_game_paths == tom7_game_paths == tom8_game_paths
Out[8]:
True
In [9]:
game_ids = [path.split("/")[-1] for path in tom6_game_paths]
game_ids[0]
Out[9]:
'141_212_108_99_20210325_121618'
In [11]:
exp2_5_f1 = [item[-1] for item in exp2_5]
exp2_6_f1 = [item[-1] for item in exp2_6]
exp2_0_f1 = [item[-1] for item in exp2_0]
exp3_0_f1 = [item[-1] for item in exp3_0]
exp3_7_f1 = [item[-1] for item in exp3_7]
exp2_game_ids = [item[-3].split("/")[-1] for item in exp2_5]
In [12]:
exp2_f1_diff_5_0 = [a - b for a, b in zip(exp2_5_f1, exp2_0_f1)]
exp2_f1_diff_6_0 = [a - b for a, b in zip(exp2_6_f1, exp2_0_f1)]
exp3_f1_diff_7_0 = [a - b for a, b in zip(exp3_7_f1, exp3_0_f1)]
In [13]:
len(exp2_game_ids), len(game_ids)
Out[13]:
(60, 64)
In [14]:
common_elements = set(exp2_game_ids) & set(game_ids)
exp2_common_game_ids = []
exp2_common_f1_diff_5_0 = []
exp2_common_f1_diff_6_0 = []
exp3_common_f1_diff_7_0 = []
for i in range(len(exp2_game_ids)):
    if exp2_game_ids[i] in common_elements:
        exp2_common_game_ids.append(exp2_game_ids[i])
        exp2_common_f1_diff_5_0.append(exp2_f1_diff_5_0[i])
        exp2_common_f1_diff_6_0.append(exp2_f1_diff_6_0[i])
        exp3_common_f1_diff_7_0.append(exp3_f1_diff_7_0[i])

common_game_ids = []
tom6_common_f1 = []
tom7_common_f1 = []
tom8_common_f1 = []
for i in range(len(game_ids)):
    if game_ids[i] in common_elements:
        common_game_ids.append(game_ids[i])
        tom6_common_f1.append(tom6_f1[i])
        tom7_common_f1.append(tom7_f1[i])
        tom8_common_f1.append(tom8_f1[i])
In [15]:
exp2_common_game_ids == common_game_ids
Out[15]:
True

Own missing knowledge (exp2)

In [16]:
tom6correlation_coefficient, tom6p_value = pearsonr(tom6_common_f1, exp2_common_f1_diff_5_0)
tom7correlation_coefficient, tom7p_value = pearsonr(tom7_common_f1, exp2_common_f1_diff_6_0)
tom8correlation_coefficient, tom8p_value = pearsonr(tom8_common_f1, exp2_common_f1_diff_6_0)
In [17]:
print(tom6correlation_coefficient, tom6p_value)
print(tom7correlation_coefficient, tom7p_value)
print(tom8correlation_coefficient, tom8p_value)
0.21866640044266308 0.09913002973227936
0.08225606020725357 0.5393156230207309
0.15718692888976768 0.23864397213593697
In [18]:
fontsize = 19

# Create subplots
fig, axes = plt.subplots(1, 3, figsize=(19, 5.1))

# Plot for Task Intention
axes[2].scatter(tom8_common_f1, exp2_common_f1_diff_6_0)
axes[2].set_title(f'$r={tom8correlation_coefficient:.2f}$, $p={tom8p_value:.2f}$', fontsize=fontsize)
axes[2].plot(np.unique(tom8_common_f1), np.poly1d(np.polyfit(tom8_common_f1, exp2_common_f1_diff_6_0, 1))(np.unique(tom8_common_f1)), color='red', linestyle='--')
axes[2].set_xlabel('F1(Task Intention)', fontsize=fontsize)
#axes[2].set_ylabel('F1(OMK w ToM) - F1(OMK w/ ToM)', fontsize=fontsize)
#axes[2].legend(fontsize=fontsize, loc='upper center', bbox_to_anchor=(0.5, 1.17))
axes[2].grid(True, linestyle='--')
axes[2].tick_params(axis='both', labelsize=fontsize)

# Plot for Task Status
axes[0].scatter(tom6_common_f1, exp2_common_f1_diff_5_0, c='purple')
axes[0].set_title(f'$r={tom6correlation_coefficient:.2f}$, $p={tom6p_value:.2f}$', fontsize=fontsize)
axes[0].plot(np.unique(tom6_common_f1), np.poly1d(np.polyfit(tom6_common_f1, exp2_common_f1_diff_5_0, 1))(np.unique(tom6_common_f1)), color='red', linestyle='--')
axes[0].set_xlabel('F1(Task Status)', fontsize=fontsize)
axes[0].set_ylabel('F1(OMK w ToM) $-$ F1(OMK w/ ToM)', fontsize=fontsize)
#axes[0].legend(fontsize=fontsize, loc='upper center', bbox_to_anchor=(0.5, 1.17))
axes[0].grid(True, linestyle='--')
axes[0].tick_params(axis='both', labelsize=fontsize)

# Plot for Task Knowledge
axes[1].scatter(tom7_common_f1, exp2_common_f1_diff_6_0, c='green')
axes[1].set_title(f'$r={tom7correlation_coefficient:.2f}$, $p={tom7p_value:.2f}$', fontsize=fontsize)
axes[1].plot(np.unique(tom7_common_f1), np.poly1d(np.polyfit(tom7_common_f1, exp2_common_f1_diff_6_0, 1))(np.unique(tom7_common_f1)), color='red', linestyle='--')
axes[1].set_xlabel('F1(Task Knowledge)', fontsize=fontsize)
#axes[1].set_ylabel('F1(OMK w ToM) - F1(OMK w/ ToM)', fontsize=fontsize)
#axes[1].legend(fontsize=fontsize, loc='upper center', bbox_to_anchor=(0.5, 1.25))
axes[1].grid(True, linestyle='--')
axes[1].tick_params(axis='both', labelsize=fontsize)

plt.tight_layout()

plt.savefig('correlation_tom_feats_exp2.pdf', bbox_inches='tight')

plt.show()

Partner's missing knowledge (exp3)

In [19]:
tom6correlation_coefficient, tom6p_value = pearsonr(tom6_common_f1, exp3_common_f1_diff_7_0)
tom7correlation_coefficient, tom7p_value = pearsonr(tom7_common_f1, exp3_common_f1_diff_7_0)
tom8correlation_coefficient, tom8p_value = pearsonr(tom8_common_f1, exp3_common_f1_diff_7_0)
In [20]:
print(tom6correlation_coefficient, tom6p_value)
print(tom7correlation_coefficient, tom7p_value)
print(tom8correlation_coefficient, tom8p_value)
0.23820019887745394 0.07176536455128052
-0.0691280537719391 0.6061193964113574
0.10181632005409125 0.4469499039738408
In [21]:
fontsize = 19
t_ylim = 0.4
b_ylim = -0.12

# Create subplots
fig, axes = plt.subplots(1, 3, figsize=(19, 5.1))

# Plot for Task Intention
axes[2].scatter(tom8_common_f1, exp3_common_f1_diff_7_0)
axes[2].set_title(f'$r={tom8correlation_coefficient:.2f}$, $p={tom8p_value:.2f}$', fontsize=fontsize)
axes[2].plot(np.unique(tom8_common_f1), np.poly1d(np.polyfit(tom8_common_f1, exp3_common_f1_diff_7_0, 1))(np.unique(tom8_common_f1)), color='red', linestyle='--')
axes[2].set_xlabel('F1(Task Intention)', fontsize=fontsize)
axes[2].set_ylim(b_ylim, t_ylim)
#axes[2].set_ylabel('F1(OMK w ToM) - F1(OMK w/ ToM)', fontsize=fontsize)
#axes[2].legend(fontsize=fontsize, loc='upper center', bbox_to_anchor=(0.5, 1.17))
axes[2].grid(True, linestyle='--')
axes[2].tick_params(axis='both', labelsize=fontsize)

# Plot for Task Status
axes[0].scatter(tom6_common_f1, exp3_common_f1_diff_7_0, c='purple')
axes[0].set_title(f'$r={tom6correlation_coefficient:.2f}$, $p={tom6p_value:.2f}$', fontsize=fontsize)
axes[0].plot(np.unique(tom6_common_f1), np.poly1d(np.polyfit(tom6_common_f1, exp3_common_f1_diff_7_0, 1))(np.unique(tom6_common_f1)), color='red', linestyle='--')
axes[0].set_xlabel('F1(Task Status)', fontsize=fontsize)
axes[0].set_ylabel('F1(PMK w ToM) $-$ F1(PMK w/ ToM)', fontsize=fontsize)
axes[0].set_ylim(b_ylim, t_ylim)
#axes[0].legend(fontsize=fontsize, loc='upper center', bbox_to_anchor=(0.5, 1.17))
axes[0].grid(True, linestyle='--')
axes[0].tick_params(axis='both', labelsize=fontsize)

# Plot for Task Knowledge
axes[1].scatter(tom7_common_f1, exp3_common_f1_diff_7_0, c='green')
axes[1].set_title(f'$r={tom7correlation_coefficient:.2f}$, $p={tom7p_value:.2f}$', fontsize=fontsize)
axes[1].plot(np.unique(tom7_common_f1), np.poly1d(np.polyfit(tom7_common_f1, exp3_common_f1_diff_7_0, 1))(np.unique(tom7_common_f1)), color='red', linestyle='--')
axes[1].set_xlabel('F1(Task Knowledge)', fontsize=fontsize)
axes[1].set_ylim(b_ylim, t_ylim)
#axes[1].set_ylabel('F1(OMK w ToM) - F1(OMK w/ ToM)', fontsize=fontsize)
#axes[1].legend(fontsize=fontsize, loc='upper center', bbox_to_anchor=(0.5, 1.25))
axes[1].grid(True, linestyle='--')
axes[1].tick_params(axis='both', labelsize=fontsize)

plt.tight_layout()

plt.savefig('correlation_tom_feats_exp3.pdf', bbox_inches='tight')

plt.show()