# Compares classifier predictions across recording contexts (e.g. way vs. shop,
# first vs. second half) and writes their average correlations per personality
# trait to figures/table1-5.csv.
import os
import sys

import numpy as np
import pandas as pd

from config import conf
from config import names as gs
# Ground-truth binned personality labels: one row per participant, one column
# per trait (the first CSV column is skipped).
truth = np.genfromtxt(
    conf.binned_personality_file,
    delimiter=',',
    skip_header=1,
    usecols=range(1, conf.n_traits + 1),
)
# All comparisons to perform. Each entry maps a comparison name to:
#   [annotation for task A, annotation for task B, subtask name A, subtask name B]
# The two annotation values determine whether classifiers trained on all data
# or on specific subsets only will be examined; the subtask names select the
# two tasks to compare.
comparisons = {
    'split halves': [conf.annotation_all, conf.annotation_all, 'first half', 'second half'],
    'two ways': [conf.annotation_ways, conf.annotation_ways, 'way there', 'way back'],
    'way vs shop in general classifier': [conf.annotation_all, conf.annotation_all, 'both ways', 'shop'],
    'way vs shop in specialised classifier': [conf.annotation_ways, conf.annotation_shop, 'both ways', 'shop'],
    'way in specialised classifier vs way in general classifier': [conf.annotation_ways, conf.annotation_all, 'both ways', 'both ways'],
    'shop in specialised classifier vs shop in general classifier': [conf.annotation_shop, conf.annotation_all, 'shop', 'shop'],
}
def get_majority_vote(predictions):
    """Return the most frequent value in predictions.

    Ties are broken in favour of the smallest value, because np.unique
    returns values in sorted order and argmax picks the first maximum.
    Returns -1 for empty input.
    """
    if len(predictions) == 0:
        return -1
    values, counts = np.unique(predictions, return_counts=True)
    return values[counts.argmax()]
def get_average_correlation(predA, predB, m_iter):
    """Average correlation between two sets of predictions over m_iter iterations.

    Per-iteration correlations are averaged via the Fisher z-transform
    (arctanh, mean, tanh). Predictions <= 0 are treated as missing and
    excluded from the correlation; iterations whose predictions sum to 0
    (i.e. no data) are skipped entirely.

    :param predA: predictions for task A, n_participants x m_iter
    :param predB: predictions for task B, n_participants x m_iter, or a
                  single vector of length n_participants that is compared
                  against every iteration of predA
    :param m_iter: number of iterations (columns of predA) to consider
    :return: average correlation, or nan if no iteration had usable data
    """
    correlations = []
    for si in range(m_iter):
        A = predA[:, si]
        B = predB if predB.ndim == 1 else predB[:, si]

        # skip iterations for which no predictions are available at all
        if np.sum(A) <= 0 or (predB.ndim > 1 and np.sum(B) <= 0):
            continue

        # keep only valid (positive) predictions; a 1-D predB is assumed
        # fully valid and is only filtered by A's validity, matching how
        # it is paired against each iteration of predA
        consider = (A > 0) if predB.ndim == 1 else (A > 0) & (B > 0)
        A = A[consider]
        B = B[consider]

        correlations.append(np.corrcoef(np.array([A, B]))[0][1])

    if not correlations:
        # np.mean([]) would return nan with a RuntimeWarning; be explicit
        return float('nan')
    return np.tanh(np.mean(np.arctanh(np.array(correlations))))
if __name__ == "__main__":
    # check if the output target folder already exists and create it if not
    if not os.path.exists(conf.figure_folder):
        os.mkdir(conf.figure_folder)

    # collect masks for each participant, annotation (all data, shop, way),
    # window size and subset in question (e.g. first half, or way to the shop).
    # Each mask is True for samples of a particular participant and subset;
    # False for all others.
    window_masks = []
    for wsi in range(len(conf.all_window_sizes)):
        x_file, y_file, id_file = conf.get_merged_feature_files(conf.all_window_sizes[wsi])

        for annotation_value in conf.annotation_values:
            ids_ws = np.genfromtxt(id_file, delimiter=',', skip_header=1).astype(int)

            # restrict the id table to the time windows of this annotation
            if annotation_value == conf.annotation_shop:
                ids_ws = ids_ws[ids_ws[:, 1] == conf.time_window_annotation_shop, :]
            elif annotation_value == conf.annotation_ways:
                ids_ws = ids_ws[(ids_ws[:, 1] == conf.time_window_annotation_wayI) | (ids_ws[:, 1] == conf.time_window_annotation_wayII), :]

            for p in range(conf.n_participants):
                ids_ws_p = ids_ws[(ids_ws[:, 0] == p), :]

                window_masks.append([annotation_value, p, wsi, 'first half', ids_ws_p[:, 2] == conf.time_window_annotation_halfI])
                window_masks.append([annotation_value, p, wsi, 'second half', ids_ws_p[:, 2] == conf.time_window_annotation_halfII])

                window_masks.append([annotation_value, p, wsi, 'way there', ids_ws_p[:, 1] == conf.time_window_annotation_wayI])
                window_masks.append([annotation_value, p, wsi, 'way back', ids_ws_p[:, 1] == conf.time_window_annotation_wayII])

                window_masks.append([annotation_value, p, wsi, 'shop', ids_ws_p[:, 1] == conf.time_window_annotation_shop])
                window_masks.append([annotation_value, p, wsi, 'both ways', np.logical_or(ids_ws_p[:, 1] == conf.time_window_annotation_wayI, ids_ws_p[:, 1] == conf.time_window_annotation_wayII)])

    window_masks_df = pd.DataFrame(window_masks, columns=['annotation', 'participant', 'window size index', 'subtask', 'mask'])

    def lookup_mask(annotation, participant, window_index, subtask):
        # Return the boolean sample mask for one (annotation, participant,
        # window size index, subtask) combination.
        rows = window_masks_df[(window_masks_df.annotation == annotation) &
                               (window_masks_df.participant == participant) &
                               (window_masks_df['window size index'] == window_index) &
                               (window_masks_df.subtask == subtask)]
        # .as_matrix() was removed from pandas; .iloc yields the same cell
        return rows['mask'].iloc[0]

    # collect predictions for each participant and each setting that is
    # interesting for one of the comparisons.
    # Results are directly written into figures/table1-5.csv
    with open(os.path.join(conf.figure_folder, 'table1-5.csv'), 'w') as f:
        # header row: one column per personality trait
        f.write('comparison')
        for trait in range(conf.n_traits):
            f.write(',' + conf.medium_traitlabels[trait])
        f.write('\n')

        for comp_title, (annotation_value_I, annotation_value_II, subtaskI, subtaskII) in comparisons.items():
            f.write(comp_title)
            result_filename = conf.result_folder + '/predictions_' + comp_title.replace(' ', '_') + '.npz'
            if not os.path.exists(result_filename):
                print('computing data for ' + comp_title)
                print('Note that this might take a while - if the script is run again, intermediate results will be available and speed up all computations.')

                predictions_I = np.zeros((conf.n_participants, conf.n_traits, conf.max_n_iter), dtype=int)
                predictions_II = np.zeros((conf.n_participants, conf.n_traits, conf.max_n_iter), dtype=int)

                for trait in range(conf.n_traits):
                    for si in range(conf.max_n_iter):
                        filenameI = conf.get_result_filename(annotation_value_I, trait, False, si, add_suffix=True)
                        filenameII = conf.get_result_filename(annotation_value_II, trait, False, si, add_suffix=True)

                        # both per-iteration result files must exist; abort otherwise
                        if not (os.path.exists(filenameI) and os.path.exists(filenameII)):
                            print('did not find ' + filenameI + ' or ' + filenameII)
                            sys.exit(1)

                        dataI = np.load(filenameI)
                        detailed_predictions_I = dataI['detailed_predictions']
                        chosen_window_indices_I = dataI['chosen_window_indices']

                        dataII = np.load(filenameII)
                        detailed_predictions_II = dataII['detailed_predictions']
                        chosen_window_indices_II = dataII['chosen_window_indices']

                        for p, window_index_I, window_index_II, local_detailed_preds_I, local_detailed_preds_II in zip(range(conf.n_participants), chosen_window_indices_I, chosen_window_indices_II, detailed_predictions_I, detailed_predictions_II):
                            maskI = lookup_mask(annotation_value_I, p, window_index_I, subtaskI)
                            maskII = lookup_mask(annotation_value_II, p, window_index_II, subtaskII)

                            # majority vote over the samples belonging to each subtask
                            predictions_I[p, trait, si] = get_majority_vote(np.array(local_detailed_preds_I)[maskI])
                            predictions_II[p, trait, si] = get_majority_vote(np.array(local_detailed_preds_II)[maskII])

                np.savez(result_filename, predictions_I=predictions_I, predictions_II=predictions_II)
            else:
                data = np.load(result_filename)
                predictions_I = data['predictions_I']
                predictions_II = data['predictions_II']

            # predictions_I are predictions from one context, predictions_II from
            # the other; compute their average correlation and write it to file.
            # NOTE(review): m_iter is hard-coded to 100 here; presumably
            # conf.max_n_iter >= 100 — confirm against config.
            for t in range(conf.n_traits):
                corrI = get_average_correlation(predictions_I[:, t, :], predictions_II[:, t, :], 100)
                f.write(',' + '%.2f' % corrI)
            f.write('\n')