import numpy as np from config import conf import os, sys from config import names as gs import pandas as pd truth = np.genfromtxt(conf.binned_personality_file, skip_header=1, usecols=xrange(1, conf.n_traits+1), delimiter=',') # all comparisons to perform. Each has # a name, # two annotation values that determine if classifiers trained on all data or on specific subsets only will be examined; # names for both tasks to compare comparisons = dict({'split halves': [conf.annotation_all, conf.annotation_all, 'first half', 'second half'], 'two ways': [conf.annotation_ways, conf.annotation_ways, 'way there', 'way back'], 'way vs shop in general classifier': [conf.annotation_all, conf.annotation_all, 'both ways' ,'shop'], 'way vs shop in specialised classifier': [conf.annotation_ways, conf.annotation_shop, 'both ways', 'shop'], 'way in specialised classifier vs way in general classifier': [conf.annotation_ways, conf.annotation_all, 'both ways', 'both ways'], 'shop in specialised classifier vs shop in general classifier': [conf.annotation_shop, conf.annotation_all, 'shop', 'shop'] }) def get_majority_vote(predictions): if len(predictions) == 0: return -1 (values, counts) = np.unique(predictions, return_counts=True) ind = np.argmax(counts) return values[ind] def get_average_correlation(predA, predB, m_iter): """ :param predA: predictions for task A, n_participants x m_iter :param predB: predictions for task B, n_participants x m_iter :return: """ correlations = [] for si in xrange(0, m_iter): if predB.ndim == 1: if np.sum(predA[:,si]) > 0: A = predA[:,si] B = predB consider = (A>0) A = A[consider] B = B[consider] else: continue else: if np.sum(predA[:,si]) > 0 and (np.sum(predB[:,si]) > 0): A = predA[:,si] B = predB[:,si] consider = (A>0) & (B>0) A = A[consider] B = B[consider] else: continue correlation = np.corrcoef(np.array([A, B]))[0][1] correlations.append(correlation) avg = np.tanh(np.mean(np.arctanh(np.array(correlations)))) return avg if __name__ == "__main__": # check if the output target folder already exists and create if not if not os.path.exists(conf.figure_folder): os.mkdir(conf.figure_folder) # collect masks for each participant, annotation (all data, shop, way), window size and subset in question (e.g. first half, or way to the shop) # each mask is True for samples of a particular participant and subset; False for all others window_masks = [] for wsi in xrange(0, len(conf.all_window_sizes)): x_file, y_file, id_file = conf.get_merged_feature_files(conf.all_window_sizes[wsi]) for annotation_value in conf.annotation_values: ids_ws = np.genfromtxt(id_file, delimiter=',', skip_header=1).astype(int) if annotation_value == conf.annotation_shop: ids_ws = ids_ws[ids_ws[:, 1] == conf.time_window_annotation_shop, :] elif annotation_value == conf.annotation_ways: ids_ws = ids_ws[(ids_ws[:, 1] == conf.time_window_annotation_wayI) | (ids_ws[:, 1] == conf.time_window_annotation_wayII), :] for p in xrange(0, conf.n_participants): ids_ws_p = ids_ws[(ids_ws[:, 0] == p), :] window_masks.append([annotation_value, p, wsi, 'first half', ids_ws_p[:, 2] == conf.time_window_annotation_halfI]) window_masks.append([annotation_value, p, wsi, 'second half', ids_ws_p[:, 2] == conf.time_window_annotation_halfII]) window_masks.append([annotation_value, p, wsi, 'way there', ids_ws_p[:, 1] == conf.time_window_annotation_wayI]) window_masks.append([annotation_value, p, wsi, 'way back', ids_ws_p[:, 1] == conf.time_window_annotation_wayII]) window_masks.append([annotation_value, p, wsi, 'shop', ids_ws_p[:, 1] == conf.time_window_annotation_shop]) window_masks.append([annotation_value, p, wsi, 'both ways', np.logical_or(ids_ws_p[:, 1] == conf.time_window_annotation_wayI,ids_ws_p[:, 1] == conf.time_window_annotation_wayII)]) window_masks_df = pd.DataFrame(window_masks, columns=['annotation', 'participant', 'window size index', 'subtask', 'mask']) # collect predictions for each participant and each setting that is interesting for one of the comparisons # Results are directly written into figures/table1-5.csv with open(conf.figure_folder + '/table1-5.csv', 'w') as f: f.write('comparison') for trait in xrange(0, conf.n_traits): f.write(',' + conf.medium_traitlabels[trait]) f.write('\n') for comp_title, (annotation_value_I, annotation_value_II, subtaskI, subtaskII) in comparisons.items(): f.write(comp_title) result_filename = conf.result_folder + '/predictions_' + comp_title.replace(' ','_') + '.npz' if not os.path.exists(result_filename): print 'computing data for', comp_title print 'Note taht this might take a while - if the script is run again, intermediate results will be available and speed up all computations.' predictions_I = np.zeros((conf.n_participants, conf.n_traits, conf.max_n_iter), dtype=int) predictions_II = np.zeros((conf.n_participants, conf.n_traits, conf.max_n_iter), dtype=int) for trait in xrange(0, conf.n_traits): for si in xrange(0, conf.max_n_iter): filenameI = conf.get_result_filename(annotation_value_I, trait, False, si, add_suffix=True) filenameII = conf.get_result_filename(annotation_value_II, trait, False, si, add_suffix=True) if os.path.exists(filenameI) and os.path.exists(filenameII): dataI = np.load(filenameI) detailed_predictions_I = dataI['detailed_predictions'] chosen_window_indices_I = dataI['chosen_window_indices'] dataII = np.load(filenameII) detailed_predictions_II = dataII['detailed_predictions'] chosen_window_indices_II = dataII['chosen_window_indices'] for p, window_index_I, window_index_II, local_detailed_preds_I, local_detailed_preds_II in zip(xrange(0, conf.n_participants), chosen_window_indices_I, chosen_window_indices_II, detailed_predictions_I, detailed_predictions_II): maskI = window_masks_df[(window_masks_df.annotation == annotation_value_I) & (window_masks_df.participant == p) & (window_masks_df['window size index'] == window_index_I) & (window_masks_df.subtask == subtaskI) ].as_matrix(columns=['mask'])[0][0] maskII = window_masks_df[(window_masks_df.annotation == annotation_value_II) & (window_masks_df.participant == p) & (window_masks_df['window size index'] == window_index_II) & (window_masks_df.subtask == subtaskII) ].as_matrix(columns=['mask'])[0][0] predictions_I[p, trait, si] = get_majority_vote(np.array(local_detailed_preds_I)[maskI]) predictions_II[p, trait, si] = get_majority_vote(np.array(local_detailed_preds_II)[maskII]) else: print 'did not find', filenameI, 'or', filenameII sys.exit(1) np.savez(result_filename, predictions_I=predictions_I, predictions_II=predictions_II) else: data = np.load(result_filename) predictions_I = data['predictions_I'] predictions_II = data['predictions_II'] # predictions_I are predictions from one context, predictions_II is the other context # compute their average correlation and write it to file for t in xrange(0, conf.n_traits): corrI = get_average_correlation(predictions_I[:, t, :], predictions_II[:, t, :], 100) f.write(','+'%.2f'%corrI) f.write('\n')