evaluation code
This commit is contained in:
parent fc7973a49b
commit 3d3cebb956
6 changed files with 660 additions and 13 deletions
234  05_plot_weights.py  Normal file
@@ -0,0 +1,234 @@
import numpy as np
import matplotlib.pyplot as plt
from config import conf
import os, sys
import pandas as pns
from config import names as gs
import getopt
import matplotlib.gridspec as gridspec
from sklearn.metrics import f1_score

import seaborn as sns
sns.set(style='whitegrid', color_codes=True)
sns.set_context('poster')

dark_color = sns.xkcd_rgb['charcoal grey']
light_color = sns.xkcd_rgb['cloudy blue']


def plot_weights():
    # for each personality trait, compute the list of median feature importances
    # across all cross-validation folds and iterations
    medianlist = []
    for t in xrange(0, conf.n_traits):
        medianlist.append(
            list(imp_df.loc[imp_df['T'] == t].groupby(by='feat_num')['feature importance'].median()))

    # find the n highest feature importances for each trait and write them into a .tex table - see Table 2, SI
    n = 15
    most_important_features = []
    most_important_features_lists = []
    for ml in medianlist:
        locallist = []
        for i in xrange(1, (n + 1)):
            fn = gs.full_long_label_list[int(np.argsort(np.array(ml))[-i])]
            locallist.append(fn)
            if fn not in most_important_features:
                most_important_features.append(fn)
        most_important_features_lists.append(locallist)
    most_important_features.sort()

    # write the full list of feature importances into a .tex table - shown in Table 2, SI
    filename = conf.figure_folder + '/table2.tex'
    with open(filename, 'w') as f:
        f.write('feature&Neur.&Extr.&Open.&Agree.&Consc.&PCS&CEI')
        f.write('\\\\\n\\hline\n')
        for fi in xrange(0, len(most_important_features)):
            f.write(most_important_features[fi])
            for t in xrange(0, conf.n_traits):
                m = imp_df[(imp_df['T'] == t) & (imp_df.feature == most_important_features[fi])]['feature importance'].median()
                if most_important_features[fi] in most_important_features_lists[t]:
                    f.write('& \\textbf{' + '%.3f}' % m)
                else:
                    f.write('&' + '%.3f' % m)
            f.write('\\\\\n')
    print filename, 'written.'

    # create Figure 2
    # first collect the set of individual top TOP_N features per trait:
    TOP_N = 10
    featlabels = []
    for trait in xrange(0, conf.n_traits):
        basedata = imp_df.loc[imp_df['T'] == trait]
        gp = basedata.groupby(by='feature')['feature importance'].median()
        order = gp.sort_values(ascending=False)
        featlabels.extend(order[:TOP_N].keys())
    super_feats = np.unique(np.array(featlabels))

    # collect the sum of feature importances for these labels, to sort the features by their median
    super_feats_importance_sum = np.zeros((len(super_feats)))
    for i in xrange(0, len(super_feats)):
        super_feats_importance_sum[i] = imp_df[imp_df.feature == super_feats[i]].groupby(by=['T'])['feature importance'].median().sum()
    super_feats_sort_indices = np.argsort(super_feats_importance_sum)[::-1]

    # add some interesting features from related work to the list of features whose importance will be shown
    must_have_feats = [
        'inter quartile range x', 'range x', 'maximum x', 'std x', '1st quartile x', 'range pupil diameter', 'median y',
        'mean difference of subsequent x', 'mean fixation duration', '3rd quartile y',
        'fixation rate', 'mean saccade amplitude', 'dwelling time'
        ]
    # but only add them if they are not in the list yet
    additional_feats = np.array([a for a in must_have_feats if a not in super_feats], dtype=object)

    # collect the sum of feature importances for these labels as well, so they can be sorted by their median importance in the plot
    additional_feats_importance_sum = np.zeros((len(additional_feats)))
    for trait in xrange(0, conf.n_traits):
        basedata = imp_df.loc[imp_df['T'] == trait]
        for i in xrange(0, len(additional_feats)):
            logi = basedata.feature == additional_feats[i]
            additional_feats_importance_sum[i] += float(basedata[logi]['feature importance'].median())
    additional_feats_sort_indices = np.argsort(additional_feats_importance_sum)[::-1]

    # create the figure
    plt.figure(figsize=(20, 12))
    grs = gridspec.GridSpec(len(super_feats) + len(additional_feats) + 1, conf.n_traits)

    for trait in xrange(0, conf.n_traits):
        # upper part of the figure, i.e. important features
        ax = plt.subplot(grs[:len(super_feats), trait])
        basedata = imp_df.loc[imp_df['T'] == trait]
        feat_importances = []
        for i in xrange(0, len(super_feats)):
            logi = basedata.feature == super_feats[super_feats_sort_indices][i]
            feat_importances.append(list(basedata[logi]['feature importance']))
        bp = plt.boxplot(x=feat_importances,  # notch=True, labels=super_feats[super_feats_sort_indices],
                         patch_artist=True, sym='', vert=False, whis='range',
                         positions=np.arange(0, len(feat_importances)))

        # aesthetics
        for i in xrange(0, len(super_feats)):
            bp['boxes'][i].set(color=dark_color)
            bp['boxes'][i].set(facecolor=light_color)
            bp['whiskers'][2 * i].set(color=dark_color, linestyle='-')
            bp['whiskers'][2 * i + 1].set(color=dark_color, linestyle='-')
            bp['caps'][2 * i].set(color=dark_color)
            bp['caps'][2 * i + 1].set(color=dark_color)
            bp['medians'][i].set(color=dark_color)

        if not trait == 0:
            plt.ylabel('')
            plt.setp(ax.get_yticklabels(), visible=False)
        else:
            ax.set_yticklabels(super_feats[super_feats_sort_indices])

        xlimmax = 0.47
        xticks = [0.15, 0.35]
        plt.xlim((0, xlimmax))
        plt.xticks(xticks)
        plt.setp(ax.get_xticklabels(), visible=False)

        # lower part of the figure, i.e. features from related work
        ax = plt.subplot(grs[(-len(additional_feats)):, trait])
        basedata = imp_df.loc[imp_df['T'] == trait]
        feat_importances = []
        for i in xrange(0, len(additional_feats)):
            logi = basedata.feature == additional_feats[additional_feats_sort_indices][i]
            feat_importances.append(basedata[logi]['feature importance'])
        bp = plt.boxplot(x=feat_importances, patch_artist=True, sym='', vert=False, whis='range',
                         positions=np.arange(0, len(feat_importances)))

        # aesthetics
        for i in xrange(0, len(additional_feats)):
            bp['boxes'][i].set(color=dark_color)
            bp['boxes'][i].set(facecolor=light_color)  # , alpha=0.5)
            bp['whiskers'][2 * i].set(color=dark_color, linestyle='-')
            bp['whiskers'][2 * i + 1].set(color=dark_color, linestyle='-')
            bp['caps'][2 * i].set(color=dark_color)
            bp['caps'][2 * i + 1].set(color=dark_color)
            bp['medians'][i].set(color=dark_color)  # , linewidth=.1)

        if not trait == 0:
            plt.ylabel('')
            plt.setp(ax.get_yticklabels(), visible=False)
        else:
            ax.set_yticklabels(additional_feats[additional_feats_sort_indices])
        plt.xlim((0, xlimmax))
        plt.xticks(xticks)
        if trait == 3:
            plt.xlabel(conf.medium_traitlabels[trait] + '\n\nFeature Importance')
        else:
            plt.xlabel(conf.medium_traitlabels[trait])

    filename = conf.figure_folder + '/figure2.pdf'
    plt.savefig(filename, bbox_inches='tight')
    print filename.split('/')[-1], 'written.'
    plt.close()


if __name__ == "__main__":
    # target file names - save the table of F1 scores, feature importances and majority predictions there
    datapathI = conf.get_result_folder(conf.annotation_all) + '/f1s.csv'  # F1 scores from each iteration
    datapathII = conf.get_result_folder(conf.annotation_all) + '/feature_importance.csv'  # feature importance from each iteration
    datapathIII = conf.get_result_folder(conf.annotation_all) + '/majority_predictions.csv'  # majority voting result for each participant over all iterations

    if not os.path.exists(conf.figure_folder):
        os.mkdir(conf.figure_folder)

    # if the target files do not exist yet, create them
    if (not os.path.exists(datapathI)) or (not os.path.exists(datapathII)) or (not os.path.exists(datapathIII)):
        f1s = []
        feature_importances = []
        majority_predictions = []
        for trait in xrange(0, conf.n_traits):
            predictions = np.zeros((conf.n_participants, conf.max_n_iter), dtype=int) - 1
            ground_truth = np.loadtxt(conf.binned_personality_file, delimiter=',', skiprows=1, usecols=(trait + 1,))
            for si in xrange(0, conf.max_n_iter):
                filename = conf.get_result_filename(conf.annotation_all, trait, False, si, add_suffix=True)
                if os.path.exists(filename):
                    data = np.load(filename)
                    if (data['predictions'] > 0).all():
                        assert data['f1'] == f1_score(ground_truth, data['predictions'], average='macro')
                        f1s.append([data['f1'], conf.medium_traitlabels[trait]])
                    else:
                        # if there was no time window for a condition, e.g. if shopping data only is evaluated,
                        # the prediction for each person without a single time window is set to -1
                        # and should not enter the mean F1 score.
                        # Thus, the F1 score is re-computed on the relevant participants only.
                        pr = data['predictions']
                        dt = ground_truth[pr > 0]
                        pr = pr[pr > 0]
                        f1s.append([f1_score(dt, pr, average='macro'), conf.medium_traitlabels[trait]])

                    for outer_cv_i in xrange(0, 5):  # index of the outer CV fold, not a participant
                        for fi in xrange(0, conf.max_n_feat):
                            feature_importances.append([data['feature_importances'][outer_cv_i, fi], trait, gs.full_long_label_list[fi], fi])

                    predictions[:, si] = data['predictions']
                else:
                    print 'did not find', filename

            # compute the majority vote for each participant over all iterations
            for p in xrange(0, conf.n_participants):
                (values, counts) = np.unique(predictions[p, predictions[p, :] > 0], return_counts=True)
                ind = np.argmax(counts)
                majority_predictions.append([values[ind], p, conf.medium_traitlabels[trait]])

        f1s_df = pns.DataFrame(data=f1s, columns=['F1', 'trait'])
        f1s_df.to_csv(datapathI)

        imp_df = pns.DataFrame(data=feature_importances, columns=['feature importance', 'T', 'feature', 'feat_num'])
        imp_df.to_csv(datapathII)

        majority_predictions_df = pns.DataFrame(data=majority_predictions, columns=['prediction', 'participant', 'trait'])
        majority_predictions_df.to_csv(datapathIII)

    else:
        print 'No new results were collected as previous results were available. If you want to overwrite them, please delete the following files:'
        print datapathI
        print datapathII
        print datapathIII

    f1s_df = pns.read_csv(datapathI)
    imp_df = pns.read_csv(datapathII)
    majority_predictions_df = pns.read_csv(datapathIII)

    plot_weights()  # Figure 2
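The core aggregation in `plot_weights()` is a per-trait groupby-median over a long-format importance table. A minimal, self-contained sketch of that idea, with toy data and column names matching the script (not the project's real results):

```python
# Sketch of the groupby-median aggregation used above (toy data only).
import pandas as pd

imp = pd.DataFrame({
    'T': [0, 0, 0, 0, 1, 1, 1, 1],                 # trait index
    'feature': ['fixation rate', 'range x'] * 4,   # feature name
    'feature importance': [0.2, 0.1, 0.4, 0.3, 0.05, 0.5, 0.15, 0.4],
})

# median importance of each feature, separately per trait
medians = imp.groupby(['T', 'feature'])['feature importance'].median()
print(medians)

# top features for trait 0, sorted by median importance
top = medians.loc[0].sort_values(ascending=False)
print(top)
```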
101  06_baselines.py  Normal file
@@ -0,0 +1,101 @@
import numpy as np
import matplotlib.pyplot as plt
from config import conf
import os, sys
import pandas as pns
from config import names as gs
import getopt
import matplotlib.gridspec as gridspec
from sklearn.metrics import f1_score, accuracy_score

import seaborn as sns
sns.set(style='whitegrid', color_codes=True)
sns.set_context('poster')

dark_color = sns.xkcd_rgb['charcoal grey']
light_color = sns.xkcd_rgb['cloudy blue']

max_n_feat = conf.max_n_feat
m_iter = conf.max_n_iter

featurelabels = gs.full_long_label_list
participant_ids = np.arange(0, conf.n_participants)


def plot_overview():
    all_baselines.groupby(by=['trait', 'clf_name'])['F1'].mean().to_csv(conf.figure_folder + '/figure1.csv')
    print 'figure1.csv written'

    sns.set(font_scale=2.1)
    plt.figure(figsize=(20, 10))
    ax = plt.subplot(1, 1, 1)
    sns.barplot(x='trait', y='F1', hue='clf_name', data=all_baselines, capsize=.05, errwidth=3,
                linewidth=3, estimator=np.mean, edgecolor=dark_color,
                palette={'our classifier': sns.xkcd_rgb['windows blue'],
                         'most frequent class': sns.xkcd_rgb['faded green'],
                         'random guess': sns.xkcd_rgb['greyish brown'],
                         'label permutation': sns.xkcd_rgb['dusky pink']
                         }
                )
    plt.plot([-0.5, 6.5], [0.33, 0.33], c=dark_color, linestyle='--', linewidth=3, label='theoretical chance level')
    handles, labels = ax.get_legend_handles_labels()
    ax.legend([handles[1], handles[2], handles[3], handles[4], handles[0]],
              [labels[1], labels[2], labels[3], labels[4], labels[0]], fontsize=20)
    plt.xlabel('')
    plt.ylabel('F1 score', fontsize=20)
    plt.ylim((0, 0.55))
    filename = conf.figure_folder + '/figure1.pdf'
    plt.savefig(filename, bbox_inches='tight')
    plt.close()
    print 'wrote', filename.split('/')[-1]


if __name__ == "__main__":
    # collect F1 scores for classifiers on all data from the file written by 05_plot_weights.py
    datapath = conf.get_result_folder(conf.annotation_all) + '/f1s.csv'
    if not os.path.exists(datapath):
        print 'could not find', datapath
        print 'consider (re-)running 05_plot_weights.py'
        sys.exit(1)
    our_classifier = pns.read_csv(datapath)
    our_classifier['clf_name'] = 'our classifier'

    # baseline 1: guess the most frequent class of each training set, as written by 03_train_baseline.py
    datapath = conf.result_folder + '/most_frequ_class_baseline.csv'
    if not os.path.exists(datapath):
        print 'could not find', datapath
        print 'consider (re-)running 03_train_baseline.py'
        sys.exit(1)
    most_frequent_class_df = pns.read_csv(datapath)
    most_frequent_class_df['clf_name'] = 'most frequent class'

    # compute all other baselines ad hoc
    collection = []
    for trait in xrange(0, conf.n_traits):
        # baseline 2: random guess
        truth = np.genfromtxt(conf.binned_personality_file, skip_header=1, usecols=(trait + 1,), delimiter=',')
        for i in xrange(0, 100):
            rand_guess = np.random.randint(1, 4, conf.n_participants)
            f1 = f1_score(truth, rand_guess, average='macro')
            collection.append([f1, conf.medium_traitlabels[trait], i, 'random guess'])

        # baseline 3: label permutation test
        # was computed by 04_label_permutation_test.sh and written into results, i.e. it is just loaded here
        for si in xrange(0, m_iter):
            filename_rand = conf.get_result_filename(conf.annotation_all, trait, True, si, add_suffix=True)
            if os.path.exists(filename_rand):
                data = np.load(filename_rand)
                pr = data['predictions']
                dt = truth[pr > 0]
                pr = pr[pr > 0]
                f1 = f1_score(dt, pr, average='macro')
                collection.append([f1, conf.medium_traitlabels[trait], si, 'label permutation'])
            else:
                print 'did not find', filename_rand
                print 'consider (re-)running 04_label_permutation_test.sh'
                sys.exit(1)

    collectiondf = pns.DataFrame(data=collection, columns=['F1', 'trait', 'iteration', 'clf_name'])
    all_baselines = pns.concat([our_classifier, most_frequent_class_df, collectiondf])

    plot_overview()  # Figure 1
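As an aside, a most-frequent-class baseline like the one loaded above can be sanity-checked independently with scikit-learn's `DummyClassifier`. A hedged sketch with synthetic labels - this is not the repository's actual baseline training code, which lives in 03_train_baseline.py:

```python
# Sketch: most-frequent-class and random-guess baselines on synthetic
# 3-class labels (values in {1, 2, 3}, as in the scripts above).
import numpy as np
from sklearn.dummy import DummyClassifier
from sklearn.metrics import f1_score

rng = np.random.RandomState(0)
y = rng.randint(1, 4, size=40)   # synthetic ground-truth labels
X = np.zeros((40, 1))            # features are ignored by DummyClassifier

clf = DummyClassifier(strategy='most_frequent').fit(X, y)
print('most frequent class F1: %.3f'
      % f1_score(y, clf.predict(X), average='macro'))

rand_guess = rng.randint(1, 4, size=40)
print('random guess F1: %.3f'
      % f1_score(y, rand_guess, average='macro'))
```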
155  07_evaluation_across_contexts.py  Normal file
@@ -0,0 +1,155 @@
import numpy as np
from config import conf
import os, sys
from config import names as gs
import pandas as pd

truth = np.genfromtxt(conf.binned_personality_file, skip_header=1, usecols=xrange(1, conf.n_traits + 1), delimiter=',')

# all comparisons to perform. Each has
#  a name,
#  two annotation values that determine whether classifiers trained on all data or on specific subsets only will be examined,
#  and the names of the two tasks to compare
comparisons = dict({'split halves': [conf.annotation_all, conf.annotation_all, 'first half', 'second half'],
                    'two ways': [conf.annotation_ways, conf.annotation_ways, 'way there', 'way back'],
                    'way vs shop in general classifier': [conf.annotation_all, conf.annotation_all, 'both ways', 'shop'],
                    'way vs shop in specialised classifier': [conf.annotation_ways, conf.annotation_shop, 'both ways', 'shop'],
                    'way in specialised classifier vs way in general classifier': [conf.annotation_ways, conf.annotation_all, 'both ways', 'both ways'],
                    'shop in specialised classifier vs shop in general classifier': [conf.annotation_shop, conf.annotation_all, 'shop', 'shop']
                    })


def get_majority_vote(predictions):
    if len(predictions) == 0:
        return -1
    (values, counts) = np.unique(predictions, return_counts=True)
    ind = np.argmax(counts)
    return values[ind]


def get_average_correlation(predA, predB, m_iter):
    """
    :param predA: predictions for task A, n_participants x m_iter
    :param predB: predictions for task B, n_participants x m_iter (or n_participants if constant over iterations)
    :param m_iter: number of iterations to average over
    :return: average correlation between predA and predB, computed via the Fisher z-transformation
    """
    correlations = []
    for si in xrange(0, m_iter):
        if predB.ndim == 1:
            if np.sum(predA[:, si]) > 0:
                A = predA[:, si]
                B = predB
                consider = (A > 0)
                A = A[consider]
                B = B[consider]
            else:
                continue
        else:
            if np.sum(predA[:, si]) > 0 and (np.sum(predB[:, si]) > 0):
                A = predA[:, si]
                B = predB[:, si]
                consider = (A > 0) & (B > 0)
                A = A[consider]
                B = B[consider]
            else:
                continue

        correlation = np.corrcoef(np.array([A, B]))[0][1]
        correlations.append(correlation)

    avg = np.tanh(np.mean(np.arctanh(np.array(correlations))))
    return avg


if __name__ == "__main__":
    # check if the output target folder already exists and create it if not
    if not os.path.exists(conf.figure_folder):
        os.mkdir(conf.figure_folder)

    # collect masks for each participant, annotation (all data, shop, way), window size and subset in question (e.g. first half, or way to the shop)
    # each mask is True for samples of a particular participant and subset, False for all others
    window_masks = []
    for wsi in xrange(0, len(conf.all_window_sizes)):
        x_file, y_file, id_file = conf.get_merged_feature_files(conf.all_window_sizes[wsi])

        for annotation_value in conf.annotation_values:
            ids_ws = np.genfromtxt(id_file, delimiter=',', skip_header=1).astype(int)

            if annotation_value == conf.annotation_shop:
                ids_ws = ids_ws[ids_ws[:, 1] == conf.time_window_annotation_shop, :]
            elif annotation_value == conf.annotation_ways:
                ids_ws = ids_ws[(ids_ws[:, 1] == conf.time_window_annotation_wayI) | (ids_ws[:, 1] == conf.time_window_annotation_wayII), :]

            for p in xrange(0, conf.n_participants):
                ids_ws_p = ids_ws[(ids_ws[:, 0] == p), :]

                window_masks.append([annotation_value, p, wsi, 'first half', ids_ws_p[:, 2] == conf.time_window_annotation_halfI])
                window_masks.append([annotation_value, p, wsi, 'second half', ids_ws_p[:, 2] == conf.time_window_annotation_halfII])

                window_masks.append([annotation_value, p, wsi, 'way there', ids_ws_p[:, 1] == conf.time_window_annotation_wayI])
                window_masks.append([annotation_value, p, wsi, 'way back', ids_ws_p[:, 1] == conf.time_window_annotation_wayII])

                window_masks.append([annotation_value, p, wsi, 'shop', ids_ws_p[:, 1] == conf.time_window_annotation_shop])
                window_masks.append([annotation_value, p, wsi, 'both ways', np.logical_or(ids_ws_p[:, 1] == conf.time_window_annotation_wayI, ids_ws_p[:, 1] == conf.time_window_annotation_wayII)])

    window_masks_df = pd.DataFrame(window_masks, columns=['annotation', 'participant', 'window size index', 'subtask', 'mask'])

    # collect predictions for each participant and each setting that is relevant for one of the comparisons.
    # Results are directly written into figures/table1-5.csv
    with open(conf.figure_folder + '/table1-5.csv', 'w') as f:
        f.write('comparison')
        for trait in xrange(0, conf.n_traits):
            f.write(',' + conf.medium_traitlabels[trait])
        f.write('\n')

        for comp_title, (annotation_value_I, annotation_value_II, subtaskI, subtaskII) in comparisons.items():
            f.write(comp_title)
            result_filename = conf.result_folder + '/predictions_' + comp_title.replace(' ', '_') + '.npz'
            if not os.path.exists(result_filename):
                print 'computing data for', comp_title
                print 'Note that this might take a while - if the script is run again, intermediate results will be available and speed up all computations.'

                predictions_I = np.zeros((conf.n_participants, conf.n_traits, conf.max_n_iter), dtype=int)
                predictions_II = np.zeros((conf.n_participants, conf.n_traits, conf.max_n_iter), dtype=int)

                for trait in xrange(0, conf.n_traits):
                    for si in xrange(0, conf.max_n_iter):
                        filenameI = conf.get_result_filename(annotation_value_I, trait, False, si, add_suffix=True)
                        filenameII = conf.get_result_filename(annotation_value_II, trait, False, si, add_suffix=True)

                        if os.path.exists(filenameI) and os.path.exists(filenameII):
                            dataI = np.load(filenameI)
                            detailed_predictions_I = dataI['detailed_predictions']
                            chosen_window_indices_I = dataI['chosen_window_indices']

                            dataII = np.load(filenameII)
                            detailed_predictions_II = dataII['detailed_predictions']
                            chosen_window_indices_II = dataII['chosen_window_indices']

                            for p, window_index_I, window_index_II, local_detailed_preds_I, local_detailed_preds_II in zip(xrange(0, conf.n_participants), chosen_window_indices_I, chosen_window_indices_II, detailed_predictions_I, detailed_predictions_II):
                                maskI = window_masks_df[(window_masks_df.annotation == annotation_value_I) &
                                                        (window_masks_df.participant == p) &
                                                        (window_masks_df['window size index'] == window_index_I) &
                                                        (window_masks_df.subtask == subtaskI)
                                                        ].as_matrix(columns=['mask'])[0][0]
                                maskII = window_masks_df[(window_masks_df.annotation == annotation_value_II) &
                                                         (window_masks_df.participant == p) &
                                                         (window_masks_df['window size index'] == window_index_II) &
                                                         (window_masks_df.subtask == subtaskII)
                                                         ].as_matrix(columns=['mask'])[0][0]

                                predictions_I[p, trait, si] = get_majority_vote(np.array(local_detailed_preds_I)[maskI])
                                predictions_II[p, trait, si] = get_majority_vote(np.array(local_detailed_preds_II)[maskII])
                        else:
                            print 'did not find', filenameI, 'or', filenameII
                            sys.exit(1)
                np.savez(result_filename, predictions_I=predictions_I, predictions_II=predictions_II)
            else:
                data = np.load(result_filename)
                predictions_I = data['predictions_I']
                predictions_II = data['predictions_II']

            # predictions_I are predictions from one context, predictions_II from the other context.
            # compute their average correlation and write it to file
            for t in xrange(0, conf.n_traits):
                corrI = get_average_correlation(predictions_I[:, t, :], predictions_II[:, t, :], 100)
                f.write(',' + '%.2f' % corrI)
            f.write('\n')
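`get_average_correlation` averages Pearson correlations via the Fisher z-transformation: each r is mapped to arctanh(r), the z-values are averaged, and the mean is mapped back with tanh. A standalone sketch of just that step, with toy correlation values:

```python
# Sketch: averaging correlation coefficients via the Fisher z-transformation,
# as done at the end of get_average_correlation (toy values for illustration).
import numpy as np

correlations = np.array([0.3, 0.5, 0.8])

# naive arithmetic mean, for comparison
print('arithmetic mean: %.3f' % np.mean(correlations))

# Fisher z: r -> arctanh(r), average in z-space, map back with tanh
z = np.arctanh(correlations)
avg = np.tanh(np.mean(z))
print('Fisher-z averaged: %.3f' % avg)
```

Note that arctanh diverges at r = ±1, so an iteration with perfectly correlated predictions would dominate or break the average.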
99  08_descriptive.py  Normal file
@@ -0,0 +1,99 @@
import numpy as np
import matplotlib.pyplot as plt
from config import names as gs
from config import conf
import sys
import math
import os


def get_stats():
    annotation_times = np.genfromtxt(conf.annotation_path, delimiter=',', skip_header=1)[:, 1:]
    shop_duration = annotation_times[:, 1] - annotation_times[:, 0]
    print
    print 'Time spent in the shop:'
    print 'MEAN', np.mean(shop_duration / 60.), 'min'
    print 'STD', np.std(shop_duration / 60.), 'min'


def get_feature_correlations():
    # find the window size that was most frequently chosen
    hist_sum = np.zeros((len(conf.all_window_sizes)), dtype=int)
    for trait in xrange(0, conf.n_traits):
        for si in xrange(0, 100):
            filename = conf.get_result_filename(conf.annotation_all, trait, False, si, add_suffix=True)
            if os.path.exists(filename):
                data = np.load(filename)
                chosen_window_indices = data['chosen_window_indices']
                hist, _ = np.histogram(chosen_window_indices, bins=np.arange(-0.5, len(conf.all_window_sizes), 1))
                hist_sum += hist
            else:
                print 'did not find', filename

    ws = conf.all_window_sizes[np.argmax(hist_sum)]

    # load features for the most frequently chosen time window
    x_file, y_file, id_file = conf.get_merged_feature_files(ws)
    x_ws = np.genfromtxt(x_file, delimiter=',', skip_header=1)
    ids_ws = np.genfromtxt(id_file, delimiter=',', skip_header=1).astype(int)[:, 0]
    y = np.genfromtxt(conf.binned_personality_file, skip_header=1, usecols=xrange(1, conf.n_traits + 1), delimiter=',')
    y_ws = np.genfromtxt(y_file, delimiter=',', skip_header=1).astype(int)

    # compute the average feature per person
    avg_x_ws = np.zeros((conf.n_participants, conf.max_n_feat))
    for p in xrange(0, conf.n_participants):
        avg_x_ws[p, :] = np.mean(x_ws[ids_ws == p, :], axis=0)

    feature_correlations_avg = []
    for fi in xrange(0, conf.max_n_feat):
        C_avg = np.corrcoef(y.transpose(), avg_x_ws[:, fi])[-1][:-1]
        feature_correlations_avg.append(C_avg)

    feature_correlations_avg = np.array(feature_correlations_avg)

    # find the n highest correlations for each trait and write them into a .tex table - see Table 4 in SI
    n = 15
    highest_correlated_features = []
    highest_correlated_features_lists = []
    highest_correlated_features_names = []
    for t in xrange(0, conf.n_traits):
        hcf = feature_correlations_avg[:, t].argsort()[-n:]
        locallist = []
        for f in hcf:
            if f not in highest_correlated_features:
                highest_correlated_features.append(f)
                highest_correlated_features_names.append(gs.full_long_label_list[f].lower())
            locallist.append(f)

        highest_correlated_features_lists.append(locallist)

    # sort the collected feature indices alphabetically by their (lower-cased) label
    features = zip(highest_correlated_features_names, highest_correlated_features)
    highest_correlated_features = [y for (x, y) in sorted(features)]

    filename = conf.figure_folder + '/table4.tex'
    print len(highest_correlated_features)
    with open(filename, 'w') as f:
        f.write('feature&Neur.&Extr.&Open.&Agree.&Consc.&PCS&CEI')
        f.write('\\\\\n\\hline\n')
        for fi in highest_correlated_features:
            f.write(gs.full_long_label_list[fi])
            for t in xrange(0, conf.n_traits):
                fc = feature_correlations_avg[fi, t]
                if math.isnan(fc):
                    f.write('&-')
                elif fi in highest_correlated_features_lists[t]:
                    f.write('&\\textbf{' + '%.2f}' % fc)
                else:
                    f.write('&' + '%.2f' % fc)
            f.write('\\\\\n')
    print
    print filename, 'written'


if __name__ == "__main__":
    if not os.path.exists(conf.figure_folder):
        os.makedirs(conf.figure_folder)
    get_stats()  # prints statistics on the time participants spent inside the shop
    get_feature_correlations()  # Table 4
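The extraction `np.corrcoef(y.transpose(), avg_x_ws[:, fi])[-1][:-1]` above stacks the seven trait-score vectors with one feature vector and reads the feature-vs-trait correlations off the last row of the resulting correlation matrix. A small sketch of the same pattern with toy data:

```python
# Sketch: correlating one feature column with several trait columns at once,
# mirroring the np.corrcoef pattern in get_feature_correlations (toy data).
import numpy as np

rng = np.random.RandomState(1)
n_participants, n_traits = 20, 7
y = rng.rand(n_participants, n_traits)   # trait scores per participant
feat = rng.rand(n_participants)          # one averaged feature per participant

# corrcoef treats each row as a variable: 7 trait rows + 1 feature row
C = np.corrcoef(y.transpose(), feat)     # shape (8, 8)
feature_vs_traits = C[-1][:-1]           # last row: feature against each trait
print(feature_vs_traits.shape)           # (7,)
```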
37  09_plot_ws_hist.py  Normal file
@@ -0,0 +1,37 @@
import numpy as np
import matplotlib.pyplot as plt
import seaborn
from config import conf
import os

hist_sum = np.zeros((len(conf.all_window_sizes)), dtype=int)
for trait in xrange(0, conf.n_traits):
    for si in xrange(0, 100):
        filename = conf.get_result_filename(conf.annotation_all, trait, False, si, add_suffix=True)
        if os.path.exists(filename):
            data = np.load(filename)
            chosen_window_indices = data['chosen_window_indices']
            hist, _ = np.histogram(chosen_window_indices, bins=np.arange(-0.5, len(conf.all_window_sizes), 1))
            hist_sum += hist
        else:
            print 'did not find', filename

hist_sum_sum = np.sum(hist_sum)

plt.figure()
ax = plt.subplot(111)
bars = ax.bar(conf.all_window_sizes, hist_sum / float(hist_sum_sum) * 100, width=8,
              tick_label=[str(x) for x in conf.all_window_sizes])

# annotate each bar with the absolute number of times its window size was chosen
for rect in bars:
    height = rect.get_height()
    ax.text(rect.get_x() + rect.get_width() / 2., 1.01 * height,
            '%d' % (height / 100. * hist_sum_sum),
            ha='center', va='bottom')

# hide the right and top spines
ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)
plt.xlabel('window size in s')
plt.ylabel('percentage')
plt.savefig('figures/ws_hist.pdf')
plt.close()
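The bin edges `np.arange(-0.5, len(conf.all_window_sizes), 1)` center each unit-wide bin on an integer window-size index, so the histogram simply counts how often each index was chosen. A toy illustration:

```python
# Sketch: counting integer indices with integer-centered bins,
# as in the np.histogram call above (toy indices).
import numpy as np

n_window_sizes = 5
chosen = np.array([0, 2, 2, 4, 1, 2])       # toy chosen window-size indices

bins = np.arange(-0.5, n_window_sizes, 1)   # [-0.5, 0.5, 1.5, 2.5, 3.5, 4.5]
hist, edges = np.histogram(chosen, bins=bins)
print(hist)                                 # [1 1 3 0 1], one count per index
```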
47  README.md
@@ -25,27 +25,48 @@ reproducing the paper results step by step:
1. __Extract features from raw gaze data__:
`python 00_compute_features.py` to compute gaze features for all participants.
Once extracted, the features are stored in `features/ParticipantXX/window_features_YY.npy`, where XX is the participant number and YY the length of the sliding window in seconds.


2. __Train random forest classifiers__
`./01_train_classifiers.sh` to reproduce the evaluation setting described in the paper, in which each classifier was trained 100 times.
`./02_train_specialized_classifiers.sh` to train specialized classifiers on parts of the data (specifically on data from inside the shop or on the way).

If the scripts cannot be executed, you might not have the right access permissions to do so. On Linux, you can try `chmod +x 01_train_classifiers.sh`, `chmod +x 02_train_specialized_classifiers.sh` and `chmod +x 04_label_permutation_test.sh` (see below for when/how to use the last script).

In case you want to call the script differently, e.g. to speed up the computation or to try different parameters, you can pass the following arguments to `classifiers.train_classifier` (see the example invocation below):
`-t` trait index between 0 and 6
`-l` lowest number of repetitions, e.g. 0
`-m` max number of repetitions, e.g. 100
`-a` using partial data only: 0 (all data), 1 (way data), 2 (shop data)
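For example, a hypothetical invocation could look like this (the flag semantics follow the list above; the `classifiers.py` entry point is an assumption - check the shell scripts for the exact call used in this repository):

```sh
# hypothetical example: train classifiers for trait 0,
# repetitions 0..99, using all data
python classifiers.py -t 0 -l 0 -m 100 -a 0
```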

In case of performance issues, it might be useful to check `_conf.py` and change `max_n_jobs` to restrict the number of jobs (i.e. threads) running in parallel.

The results will be saved in `results/A0` for all data, `results/A1` for way data only and `results/A2` for data from inside the shop. Each file is named `TTT_XXX.npz`, where TTT is the abbreviation of the personality trait (`O`,`C`,`E`,`A`,`N` for the Big Five and `CEI` or `PCS` for the two curiosity measures) and XXX enumerates the classifiers (remember that we always train 100 classifiers for evaluation because there is some randomness involved in the training process).

3. __Train baselines__
* To train a classifier that always predicts the most frequent personality score range from its current training set, please execute `python 03_train_baseline.py`.
* To train classifiers on permuted labels, i.e. to perform the so-called label permutation test, please execute `./04_label_permutation_test.sh`.


4. __Performance analysis__
* Run `python 05_plot_weights.py` to extract feature importance scores. These scores are visualized in `figures/figure2.pdf`, which corresponds to Figure 2 in the paper, and written to `figures/table2.tex`, which is shown in Table 2 in the supplementary information.
(Additionally, this step computes F1 scores which are required for the next step, so do not skip it.)
* The results obtained from both baselines will be written to disk and read once you execute `python 06_baselines.py`.
A figure illustrating the actual classifiers' performance along with the random baselines will be written to `figures/figure1.pdf`, and the underlying numbers to `figures/figure1.csv`; both correspond to Figure 1 in the paper.


5. __Context comparison__
`python 07_evaluation_across_contexts.py` to compute the average correlation coefficients between predictions based on data from different contexts. The table with all coefficients will be written to `figures/table1-5.csv`; its contents are shown in Table 1 and Table 5 in the supplementary information.
If (some) files in the results folder are missing, try re-running all or one of the bash (\*.sh) scripts again.

6. __Descriptive analysis__
`python 08_descriptive.py` to compute the correlation between each participant's average feature for the most frequently chosen time window and their personality score range. Results are written to four files `figures/table4-1.tex`, `figures/table4-2.tex`, `figures/table4-3.tex`, `figures/table4-4.tex` and are shown together in Table 4 in the supplementary information.

7. __Window size histogram__
`python 09_plot_ws_hist.py` to plot a histogram of the window sizes chosen during the nested cross-validation routine to `figures/ws_hist.pdf`.

All these scripts write intermediate results to disk, i.e. if you start a script a second time it will be much faster - but the first run can take some time, e.g. up to 8 hours to train classifiers for one context on a 16-core machine, or 1 hour to compute the correlations between contexts.

## Citation
If you want to cite this project, please use the following BibTeX format: