# Compares classifier predictions across recording contexts (e.g. way vs. shop,
# first vs. second half) and writes their average correlations per personality
# trait to figures/table1-5.csv.
import os
import sys

import numpy as np
import pandas as pd

from config import conf
from config import names as gs
# Ground-truth binned personality labels: one row per participant, one column
# per trait (the first CSV column is skipped).
truth = np.genfromtxt(
    conf.binned_personality_file,
    delimiter=',',
    skip_header=1,
    usecols=range(1, conf.n_traits + 1),
)
# All comparisons to perform. Each entry maps a comparison name to:
#   [annotation for task A, annotation for task B, subtask name A, subtask name B]
# The two annotation values determine whether classifiers trained on all data
# or on specific subsets only will be examined; the subtask names select the
# two tasks to compare.
comparisons = {
    'split halves': [conf.annotation_all, conf.annotation_all, 'first half', 'second half'],
    'two ways': [conf.annotation_ways, conf.annotation_ways, 'way there', 'way back'],
    'way vs shop in general classifier': [conf.annotation_all, conf.annotation_all, 'both ways', 'shop'],
    'way vs shop in specialised classifier': [conf.annotation_ways, conf.annotation_shop, 'both ways', 'shop'],
    'way in specialised classifier vs way in general classifier': [conf.annotation_ways, conf.annotation_all, 'both ways', 'both ways'],
    'shop in specialised classifier vs shop in general classifier': [conf.annotation_shop, conf.annotation_all, 'shop', 'shop'],
}
def get_majority_vote(predictions):
    """Return the most frequent value in predictions.

    Ties are broken in favour of the smallest value, because np.unique
    returns values in sorted order and argmax picks the first maximum.
    Returns -1 for empty input.
    """
    if len(predictions) == 0:
        return -1
    values, counts = np.unique(predictions, return_counts=True)
    return values[counts.argmax()]
def get_average_correlation(predA, predB, m_iter):
    """Average correlation between two sets of predictions over m_iter iterations.

    Per-iteration correlations are averaged via the Fisher z-transform
    (arctanh, mean, tanh). Predictions <= 0 are treated as missing and
    excluded from the correlation; iterations whose predictions sum to 0
    (i.e. no data) are skipped entirely.

    :param predA: predictions for task A, n_participants x m_iter
    :param predB: predictions for task B, n_participants x m_iter, or a
                  single vector of length n_participants that is compared
                  against every iteration of predA
    :param m_iter: number of iterations (columns of predA) to consider
    :return: average correlation, or nan if no iteration had usable data
    """
    correlations = []
    for si in range(m_iter):
        A = predA[:, si]
        B = predB if predB.ndim == 1 else predB[:, si]

        # skip iterations for which no predictions are available at all
        if np.sum(A) <= 0 or (predB.ndim > 1 and np.sum(B) <= 0):
            continue

        # keep only valid (positive) predictions; a 1-D predB is assumed
        # fully valid and is only filtered by A's validity, matching how
        # it is paired against each iteration of predA
        consider = (A > 0) if predB.ndim == 1 else (A > 0) & (B > 0)
        A = A[consider]
        B = B[consider]

        correlations.append(np.corrcoef(np.array([A, B]))[0][1])

    if not correlations:
        # np.mean([]) would return nan with a RuntimeWarning; be explicit
        return float('nan')
    return np.tanh(np.mean(np.arctanh(np.array(correlations))))
if __name__ == "__main__":
    # check if the output target folder already exists and create it if not
    if not os.path.exists(conf.figure_folder):
        os.mkdir(conf.figure_folder)

    # collect masks for each participant, annotation (all data, shop, way),
    # window size and subset in question (e.g. first half, or way to the shop).
    # Each mask is True for samples of a particular participant and subset;
    # False for all others.
    window_masks = []
    for wsi in range(len(conf.all_window_sizes)):
        x_file, y_file, id_file = conf.get_merged_feature_files(conf.all_window_sizes[wsi])

        for annotation_value in conf.annotation_values:
            ids_ws = np.genfromtxt(id_file, delimiter=',', skip_header=1).astype(int)

            # restrict the id table to the time windows of this annotation
            if annotation_value == conf.annotation_shop:
                ids_ws = ids_ws[ids_ws[:, 1] == conf.time_window_annotation_shop, :]
            elif annotation_value == conf.annotation_ways:
                ids_ws = ids_ws[(ids_ws[:, 1] == conf.time_window_annotation_wayI) | (ids_ws[:, 1] == conf.time_window_annotation_wayII), :]

            for p in range(conf.n_participants):
                ids_ws_p = ids_ws[(ids_ws[:, 0] == p), :]

                window_masks.append([annotation_value, p, wsi, 'first half', ids_ws_p[:, 2] == conf.time_window_annotation_halfI])
                window_masks.append([annotation_value, p, wsi, 'second half', ids_ws_p[:, 2] == conf.time_window_annotation_halfII])

                window_masks.append([annotation_value, p, wsi, 'way there', ids_ws_p[:, 1] == conf.time_window_annotation_wayI])
                window_masks.append([annotation_value, p, wsi, 'way back', ids_ws_p[:, 1] == conf.time_window_annotation_wayII])

                window_masks.append([annotation_value, p, wsi, 'shop', ids_ws_p[:, 1] == conf.time_window_annotation_shop])
                window_masks.append([annotation_value, p, wsi, 'both ways', np.logical_or(ids_ws_p[:, 1] == conf.time_window_annotation_wayI, ids_ws_p[:, 1] == conf.time_window_annotation_wayII)])

    window_masks_df = pd.DataFrame(window_masks, columns=['annotation', 'participant', 'window size index', 'subtask', 'mask'])

    def lookup_mask(annotation, participant, window_index, subtask):
        # Return the boolean sample mask for one (annotation, participant,
        # window size index, subtask) combination.
        rows = window_masks_df[(window_masks_df.annotation == annotation) &
                               (window_masks_df.participant == participant) &
                               (window_masks_df['window size index'] == window_index) &
                               (window_masks_df.subtask == subtask)]
        # .as_matrix() was removed from pandas; .iloc yields the same cell
        return rows['mask'].iloc[0]

    # collect predictions for each participant and each setting that is
    # interesting for one of the comparisons.
    # Results are directly written into figures/table1-5.csv
    with open(os.path.join(conf.figure_folder, 'table1-5.csv'), 'w') as f:
        # header row: one column per personality trait
        f.write('comparison')
        for trait in range(conf.n_traits):
            f.write(',' + conf.medium_traitlabels[trait])
        f.write('\n')

        for comp_title, (annotation_value_I, annotation_value_II, subtaskI, subtaskII) in comparisons.items():
            f.write(comp_title)
            result_filename = conf.result_folder + '/predictions_' + comp_title.replace(' ', '_') + '.npz'
            if not os.path.exists(result_filename):
                print('computing data for ' + comp_title)
                print('Note that this might take a while - if the script is run again, intermediate results will be available and speed up all computations.')

                predictions_I = np.zeros((conf.n_participants, conf.n_traits, conf.max_n_iter), dtype=int)
                predictions_II = np.zeros((conf.n_participants, conf.n_traits, conf.max_n_iter), dtype=int)

                for trait in range(conf.n_traits):
                    for si in range(conf.max_n_iter):
                        filenameI = conf.get_result_filename(annotation_value_I, trait, False, si, add_suffix=True)
                        filenameII = conf.get_result_filename(annotation_value_II, trait, False, si, add_suffix=True)

                        # both per-iteration result files must exist; abort otherwise
                        if not (os.path.exists(filenameI) and os.path.exists(filenameII)):
                            print('did not find ' + filenameI + ' or ' + filenameII)
                            sys.exit(1)

                        dataI = np.load(filenameI)
                        detailed_predictions_I = dataI['detailed_predictions']
                        chosen_window_indices_I = dataI['chosen_window_indices']

                        dataII = np.load(filenameII)
                        detailed_predictions_II = dataII['detailed_predictions']
                        chosen_window_indices_II = dataII['chosen_window_indices']

                        for p, window_index_I, window_index_II, local_detailed_preds_I, local_detailed_preds_II in zip(range(conf.n_participants), chosen_window_indices_I, chosen_window_indices_II, detailed_predictions_I, detailed_predictions_II):
                            maskI = lookup_mask(annotation_value_I, p, window_index_I, subtaskI)
                            maskII = lookup_mask(annotation_value_II, p, window_index_II, subtaskII)

                            # majority vote over the samples belonging to each subtask
                            predictions_I[p, trait, si] = get_majority_vote(np.array(local_detailed_preds_I)[maskI])
                            predictions_II[p, trait, si] = get_majority_vote(np.array(local_detailed_preds_II)[maskII])

                np.savez(result_filename, predictions_I=predictions_I, predictions_II=predictions_II)
            else:
                data = np.load(result_filename)
                predictions_I = data['predictions_I']
                predictions_II = data['predictions_II']

            # predictions_I are predictions from one context, predictions_II from
            # the other; compute their average correlation and write it to file.
            # NOTE(review): m_iter is hard-coded to 100 here; presumably
            # conf.max_n_iter >= 100 — confirm against config.
            for t in range(conf.n_traits):
                corrI = get_average_correlation(predictions_I[:, t, :], predictions_II[:, t, :], 100)
                f.write(',' + '%.2f' % corrI)
            f.write('\n')