feature extraction code
This commit is contained in:
parent a20084fefa
commit cebde1be17
9 changed files with 1279 additions and 2 deletions
103  00_compute_features.py  Normal file
@@ -0,0 +1,103 @@
import numpy as np
import os
from config import conf as conf
from featureExtraction import gaze_analysis as ga
import threading
import getopt
import sys
from config import names as gs


def compute_sliding_window_features(participant, ws, gazeAnalysis_instance):
    """
    uses the gazeAnalysis instance it was given to compute window features and saves them to file
    """
    window_features, window_times = gazeAnalysis_instance.get_window_features(ws, conf.get_step_size(ws))
    np.save(conf.get_window_features_file(participant, ws), window_features)
    np.save(conf.get_window_times_file(participant, ws), window_times)


if __name__ == "__main__":
    for p in xrange(0, conf.n_participants):
        threads = []  # one thread per time window will be used and collected in this list

        # create data folder, plus one subfolder for participant p
        if not os.path.exists(conf.get_feature_folder(p)):
            os.makedirs(conf.get_feature_folder(p))

        # make sure all relevant raw data files exist in the right folder
        gaze_file = conf.get_data_folder(p) + '/gaze_positions.csv'
        pupil_diameter_file = conf.get_data_folder(p) + '/pupil_diameter.csv'
        events_file = conf.get_data_folder(p) + '/events.csv'
        assert os.path.exists(gaze_file) and os.path.exists(pupil_diameter_file) and os.path.exists(events_file)

        # load relevant data
        gaze = np.genfromtxt(gaze_file, delimiter=',', skip_header=1)
        pupil_diameter = np.genfromtxt(pupil_diameter_file, delimiter=',', skip_header=1)
        events = np.genfromtxt(events_file, delimiter=',', skip_header=1, dtype=str)

        # create an instance of the gazeAnalysis class that will be used for feature extraction
        # this already does some initial computation that will be useful for all window sizes:
        extractor = ga.gazeAnalysis(gaze, conf.fixation_radius_threshold, conf.fixation_duration_threshold,
                                    conf.saccade_min_velocity, conf.max_saccade_duration,
                                    pupil_diameter=pupil_diameter, event_strings=events)

        # compute sliding window features by creating one thread per window size
        for window_size in conf.all_window_sizes:
            if not os.path.exists(conf.get_window_features_file(p, window_size)):
                thread = threading.Thread(target=compute_sliding_window_features, args=(p, window_size, extractor))
                thread.start()
                threads.append(thread)

        for t in threads:
            t.join()

        print 'finished all features for participant', p

    # Merge the features from all participants into three files per window_size:
    # merged_features includes all features
    # merged_traits contains the ground truth personality score ranges
    # merged_ids contains the participant number and context (way, shop, half of the recording)

    # load ground truth from the info folder:
    binned_personality = np.genfromtxt(conf.binned_personality_file, delimiter=',', skip_header=1)
    trait_labels = np.loadtxt(conf.binned_personality_file, delimiter=',', dtype=str)[0, :]
    annotation = np.genfromtxt(conf.annotation_path, delimiter=',', skip_header=1)

    for window_size in conf.all_window_sizes:
        print 'merging window size', window_size

        windowfeats_subtask_all = []
        windowfeats_subtask_ids = []
        windowfeats_subtask_all_y = []

        for p in xrange(0, conf.n_participants):
            featfilename = conf.get_window_features_file(p, window_size)
            timesfilename = conf.get_window_times_file(p, window_size)
            if os.path.exists(featfilename) and os.path.exists(timesfilename):
                data = np.load(featfilename).tolist()
                windowfeats_subtask_all.extend(data)
                windowfeats_subtask_all_y.extend([binned_personality[p, 1:]] * len(data))

                times = np.load(timesfilename)[:, 2:]
                ann = annotation[p, 1:]

                ids_annotation = np.zeros((len(data), 3), dtype=int)  # person, way/shop, half
                ids_annotation[:, 0] = p
                ids_annotation[(times[:, 1] < ann[0]), 1] = conf.time_window_annotation_wayI
                ids_annotation[(times[:, 0] > ann[0]) & (times[:, 1] < ann[1]), 1] = conf.time_window_annotation_shop
                ids_annotation[(times[:, 0] > ann[1]), 1] = conf.time_window_annotation_wayII
                ids_annotation[:(len(data) / 2), 2] = conf.time_window_annotation_halfI
                ids_annotation[(len(data) / 2):, 2] = conf.time_window_annotation_halfII

                windowfeats_subtask_ids.extend(ids_annotation.tolist())
            else:
                print 'did not find ', featfilename
                sys.exit(1)

        ids = np.array(windowfeats_subtask_ids)
        x = np.array(windowfeats_subtask_all, dtype=float)
        y = np.array(windowfeats_subtask_all_y)
        f1, f2, f3 = conf.get_merged_feature_files(window_size)

        np.savetxt(f1, x, delimiter=',', header=','.join(gs.full_long_label_list), comments='')
        np.savetxt(f2, y, delimiter=',', header=','.join(trait_labels), comments='')
        np.savetxt(f3, ids, delimiter=',', header='Participant ID', comments='')
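For orientation, a minimal sketch of how the merged output of this script could be inspected afterwards (assuming the merge step has run; the file names follow get_merged_feature_files in config/conf.py, with 15 as an example window size):

import numpy as np

# each merged file has one row per sliding window, pooled over all participants
x = np.genfromtxt('features/merged_features_15.csv', delimiter=',', skip_header=1)
y = np.genfromtxt('features/merged_traits_15.csv', delimiter=',', skip_header=1)
ids = np.genfromtxt('features/merged_ids_15.csv', delimiter=',', skip_header=1)
assert len(x) == len(y) == len(ids)  # feature rows, trait rows and id rows are aligned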
11  README.md
@@ -1,8 +1,7 @@
 # Eye movements during everyday behavior predict personality traits
 *Sabrina Hoppe, Tobias Loetscher, Stephanie Morey and Andreas Bulling*

-This repository provides all data used for the publication [in Frontiers in Human Neuroscience](https://dx.doi.org/10.3389/fnhum.2018.00105).
-Code is coming soon!
+This repository provides all data and code used for the publication [in Frontiers in Human Neuroscience](https://dx.doi.org/10.3389/fnhum.2018.00105).

 ## Dataset
 * Gaze data recorded at 60Hz from 42 participants is stored in `data/ParticipantXX`.

@@ -20,6 +19,14 @@
 * Timestamps indicating the times when participants entered and left the shop are given in `info/annotation.csv` in seconds.

+## Code
+Reproducing the paper results step by step:
+1. __Extract features from raw gaze data__:
+`python compute_features.py` to compute gaze features for all participants.
+Once extracted, the features are stored in `features/ParticipantXX/window_features_YY.npy`, where XX is the participant number and YY is the length of the sliding window in seconds.
+
 ## Citation
 If you want to cite this project, please use the following Bibtex format:
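As a quick sanity check for step 1, the stored arrays can be loaded directly — a sketch, with participant 00 and a 15-second window as example values:

import numpy as np

feats = np.load('features/Participant00/window_features_15.npy')
times = np.load('features/Participant00/window_times_15.npy')
print(feats.shape)  # (number of windows, number of features)
print(times.shape)  # (number of windows, 4): start/end sample index, start/end timestamp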
1  __init__.py  Normal file
@@ -0,0 +1 @@
+

0  config/__init__.py  Normal file
97  config/conf.py  Normal file
@@ -0,0 +1,97 @@
import numpy as np

# global parameters
n_participants = 42
n_traits = 7
max_n_feat = 207
max_n_iter = 100
all_window_sizes = [5, 15, 30, 45, 60, 75, 90, 105, 120, 135]
all_shop_window_sizes = [5, 15]  # at least 3/4 of the people have a time window in these times

# cross validation parameters
n_inner_folds = 3
n_outer_folds = 5

# Random Forest parameters
tree_max_features = 15
tree_max_depth = 5
n_estimators = 100
max_n_jobs = 5

# given a window size, determine the step size correctly for even and odd numbers
def get_step_size(window_size):
    step_size = window_size / 2.0
    if step_size * 10 % 2 == 0:
        step_size = int(step_size)
    return step_size
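A worked example of the even/odd handling above (the values follow directly from the definition): an even window size yields an integer half, an odd one keeps its fractional half.

# get_step_size(30): 30 / 2.0 = 15.0; 15.0 * 10 % 2 == 0, so it is cast to int(15.0) == 15
# get_step_size(5):  5 / 2.0 = 2.5;  2.5 * 10 % 2 == 1, so the float 2.5 is kept
assert get_step_size(30) == 15
assert get_step_size(5) == 2.5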

# relative paths
data_folder = 'data'
info_folder = 'info'
feature_folder = 'features'
result_folder = 'results'
figure_folder = 'figures'
annotation_path = info_folder + '/annotation.csv'
binned_personality_file = info_folder + '/binned_personality.csv'
personality_sex_age_file = info_folder + '/personality_sex_age.csv'

# load the personality trait names from file and map them to abbreviations
traitlabels = np.loadtxt(binned_personality_file, delimiter=',', dtype=str)[0, 1:]

def get_abbr(s):
    return ''.join(item[0] for item in s.split() if item[0].isupper())

medium_traitlabels = [get_abbr(s) if (" " in s) else s for s in traitlabels]
short_traitlabels = [''.join(item[0] for item in tl.split() if item[0].isupper()) for tl in traitlabels]


# dynamically create relative paths for result files
def get_result_folder(annotation_val):
    return result_folder + '/A' + str(annotation_val)

def get_result_filename(annotation_val, trait, shuffle_labels, i, add_suffix=False):
    filename = get_result_folder(annotation_val) + '/' + short_traitlabels[trait]
    if shuffle_labels:
        filename += '_rnd'
    filename += '_' + str(i).zfill(3)
    if add_suffix:
        filename += '.npz'
    return filename

def get_feature_folder(participant):
    return feature_folder + '/Participant' + str(participant).zfill(2)

def get_merged_feature_files(window_size):
    return feature_folder + '/merged_features_' + str(window_size) + '.csv', \
           feature_folder + '/merged_traits_' + str(window_size) + '.csv', \
           feature_folder + '/merged_ids_' + str(window_size) + '.csv'

def get_data_folder(participant):
    return data_folder + '/Participant' + str(participant).zfill(2)

def get_window_times_file(participant, window_size):
    return get_feature_folder(participant) + "/window_times_" + str(window_size) + '.npy'

def get_window_features_file(participant, window_size):
    return get_feature_folder(participant) + "/window_features_" + str(window_size) + '.npy'

def get_overall_features_file(participant):
    return get_feature_folder(participant) + "/overall_features.npy"


# parameters for fixation/saccade detection
fixation_radius_threshold = 0.025
fixation_duration_threshold = 0.1
saccade_min_velocity = 2
max_saccade_duration = 0.5

# annotation constants (as given as arguments to train_classifier, and as used for file names in result_folder)
annotation_all = 0
annotation_ways = 1
annotation_shop = 2
annotation_values = [annotation_all, annotation_ways, annotation_shop]

# annotations used in merged_ids_* files in the feature_folder
# column 1
time_window_annotation_wayI = 1
time_window_annotation_shop = 2
time_window_annotation_wayII = 3
# column 2
time_window_annotation_halfI = 1
time_window_annotation_halfII = 2
160  config/names.py  Normal file
@@ -0,0 +1,160 @@
fixations_list_labels = ['mean x', 'mean y',
                         'var x', 'var y',
                         't start', 't end',
                         'start index', 'end index',
                         'mean diameter', 'var diameter',
                         'mean successive angles', 'var successive angles'
                         ]
fix_mean_x_i = 0
fix_mean_y_i = 1
fix_var_x_i = 2
fix_var_y_i = 3
fix_start_t_i = 4
fix_end_t_i = 5
fix_start_index_i = 6
fix_end_index_i = 7
fix_mean_diam_i = 8
fix_var_diam_i = 9
fix_mean_succ_angles = 10
fix_var_succ_angles = 11

saccades_list_labels = ['start x', 'start y',
                        'end x', 'end y',
                        'angle',
                        't start', 't end',
                        'start index', 'end index',
                        'mean diameter', 'var diameter',
                        'peak velocity', 'amplitude',
                        ]

sacc_start_x_i = 0
sacc_start_y_i = 1
sacc_end_x_i = 2
sacc_end_y_i = 3
sacc_angle_i = 4
sacc_t_start_i = 5
sacc_t_end_i = 6
sacc_start_index_i = 7
sacc_end_index_i = 8
sacc_mean_diam_i = 9
sacc_var_diam_i = 10
sacc_peak_vel_i = 11
sacc_amplitude_i = 12

blink_list_labels = ['t start', 't end', 'start index', 'end index']

blink_start_t_i = 0
blink_end_ti_i = 1
blink_start_index_i = 2
blink_end_index_i = 3

event_feature_labels = ['fixation rate', 'saccade rate',  # 0 1
                        'small sacc. rate', 'large sacc. rate', 'positive sacc. rate', 'negative sacc. rate',  # 2 3 4 5
                        'ratio sacc - fix',  # 6
                        'ratio small sacc', 'ratio large sacc', 'ratio right sacc', 'ratio left sacc',  # 7 8 9 10
                        'mean sacc amplitude', 'var sacc amplitude', 'min sacc amplitude', 'max sacc amplitude',  # 11 12 13 14
                        'mean peak velocity', 'var peak velocity', 'min peak velocity', 'max peak velocity',  # 15 16 17 18
                        'mean mean diameter sacc', 'var mean diameter sacc', 'mean var diameter sacc',  # 19 20 21 22
                        'var var diameter sacc',
                        'mean fix duration', 'var fix duration', 'min fix duration', 'max fix duration',  # 23 24 25 26
                        'dwelling time',
                        'mean mean subsequent angle', 'var mean subsequent angle', 'mean var subsequent angle', 'var var subsequent angle',
                        'mean var x', 'mean var y', 'var var x', 'var var y',  # 27 28 29 30
                        'mean mean diameter fix', 'var mean diameter fix', 'mean var diameter fix', 'var var diameter fix',  # 31 32 33 34
                        'mean blink duration', 'var blink duration', 'min blink duration', 'max blink duration',  # 35 36 37 38
                        'blink rate'  # 39
                        ]

event_feature_labels_long = ['fixation rate', 'saccade rate',  # 0 1
                             'small saccade rate', 'large saccade rate', 'positive saccade rate', 'negative saccade rate',  # 2 3 4 5
                             'saccade:fixation ratio',  # 6
                             'ratio of small saccades', 'ratio of large saccades', 'ratio of right saccades', 'ratio of left saccades',  # 7 8 9 10
                             'mean saccade amplitude', 'var saccade amplitude', 'min saccade amplitude', 'max saccade amplitude',  # 11 12 13 14
                             'mean saccadic peak velocity', 'var saccadic peak velocity', 'min saccadic peak velocity', 'max saccadic peak velocity',  # 15 16 17 18
                             'mean of the mean pupil diameter during saccades', 'var of the mean pupil diameter during saccades',
                             'mean of the var pupil diameter during saccades', 'var of the var pupil diameter during saccades',  # 19 20 21 22
                             'mean fixation duration', 'var fixation duration', 'min fixation duration', 'max fixation duration',  # 23 24 25 26
                             'dwelling time',
                             'mean of the mean of subsequent angles', 'var of the mean of subsequent angles',
                             'mean of the var of subsequent angles', 'var of the var of subsequent angles',
                             'mean of the var of x', 'mean of the var of y', 'var of the var of x', 'var of the var of y',  # 27 28 29 30
                             'mean of the mean pupil diameter during fixations', 'var of the mean pupil diameter during fixations',
                             'mean of the var pupil diameter during fixations', 'var of the var pupil diameter during fixations',  # 31 32 33 34
                             'mean blink duration', 'var blink duration', 'min blink duration', 'max blink duration',  # 35 36 37 38
                             'blink rate'  # 39
                             ]

def get_wordbook_feature_labels(movement_abbreviation):
    return [movement_abbreviation + s + ' WB' + str(n) for n in [1, 2, 3, 4] for s in ['>0', 'max', 'min', 'arg max', 'arg min', 'range', 'mean', 'var']]
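For illustration, the first labels this helper produces, directly expanding the comprehension above (8 histogram statistics for each n-gram length n = 1..4, i.e. 32 labels in total):

# get_wordbook_feature_labels('sacc.')[:8] ==
# ['sacc.>0 WB1', 'sacc.max WB1', 'sacc.min WB1', 'sacc.arg max WB1',
#  'sacc.arg min WB1', 'sacc.range WB1', 'sacc.mean WB1', 'sacc.var WB1']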

def get_wordbook_feature_labels_long(movement_abbreviation):
    return [s1 + str(n) + '-gram ' + movement_abbreviation + s2 for n in [1, 2, 3, 4]
            for (s1, s2) in [('number of different ', ' movements'),
                             ('max frequency ', ' movements'),
                             ('min frequency ', ' movements'),
                             ('most frequent ', ' movement'),
                             ('least frequent ', ' movement'),
                             ('range of frequencies of ', ' movements'),
                             ('mean frequency of ', ' movements'),
                             ('var frequency of ', ' movements')
                             ]]

position_feature_labels = ['mean x', 'mean y', 'mean diameter',
                           'min x', 'min y', 'min diameter',
                           'max x', 'max y', 'max diameter',
                           'min-max x', 'min-max y', 'min-max diameter',
                           'std x', 'std y', 'std diameter',
                           'median x', 'median y', 'median diameter',
                           '1st quart x', '1st quart y', '1st quart diameter',
                           '3rd quart x', '3rd quart y', '3rd quart diameter',
                           'IQR x', 'IQR y', 'IQR diameter',
                           'mean abs diff x', 'mean abs diff y', 'mean abs diff diameter',
                           'mean diff x', 'mean diff y', 'mean diff diameter',
                           'mean subsequent angle'
                           ]

position_feature_labels_long = ['mean x', 'mean y', 'mean pupil diameter',
                                'minimum x', 'minimum y', 'minimum pupil diameter',
                                'maximum x', 'maximum y', 'maximum pupil diameter',
                                'range x', 'range y', 'range pupil diameter',
                                'std x', 'std y', 'std pupil diameter',
                                'median x', 'median y', 'median pupil diameter',
                                '1st quartile x', '1st quartile y', '1st quartile pupil diameter',
                                '3rd quartile x', '3rd quartile y', '3rd quartile pupil diameter',
                                'inter quartile range x', 'inter quartile range y', 'inter quartile range pupil diameter',
                                'mean difference of subsequent x', 'mean difference of subsequent y', 'mean difference of subsequent pupil diameters',
                                'mean diff x', 'mean diff y', 'mean diff pupil diameter',
                                'mean subsequent angle'
                                ]

heatmap_feature_labels = ['heatmap_' + str(i).zfill(2) for i in xrange(0, 64)]
heatmap_feature_labels_long = ['heatmap cell ' + str(i).zfill(2) for i in xrange(0, 64)]

full_label_list = event_feature_labels + heatmap_feature_labels + position_feature_labels + \
                  get_wordbook_feature_labels('sacc.') + get_wordbook_feature_labels('SF')

full_long_label_list = event_feature_labels_long + heatmap_feature_labels_long + position_feature_labels_long + \
                       get_wordbook_feature_labels_long('sacc.') + get_wordbook_feature_labels_long('SF')


sacc_dictionary = ['A', 'B', 'C', 'R', 'E', 'F', 'G', 'D', 'H', 'J', 'K', 'L', 'M', 'N', 'O', 'U', 'u', 'b', 'r', 'f',
                   'd', 'j', 'l', 'n']
sacc_bins_two = [a + b for a in sacc_dictionary for b in sacc_dictionary]
sacc_bins_three = [a + b + c for a in sacc_dictionary for b in sacc_dictionary for c in sacc_dictionary]
sacc_bins_four = [a + b + c + d for a in sacc_dictionary for b in sacc_dictionary for c in sacc_dictionary for d in sacc_dictionary]
sacc_bins = [sacc_dictionary, sacc_bins_two, sacc_bins_three, sacc_bins_four]

saccFix_dictionary = ['S_lu', 'S_ld', 'S_lr', 'S_ll', 'S_su', 'S_sd', 'S_sr', 'S_sl', 'F_l', 'F_s']
saccFix_bins_two = [a + b for a in saccFix_dictionary for b in saccFix_dictionary]
saccFix_bins_three = [a + b + c for a in saccFix_dictionary for b in saccFix_dictionary for c in saccFix_dictionary]
saccFix_bins_four = [a + b + c + d for a in saccFix_dictionary for b in saccFix_dictionary for c in saccFix_dictionary for d in saccFix_dictionary]
saccFix_bins = [saccFix_dictionary, saccFix_bins_two, saccFix_bins_three, saccFix_bins_four]


def write_pami_feature_labels_to_file(targetfile):
    f = open(targetfile, 'w')  # creates the file if it does not exist
    f.write(',short,long\n')
    i = 0
    for item1, item2 in zip(full_label_list, full_long_label_list):
        f.write(str(i) + ',' + item1 + ',' + item2 + '\n')
        i += 1
    f.close()
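A sketch of the CSV that write_pami_feature_labels_to_file produces; the first rows follow from full_label_list and full_long_label_list as defined above:

,short,long
0,fixation rate,fixation rate
1,saccade rate,saccade rate
2,small sacc. rate,small saccade rate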
0  featureExtraction/__init__.py  Normal file

327  featureExtraction/event_detection.py  Normal file
@@ -0,0 +1,327 @@
import numpy as np
import sys
import math
from config import names as gs


def get_fixation_list(gaze, errors, xi, yi, ti, fixation_radius_threshold, fixation_duration_threshold, pupil_diameter):
    n, m = gaze.shape
    fixations = []
    fixation = []  # single fixation, to be appended to fixations
    counter = 0  # number of points in the fixation
    sumx = 0  # used to compute the center of a fixation in x and y direction
    sumy = 0
    distance = 0  # captures the distance of a current sample from the fixation center
    i = 0  # iterates through the gaze samples

    while i < n - 1:
        x = gaze[i, xi]
        y = gaze[i, yi]

        if counter == 0:
            # ignore erroneous samples before a fixation
            if errors[i]:
                i += 1
                continue
            centerx = x
            centery = y
        else:
            centerx = np.true_divide(sumx, counter)
            centery = np.true_divide(sumy, counter)

        if not errors[i]:  # only update the distance if the current sample is not erroneous
            distance = np.sqrt((x - centerx) * (x - centerx) + (y - centery) * (y - centery))

        if distance > fixation_radius_threshold:  # start a new fixation
            if gaze[(i - 1), ti] - gaze[(i - counter), ti] >= fixation_duration_threshold:
                start_index = i - counter + 1
                end_index = i - 1 - 1

                # discard fixations with more than 50% erroneous samples
                percentage_error = np.sum(errors[start_index:(end_index + 1)]) / float(end_index - start_index)
                if percentage_error >= 0.5:
                    if errors[i]:
                        i += 1
                        counter = 0
                    else:
                        counter = 1
                        sumx = x
                        sumy = y
                    continue

                gaze_indices = np.arange(start_index, end_index + 1)[np.logical_not(errors[start_index:(end_index + 1)])]

                start_index = gaze_indices[0]
                end_index = gaze_indices[-1]

                gazex = gaze[start_index:(end_index + 1), xi][np.logical_not(errors[start_index:(end_index + 1)])]
                gazey = gaze[start_index:(end_index + 1), yi][np.logical_not(errors[start_index:(end_index + 1)])]
                gazet = gaze[start_index:(end_index + 1), ti][np.logical_not(errors[start_index:(end_index + 1)])]

                # extract fixation characteristics
                fixation.append(np.mean(gazex))  # 0-1. mean x, y
                fixation.append(np.mean(gazey))
                fixation.append(np.var(gazex))  # 2-3. var x, y
                fixation.append(np.var(gazey))
                fixation.append(gazet[0])  # 4-5. t_start, t_end
                fixation.append(gazet[-1])
                fixation.append(gaze_indices[0])  # 6-7. index_start, index_end
                fixation.append(gaze_indices[-1])

                ds = ((pupil_diameter[start_index:(end_index + 1), 1] + pupil_diameter[start_index:(end_index + 1), 2]) / 2.)[np.logical_not(errors[start_index:(end_index + 1)])]

                fixation.append(np.mean(ds))  # 8. mean pupil diameter
                fixation.append(np.var(ds))  # 9. var pupil diameter

                succ_dx = gazex[1:] - gazex[:-1]
                succ_dy = gazey[1:] - gazey[:-1]
                succ_angles = np.arctan2(succ_dy, succ_dx)

                fixation.append(np.mean(succ_angles))  # 10. mean successive angle
                fixation.append(np.var(succ_angles))  # 11. var successive angle
                fixations.append(fixation)
                assert len(fixation) == len(gs.fixations_list_labels)

            # set up a new fixation
            fixation = []
            if errors[i]:
                i += 1
                counter = 0
            else:
                counter = 1
                sumx = x
                sumy = y
        else:
            if not errors[i]:
                counter += 1
                sumx += x
                sumy += y

        i += 1
    return fixations


def get_saccade_list(gaze, fixations, xi, yi, ti, pupil_diameter, fixation_radius_threshold, errors,
                     saccade_min_velocity, max_saccade_duration):
    saccades = []
    wordbook_string = []

    # each movement between two subsequent fixations could be a saccade, but...
    for i in xrange(1, len(fixations)):
        # ...not if the window is too long
        duration = float(fixations[i][gs.fix_start_t_i] - fixations[i - 1][gs.fix_end_t_i])
        if duration > max_saccade_duration:
            continue

        start_index = fixations[i - 1][gs.fix_end_index_i]
        end_index = fixations[i][gs.fix_start_index_i]

        gazex = gaze[start_index:(end_index + 1), xi][np.logical_not(errors[start_index:(end_index + 1)])]
        gazey = gaze[start_index:(end_index + 1), yi][np.logical_not(errors[start_index:(end_index + 1)])]
        gazet = gaze[start_index:(end_index + 1), ti][np.logical_not(errors[start_index:(end_index + 1)])]

        dx = np.abs(gazex[1:] - gazex[:-1])
        dy = np.abs(gazey[1:] - gazey[:-1])
        dt = np.abs(gazet[1:] - gazet[:-1])

        # ...not if less than 2 non-erroneous samples are left:
        if len(dt) < 2:
            continue

        distance = np.linalg.norm([dx, dy])
        peak_velocity = np.amax(distance / dt)

        start_x = gazex[0]
        start_y = gazey[0]
        end_x = gazex[-1]
        end_y = gazey[-1]

        dx = end_x - start_x
        dy = end_y - start_y

        # ...not if the amplitude is shorter than a fifth of fixation_radius_threshold
        amplitude = np.linalg.norm([dx, dy])
        if amplitude < fixation_radius_threshold / 5.0:
            continue

        # ...not if the peak velocity is very low
        if peak_velocity < saccade_min_velocity:
            continue

        percentage_error = np.sum(errors[start_index:(end_index + 1)]) / float(end_index - start_index)
        # ...not if more than 50% of the data are erroneous
        if percentage_error >= 0.5:
            continue
        else:  # found a saccade!
            # compute characteristics of the saccade, like start and end point, amplitude, ...
            saccade = []
            saccade.append(start_x)  # 0-1. start x, y
            saccade.append(start_y)
            saccade.append(end_x)  # 2-3. end x, y
            saccade.append(end_y)

            if dx == 0:
                radians = 0
            else:
                radians = np.arctan(np.true_divide(dy, dx))

            if dx > 0:
                if dy < 0:
                    radians += (2 * np.pi)
            else:
                radians = np.pi + radians

            saccade.append(radians)  # 4. angle
            saccade.append(fixations[i - 1][gs.fix_end_t_i])  # 5-6. t_start, t_end
            saccade.append(fixations[i][gs.fix_start_t_i])
            saccade.append(start_index)  # 7-8. index_start, index_end
            saccade.append(end_index)

            ds = (pupil_diameter[start_index:(end_index + 1), 1] + pupil_diameter[start_index:(end_index + 1), 2]) / 2.0
            saccade.append(np.mean(ds))  # 9. mean pupil diameter
            saccade.append(np.var(ds))  # 10. var pupil diameter
            saccade.append(peak_velocity)  # 11. peak velocity

            saccade.append(amplitude)  # 12. amplitude

            # append a character representing this kind of saccade to the wordbook_string, which will be used for n-gram features
            sac_id = get_dictionary_entry_for_saccade(amplitude, fixation_radius_threshold, radians)
            wordbook_string.append(sac_id)
            saccades.append(saccade)

            # assert all saccade characteristics were computed
            assert len(saccade) == len(gs.saccades_list_labels)
    return saccades, wordbook_string


def get_blink_list(event_strings, gaze, ti):
    assert len(event_strings) == len(gaze)

    # detect blinks
    blinks = []
    blink = []  # single blink, to be appended to blinks
    i = 0
    starti = i
    blink_started = False

    while i < len(event_strings) - 1:
        if event_strings[i] == 'Blink' and not blink_started:  # start a new blink
            starti = i
            blink_started = True
        elif blink_started and not event_strings[i] == 'Blink':
            blink.append(gaze[starti, ti])
            blink.append(gaze[i - 1, ti])
            blink.append(starti)
            blink.append(i - 1)
            blinks.append(blink)
            assert len(blink) == len(gs.blink_list_labels)
            blink_started = False
            blink = []
        i += 1

    return blinks
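A minimal worked example of the blink extraction above, with hypothetical event strings and ti as the timestamp column:

# event_strings = ['Fixation', 'Blink', 'Blink', 'Saccade', ...]
# with gaze[1, ti] == 0.50 and gaze[2, ti] == 0.53, the 'Blink' run from index 1 to 2
# becomes the entry [0.50, 0.53, 1, 2], i.e. [t start, t end, start index, end index]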

def get_dictionary_entry_for_saccade(amplitude, fixation_radius_threshold, degree_radians):
    # Saccade type: small iff the amplitude is less than 2 fixation_radius_thresholds
    #          U
    #       O     A
    #     N    u    B
    #    M   n   b   C
    #    L   l     r   R
    #    K   j   f   E
    #     J    d    F
    #       H     G
    #          D

    degrees = np.true_divide(degree_radians * 180.0, np.pi)
    if amplitude < 2 * fixation_radius_threshold:
        d_degrees = degrees / (np.true_divide(90, 4))
        if d_degrees < 1:
            sac_id = 'r'
        elif d_degrees < 3:
            sac_id = 'b'
        elif d_degrees < 5:
            sac_id = 'u'
        elif d_degrees < 7:
            sac_id = 'n'
        elif d_degrees < 9:
            sac_id = 'l'
        elif d_degrees < 11:
            sac_id = 'j'
        elif d_degrees < 13:
            sac_id = 'd'
        elif d_degrees < 15:
            sac_id = 'f'
        elif d_degrees < 16:
            sac_id = 'r'
        else:
            print
            print 'error! d_degrees cannot be matched to a sac_id for a small saccade ', d_degrees
            sys.exit(1)

    else:  # large saccade
        d_degrees = degrees / (np.true_divide(90, 8))

        if d_degrees < 1:
            sac_id = 'R'
        elif d_degrees < 3:
            sac_id = 'C'
        elif d_degrees < 5:
            sac_id = 'B'
        elif d_degrees < 7:
            sac_id = 'A'
        elif d_degrees < 9:
            sac_id = 'U'
        elif d_degrees < 11:
            sac_id = 'O'
        elif d_degrees < 13:
            sac_id = 'N'
        elif d_degrees < 15:
            sac_id = 'M'
        elif d_degrees < 17:
            sac_id = 'L'
        elif d_degrees < 19:
            sac_id = 'K'
        elif d_degrees < 21:
            sac_id = 'J'
        elif d_degrees < 23:
            sac_id = 'H'
        elif d_degrees < 25:
            sac_id = 'D'
        elif d_degrees < 27:
            sac_id = 'G'
        elif d_degrees < 29:
            sac_id = 'F'
        elif d_degrees < 31:
            sac_id = 'E'
        elif d_degrees < 33:
            sac_id = 'R'
        else:
            print 'error! d_degrees cannot be matched to a sac_id for a large saccade: ', d_degrees
            sys.exit(1)
    return sac_id
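To make the binning concrete, two worked examples (the numbers follow from the code above): a small saccade at 0 radians has degrees = 0, so d_degrees = 0 / 22.5 = 0 < 1 and it maps to 'r' (small, rightward); a large saccade at 90 degrees has d_degrees = 90 / 11.25 = 8, which falls into the bin 7 <= d_degrees < 9 and maps to 'U' (large, upward).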

def detect_all(gaze, errors, ti, xi, yi, fixation_radius_threshold=0.01, pupil_diameter=None, event_strings=None,
               fixation_duration_threshold=0.1, saccade_min_velocity=2, max_saccade_duration=0.1):
    """
    :param gaze: gaze data, typically [t, x, y]
    :param fixation_radius_threshold: dispersion threshold
    :param fixation_duration_threshold: temporal threshold
    :param ti, xi, yi: column indices for gaze, i.e. for [t, x, y]: ti=0, xi=1, yi=2
    :param pupil_diameter: pupil diameter values, same length as gaze
    :param event_strings: list of events, here provided by SMI; used to extract blink information
    """
    fixations = get_fixation_list(gaze, errors, xi, yi, ti, fixation_radius_threshold, fixation_duration_threshold,
                                  pupil_diameter)
    saccades, wordbook_string = get_saccade_list(gaze, fixations, xi, yi, ti, pupil_diameter,
                                                 fixation_radius_threshold, errors, saccade_min_velocity,
                                                 max_saccade_duration)
    blinks = get_blink_list(event_strings, gaze, ti)

    return fixations, saccades, blinks, wordbook_string
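A minimal usage sketch for detect_all (assuming gaze, pupil_diameter and events have been loaded as in 00_compute_features.py, with gaze columns [t, x, y]; the all-False error mask is just for the sketch):

import numpy as np
from featureExtraction import event_detection as ed

errors = np.zeros(len(gaze), dtype=bool)  # assume no erroneous samples for this sketch
fixations, saccades, blinks, wordbook_string = ed.detect_all(
    gaze, errors, 0, 1, 2, fixation_radius_threshold=0.025,
    pupil_diameter=pupil_diameter, event_strings=events,
    fixation_duration_threshold=0.1, saccade_min_velocity=2, max_saccade_duration=0.5)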
582  featureExtraction/gaze_analysis.py  Normal file
@@ -0,0 +1,582 @@
#!/usr/bin/python
import numpy as np
import sys, os
from featureExtraction import event_detection as ed
import operator
from config import names as gs


class gazeAnalysis (object):
    # dictionary for saccade-based n-grams:
    # each character encodes one direction; capital characters stand for long saccades, the others for short ones
    # short means the saccade amplitude is less than 2 fixation_radius_thresholds
    #          U
    #       O     A
    #     N    u    B
    #    M   n   b   C
    #    L   l  .  r   R
    #    K   j   f   E
    #     J    d    F
    #       H     G
    #          D
    sacc_dictionary = ['A', 'B', 'C', 'R', 'E', 'F', 'G', 'D', 'H', 'J', 'K', 'L', 'M', 'N', 'O', 'U', 'u', 'b', 'r', 'f',
                       'd', 'j', 'l', 'n']
    sacc_bins_two = [a + b for a in sacc_dictionary for b in sacc_dictionary]
    sacc_bins_three = [a + b + c for a in sacc_dictionary for b in sacc_dictionary for c in sacc_dictionary]
    sacc_bins_four = [a + b + c + d for a in sacc_dictionary for b in sacc_dictionary for c in sacc_dictionary for d in sacc_dictionary]
    sacc_bins = [sacc_dictionary, sacc_bins_two, sacc_bins_three, sacc_bins_four]

    # dictionary for saccade- and fixation-based n-grams:
    # S are saccades, long or short (i.e. longer or shorter than the fixation radius), and up/down/right/left
    # e.g. S_lu is a long saccade up
    # F are fixations, either long or short (i.e. longer or shorter than twice the minimum fixation duration)
    # saccFix_dictionary = ['S_lu', 'S_ld', 'S_lr', 'S_ll', 'S_su', 'S_sd', 'S_sr', 'S_sl', 'F_l', 'F_s']
    saccFix_dictionary = ['U', 'D', 'R', 'L', 'u', 'd', 'r', 'l', 'F', 'f']
    saccFix_bins_two = [a + b for a in saccFix_dictionary for b in saccFix_dictionary]
    saccFix_bins_three = [a + b + c for a in saccFix_dictionary for b in saccFix_dictionary for c in saccFix_dictionary]
    saccFix_bins_four = [a + b + c + d for a in saccFix_dictionary for b in saccFix_dictionary for c in saccFix_dictionary for d in saccFix_dictionary]
    saccFix_bins = [saccFix_dictionary, saccFix_bins_two, saccFix_bins_three, saccFix_bins_four]
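The sizes of these n-gram vocabularies follow directly from the alphabet sizes: the saccade alphabet has 24 characters, giving 24, 24^2 = 576, 24^3 = 13824 and 24^4 = 331776 possible 1- to 4-grams; the saccade/fixation alphabet has 10 characters, giving 10, 100, 1000 and 10000 bins.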

    def __init__(self, gaze, fixation_radius_threshold, fixation_duration_threshold, saccade_min_velocity, max_saccade_duration,
                 pupil_diameter=None, event_strings=None, ti=0, xi=1, yi=2):
        assert gaze.size > 0

        # save the data in the instance
        self.gaze = gaze
        self.diams = pupil_diameter
        self.event_strings = event_strings

        # save constants, indices and thresholds that will be used multiple times
        self.fixation_radius_threshold = fixation_radius_threshold
        self.fixation_duration_threshold = fixation_duration_threshold
        self.xi = xi
        self.yi = yi
        self.ti = ti

        # detect errors, fixations, saccades and blinks
        self.errors = self.detect_errors()
        self.fixations, self.saccades, self.blinks, self.wordbook_string = \
            ed.detect_all(self.gaze, self.errors, self.ti, self.xi, self.yi, pupil_diameter=pupil_diameter,
                          event_strings=event_strings, fixation_duration_threshold=fixation_duration_threshold,
                          fixation_radius_threshold=fixation_radius_threshold, saccade_min_velocity=saccade_min_velocity,
                          max_saccade_duration=max_saccade_duration)

    def detect_errors(self, confidence_threshold=0.8, outlier_threshold=0.5):
        """
        :param confidence_threshold: threshold below which all gaze data is deleted
        :param outlier_threshold: threshold beyond which gaze must not be outside the calibration area (i.e. [0,1])
        """
        errors = np.full((len(self.gaze)), False, dtype=bool)

        # gaze is nan
        errors[np.isnan(self.gaze[:, self.xi])] = True
        errors[np.isnan(self.gaze[:, self.yi])] = True

        # gaze outside a certain range
        errors[self.gaze[:, self.xi] < -outlier_threshold] = True
        errors[self.gaze[:, self.xi] > outlier_threshold + 1] = True

        errors[self.gaze[:, self.yi] < -outlier_threshold] = True
        errors[self.gaze[:, self.yi] > outlier_threshold + 1] = True

        return errors

    def get_window_features(self, sliding_window_size, sliding_window_step_size, start_index=-1, end_index=-1):
        """
        computes features using a sliding window approach with the given sliding_window_size and sliding_window_step_size
        """
        # if no start and end index are given, use all data
        if start_index == -1 and end_index == -1:
            start_index = 0
            end_index = len(self.gaze[:, 0]) - 1

        # compute start and end times of each resulting sliding window
        window_times = self.get_sliding_windows(start_index, end_index, sliding_window_size, sliding_window_step_size)

        # compute features for each of these windows:
        window_feature_list = []
        for [a, b, at, bt] in window_times:
            overallstats = self.get_full_feature_vector(a, b)
            window_feature_list.append(overallstats)
            assert len(gs.full_label_list) == len(window_feature_list[0])

        window_feature_list = np.array(window_feature_list)
        window_times = np.array(window_times)
        return window_feature_list, window_times

    def get_full_feature_vector(self, start_index, end_index):
        """
        assembles the full feature vector from its parts:
        features based on fixations/saccades/blinks, raw data, heatmaps, n-grams based on saccades, and n-grams based on saccades and fixations
        """
        # features based on events, i.e. fixations/saccades/blinks
        features = self.get_event_features(start_index, end_index)
        assert len(gs.event_feature_labels) == len(features)

        # features based on raw data, like quartiles of gaze positions
        raw_features = self.get_raw_features(start_index, end_index)
        features.extend(raw_features)
        assert len(gs.position_feature_labels) == len(raw_features)

        # heatmap features
        heatmap_features = self.get_heatmap_features(start_index, end_index)
        features.extend(heatmap_features)
        assert len(gs.heatmap_feature_labels) == len(heatmap_features)

        # n-gram features based on saccades
        sacc_wordbook_features = self.get_sacc_ngram_features(start_index, end_index)
        features.extend(sacc_wordbook_features)
        assert len(gs.get_wordbook_feature_labels('')) == len(sacc_wordbook_features)

        # n-gram features based on saccades and fixations
        saccFix_wordbook_features = self.get_saccFix_ngram_features(start_index, end_index)
        features.extend(saccFix_wordbook_features)
        assert len(gs.get_wordbook_feature_labels('')) == len(saccFix_wordbook_features)

        return features

    def get_event_features(self, start_index, end_index):
        """
        computes features based on fixations, saccades and blinks within the selected gaze window
        """
        event_features = []

        # get non-erroneous samples between start_index and end_index:
        x, y, d = self.get_x_y_d_part(start_index, end_index)

        data = self.gaze[start_index:(end_index + 1), :]
        fixations, saccades, wordbook_string, blinks = self.get_FixSacWb_timed(start_index, end_index)

        n, m = data.shape
        timespan = (data[-1, self.ti] - data[0, self.ti])

        num_fix = len(fixations)
        num_sacc = len(saccades)

        if timespan == 0:
            fix_rate = 0
            sacc_rate = 0
        else:
            fix_rate = num_fix / float(timespan)
            sacc_rate = num_sacc / float(timespan)

        event_features.append(fix_rate)  # 0. rate of fixations
        event_features.append(sacc_rate)  # 1. rate of saccades

        sacc_UR_L = wordbook_string.count('U') + wordbook_string.count('A') + wordbook_string.count('B') + wordbook_string.count('C')
        sacc_BR_L = wordbook_string.count('R') + \
                    wordbook_string.count('E') + \
                    wordbook_string.count('F') + \
                    wordbook_string.count('G')
        sacc_UL_L = wordbook_string.count('O') \
                    + wordbook_string.count('N') \
                    + wordbook_string.count('M') \
                    + wordbook_string.count('L')
        sacc_BL_L = wordbook_string.count('K') \
                    + wordbook_string.count('J') \
                    + wordbook_string.count('H') \
                    + wordbook_string.count('D')

        sacc_UR_S = wordbook_string.count('u') \
                    + wordbook_string.count('b')
        sacc_BR_S = wordbook_string.count('r') \
                    + wordbook_string.count('f')
        sacc_UL_S = wordbook_string.count('n') \
                    + wordbook_string.count('l')
        sacc_BL_S = wordbook_string.count('j') \
                    + wordbook_string.count('d')

        num_s_sacc = sacc_UR_S + sacc_BR_S + sacc_UL_S + sacc_BL_S
        num_la_sacc = sacc_UR_L + sacc_BR_L + sacc_UL_L + sacc_BL_L
        num_r_sacc = sacc_UR_S + sacc_BR_S + sacc_UR_L + sacc_BR_L
        num_l_sacc = sacc_UL_S + sacc_BL_S + sacc_UL_L + sacc_BL_L

        if timespan > 0:
            event_features.append(num_s_sacc / float(timespan))  # 2. rate of small saccades
            event_features.append(num_la_sacc / float(timespan))  # 3. rate of large saccades
            event_features.append(num_r_sacc / float(timespan))  # 4. rate of pos saccades
            event_features.append(num_l_sacc / float(timespan))  # 5. rate of neg saccades
        else:
            event_features.extend([0] * 4)

        if num_fix > 0:
            event_features.append(num_sacc / float(num_fix))  # 6. ratio saccades / fixations
        else:
            event_features.append(0)

        if num_sacc > 0:
            event_features.append(num_s_sacc / float(num_sacc))  # 7. ratio small sacc
            event_features.append(num_la_sacc / float(num_sacc))  # 8. ratio large sacc
            event_features.append(num_r_sacc / float(num_sacc))  # 9. ratio pos sacc
            event_features.append(num_l_sacc / float(num_sacc))  # 10. ratio neg sacc
        else:
            event_features.extend([0] * 4)

        sacc_array = np.array(saccades)
        fix_array = np.array(fixations)

        if sacc_array.size > 0:
            # amplitude features
            amplitudes = sacc_array[:, gs.sacc_amplitude_i]
            event_features.append(np.mean(amplitudes))  # 11. mean sacc amplitude
            event_features.append(np.var(amplitudes))  # 12. var sacc amplitude
            event_features.append(amplitudes.min())  # 13. min sacc amplitude
            event_features.append(amplitudes.max())  # 14. max sacc amplitude

            # peak velocity features
            velocities = sacc_array[:, gs.sacc_peak_vel_i]
            event_features.append(np.mean(velocities))  # 15. mean peak velocity
            event_features.append(np.var(velocities))  # 16. var peak velocity
            event_features.append(velocities.min())  # 17. min peak velocity
            event_features.append(velocities.max())  # 18. max peak velocity

            if sacc_array[0, :].size == 13:
                event_features.append(np.mean(sacc_array[:, gs.sacc_mean_diam_i]))  # 19. mean mean diameter
                event_features.append(np.var(sacc_array[:, gs.sacc_mean_diam_i]))  # 20. var mean diameter
                event_features.append(np.mean(sacc_array[:, gs.sacc_var_diam_i]))  # 21. mean var diameter
                event_features.append(np.var(sacc_array[:, gs.sacc_var_diam_i]))  # 22. var var diameter
            else:
                event_features.extend([0] * 4)
        else:
            event_features.extend([0] * 12)

        if fix_array.size > 0:
            durations = np.array(fix_array[:, gs.fix_end_t_i]) - np.array(fix_array[:, gs.fix_start_t_i])
            event_features.append(np.mean(durations))  # 23. mean fix duration
            event_features.append(np.var(durations))  # 24. var fix duration
            event_features.append(durations.min())  # 25. min fix duration
            event_features.append(durations.max())  # 26. max fix duration
            event_features.append(durations.sum())  # 27. dwelling time

            event_features.append(np.mean(fix_array[:, gs.fix_mean_succ_angles]))  # mean mean subsequent angle
            event_features.append(np.var(fix_array[:, gs.fix_mean_succ_angles]))  # var mean subsequent angle
            event_features.append(np.mean(fix_array[:, gs.fix_var_succ_angles]))  # mean var subsequent angle
            event_features.append(np.var(fix_array[:, gs.fix_var_succ_angles]))  # var var subsequent angle

            lnnII = np.logical_not(np.isnan(fix_array[:, gs.fix_var_x_i]))
            lnnIII = np.logical_not(np.isnan(fix_array[:, gs.fix_var_y_i]))
            event_features.append(np.mean(fix_array[lnnII, gs.fix_var_x_i]))  # mean var x
            event_features.append(np.mean(fix_array[lnnIII, gs.fix_var_y_i]))  # mean var y
            event_features.append(np.var(fix_array[lnnII, gs.fix_var_x_i]))  # var var x
            event_features.append(np.var(fix_array[lnnIII, gs.fix_var_y_i]))  # var var y

            if fix_array[0, :].size == 12:
                event_features.append(np.mean(fix_array[:, gs.fix_mean_diam_i]))  # mean mean diameter
                event_features.append(np.var(fix_array[:, gs.fix_mean_diam_i]))  # var mean diameter
                event_features.append(np.mean(fix_array[:, gs.fix_var_diam_i]))  # mean var diameter
                event_features.append(np.var(fix_array[:, gs.fix_var_diam_i]))  # var var diameter
            else:
                event_features.extend([0] * 4)
        else:
            event_features.extend([0] * 17)

        blink_array = np.array(blinks)
        if blink_array.size > 0:
            durations = np.array(blink_array[:, 1]) - np.array(blink_array[:, 0])
            event_features.append(np.mean(durations))  # mean blink duration
            event_features.append(np.var(durations))  # var blink duration
            event_features.append(durations.min())  # min blink duration
            event_features.append(durations.max())  # max blink duration
            event_features.append(np.true_divide(len(blink_array), timespan))  # blink rate
        else:
            event_features.extend([0] * 5)
        return event_features

    def get_x_y_d_part(self, start_index, end_index):
        x = self.gaze[start_index:(end_index + 1), self.xi]
        y = self.gaze[start_index:(end_index + 1), self.yi]
        d = (self.diams[start_index:(end_index + 1), 1] + self.diams[start_index:(end_index + 1), 2]) / 2.0

        err = self.errors[start_index:(end_index + 1)]

        x = x[np.logical_not(err)]
        y = y[np.logical_not(err)]
        d = d[np.logical_not(err)]
        return x, y, d

    def get_raw_features(self, start_index, end_index):
        """
        computes features based on raw gaze data, like percentiles of x coordinates
        """
        raw_features = []

        x, y, d = self.get_x_y_d_part(start_index, end_index)

        for a in [x, y, d]:
            raw_features.append(np.mean(a))
        for a in [x, y, d]:
            raw_features.append(np.amin(a))
        for a in [x, y, d]:
            raw_features.append(np.amax(a))
        for a in [x, y, d]:
            raw_features.append(np.amax(a) - np.amin(a))
        for a in [x, y, d]:
            raw_features.append(np.std(a))
        for a in [x, y, d]:
            raw_features.append(np.median(a))
        for a in [x, y, d]:
            raw_features.append(np.percentile(a, 25))
        for a in [x, y, d]:
            raw_features.append(np.percentile(a, 75))
        for a in [x, y, d]:
            raw_features.append(np.percentile(a, 75) - np.percentile(a, 25))
        for a in [x, y, d]:
            raw_features.append(np.mean(np.abs(a[1:] - a[:-1])))
        for a in [x, y, d]:
            raw_features.append(np.mean(a[1:] - a[:-1]))

        dx = x[:-1] - x[1:]
        dy = y[:-1] - y[1:]
        succ_angles = np.arctan2(dy, dx)
        raw_features.append(np.mean(succ_angles))  # mean subsequent angle
        return raw_features

    def get_heatmap_features(self, start_index, end_index):
        """
        computes a heatmap over the raw gaze positions, in an 8x8 grid
        """
        x, y, d = self.get_x_y_d_part(start_index, end_index)

        xmin = np.percentile(x, 2.5)
        xmax = np.percentile(x, 97.5)
        ymin = np.percentile(y, 2.5)
        ymax = np.percentile(y, 97.5)

        heatmap, xedges, yedges = np.histogram2d(x, y, bins=(8, 8), range=[[xmin, xmax], [ymin, ymax]])
        normalised_flat_heatmap = heatmap.flatten() / np.sum(heatmap)
        return normalised_flat_heatmap
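A self-contained sketch of the heatmap computation above on synthetic data (np.histogram2d and np.percentile are standard numpy; the 2.5/97.5 percentile clipping mirrors the method):

import numpy as np

x = np.random.rand(1000)  # synthetic gaze coordinates, for the sketch only
y = np.random.rand(1000)
xmin, xmax = np.percentile(x, 2.5), np.percentile(x, 97.5)
ymin, ymax = np.percentile(y, 2.5), np.percentile(y, 97.5)
heatmap, _, _ = np.histogram2d(x, y, bins=(8, 8), range=[[xmin, xmax], [ymin, ymax]])
flat = heatmap.flatten() / np.sum(heatmap)
assert len(flat) == 64  # one value per grid cell, matching gs.heatmap_feature_labels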

    def get_sacc_ngram_features(self, start_index, end_index):
        # find those saccades that either start or end between start_index and end_index, or start before start_index and end after end_index
        mysacc = [sacc for sacc in self.saccades if (sacc[gs.sacc_start_index_i] > start_index and sacc[gs.sacc_start_index_i] < end_index)
                  or (sacc[gs.sacc_end_index_i] > start_index and sacc[gs.sacc_end_index_i] < end_index)
                  or (sacc[gs.sacc_start_index_i] < start_index and sacc[gs.sacc_end_index_i] > end_index)]
        # create the string representing all saccades in mysacc
        # (indexing into the global wordbook_string via the saccade's position in self.saccades, as in get_FixSacWb_timed)
        mywbs = [self.wordbook_string[self.saccades.index(sacc)] for sacc in mysacc]

        # create all possible n-grams of a certain length
        ngrams = []
        ngrams.append(mywbs)
        for n in xrange(2, 5):
            ngrams.append([reduce(operator.add, mywbs[i:i + n]) for i in range(len(mywbs) - n)])

        # compute histograms of the actual n-grams occurring in the data
        histograms = []
        for i in xrange(0, 4):
            histograms.append(dict((x, ngrams[i].count(x)) for x in self.sacc_bins[i]))

        # compute features from each histogram and append them to one list
        sacc_ngram_features = []
        for h in histograms:
            wb_feat = self.get_ngram_features(h)
            sacc_ngram_features.extend(wb_feat)

        return sacc_ngram_features

    def get_saccFix_wb_string_saccades(self, sacc):
        """
        returns the string for a single saccade that will be used for n-gram features based on saccades and fixations
        """
        amplitude = sacc[gs.sacc_amplitude_i]
        angle_rad = sacc[gs.sacc_angle_i]
        angle_deg = np.true_divide(angle_rad * 180.0, np.pi)

        # 0 degrees is pointing to the right
        if angle_deg < 45:
            wb_str = 'r'
        elif angle_deg < 135:
            wb_str = 'u'
        elif angle_deg < 225:
            wb_str = 'l'
        elif angle_deg < 315:
            wb_str = 'd'
        else:
            wb_str = 'r'

        if amplitude >= 2 * self.fixation_radius_threshold:  # long saccade: amplitude of at least 2 fixation_radius_thresholds
            wb_str = wb_str.upper()

        return wb_str

    def get_saccFix_wb_string_fixations(self, fix):
        """
        returns the string for a single fixation that will be used for n-gram features based on saccades and fixations
        """
        if fix[gs.fix_end_t_i] - fix[gs.fix_start_t_i] < 2 * self.fixation_duration_threshold:
            return 'f'
        else:
            return 'F'

    def get_saccFix_ngram_features(self, start_index, end_index):
        """
        computes n-gram features based on saccades and fixations
        """
        # find all saccades and fixations between start_index and end_index,
        # and create a string of their encodings
        sacc_index = 0
        fix_index = 0
        wordbook_string = []

        sacc_start_i = gs.sacc_start_index_i
        sacc_end_i = gs.sacc_end_index_i
        fix_start_i = gs.fix_start_index_i
        fix_end_i = gs.fix_end_index_i

        while (self.saccades[sacc_index][sacc_end_i] < start_index) and (sacc_index < len(self.saccades) - 1):
            sacc_index += 1

        while (self.fixations[fix_index][fix_end_i] < start_index) and (fix_index < len(self.fixations) - 1):
            fix_index += 1

        while sacc_index < len(self.saccades) and fix_index < len(self.fixations):
            if (self.saccades[sacc_index][sacc_start_i] < end_index) and (self.fixations[fix_index][fix_start_i] < end_index):
                if self.saccades[sacc_index][sacc_start_i] < self.fixations[fix_index][fix_start_i]:
                    wordbook_string.append(self.get_saccFix_wb_string_saccades(self.saccades[sacc_index]))
                    sacc_index += 1
                else:
                    wordbook_string.append(self.get_saccFix_wb_string_fixations(self.fixations[fix_index]))
                    fix_index += 1
            elif self.saccades[sacc_index][sacc_start_i] < end_index:
                wordbook_string.append(self.get_saccFix_wb_string_saccades(self.saccades[sacc_index]))
                sacc_index += 1
            elif self.fixations[fix_index][fix_start_i] < end_index:
                wordbook_string.append(self.get_saccFix_wb_string_fixations(self.fixations[fix_index]))
                fix_index += 1
            else:
                sacc_index += 1
                fix_index += 1

        # compute all possible n-grams
        ngrams = []
        ngrams.append(wordbook_string)
        for n in xrange(2, 5):
            ngrams.append([reduce(operator.add, wordbook_string[i:i + n]) for i in range(len(wordbook_string) - n)])

        # compute histograms for the n-grams
        histograms = []
        for i in xrange(0, 4):
            histograms.append(dict((x, ngrams[i].count(x)) for x in self.saccFix_bins[i]))

        # compute features from each histogram and append them to one list
        ngram_features = []
        for h in histograms:
            wb_feat = self.get_ngram_features(h)
            ngram_features.extend(wb_feat)

        return ngram_features

    def get_FixSacWb_timed(self, start_index, end_index):
        """
        returns lists of fixations, saccades, blinks and the associated wordbook_string
        for the time between start_index and end_index
        """
        myfix = [fix for fix in self.fixations if (fix[gs.fix_start_index_i] > start_index
                                                   and fix[gs.fix_start_index_i] < end_index)
                 or (fix[gs.fix_end_index_i] > start_index and fix[gs.fix_end_index_i] < end_index)
                 or (fix[gs.fix_start_index_i] < start_index and fix[gs.fix_end_index_i] > end_index)]

        mysacc = [sacc for sacc in self.saccades if (sacc[gs.sacc_start_index_i] > start_index and sacc[gs.sacc_start_index_i] < end_index)
                  or (sacc[gs.sacc_end_index_i] > start_index and sacc[gs.sacc_end_index_i] < end_index)
                  or (sacc[gs.sacc_start_index_i] < start_index and sacc[gs.sacc_end_index_i] > end_index)]

        mywbs = [self.wordbook_string[self.saccades.index(sacc)] for sacc in mysacc]
        blinks = [b for b in self.blinks if (b[gs.blink_start_index_i] > start_index and b[gs.blink_start_index_i] < end_index)
                  or (b[gs.blink_end_index_i] > start_index and b[gs.blink_end_index_i] < end_index)
                  or (b[gs.blink_start_index_i] < start_index and b[gs.blink_end_index_i] > end_index)]

        return myfix, mysacc, mywbs, blinks

    def get_sliding_windows(self, start_index, end_index, sliding_window_size, sliding_window_step_size):
        """
        computes the list of time windows resulting from the sliding window approach with the given sliding_window_size and sliding_window_step_size
        """
        window_times = []  # consisting of lists [a, b, t_a, t_b] where a is the start index and b is the end index

        a = start_index
        b = min(self.getEndWindow(a, sliding_window_size), end_index)
        if self.check_sliding_window_conditions(a, b, end_index, sliding_window_size):
            window_times.append([a, b, self.gaze[a, self.ti], self.gaze[b, self.ti]])

        while b < end_index:
            a = self.getStartWindow(a, sliding_window_step_size)
            b = self.getEndWindow(a, sliding_window_size)
            # check if some conditions are fulfilled and if so, add the window to the list
            if self.check_sliding_window_conditions(a, b, end_index, sliding_window_size):
                window_times.append([a, b, self.gaze[a, self.ti], self.gaze[b, self.ti]])
        return window_times
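A worked example of the windows this produces: with sliding_window_size = 15 and step size conf.get_step_size(15) = 7.5 on a recording starting at t = 0, the accepted windows cover roughly [0, 15], [7.5, 22.5], [15, 30], ... seconds; each entry of window_times is [start index, end index, start time, end time], and check_sliding_window_conditions below discards windows whose duration deviates more than 10% from the nominal size.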

    def check_sliding_window_conditions(self, a, b, end_index, sliding_window_size):
        # discard too short or too long sliding windows
        window_duration = self.gaze[b, self.ti] - self.gaze[a, self.ti]
        min_duration = sliding_window_size - 0.1 * sliding_window_size
        max_duration = sliding_window_size + 0.1 * sliding_window_size
        if window_duration < min_duration or window_duration > max_duration:
            return False

        if b <= end_index:
            errors_in_window = np.sum(self.errors[a:(b + 1)]) / float(b - a)

            if errors_in_window < 0.5:
                # discard the window if no saccade or fixation was detected
                fixations, saccades, wordbook_string, blinks = self.get_FixSacWb_timed(a, b)
                if len(fixations) == 0 and len(saccades) == 0:
                    return False

                xgaze = self.gaze[a:(b + 1), self.xi][np.logical_not(self.errors[a:(b + 1)])]
                ygaze = self.gaze[a:(b + 1), self.yi][np.logical_not(self.errors[a:(b + 1)])]

                # discard the window if it contains less than 5 samples
                if len(xgaze) < 5:
                    return False

                # exclude windows with more than 66% constant gaze
                (xvals, xcounts) = np.unique(xgaze, return_counts=True)
                (yvals, ycounts) = np.unique(ygaze, return_counts=True)
                xcounts = xcounts / float(np.sum(xcounts))
                ycounts = ycounts / float(np.sum(ycounts))
                if xcounts.max() > 0.66 or ycounts.max() > 0.66:
                    return False

                # accept every remaining window
                return True
            else:
                # discard windows with more than 50% erroneous samples
                return False
        return False

    def getStartWindow(self, old_start, sliding_window_step_size):
        start = old_start
        starttime = self.gaze[old_start, self.ti]
        while starttime < self.gaze[old_start, self.ti] + sliding_window_step_size:
            start += 1
            if start >= len(self.gaze[:, self.xi]) - 1:
                return len(self.gaze[:, self.xi]) - 1
            starttime = self.gaze[start, self.ti]
        return start

    def getEndWindow(self, start, sliding_window_size):
        end = start
        while self.gaze[end, self.ti] < self.gaze[start, self.ti] + sliding_window_size:
            end += 1
            if end >= len(self.gaze[:, self.xi]) - 1:
                return len(self.gaze[:, self.xi]) - 1
        return end

    def get_ngram_features(self, wb):
        feat = []
        feat.append(sum(x > 0 for x in wb.values()))  # 1. number of n-grams that occur at all (>0)
        feat.append(np.max(wb.values()))  # 2. maximum count
        nonzeros = [i for i in wb.values() if i]
        if len(nonzeros) < 1:
            feat.append(0)
        else:
            feat.append(min(nonzeros))  # 3. non-zero minimum count
        feat.append(np.argmax(wb.values()))  # 4. arg max
        if len(nonzeros) < 1:
            feat.append(0)
        else:
            feat.append(np.argmin(np.array(nonzeros)))  # 5. arg min (among non-zero counts)
        feat.append(feat[1] - feat[2])  # 6. range: maximum minus non-zero minimum
        feat.append(np.mean(wb.values()))  # 7. mean of all counts
        feat.append(np.var(wb.values()))  # 8. var of all counts
        return feat