eye_movements_personality/classifiers/train_classifier.py

import sys
import numpy as np
from config import conf 
import os
import getopt
import threading
from sklearn.cross_validation import LabelKFold as LKF
from sklearn.cross_validation import StratifiedKFold as SKF
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, accuracy_score


def predict_all():
	"""
	start one training thread for every (trait, repetition) pair that has no result file yet,
	then block until all started threads have finished
	"""
	# every started worker is remembered here so that it can be joined at the end
	workers = []

	for trait in trait_list:
		for repetition in range(low_repetitions, num_repetitions):
			# the name including its suffix is only used to check whether the result exists already
			result_file = conf.get_result_filename(annotation_value, trait, shuffle_labels, repetition, add_suffix=True)
			if os.path.exists(result_file):
				print(' '.join(("existing solution:", result_file)))
				continue

			target_file = conf.get_result_filename(annotation_value, trait, shuffle_labels, repetition)
			worker = threading.Thread(target=save_predictions, args=(trait, target_file, repetition))
			sys.stdout.flush()
			worker.start()
			workers.append(worker)

	for worker in workers:
		worker.join()
		print('waiting to join')

def load_data(ws, annotation_value, t, chosen_features = None):
	"""
	read the features, the labels of trait t and the participant ids for window size ws

	annotation_value selects the time windows that are kept:
	conf.annotation_all keeps every window, conf.annotation_shop and conf.annotation_ways keep only
	the windows whose annotation (second column of the id file) equals the matching
	conf.time_window_annotation_* value. Any other value prints an error message and
	terminates the program with exit code 1.
	If chosen_features is given, it is used to index the feature columns.

	returns the tuple (features, labels, participant ids)
	"""
	x_file, y_file, id_file = conf.get_merged_feature_files(ws)

	# decide which annotations to keep before any file is read; None means "no filtering"
	if annotation_value == conf.annotation_all:
		kept_annotations = None
	elif annotation_value == conf.annotation_shop:
		kept_annotations = [conf.time_window_annotation_shop]
	elif annotation_value == conf.annotation_ways:
		kept_annotations = [conf.time_window_annotation_wayI, conf.time_window_annotation_wayII]
	else:
		print(' '.join(('unknown annotation value', str(annotation_value))))
		print('should be 0 (all data), 1 (way) or 2 (shop).')
		sys.exit(1)

	features = np.genfromtxt(x_file, delimiter=',', skip_header=1)
	labels = np.genfromtxt(y_file, delimiter=',', skip_header=1).astype(int)[:,t]
	id_table = np.genfromtxt(id_file, delimiter=',', skip_header=1).astype(int)

	if kept_annotations is None:
		participant_ids = id_table[:,0]
	else:
		# row mask that is True wherever the annotation column equals one of the kept annotations
		annotations = id_table[:,1]
		keep = annotations == kept_annotations[0]
		for annotation in kept_annotations[1:]:
			keep = keep | (annotations == annotation)

		features = features[keep,:]
		labels = labels[keep]
		participant_ids = id_table[keep,0]

	if chosen_features is not None:
		features = features[:,chosen_features]
	return features, labels, participant_ids


def _majority_vote(window_predictions):
	"""
	return the value that occurs most often in window_predictions;
	ties are resolved in favour of the smallest value
	"""
	values, counts = np.unique(window_predictions, return_counts=True)
	return values[np.argmax(counts)]


def save_predictions(t, filename, rs):
	"""
	train a classifier in a nested cross validation and write results to file

	The outer cross validation splits the participants in a stratified way. For each outer fold,
	an inner cross validation on the outer training participants selects the window size with the
	best mean accuracy and the features whose mean importance exceeds 0.005. A Random Forest is
	then trained on the outer training participants and each outer test participant receives the
	majority vote over the predictions of their time windows.

	t: index of the trait (column t of the label files, column t+1 of conf.binned_personality_file)
	filename: target passed to np.savez
	rs: random state of the Random Forest, also part of the seed used for shuffling labels

	The file contains: f1, predictions, chosen_window_indices, feature_importances, detailed_predictions.
	A prediction of 0 means that the participant was never assigned a prediction, -1 means that
	no samples were available for the participant; both are excluded from the F1 score.
	"""
	# create RandomForest classifier with parameters given in _conf.py
	clf = RandomForestClassifier(random_state=rs, verbose=verbosity, class_weight='balanced',
	                             n_estimators=conf.n_estimators, n_jobs=conf.max_n_jobs, max_features=conf.tree_max_features,
	                             max_depth=conf.tree_max_depth)

	# create StandardScaler that will be used to scale each feature
	# such that it has mean 0 and std 1 on the training set
	scaler = StandardScaler(with_std=True, with_mean=True)

	# use ground truth to create folds for outer cross validation in a stratified way, i.e. such that
	# each label occurs equally often
	# NOTE(review): no random_state is passed, so the shuffling presumably draws from numpy's global
	# random state, which the label permutation below re-seeds - confirm this is intended
	participant_scores = np.loadtxt(conf.binned_personality_file, delimiter=',', skiprows=1, usecols=(t+1,))
	outer_cv = SKF(participant_scores, conf.n_outer_folds, shuffle=True)

	# initialise arrays to save information
	feat_imp = np.zeros((len(outer_cv), conf.max_n_feat))  # feature importance
	preds = np.zeros((conf.n_participants), dtype=int)  # predictions on participant level
	detailed_preds = np.zeros((conf.n_participants), dtype=object)  # predictions on time window level, array of lists
	chosen_ws_is = np.zeros((conf.n_participants), dtype=int)  # indices of window sizes chosen in the inner cross validation

	for outer_i, (outer_train_participants, outer_test_participants) in enumerate(outer_cv):
		print('')
		print(str(outer_i + 1) + '/' + str(conf.n_outer_folds))

		# find best window size in inner cv, and discard unimportant features
		inner_performance = np.zeros((conf.n_inner_folds, len(all_window_sizes)))
		inner_feat_importances = np.zeros((conf.max_n_feat, len(all_window_sizes)))

		for ws_i, ws in enumerate(all_window_sizes):
			print('\t' + 'ws ' + str(ws_i + 1) + '/' + str(len(all_window_sizes)))

			# load data for this window size
			x_ws, y_ws, ids_ws = load_data(ws, annotation_value, t)
			if shuffle_labels:
				# labels and ids are permuted together, the features stay in place
				np.random.seed(316588 + 111 * t + rs)
				perm = np.random.permutation(len(y_ws))
				y_ws = y_ws[perm]
				ids_ws = ids_ws[perm]

			# cut out the outer train samples
			outer_train_samples = np.in1d(ids_ws, outer_train_participants)
			outer_train_x = x_ws[outer_train_samples, :]
			outer_train_y = y_ws[outer_train_samples]
			outer_train_y_ids = ids_ws[outer_train_samples]

			# build inner cross validation such that all samples of one person are either in training or testing
			inner_cv = LKF(outer_train_y_ids, n_folds=conf.n_inner_folds)
			for inner_i, (inner_train_indices, inner_test_indices) in enumerate(inner_cv):
				# create inner train and test samples. Note: both are taken from outer train samples!
				inner_x_train = outer_train_x[inner_train_indices, :]
				inner_y_train = outer_train_y[inner_train_indices]

				inner_x_test = outer_train_x[inner_test_indices, :]
				inner_y_test = outer_train_y[inner_test_indices]

				# fit scaler on train set and scale both train and test set with the result
				scaler.fit(inner_x_train)
				inner_x_train = scaler.transform(inner_x_train)
				inner_x_test = scaler.transform(inner_x_test)

				# fit Random Forest
				clf.fit(inner_x_train, inner_y_train)

				# save predictions and feature importance
				inner_pred = clf.predict(inner_x_test)
				inner_feat_importances[:, ws_i] += clf.feature_importances_

				# compute and save performance in terms of accuracy on participant level
				innerpreds = []
				innertruth = []
				inner_test_ids = outer_train_y_ids[inner_test_indices]
				for testp in np.unique(inner_test_ids):
					of_testp = inner_test_ids == testp
					innerpreds.append(_majority_vote(inner_pred[of_testp]))
					# the label of the first sample is taken as the participant's label
					innertruth.append(inner_y_test[of_testp][0])
				inner_performance[inner_i, ws_i] = accuracy_score(np.array(innertruth), np.array(innerpreds))
				print('                ACC:  %.2f' % (inner_performance[inner_i, ws_i] * 100))

		# evaluate classifier on outer cv using the best window size from inner cv, and the most informative features
		chosen_ws_i = np.argmax(np.mean(inner_performance, axis=0))
		chosen_ws = all_window_sizes[chosen_ws_i]
		chosen_features = (inner_feat_importances[:,chosen_ws_i]/float(conf.n_inner_folds)) > 0.005

		# reload all data
		x, y, ids = load_data(chosen_ws, annotation_value, t, chosen_features=chosen_features)
		if shuffle_labels:
			np.random.seed(316588 + 111 * t + rs + 435786)
			perm = np.random.permutation(len(y))
			y = y[perm]
			ids = ids[perm]

		outer_train_samples = np.in1d(ids, outer_train_participants)
		outer_test_samples = np.in1d(ids, outer_test_participants)

		# the masks have one entry per sample, so it has to be checked whether any entry is set;
		# their size only tells how many samples were loaded
		if outer_train_samples.any() and outer_test_samples.any():
			x_train = x[outer_train_samples, :]
			y_train = y[outer_train_samples]

			x_test = x[outer_test_samples, :]
			test_ids = ids[outer_test_samples]

			# scaling
			scaler.fit(x_train)
			x_train = scaler.transform(x_train)
			x_test = scaler.transform(x_test)

			# fit Random Forest
			clf.fit(x_train, y_train)
			pred = clf.predict(x_test)

			for testp in outer_test_participants:
				chosen_ws_is[testp] = chosen_ws_i
				of_testp = test_ids == testp
				if of_testp.any():
					# majority voting over all samples that belong to participant testp
					preds[testp] = _majority_vote(pred[of_testp])
					detailed_preds[testp] = list(pred[of_testp])
				else:
					# participant does not occur in outer test set, e.g. because their time in the shop was too short
					preds[testp] = -1
					detailed_preds[testp] = []

			# save the resulting feature importance
			feat_imp[outer_i, chosen_features] = clf.feature_importances_

		else:
			# nothing to train or test on in this fold: mark its participants in the same way
			# as participants without samples
			for testp in outer_test_participants:
				chosen_ws_is[testp] = -1
				preds[testp] = -1
				detailed_preds[testp] = []
			feat_imp[outer_i, chosen_features] = -1

	# compute resulting F1 score on all participants with a valid (positive) prediction and save to file
	has_prediction = preds > 0
	nonzero_preds = preds[has_prediction]
	nonzero_truth = participant_scores[has_prediction]
	f1 = f1_score(nonzero_truth, nonzero_preds, average='macro')
	np.savez(filename, f1=f1, predictions=preds, chosen_window_indices=chosen_ws_is,
				feature_importances=feat_imp, detailed_predictions=detailed_preds)
	print('%s written %s' % (f1, filename))

# If the program is run directly:
if __name__ == "__main__":
	def exit_with_usage():
		"""print the command line help and terminate with exit code 2"""
		print('valid arguments:')
		print('-t 	trait index')
		print('-s   1 to perform label permutation test, do not pass s or use -s 0 otherwise')
		print('-l   lowest number of repetitions')
		print('-m   max number of repetitions')
		print('-a   using partial data only: 0 (all data), 1 (way data), 2(shop data)')
		sys.exit(2)

	try:
		opts, args = getopt.getopt(sys.argv[1:], "t:m:l:s:a:", [])
	except getopt.GetoptError:
		exit_with_usage()

	# defaults; predict_all and save_predictions read these names as module level globals
	low_repetitions = 0
	num_repetitions = conf.max_n_iter
	verbosity = 0
	shuffle_labels = False
	annotation_value = conf.annotation_all
	trait_list = range(0, conf.n_traits)

	for opt, arg in opts:
		# every option takes an integer; anything else is answered with the help text
		try:
			value = int(arg)
		except ValueError:
			exit_with_usage()

		if opt == '-t':
			t = value
			# validated against the full range of traits, so that a repeated -t simply overrides the previous one
			if not 0 <= t < conf.n_traits:
				exit_with_usage()
			trait_list = [t]
		elif opt == '-a':
			annotation_value = value
			if annotation_value not in conf.annotation_values:
				exit_with_usage()
		elif opt == '-s':
			shuffle_labels = bool(value)
		elif opt == '-m':
			num_repetitions = value
		elif opt == '-l':
			low_repetitions = value
		else:
			exit_with_usage()

	result_folder = conf.get_result_folder(annotation_value)
	if not os.path.exists(result_folder):
		os.makedirs(result_folder)

	# restrict window sizes in case shop data should be used
	if annotation_value == conf.annotation_shop:
		all_window_sizes = conf.all_shop_window_sizes
	else:
		all_window_sizes = conf.all_window_sizes

	predict_all()
code to train classifiers 2018-05-05 22:05:03 +02:00			`import sys`
			`import numpy as np`
			`from config import conf`
			`import os`
			`import getopt`
			`import threading`
			`from sklearn.cross_validation import LabelKFold as LKF`
			`from sklearn.cross_validation import StratifiedKFold as SKF`
			`from sklearn.preprocessing import StandardScaler`
			`from sklearn.ensemble import RandomForestClassifier`
			`from sklearn.metrics import f1_score, accuracy_score`


			`def predict_all():`
			`# add threads to a list, and wait for all of them in the end`
			`threads = []`

			`for trait in trait_list:`
			`for si in xrange(low_repetitions, num_repetitions):`
			`fname = conf.get_result_filename(annotation_value, trait, shuffle_labels, si, add_suffix=True)`
			`if not os.path.exists(fname):`
			`thread = threading.Thread(target=save_predictions, args=(trait, conf.get_result_filename(annotation_value, trait, shuffle_labels, si), si))`
			`sys.stdout.flush()`
			`thread.start()`
			`threads.append(thread)`
			`else:`
			`print "existing solution:", fname`

			`for thread in threads:`
			`thread.join()`
			`print 'waiting to join'`

			`def load_data(ws, annotation_value, t, chosen_features = None):`
			`x_file, y_file, id_file = conf.get_merged_feature_files(ws)`
			`if annotation_value == conf.annotation_all:`
			`x_ws = np.genfromtxt(x_file, delimiter=',', skip_header=1)`
			`y_ws = np.genfromtxt(y_file, delimiter=',', skip_header=1).astype(int)[:,t]`
			`ids_ws = np.genfromtxt(id_file, delimiter=',', skip_header=1).astype(int)[:,0]`
			`elif annotation_value == conf.annotation_shop:`
			`x_ws = np.genfromtxt(x_file, delimiter=',', skip_header=1)`
			`y_ws = np.genfromtxt(y_file, delimiter=',', skip_header=1).astype(int)[:,t]`
			`ids_ws = np.genfromtxt(id_file, delimiter=',', skip_header=1).astype(int)`

			`x_ws = x_ws[ids_ws[:,1] == conf.time_window_annotation_shop,:]`
			`y_ws = y_ws[ids_ws[:,1] == conf.time_window_annotation_shop]`
			`ids_ws = ids_ws[ids_ws[:,1] == conf.time_window_annotation_shop,0]`
			`elif annotation_value == conf.annotation_ways:`
			`x_ws = np.genfromtxt(x_file, delimiter=',', skip_header=1)`
			`y_ws = np.genfromtxt(y_file, delimiter=',', skip_header=1).astype(int)[:,t]`
			`ids_ws = np.genfromtxt(id_file, delimiter=',', skip_header=1).astype(int)`

			`x_ws = x_ws[(ids_ws[:,1] == conf.time_window_annotation_wayI) \| (ids_ws[:,1] == conf.time_window_annotation_wayII),:]`
			`y_ws = y_ws[(ids_ws[:,1] == conf.time_window_annotation_wayI) \| (ids_ws[:,1] == conf.time_window_annotation_wayII)]`
			`ids_ws = ids_ws[(ids_ws[:,1] == conf.time_window_annotation_wayI) \| (ids_ws[:,1] == conf.time_window_annotation_wayII),0]`
			`else:`
			`print 'unknown annotation value', annotation_value`
			`print 'should be 0 (all data), 1 (way) or 2 (shop).'`
			`sys.exit(1)`
			`if chosen_features is not None:`
			`x_ws = x_ws[:,chosen_features]`
			`return x_ws, y_ws, ids_ws`


			`def save_predictions(t, filename, rs):`
			`"""`
			`train a classifier and write results to file`
			`"""`
			`# create RandomForest classifier with parameters given in _conf.py`
			`clf = RandomForestClassifier(random_state=rs, verbose=verbosity, class_weight='balanced',`
			`n_estimators=conf.n_estimators, n_jobs=conf.max_n_jobs, max_features=conf.tree_max_features,`
			`max_depth=conf.tree_max_depth)`

			`# create StandardScaler that will be used to scale each feature`
			`# such that it has mean 0 and std 1 on the trianing set`
			`scaler = StandardScaler(with_std=True, with_mean=True)`

			`# use ground truth to create folds for outer cross validation in a stratified way, i.e. such that`
			`# each label occurs equally often`
			`participant_scores = np.loadtxt(conf.binned_personality_file, delimiter=',', skiprows=1, usecols=(t+1,))`
			`outer_cv = SKF(participant_scores, conf.n_outer_folds, shuffle=True)`

			`# initialise arrays to save information`
			`feat_imp = np.zeros((len(outer_cv), conf.max_n_feat)) # feature importance`
			`preds = np.zeros((conf.n_participants), dtype=int) # predictions on participant level`
			`detailed_preds = np.zeros((conf.n_participants), dtype=object) # predictions on time window level, array of lists`
			`chosen_ws_is = np.zeros((conf.n_participants), dtype=int) # indices of window sizes chosen in the inner cross validation`

			`for outer_i, (outer_train_participants, outer_test_participants) in enumerate(outer_cv):`
			`print`
			`print str(outer_i + 1) + '/' + str(conf.n_outer_folds)`

			`# find best window size in inner cv, and discard unimportant features`
			`inner_performance = np.zeros((conf.n_inner_folds, len(all_window_sizes)))`
			`inner_feat_importances = np.zeros((conf.max_n_feat, len(all_window_sizes)))`

			`for ws_i in xrange(0, len(all_window_sizes)):`
			`ws = all_window_sizes[ws_i]`
			`print '\t', 'ws ' + str(ws_i + 1) + '/' + str(len(all_window_sizes))`

			`# load data for this window size`
			`x_ws, y_ws, ids_ws = load_data(ws, annotation_value, t)`
			`if shuffle_labels:`
			`np.random.seed(316588 + 111 * t + rs)`
			`perm = np.random.permutation(len(y_ws))`
			`y_ws = y_ws[perm]`
			`ids_ws = ids_ws[perm]`

			`# cut out the outer train samples`
			`outer_train_samples = np.array([p in outer_train_participants for p in ids_ws])`
			`outer_train_x = x_ws[outer_train_samples, :]`
			`outer_train_y = y_ws[outer_train_samples]`
			`outer_train_y_ids = ids_ws[outer_train_samples]`

			`# build inner cross validation such that all samples of one person are either in training or testing`
			`inner_cv = LKF(outer_train_y_ids, n_folds=conf.n_inner_folds)`
			`for inner_i, (inner_train_indices, inner_test_indices) in enumerate(inner_cv):`
			`# create inner train and test samples. Note: both are taken from outer train samples!`
			`inner_x_train = outer_train_x[inner_train_indices, :]`
			`inner_y_train = outer_train_y[inner_train_indices]`

			`inner_x_test = outer_train_x[inner_test_indices, :]`
			`inner_y_test = outer_train_y[inner_test_indices]`

			`# fit scaler on train set and scale both train and test set with the result`
			`scaler.fit(inner_x_train)`
			`inner_x_train = scaler.transform(inner_x_train)`
			`inner_x_test = scaler.transform(inner_x_test)`

			`# fit Random Forest`
			`clf.fit(inner_x_train, inner_y_train)`

			`# save predictions and feature importance`
			`inner_pred = clf.predict(inner_x_test)`
			`inner_feat_importances[:, ws_i] += clf.feature_importances_`

			`# compute and save performance in terms of accuracy`
			`innerpreds = []`
			`innertruth = []`
			`inner_test_ids = outer_train_y_ids[inner_test_indices]`
			`for testp in np.unique(inner_test_ids):`
			`(values, counts) = np.unique(inner_pred[inner_test_ids == testp], return_counts=True)`
			`ind = np.argmax(counts)`
			`innerpreds.append(values[ind])`
			`innertruth.append(inner_y_test[inner_test_ids == testp][0])`
			`inner_performance[inner_i, ws_i] = accuracy_score(np.array(innertruth), np.array(innerpreds))`
			`print ' ACC: ', '%.2f' % (inner_performance[inner_i, ws_i] * 100)`

			`# evaluate classifier on outer cv using the best window size from inner cv, and the most informative features`
			`chosen_ws_i = np.argmax(np.mean(inner_performance, axis=0))`
			`chosen_ws = all_window_sizes[chosen_ws_i]`
			`chosen_features = (inner_feat_importances[:,chosen_ws_i]/float(conf.n_inner_folds)) > 0.005`

			`# reload all data`
			`x, y, ids = load_data(chosen_ws, annotation_value, t, chosen_features=chosen_features)`
			`if shuffle_labels:`
			`np.random.seed(316588 + 111 * t + rs + 435786)`
			`perm = np.random.permutation(len(y))`
			`y = y[perm]`
			`ids = ids[perm]`

			`outer_train_samples = np.array([p in outer_train_participants for p in ids])`
			`outer_test_samples = np.array([p in outer_test_participants for p in ids])`

			`if outer_train_samples.size > 0 and outer_test_samples.size > 0:`
			`x_train = x[outer_train_samples, :]`
			`y_train = y[outer_train_samples]`

			`x_test = x[outer_test_samples, :]`
			`y_test = y[outer_test_samples]`

			`# scaling`
			`scaler.fit(x_train)`
			`x_train = scaler.transform(x_train)`
			`x_test = scaler.transform(x_test)`

			`# fit Random Forest`
			`clf.fit(x_train, y_train)`
			`pred = clf.predict(x_test)`

			`for testp in outer_test_participants:`
			`chosen_ws_is[testp] = chosen_ws_i`
			`if testp in ids[outer_test_samples]:`
			`# majority voting over all samples that belong to participant testp`
			`(values, counts) = np.unique(pred[ids[outer_test_samples] == testp], return_counts=True)`
			`ind = np.argmax(counts)`
			`preds[testp] = values[ind]`
			`detailed_preds[testp] = list(pred[ids[outer_test_samples] == testp])`
			`else:`
			`# participant does not occour in outer test set, e.g. because their time in the shop was too short`
			`preds[testp] = -1`
			`detailed_preds[testp] = []`

			`# save the resulting feature importance`
			`feat_imp[outer_i, chosen_features] = clf.feature_importances_`

			`else:`
			`for testp in outer_test_participants:`
			`chosen_ws_is[testp] = -1`
			`preds[testp] = np.array([])`
			`truth[testp] = -1`
			`feat_imp[outer_i, chosen_features] = -1`

			`# compute resulting F1 score and save to file`
			`nonzero_preds = preds[preds>0]`
			`nonzero_truth = participant_scores[preds>0]`
			`f1 = f1_score(nonzero_truth, nonzero_preds, average='macro')`
			`np.savez(filename, f1=f1, predictions=preds, chosen_window_indices=chosen_ws_is,`
			`feature_importances=feat_imp, detailed_predictions=detailed_preds)`
			`print f1, 'written', filename`

			`# If the program is run directly:`
			`if __name__ == "__main__":`
			`try:`
			`opts, args = getopt.getopt(sys.argv[1:], "t:m:l:s:a:", [])`
			`except getopt.GetoptError:`
			`print 'valid arguments:'`
			`print '-t trait index'`
			`print '-s 1 to perform label permutation test, do not pass s or use -s 0 otherwise'`
			`print '-l lowest number of repetitions'`
			`print '-m max number of repetitions'`
			`print '-a using partial data only: 0 (all data), 1 (way data), 2(shop data)'`
			`sys.exit(2)`


			`low_repetitions = 0`
			`num_repetitions = conf.max_n_iter`
			`verbosity = 0`
			`shuffle_labels = False`
			`annotation_value = conf.annotation_all`
			`trait_list = xrange(0, conf.n_traits)`

			`for opt, arg in opts:`
			`if opt == '-t':`
			`t = int(arg)`
			`assert t in trait_list`
			`trait_list = [t]`
			`elif opt == '-a':`
			`annotation_value = int(arg)`
			`assert annotation_value in conf.annotation_values`
			`elif opt == '-s':`
			`shuffle_labels = bool(int(arg))`
			`elif opt == '-m':`
			`num_repetitions = int(arg)`
			`elif opt == '-l':`
			`low_repetitions = int(arg)`
			`else:`
			`print 'valid arguments:'`
			`print '-t trait index'`
			`print '-s 1 to perform label permutation test, do not pass s or use -s 0 otherwise'`
			`print '-l lowest number of repetitions'`
			`print '-m max number of repetitions'`
			`print '-a using partial data only: 0 (all data), 1 (way data), 2(shop data)'`
			`sys.exit(2)`

			`result_folder = conf.get_result_folder(annotation_value)`
			`if not os.path.exists(result_folder):`
			`os.makedirs(result_folder)`

			`# restrict window sizes in case shop data should be used`
			`if annotation_value == conf.annotation_shop:`
			`all_window_sizes = conf.all_shop_window_sizes`
			`else:`
			`all_window_sizes = conf.all_window_sizes`

			`predict_all()`