code to train classifiers

Sabrina Hoppe 2018-05-05 22:05:03 +02:00
parent 0403f2ce55
commit 34ff6100e6
7 changed files with 438 additions and 0 deletions

18
01_train_classifiers.sh Normal file

@@ -0,0 +1,18 @@
n_parallel_jobs=6 # number of jobs that will be run in parallel
n_total_jobs=100
for li in $(seq 0 $n_parallel_jobs $n_total_jobs);
do
# the for loop below will start n_parallel_jobs for each trait,
# unless n_total_jobs is reached first (this is checked in the if statement)
mi=`expr "$li" + "$n_parallel_jobs"`
if [ "$mi" -gt "$n_total_jobs" ]
then
mi=$n_total_jobs
fi
for t in $(seq 0 6);
do
# the python module itself runs repetitions li .. mi-1 as parallel threads
python2.7 -m classifiers.train_classifier -t $t -s 0 -a 0 -l $li -m $mi
wait
done
done

20
02_train_specialized_classifiers.sh Normal file

@@ -0,0 +1,20 @@
n_parallel_jobs=6 # number of jobs that will be run in parallel
n_total_jobs=100
for li in $(seq 0 $n_parallel_jobs $n_total_jobs);
do
# the for loop below will start n_parallel_jobs for each trait and data subset,
# unless n_total_jobs is reached first (this is checked in the if statement)
mi=`expr "$li" + "$n_parallel_jobs"`
if [ "$mi" -gt "$n_total_jobs" ]
then
mi=$n_total_jobs
fi
for t in $(seq 0 6);
do
for a in 1 2; do
python2.7 -m classifiers.train_classifier -t $t -s 0 -a $a -l $li -m $mi
wait
done
done
done

99
03_train_baseline.py Normal file

@@ -0,0 +1,99 @@
import sys
import numpy as np
from config import conf
import getopt
from sklearn.cross_validation import LabelKFold as LKF
from sklearn.cross_validation import StratifiedKFold as SKF
from sklearn.metrics import f1_score, accuracy_score
import pandas as pns
def load_data(ws, t):
_, y_file, id_file = conf.get_merged_feature_files(ws)
y_ws = np.genfromtxt(y_file, delimiter=',', skip_header=1).astype(int)[:,t]
ids_ws = np.genfromtxt(id_file, delimiter=',', skip_header=1).astype(int)[:,0]
return y_ws, ids_ws
def get_baseline_f1_score(t):
"""
train a baseline classifier and return the F1 score it achieves
"""
outer_cv = SKF(participant_scores, conf.n_outer_folds, shuffle=True)
preds = np.zeros((conf.n_participants), dtype=int)
truth = np.zeros((conf.n_participants), dtype=int)
for outer_i, (outer_train_participants, outer_test_participants) in enumerate(outer_cv):
inner_performance = np.zeros((conf.n_inner_folds, len(conf.all_window_sizes)))
for ws_i in xrange(0, len(conf.all_window_sizes)):
ws = conf.all_window_sizes[ws_i]
# load data for this window size
y_ws, ids_ws = load_data(ws, t)
# cut out the outer train samples
outer_train_samples = np.array([p in outer_train_participants for p in ids_ws])
outer_train_y = y_ws[outer_train_samples]
outer_train_y_ids = ids_ws[outer_train_samples]
# build inner cross validation such that all samples of one person are either in training or testing
inner_cv = LKF(outer_train_y_ids, n_folds=conf.n_inner_folds)
for inner_i, (inner_train_indices, inner_test_indices) in enumerate(inner_cv):
# create inner train and test samples. Note: both are taken from outer train samples!
inner_y_train = outer_train_y[inner_train_indices]
unique_inner_test_ids = np.unique(outer_train_y_ids[inner_test_indices])
# predict the most frequent class from the training set
hist,_ = np.histogram(inner_y_train, bins=[0.5,1.5,2.5,3.5])
guess = np.argmax(hist) + 1
innerpreds = np.full(len(unique_inner_test_ids), guess, dtype=int)
innertruth = participant_scores[unique_inner_test_ids]
inner_performance[inner_i, ws_i] = accuracy_score(np.array(innertruth), np.array(innerpreds))
# evaluate classifier on outer cv using the best window size from inner cv
chosen_ws_i = np.argmax(np.mean(inner_performance, axis=0))
chosen_ws = conf.all_window_sizes[chosen_ws_i]
y, ids = load_data(chosen_ws, t)
outer_train_samples = np.array([p in outer_train_participants for p in ids])
outer_test_samples = np.array([p in outer_test_participants for p in ids])
if outer_train_samples.any() and outer_test_samples.any():
y_train = y[outer_train_samples]
# guess the most frequent class
hist,_ = np.histogram(y_train, bins=[0.5, 1.5, 2.5, 3.5])
guess = np.argmax(hist) + 1
for testp in outer_test_participants:
if testp in ids[outer_test_samples]:
preds[testp] = guess
truth[testp] = participant_scores[testp]
else:
# participant does not occur in the outer test set, e.g. because their time in the shop was too short
preds[testp] = -1
truth[testp] = -1
print 'not enough samples for participant', testp
else:
for testp in outer_test_participants:
preds[testp] = -1
truth[testp] = -1
f1 = f1_score(truth, preds, average='macro')
return f1
# If the program is run directly:
if __name__ == "__main__":
df = []
for trait in xrange(0, conf.n_traits):
participant_scores = np.loadtxt(conf.binned_personality_file, delimiter=',', skiprows=1, usecols=(trait+1,))
print conf.medium_traitlabels[trait]
for si in xrange(0,conf.max_n_iter):
f1 = get_baseline_f1_score(trait)
print '\t'+str(si)+':', f1
df.append([f1, conf.medium_traitlabels[trait], si])
df_pns = pns.DataFrame(data=df, columns=['F1', 'trait', 'iteration'])
df_pns.to_csv(conf.result_folder + '/most_frequ_class_baseline.csv')
print conf.result_folder + '/most_frequ_class_baseline.csv written.'

18
04_label_permutation_test.sh Normal file

@@ -0,0 +1,18 @@
n_parallel_jobs=6 # number of jobs that will be run in parallel
n_total_jobs=100
for li in $(seq 0 $n_parallel_jobs $n_total_jobs);
do
# the for loop below will start n_parallel_jobs for each trait,
# unless n_total_jobs is reached first (this is checked in the if statement)
mi=`expr "$li" + "$n_parallel_jobs"`
if [ "$mi" -gt "$n_total_jobs" ]
then
mi=$n_total_jobs
fi
for t in $(seq 0 6);
do
# -s 1 shuffles the labels, i.e. performs the label permutation test
python2.7 -m classifiers.train_classifier -t $t -s 1 -a 0 -l $li -m $mi
wait
done
done

README.md

@@ -25,7 +25,25 @@ reproducing the paper results step by step:
1. __Extract features from raw gaze data__:
`python 00_compute_features.py` to compute gaze features for all participants
Once extracted, the features are stored in `features/ParticipantXX/window_features_YY.npy` where XX is the participant number and YY the length of the sliding window in seconds.
2. __Train random forest classifiers__
`./01_train_classifiers.sh` to reproduce the evaluation setting described in the paper, in which each classifier was trained 100 times.
`./02_train_specialized_classifiers.sh` to train specialized classifiers on parts of the data (specifically on data from inside the shop or on the way).
If the scripts cannot be executed, you might lack execute permissions. On Linux, you can try `chmod +x 01_train_classifiers.sh`, `chmod +x 02_train_specialized_classifiers.sh` and `chmod +x 04_label_permutation_test.sh` (see below for when/how to use the last script).
In case you want to call the script differently, e.g. to speed up the computation or to try different parameters, you can pass the following arguments to `classifiers.train_classifier`:
`-t` trait index between 0 and 6
`-l` lowest number of repetitions, e.g. 0
`-m` max number of repetitions, e.g. 100 (exclusive upper bound)
`-a` using partial data only: 0 (all data), 1 (way data), 2 (shop data)
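For example, `python2.7 -m classifiers.train_classifier -t 0 -a 1 -l 0 -m 10` would train repetitions 0 to 9 of the classifier for the first trait on way data only.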
In case of performance issues, it might be useful to check `_conf.py` and change `max_n_jobs` to restrict the number of jobs (i.e. threads) running in parallel.
The results will be saved in `results/A0` for all data, `results/A1` for way data only and `results/A2` for data inside a shop. Each file is named `TTT_XXX.npz`, where TTT is the abbreviation of the personality trait (`O`,`C`,`E`,`A`,`N` for the Big Five and `CEI` or `PCS` for the two curiosity measures). XXX enumerates the classifiers (remember that we always train 100 classifiers for evaluation because there is some randomness involved in the training process). A short loading example is given below, after step 3.
3. __Evaluate Baselines__
* To train a classifier that always predicts the most frequent personality score range from its current training set, please execute `python 03_train_baseline.py`
* To train classifiers on permuted labels, i.e. perform the so-called label permutation test, please execute `./04_label_permutation_test.sh`
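To take a quick look at one result file, the arrays stored via `np.savez` in `classifiers/train_classifier.py` can be loaded back with numpy. A minimal sketch, assuming an illustrative file name (the exact `XXX` suffix is produced by `conf.get_result_filename`, so the zero-padding below is only a guess):
import numpy as np
# illustrative path: trait 'O' (openness); the real suffix comes from conf.get_result_filename
res = np.load('results/A0/O_000.npz')
print res['f1']                    # macro F1 score of this run
print res['predictions']           # predicted class per participant (-1 where no samples were available)
print res['chosen_window_indices'] # per participant: window size index chosen in the inner cross validation
print res['feature_importances']   # random forest feature importances per outer fold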
## Citation

0
classifiers/__init__.py Normal file

265
classifiers/train_classifier.py Normal file

@@ -0,0 +1,265 @@
import sys
import numpy as np
from config import conf
import os
import getopt
import threading
from sklearn.cross_validation import LabelKFold as LKF
from sklearn.cross_validation import StratifiedKFold as SKF
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, accuracy_score
def predict_all():
# add threads to a list, and wait for all of them in the end
threads = []
for trait in trait_list:
for si in xrange(low_repetitions, num_repetitions):
fname = conf.get_result_filename(annotation_value, trait, shuffle_labels, si, add_suffix=True)
if not os.path.exists(fname):
thread = threading.Thread(target=save_predictions, args=(trait, conf.get_result_filename(annotation_value, trait, shuffle_labels, si), si))
sys.stdout.flush()
thread.start()
threads.append(thread)
else:
print "existing solution:", fname
for thread in threads:
print 'waiting to join'
thread.join()
def load_data(ws, annotation_value, t, chosen_features = None):
x_file, y_file, id_file = conf.get_merged_feature_files(ws)
if annotation_value == conf.annotation_all:
x_ws = np.genfromtxt(x_file, delimiter=',', skip_header=1)
y_ws = np.genfromtxt(y_file, delimiter=',', skip_header=1).astype(int)[:,t]
ids_ws = np.genfromtxt(id_file, delimiter=',', skip_header=1).astype(int)[:,0]
elif annotation_value == conf.annotation_shop:
x_ws = np.genfromtxt(x_file, delimiter=',', skip_header=1)
y_ws = np.genfromtxt(y_file, delimiter=',', skip_header=1).astype(int)[:,t]
ids_ws = np.genfromtxt(id_file, delimiter=',', skip_header=1).astype(int)
x_ws = x_ws[ids_ws[:,1] == conf.time_window_annotation_shop,:]
y_ws = y_ws[ids_ws[:,1] == conf.time_window_annotation_shop]
ids_ws = ids_ws[ids_ws[:,1] == conf.time_window_annotation_shop,0]
elif annotation_value == conf.annotation_ways:
x_ws = np.genfromtxt(x_file, delimiter=',', skip_header=1)
y_ws = np.genfromtxt(y_file, delimiter=',', skip_header=1).astype(int)[:,t]
ids_ws = np.genfromtxt(id_file, delimiter=',', skip_header=1).astype(int)
x_ws = x_ws[(ids_ws[:,1] == conf.time_window_annotation_wayI) | (ids_ws[:,1] == conf.time_window_annotation_wayII),:]
y_ws = y_ws[(ids_ws[:,1] == conf.time_window_annotation_wayI) | (ids_ws[:,1] == conf.time_window_annotation_wayII)]
ids_ws = ids_ws[(ids_ws[:,1] == conf.time_window_annotation_wayI) | (ids_ws[:,1] == conf.time_window_annotation_wayII),0]
else:
print 'unknown annotation value', annotation_value
print 'should be 0 (all data), 1 (way) or 2 (shop).'
sys.exit(1)
if chosen_features is not None:
x_ws = x_ws[:,chosen_features]
return x_ws, y_ws, ids_ws
def save_predictions(t, filename, rs):
"""
train a classifier and write results to file
"""
# create RandomForest classifier with parameters given in _conf.py
clf = RandomForestClassifier(random_state=rs, verbose=verbosity, class_weight='balanced',
n_estimators=conf.n_estimators, n_jobs=conf.max_n_jobs, max_features=conf.tree_max_features,
max_depth=conf.tree_max_depth)
# create StandardScaler that will be used to scale each feature
# such that it has mean 0 and std 1 on the training set
scaler = StandardScaler(with_std=True, with_mean=True)
# use ground truth to create folds for outer cross validation in a stratified way, i.e. such that
# each label occurs equally often
participant_scores = np.loadtxt(conf.binned_personality_file, delimiter=',', skiprows=1, usecols=(t+1,))
outer_cv = SKF(participant_scores, conf.n_outer_folds, shuffle=True)
# initialise arrays to save information
feat_imp = np.zeros((len(outer_cv), conf.max_n_feat)) # feature importance
preds = np.zeros((conf.n_participants), dtype=int) # predictions on participant level
detailed_preds = np.zeros((conf.n_participants), dtype=object) # predictions on time window level, array of lists
chosen_ws_is = np.zeros((conf.n_participants), dtype=int) # indices of window sizes chosen in the inner cross validation
for outer_i, (outer_train_participants, outer_test_participants) in enumerate(outer_cv):
print
print str(outer_i + 1) + '/' + str(conf.n_outer_folds)
# find best window size in inner cv, and discard unimportant features
inner_performance = np.zeros((conf.n_inner_folds, len(all_window_sizes)))
inner_feat_importances = np.zeros((conf.max_n_feat, len(all_window_sizes)))
for ws_i in xrange(0, len(all_window_sizes)):
ws = all_window_sizes[ws_i]
print '\t', 'ws ' + str(ws_i + 1) + '/' + str(len(all_window_sizes))
# load data for this window size
x_ws, y_ws, ids_ws = load_data(ws, annotation_value, t)
if shuffle_labels:
np.random.seed(316588 + 111 * t + rs)
perm = np.random.permutation(len(y_ws))
y_ws = y_ws[perm]
ids_ws = ids_ws[perm]
# cut out the outer train samples
outer_train_samples = np.array([p in outer_train_participants for p in ids_ws])
outer_train_x = x_ws[outer_train_samples, :]
outer_train_y = y_ws[outer_train_samples]
outer_train_y_ids = ids_ws[outer_train_samples]
# build inner cross validation such that all samples of one person are either in training or testing
inner_cv = LKF(outer_train_y_ids, n_folds=conf.n_inner_folds)
for inner_i, (inner_train_indices, inner_test_indices) in enumerate(inner_cv):
# create inner train and test samples. Note: both are taken from outer train samples!
inner_x_train = outer_train_x[inner_train_indices, :]
inner_y_train = outer_train_y[inner_train_indices]
inner_x_test = outer_train_x[inner_test_indices, :]
inner_y_test = outer_train_y[inner_test_indices]
# fit scaler on train set and scale both train and test set with the result
scaler.fit(inner_x_train)
inner_x_train = scaler.transform(inner_x_train)
inner_x_test = scaler.transform(inner_x_test)
# fit Random Forest
clf.fit(inner_x_train, inner_y_train)
# save predictions and feature importance
inner_pred = clf.predict(inner_x_test)
inner_feat_importances[:, ws_i] += clf.feature_importances_
# compute and save performance in terms of accuracy
innerpreds = []
innertruth = []
inner_test_ids = outer_train_y_ids[inner_test_indices]
for testp in np.unique(inner_test_ids):
(values, counts) = np.unique(inner_pred[inner_test_ids == testp], return_counts=True)
ind = np.argmax(counts)
innerpreds.append(values[ind])
innertruth.append(inner_y_test[inner_test_ids == testp][0])
inner_performance[inner_i, ws_i] = accuracy_score(np.array(innertruth), np.array(innerpreds))
print ' ACC: ', '%.2f' % (inner_performance[inner_i, ws_i] * 100)
# evaluate classifier on outer cv using the best window size from inner cv, and the most informative features
chosen_ws_i = np.argmax(np.mean(inner_performance, axis=0))
chosen_ws = all_window_sizes[chosen_ws_i]
chosen_features = (inner_feat_importances[:,chosen_ws_i]/float(conf.n_inner_folds)) > 0.005
# reload all data
x, y, ids = load_data(chosen_ws, annotation_value, t, chosen_features=chosen_features)
if shuffle_labels:
np.random.seed(316588 + 111 * t + rs + 435786)
perm = np.random.permutation(len(y))
y = y[perm]
ids = ids[perm]
outer_train_samples = np.array([p in outer_train_participants for p in ids])
outer_test_samples = np.array([p in outer_test_participants for p in ids])
if outer_train_samples.any() and outer_test_samples.any():
x_train = x[outer_train_samples, :]
y_train = y[outer_train_samples]
x_test = x[outer_test_samples, :]
y_test = y[outer_test_samples]
# scaling
scaler.fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)
# fit Random Forest
clf.fit(x_train, y_train)
pred = clf.predict(x_test)
for testp in outer_test_participants:
chosen_ws_is[testp] = chosen_ws_i
if testp in ids[outer_test_samples]:
# majority voting over all samples that belong to participant testp
(values, counts) = np.unique(pred[ids[outer_test_samples] == testp], return_counts=True)
ind = np.argmax(counts)
preds[testp] = values[ind]
detailed_preds[testp] = list(pred[ids[outer_test_samples] == testp])
else:
# participant does not occur in the outer test set, e.g. because their time in the shop was too short
preds[testp] = -1
detailed_preds[testp] = []
# save the resulting feature importance
feat_imp[outer_i, chosen_features] = clf.feature_importances_
else:
for testp in outer_test_participants:
chosen_ws_is[testp] = -1
preds[testp] = -1
detailed_preds[testp] = []
feat_imp[outer_i, chosen_features] = -1
# compute resulting F1 score and save to file
nonzero_preds = preds[preds>0]
nonzero_truth = participant_scores[preds>0]
f1 = f1_score(nonzero_truth, nonzero_preds, average='macro')
np.savez(filename, f1=f1, predictions=preds, chosen_window_indices=chosen_ws_is,
feature_importances=feat_imp, detailed_predictions=detailed_preds)
print f1, 'written', filename
# If the program is run directly:
if __name__ == "__main__":
try:
opts, args = getopt.getopt(sys.argv[1:], "t:m:l:s:a:", [])
except getopt.GetoptError:
print 'valid arguments:'
print '-t trait index'
print '-s 1 to perform the label permutation test; omit or use -s 0 otherwise'
print '-l lowest number of repetitions'
print '-m max number of repetitions'
print '-a using partial data only: 0 (all data), 1 (way data), 2 (shop data)'
sys.exit(2)
low_repetitions = 0
num_repetitions = conf.max_n_iter
verbosity = 0
shuffle_labels = False
annotation_value = conf.annotation_all
trait_list = xrange(0, conf.n_traits)
for opt, arg in opts:
if opt == '-t':
t = int(arg)
assert t in trait_list
trait_list = [t]
elif opt == '-a':
annotation_value = int(arg)
assert annotation_value in conf.annotation_values
elif opt == '-s':
shuffle_labels = bool(int(arg))
elif opt == '-m':
num_repetitions = int(arg)
elif opt == '-l':
low_repetitions = int(arg)
else:
print 'valid arguments:'
print '-t trait index'
print '-s 1 to perform the label permutation test; omit or use -s 0 otherwise'
print '-l lowest number of repetitions'
print '-m max number of repetitions'
print '-a using partial data only: 0 (all data), 1 (way data), 2 (shop data)'
sys.exit(2)
result_folder = conf.get_result_folder(annotation_value)
if not os.path.exists(result_folder):
os.makedirs(result_folder)
# restrict window sizes in case shop data should be used
if annotation_value == conf.annotation_shop:
all_window_sizes = conf.all_shop_window_sizes
else:
all_window_sizes = conf.all_window_sizes
predict_all()