code to train classifiers
parent 0403f2ce55
commit 34ff6100e6
7 changed files with 438 additions and 0 deletions

01_train_classifiers.sh (new file, 18 lines)
@@ -0,0 +1,18 @@
n_parallel_jobs=6 # number of jobs that will be run in parallel
n_total_jobs=100
for li in $(seq 0 $n_parallel_jobs $n_total_jobs);
do
    # the for loop below will start n_parallel_jobs for each trait,
    # unless n_total_jobs is reached first (this is checked in the if statement)
    mi=`expr "$li" + "$n_parallel_jobs"`
    if [ "$mi" -gt "$n_total_jobs" ]
    then
        mi=$n_total_jobs
    fi

    for t in $(seq 0 6);
    do
        python2.7 -m classifiers.train_classifier -t $t -s 0 -a 0 -l $li -m $mi
        wait
    done
done
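
The batching above steps `li` through 0, 6, 12, … and clamps `mi` at `n_total_jobs`, so each call to `classifiers.train_classifier` handles repetitions `li` up to `mi` for one trait. For illustration, the first pass of the outer loop expands to calls like these (assuming the defaults above):

```sh
# li=0, mi=6: repetitions 0..5 for each of the 7 traits
python2.7 -m classifiers.train_classifier -t 0 -s 0 -a 0 -l 0 -m 6
python2.7 -m classifiers.train_classifier -t 1 -s 0 -a 0 -l 0 -m 6
# ... up to -t 6, then li=6, mi=12, and so on
```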

02_train_specialized_classifiers.sh (new file, 20 lines)
@@ -0,0 +1,20 @@
n_parallel_jobs=6 # number of jobs that will be run in parallel
n_total_jobs=100
for li in $(seq 0 $n_parallel_jobs $n_total_jobs);
do
    # the for loop below will start n_parallel_jobs for each trait,
    # unless n_total_jobs is reached first (this is checked in the if statement)
    mi=`expr "$li" + "$n_parallel_jobs"`
    if [ "$mi" -gt "$n_total_jobs" ]
    then
        mi=$n_total_jobs
    fi

    for t in $(seq 0 6);
    do
        for a in 1 2; do
            python2.7 -m classifiers.train_classifier -t $t -s 0 -a $a -l $li -m $mi
            wait
        done
    done
done
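
The only difference from `01_train_classifiers.sh` is the inner `for a in 1 2` loop: `-a 1` trains on way data only and `-a 2` on shop data only (see the `-a` handling in `classifiers/train_classifier.py` below). A single specialized run would look like, e.g., `python2.7 -m classifiers.train_classifier -t 0 -s 0 -a 1 -l 0 -m 6`.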

03_train_baseline.py (new file, 99 lines)
@@ -0,0 +1,99 @@
import sys
import numpy as np
from config import conf
import getopt
from sklearn.cross_validation import LabelKFold as LKF
from sklearn.cross_validation import StratifiedKFold as SKF
from sklearn.metrics import f1_score, accuracy_score
import pandas as pns


def load_data(ws, t):
    _, y_file, id_file = conf.get_merged_feature_files(ws)
    y_ws = np.genfromtxt(y_file, delimiter=',', skip_header=1).astype(int)[:, t]
    ids_ws = np.genfromtxt(id_file, delimiter=',', skip_header=1).astype(int)[:, 0]
    return y_ws, ids_ws


def get_baseline_f1_score(t):
    """
    train a baseline classifier and return the F1 score it achieves
    """
    outer_cv = SKF(participant_scores, conf.n_outer_folds, shuffle=True)

    preds = np.zeros((conf.n_participants), dtype=int)
    truth = np.zeros((conf.n_participants), dtype=int)

    for outer_i, (outer_train_participants, outer_test_participants) in enumerate(outer_cv):
        inner_performance = np.zeros((conf.n_inner_folds, len(conf.all_window_sizes)))

        for ws_i in xrange(0, len(conf.all_window_sizes)):
            ws = conf.all_window_sizes[ws_i]

            # load data for this window size
            y_ws, ids_ws = load_data(ws, t)

            # cut out the outer train samples
            outer_train_samples = np.array([p in outer_train_participants for p in ids_ws])
            outer_train_y = y_ws[outer_train_samples]
            outer_train_y_ids = ids_ws[outer_train_samples]

            # build inner cross validation such that all samples of one person are either in training or testing
            inner_cv = LKF(outer_train_y_ids, n_folds=conf.n_inner_folds)
            for inner_i, (inner_train_indices, inner_test_indices) in enumerate(inner_cv):
                # create inner train and test samples. Note: both are taken from outer train samples!
                inner_y_train = outer_train_y[inner_train_indices]
                unique_inner_test_ids = np.unique(outer_train_y_ids[inner_test_indices])

                # predict the most frequent class from the training set
                hist, _ = np.histogram(inner_y_train, bins=[0.5, 1.5, 2.5, 3.5])
                guess = np.argmax(hist) + 1
                innerpreds = np.full(len(unique_inner_test_ids), guess, dtype=int)
                innertruth = participant_scores[unique_inner_test_ids]

                inner_performance[inner_i, ws_i] = accuracy_score(np.array(innertruth), np.array(innerpreds))

        # evaluate classifier on outer cv using the best window size from inner cv
        chosen_ws_i = np.argmax(np.mean(inner_performance, axis=0))
        chosen_ws = conf.all_window_sizes[chosen_ws_i]
        y, ids = load_data(chosen_ws, t)

        outer_train_samples = np.array([p in outer_train_participants for p in ids])
        outer_test_samples = np.array([p in outer_test_participants for p in ids])

        if outer_train_samples.size > 0 and outer_test_samples.size > 0:
            y_train = y[outer_train_samples]

            # guess the most frequent class
            hist, _ = np.histogram(y_train, bins=[0.5, 1.5, 2.5, 3.5])
            guess = np.argmax(hist) + 1

            for testp in outer_test_participants:
                if testp in ids[outer_test_samples]:
                    preds[testp] = guess
                    truth[testp] = participant_scores[testp]
                else:
                    # participant does not occur in outer test set, e.g. because their time in the shop was too short
                    preds[testp] = -1
                    truth[testp] = -1
                    print 'not enough samples for participant', testp
        else:
            for testp in outer_test_participants:
                preds[testp] = -1  # no samples at all for this fold
                truth[testp] = -1

    f1 = f1_score(truth, preds, average='macro')
    return f1


# If the program is run directly:
if __name__ == "__main__":
    df = []
    for trait in xrange(0, conf.n_traits):
        participant_scores = np.loadtxt(conf.binned_personality_file, delimiter=',', skiprows=1, usecols=(trait + 1,))
        print conf.medium_traitlabels[trait]
        for si in xrange(0, conf.max_n_iter):
            f1 = get_baseline_f1_score(trait)
            print '\t' + str(si) + ':', f1
            df.append([f1, conf.medium_traitlabels[trait], si])

    df_pns = pns.DataFrame(data=df, columns=['F1', 'trait', 'iteration'])
    df_pns.to_csv(conf.result_folder + '/most_frequ_class_baseline.csv')
    print conf.result_folder + '/most_frequ_class_baseline.csv written.'
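
The "most frequent class" guess above relies on the binned scores taking values 1, 2 or 3 (as the bin edges [0.5, 1.5, 2.5, 3.5] suggest): the histogram counts each class, and `argmax + 1` maps the fullest bin back to its label. A minimal standalone sketch with toy values:

```python
import numpy as np

inner_y_train = np.array([1, 2, 2, 3, 2, 1])  # toy training labels
hist, _ = np.histogram(inner_y_train, bins=[0.5, 1.5, 2.5, 3.5])  # counts per class: [2, 3, 1]
guess = np.argmax(hist) + 1  # fullest bin index, shifted back to the label -> 2
```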

04_label_permutation_test.sh (new file, 18 lines)
@@ -0,0 +1,18 @@
n_parallel_jobs=6 # number of jobs that will be run in parallel
n_total_jobs=100
for li in $(seq 0 $n_parallel_jobs $n_total_jobs);
do
    # the for loop below will start n_parallel_jobs for each trait,
    # unless n_total_jobs is reached first (this is checked in the if statement)
    mi=`expr "$li" + "$n_parallel_jobs"`
    if [ "$mi" -gt "$n_total_jobs" ]
    then
        mi=$n_total_jobs
    fi

    for t in $(seq 0 6);
    do
        python2.7 -m classifiers.train_classifier -t $t -s 1 -a 0 -l $li -m $mi
        wait
    done
done
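
This script is identical to `01_train_classifiers.sh` except that it passes `-s 1`, which makes `classifiers.train_classifier` shuffle the labels before training (the label permutation test). The shuffle is a seeded permutation applied to the labels and participant ids, as in this excerpt from `classifiers/train_classifier.py` below:

```python
np.random.seed(316588 + 111 * t + rs)  # deterministic per trait t and repetition rs
perm = np.random.permutation(len(y_ws))
y_ws = y_ws[perm]
ids_ws = ids_ws[perm]
```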

README.md (18 additions)
@@ -25,7 +25,25 @@ reproducing the paper results step by step:
1. __Extract features from raw gaze data__:
`python 00_compute_features.py` to compute gaze features for all participants
Once extracted, the features are stored in `features/ParticipantXX/window_features_YY.npy` where XX is the participant number and YY the length of the sliding window in seconds.
2. __Train random forest classifiers__
`./01_train_classifiers.sh` to reproduce the evaluation setting described in the paper, in which each classifier was trained 100 times.
`./02_train_specialized_classifiers.sh` to train specialized classifiers on parts of the data (specifically on data from inside the shop or on the way).

If the scripts cannot be executed, you might not have the right access permissions. On Linux, you can try `chmod +x 01_train_classifiers.sh`, `chmod +x 02_train_specialized_classifiers.sh` and `chmod +x 04_label_permutation_test.sh` (see below for when/how to use the last script).

In case you want to call the script differently, e.g. to speed up the computation or to try different parameters, you can pass the following arguments to `classifiers.train_classifier` (see the example call after this list):
`-t` trait index between 0 and 6
`-s` 1 to perform the label permutation test; omit or use 0 otherwise
`-l` lowest number of repetitions, e.g. 0
`-m` max number of repetitions, e.g. 100
`-a` using partial data only: 0 (all data), 1 (way data), 2 (shop data)
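
For example, to train trait 0 on all data for repetitions 0 to 9 (illustrative values):

```sh
python2.7 -m classifiers.train_classifier -t 0 -a 0 -l 0 -m 10
```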

In case of performance issues, it might be useful to check `_conf.py` and change `max_n_jobs` to restrict the number of jobs (i.e. threads) running in parallel.

The results will be saved in `results/A0` for all data, `results/A1` for way data only and `results/A2` for data from inside a shop. Each file is named `TTT_XXX.npz`, where TTT is the abbreviation of the personality trait (`O`, `C`, `E`, `A`, `N` for the Big Five and `CEI` or `PCS` for the two curiosity measures) and XXX enumerates the classifiers (remember that we always train 100 classifiers for evaluation because there is some randomness involved in the training process).
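
Each `.npz` file contains the arrays written by `classifiers/train_classifier.py` via `np.savez`: `f1`, `predictions`, `chosen_window_indices`, `feature_importances` and `detailed_predictions`. A quick way to inspect one (file name purely illustrative):

```python
import numpy as np
res = np.load('results/A0/O_000.npz')  # hypothetical result file
print res['f1'], res['predictions']
```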
3. __Evaluate Baselines__
* To train a classifier that always predicts the most frequent personality score range from its current training set, please execute `python 03_train_baseline.py`
* To train classifiers on permuted labels, i.e. to perform the so-called label permutation test, please execute `./04_label_permutation_test.sh`

## Citation

classifiers/__init__.py (new, empty file)

classifiers/train_classifier.py (new file, 265 lines)
@@ -0,0 +1,265 @@
import sys
import numpy as np
from config import conf
import os
import getopt
import threading
from sklearn.cross_validation import LabelKFold as LKF
from sklearn.cross_validation import StratifiedKFold as SKF
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, accuracy_score


def predict_all():
    # add threads to a list, and wait for all of them in the end
    threads = []

    for trait in trait_list:
        for si in xrange(low_repetitions, num_repetitions):
            fname = conf.get_result_filename(annotation_value, trait, shuffle_labels, si, add_suffix=True)
            if not os.path.exists(fname):
                thread = threading.Thread(target=save_predictions,
                                          args=(trait, conf.get_result_filename(annotation_value, trait, shuffle_labels, si), si))
                sys.stdout.flush()
                thread.start()
                threads.append(thread)
            else:
                print "existing solution:", fname

    print 'waiting for all threads to join'
    for thread in threads:
        thread.join()


def load_data(ws, annotation_value, t, chosen_features=None):
    x_file, y_file, id_file = conf.get_merged_feature_files(ws)
    if annotation_value == conf.annotation_all:
        x_ws = np.genfromtxt(x_file, delimiter=',', skip_header=1)
        y_ws = np.genfromtxt(y_file, delimiter=',', skip_header=1).astype(int)[:, t]
        ids_ws = np.genfromtxt(id_file, delimiter=',', skip_header=1).astype(int)[:, 0]
    elif annotation_value == conf.annotation_shop:
        x_ws = np.genfromtxt(x_file, delimiter=',', skip_header=1)
        y_ws = np.genfromtxt(y_file, delimiter=',', skip_header=1).astype(int)[:, t]
        ids_ws = np.genfromtxt(id_file, delimiter=',', skip_header=1).astype(int)

        # keep only the time windows annotated as shop time
        x_ws = x_ws[ids_ws[:, 1] == conf.time_window_annotation_shop, :]
        y_ws = y_ws[ids_ws[:, 1] == conf.time_window_annotation_shop]
        ids_ws = ids_ws[ids_ws[:, 1] == conf.time_window_annotation_shop, 0]
    elif annotation_value == conf.annotation_ways:
        x_ws = np.genfromtxt(x_file, delimiter=',', skip_header=1)
        y_ws = np.genfromtxt(y_file, delimiter=',', skip_header=1).astype(int)[:, t]
        ids_ws = np.genfromtxt(id_file, delimiter=',', skip_header=1).astype(int)

        # keep only the time windows annotated as one of the two ways
        way_mask = (ids_ws[:, 1] == conf.time_window_annotation_wayI) | (ids_ws[:, 1] == conf.time_window_annotation_wayII)
        x_ws = x_ws[way_mask, :]
        y_ws = y_ws[way_mask]
        ids_ws = ids_ws[way_mask, 0]
    else:
        print 'unknown annotation value', annotation_value
        print 'should be 0 (all data), 1 (way) or 2 (shop).'
        sys.exit(1)
    if chosen_features is not None:
        x_ws = x_ws[:, chosen_features]
    return x_ws, y_ws, ids_ws


def save_predictions(t, filename, rs):
    """
    train a classifier and write results to file
    """
    # create RandomForest classifier with parameters given in _conf.py
    clf = RandomForestClassifier(random_state=rs, verbose=verbosity, class_weight='balanced',
                                 n_estimators=conf.n_estimators, n_jobs=conf.max_n_jobs,
                                 max_features=conf.tree_max_features, max_depth=conf.tree_max_depth)

    # create StandardScaler that will be used to scale each feature
    # such that it has mean 0 and std 1 on the training set
    scaler = StandardScaler(with_std=True, with_mean=True)

    # use ground truth to create folds for outer cross validation in a stratified way, i.e. such that
    # each label occurs equally often
    participant_scores = np.loadtxt(conf.binned_personality_file, delimiter=',', skiprows=1, usecols=(t + 1,))
    outer_cv = SKF(participant_scores, conf.n_outer_folds, shuffle=True)

    # initialise arrays to save information
    feat_imp = np.zeros((len(outer_cv), conf.max_n_feat))  # feature importance
    preds = np.zeros((conf.n_participants), dtype=int)  # predictions on participant level
    detailed_preds = np.zeros((conf.n_participants), dtype=object)  # predictions on time window level, array of lists
    chosen_ws_is = np.zeros((conf.n_participants), dtype=int)  # indices of window sizes chosen in the inner cross validation

    for outer_i, (outer_train_participants, outer_test_participants) in enumerate(outer_cv):
        print
        print str(outer_i + 1) + '/' + str(conf.n_outer_folds)

        # find the best window size in the inner cv, and discard unimportant features
        inner_performance = np.zeros((conf.n_inner_folds, len(all_window_sizes)))
        inner_feat_importances = np.zeros((conf.max_n_feat, len(all_window_sizes)))

        for ws_i in xrange(0, len(all_window_sizes)):
            ws = all_window_sizes[ws_i]
            print '\t', 'ws ' + str(ws_i + 1) + '/' + str(len(all_window_sizes))

            # load data for this window size
            x_ws, y_ws, ids_ws = load_data(ws, annotation_value, t)
            if shuffle_labels:
                np.random.seed(316588 + 111 * t + rs)
                perm = np.random.permutation(len(y_ws))
                y_ws = y_ws[perm]
                ids_ws = ids_ws[perm]

            # cut out the outer train samples
            outer_train_samples = np.array([p in outer_train_participants for p in ids_ws])
            outer_train_x = x_ws[outer_train_samples, :]
            outer_train_y = y_ws[outer_train_samples]
            outer_train_y_ids = ids_ws[outer_train_samples]

            # build inner cross validation such that all samples of one person are either in training or testing
            inner_cv = LKF(outer_train_y_ids, n_folds=conf.n_inner_folds)
            for inner_i, (inner_train_indices, inner_test_indices) in enumerate(inner_cv):
                # create inner train and test samples. Note: both are taken from outer train samples!
                inner_x_train = outer_train_x[inner_train_indices, :]
                inner_y_train = outer_train_y[inner_train_indices]

                inner_x_test = outer_train_x[inner_test_indices, :]
                inner_y_test = outer_train_y[inner_test_indices]

                # fit scaler on train set and scale both train and test set with the result
                scaler.fit(inner_x_train)
                inner_x_train = scaler.transform(inner_x_train)
                inner_x_test = scaler.transform(inner_x_test)

                # fit Random Forest
                clf.fit(inner_x_train, inner_y_train)

                # save predictions and feature importance
                inner_pred = clf.predict(inner_x_test)
                inner_feat_importances[:, ws_i] += clf.feature_importances_

                # compute and save performance in terms of accuracy
                innerpreds = []
                innertruth = []
                inner_test_ids = outer_train_y_ids[inner_test_indices]
                for testp in np.unique(inner_test_ids):
                    # majority vote over all windows of participant testp
                    (values, counts) = np.unique(inner_pred[inner_test_ids == testp], return_counts=True)
                    ind = np.argmax(counts)
                    innerpreds.append(values[ind])
                    innertruth.append(inner_y_test[inner_test_ids == testp][0])
                inner_performance[inner_i, ws_i] = accuracy_score(np.array(innertruth), np.array(innerpreds))
                print ' ACC: ', '%.2f' % (inner_performance[inner_i, ws_i] * 100)
        # evaluate classifier on outer cv using the best window size from inner cv, and the most informative features
        chosen_ws_i = np.argmax(np.mean(inner_performance, axis=0))
        chosen_ws = all_window_sizes[chosen_ws_i]
        chosen_features = (inner_feat_importances[:, chosen_ws_i] / float(conf.n_inner_folds)) > 0.005

        # reload all data
        x, y, ids = load_data(chosen_ws, annotation_value, t, chosen_features=chosen_features)
        if shuffle_labels:
            np.random.seed(316588 + 111 * t + rs + 435786)
            perm = np.random.permutation(len(y))
            y = y[perm]
            ids = ids[perm]

        outer_train_samples = np.array([p in outer_train_participants for p in ids])
        outer_test_samples = np.array([p in outer_test_participants for p in ids])

        if outer_train_samples.size > 0 and outer_test_samples.size > 0:
            x_train = x[outer_train_samples, :]
            y_train = y[outer_train_samples]

            x_test = x[outer_test_samples, :]
            y_test = y[outer_test_samples]

            # scaling
            scaler.fit(x_train)
            x_train = scaler.transform(x_train)
            x_test = scaler.transform(x_test)

            # fit Random Forest
            clf.fit(x_train, y_train)
            pred = clf.predict(x_test)

            for testp in outer_test_participants:
                chosen_ws_is[testp] = chosen_ws_i
                if testp in ids[outer_test_samples]:
                    # majority voting over all samples that belong to participant testp
                    (values, counts) = np.unique(pred[ids[outer_test_samples] == testp], return_counts=True)
                    ind = np.argmax(counts)
                    preds[testp] = values[ind]
                    detailed_preds[testp] = list(pred[ids[outer_test_samples] == testp])
                else:
                    # participant does not occur in outer test set, e.g. because their time in the shop was too short
                    preds[testp] = -1
                    detailed_preds[testp] = []

            # save the resulting feature importance
            feat_imp[outer_i, chosen_features] = clf.feature_importances_

        else:
            for testp in outer_test_participants:
                chosen_ws_is[testp] = -1
                preds[testp] = -1  # no prediction possible for this fold; filtered out below via preds > 0
                detailed_preds[testp] = []
            feat_imp[outer_i, chosen_features] = -1

    # compute resulting F1 score and save to file
    nonzero_preds = preds[preds > 0]
    nonzero_truth = participant_scores[preds > 0]
    f1 = f1_score(nonzero_truth, nonzero_preds, average='macro')
    np.savez(filename, f1=f1, predictions=preds, chosen_window_indices=chosen_ws_is,
             feature_importances=feat_imp, detailed_predictions=detailed_preds)
    print f1, 'written', filename

# If the program is run directly:
if __name__ == "__main__":
    try:
        opts, args = getopt.getopt(sys.argv[1:], "t:m:l:s:a:", [])
    except getopt.GetoptError:
        print 'valid arguments:'
        print '-t trait index'
        print '-s 1 to perform label permutation test, do not pass -s or use -s 0 otherwise'
        print '-l lowest number of repetitions'
        print '-m max number of repetitions'
        print '-a using partial data only: 0 (all data), 1 (way data), 2 (shop data)'
        sys.exit(2)

    low_repetitions = 0
    num_repetitions = conf.max_n_iter
    verbosity = 0
    shuffle_labels = False
    annotation_value = conf.annotation_all
    trait_list = xrange(0, conf.n_traits)

    for opt, arg in opts:
        if opt == '-t':
            t = int(arg)
            assert t in trait_list
            trait_list = [t]
        elif opt == '-a':
            annotation_value = int(arg)
            assert annotation_value in conf.annotation_values
        elif opt == '-s':
            shuffle_labels = bool(int(arg))
        elif opt == '-m':
            num_repetitions = int(arg)
        elif opt == '-l':
            low_repetitions = int(arg)
        else:
            print 'valid arguments:'
            print '-t trait index'
            print '-s 1 to perform label permutation test, do not pass -s or use -s 0 otherwise'
            print '-l lowest number of repetitions'
            print '-m max number of repetitions'
            print '-a using partial data only: 0 (all data), 1 (way data), 2 (shop data)'
            sys.exit(2)

    result_folder = conf.get_result_folder(annotation_value)
    if not os.path.exists(result_folder):
        os.makedirs(result_folder)

    # restrict window sizes in case shop data should be used
    if annotation_value == conf.annotation_shop:
        all_window_sizes = conf.all_shop_window_sizes
    else:
        all_window_sizes = conf.all_window_sizes

    predict_all()
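
Note that both the inner and the outer evaluation aggregate window-level predictions into one participant-level prediction by majority vote, using `np.unique(..., return_counts=True)`. A minimal standalone sketch with toy values:

```python
import numpy as np

pred = np.array([2, 2, 3, 2, 1])  # toy window-level predictions for one participant
values, counts = np.unique(pred, return_counts=True)
majority = values[np.argmax(counts)]  # most frequent prediction -> 2
```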