In [1]:
import pandas as pd
import numpy as np
import os, pdb
from sklearn.model_selection import GridSearchCV 
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from tensorflow import keras
from keras.preprocessing.sequence import pad_sequences

2021-09-28 16:10:28.497166: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.10.1


In [2]:
study_data_path = "../IntentData/"
data = pd.read_pickle(study_data_path + "/Preprocessing_data/clean_data.pkl")
Task_IDs = np.arange(7).tolist()
StartIndexOffset = 0 #if set to 5 ignore first 5 elements
EndIndexOffset = 0 #if set to 5 ignore last 5 elements

In [3]:
data.Rule.unique()
data.columns
data.Type.unique()

array(['Cmd', 'Toolbar'], dtype=object)

In [4]:
# grouping by part is needed to have one ruleset for the whole part
# Participant [1,16]
# Repeat for 5 times [1,5]
# ???????? [0,6]
g = data.groupby(["PID", "Part", "TaskID"])

In [5]:
param_grid = {'n_estimators': [10,50,100],  
              'max_depth': [10,20,30]}

grid = GridSearchCV(RandomForestClassifier(), param_grid, refit = True, verbose = 0, return_train_score=True) 

In [6]:
def createTrainTest(test_IDs, task_IDs, start_index_offset, end_index_offset, shapes=False):
    assert isinstance(test_IDs, list)
    assert isinstance(task_IDs, list)
    # Fill data arrays
    y_train = []
    x_train = []
    y_test = []
    x_test = []
    for current in g.groups.keys():
        c = g.get_group(current)
        if (c.TaskID.isin(task_IDs).all()):
            new_rule = c.Rule.unique()[0]
            if end_index_offset == 0:
                new_data = c.Event.values[start_index_offset:]
            else:
                new_data = c.Event.values[start_index_offset:-end_index_offset]
            if (c.PID.isin(test_IDs).all()):
                y_test.append(new_rule)
                x_test.append(new_data)
            else:
                y_train.append(new_rule)
                x_train.append(new_data)
    x_train = np.array(x_train)
    y_train = np.array(y_train)
    x_test = np.array(x_test)
    y_test = np.array(y_test)
    print('x_train\n',x_train)
    print('y_train\n',y_train)
    print('x_test\n',x_test)
    print('y_test\n',y_test)
    pdb.set_trace()
    if (shapes):
        print(x_train.shape)
        print(y_train.shape)
        print(x_test.shape)
        print(y_test.shape)
        print(np.unique(y_test))
        print(np.unique(y_train))
    return (x_train, y_train, x_test, y_test)

def runSVMS(train_test, maxlen=None, plots=False, last_elements=False):
    x_train, y_train, x_test, y_test = train_test
    # Get maxlen to pad and pad
    if (maxlen==None):
        maxlen = 0
        for d in np.concatenate((x_train,x_test)):
            if len(d) > maxlen:
                maxlen = len(d)
    
    truncating_elements = "post"
    if last_elements:
        truncating_elements = "pre"

    x_train = keras.preprocessing.sequence.pad_sequences(x_train, maxlen=maxlen, dtype='int32', padding='post', truncating=truncating_elements, value=0)
    x_test = keras.preprocessing.sequence.pad_sequences(x_test, maxlen=maxlen, dtype='int32', padding='post', truncating=truncating_elements, value=0)

    # fitting the model for grid search 
    grid.fit(x_train, y_train) 

    # print how our model looks after hyper-parameter tuning
    if (plots==True):
        print(grid.best_estimator_) 

    # Predict with best SVM
    pred = grid.predict(x_test)

    return accuracy_score(pred, y_test), pred, y_test 

In [None]:
accuracies_full = dict()
accuracies_small = dict()
accuracies_last = dict()

for current_PID in sorted(data.PID.unique()):
    accuracies_full[current_PID], pred_label, test_label = runSVMS(createTrainTest([current_PID], Task_IDs, StartIndexOffset, EndIndexOffset, shapes=True))
    # Only the first 5
    accuracies_small[current_PID], pred_label, test_label = runSVMS(createTrainTest([current_PID], Task_IDs, StartIndexOffset, EndIndexOffset, shapes=True), 5)
    # Only the last 5
    accuracies_last[current_PID], pred_label, test_label = runSVMS(createTrainTest([current_PID], Task_IDs, StartIndexOffset, EndIndexOffset, shapes=True), 5, last_elements=True)
    #pdb.set_trace()
print(accuracies_full)
print(accuracies_small)
print(accuracies_last)
print("mean full", np.array(list(accuracies_full.values())).mean())
print("mean small", np.array(list(accuracies_small.values())).mean())
print("mean last", np.array(list(accuracies_last.values())).mean())

  x_train = np.array(x_train)
  x_test = np.array(x_test)


x_train
 [array([2, 7, 7, 7, 7, 7, 7, 2, 6, 6, 6, 2, 2, 2])
 array([4, 1, 4, 1, 1, 1, 1, 1, 7, 7, 7, 7, 7, 7])
 array([5, 7, 5, 7, 5, 7, 7, 7, 7, 1, 1, 1, 1, 1, 1])
 array([3, 3, 3, 3, 3, 3, 6, 6, 5, 5, 5, 5, 5, 5, 5])
 array([5, 3, 5, 3, 3, 5, 3, 5, 3, 3, 4, 4, 4, 4, 4, 4])
 array([2, 6, 2, 6, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1])
 array([2, 3, 2, 3, 2, 3, 2, 2, 2, 4, 4, 4, 4, 4, 4])
 array([6, 6, 6, 2, 2, 2, 7, 7, 7, 7, 7, 7, 7])
 array([1, 4, 4, 7, 4, 1, 1, 4, 4, 4, 7, 7, 7, 7, 7])
 array([7, 5, 7, 1, 5, 7, 7, 5, 7, 7, 1, 1, 1, 1, 1])
 array([3, 6, 3, 5, 3, 6, 3, 6, 3, 6, 3, 5, 5, 5, 5, 5])
 array([3, 5, 3, 4, 3, 5, 3, 3, 3, 4, 4, 4, 4, 4])
 array([2, 6, 2, 1, 2, 6, 2, 6, 2, 2, 1, 1, 1, 1, 1])
 array([2, 3, 3, 4, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4])
 array([6, 2, 7, 6, 6, 2, 2, 7, 7, 7, 7, 7])
 array([4, 1, 4, 7, 4, 1, 4, 1, 7, 7, 7, 7, 7, 4, 4])
 array([5, 7, 7, 1, 5, 7, 7, 7, 7, 1, 1, 1, 1, 1])
 array([3, 6, 3, 5, 3, 6, 3, 3, 3, 5, 5, 5, 5, 5])
 array([5, 3, 3, 4, 3, 5, 3, 5, 3, 5, 3, 4, 4, 4,

ipdb> x_train.shape
(525,)
ipdb> x_test.shape
(35,)


In [None]:
len(g.groups.keys())
g.groups.keys()