InferringIntention/keyboard_and_mouse/dataset/.ipynb_checkpoints/03-NextActionPrediction-checkpoint.ipynb
2024-03-24 23:42:27 +01:00

36 KiB
Raw Blame History

In [1]:
import pandas as pd
import numpy as np
import datetime
import time,pdb
import json
import random
import statistics
import matplotlib.pyplot as plt

import tensorflow as tf
from tensorflow import keras
from sklearn import svm
from sklearn.model_selection import GridSearchCV 
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from tensorflow.keras.layers import *
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import *
from tensorflow.keras.callbacks import TensorBoard, ModelCheckpoint, ReduceLROnPlateau, Callback
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score
import tqdm
from multiprocessing import Pool
import os
from tensorflow.compat.v1.keras.layers import Bidirectional, CuDNNLSTM
2021-09-27 15:31:30.518074: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.10.1
In [18]:
study_data_path = "../IntentData/"
data = pd.read_pickle(study_data_path + "/Preprocessing_data/clean_data.pkl")
#val_data = pd.read_pickle(study_data_path + "/Preprocessing_data/clean_data_condition2.pkl")

print("available PIDs", data.PID.unique())

print("available TaskIDs", data.TaskID.unique())

data.Event.unique()
data
available PIDs [ 1.  2.  3.  4.  5.  6.  7.  8.  9. 10. 11. 12. 13. 14. 15. 16.]
available TaskIDs [0. 1. 2. 3. 4. 5. 6.]
Out[18]:
Timestamp Event TaskID Part PID TextRule Rule Type
0 1.575388e+12 4 0.0 1.0 1.0 {'Title': ['1', 'Indent', 'and', 'Italic'], 'S... 3.0 Cmd
1 1.575388e+12 1 0.0 1.0 1.0 {'Title': ['1', 'Indent', 'and', 'Italic'], 'S... 3.0 Toolbar
2 1.575388e+12 1 0.0 1.0 1.0 {'Title': ['1', 'Indent', 'and', 'Italic'], 'S... 3.0 Cmd
3 1.575388e+12 4 0.0 1.0 1.0 {'Title': ['1', 'Indent', 'and', 'Italic'], 'S... 3.0 Cmd
4 1.575388e+12 4 0.0 1.0 1.0 {'Title': ['1', 'Indent', 'and', 'Italic'], 'S... 3.0 Cmd
... ... ... ... ... ... ... ... ...
8376 1.603898e+12 7 6.0 5.0 16.0 {'Title': ['Size', 'Big'], 'Subtitle': ['Bold'... 5.0 Toolbar
8377 1.603898e+12 2 6.0 5.0 16.0 {'Title': ['Size', 'Big'], 'Subtitle': ['Bold'... 5.0 Cmd
8378 1.603898e+12 2 6.0 5.0 16.0 {'Title': ['Size', 'Big'], 'Subtitle': ['Bold'... 5.0 Cmd
8379 1.603898e+12 6 6.0 5.0 16.0 {'Title': ['Size', 'Big'], 'Subtitle': ['Bold'... 5.0 Toolbar
8380 1.603898e+12 6 6.0 5.0 16.0 {'Title': ['Size', 'Big'], 'Subtitle': ['Bold'... 5.0 Toolbar

8381 rows × 8 columns

In [3]:
data.groupby(["PID", "Part", "TaskID"])["Event"].count().describe()
Out[3]:
count    560.000000
mean      14.966071
std        2.195440
min        8.000000
25%       14.000000
50%       15.000000
75%       16.000000
max       28.000000
Name: Event, dtype: float64
In [4]:
Task_IDs = list(range(0,7))

# grouping by part is needed to have one ruleset for the whole part
g = data.groupby(["PID", "Part", "TaskID"])
df_all = []
In [5]:
def createTrainTestalaSven(test_IDs, task_IDs, window_size, stride, shapes=False, val_IDs=None):
    if not isinstance(test_IDs, list):
        raise ValueError("Test_IDs are not a list")
    if not isinstance(task_IDs, list):
        raise ValueError("Task_IDs are not a list")
    # Fill data arrays
    all_elem = []
    for current in g.groups.keys():
        c = g.get_group(current)
        if (c.TaskID.isin(task_IDs).all()):
          
            new_data = c.Event.values
            stepper = 0
            while stepper <= (len(new_data)-window_size-1):
                tmp = new_data[stepper:stepper + window_size]
                x = tmp[:-1]
                y = tmp[-1]
                stepper += stride
                
                if (c.PID.isin(test_IDs).all()):
                    all_elem.append(["Test", x, y])
                elif (c.PID.isin(val_IDs).all()):
                    all_elem.append(["Val", x, y])
                else:
                    all_elem.append(["Train", x, y])
    df_tmp = pd.DataFrame(all_elem, columns =["Split", "X", "Y"])
    turbo = []
    for s in df_tmp.Split.unique():
        dfX = df_tmp[df_tmp.Split == s]
        max_amount = dfX.groupby(["Y"]).count().max().X
        for y in dfX.Y.unique():
            df_turbotmp = dfX[dfX.Y == y]
            turbo.append(df_turbotmp)
            turbo.append(df_turbotmp.sample(max_amount-len(df_turbotmp), replace=True))
    #         if len(df_turbotmp) < max_amount:

    df_tmp = pd.concat(turbo)
    x_train, y_train = df_tmp[df_tmp.Split == "Train"].X.values, df_tmp[df_tmp.Split == "Train"].Y.values
    x_test, y_test = df_tmp[df_tmp.Split == "Test"].X.values, df_tmp[df_tmp.Split == "Test"].Y.values
    x_val, y_val = df_tmp[df_tmp.Split == "Val"].X.values, df_tmp[df_tmp.Split == "Val"].Y.values
    
    x_train = np.expand_dims(np.stack(x_train), axis=2)
    y_train = np.array(y_train)
    x_test = np.expand_dims(np.stack(x_test), axis=2)
    y_test = np.array(y_test)
    if len(x_val) > 0:
        x_val = np.expand_dims(np.stack(x_val), axis=2)
        y_val = np.array(y_val)
        return(x_train, y_train, x_test, y_test, x_val, y_val)
    return(x_train, y_train, x_test, y_test)
In [11]:
def createTrainTest(test_IDs, task_IDs, window_size, stride, shapes=False, val_IDs=None):
    if not isinstance(test_IDs, list):
        raise ValueError("Test_IDs are not a list")
    if not isinstance(task_IDs, list):
        raise ValueError("Task_IDs are not a list")
    # Fill data arrays
    y_train = []
    x_train = []
    y_test = []
    x_test = []
    x_val = []
    y_val = []
 
    for current in g.groups.keys():
        c = g.get_group(current)
        if (c.TaskID.isin(task_IDs).all()):
          
            new_data = c.Event.values
            stepper = 0
            while stepper <= (len(new_data)-window_size-1):
                tmp = new_data[stepper:stepper + window_size]
                pdb.set_trace()
                x = tmp[:-1]
                y = tmp[-1]
                stepper += stride
                if (c.PID.isin(test_IDs).all()):
                    if y == 6:
                        y_test.append(y)
                        x_test.append(x)
                    y_test.append(y)
                    x_test.append(x)
                elif (c.PID.isin(val_IDs).all()):
                    if y == 6:
                        y_val.append(y)
                        x_val.append(x)
                    y_val.append(y)
                    x_val.append(x)
                else:
                    if y == 6:
                        y_train.append(y)
                        x_train.append(x)
                    y_train.append(y)
                    x_train.append(x)
    x_train = np.array(x_train)
    y_train = np.array(y_train)
    x_test = np.array(x_test)
    y_test = np.array(y_test)
    x_val = np.array(x_val)
    y_val = np.array(y_val)
    pdb.set_trace()
    if (shapes):
        print(x_train.shape)
        print(y_train.shape)
        print(x_test.shape)
        print(y_test.shape)
        print(x_val.shape)
        print(y_val.shape)
        print(np.unique(y_test))
        print(np.unique(y_train))
    if len(x_val) > 0:
        return(x_train, y_train, x_test, y_test, x_val, y_val)
    return (x_train, y_train, x_test, y_test)
In [12]:
maxlen = 1000
lens = []
for current in g.groups.keys():
    c = g.get_group(current)
    lens.append(len(c.Event.values))
    maxlen = min(maxlen, len(c.Event.values))
In [13]:
# Number of trees in random forest
n_estimators = np.arange(5,100, 5)
# Number of features to consider at every split
max_features = ['sqrt']
# Maximum number of levels in tree
max_depth = np.arange(5,100, 5)
# Minimum number of samples required to split a node
min_samples_split = np.arange(2,10, 1)
# Minimum number of samples required at each leaf node
min_samples_leaf = np.arange(2,5, 1)
# Method of selecting samples for training each tree
bootstrap = [True, False]

# Create the random grid
param_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

grid = GridSearchCV(RandomForestClassifier(), param_grid, refit = True, verbose = 0, return_train_score=True)
In [14]:
def doTrainSlideWindowNoPad(currentPid):
    print(f"doTrain: {currentPid}")
    dfs = []
    for window_size in range(8, 15):  
        (x_train, y_train, x_test, y_test) = createTrainTest([currentPid], Task_IDs, window_size, 1, False, [200])
        print(f"doTrain: created TrainTestsplit")

    #     print("window_size", 5, "PID", currentPid, "samples", x_train.shape[0], "generated_samples", "samples", x_train_window.shape[0])

        grid.fit(x_train, y_train)
        print("fitted")
    #     y_pred = grid.predict(x_test)

        df_params = pd.DataFrame(grid.cv_results_["params"])
        df_params["Mean_test"] = grid.cv_results_["mean_test_score"]
        df_params["Mean_train"] = grid.cv_results_["mean_train_score"]
        df_params["STD_test"] = grid.cv_results_["std_test_score"]
        df_params["STD_train"] = grid.cv_results_["std_train_score"]
        df_params['Window_Size'] = window_size
        df_params['PID'] = currentPid
    #     df_params["Accuracy"] = accuracy_score(y_pred, y_test)
        dfs.append(df_params)

    return pd.concat(dfs)
In [15]:
doTrainSlideWindowNoPad(1)
doTrain: 1
> /tmp/ipykernel_90176/2602038955.py(23)createTrainTest()
     21                 tmp = new_data[stepper:stepper + window_size]
     22                 pdb.set_trace()
---> 23                 x = tmp[:-1]
     24                 y = tmp[-1]
     25                 stepper += stride

ipdb> tmp
array([4, 1, 1, 4, 4, 7, 7, 7])
ipdb> new_data
array([4, 1, 1, 4, 4, 7, 7, 7, 7, 7, 7, 4, 1, 4, 4, 4])
ipdb> current
(1.0, 1.0, 0.0)
ipdb> print(c)
       Timestamp  Event  TaskID  Part  PID  \
0   1.575388e+12      4     0.0   1.0  1.0   
1   1.575388e+12      1     0.0   1.0  1.0   
2   1.575388e+12      1     0.0   1.0  1.0   
3   1.575388e+12      4     0.0   1.0  1.0   
4   1.575388e+12      4     0.0   1.0  1.0   
5   1.575388e+12      7     0.0   1.0  1.0   
6   1.575388e+12      7     0.0   1.0  1.0   
7   1.575388e+12      7     0.0   1.0  1.0   
8   1.575388e+12      7     0.0   1.0  1.0   
9   1.575388e+12      7     0.0   1.0  1.0   
10  1.575388e+12      7     0.0   1.0  1.0   
11  1.575388e+12      4     0.0   1.0  1.0   
12  1.575388e+12      1     0.0   1.0  1.0   
13  1.575388e+12      4     0.0   1.0  1.0   
14  1.575388e+12      4     0.0   1.0  1.0   
15  1.575388e+12      4     0.0   1.0  1.0   

                                             TextRule  Rule     Type  
0   {'Title': ['1', 'Indent', 'and', 'Italic'], 'S...   3.0      Cmd  
1   {'Title': ['1', 'Indent', 'and', 'Italic'], 'S...   3.0  Toolbar  
2   {'Title': ['1', 'Indent', 'and', 'Italic'], 'S...   3.0      Cmd  
3   {'Title': ['1', 'Indent', 'and', 'Italic'], 'S...   3.0      Cmd  
4   {'Title': ['1', 'Indent', 'and', 'Italic'], 'S...   3.0      Cmd  
5   {'Title': ['1', 'Indent', 'and', 'Italic'], 'S...   3.0  Toolbar  
6   {'Title': ['1', 'Indent', 'and', 'Italic'], 'S...   3.0  Toolbar  
7   {'Title': ['1', 'Indent', 'and', 'Italic'], 'S...   3.0  Toolbar  
8   {'Title': ['1', 'Indent', 'and', 'Italic'], 'S...   3.0  Toolbar  
9   {'Title': ['1', 'Indent', 'and', 'Italic'], 'S...   3.0  Toolbar  
10  {'Title': ['1', 'Indent', 'and', 'Italic'], 'S...   3.0  Toolbar  
11  {'Title': ['1', 'Indent', 'and', 'Italic'], 'S...   3.0      Cmd  
12  {'Title': ['1', 'Indent', 'and', 'Italic'], 'S...   3.0  Toolbar  
13  {'Title': ['1', 'Indent', 'and', 'Italic'], 'S...   3.0      Cmd  
14  {'Title': ['1', 'Indent', 'and', 'Italic'], 'S...   3.0      Cmd  
15  {'Title': ['1', 'Indent', 'and', 'Italic'], 'S...   3.0      Cmd  
ipdb> print(c.TextRule)
0     {'Title': ['1', 'Indent', 'and', 'Italic'], 'S...
1     {'Title': ['1', 'Indent', 'and', 'Italic'], 'S...
2     {'Title': ['1', 'Indent', 'and', 'Italic'], 'S...
3     {'Title': ['1', 'Indent', 'and', 'Italic'], 'S...
4     {'Title': ['1', 'Indent', 'and', 'Italic'], 'S...
5     {'Title': ['1', 'Indent', 'and', 'Italic'], 'S...
6     {'Title': ['1', 'Indent', 'and', 'Italic'], 'S...
7     {'Title': ['1', 'Indent', 'and', 'Italic'], 'S...
8     {'Title': ['1', 'Indent', 'and', 'Italic'], 'S...
9     {'Title': ['1', 'Indent', 'and', 'Italic'], 'S...
10    {'Title': ['1', 'Indent', 'and', 'Italic'], 'S...
11    {'Title': ['1', 'Indent', 'and', 'Italic'], 'S...
12    {'Title': ['1', 'Indent', 'and', 'Italic'], 'S...
13    {'Title': ['1', 'Indent', 'and', 'Italic'], 'S...
14    {'Title': ['1', 'Indent', 'and', 'Italic'], 'S...
15    {'Title': ['1', 'Indent', 'and', 'Italic'], 'S...
Name: TextRule, dtype: object
ipdb> print(c.Event)
0     4
1     1
2     1
3     4
4     4
5     7
6     7
7     7
8     7
9     7
10    7
11    4
12    1
13    4
14    4
15    4
Name: Event, dtype: int64
ipdb> val
*** NameError: name 'val' is not defined
ipdb> val_IDs
[200]
--KeyboardInterrupt--

KeyboardInterrupt: Interrupted by user
> /tmp/ipykernel_90176/2602038955.py(22)createTrainTest()
     20             while stepper <= (len(new_data)-window_size-1):
     21                 tmp = new_data[stepper:stepper + window_size]
---> 22                 pdb.set_trace()
     23                 x = tmp[:-1]
     24                 y = tmp[-1]

--KeyboardInterrupt--

KeyboardInterrupt: Interrupted by user
> /tmp/ipykernel_90176/2602038955.py(23)createTrainTest()
     21                 tmp = new_data[stepper:stepper + window_size]
     22                 pdb.set_trace()
---> 23                 x = tmp[:-1]
     24                 y = tmp[-1]
     25                 stepper += stride

ipdb> q
---------------------------------------------------------------------------
BdbQuit                                   Traceback (most recent call last)
/tmp/ipykernel_90176/1128965594.py in <module>
----> 1 doTrainSlideWindowNoPad(1)

/tmp/ipykernel_90176/2629087375.py in doTrainSlideWindowNoPad(currentPid)
      3     dfs = []
      4     for window_size in range(8, 15):
----> 5         (x_train, y_train, x_test, y_test) = createTrainTest([currentPid], Task_IDs, window_size, 1, False, [200])
      6         print(f"doTrain: created TrainTestsplit")
      7 

/tmp/ipykernel_90176/2602038955.py in createTrainTest(test_IDs, task_IDs, window_size, stride, shapes, val_IDs)
     21                 tmp = new_data[stepper:stepper + window_size]
     22                 pdb.set_trace()
---> 23                 x = tmp[:-1]
     24                 y = tmp[-1]
     25                 stepper += stride

/tmp/ipykernel_90176/2602038955.py in createTrainTest(test_IDs, task_IDs, window_size, stride, shapes, val_IDs)
     21                 tmp = new_data[stepper:stepper + window_size]
     22                 pdb.set_trace()
---> 23                 x = tmp[:-1]
     24                 y = tmp[-1]
     25                 stepper += stride

~/miniconda3/envs/intentPrediction/lib/python3.9/bdb.py in trace_dispatch(self, frame, event, arg)
     86             return # None
     87         if event == 'line':
---> 88             return self.dispatch_line(frame)
     89         if event == 'call':
     90             return self.dispatch_call(frame, arg)

~/miniconda3/envs/intentPrediction/lib/python3.9/bdb.py in dispatch_line(self, frame)
    111         if self.stop_here(frame) or self.break_here(frame):
    112             self.user_line(frame)
--> 113             if self.quitting: raise BdbQuit
    114         return self.trace_dispatch
    115 

BdbQuit: