36 KiB
36 KiB
In [1]:
import pandas as pd import numpy as np import datetime import time,pdb import json import random import statistics import matplotlib.pyplot as plt import tensorflow as tf from tensorflow import keras from sklearn import svm from sklearn.model_selection import GridSearchCV from sklearn.ensemble import RandomForestClassifier from sklearn.metrics import accuracy_score from tensorflow.keras.layers import * from sklearn.model_selection import train_test_split from tensorflow.keras.models import Sequential from tensorflow.keras.optimizers import * from tensorflow.keras.callbacks import TensorBoard, ModelCheckpoint, ReduceLROnPlateau, Callback from tensorflow.keras.preprocessing.sequence import pad_sequences from sklearn.neighbors import KNeighborsClassifier from sklearn.metrics import mean_squared_error from sklearn.metrics import accuracy_score import tqdm from multiprocessing import Pool import os from tensorflow.compat.v1.keras.layers import Bidirectional, CuDNNLSTM
2021-09-27 15:31:30.518074: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.10.1
In [18]:
study_data_path = "../IntentData/" data = pd.read_pickle(study_data_path + "/Preprocessing_data/clean_data.pkl") #val_data = pd.read_pickle(study_data_path + "/Preprocessing_data/clean_data_condition2.pkl") print("available PIDs", data.PID.unique()) print("available TaskIDs", data.TaskID.unique()) data.Event.unique() data
available PIDs [ 1. 2. 3. 4. 5. 6. 7. 8. 9. 10. 11. 12. 13. 14. 15. 16.] available TaskIDs [0. 1. 2. 3. 4. 5. 6.]
Out[18]:
Timestamp | Event | TaskID | Part | PID | TextRule | Rule | Type | |
---|---|---|---|---|---|---|---|---|
0 | 1.575388e+12 | 4 | 0.0 | 1.0 | 1.0 | {'Title': ['1', 'Indent', 'and', 'Italic'], 'S... | 3.0 | Cmd |
1 | 1.575388e+12 | 1 | 0.0 | 1.0 | 1.0 | {'Title': ['1', 'Indent', 'and', 'Italic'], 'S... | 3.0 | Toolbar |
2 | 1.575388e+12 | 1 | 0.0 | 1.0 | 1.0 | {'Title': ['1', 'Indent', 'and', 'Italic'], 'S... | 3.0 | Cmd |
3 | 1.575388e+12 | 4 | 0.0 | 1.0 | 1.0 | {'Title': ['1', 'Indent', 'and', 'Italic'], 'S... | 3.0 | Cmd |
4 | 1.575388e+12 | 4 | 0.0 | 1.0 | 1.0 | {'Title': ['1', 'Indent', 'and', 'Italic'], 'S... | 3.0 | Cmd |
... | ... | ... | ... | ... | ... | ... | ... | ... |
8376 | 1.603898e+12 | 7 | 6.0 | 5.0 | 16.0 | {'Title': ['Size', 'Big'], 'Subtitle': ['Bold'... | 5.0 | Toolbar |
8377 | 1.603898e+12 | 2 | 6.0 | 5.0 | 16.0 | {'Title': ['Size', 'Big'], 'Subtitle': ['Bold'... | 5.0 | Cmd |
8378 | 1.603898e+12 | 2 | 6.0 | 5.0 | 16.0 | {'Title': ['Size', 'Big'], 'Subtitle': ['Bold'... | 5.0 | Cmd |
8379 | 1.603898e+12 | 6 | 6.0 | 5.0 | 16.0 | {'Title': ['Size', 'Big'], 'Subtitle': ['Bold'... | 5.0 | Toolbar |
8380 | 1.603898e+12 | 6 | 6.0 | 5.0 | 16.0 | {'Title': ['Size', 'Big'], 'Subtitle': ['Bold'... | 5.0 | Toolbar |
8381 rows × 8 columns
In [3]:
data.groupby(["PID", "Part", "TaskID"])["Event"].count().describe()
Out[3]:
count 560.000000 mean 14.966071 std 2.195440 min 8.000000 25% 14.000000 50% 15.000000 75% 16.000000 max 28.000000 Name: Event, dtype: float64
In [4]:
Task_IDs = list(range(0,7)) # grouping by part is needed to have one ruleset for the whole part g = data.groupby(["PID", "Part", "TaskID"]) df_all = []
In [5]:
def createTrainTestalaSven(test_IDs, task_IDs, window_size, stride, shapes=False, val_IDs=None): if not isinstance(test_IDs, list): raise ValueError("Test_IDs are not a list") if not isinstance(task_IDs, list): raise ValueError("Task_IDs are not a list") # Fill data arrays all_elem = [] for current in g.groups.keys(): c = g.get_group(current) if (c.TaskID.isin(task_IDs).all()): new_data = c.Event.values stepper = 0 while stepper <= (len(new_data)-window_size-1): tmp = new_data[stepper:stepper + window_size] x = tmp[:-1] y = tmp[-1] stepper += stride if (c.PID.isin(test_IDs).all()): all_elem.append(["Test", x, y]) elif (c.PID.isin(val_IDs).all()): all_elem.append(["Val", x, y]) else: all_elem.append(["Train", x, y]) df_tmp = pd.DataFrame(all_elem, columns =["Split", "X", "Y"]) turbo = [] for s in df_tmp.Split.unique(): dfX = df_tmp[df_tmp.Split == s] max_amount = dfX.groupby(["Y"]).count().max().X for y in dfX.Y.unique(): df_turbotmp = dfX[dfX.Y == y] turbo.append(df_turbotmp) turbo.append(df_turbotmp.sample(max_amount-len(df_turbotmp), replace=True)) # if len(df_turbotmp) < max_amount: df_tmp = pd.concat(turbo) x_train, y_train = df_tmp[df_tmp.Split == "Train"].X.values, df_tmp[df_tmp.Split == "Train"].Y.values x_test, y_test = df_tmp[df_tmp.Split == "Test"].X.values, df_tmp[df_tmp.Split == "Test"].Y.values x_val, y_val = df_tmp[df_tmp.Split == "Val"].X.values, df_tmp[df_tmp.Split == "Val"].Y.values x_train = np.expand_dims(np.stack(x_train), axis=2) y_train = np.array(y_train) x_test = np.expand_dims(np.stack(x_test), axis=2) y_test = np.array(y_test) if len(x_val) > 0: x_val = np.expand_dims(np.stack(x_val), axis=2) y_val = np.array(y_val) return(x_train, y_train, x_test, y_test, x_val, y_val) return(x_train, y_train, x_test, y_test)
In [11]:
def createTrainTest(test_IDs, task_IDs, window_size, stride, shapes=False, val_IDs=None): if not isinstance(test_IDs, list): raise ValueError("Test_IDs are not a list") if not isinstance(task_IDs, list): raise ValueError("Task_IDs are not a list") # Fill data arrays y_train = [] x_train = [] y_test = [] x_test = [] x_val = [] y_val = [] for current in g.groups.keys(): c = g.get_group(current) if (c.TaskID.isin(task_IDs).all()): new_data = c.Event.values stepper = 0 while stepper <= (len(new_data)-window_size-1): tmp = new_data[stepper:stepper + window_size] pdb.set_trace() x = tmp[:-1] y = tmp[-1] stepper += stride if (c.PID.isin(test_IDs).all()): if y == 6: y_test.append(y) x_test.append(x) y_test.append(y) x_test.append(x) elif (c.PID.isin(val_IDs).all()): if y == 6: y_val.append(y) x_val.append(x) y_val.append(y) x_val.append(x) else: if y == 6: y_train.append(y) x_train.append(x) y_train.append(y) x_train.append(x) x_train = np.array(x_train) y_train = np.array(y_train) x_test = np.array(x_test) y_test = np.array(y_test) x_val = np.array(x_val) y_val = np.array(y_val) pdb.set_trace() if (shapes): print(x_train.shape) print(y_train.shape) print(x_test.shape) print(y_test.shape) print(x_val.shape) print(y_val.shape) print(np.unique(y_test)) print(np.unique(y_train)) if len(x_val) > 0: return(x_train, y_train, x_test, y_test, x_val, y_val) return (x_train, y_train, x_test, y_test)
In [12]:
maxlen = 1000 lens = [] for current in g.groups.keys(): c = g.get_group(current) lens.append(len(c.Event.values)) maxlen = min(maxlen, len(c.Event.values))
In [13]:
# Number of trees in random forest n_estimators = np.arange(5,100, 5) # Number of features to consider at every split max_features = ['sqrt'] # Maximum number of levels in tree max_depth = np.arange(5,100, 5) # Minimum number of samples required to split a node min_samples_split = np.arange(2,10, 1) # Minimum number of samples required at each leaf node min_samples_leaf = np.arange(2,5, 1) # Method of selecting samples for training each tree bootstrap = [True, False] # Create the random grid param_grid = {'n_estimators': n_estimators, 'max_features': max_features, 'max_depth': max_depth, 'min_samples_split': min_samples_split, 'min_samples_leaf': min_samples_leaf, 'bootstrap': bootstrap} grid = GridSearchCV(RandomForestClassifier(), param_grid, refit = True, verbose = 0, return_train_score=True)
In [14]:
def doTrainSlideWindowNoPad(currentPid): print(f"doTrain: {currentPid}") dfs = [] for window_size in range(8, 15): (x_train, y_train, x_test, y_test) = createTrainTest([currentPid], Task_IDs, window_size, 1, False, [200]) print(f"doTrain: created TrainTestsplit") # print("window_size", 5, "PID", currentPid, "samples", x_train.shape[0], "generated_samples", "samples", x_train_window.shape[0]) grid.fit(x_train, y_train) print("fitted") # y_pred = grid.predict(x_test) df_params = pd.DataFrame(grid.cv_results_["params"]) df_params["Mean_test"] = grid.cv_results_["mean_test_score"] df_params["Mean_train"] = grid.cv_results_["mean_train_score"] df_params["STD_test"] = grid.cv_results_["std_test_score"] df_params["STD_train"] = grid.cv_results_["std_train_score"] df_params['Window_Size'] = window_size df_params['PID'] = currentPid # df_params["Accuracy"] = accuracy_score(y_pred, y_test) dfs.append(df_params) return pd.concat(dfs)
In [15]:
doTrainSlideWindowNoPad(1)
doTrain: 1 > /tmp/ipykernel_90176/2602038955.py(23)createTrainTest() 21 tmp = new_data[stepper:stepper + window_size] 22 pdb.set_trace() ---> 23 x = tmp[:-1] 24 y = tmp[-1] 25 stepper += stride ipdb> tmp array([4, 1, 1, 4, 4, 7, 7, 7]) ipdb> new_data array([4, 1, 1, 4, 4, 7, 7, 7, 7, 7, 7, 4, 1, 4, 4, 4]) ipdb> current (1.0, 1.0, 0.0) ipdb> print(c) Timestamp Event TaskID Part PID \ 0 1.575388e+12 4 0.0 1.0 1.0 1 1.575388e+12 1 0.0 1.0 1.0 2 1.575388e+12 1 0.0 1.0 1.0 3 1.575388e+12 4 0.0 1.0 1.0 4 1.575388e+12 4 0.0 1.0 1.0 5 1.575388e+12 7 0.0 1.0 1.0 6 1.575388e+12 7 0.0 1.0 1.0 7 1.575388e+12 7 0.0 1.0 1.0 8 1.575388e+12 7 0.0 1.0 1.0 9 1.575388e+12 7 0.0 1.0 1.0 10 1.575388e+12 7 0.0 1.0 1.0 11 1.575388e+12 4 0.0 1.0 1.0 12 1.575388e+12 1 0.0 1.0 1.0 13 1.575388e+12 4 0.0 1.0 1.0 14 1.575388e+12 4 0.0 1.0 1.0 15 1.575388e+12 4 0.0 1.0 1.0 TextRule Rule Type 0 {'Title': ['1', 'Indent', 'and', 'Italic'], 'S... 3.0 Cmd 1 {'Title': ['1', 'Indent', 'and', 'Italic'], 'S... 3.0 Toolbar 2 {'Title': ['1', 'Indent', 'and', 'Italic'], 'S... 3.0 Cmd 3 {'Title': ['1', 'Indent', 'and', 'Italic'], 'S... 3.0 Cmd 4 {'Title': ['1', 'Indent', 'and', 'Italic'], 'S... 3.0 Cmd 5 {'Title': ['1', 'Indent', 'and', 'Italic'], 'S... 3.0 Toolbar 6 {'Title': ['1', 'Indent', 'and', 'Italic'], 'S... 3.0 Toolbar 7 {'Title': ['1', 'Indent', 'and', 'Italic'], 'S... 3.0 Toolbar 8 {'Title': ['1', 'Indent', 'and', 'Italic'], 'S... 3.0 Toolbar 9 {'Title': ['1', 'Indent', 'and', 'Italic'], 'S... 3.0 Toolbar 10 {'Title': ['1', 'Indent', 'and', 'Italic'], 'S... 3.0 Toolbar 11 {'Title': ['1', 'Indent', 'and', 'Italic'], 'S... 3.0 Cmd 12 {'Title': ['1', 'Indent', 'and', 'Italic'], 'S... 3.0 Toolbar 13 {'Title': ['1', 'Indent', 'and', 'Italic'], 'S... 3.0 Cmd 14 {'Title': ['1', 'Indent', 'and', 'Italic'], 'S... 3.0 Cmd 15 {'Title': ['1', 'Indent', 'and', 'Italic'], 'S... 3.0 Cmd ipdb> print(c.TextRule) 0 {'Title': ['1', 'Indent', 'and', 'Italic'], 'S... 1 {'Title': ['1', 'Indent', 'and', 'Italic'], 'S... 2 {'Title': ['1', 'Indent', 'and', 'Italic'], 'S... 3 {'Title': ['1', 'Indent', 'and', 'Italic'], 'S... 4 {'Title': ['1', 'Indent', 'and', 'Italic'], 'S... 5 {'Title': ['1', 'Indent', 'and', 'Italic'], 'S... 6 {'Title': ['1', 'Indent', 'and', 'Italic'], 'S... 7 {'Title': ['1', 'Indent', 'and', 'Italic'], 'S... 8 {'Title': ['1', 'Indent', 'and', 'Italic'], 'S... 9 {'Title': ['1', 'Indent', 'and', 'Italic'], 'S... 10 {'Title': ['1', 'Indent', 'and', 'Italic'], 'S... 11 {'Title': ['1', 'Indent', 'and', 'Italic'], 'S... 12 {'Title': ['1', 'Indent', 'and', 'Italic'], 'S... 13 {'Title': ['1', 'Indent', 'and', 'Italic'], 'S... 14 {'Title': ['1', 'Indent', 'and', 'Italic'], 'S... 15 {'Title': ['1', 'Indent', 'and', 'Italic'], 'S... Name: TextRule, dtype: object ipdb> print(c.Event) 0 4 1 1 2 1 3 4 4 4 5 7 6 7 7 7 8 7 9 7 10 7 11 4 12 1 13 4 14 4 15 4 Name: Event, dtype: int64 ipdb> val *** NameError: name 'val' is not defined ipdb> val_IDs [200] --KeyboardInterrupt-- KeyboardInterrupt: Interrupted by user > /tmp/ipykernel_90176/2602038955.py(22)createTrainTest() 20 while stepper <= (len(new_data)-window_size-1): 21 tmp = new_data[stepper:stepper + window_size] ---> 22 pdb.set_trace() 23 x = tmp[:-1] 24 y = tmp[-1] --KeyboardInterrupt-- KeyboardInterrupt: Interrupted by user > /tmp/ipykernel_90176/2602038955.py(23)createTrainTest() 21 tmp = new_data[stepper:stepper + window_size] 22 pdb.set_trace() ---> 23 x = tmp[:-1] 24 y = tmp[-1] 25 stepper += stride ipdb> q
--------------------------------------------------------------------------- BdbQuit Traceback (most recent call last) /tmp/ipykernel_90176/1128965594.py in <module> ----> 1 doTrainSlideWindowNoPad(1) /tmp/ipykernel_90176/2629087375.py in doTrainSlideWindowNoPad(currentPid) 3 dfs = [] 4 for window_size in range(8, 15): ----> 5 (x_train, y_train, x_test, y_test) = createTrainTest([currentPid], Task_IDs, window_size, 1, False, [200]) 6 print(f"doTrain: created TrainTestsplit") 7 /tmp/ipykernel_90176/2602038955.py in createTrainTest(test_IDs, task_IDs, window_size, stride, shapes, val_IDs) 21 tmp = new_data[stepper:stepper + window_size] 22 pdb.set_trace() ---> 23 x = tmp[:-1] 24 y = tmp[-1] 25 stepper += stride /tmp/ipykernel_90176/2602038955.py in createTrainTest(test_IDs, task_IDs, window_size, stride, shapes, val_IDs) 21 tmp = new_data[stepper:stepper + window_size] 22 pdb.set_trace() ---> 23 x = tmp[:-1] 24 y = tmp[-1] 25 stepper += stride ~/miniconda3/envs/intentPrediction/lib/python3.9/bdb.py in trace_dispatch(self, frame, event, arg) 86 return # None 87 if event == 'line': ---> 88 return self.dispatch_line(frame) 89 if event == 'call': 90 return self.dispatch_call(frame, arg) ~/miniconda3/envs/intentPrediction/lib/python3.9/bdb.py in dispatch_line(self, frame) 111 if self.stop_here(frame) or self.break_here(frame): 112 self.user_line(frame) --> 113 if self.quitting: raise BdbQuit 114 return self.trace_dispatch 115 BdbQuit: