knuckletouch/python/Step_36_LSTM_ReadData.ipynb

8.0 KiB

Filtering the data for the LSTM: removes all rows of gesture attempts that were discarded with the revert button because the participant performed a wrong gesture.

In [1]:
%matplotlib inline

import numpy as np
import pandas as pd
from multiprocessing import Pool, cpu_count
In [2]:
# Load the complete study recording, then keep only rows flagged as actual
# data that were not recorded during a pause.
dfAll = pd.read_pickle("DataStudyEvaluation/AllData.pkl")
is_actual_row = dfAll["Actual_Data"] == True
is_not_paused = dfAll["Is_Pause"] == False
df_actual = dfAll[is_actual_row & is_not_paused]
df_actual.head()
Out[2]:
userID Timestamp Current_Task Task_amount TaskID VersionID RepetitionID Actual_Data Is_Pause Image
8351 2 1553594010364 1 510 28 2 0 True False [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ...
8352 2 1553594010414 1 510 28 2 0 True False [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ...
8353 2 1553594010445 1 510 28 2 0 True False [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ...
8354 2 1553594010485 1 510 28 2 0 True False [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ...
8355 2 1553594010525 1 510 28 2 0 True False [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ...
In [3]:
# Number of distinct participants left after the row filter.
len(df_actual["userID"].unique())
Out[3]:
12
In [4]:
# Row counts before and after dropping non-actual / pause rows.
n_rows_all, n_rows_actual = len(dfAll), len(df_actual)
print("all: %s, actual data: %s" % (n_rows_all, n_rows_actual))
all: 608084, actual data: 495142
In [5]:
%%time
# filter out all gestures where the revert button was pressed during the study and the gesture was repeated
def is_max(df):
    """Flag the rows of ``df`` that carry the highest RepetitionID found in ``df``.

    The input frame is not modified; all work happens on a deep copy.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``RepetitionID`` column.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with two extra columns: ``IsMax`` (True where the row's
        RepetitionID equals the maximum of the frame) and ``MaxRepetition``
        (that maximum, repeated on every row).
    """
    flagged = df.copy(deep=True)
    highest_repetition = flagged["RepetitionID"].max()
    has_highest = flagged["RepetitionID"] == highest_repetition
    flagged["IsMax"] = np.where(has_highest, True, False)
    # One entry per row so the column is filled with the same value throughout.
    flagged["MaxRepetition"] = [highest_repetition for _ in range(len(flagged))]
    return flagged

# Keep, for every (userID, TaskID, VersionID) combination, only the rows whose
# RepetitionID is the highest one recorded for that combination.
# is_max deep-copies each group before adding columns, so df_actual is never
# mutated and no up-front copy of the whole frame is required.
df_grp = df_actual.groupby(["userID", "TaskID", "VersionID"])

# cpu_count() - 1 evaluates to 0 on a single-core machine and Pool(0) raises
# ValueError, so always request at least one worker.
n_workers = max(1, cpu_count() - 1)

# The context manager shuts the worker processes down even when is_max or the
# pickling of a group raises, so no worker processes are leaked on failure.
# NOTE(review): pool.map of a notebook-defined function relies on the "fork"
# start method; confirm the platform if this is run outside Linux.
with Pool(n_workers) as pool:
    result_lst = pool.map(is_max, [grp for _, grp in df_grp])

df_filtered = pd.concat(result_lst)
# IsMax is a boolean column produced by is_max, so it can be used as a mask directly.
df_filtered = df_filtered[df_filtered.IsMax]
CPU times: user 23.3 s, sys: 3.08 s, total: 26.3 s
Wall time: 26 s
In [6]:
# Persist the filtered frame as a pickle file.
lstm_pickle_path = "DataStudyEvaluation/df_lstm.pkl"
df_filtered.to_pickle(lstm_pickle_path)
In [7]:
# Row counts before and after dropping the superseded repetitions.
n_rows_actual, n_rows_filtered = len(df_actual), len(df_filtered)
print("actual: %s, filtered data: %s" % (n_rows_actual, n_rows_filtered))
actual: 495142, filtered data: 457271