10 KiB
10 KiB
In [2]:
%matplotlib inline from scipy.odr import * from scipy.stats import * import numpy as np import pandas as pd import os import time import matplotlib.pyplot as plt import ast from multiprocessing import Pool, cpu_count import scipy from IPython import display from matplotlib.patches import Rectangle from sklearn.metrics import mean_squared_error import json import scipy.stats as st from sklearn.metrics import r2_score from matplotlib import cm from mpl_toolkits.mplot3d import axes3d import matplotlib.pyplot as plt import copy from sklearn.model_selection import LeaveOneOut, LeavePOut from multiprocessing import Pool
In [3]:
def cast_to_int(row):
    """Turn one split ``Image`` entry into a ``uint8`` NumPy array.

    Only ``row[2:-1]`` is used, i.e. the first two items and the last item
    of ``row`` are skipped. An item is kept unchanged when its float value
    is ``>= 0``; otherwise it is replaced by ``0``.

    Returns ``None`` if any step raises (for example when ``row`` cannot be
    sliced or an item cannot be parsed as a number).
    """
    try:
        pixels = []
        for value in row[2:-1]:
            if float(value) >= 0:
                pixels.append(value)
            else:
                pixels.append(0)
        return np.array(pixels, dtype=np.uint8)
    except Exception:
        # Best-effort parsing: undecodable rows are flagged with None.
        return None


def load_csv(file):
    """Read one ``;``-delimited CSV file and decode its ``Image`` column.

    Each ``Image`` cell is split on ``,`` and passed through
    ``cast_to_int``, so the column ends up holding ``uint8`` arrays, or
    ``None`` for cells that could not be decoded.

    Returns the resulting DataFrame.
    """
    frame = pd.read_csv(file, delimiter=";")
    frame.Image = frame.Image.str.split(',').apply(cast_to_int)
    return frame
In [4]:
%%time pool = Pool(cpu_count() - 2) data_files = ["DataStudyEvaluation/%s" % file for file in os.listdir("DataStudyEvaluation") if file.endswith(".csv") and "studyData" in file] print(data_files) df_lst = pool.map(load_csv, data_files) dfAll = pd.concat(df_lst) pool.close()
['DataStudyEvaluation/2_studyData.csv', 'DataStudyEvaluation/12_studyData.csv', 'DataStudyEvaluation/5_studyData.csv', 'DataStudyEvaluation/1_studyData.csv', 'DataStudyEvaluation/10_studyData.csv', 'DataStudyEvaluation/6_studyData.csv', 'DataStudyEvaluation/3_studyData.csv', 'DataStudyEvaluation/7_studyData.csv', 'DataStudyEvaluation/8_studyData.csv', 'DataStudyEvaluation/9_studyData.csv', 'DataStudyEvaluation/11_studyData.csv', 'DataStudyEvaluation/4_studyData.csv'] CPU times: user 1.35 s, sys: 786 ms, total: 2.14 s Wall time: 1min 43s
In [5]:
# Keep only the rows whose Image value is not null.
df = dfAll[dfAll.Image.notnull()]

# Drop rows whose userID is the literal string "userID"
# (presumably repeated header lines inside the CSV files - TODO confirm).
# The explicit copy makes df an independent frame, so the column assignment
# below does not raise SettingWithCopyWarning on a slice of dfAll.
df = df[df.userID != "userID"].copy()

df.userID = pd.to_numeric(df.userID)
len(df)
Out[5]:
608084
In [6]:
# Summarise how many rows were dropped when df was derived from dfAll.
n_loaded = len(dfAll)
n_kept = len(df)
print("loaded %s values" % n_loaded)

n_removed = n_loaded - n_kept
pct_removed = round(n_removed / n_loaded * 100, 3)
print("removed %s values (thats %s%%)" % (n_removed, pct_removed))

print("new df has size %s" % n_kept)
loaded 610816 values removed 2732 values (thats 0.447%) new df has size 608084
In [7]:
# Replace the current index with a fresh 0..n-1 range; drop=True discards the
# old index instead of keeping it as an extra column.
df = df.reset_index(drop=True)
In [8]:
# Preview the first five rows of the cleaned frame.
df.head()
Out[8]:
userID | Timestamp | Current_Task | Task_amount | TaskID | VersionID | RepetitionID | Actual_Data | Is_Pause | Image | |
---|---|---|---|---|---|---|---|---|---|---|
0 | 2 | 1553593631562 | 0 | 34 | 0 | 0 | 0 | false | false | [3, 3, 3, 2, 0, 0, 1, 0, 0, 0, 1, 2, 1, 0, 0, ... |
1 | 2 | 1553593631595 | 0 | 34 | 0 | 0 | 0 | false | false | [3, 3, 3, 2, 0, 0, 1, 0, 0, 0, 1, 222, 0, 0, 0... |
2 | 2 | 1553593631634 | 0 | 34 | 0 | 0 | 0 | false | false | [3, 3, 3, 2, 0, 0, 1, 0, 0, 0, 1, 222, 0, 0, 0... |
3 | 2 | 1553593631676 | 0 | 34 | 0 | 0 | 0 | false | false | [3, 3, 3, 2, 0, 0, 1, 0, 0, 0, 1, 222, 0, 0, 0... |
4 | 2 | 1553593631716 | 0 | 34 | 0 | 0 | 0 | false | false | [3, 3, 3, 2, 0, 0, 1, 0, 0, 0, 1, 222, 0, 0, 0... |
In [11]:
# List the distinct userID values present in the frame.
df.userID.unique()
Out[11]:
array([ 2, 12, 5, 1, 10, 6, 3, 7, 8, 9, 11, 4])
In [12]:
# Coerce the ID / counter / timestamp columns to numeric dtypes.
numeric_columns = [
    "userID",
    "TaskID",
    "VersionID",
    "Timestamp",
    "Current_Task",
    "Task_amount",
    "RepetitionID",
]
for column in numeric_columns:
    df[column] = pd.to_numeric(df[column])

# Replace the textual flags "false" / "true" with real Python booleans.
for column in ("Actual_Data", "Is_Pause"):
    for text, flag in (("false", False), ("true", True)):
        df.loc[df[column] == text, column] = flag
In [18]:
# Write the cleaned frame to disk as a pickle file.
df.to_pickle("DataStudyEvaluation/AllData.pkl")
In [ ]: