In [1]:
%matplotlib inline
from scipy.odr import *
from scipy.stats import *
import numpy as np
import pandas as pd
import os
import time
import matplotlib.pyplot as plt
import ast
from multiprocessing import Pool, cpu_count
import scipy
from IPython import display
from matplotlib.patches import Rectangle
from sklearn.metrics import mean_squared_error
import json
import scipy.stats as st
from sklearn.metrics import r2_score
from matplotlib import cm
from mpl_toolkits.mplot3d import axes3d
import copy
from sklearn.model_selection import LeaveOneOut, LeavePOut
In [2]:
def cast_to_int(row):
    # Convert the split capacitive image string (skipping the first two and the last entry)
    # to uint8, clamping negative readings to 0; return None for rows that cannot be parsed.
    try:
        return np.array([a if float(a) >= 0 else 0 for a in row[2:-1]], dtype=np.uint8)
    except Exception:
        return None

def load_csv(file):
    # Read one participant's CSV and parse the Image column into uint8 arrays.
    temp_df = pd.read_csv(file, delimiter=";")
    temp_df.Image = temp_df.Image.str.split(',')
    temp_df.Image = temp_df.Image.apply(cast_to_int)
    return temp_df
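As a quick sanity check before parallelising, load_csv can be run on a single file; a minimal sketch (assuming the listed file exists on disk):

# Load one participant's file and inspect the parsed Image column.
sample_df = load_csv("DataStudyCollection/1_studyData.csv")
sample_df.Image.head()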
In [3]:
%%time
pool = Pool(cpu_count() - 2)
data_files = ["DataStudyCollection/%s" % file for file in os.listdir("DataStudyCollection")
              if file.endswith(".csv") and "studyData" in file]
print(data_files)
df_lst = pool.map(load_csv, data_files)
dfAll = pd.concat(df_lst)
pool.close()
['DataStudyCollection/17_studyData.csv', 'DataStudyCollection/2_studyData.csv', 'DataStudyCollection/12_studyData.csv', 'DataStudyCollection/15_studyData.csv', 'DataStudyCollection/5_studyData.csv', 'DataStudyCollection/1_studyData.csv', 'DataStudyCollection/14_studyData.csv', 'DataStudyCollection/10_studyData.csv', 'DataStudyCollection/13_studyData.csv', 'DataStudyCollection/18_studyData.csv', 'DataStudyCollection/6_studyData.csv', 'DataStudyCollection/16_studyData.csv', 'DataStudyCollection/3_studyData.csv', 'DataStudyCollection/7_studyData.csv', 'DataStudyCollection/8_studyData.csv', 'DataStudyCollection/9_studyData.csv', 'DataStudyCollection/11_studyData.csv', 'DataStudyCollection/4_studyData.csv']
CPU times: user 1.86 s, sys: 1.03 s, total: 2.89 s
Wall time: 17.3 s
In [4]:
df = dfAll[dfAll.Image.notnull()]
len(df)
Out[4]:
1010014
In [5]:
print("loaded %s values" % len(dfAll)) print("removed %s values (thats %s%%)" % (len(dfAll) - len(df), round((len(dfAll) - len(df)) / len(dfAll) * 100, 3))) print("new df has size %s" % len(df))
loaded 1013841 values
removed 3827 values (that's 0.377%)
new df has size 1010014
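The removal count can be cross-checked directly against the null mask; a small sketch, assuming dfAll is still in memory:

# Rows whose Image column failed to parse; should equal the number of removed values.
dfAll.Image.isnull().sum()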
In [6]:
df = df.reset_index(drop=True)
In [7]:
df.head()
Out[7]:
|   | userID | Timestamp | Current_Task | Task_amount | TaskID | VersionID | RepetitionID | Actual_Data | Is_Pause | Image |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 17 | 1547138602677 | 0 | 34 | 0 | 0 | 0 | False | False | [1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 2, 0, ... |
| 1 | 17 | 1547138602697 | 0 | 34 | 0 | 0 | 0 | False | False | [1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 2, 0, ... |
| 2 | 17 | 1547138602796 | 0 | 34 | 0 | 0 | 0 | False | False | [1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 2, 0, ... |
| 3 | 17 | 1547138602817 | 0 | 34 | 0 | 0 | 0 | False | False | [1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 2, 0, ... |
| 4 | 17 | 1547138602863 | 0 | 34 | 0 | 0 | 0 | False | False | [1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 2, 0, ... |
In [8]:
df.to_pickle("DataStudyCollection/AllData.pkl")
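In later notebooks the pickled frame can be reloaded without re-parsing the CSVs; a minimal sketch:

# Restore the cleaned DataFrame from the pickle written above.
df = pd.read_pickle("DataStudyCollection/AllData.pkl")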
In [9]:
sorted(df.userID.unique())
Out[9]:
[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18]