knuckletouch/python/Step_32_ReadData-Evaluation...

This notebook creates one dataframe from all participants' data.

It also removes the small fraction of rows (roughly 0.5%) whose data is corrupted.

In [2]:
%matplotlib inline

import ast
import copy
import json
import os
import time
from multiprocessing import Pool, cpu_count

import numpy as np
import pandas as pd
import scipy
import scipy.stats as st
from scipy.odr import *
from scipy.stats import *

import matplotlib.pyplot as plt
from matplotlib import cm
from matplotlib.patches import Rectangle
from mpl_toolkits.mplot3d import axes3d

from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import LeaveOneOut, LeavePOut

from IPython import display
In [3]:
def cast_to_int(row):
    # Parse the comma-separated capacitive image into a uint8 array,
    # clipping negative readings to 0. Rows that cannot be parsed
    # return None so they can be dropped later.
    try:
        return np.array([a if float(a) >= 0 else 0 for a in row[2:-1]], dtype=np.uint8)
    except Exception:
        return None


def load_csv(file):
    # Read one participant's CSV and convert its Image column into uint8 arrays.
    temp_df = pd.read_csv(file, delimiter=";")
    temp_df.Image = temp_df.Image.str.split(',')
    temp_df.Image = temp_df.Image.apply(cast_to_int)
    return temp_df
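A quick illustration of the two cases cast_to_int has to handle; the sample strings below are made up for illustration and are not taken from the study files:

# hypothetical Image strings, already split on ',' as load_csv does
sample_ok = "0,0,3,3,2,0,1,0".split(",")
sample_bad = "0,0,3,x,2,0,1,0".split(",")

cast_to_int(sample_ok)   # array([3, 3, 2, 0, 1], dtype=uint8); the first two and last entries are sliced off
cast_to_int(sample_bad)  # None, because float("x") raises; such rows are dropped below via Image.notnull()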
In [4]:
%%time
# Load all participants' studyData CSV files in parallel, leaving two cores free.
pool = Pool(cpu_count() - 2)
data_files = ["DataStudyEvaluation/%s" % file for file in os.listdir("DataStudyEvaluation") if file.endswith(".csv") and "studyData" in file]
print(data_files)
df_lst = pool.map(load_csv, data_files)
dfAll = pd.concat(df_lst)
pool.close()
['DataStudyEvaluation/2_studyData.csv', 'DataStudyEvaluation/12_studyData.csv', 'DataStudyEvaluation/5_studyData.csv', 'DataStudyEvaluation/1_studyData.csv', 'DataStudyEvaluation/10_studyData.csv', 'DataStudyEvaluation/6_studyData.csv', 'DataStudyEvaluation/3_studyData.csv', 'DataStudyEvaluation/7_studyData.csv', 'DataStudyEvaluation/8_studyData.csv', 'DataStudyEvaluation/9_studyData.csv', 'DataStudyEvaluation/11_studyData.csv', 'DataStudyEvaluation/4_studyData.csv']
CPU times: user 1.35 s, sys: 786 ms, total: 2.14 s
Wall time: 1min 43s
In [5]:
# Drop rows whose Image could not be parsed and the header lines that
# appear as ordinary rows in the raw CSV files.
df = dfAll[dfAll.Image.notnull()]
df = df[df.userID != "userID"]
df.userID = pd.to_numeric(df.userID)
len(df)
Out[5]:
608084
In [6]:
print("loaded %s values" % len(dfAll))
print("removed %s values (thats %s%%)" % (len(dfAll) - len(df), round((len(dfAll) - len(df)) / len(dfAll) * 100, 3)))
print("new df has size %s" % len(df))
loaded 610816 values
removed 2732 values (that's 0.447%)
new df has size 608084
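The removed rows come from two sources: rows whose Image field could not be parsed (cast_to_int returned None) and header lines that are repeated inside the raw CSV files and read in as ordinary rows. A quick sketch to count each, assuming dfAll is still in memory:

# rows where the capacitive image failed to parse
print(dfAll.Image.isnull().sum())
# repeated header lines inside the raw CSV files
print((dfAll.userID == "userID").sum())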
In [7]:
df = df.reset_index(drop=True)
In [8]:
df.head()
Out[8]:
userID Timestamp Current_Task Task_amount TaskID VersionID RepetitionID Actual_Data Is_Pause Image
0 2 1553593631562 0 34 0 0 0 false false [3, 3, 3, 2, 0, 0, 1, 0, 0, 0, 1, 2, 1, 0, 0, ...
1 2 1553593631595 0 34 0 0 0 false false [3, 3, 3, 2, 0, 0, 1, 0, 0, 0, 1, 222, 0, 0, 0...
2 2 1553593631634 0 34 0 0 0 false false [3, 3, 3, 2, 0, 0, 1, 0, 0, 0, 1, 222, 0, 0, 0...
3 2 1553593631676 0 34 0 0 0 false false [3, 3, 3, 2, 0, 0, 1, 0, 0, 0, 1, 222, 0, 0, 0...
4 2 1553593631716 0 34 0 0 0 false false [3, 3, 3, 2, 0, 0, 1, 0, 0, 0, 1, 222, 0, 0, 0...
In [11]:
df.userID.unique()
Out[11]:
array([ 2, 12,  5,  1, 10,  6,  3,  7,  8,  9, 11,  4])
In [12]:
# Cast the remaining string columns to numeric dtypes and turn the
# "true"/"false" strings into proper booleans.
df.userID = pd.to_numeric(df.userID)
df.TaskID = pd.to_numeric(df.TaskID)
df.VersionID = pd.to_numeric(df.VersionID)
df.Timestamp = pd.to_numeric(df.Timestamp)
df.Current_Task = pd.to_numeric(df.Current_Task)
df.Task_amount = pd.to_numeric(df.Task_amount)
df.RepetitionID = pd.to_numeric(df.RepetitionID)
df.loc[df.Actual_Data == "false", "Actual_Data"] = False
df.loc[df.Actual_Data == "true", "Actual_Data"] = True
df.loc[df.Is_Pause == "false", "Is_Pause"] = False
df.loc[df.Is_Pause == "true", "Is_Pause"] = True
In [18]:
df.to_pickle("DataStudyEvaluation/AllData.pkl")
In [ ]: