knuckletouch/python/Step_02_ReadData.ipynb

9.8 KiB

This notebook creates one dataframe from all participants' data

It also removes about 0.4% of the data, as those rows are corrupted

In [1]:
%matplotlib inline

from scipy.odr import *
from scipy.stats import *
import numpy as np
import pandas as pd
import os
import time
import matplotlib.pyplot as plt
import ast
from multiprocessing import Pool, cpu_count

import scipy

from IPython import display
from matplotlib.patches import Rectangle

from sklearn.metrics import mean_squared_error
import json

import scipy.stats as st
from sklearn.metrics import r2_score


from matplotlib import cm
from mpl_toolkits.mplot3d import axes3d
import matplotlib.pyplot as plt

import copy

from sklearn.model_selection import LeaveOneOut, LeavePOut

from multiprocessing import Pool
In [2]:
def cast_to_int(row):
    """Decode one raw Image row (list of strings) into a uint8 array.

    The first two and the last entry of the row are metadata and are
    skipped. Negative sensor readings are clamped to 0. If any value
    fails to parse, the whole row is treated as corrupted and None is
    returned so it can be filtered out downstream.
    """
    try:
        cleaned = []
        for cell in row[2:-1]:
            # keep the raw string when non-negative, otherwise clamp to 0
            cleaned.append(cell if float(cell) >= 0 else 0)
        return np.array(cleaned, dtype=np.uint8)
    except Exception:
        # corrupted row — signal with None instead of raising
        return None
    
def load_csv(file):
    """Read one participant's study CSV (';'-separated) and decode its
    Image column from a comma-joined string into a uint8 array
    (None for corrupted rows, via cast_to_int)."""
    frame = pd.read_csv(file, delimiter=";")
    frame.Image = frame.Image.str.split(",").apply(cast_to_int)
    return frame
In [3]:
%%time
pool = Pool(cpu_count() - 2)
data_files = ["DataStudyCollection/%s" % file for file in os.listdir("DataStudyCollection") if file.endswith(".csv") and "studyData" in file]
print(data_files)
df_lst = pool.map(load_csv, data_files)
dfAll = pd.concat(df_lst)
pool.close()
['DataStudyCollection/17_studyData.csv', 'DataStudyCollection/2_studyData.csv', 'DataStudyCollection/12_studyData.csv', 'DataStudyCollection/15_studyData.csv', 'DataStudyCollection/5_studyData.csv', 'DataStudyCollection/1_studyData.csv', 'DataStudyCollection/14_studyData.csv', 'DataStudyCollection/10_studyData.csv', 'DataStudyCollection/13_studyData.csv', 'DataStudyCollection/18_studyData.csv', 'DataStudyCollection/6_studyData.csv', 'DataStudyCollection/16_studyData.csv', 'DataStudyCollection/3_studyData.csv', 'DataStudyCollection/7_studyData.csv', 'DataStudyCollection/8_studyData.csv', 'DataStudyCollection/9_studyData.csv', 'DataStudyCollection/11_studyData.csv', 'DataStudyCollection/4_studyData.csv']
CPU times: user 1.86 s, sys: 1.03 s, total: 2.89 s
Wall time: 17.3 s
In [4]:
# Keep only rows whose Image parsed successfully (cast_to_int returned
# an array rather than None).
df = dfAll[dfAll.Image.notna()]
len(df)
Out[4]:
1010014
In [5]:
# Report how much data the corrupted-row filter discarded.
n_total = len(dfAll)
n_kept = len(df)
n_removed = n_total - n_kept
print("loaded %s values" % n_total)
print("removed %s values (thats %s%%)" % (n_removed, round(n_removed / n_total * 100, 3)))
print("new df has size %s" % n_kept)
loaded 1013841 values
removed 3827 values (thats 0.377%)
new df has size 1010014
In [6]:
# Re-number rows 0..n-1; the corrupted-row filter left gaps in the index.
df = df.reset_index(drop=True)
In [7]:
# Sanity check: peek at the first rows of the combined frame.
df.head()
Out[7]:
userID Timestamp Current_Task Task_amount TaskID VersionID RepetitionID Actual_Data Is_Pause Image
0 17 1547138602677 0 34 0 0 0 False False [1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 2, 0, ...
1 17 1547138602697 0 34 0 0 0 False False [1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 2, 0, ...
2 17 1547138602796 0 34 0 0 0 False False [1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 2, 0, ...
3 17 1547138602817 0 34 0 0 0 False False [1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 2, 0, ...
4 17 1547138602863 0 34 0 0 0 False False [1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 2, 0, ...
In [8]:
# Persist the cleaned, combined dataframe for the next notebook step.
df.to_pickle("DataStudyCollection/AllData.pkl")
In [9]:
# Verify every participant is present (expected: userIDs 1..18).
sorted(df.userID.unique())
Out[9]:
[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18]