knuckletouch/python/Step_32_ReadData-Evaluation...

This notebook creates one dataframe from all participants' data.

It also removes the small fraction of rows (roughly 0.5%) whose data is corrupted.

In [2]:
%matplotlib inline

import ast
import copy
import json
import os
import time
from multiprocessing import Pool, cpu_count

import numpy as np
import pandas as pd
import scipy
import scipy.stats as st
from scipy.odr import *
from scipy.stats import *

import matplotlib.pyplot as plt
from matplotlib import cm
from matplotlib.patches import Rectangle
from mpl_toolkits.mplot3d import axes3d

from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import LeaveOneOut, LeavePOut

from IPython import display
In [3]:
def cast_to_int(row):
    # Parse the comma-separated capacitive image into a uint8 array,
    # clipping negative readings to 0. Rows that cannot be parsed
    # return None so they can be dropped later.
    try:
        return np.array([a if float(a) >= 0 else 0 for a in row[2:-1]], dtype=np.uint8)
    except Exception:
        return None


def load_csv(file):
    # Read one participant's CSV and convert its Image column into uint8 arrays.
    temp_df = pd.read_csv(file, delimiter=";")
    temp_df.Image = temp_df.Image.str.split(',')
    temp_df.Image = temp_df.Image.apply(cast_to_int)
    return temp_df
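A quick illustration of the two cases cast_to_int has to handle; the sample strings below are made up for illustration and are not taken from the study files:

# hypothetical Image strings, already split on ',' as load_csv does
sample_ok = "0,0,3,3,2,0,1,0".split(",")
sample_bad = "0,0,3,x,2,0,1,0".split(",")

cast_to_int(sample_ok)   # array([3, 3, 2, 0, 1], dtype=uint8); the first two and last entries are sliced off
cast_to_int(sample_bad)  # None, because float("x") raises; such rows are dropped below via Image.notnull()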
In [4]:
%%time
# Load all participants' studyData CSV files in parallel, leaving two cores free.
pool = Pool(cpu_count() - 2)
data_files = ["DataStudyEvaluation/%s" % file for file in os.listdir("DataStudyEvaluation") if file.endswith(".csv") and "studyData" in file]
print(data_files)
df_lst = pool.map(load_csv, data_files)
dfAll = pd.concat(df_lst)
pool.close()
['DataStudyEvaluation/2_studyData.csv', 'DataStudyEvaluation/12_studyData.csv', 'DataStudyEvaluation/5_studyData.csv', 'DataStudyEvaluation/1_studyData.csv', 'DataStudyEvaluation/10_studyData.csv', 'DataStudyEvaluation/6_studyData.csv', 'DataStudyEvaluation/3_studyData.csv', 'DataStudyEvaluation/7_studyData.csv', 'DataStudyEvaluation/8_studyData.csv', 'DataStudyEvaluation/9_studyData.csv', 'DataStudyEvaluation/11_studyData.csv', 'DataStudyEvaluation/4_studyData.csv']
CPU times: user 1.35 s, sys: 786 ms, total: 2.14 s
Wall time: 1min 43s
In [5]:
# Drop rows whose Image could not be parsed and the header lines that
# appear as ordinary rows in the raw CSV files.
df = dfAll[dfAll.Image.notnull()]
df = df[df.userID != "userID"]
df.userID = pd.to_numeric(df.userID)
len(df)
Out[5]:
608084
In [6]:
print("loaded %s values" % len(dfAll))
print("removed %s values (thats %s%%)" % (len(dfAll) - len(df), round((len(dfAll) - len(df)) / len(dfAll) * 100, 3)))
print("new df has size %s" % len(df))
loaded 610816 values
removed 2732 values (that's 0.447%)
new df has size 608084
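The removed rows come from two sources: rows whose Image field could not be parsed (cast_to_int returned None) and header lines that are repeated inside the raw CSV files and read in as ordinary rows. A quick sketch to count each, assuming dfAll is still in memory:

# rows where the capacitive image failed to parse
print(dfAll.Image.isnull().sum())
# repeated header lines inside the raw CSV files
print((dfAll.userID == "userID").sum())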
In [7]:
df = df.reset_index(drop=True)
In [8]:
df.head()
Out[8]:
userID Timestamp Current_Task Task_amount TaskID VersionID RepetitionID Actual_Data Is_Pause Image
0 2 1553593631562 0 34 0 0 0 false false [3, 3, 3, 2, 0, 0, 1, 0, 0, 0, 1, 2, 1, 0, 0, ...
1 2 1553593631595 0 34 0 0 0 false false [3, 3, 3, 2, 0, 0, 1, 0, 0, 0, 1, 222, 0, 0, 0...
2 2 1553593631634 0 34 0 0 0 false false [3, 3, 3, 2, 0, 0, 1, 0, 0, 0, 1, 222, 0, 0, 0...
3 2 1553593631676 0 34 0 0 0 false false [3, 3, 3, 2, 0, 0, 1, 0, 0, 0, 1, 222, 0, 0, 0...
4 2 1553593631716 0 34 0 0 0 false false [3, 3, 3, 2, 0, 0, 1, 0, 0, 0, 1, 222, 0, 0, 0...
In [11]:
df.userID.unique()
Out[11]:
array([ 2, 12,  5,  1, 10,  6,  3,  7,  8,  9, 11,  4])
In [12]:
# Cast the remaining string columns to numeric dtypes and turn the
# "true"/"false" strings into proper booleans.
df.userID = pd.to_numeric(df.userID)
df.TaskID = pd.to_numeric(df.TaskID)
df.VersionID = pd.to_numeric(df.VersionID)
df.Timestamp = pd.to_numeric(df.Timestamp)
df.Current_Task = pd.to_numeric(df.Current_Task)
df.Task_amount = pd.to_numeric(df.Task_amount)
df.RepetitionID = pd.to_numeric(df.RepetitionID)
df.loc[df.Actual_Data == "false", "Actual_Data"] = False
df.loc[df.Actual_Data == "true", "Actual_Data"] = True
df.loc[df.Is_Pause == "false", "Is_Pause"] = False
df.loc[df.Is_Pause == "true", "Is_Pause"] = True
In [18]:
df.to_pickle("DataStudyEvaluation/AllData.pkl")
In [ ]: