knuckletouch/python/Step_01_UserData.ipynb

7.8 KiB

In [1]:
%matplotlib inline

from scipy.odr import *
from scipy.stats import *
import numpy as np
import pandas as pd
import os
import time
import matplotlib.pyplot as plt
from multiprocessing import Pool
In [2]:
def cast_to_int(row):
    """Convert the inner fields of ``row`` to an unsigned 8-bit NumPy array.

    The first two entries and the last entry of ``row`` are skipped
    (``row[2:-1]``). Every remaining entry whose ``float()`` value is not
    ``>= 0`` is replaced by ``0``; all other entries are kept as they are and
    handed to ``np.array(..., dtype=np.uint8)``.

    Returns:
        The resulting ``np.uint8`` array, or ``None`` if anything goes wrong
        while slicing, parsing, or converting (all exceptions are swallowed).
    """
    try:
        cleaned = []
        for value in row[2:-1]:
            # Keep the original entry (not its float form) when it is non-negative.
            if float(value) >= 0:
                cleaned.append(value)
            else:
                cleaned.append(0)
        return np.array(cleaned, dtype=np.uint8)
    except Exception:
        # Best-effort helper: any failure is signalled to the caller as None.
        return None
    
def load_csv(file):
    """Read one semicolon-separated, header-less user-data CSV.

    Args:
        file: Path (or file-like object) passed straight to ``pd.read_csv``.

    Returns:
        A DataFrame whose columns are named ``UserID``, ``Age`` and ``Gender``.
    """
    column_names = ["UserID", "Age", "Gender"]
    # The files carry no header row, so the column names are supplied here.
    return pd.read_csv(file, sep=";", header=None, names=column_names)
In [3]:
%%time
pool = Pool(os.cpu_count() - 2)
data_files = ["DataStudyCollection/%s" % file for file in os.listdir("DataStudyCollection") if file.endswith(".csv") and "userData" in file]
df_lst = pool.map(load_csv, data_files)
dfAll = pd.concat(df_lst)
dfAll = dfAll.sort_values("UserID")
dfAll = dfAll.reset_index(drop=True)
pool.close()
CPU times: user 298 ms, sys: 443 ms, total: 741 ms
Wall time: 937 ms
In [4]:
# Mean of the Age column over all loaded participants.
dfAll["Age"].mean()
Out[4]:
24.166666666666668
In [5]:
# Sample standard deviation of the Age column (pandas defaults to ddof=1).
dfAll["Age"].std()
Out[5]:
1.4245742398014511
In [6]:
# Smallest value in the Age column.
dfAll["Age"].min()
Out[6]:
21
In [7]:
# Largest value in the Age column.
dfAll["Age"].max()
Out[7]:
26
In [8]:
# Display the combined participant table (UserID, Age, Gender), sorted by UserID.
dfAll
Out[8]:
UserID Age Gender
0 1 23 male
1 2 24 male
2 3 25 male
3 4 25 male
4 5 26 male
5 6 23 male
6 7 21 female
7 8 24 male
8 9 24 male
9 10 24 male
10 11 25 female
11 12 26 male
12 13 22 female
13 14 24 male
14 15 24 male
15 16 26 female
16 17 26 male
17 18 23 male