7.8 KiB
7.8 KiB
In [1]:
%matplotlib inline from scipy.odr import * from scipy.stats import * import numpy as np import pandas as pd import os import time import matplotlib.pyplot as plt from multiprocessing import Pool
In [2]:
def cast_to_int(row):
    """Build a ``uint8`` array from the inner fields of ``row``.

    The first two entries and the last entry of ``row`` are skipped
    (``row[2:-1]``). Each remaining value is kept as-is when it is ``>= 0``
    after conversion with ``float``; otherwise it is replaced by ``0``.

    Returns:
        A ``numpy`` array with dtype ``uint8``, or ``None`` if anything in the
        conversion raises (best-effort: failures are swallowed on purpose).
    """
    try:
        clamped = []
        for value in row[2:-1]:
            # Keep the ">= 0" comparison: values that fail it (negatives, NaN)
            # are replaced by 0; the original value, not the float, is kept.
            if float(value) >= 0:
                clamped.append(value)
            else:
                clamped.append(0)
        return np.array(clamped, dtype=np.uint8)
    except Exception:
        return None


def load_csv(file):
    """Read one header-less, semicolon-delimited CSV file.

    The columns are named ``UserID``, ``Age`` and ``Gender``.

    Returns:
        The parsed ``pandas.DataFrame``.
    """
    return pd.read_csv(
        file,
        delimiter=";",
        header=None,
        names=["UserID", "Age", "Gender"],
    )
In [3]:
%%time pool = Pool(os.cpu_count() - 2) data_files = ["DataStudyCollection/%s" % file for file in os.listdir("DataStudyCollection") if file.endswith(".csv") and "userData" in file] df_lst = pool.map(load_csv, data_files) dfAll = pd.concat(df_lst) dfAll = dfAll.sort_values("UserID") dfAll = dfAll.reset_index(drop=True) pool.close()
CPU times: user 298 ms, sys: 443 ms, total: 741 ms Wall time: 937 ms
In [4]:
# Mean of the Age column.
dfAll["Age"].mean()
Out[4]:
24.166666666666668
In [5]:
# Standard deviation of the Age column (pandas default: sample std, ddof=1).
dfAll["Age"].std()
Out[5]:
1.4245742398014511
In [6]:
# Smallest value in the Age column.
dfAll["Age"].min()
Out[6]:
21
In [7]:
# Largest value in the Age column.
dfAll["Age"].max()
Out[7]:
26
In [8]:
# Display the combined table (columns UserID, Age, Gender) as the cell output.
dfAll
Out[8]:
UserID | Age | Gender | |
---|---|---|---|
0 | 1 | 23 | male |
1 | 2 | 24 | male |
2 | 3 | 25 | male |
3 | 4 | 25 | male |
4 | 5 | 26 | male |
5 | 6 | 23 | male |
6 | 7 | 21 | female |
7 | 8 | 24 | male |
8 | 9 | 24 | male |
9 | 10 | 24 | male |
10 | 11 | 25 | female |
11 | 12 | 26 | male |
12 | 13 | 22 | female |
13 | 14 | 24 | male |
14 | 15 | 24 | male |
15 | 16 | 26 | female |
16 | 17 | 26 | male |
17 | 18 | 23 | male |