{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "%matplotlib inline\n",
    "\n",
    "from scipy.odr import *\n",
    "from scipy.stats import *\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import os\n",
    "import time\n",
    "import matplotlib.pyplot as plt\n",
    "from multiprocessing import Pool"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "DATA_DIR = \"DataStudyCollection\"  # directory scanned for the userData CSV files\n",
    "\n",
    "\n",
    "def cast_to_int(row):\n",
    "    \"\"\"Convert the middle entries of ``row`` to a ``uint8`` array.\n",
    "\n",
    "    The first two entries and the last entry of ``row`` are skipped.\n",
    "    Entries whose ``float()`` value is negative are replaced by 0.\n",
    "\n",
    "    Returns ``None`` instead of raising when the conversion fails, e.g.\n",
    "    because an entry cannot be parsed as a number.\n",
    "    \"\"\"\n",
    "    try:\n",
    "        return np.array([a if float(a) >= 0 else 0 for a in row[2:-1]], dtype=np.uint8)\n",
    "    except (TypeError, ValueError, OverflowError):\n",
    "        # Best effort: a malformed row is reported as None rather than aborting the run.\n",
    "        return None\n",
    "\n",
    "\n",
    "def load_csv(file):\n",
    "    \"\"\"Read one header-less, semicolon-separated CSV into a DataFrame.\n",
    "\n",
    "    The resulting columns are named UserID, Age and Gender.\n",
    "    \"\"\"\n",
    "    return pd.read_csv(file, header=None, names=[\"UserID\", \"Age\", \"Gender\"], delimiter=\";\")\n",
    "\n",
    "\n",
    "def worker_count(reserved=2):\n",
    "    \"\"\"Return the pool size: CPU count minus ``reserved``, but never below 1.\n",
    "\n",
    "    ``os.cpu_count()`` may return None and small machines may have no more\n",
    "    than ``reserved`` cores; both cases would make ``Pool(...)`` raise.\n",
    "    \"\"\"\n",
    "    return max(1, (os.cpu_count() or 1) - reserved)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 298 ms, sys: 443 ms, total: 741 ms\n",
      "Wall time: 937 ms\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "# Sorted so the load order does not depend on the directory listing order.\n",
    "data_files = sorted(\n",
    "    os.path.join(DATA_DIR, file)\n",
    "    for file in os.listdir(DATA_DIR)\n",
    "    if file.endswith(\".csv\") and \"userData\" in file\n",
    ")\n",
    "if not data_files:\n",
    "    # Fail with a readable message instead of pd.concat's generic error on an empty list.\n",
    "    raise FileNotFoundError(\"No userData CSV files found in %r\" % DATA_DIR)\n",
    "\n",
    "# The with-block shuts the workers down even when a file fails to load.\n",
    "with Pool(worker_count()) as pool:\n",
    "    df_lst = pool.map(load_csv, data_files)\n",
    "\n",
    "dfAll = pd.concat(df_lst).sort_values(\"UserID\").reset_index(drop=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "24.166666666666668"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dfAll.Age.mean()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1.4245742398014511"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dfAll.Age.std()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "21"
      ]
     },
     "execution_count": 6,
"metadata": {}, "output_type": "execute_result" } ], "source": [ "dfAll.Age.min()" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "26" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dfAll.Age.max()" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", " | UserID | \n", "Age | \n", "Gender | \n", "
---|---|---|---|
0 | \n", "1 | \n", "23 | \n", "male | \n", "
1 | \n", "2 | \n", "24 | \n", "male | \n", "
2 | \n", "3 | \n", "25 | \n", "male | \n", "
3 | \n", "4 | \n", "25 | \n", "male | \n", "
4 | \n", "5 | \n", "26 | \n", "male | \n", "
5 | \n", "6 | \n", "23 | \n", "male | \n", "
6 | \n", "7 | \n", "21 | \n", "female | \n", "
7 | \n", "8 | \n", "24 | \n", "male | \n", "
8 | \n", "9 | \n", "24 | \n", "male | \n", "
9 | \n", "10 | \n", "24 | \n", "male | \n", "
10 | \n", "11 | \n", "25 | \n", "female | \n", "
11 | \n", "12 | \n", "26 | \n", "male | \n", "
12 | \n", "13 | \n", "22 | \n", "female | \n", "
13 | \n", "14 | \n", "24 | \n", "male | \n", "
14 | \n", "15 | \n", "24 | \n", "male | \n", "
15 | \n", "16 | \n", "26 | \n", "female | \n", "
16 | \n", "17 | \n", "26 | \n", "male | \n", "
17 | \n", "18 | \n", "23 | \n", "male | \n", "