knuckletouch/python/Step_06_CNN_Baseline.ipynb

{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"## USE for Multi GPU Systems\n",
"#import os\n",
"#os.environ[\"CUDA_VISIBLE_DEVICES\"]=\"0\"\n",
"\n",
"%matplotlib inline\n",
"\n",
"from scipy.odr import *\n",
"from scipy.stats import *\n",
"import numpy as np\n",
"import pandas as pd\n",
"import os\n",
"import time\n",
"import matplotlib.pyplot as plt\n",
"import ast\n",
"from multiprocessing import Pool\n",
"\n",
"import scipy\n",
"\n",
"from IPython import display\n",
"from matplotlib.patches import Rectangle\n",
"\n",
"from sklearn.metrics import mean_squared_error\n",
"import json\n",
"\n",
"import scipy.stats as st\n",
"from sklearn.metrics import r2_score\n",
"\n",
"\n",
"from matplotlib import cm\n",
"from mpl_toolkits.mplot3d import axes3d\n",
"import matplotlib.pyplot as plt\n",
"from matplotlib.patches import Ellipse\n",
"\n",
"import copy\n",
"\n",
"from sklearn.model_selection import LeaveOneOut, LeavePOut\n",
"\n",
"from multiprocessing import Pool\n",
"import cv2\n",
"\n",
"import sklearn\n",
"import random\n",
"from sklearn import neighbors\n",
"from sklearn import svm\n",
"from sklearn import tree\n",
"from sklearn import ensemble\n",
"from sklearn.model_selection import GridSearchCV\n",
"from sklearn.metrics import classification_report\n",
"\n",
"import numpy as np\n",
"import matplotlib.pyplot as plt\n",
"import pandas as pd\n",
"import math\n",
"\n",
"# Importing matplotlib to plot images.\n",
"import matplotlib.pyplot as plt\n",
"import numpy as np\n",
"%matplotlib inline\n",
"\n",
"# Importing SK-learn to calculate precision and recall\n",
"import sklearn\n",
"from sklearn import metrics\n",
"from sklearn.model_selection import train_test_split, cross_val_score, LeaveOneGroupOut\n",
"from sklearn.utils import shuffle\n",
"from sklearn.model_selection import GridSearchCV\n",
"from sklearn.metrics.pairwise import euclidean_distances\n",
"from sklearn.metrics import confusion_matrix\n",
"from sklearn.metrics import accuracy_score\n",
"\n",
"import pickle as pkl\n",
"import h5py\n",
"\n",
"from pathlib import Path\n",
"import os.path\n",
"import sys\n",
"import datetime\n",
"import time\n",
"\n",
"import skimage\n",
"\n",
"target_names = [\"Knuckle\", \"Finger\"]"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"from skimage import measure\n",
"from skimage.measure import find_contours, approximate_polygon, \\\n",
" subdivide_polygon, EllipseModel, LineModelND"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"def getEllipseParams(img):\n",
" points = np.argwhere(img > 40)\n",
" \n",
" contours = skimage.measure.find_contours(img, 40)\n",
" points_to_approx = []\n",
" highest_val = 0\n",
" for n, contour in enumerate(contours):\n",
" if (len(contour) > highest_val):\n",
" points_to_approx = contour\n",
" highest_val = len(contour) \n",
" \n",
" try:\n",
" contour = np.fliplr(points_to_approx)\n",
" except Exception as inst:\n",
" return [-1, -1, -1, -1, -1]\n",
" \n",
"\n",
" ellipse = skimage.measure.fit.EllipseModel()\n",
" ellipse.estimate(contour)\n",
" try:\n",
" xc, yc, a, b, theta = ellipse.params \n",
" except Exception as int:\n",
" return [-1, -1, -1, -1, -1]\n",
" \n",
" return [xc, yc, a, b, theta]"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[ 1 2 9 6 4 14 17 16 12 3 10 18 5] [13 8 11 15 7]\n",
"13 : 5\n",
"0.7222222222222222 : 0.2777777777777778\n"
]
}
],
"source": [
"# the data, split between train and test sets\n",
"df = pd.read_pickle(\"DataStudyCollection/df_statistics.pkl\")\n",
"\n",
"lst = df.userID.unique()\n",
"np.random.seed(42)\n",
"np.random.shuffle(lst)\n",
"test_ids = lst[-5:]\n",
"train_ids = lst[:-5]\n",
"\n",
"df[\"Set\"] = \"Test\"\n",
"df.loc[df.userID.isin(train_ids), \"Set\"] = \"Train\"\n",
"print(train_ids, test_ids)\n",
"print(len(train_ids), \":\", len(test_ids))\n",
"print(len(train_ids) / len(lst), \":\", len(test_ids)/ len(lst))\n",
"\n",
"#df_train = df[df.userID.isin(train_ids)]\n",
"#df_test = df[df.userID.isin(test_ids) & (df.Version == \"Normal\")]\n"
]
},
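{
"cell_type": "markdown",
"metadata": {},
"source": [
"A minimal sanity check (sketch): the split above is made per participant, so no user should contribute samples to both the train and the test set."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sketch: verify that the user-level split is disjoint.\n",
"assert set(train_ids).isdisjoint(set(test_ids))\n",
"assert df.loc[df.Set == \"Train\", \"userID\"].isin(test_ids).sum() == 0"
]
},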
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<matplotlib.patches.Ellipse at 0x7ff60430b668>"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAJ4AAAD8CAYAAACGuR0qAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvOIA7rQAADetJREFUeJzt3XuQVPWZxvHvO8AwMAy3gCNBRbCQDV4yuoSoYArvqNkga0K8ZXHXCmrFXbWyu2VlqzS7+YdaNWx243pJQImV6LoaSjZFokhijHeZaBBULuE+AgPKZYIyzDDv/tFnKsNlunu6e/rt6Xk+VVPdfW79UvVwus85/TuvuTsixVYRXYD0TgqehFDwJISCJyEUPAmh4EkIBU9CKHgSQsGTEH2L+WaV1t+rqO58AbO061ufDP9PDrWlnZ3xGo2u4uStid273H1kpuXyCp6ZTQd+APQBfuzuc9MtX0U1X6y4uPPtVVamfb+KmkFp5/uf9qefnyFY3tycdr5k9oI/vSmb5XL+qDWzPsADwOXAROBaM5uY6/akd8nnO95kYJ27r3f3g8CTwIzClCXlLp/gjQa2dHi9NZl2GDObY2bLzWx5C/ook5RuP6p190fcfZK7T+pH/+5+O+kh8gleA3Bih9cnJNNEMsoneG8B481srJlVAtcAiwtTlpS7nE+nuHurmd0GPEfqdMoCd1+VdiWztKdMKgYOTL96htMp1jf9P6d1R2Pa+VI8eZ3Hc/clwJIC1SK9iC6ZSQgFT0IoeBKiqD8SKKR+lX353NljOHnCKCoqjLY2Z/PKzbxfv4HmT1uiy5MMelzwaoYO5Ou3XsSV151L1cCjj5BbDrbyxtKVPPEfv2T9Kp1WLFU9KnjnXnIa3773GqprBgDwx/caWP2HzTR/2kJlVT8mnHEC404bzdQr65h6ZR2/fuZNHvjOU3zSdCC4cjlSaQVv5PBOZ13x9S/yrXtm0Keigpc2bOTe3/2O9xp3HrbMwHpj5OBqbpz2l8w67/NcePVkTvnSeG6b/ywbd+7msw/uTfv2bQcU0GLpEQcXZ08Zzz/860z6VFQw7+VX+Ntnfn5U6Nrt3Lefexe/xNX3Pc77DY2MGTmM+bd+lVHDaopctaRT8sEbMqyaf5w7C4B5r7zKD19/I6v1Nu/aw+wf/g9vrtvCcUMG8dA3Z1IzLM2vn6WoSj541956IcNH1rDirfX8d5aha/fpwVbuePT/WLttF+NqP8Mt/35DN1UpXVXSwauuqWL6174AwIPfW0xbDmMimg40c/ujiznQ0srF10+lbpp+JF0KSjp4508/g6oBlbz96lo2rNme83a2fLSXh5e+DqC9Xoko6eCdOXkcAK++kP5HL9l47MV6Ptq+h7Gnn8iZ5/9F3tuT/JR08M6YNBaAFW9tyHtbrYfaWDL/1wB85ZZL8t6e5Keo5/EMsDRjZ33An69EVPbvy8hRQzl4sJVNDXvwAZWsveHBtNuvbz6Ydv4Pvnoj3/iXv2by5XX0H11LS3PrYfPbtupKR7GU7B6vZnDq6kTT3k8yjofN1kc79vLHVQ30r6rkc2efXJBtSm5KNniDaqoA2N9U2JFpq95aD8CEujEF3a50TckG75NPUh+bAwcVdmTahxtTVzxGjBpa0O1K15Rs8PZ8nLodxdACX23Ys7Mptd0R6cdvSPcq2eC1HGxl7+799O3Xh5HHDynYdv+071MABiXfISVGyQYPYM17HwJwWt1JBdtmdRK4/fqpVKiSDt47b6YOBL4wZXzBtlkzNDWEsmnPJwXbpnRdUc/jOelvFVbRcPi419eefIVv3nkZUy+cyEN3/oSz/+3WtNs/OCT9/fVOOrCasafWArBt/XZcv78LU9J7vA83NFL/m1VUDazk0uvOK8g2zzg3tfdc+dragmxPclPSwQNYPP9FAGb9/WUMqa7Ka1ujxx3HSaeO4tP9B1j7h80FqE5yVfLBe+P5d1nxyhqGjqjhjplfymtbV825EIDfLlpOa8uhQpQnOSr54AH85z/9lIPNLcw873S+ck5uv6ebcPwILrt+Cm1tbSx6eFmBK5Su6hHB27puB4/c/TQAd19/KeefPrZL6w/o15f7rrmCyv79+OXjL7Np9bbuKFO6oEcED+AXj/6W+c+9Sd8+Fcy7eQbXXXBWVusNGVjFj2+6mlNqP8Om1dt45O7/7eZKJRulNbwxg/969mUM+LvLJvPPX7uAS846lYeXvMbrHxx9oGAGl585gTsvm8LoYUP4cPc+vnfjQ7rLQImwYnboHmzDPV27gYpBGe5/V9kPgClfPovb77+BwcNTy2/ftIt3X1vD9i0f4w4jPzuMuvMnMGrMCADWvbuFe/7mIXatTn8kq3G1+XvBn65390mZlsu3z8VGoAk4BLRm84aF8Mov3ubtF9/nr26axsxbLub4MSM4PglZR7s+3M3j9y/hhafeoC1D8xUprrz2eEnwJrn7rmyWL9Qe77B1Koxxp5/AaeeMZ8iIwZgZ+3bv54P6Dax5ZxNtbX/+97Xt0Z0EultR9niloK3NWbdiC+tWbMH6HR1MKU35HtU68LyZ1ZvZnEIUJL1Dvnu8qe7eYGbHAUvN7AN3f6njAkkg5wBUkf7m2tJ75LXHc/eG5LERWESqzdSRy6jBihwlnyZ61WZW0/4cuBRYWajCpLzl81FbCyxKxsn2BX7m7r/KuFaao+i2pqY8ypGeJJ8GK+uBzxewFulFesy1WikvCp6EUPAkhIInIRQ8CaHgSQgFT0IoeBJCwZMQCp6EUPAkhIInIRQ8CaHgSQgFT0IoeBJCwZMQCp6EUPAkhIInIRQ8CaHgSQgFT0IoeBJCwZMQCp6EUPAkhIInIRQ8CaHgSQgFT0JkDJ6ZLTCzRjNb2WHacDNbamZrk8dh3VumlJts9niPAdOPmHYXsMzdxwPLktciWcsYvOQu7h8fMXkGsDB5vhC4qsB1SZnL9Tterbu3997cTup+yCJZy/vgwlM9qTq9o7aZzTGz5Wa2vIXmfN9OykSuwdthZqMAksfGzhZUnws5llyDtxiYnTyfDTxbmHKkt8jmdMoTwGvABDPbamY3AXOBS8xsLXBx8lokaxn7XLj7tZ3MuqjAtUgvoisXEkLBkxAKnoRQ8CSEgichFDwJoeBJCAVPQih4EkLBkxAKnoRQ8CSEgichFDwJoeBJCAVPQih4EkLBkxAKnoRQ8CSEgichFDwJoeBJCAVPQih4EkLBkxAKnoRQ8CSEgichFDwJoeBJCAVPQuTaYOW7ZtZgZu8kf1d0b5lSbnJtsAIwz93rkr8lhS1Lyl2uDVZE8pLPd7zbzGxF8lHcaS8z9bmQY8k1eA8CpwB1wDbg/s4WVJ8LOZacgufuO9z9kLu3AT8CJhe2LCl3OQWvvatPYiawsrNlRY4lY5+LpMHKNGCEmW0F7gGmmVkdqR5mG4Gbu7FGKUO5NliZ3w21SC+iKxcSQsGTEAqehFDwJISCJyEUPAmh4EkIBU9CKHgSQsGTEAqehFDwJISCJyEUPAmh4EkIBU9CK
HgSQsGTEAqehFDwJISCJyEUPAmh4EkIBU9CKHgSQsGTEAqehFDwJISCJyEUPAmh4EmIbPpcnGhmvzGz98xslZndnkwfbmZLzWxt8tjpDbhFjpTNHq8V+La7TwTOAb5lZhOBu4Bl7j4eWJa8FslKNn0utrn775PnTcD7wGhgBrAwWWwhcFV3FSnlJ+OtaDsys5OBs4A3gFp335bM2g7UdrLOHGAOQBUDc61TykzWBxdmNgh4BrjD3fd1nOfuTupG3EdRnws5lqyCZ2b9SIXup+7+82Tyjva2A8ljY/eUKOUom6NaI3WX9/fd/fsdZi0GZifPZwPPFr48KVfZfMebAnwDeNfM3kmmfQeYCzxlZjcBm4BZ3VOilKNs+ly8DFgnsy8qbDnSW+jKhYRQ8CSEgichFDwJoeBJCAVPQih4EkLBkxAKnoRQ8CSEgichFDwJoeBJCAVPQih4EkLBkxAKnoRQ8CSEgichFDwJoeBJCAVPQih4EkLBkxAKnoRQ8CSEgichFDwJoeBJCAVPQih4EiKfPhffNbMGM3sn+bui+8uVcpHNHUHb+1z83sxqgHozW5rMm+fu93VfeVKusrkj6DZgW/K8ycza+1yI5KxL3/GO6HMBcJuZrTCzBWopJV2RT5+LB4FTgDpSe8T7O1lvjpktN7PlLTQXoGQpBzn3uXD3He5+yN3bgB8Bk4+1rhqsyLHk3OeivblKYiawsvDlSbnKp8/FtWZWR6qV1Ebg5m6pUMpSPn0ulhS+HOktdOVCQih4EkLBkxAKnoRQ8CSEgichFDwJYe5evDcz20mqqXK7EcCuohXQdaVeH5RejWPcfWSmhYoavKPe3Gy5u08KKyCDUq8PekaNx6KPWgmh4EmI6OA9Evz+mZR6fdAzajxK6Hc86b2i93jSS4UEz8ymm9lqM1tnZndF1JCJmW00s3eToZvLS6CeBWbWaGYrO0wbbmZLzWxt8thjxr0UPXhm1gd4ALgcmEjqB6UTi11Hli5w97oSOV3xGDD9iGl3AcvcfTywLHndI0Ts8SYD69x9vbsfBJ4EZgTU0aO4+0vAx0dMngEsTJ4vBK4qalF5iAjeaGBLh9dbKc1xug48b2b1ZjYnuphO1CbjngG2A7WRxXRFNmMuequp7t5gZscBS83sg2SvU5Lc3c2sx5yiiNjjNQAndnh9QjKtpLh7Q/LYCCyik+GbwXa0j/ZLHhuD68laRPDeAsab2VgzqwSuARYH1NEpM6tO7hODmVUDl1KawzcXA7OT57OBZwNr6ZKif9S6e6uZ3QY8B/QBFrj7qmLXkUEtsCg1pJi+wM/c/VeRBZnZE8A0YISZbQXuAeYCT5nZTaR+9TMrrsKu0ZULCaErFxJCwZMQCp6EUPAkhIInIRQ8CaHgSQgFT0L8Pyp94TKMVNfuAAAAAElFTkSuQmCC\n",
"text/plain": [
"<Figure size 432x288 with 1 Axes>"
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"fig, ax = plt.subplots(1)\n",
"img = df.iloc[0].Blobs\n",
"xc, yc, a, b, theta = getEllipseParams(img)\n",
"ax.imshow(img)\n",
"e = Ellipse(xy=[xc,yc], width=a*2, height=b*2, angle=math.degrees(theta), fill=False, lw=2, edgecolor='w')\n",
"ax.add_artist(e)\n"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"lst = df.Blobs.apply(lambda x: getEllipseParams(x))"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"lst2 = np.vstack(lst.values)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(618012, 5)"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"lst2.shape"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"df[\"XC\"] = lst2[:,0]\n",
"df[\"YC\"] = lst2[:,1]\n",
"df[\"EllipseW\"] = lst2[:,2]\n",
"df[\"EllipseH\"] = lst2[:,3]\n",
"df[\"EllipseTheta\"] = lst2[:,4]"
]
},
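{
"cell_type": "markdown",
"metadata": {},
"source": [
"Rows where the ellipse fit failed carry the -1 sentinel in all five columns; a quick check (sketch) of how often that happens:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sketch: count blobs for which getEllipseParams() returned the -1 sentinel.\n",
"failed = df[[\"XC\", \"YC\", \"EllipseW\", \"EllipseH\", \"EllipseTheta\"]].eq(-1).all(axis=1)\n",
"print(\"%d of %d blobs without a valid ellipse fit\" % (failed.sum(), len(df)))"
]
},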
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"df[\"Area\"] = df[\"EllipseW\"] * df[\"EllipseH\"] * np.pi\n",
"df[\"AvgCapa\"] = df.Blobs.apply(lambda x: np.mean(x))\n",
"df[\"SumCapa\"] = df.Blobs.apply(lambda x: np.sum(x))"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[8, 11, 6, 7, 16, 15, 14, 10, 9, 2, 3, 13, 17, 5, 12, 1, 4]"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"lst = list(range(1, df.userID.max()))\n",
"SEED = 42#448\n",
"random.seed(SEED)\n",
"random.shuffle(lst)\n",
"lst"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"dfY = df[df.Set == \"Train\"].copy(deep=True)\n",
"dfT = df[(df.Set == \"Test\") & (df.Version == \"Normal\")].copy(deep=True)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"minmax = min(len(dfY[dfY.Input == \"Finger\"]), len(dfY[dfY.Input == \"Knuckle\"]))\n",
"dfX = dfY[dfY.Input == \"Finger\"].sample(minmax)\n",
"dfZ = dfY[dfY.Input == \"Knuckle\"].sample(minmax)\n",
"dfY = pd.concat([dfX,dfZ])\n",
"\n",
"minmax = min(len(dfT[dfT.Input == \"Finger\"]), len(dfT[dfT.Input == \"Knuckle\"]))\n",
"dfX = dfT[dfT.Input == \"Finger\"].sample(minmax)\n",
"dfZ = dfT[dfT.Input == \"Knuckle\"].sample(minmax)\n",
"dfT = pd.concat([dfX,dfZ])"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>userID</th>\n",
" <th>Timestamp</th>\n",
" <th>Current_Task</th>\n",
" <th>Task_amount</th>\n",
" <th>TaskID</th>\n",
" <th>VersionID</th>\n",
" <th>RepetitionID</th>\n",
" <th>Actual_Data</th>\n",
" <th>Is_Pause</th>\n",
" <th>Image</th>\n",
" <th>...</th>\n",
" <th>InputMethod</th>\n",
" <th>Set</th>\n",
" <th>XC</th>\n",
" <th>YC</th>\n",
" <th>EllipseW</th>\n",
" <th>EllipseH</th>\n",
" <th>EllipseTheta</th>\n",
" <th>Area</th>\n",
" <th>AvgCapa</th>\n",
" <th>SumCapa</th>\n",
" </tr>\n",
" <tr>\n",
" <th>Input</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>Finger</th>\n",
" <td>9421</td>\n",
" <td>9421</td>\n",
" <td>9421</td>\n",
" <td>9421</td>\n",
" <td>9421</td>\n",
" <td>9421</td>\n",
" <td>9421</td>\n",
" <td>9421</td>\n",
" <td>9421</td>\n",
" <td>9421</td>\n",
" <td>...</td>\n",
" <td>9421</td>\n",
" <td>9421</td>\n",
" <td>9421</td>\n",
" <td>9421</td>\n",
" <td>9421</td>\n",
" <td>9421</td>\n",
" <td>9421</td>\n",
" <td>9421</td>\n",
" <td>9421</td>\n",
" <td>9421</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Knuckle</th>\n",
" <td>9421</td>\n",
" <td>9421</td>\n",
" <td>9421</td>\n",
" <td>9421</td>\n",
" <td>9421</td>\n",
" <td>9421</td>\n",
" <td>9421</td>\n",
" <td>9421</td>\n",
" <td>9421</td>\n",
" <td>9421</td>\n",
" <td>...</td>\n",
" <td>9421</td>\n",
" <td>9421</td>\n",
" <td>9421</td>\n",
" <td>9421</td>\n",
" <td>9421</td>\n",
" <td>9421</td>\n",
" <td>9421</td>\n",
" <td>9421</td>\n",
" <td>9421</td>\n",
" <td>9421</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>2 rows × 31 columns</p>\n",
"</div>"
],
"text/plain": [
" userID Timestamp Current_Task Task_amount TaskID VersionID \\\n",
"Input \n",
"Finger 9421 9421 9421 9421 9421 9421 \n",
"Knuckle 9421 9421 9421 9421 9421 9421 \n",
"\n",
" RepetitionID Actual_Data Is_Pause Image ... InputMethod Set \\\n",
"Input ... \n",
"Finger 9421 9421 9421 9421 ... 9421 9421 \n",
"Knuckle 9421 9421 9421 9421 ... 9421 9421 \n",
"\n",
" XC YC EllipseW EllipseH EllipseTheta Area AvgCapa SumCapa \n",
"Input \n",
"Finger 9421 9421 9421 9421 9421 9421 9421 9421 \n",
"Knuckle 9421 9421 9421 9421 9421 9421 9421 9421 \n",
"\n",
"[2 rows x 31 columns]"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dfT.groupby(\"Input\").count()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# FEATURE SET: sum of capacitance, avg of capacitance, ellipse area, ellipse width, height and theta."
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"features = [\"SumCapa\", \"AvgCapa\", \"Area\", \"EllipseW\", \"EllipseH\", \"EllipseTheta\"]"
]
},
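{
"cell_type": "markdown",
"metadata": {},
"source": [
"A quick look (sketch) at how the hand-crafted features differ between the two classes on the balanced training set:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sketch: per-class means of the hand-crafted features on the balanced training set.\n",
"dfY.groupby(\"Input\")[features].mean()"
]
},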
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# ZeroR"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"dfT[\"InputMethodPred\"] = 1"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[[ 0 9421]\n",
" [ 0 9421]]\n",
"Accuray: 0.50\n",
"Recall: 0.50\n",
"Precision: 0.50\n",
"F1-Score: 0.33\n",
" precision recall f1-score support\n",
"\n",
" Knuckle 0.00 0.00 0.00 9421\n",
" Finger 0.50 1.00 0.67 9421\n",
"\n",
" micro avg 0.50 0.50 0.50 18842\n",
" macro avg 0.25 0.50 0.33 18842\n",
"weighted avg 0.25 0.50 0.33 18842\n",
"\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/usr/local/lib/python3.6/dist-packages/sklearn/metrics/classification.py:1143: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 in labels with no predicted samples.\n",
" 'precision', 'predicted', average, warn_for)\n",
"/usr/local/lib/python3.6/dist-packages/sklearn/metrics/classification.py:1143: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples.\n",
" 'precision', 'predicted', average, warn_for)\n"
]
}
],
"source": [
"print(confusion_matrix(dfT.InputMethod.values, dfT.InputMethodPred.values, labels=[0, 1]))\n",
"print(\"Accuray: %.2f\" % accuracy_score(dfT.InputMethod.values, dfT.InputMethodPred.values))\n",
"print(\"Recall: %.2f\" % metrics.recall_score(dfT.InputMethod.values, dfT.InputMethodPred.values, average=\"macro\"))\n",
"print(\"Precision: %.2f\" % metrics.average_precision_score(dfT.InputMethod.values, dfT.InputMethodPred.values, average=\"macro\"))\n",
"print(\"F1-Score: %.2f\" % metrics.f1_score(dfT.InputMethod.values, dfT.InputMethodPred.values, average=\"macro\"))\n",
"print(classification_report(dfT.InputMethod.values, dfT.InputMethodPred.values, target_names=target_names))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# DecisionTreeClassifier"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Fitting 5 folds for each of 240 candidates, totalling 1200 fits\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"[Parallel(n_jobs=30)]: Using backend LokyBackend with 30 concurrent workers.\n",
"[Parallel(n_jobs=30)]: Done 140 tasks | elapsed: 10.4s\n",
"[Parallel(n_jobs=30)]: Done 390 tasks | elapsed: 31.4s\n",
"[Parallel(n_jobs=30)]: Done 740 tasks | elapsed: 1.3min\n",
"[Parallel(n_jobs=30)]: Done 1200 out of 1200 | elapsed: 2.4min finished\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'max_depth': 22, 'min_samples_split': 2} 0.8120637794585754\n",
"[[7409 2012]\n",
" [3096 6325]]\n",
"Accuray: 0.73\n",
"Recall: 0.73\n",
"Precision: 0.67\n",
"F1-Score: 0.73\n",
" precision recall f1-score support\n",
"\n",
" Knuckle 0.71 0.79 0.74 9421\n",
" Finger 0.76 0.67 0.71 9421\n",
"\n",
" micro avg 0.73 0.73 0.73 18842\n",
" macro avg 0.73 0.73 0.73 18842\n",
"weighted avg 0.73 0.73 0.73 18842\n",
"\n",
"CPU times: user 7.26 s, sys: 3.38 s, total: 10.6 s\n",
"Wall time: 2min 29s\n"
]
}
],
"source": [
"%%time\n",
"param_grid = {'max_depth': range(2,32,1),\n",
" 'min_samples_split':range(2,10,1)}\n",
"#TODO: Create Baseline for different ML stuff\n",
"clf = GridSearchCV(tree.DecisionTreeClassifier(), \n",
" param_grid,\n",
" cv=5 , n_jobs=os.cpu_count()-2, verbose=1)\n",
"clf.fit(dfY[features].values, dfY.InputMethod.values)\n",
"print(clf.best_params_, clf.best_score_)\n",
"dfT[\"InputMethodPred\"] = clf.predict(dfT[features].values) \n",
"\n",
"print(confusion_matrix(dfT.InputMethod.values, dfT.InputMethodPred.values, labels=[0, 1]))\n",
"print(\"Accuray: %.3f\" % accuracy_score(dfT.InputMethod.values, dfT.InputMethodPred.values))\n",
"print(\"Recall: %.3f\" % metrics.recall_score(dfT.InputMethod.values, dfT.InputMethodPred.values, average=\"macro\"))\n",
"print(\"Precision: %.3f\" % metrics.average_precision_score(dfT.InputMethod.values, dfT.InputMethodPred.values, average=\"macro\"))\n",
"print(\"F1-Score: %.3f\" % metrics.f1_score(dfT.InputMethod.values, dfT.InputMethodPred.values, average=\"macro\"))\n",
"print(classification_report(dfT.InputMethod.values, dfT.InputMethodPred.values, target_names=target_names))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# RandomForestClassifier"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Fitting 5 folds for each of 180 candidates, totalling 900 fits\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"[Parallel(n_jobs=94)]: Using backend LokyBackend with 94 concurrent workers.\n",
"[Parallel(n_jobs=94)]: Done 12 tasks | elapsed: 1.2min\n",
"[Parallel(n_jobs=94)]: Done 262 tasks | elapsed: 4.0min\n",
"[Parallel(n_jobs=94)]: Done 612 tasks | elapsed: 9.2min\n",
"[Parallel(n_jobs=94)]: Done 900 out of 900 | elapsed: 12.8min finished\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'max_depth': 60, 'n_estimators': 63} 0.8669582104371696\n",
"[[8175 1246]\n",
" [2765 6656]]\n",
"Accuray: 0.79\n",
"Recall: 0.71\n",
"Precision: 0.74\n",
"F1-Score: 0.77\n",
" precision recall f1-score support\n",
"\n",
" Knuckle 0.75 0.87 0.80 9421\n",
" Finger 0.84 0.71 0.77 9421\n",
"\n",
" micro avg 0.79 0.79 0.79 18842\n",
" macro avg 0.79 0.79 0.79 18842\n",
"weighted avg 0.79 0.79 0.79 18842\n",
"\n",
"CPU times: user 42.1 s, sys: 834 ms, total: 42.9 s\n",
"Wall time: 13min 28s\n"
]
}
],
"source": [
"%%time\n",
"param_grid = {'n_estimators': range(55,64,1),\n",
" 'max_depth': range(50,70,1)}\n",
"#TODO: Create Baseline for different ML stuff\n",
"clf = GridSearchCV(ensemble.RandomForestClassifier(), \n",
" param_grid,\n",
" cv=5 , n_jobs=os.cpu_count()-2, verbose=1)\n",
"clf.fit(dfY[features].values, dfY.InputMethod.values)\n",
"print(clf.best_params_, clf.best_score_)\n",
"dfT[\"InputMethodPred\"] = clf.predict(dfT[features].values) \n",
"\n",
"print(confusion_matrix(dfT.InputMethod.values, dfT.InputMethodPred.values, labels=[0, 1]))\n",
"print(\"Accuray: %.2f\" % accuracy_score(dfT.InputMethod.values, dfT.InputMethodPred.values))\n",
"print(\"Recall: %.2f\" % metrics.recall_score(dfT.InputMethod.values, dfT.InputMethodPred.values))\n",
"print(\"Precision: %.2f\" % metrics.average_precision_score(dfT.InputMethod.values, dfT.InputMethodPred.values))\n",
"print(\"F1-Score: %.2f\" % metrics.f1_score(dfT.InputMethod.values, dfT.InputMethodPred.values))\n",
"print(classification_report(dfT.InputMethod.values, dfT.InputMethodPred.values, target_names=target_names))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# kNN"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Fitting 5 folds for each of 62 candidates, totalling 310 fits\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"[Parallel(n_jobs=94)]: Using backend LokyBackend with 94 concurrent workers.\n",
"[Parallel(n_jobs=94)]: Done 12 tasks | elapsed: 17.7s\n",
"[Parallel(n_jobs=94)]: Done 310 out of 310 | elapsed: 1.5min finished\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'n_neighbors': 2} 0.800546827088748\n",
"[[8187 1234]\n",
" [4318 5103]]\n",
"Accuray: 0.71\n",
"Recall: 0.54\n",
"Precision: 0.67\n",
"F1-Score: 0.65\n",
" precision recall f1-score support\n",
"\n",
" Knuckle 0.65 0.87 0.75 9421\n",
" Finger 0.81 0.54 0.65 9421\n",
"\n",
" micro avg 0.71 0.71 0.71 18842\n",
" macro avg 0.73 0.71 0.70 18842\n",
"weighted avg 0.73 0.71 0.70 18842\n",
"\n",
"CPU times: user 1.74 s, sys: 300 ms, total: 2.04 s\n",
"Wall time: 1min 30s\n"
]
}
],
"source": [
"%%time\n",
"param_grid = {'n_neighbors': range(2,64,1),\n",
" #weights': ['uniform', 'distance']\n",
" }\n",
"#TODO: Create Baseline for different ML stuff\n",
"clf = GridSearchCV(neighbors.KNeighborsClassifier(),\n",
" param_grid,\n",
" cv=5 , n_jobs=os.cpu_count()-2, verbose=1)\n",
"clf.fit(dfY[features].values, dfY.InputMethod.values)\n",
"print(clf.best_params_, clf.best_score_)\n",
"dfT[\"InputMethodPred\"] = clf.predict(dfT[features].values) \n",
"\n",
"print(confusion_matrix(dfT.InputMethod.values, dfT.InputMethodPred.values, labels=[0, 1]))\n",
"print(\"Accuray: %.2f\" % accuracy_score(dfT.InputMethod.values, dfT.InputMethodPred.values))\n",
"print(\"Recall: %.2f\" % metrics.recall_score(dfT.InputMethod.values, dfT.InputMethodPred.values, average=\"macro\"))\n",
"print(\"Precision: %.2f\" % metrics.average_precision_score(dfT.InputMethod.values, dfT.InputMethodPred.values, average=\"macro\"))\n",
"print(\"F1-Score: %.2f\" % metrics.f1_score(dfT.InputMethod.values, dfT.InputMethodPred.values, average=\"macro\"))\n",
"print(classification_report(dfT.InputMethod.values, dfT.InputMethodPred.values, target_names=target_names))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# SVM"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Fitting 5 folds for each of 9 candidates, totalling 45 fits\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"[Parallel(n_jobs=94)]: Using backend LokyBackend with 94 concurrent workers.\n",
"[Parallel(n_jobs=94)]: Done 42 out of 45 | elapsed: 1056.5min remaining: 75.5min\n",
"[Parallel(n_jobs=94)]: Done 45 out of 45 | elapsed: 1080.5min finished\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'C': 10.0, 'gamma': 10.0} 0.8256943024851795\n",
"CPU times: user 2h 42min 9s, sys: 23.6 s, total: 2h 42min 33s\n",
"Wall time: 20h 43min 1s\n"
]
}
],
"source": [
"%%time\n",
"C_range = np.logspace(1, 3,3)\n",
"gamma_range = np.logspace(-1, 1, 3)\n",
"param_grid = dict(gamma=gamma_range, C=C_range)\n",
"clf = GridSearchCV(sklearn.svm.SVC(), \n",
" param_grid,\n",
" cv=5 , n_jobs=os.cpu_count()-2, verbose=1)\n",
"clf.fit(dfY[features].values, dfY.InputMethod.values)\n",
"print(clf.best_params_, clf.best_score_)\n",
"\n",
"dfT[\"InputMethodPred\"] = clf.predict(dfT[features].values)"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'C': 10.0, 'gamma': 10.0} 0.8256943024851795\n",
"[[7106 2315]\n",
" [2944 6477]]\n",
"Accuray: 0.72\n",
"Recall: 0.69\n",
"Precision: 0.66\n",
"F1-Score: 0.71\n",
" precision recall f1-score support\n",
"\n",
" Knuckle 0.71 0.75 0.73 9421\n",
" Finger 0.74 0.69 0.71 9421\n",
"\n",
" micro avg 0.72 0.72 0.72 18842\n",
" macro avg 0.72 0.72 0.72 18842\n",
"weighted avg 0.72 0.72 0.72 18842\n",
"\n"
]
}
],
"source": [
"print(clf.best_params_, clf.best_score_)\n",
"print(confusion_matrix(dfT.InputMethod.values, dfT.InputMethodPred.values, labels=[0, 1]))\n",
"print(\"Accuray: %.2f\" % accuracy_score(dfT.InputMethod.values, dfT.InputMethodPred.values))\n",
"print(\"Recall: %.2f\" % metrics.recall_score(dfT.InputMethod.values, dfT.InputMethodPred.values))\n",
"print(\"Precision: %.2f\" % metrics.average_precision_score(dfT.InputMethod.values, dfT.InputMethodPred.values))\n",
"print(\"F1-Score: %.2f\" % metrics.f1_score(dfT.InputMethod.values, dfT.InputMethodPred.values))\n",
"print(classification_report(dfT.InputMethod.values, dfT.InputMethodPred.values, target_names=target_names))"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.7"
}
},
"nbformat": 4,
"nbformat_minor": 2
}