mayersn 0fb6efadd5 Added notebooks and models

2019-08-07 17:57:12 -04:00

32 KiB

Raw Permalink Blame History

In [1]:

## USE for Multi GPU Systems
#import os
#os.environ["CUDA_VISIBLE_DEVICES"]="0"

%matplotlib inline

from scipy.odr import *
from scipy.stats import *
import numpy as np
import pandas as pd
import os
import time
import matplotlib.pyplot as plt
import ast
from multiprocessing import Pool

import scipy

from IPython import display
from matplotlib.patches import Rectangle

from sklearn.metrics import mean_squared_error
import json

import scipy.stats as st
from sklearn.metrics import r2_score


from matplotlib import cm
from mpl_toolkits.mplot3d import axes3d
import matplotlib.pyplot as plt
from matplotlib.patches import Ellipse

import copy

from sklearn.model_selection import LeaveOneOut, LeavePOut

from multiprocessing import Pool
import cv2

import sklearn
import random
from sklearn import neighbors
from sklearn import svm
from sklearn import tree
from sklearn import ensemble
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import math

# Importing matplotlib to plot images.
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline

# Importing SK-learn to calculate precision and recall
import sklearn
from sklearn import metrics
from sklearn.model_selection import train_test_split, cross_val_score, LeaveOneGroupOut
from sklearn.utils import shuffle
from sklearn.model_selection import GridSearchCV
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

import pickle as pkl
import h5py

from pathlib import Path
import os.path
import sys
import datetime
import time

import skimage

target_names = ["Knuckle", "Finger"]

In [2]:

from skimage import measure
from skimage.measure import find_contours, approximate_polygon, \
    subdivide_polygon, EllipseModel, LineModelND

In [3]:

def getEllipseParams(img):
    points = np.argwhere(img > 40)
    
    contours = skimage.measure.find_contours(img, 40)
    points_to_approx = []
    highest_val = 0
    for n, contour in enumerate(contours):
        if (len(contour) > highest_val):
            points_to_approx = contour
            highest_val = len(contour)   
    
    try:
        contour = np.fliplr(points_to_approx)
    except Exception as inst:
        return [-1, -1, -1, -1, -1]
    

    ellipse = skimage.measure.fit.EllipseModel()
    ellipse.estimate(contour)
    try:
        xc, yc, a, b, theta = ellipse.params  
    except Exception as int:
        return [-1, -1, -1, -1, -1]
    
    return [xc, yc, a, b, theta]

In [4]:

# the data, split between train and test sets
df = pd.read_pickle("DataStudyCollection/df_statistics.pkl")

lst = df.userID.unique()
np.random.seed(42)
np.random.shuffle(lst)
test_ids = lst[-5:]
train_ids = lst[:-5]

df["Set"] = "Test"
df.loc[df.userID.isin(train_ids), "Set"] = "Train"
print(train_ids, test_ids)
print(len(train_ids), ":", len(test_ids))
print(len(train_ids) / len(lst), ":", len(test_ids)/ len(lst))

#df_train = df[df.userID.isin(train_ids)]
#df_test = df[df.userID.isin(test_ids) & (df.Version == "Normal")]

[ 1  2  9  6  4 14 17 16 12  3 10 18  5] [13  8 11 15  7]
13 : 5
0.7222222222222222 : 0.2777777777777778

In [5]:

fig, ax = plt.subplots(1)
img = df.iloc[0].Blobs
xc, yc, a, b, theta = getEllipseParams(img)
ax.imshow(img)
e = Ellipse(xy=[xc,yc], width=a*2, height=b*2, angle=math.degrees(theta), fill=False, lw=2, edgecolor='w')
ax.add_artist(e)

Out[5]:

<matplotlib.patches.Ellipse at 0x7ff60430b668>

In [6]:

lst = df.Blobs.apply(lambda x: getEllipseParams(x))

In [7]:

lst2 = np.vstack(lst.values)

In [8]:

lst2.shape

Out[8]:

(618012, 5)

In [9]:

df["XC"] = lst2[:,0]
df["YC"] = lst2[:,1]
df["EllipseW"] = lst2[:,2]
df["EllipseH"] = lst2[:,3]
df["EllipseTheta"] = lst2[:,4]

In [10]:

df["Area"] = df["EllipseW"] * df["EllipseH"] * np.pi
df["AvgCapa"] = df.Blobs.apply(lambda x: np.mean(x))
df["SumCapa"] = df.Blobs.apply(lambda x: np.sum(x))

In [11]:

lst = list(range(1, df.userID.max()))
SEED = 42#448
random.seed(SEED)
random.shuffle(lst)
lst

Out[11]:

[8, 11, 6, 7, 16, 15, 14, 10, 9, 2, 3, 13, 17, 5, 12, 1, 4]

In [12]:

dfY = df[df.Set == "Train"].copy(deep=True)
dfT = df[(df.Set == "Test") & (df.Version == "Normal")].copy(deep=True)

In [13]:

minmax = min(len(dfY[dfY.Input == "Finger"]), len(dfY[dfY.Input == "Knuckle"]))
dfX = dfY[dfY.Input == "Finger"].sample(minmax)
dfZ = dfY[dfY.Input == "Knuckle"].sample(minmax)
dfY = pd.concat([dfX,dfZ])

minmax = min(len(dfT[dfT.Input == "Finger"]), len(dfT[dfT.Input == "Knuckle"]))
dfX = dfT[dfT.Input == "Finger"].sample(minmax)
dfZ = dfT[dfT.Input == "Knuckle"].sample(minmax)
dfT = pd.concat([dfX,dfZ])

In [14]:

dfT.groupby("Input").count()

Out[14]:

	userID	Timestamp	Current_Task	Task_amount	TaskID	VersionID	RepetitionID	Actual_Data	Is_Pause	Image	...	InputMethod	Set	XC	YC	EllipseW	EllipseH	EllipseTheta	Area	AvgCapa	SumCapa
Input
Finger	9421	9421	9421	9421	9421	9421	9421	9421	9421	9421	...	9421	9421	9421	9421	9421	9421	9421	9421	9421	9421
Knuckle	9421	9421	9421	9421	9421	9421	9421	9421	9421	9421	...	9421	9421	9421	9421	9421	9421	9421	9421	9421	9421

2 rows × 31 columns

FEATURE SET: sum of capacitance, avg of capacitance, ellipse area, ellipse width, height and theta.¶

In [15]:

features = ["SumCapa", "AvgCapa", "Area", "EllipseW", "EllipseH", "EllipseTheta"]

ZeroR¶

In [16]:

dfT["InputMethodPred"] = 1

In [17]:

print(confusion_matrix(dfT.InputMethod.values, dfT.InputMethodPred.values, labels=[0, 1]))
print("Accuray: %.2f" % accuracy_score(dfT.InputMethod.values, dfT.InputMethodPred.values))
print("Recall: %.2f" % metrics.recall_score(dfT.InputMethod.values, dfT.InputMethodPred.values, average="macro"))
print("Precision: %.2f" % metrics.average_precision_score(dfT.InputMethod.values, dfT.InputMethodPred.values, average="macro"))
print("F1-Score: %.2f" % metrics.f1_score(dfT.InputMethod.values, dfT.InputMethodPred.values, average="macro"))
print(classification_report(dfT.InputMethod.values, dfT.InputMethodPred.values, target_names=target_names))

[[   0 9421]
 [   0 9421]]
Accuray: 0.50
Recall: 0.50
Precision: 0.50
F1-Score: 0.33
              precision    recall  f1-score   support

     Knuckle       0.00      0.00      0.00      9421
      Finger       0.50      1.00      0.67      9421

   micro avg       0.50      0.50      0.50     18842
   macro avg       0.25      0.50      0.33     18842
weighted avg       0.25      0.50      0.33     18842

/usr/local/lib/python3.6/dist-packages/sklearn/metrics/classification.py:1143: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 in labels with no predicted samples.
  'precision', 'predicted', average, warn_for)
/usr/local/lib/python3.6/dist-packages/sklearn/metrics/classification.py:1143: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples.
  'precision', 'predicted', average, warn_for)

DecisionTreeClassifier¶

In [18]:

%%time
param_grid = {'max_depth': range(2,32,1),
              'min_samples_split':range(2,10,1)}
#TODO: Create Baseline for different ML stuff
clf = GridSearchCV(tree.DecisionTreeClassifier(), 
                   param_grid,
                   cv=5 , n_jobs=os.cpu_count()-2, verbose=1)
clf.fit(dfY[features].values, dfY.InputMethod.values)
print(clf.best_params_, clf.best_score_)
dfT["InputMethodPred"] = clf.predict(dfT[features].values) 

print(confusion_matrix(dfT.InputMethod.values, dfT.InputMethodPred.values, labels=[0, 1]))
print("Accuray: %.3f" % accuracy_score(dfT.InputMethod.values, dfT.InputMethodPred.values))
print("Recall: %.3f" % metrics.recall_score(dfT.InputMethod.values, dfT.InputMethodPred.values, average="macro"))
print("Precision: %.3f" % metrics.average_precision_score(dfT.InputMethod.values, dfT.InputMethodPred.values, average="macro"))
print("F1-Score: %.3f" % metrics.f1_score(dfT.InputMethod.values, dfT.InputMethodPred.values, average="macro"))
print(classification_report(dfT.InputMethod.values, dfT.InputMethodPred.values, target_names=target_names))

Fitting 5 folds for each of 240 candidates, totalling 1200 fits

[Parallel(n_jobs=30)]: Using backend LokyBackend with 30 concurrent workers.
[Parallel(n_jobs=30)]: Done 140 tasks      | elapsed:   10.4s
[Parallel(n_jobs=30)]: Done 390 tasks      | elapsed:   31.4s
[Parallel(n_jobs=30)]: Done 740 tasks      | elapsed:  1.3min
[Parallel(n_jobs=30)]: Done 1200 out of 1200 | elapsed:  2.4min finished

{'max_depth': 22, 'min_samples_split': 2} 0.8120637794585754
[[7409 2012]
 [3096 6325]]
Accuray: 0.73
Recall: 0.73
Precision: 0.67
F1-Score: 0.73
              precision    recall  f1-score   support

     Knuckle       0.71      0.79      0.74      9421
      Finger       0.76      0.67      0.71      9421

   micro avg       0.73      0.73      0.73     18842
   macro avg       0.73      0.73      0.73     18842
weighted avg       0.73      0.73      0.73     18842

CPU times: user 7.26 s, sys: 3.38 s, total: 10.6 s
Wall time: 2min 29s

RandomForestClassifier¶

In [19]:

%%time
param_grid = {'n_estimators': range(55,64,1),
              'max_depth': range(50,70,1)}
#TODO: Create Baseline for different ML stuff
clf = GridSearchCV(ensemble.RandomForestClassifier(), 
                   param_grid,
                   cv=5 , n_jobs=os.cpu_count()-2, verbose=1)
clf.fit(dfY[features].values, dfY.InputMethod.values)
print(clf.best_params_, clf.best_score_)
dfT["InputMethodPred"] = clf.predict(dfT[features].values) 

print(confusion_matrix(dfT.InputMethod.values, dfT.InputMethodPred.values, labels=[0, 1]))
print("Accuray: %.2f" % accuracy_score(dfT.InputMethod.values, dfT.InputMethodPred.values))
print("Recall: %.2f" % metrics.recall_score(dfT.InputMethod.values, dfT.InputMethodPred.values))
print("Precision: %.2f" % metrics.average_precision_score(dfT.InputMethod.values, dfT.InputMethodPred.values))
print("F1-Score: %.2f" % metrics.f1_score(dfT.InputMethod.values, dfT.InputMethodPred.values))
print(classification_report(dfT.InputMethod.values, dfT.InputMethodPred.values, target_names=target_names))

Fitting 5 folds for each of 180 candidates, totalling 900 fits

[Parallel(n_jobs=94)]: Using backend LokyBackend with 94 concurrent workers.
[Parallel(n_jobs=94)]: Done  12 tasks      | elapsed:  1.2min
[Parallel(n_jobs=94)]: Done 262 tasks      | elapsed:  4.0min
[Parallel(n_jobs=94)]: Done 612 tasks      | elapsed:  9.2min
[Parallel(n_jobs=94)]: Done 900 out of 900 | elapsed: 12.8min finished

{'max_depth': 60, 'n_estimators': 63} 0.8669582104371696
[[8175 1246]
 [2765 6656]]
Accuray: 0.79
Recall: 0.71
Precision: 0.74
F1-Score: 0.77
              precision    recall  f1-score   support

     Knuckle       0.75      0.87      0.80      9421
      Finger       0.84      0.71      0.77      9421

   micro avg       0.79      0.79      0.79     18842
   macro avg       0.79      0.79      0.79     18842
weighted avg       0.79      0.79      0.79     18842

CPU times: user 42.1 s, sys: 834 ms, total: 42.9 s
Wall time: 13min 28s

kNN¶

In [20]:

%%time
param_grid = {'n_neighbors':  range(2,64,1),
              #weights': ['uniform', 'distance']
             }
#TODO: Create Baseline for different ML stuff
clf = GridSearchCV(neighbors.KNeighborsClassifier(),
                   param_grid,
                   cv=5 , n_jobs=os.cpu_count()-2, verbose=1)
clf.fit(dfY[features].values, dfY.InputMethod.values)
print(clf.best_params_, clf.best_score_)
dfT["InputMethodPred"] = clf.predict(dfT[features].values) 

print(confusion_matrix(dfT.InputMethod.values, dfT.InputMethodPred.values, labels=[0, 1]))
print("Accuray: %.2f" % accuracy_score(dfT.InputMethod.values, dfT.InputMethodPred.values))
print("Recall: %.2f" % metrics.recall_score(dfT.InputMethod.values, dfT.InputMethodPred.values, average="macro"))
print("Precision: %.2f" % metrics.average_precision_score(dfT.InputMethod.values, dfT.InputMethodPred.values, average="macro"))
print("F1-Score: %.2f" % metrics.f1_score(dfT.InputMethod.values, dfT.InputMethodPred.values, average="macro"))
print(classification_report(dfT.InputMethod.values, dfT.InputMethodPred.values, target_names=target_names))

Fitting 5 folds for each of 62 candidates, totalling 310 fits

[Parallel(n_jobs=94)]: Using backend LokyBackend with 94 concurrent workers.
[Parallel(n_jobs=94)]: Done  12 tasks      | elapsed:   17.7s
[Parallel(n_jobs=94)]: Done 310 out of 310 | elapsed:  1.5min finished

{'n_neighbors': 2} 0.800546827088748
[[8187 1234]
 [4318 5103]]
Accuray: 0.71
Recall: 0.54
Precision: 0.67
F1-Score: 0.65
              precision    recall  f1-score   support

     Knuckle       0.65      0.87      0.75      9421
      Finger       0.81      0.54      0.65      9421

   micro avg       0.71      0.71      0.71     18842
   macro avg       0.73      0.71      0.70     18842
weighted avg       0.73      0.71      0.70     18842

CPU times: user 1.74 s, sys: 300 ms, total: 2.04 s
Wall time: 1min 30s

SVM¶

In [21]:

%%time
C_range = np.logspace(1, 3,3)
gamma_range = np.logspace(-1, 1, 3)
param_grid = dict(gamma=gamma_range, C=C_range)
clf = GridSearchCV(sklearn.svm.SVC(), 
                   param_grid,
                   cv=5 , n_jobs=os.cpu_count()-2, verbose=1)
clf.fit(dfY[features].values, dfY.InputMethod.values)
print(clf.best_params_, clf.best_score_)

dfT["InputMethodPred"] = clf.predict(dfT[features].values)

Fitting 5 folds for each of 9 candidates, totalling 45 fits

[Parallel(n_jobs=94)]: Using backend LokyBackend with 94 concurrent workers.
[Parallel(n_jobs=94)]: Done  42 out of  45 | elapsed: 1056.5min remaining: 75.5min
[Parallel(n_jobs=94)]: Done  45 out of  45 | elapsed: 1080.5min finished

{'C': 10.0, 'gamma': 10.0} 0.8256943024851795
CPU times: user 2h 42min 9s, sys: 23.6 s, total: 2h 42min 33s
Wall time: 20h 43min 1s

In [22]:

print(clf.best_params_, clf.best_score_)
print(confusion_matrix(dfT.InputMethod.values, dfT.InputMethodPred.values, labels=[0, 1]))
print("Accuray: %.2f" % accuracy_score(dfT.InputMethod.values, dfT.InputMethodPred.values))
print("Recall: %.2f" % metrics.recall_score(dfT.InputMethod.values, dfT.InputMethodPred.values))
print("Precision: %.2f" % metrics.average_precision_score(dfT.InputMethod.values, dfT.InputMethodPred.values))
print("F1-Score: %.2f" % metrics.f1_score(dfT.InputMethod.values, dfT.InputMethodPred.values))
print(classification_report(dfT.InputMethod.values, dfT.InputMethodPred.values, target_names=target_names))

{'C': 10.0, 'gamma': 10.0} 0.8256943024851795
[[7106 2315]
 [2944 6477]]
Accuray: 0.72
Recall: 0.69
Precision: 0.66
F1-Score: 0.71
              precision    recall  f1-score   support

     Knuckle       0.71      0.75      0.73      9421
      Finger       0.74      0.69      0.71      9421

   micro avg       0.72      0.72      0.72     18842
   macro avg       0.72      0.72      0.72     18842
weighted avg       0.72      0.72      0.72     18842

32 KiB Raw Permalink Blame History Unescape Escape