Preprocessing for LSTM: Blob Detection and Cutting

In [1]:
%matplotlib inline

from scipy.odr import *
from scipy.stats import *
import numpy as np
import pandas as pd
import os
import time
import matplotlib.pyplot as plt
import ast
from multiprocessing import Pool, cpu_count

import scipy

from IPython import display
from matplotlib.patches import Rectangle

from sklearn.metrics import mean_squared_error
import json

import scipy.stats as st
from sklearn.metrics import r2_score


from matplotlib import cm
from mpl_toolkits.mplot3d import axes3d

import copy

from sklearn.model_selection import LeaveOneOut, LeavePOut

import cv2
In [2]:
df_filtered = pd.read_pickle("DataStudyEvaluation/df_lstm.pkl")
In [3]:
df_filtered.head()
Out[3]:
userID Timestamp Current_Task Task_amount TaskID VersionID RepetitionID Actual_Data Is_Pause Image IsMax MaxRepetition
178293 1 1553521747262 16 510 0 2 1 True False [0, 0, 0, 0, 0, 0, 0, 0, 2, 1, 3, 1, 1, 1, 2, ... True 1
178294 1 1553521747302 16 510 0 2 1 True False [0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ... True 1
178295 1 1553521747342 16 510 0 2 1 True False [0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ... True 1
178296 1 1553521747388 16 510 0 2 1 True False [0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ... True 1
178297 1 1553521747422 16 510 0 2 1 True False [0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ... True 1
In [4]:
len(df_filtered.userID.unique())
Out[4]:
12
In [5]:
# reshape the flattened capacitive frames to 27x15, clip to the 8-bit range, and cast to uint8
df_filtered.Image = df_filtered.Image.apply(lambda x: x.reshape(27, 15))
df_filtered.Image = df_filtered.Image.apply(lambda x: x.clip(min=0, max=255))
df_filtered.Image = df_filtered.Image.apply(lambda x: x.astype(np.uint8))
df_filtered["ImageSum"] = df_filtered.Image.apply(lambda x: np.sum(x))
In [6]:
# blob detection for the LSTM data (only reports whether a frame contains any blob)
def detect_blobs(image):
    #image = image.reshape(27, 15)
    # pad the 27x15 frame with a 1-px border so blobs touching the edge stay closed
    large = np.ones((29,17), dtype=np.uint8)
    large[1:28,1:16] = image
    # invert and binarize: the low-valued background turns white, touches show up as holes
    temp, thresh = cv2.threshold(cv2.bitwise_not(large), 200, 255, cv2.THRESH_BINARY)
    contours, hierarchy = cv2.findContours(thresh, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)
    # keep plausible blob sizes only (drops the frame-sized outer contour and pixel noise)
    contours = [a for a in contours if cv2.contourArea(a) > 8 and cv2.contourArea(a) < 255]
    return len(contours) > 0
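As an illustrative check on synthetic data (not study frames), an empty frame should yield no blob while a frame with one bright patch should:

empty = np.zeros((27, 15), dtype=np.uint8)
touch = empty.copy()
touch[5:10, 4:8] = 120                             # synthetic 5x4 "touch"
print(detect_blobs(empty), detect_blobs(touch))    # expected: False True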
In [7]:
%%time
pool = Pool(cpu_count() - 1)
temp_blobs = pool.map(detect_blobs, df_filtered.Image)
pool.close()
CPU times: user 1.93 s, sys: 581 ms, total: 2.51 s
Wall time: 2.71 s
In [8]:
df_filtered["ContainsBlobs"] = temp_blobs
In [9]:
# label the input method: knuckle (0) vs. finger (1)
def f(row):
    if row['TaskID'] < 17:
        #val = "Knuckle"
        val = 0
    else:  # TaskID >= 17
        #val = "Finger"
        val = 1
    return val
df_filtered['InputMethod'] = df_filtered.apply(f, axis=1)
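The same labeling can also be done without the row-wise apply; a vectorized equivalent (sketch, same semantics) would be:

# 0 = knuckle (TaskID < 17), 1 = finger (TaskID >= 17)
df_filtered['InputMethod'] = (df_filtered.TaskID >= 17).astype(int)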
In [10]:
# Problem: some timestamps are strings (XXXXE+XXXX), which is not accurate enough; switching to the index instead
"""def cast_to_int(x):
    if type(x) == int:
        return x
    x = str(x).replace(",", ".")
    return int(float(x))

df_filtered.Timestamp = df_filtered.Timestamp.map(cast_to_int)"""
df_filtered.index = range(len(df_filtered))
In [11]:
%%time
# trim each image sequence down to the frames between its first and last detected blob
UserIDs = []
TaskIDs = []
VersionIDs = []
Blobs = []
for userID in df_filtered.userID.unique():
    print(userID)
    for TaskID in df_filtered[df_filtered.userID == userID].TaskID.unique():
        for VersionID in df_filtered[(df_filtered.userID == userID) & (df_filtered.TaskID == TaskID)].VersionID.unique():
            first_blob = -1
            last_blob = -1
            for index, row in df_filtered[(df_filtered.userID == userID) & (df_filtered.TaskID == TaskID) & (df_filtered.VersionID == VersionID)].iterrows():
                if row.ContainsBlobs:
                    last_blob = index
                    if first_blob == -1:
                        first_blob = index
            if first_blob >= 0 and last_blob >= 0:
                UserIDs.append(userID)
                TaskIDs.append(TaskID)
                VersionIDs.append(VersionID)
                Blobs.append(df_filtered[(df_filtered.userID == userID) & (df_filtered.TaskID == TaskID) & (df_filtered.VersionID == VersionID) & (df_filtered.index >= first_blob) & (df_filtered.index <= last_blob)].Image.tolist())
1
2
3
4
5
6
7
8
9
10
11
12
CPU times: user 1min 32s, sys: 60.2 ms, total: 1min 32s
Wall time: 1min 32s
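The triple-nested loop re-filters the full DataFrame for every (user, task, version) combination, which is why this cell runs for about 1.5 minutes. A groupby-based sketch of the same trimming (illustrative, assuming the 0..n-1 index set above; the helper name trim_group is hypothetical) avoids the repeated scans:

def trim_group(g):
    # keep only the frames between the first and last detected blob of this sequence
    blob_idx = g.index[g.ContainsBlobs]
    if len(blob_idx) == 0:
        return None
    return g.loc[blob_idx[0]:blob_idx[-1], "Image"].tolist()

trimmed = (df_filtered.groupby(["userID", "TaskID", "VersionID"])
                      .apply(trim_group)
                      .dropna())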
In [12]:
UserIDs = np.array(UserIDs, dtype=np.int64)
TaskIDs = np.array(TaskIDs, dtype=np.int64)
VersionIDs = np.array(VersionIDs, dtype=np.int64)
In [13]:
df_lstm_all = pd.DataFrame()
df_lstm_all["userID"] = UserIDs
df_lstm_all["TaskID"] = TaskIDs
df_lstm_all["VersionID"] = VersionIDs
df_lstm_all["Blobs"] = Blobs
df_lstm_all.Blobs = df_lstm_all.Blobs.map(np.array)
df_lstm_all.head()
Out[13]:
userID TaskID VersionID Blobs
0 1 0 2 [[[0, 1, 1, 2, 1, 1, 2, 0, 1, 2, 1, 0, 0, 0, 0...
1 1 0 3 [[[0, 2, 191, 0, 0, 1, 2, 0, 1, 1, 2, 0, 1, 0,...
2 1 0 4 [[[0, 0, 0, 0, 1, 2, 2, 1, 0, 0, 0, 0, 1, 0, 2...
3 1 0 5 [[[0, 1, 0, 2, 2, 0, 1, 0, 3, 1, 1, 0, 0, 0, 0...
4 1 0 6 [[[1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0...
In [15]:
df_lstm_all["Length"] = df_lstm_all.Blobs.apply(lambda x: x.shape[0])
In [16]:
df_lstm_all.Length.hist(range=(0,300), bins=30)
Out[16]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f563b2b4278>
In [17]:
len(df_lstm_all[df_lstm_all.Length > 50]) / len(df_lstm_all)
Out[17]:
0.05110421609782807
In [18]:
df_lstm_all.Length.describe().round(1)
Out[18]:
count    5479.0
mean       21.2
std        15.5
min         1.0
25%        13.0
50%        18.0
75%        26.0
max       251.0
Name: Length, dtype: float64
In [19]:
lengths = []
for index, row in df_lstm_all.iterrows():
    lengths.append(row.Blobs.shape[0])
df_lstm_all["BlobCount"] = lengths
# add a column for pure gesture recognition: TaskID % 17 collapses the finger variant
# of a gesture (TaskID >= 17) onto the same id as its knuckle counterpart
df_lstm_all["GestureOnly"] = df_lstm_all.TaskID % 17
In [20]:
df_lstm_all.BlobCount.describe()
Out[20]:
count    5479.000000
mean       21.239460
std        15.541015
min         1.000000
25%        13.000000
50%        18.000000
75%        26.000000
max       251.000000
Name: BlobCount, dtype: float64
In [21]:
df_lstm_all.groupby(df_lstm_all.GestureOnly)["BlobCount"].agg(["mean", "std"])
Out[21]:
mean std
GestureOnly
0 19.000000 15.626834
1 18.387387 12.717864
2 13.418006 11.615571
3 13.192182 10.096861
4 13.439344 9.632580
5 13.157407 9.709874
6 17.198813 8.017407
7 18.627841 7.706187
8 30.096463 14.204850
9 19.134375 8.786072
10 20.289308 10.179677
11 19.311526 20.655269
12 20.683230 10.314326
13 20.357367 9.820602
14 21.581538 11.342465
15 27.737654 13.548982
16 51.783708 19.654648
In [22]:
# filter on gesture length: keep sequences with 5 to 100 frames
print("before: %s" % len(df_lstm_all))
df_lstm = df_lstm_all[(df_lstm_all.BlobCount <= 100) & (df_lstm_all.BlobCount >= 5)]
print("after: %s" % len(df_lstm))
print("ratio: %s" % ((len(df_lstm_all) - len(df_lstm)) / len(df_lstm_all) * 100))
before: 5479
after: 5043
ratio: 7.957656506661799
In [23]:
df_lstm.BlobCount.describe()
Out[23]:
count    5043.000000
mean       22.512195
std        13.430134
min         5.000000
25%        14.000000
50%        19.000000
75%        27.000000
max        99.000000
Name: BlobCount, dtype: float64
In [24]:
lengths = []
for index, row in df_lstm.iterrows():
    lengths.append(row.Blobs.shape[0])
In [25]:
df_lstm["BlobCount"] = lengths
/usr/local/lib/python3.6/dist-packages/ipykernel_launcher.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
In [27]:
df_lstm.BlobCount.describe()
Out[27]:
count    5043.000000
mean       22.512195
std        13.430134
min         5.000000
25%        14.000000
50%        19.000000
75%        27.000000
max        99.000000
Name: BlobCount, dtype: float64
In [28]:
def lerp(a, b, c=0.5):
    return c * b + (1.0 - c) * a

#Sven's new blob detection (full version: returns the cropped blobs and their centers)
def detect_blobs_return_old(image, task):
    #image = e.Image
    # pad the 27x15 frame with a 1-px border so blobs touching the edge stay closed
    large = np.ones((29,17), dtype=np.uint8)
    large[1:28,1:16] = np.copy(image)
    temp, thresh = cv2.threshold(cv2.bitwise_not(large), 205, 255, cv2.THRESH_BINARY)
    contours, hierarchy = cv2.findContours(thresh, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)
    # keep plausible blob sizes only
    contours = [a for a in contours if cv2.contourArea(a) > 8 and cv2.contourArea(a) < 255]
    lstBlob  = []
    lstCenter = []
    lstMin = []
    lstMax = []
    count = 0
    contours.sort(key=lambda a: cv2.contourArea(a))
    if len(contours) > 0:
        # two-touch tasks (two fingers or two knuckles) keep up to two blobs
        cont_count = 2 if task in [1, 6, 7, 18, 23, 24] and len(contours) > 1 else 1
        for i in range(1, cont_count + 1):
            max_contour = contours[-1 * i]
            xmax, ymax = np.max(max_contour.reshape(len(max_contour),2), axis=0)
            xmin, ymin = np.min(max_contour.reshape(len(max_contour),2), axis=0)
            # centroid from image moments, shifted back by the 1-px padding
            M = cv2.moments(max_contour)
            cX = int(M["m10"] / M["m00"]) - 1
            cY = int(M["m01"] / M["m00"]) - 1
            #croped_im = np.zeros((27,15))
            # crop the blob's bounding box (with a 1-px margin) out of the padded frame
            blob = large[max(ymin - 1, 0):min(ymax + 1, large.shape[0]),max(xmin - 1, 0):min(xmax + 1, large.shape[1])]
            #croped_im[0:blob.shape[0],0:blob.shape[1]] = blob
            #return (1, [croped_im])
            lstBlob.append(blob)
            lstCenter.append((cY, cX))
            lstMin.append(xmax-xmin)
            lstMax.append(ymax-ymin)
            count = count + 1
        return (count, lstBlob, lstCenter)
    else:
        return (0, [np.zeros((29, 19))], 0, 0)
In [29]:
# decides whether normalization is necessary
# and truncates the sequence or pads it with zero frames
def normalize_blobs(blobs, new_len=50):
    new_count = new_len - blobs.shape[0]
    if new_count == 0:
        return blobs
    elif new_count > 0:
        temp = np.array([np.zeros((27, 15))] * new_count)
        return np.append(blobs, temp, axis=0)
    else:
        return blobs[0:new_len]
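A short illustrative check of the padding/truncation behavior on synthetic sequences:

short_seq = np.zeros((12, 27, 15))
long_seq = np.zeros((80, 27, 15))
print(normalize_blobs(short_seq, 50).shape)  # (50, 27, 15): zero frames appended
print(normalize_blobs(long_seq, 50).shape)   # (50, 27, 15): truncated to the first 50 frames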
In [30]:
%%time
# pad or truncate all image sequences to exactly 50 frames
df_lstm_norm = df_lstm.copy(deep=True)
new_blobs = []
for index, row in df_lstm.iterrows():
    new_blobs.append(normalize_blobs(row.Blobs, 50))

df_lstm_norm.Blobs = new_blobs

lengths = []
for index, row in df_lstm_norm.iterrows():
    lengths.append(row.Blobs.shape[0])
df_lstm_norm["BlobCount"] = lengths
CPU times: user 1.48 s, sys: 236 ms, total: 1.71 s
Wall time: 1.71 s
In [31]:
df_lstm_norm.BlobCount.describe()
Out[31]:
count    5043.0
mean       50.0
std         0.0
min        50.0
25%        50.0
50%        50.0
75%        50.0
max        50.0
Name: BlobCount, dtype: float64
In [32]:
df_lstm_norm.to_pickle("DataStudyEvaluation/df_lstm_norm50.pkl")
In [33]:
df_lstm_norm.head()
Out[33]:
userID TaskID VersionID Blobs Length BlobCount GestureOnly
0 1 0 2 [[[0.0, 1.0, 1.0, 2.0, 1.0, 1.0, 2.0, 0.0, 1.0... 25 50 0
1 1 0 3 [[[0.0, 2.0, 191.0, 0.0, 0.0, 1.0, 2.0, 0.0, 1... 12 50 0
2 1 0 4 [[[0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 2.0, 1.0, 0.0... 14 50 0
3 1 0 5 [[[0.0, 1.0, 0.0, 2.0, 2.0, 0.0, 1.0, 0.0, 3.0... 11 50 0
4 1 0 6 [[[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0... 16 50 0
In [34]:
df_lstm_norm.userID.unique()
Out[34]:
array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12])