Preprocessing for LSTM: Blob Detection and Cutting
In [1]:
%matplotlib inline
from scipy.odr import *
from scipy.stats import *
import numpy as np
import pandas as pd
import os
import time
import matplotlib.pyplot as plt
import ast
from multiprocessing import Pool, cpu_count
import scipy
from IPython import display
from matplotlib.patches import Rectangle
from sklearn.metrics import mean_squared_error
import json
import scipy.stats as st
from sklearn.metrics import r2_score
from matplotlib import cm
from mpl_toolkits.mplot3d import axes3d
import copy
from sklearn.model_selection import LeaveOneOut, LeavePOut
import cv2
In [2]:
df_filtered = pd.read_pickle("DataStudyEvaluation/df_lstm.pkl")
In [3]:
df_filtered.head()
Out[3]:
 | userID | Timestamp | Current_Task | Task_amount | TaskID | VersionID | RepetitionID | Actual_Data | Is_Pause | Image | IsMax | MaxRepetition
---|---|---|---|---|---|---|---|---|---|---|---|---|
178293 | 1 | 1553521747262 | 16 | 510 | 0 | 2 | 1 | True | False | [0, 0, 0, 0, 0, 0, 0, 0, 2, 1, 3, 1, 1, 1, 2, ... | True | 1 |
178294 | 1 | 1553521747302 | 16 | 510 | 0 | 2 | 1 | True | False | [0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ... | True | 1 |
178295 | 1 | 1553521747342 | 16 | 510 | 0 | 2 | 1 | True | False | [0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ... | True | 1 |
178296 | 1 | 1553521747388 | 16 | 510 | 0 | 2 | 1 | True | False | [0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ... | True | 1 |
178297 | 1 | 1553521747422 | 16 | 510 | 0 | 2 | 1 | True | False | [0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ... | True | 1 |
In [4]:
len(df_filtered.userID.unique())
Out[4]:
12
In [5]:
df_filtered.Image = df_filtered.Image.apply(lambda x: x.reshape(27, 15))
df_filtered.Image = df_filtered.Image.apply(lambda x: x.clip(min=0, max=255))
df_filtered.Image = df_filtered.Image.apply(lambda x: x.astype(np.uint8))
df_filtered["ImageSum"] = df_filtered.Image.apply(lambda x: np.sum(x))
In [6]:
# LSTM: new blob detection (only detects whether there are any blobs at all)
def detect_blobs(image):
    # pad the 27x15 frame with a one-pixel border so contours touching the edge stay closed
    large = np.ones((29, 17), dtype=np.uint8)
    large[1:28, 1:16] = image
    temp, thresh = cv2.threshold(cv2.bitwise_not(large), 200, 255, cv2.THRESH_BINARY)
    contours, hierarchy = cv2.findContours(thresh, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)
    # keep only plausibly finger/knuckle-sized blobs
    contours = [a for a in contours if cv2.contourArea(a) > 8 and cv2.contourArea(a) < 255]
    return len(contours) > 0
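As a quick sanity check (a minimal sketch, not part of the original pipeline), the detector can be exercised on a synthetic frame; the 4×4 patch and the intensity value 220 below are arbitrary illustrative choices:

# hypothetical check: a bright 4x4 patch should register as a blob, an empty frame should not
frame = np.zeros((27, 15), dtype=np.uint8)
frame[10:14, 5:9] = 220
assert detect_blobs(frame) == True
assert detect_blobs(np.zeros((27, 15), dtype=np.uint8)) == False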
In [7]:
%%time
pool = Pool(cpu_count() - 1)
temp_blobs = pool.map(detect_blobs, df_filtered.Image)
pool.close()
CPU times: user 1.93 s, sys: 581 ms, total: 2.51 s
Wall time: 2.71 s
In [8]:
df_filtered["ContainsBlobs"] = temp_blobs
In [9]:
# Label whether the touch was performed with the knuckle or the finger
def f(row):
    if row['TaskID'] < 17:
        #val = "Knuckle"
        val = 0
    elif row['TaskID'] >= 17:
        #val = "Finger"
        val = 1
    return val

df_filtered['InputMethod'] = df_filtered.apply(f, axis=1)
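Since the label depends only on a threshold over TaskID, the same column could also be computed without the row-wise apply; a vectorized equivalent (sketch, not from the original notebook):

# TaskID 0-16 -> knuckle (0), TaskID 17+ -> finger (1)
df_filtered['InputMethod'] = (df_filtered.TaskID >= 17).astype(int)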
In [10]:
# Problem: some timestamps are strings (XXXXE+XXXX), which is not accurate enough; switching to the index instead
"""def cast_to_int(x):
    if type(x) == int:
        return x
    x = str(x).replace(",", ".")
    return int(float(x))

df_filtered.Timestamp = df_filtered.Timestamp.map(cast_to_int)"""
df_filtered.index = range(len(df_filtered))
In [11]:
%%time
# trim image sequences down to only the frames between the first and last detected blob
UserIDs = []
TaskIDs = []
VersionIDs = []
Blobs = []

for userID in df_filtered.userID.unique():
    print(userID)
    for TaskID in df_filtered[df_filtered.userID == userID].TaskID.unique():
        for VersionID in df_filtered[(df_filtered.userID == userID) & (df_filtered.TaskID == TaskID)].VersionID.unique():
            first_blob = -1
            last_blob = -1
            for index, row in df_filtered[(df_filtered.userID == userID) & (df_filtered.TaskID == TaskID)
                                          & (df_filtered.VersionID == VersionID)].iterrows():
                if row.ContainsBlobs:
                    last_blob = index
                    if first_blob == -1:
                        first_blob = index
            if first_blob >= 0 and last_blob >= 0:
                UserIDs.append(userID)
                TaskIDs.append(TaskID)
                VersionIDs.append(VersionID)
                Blobs.append(df_filtered[(df_filtered.userID == userID) & (df_filtered.TaskID == TaskID)
                                         & (df_filtered.VersionID == VersionID)
                                         & (df_filtered.index >= first_blob)
                                         & (df_filtered.index <= last_blob)].Image.tolist())
1
2
3
4
5
6
7
8
9
10
11
12
CPU times: user 1min 32s, sys: 60.2 ms, total: 1min 32s
Wall time: 1min 32s
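The nested loops above re-filter the full DataFrame for every (userID, TaskID, VersionID) combination, which is why the cell takes roughly 1.5 minutes. A groupby-based sketch of the same trimming, assuming rows are already in temporal order within each combination (as established by the index reset in In [10]), could look like this (illustrative, not the original code):

# one pass per (userID, TaskID, VersionID) group instead of repeated boolean filtering
records = []
for (uid, tid, vid), group in df_filtered.groupby(["userID", "TaskID", "VersionID"]):
    blob_idx = group.index[group.ContainsBlobs.values]
    if len(blob_idx) > 0:
        # keep only the frames between the first and last detected blob (inclusive)
        trimmed = group.loc[blob_idx[0]:blob_idx[-1], "Image"].tolist()
        records.append((uid, tid, vid, trimmed))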
In [12]:
UserIDs = np.array(UserIDs, dtype=np.int64)
TaskIDs = np.array(TaskIDs, dtype=np.int64)
VersionIDs = np.array(VersionIDs, dtype=np.int64)
In [13]:
df_lstm_all = pd.DataFrame()
df_lstm_all["userID"] = UserIDs
df_lstm_all["TaskID"] = TaskIDs
df_lstm_all["VersionID"] = VersionIDs
df_lstm_all["Blobs"] = Blobs
df_lstm_all.Blobs = df_lstm_all.Blobs.map(np.array)
df_lstm_all.head()
Out[13]:
 | userID | TaskID | VersionID | Blobs
---|---|---|---|---|
0 | 1 | 0 | 2 | [[[0, 1, 1, 2, 1, 1, 2, 0, 1, 2, 1, 0, 0, 0, 0... |
1 | 1 | 0 | 3 | [[[0, 2, 191, 0, 0, 1, 2, 0, 1, 1, 2, 0, 1, 0,... |
2 | 1 | 0 | 4 | [[[0, 0, 0, 0, 1, 2, 2, 1, 0, 0, 0, 0, 1, 0, 2... |
3 | 1 | 0 | 5 | [[[0, 1, 0, 2, 2, 0, 1, 0, 3, 1, 1, 0, 0, 0, 0... |
4 | 1 | 0 | 6 | [[[1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0... |
In [15]:
df_lstm_all["Length"] = df_lstm_all.Blobs.apply(lambda x: x.shape[0])
In [16]:
df_lstm_all.Length.hist(range=(0,300), bins=30)
Out[16]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f563b2b4278>
In [17]:
len(df_lstm_all[df_lstm_all.Length > 50]) / len(df_lstm_all)
Out[17]:
0.05110421609782807
In [18]:
df_lstm_all.Length.describe().round(1)
Out[18]:
count    5479.0
mean       21.2
std        15.5
min         1.0
25%        13.0
50%        18.0
75%        26.0
max       251.0
Name: Length, dtype: float64
In [19]:
lengths = []
for index, row in df_lstm_all.iterrows():
    lengths.append(row.Blobs.shape[0])
df_lstm_all["BlobCount"] = lengths

# add a column for pure gesture recognition without finger/knuckle
df_lstm_all["GestureOnly"] = df_lstm_all.TaskID % 17
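The modulo folds each finger task onto its knuckle counterpart, so GestureOnly encodes only the gesture shape; for example, knuckle TaskID 3 and finger TaskID 20 both map to gesture 3. A one-line check (illustrative only):

# each finger task (17-33) maps onto its knuckle counterpart (0-16)
for knuckle_task in range(17):
    assert knuckle_task % 17 == (knuckle_task + 17) % 17 == knuckle_task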
In [20]:
df_lstm_all.BlobCount.describe()
Out[20]:
count    5479.000000
mean       21.239460
std        15.541015
min         1.000000
25%        13.000000
50%        18.000000
75%        26.000000
max       251.000000
Name: BlobCount, dtype: float64
In [21]:
df_lstm_all.groupby(df_lstm_all.GestureOnly)["BlobCount"].agg(["mean", "std"])
Out[21]:
GestureOnly | mean | std
---|---|---
0 | 19.000000 | 15.626834 |
1 | 18.387387 | 12.717864 |
2 | 13.418006 | 11.615571 |
3 | 13.192182 | 10.096861 |
4 | 13.439344 | 9.632580 |
5 | 13.157407 | 9.709874 |
6 | 17.198813 | 8.017407 |
7 | 18.627841 | 7.706187 |
8 | 30.096463 | 14.204850 |
9 | 19.134375 | 8.786072 |
10 | 20.289308 | 10.179677 |
11 | 19.311526 | 20.655269 |
12 | 20.683230 | 10.314326 |
13 | 20.357367 | 9.820602 |
14 | 21.581538 | 11.342465 |
15 | 27.737654 | 13.548982 |
16 | 51.783708 | 19.654648 |
In [22]:
# filter on gesture lengths; .copy() avoids SettingWithCopyWarning on later column assignments
print("before: %s" % len(df_lstm_all))
df_lstm = df_lstm_all[(df_lstm_all.BlobCount <= 100) & (df_lstm_all.BlobCount >= 5)].copy()
print("after: %s" % len(df_lstm))
print("ratio: %s" % ((len(df_lstm_all) - len(df_lstm)) / len(df_lstm_all) * 100))
before: 5479
after: 5043
ratio: 7.957656506661799
In [23]:
df_lstm.BlobCount.describe()
Out[23]:
count    5043.000000
mean       22.512195
std        13.430134
min         5.000000
25%        14.000000
50%        19.000000
75%        27.000000
max        99.000000
Name: BlobCount, dtype: float64
In [24]:
lengths = []
for index, row in df_lstm.iterrows():
    lengths.append(row.Blobs.shape[0])
In [25]:
df_lstm["BlobCount"] = lengths
/usr/local/lib/python3.6/dist-packages/ipykernel_launcher.py:1: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy """Entry point for launching an IPython kernel.
In [27]:
df_lstm.BlobCount.describe()
Out[27]:
count    5043.000000
mean       22.512195
std        13.430134
min         5.000000
25%        14.000000
50%        19.000000
75%        27.000000
max        99.000000
Name: BlobCount, dtype: float64
In [28]:
def lerp(a, b, c=0.5):
    return c * b + (1.0 - c) * a

# Sven's new blob detection
def detect_blobs_return_old(image, task):
    # pad the 27x15 frame with a one-pixel border so contours touching the edge stay closed
    large = np.ones((29, 17), dtype=np.uint8)
    large[1:28, 1:16] = np.copy(image)
    temp, thresh = cv2.threshold(cv2.bitwise_not(large), 205, 255, cv2.THRESH_BINARY)
    contours, hierarchy = cv2.findContours(thresh, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)
    # keep only plausibly finger/knuckle-sized blobs
    contours = [a for a in contours if cv2.contourArea(a) > 8 and cv2.contourArea(a) < 255]
    lstBlob = []
    lstCenter = []
    lstMin = []
    lstMax = []
    count = 0
    contours.sort(key=lambda a: cv2.contourArea(a))
    if len(contours) > 0:
        # two-touch tasks may produce two blobs (finger or knuckle)
        cont_count = 2 if task in [1, 6, 7, 18, 23, 24] and len(contours) > 1 else 1
        for i in range(1, cont_count + 1):
            max_contour = contours[-1 * i]
            xmax, ymax = np.max(max_contour.reshape(len(max_contour), 2), axis=0)
            xmin, ymin = np.min(max_contour.reshape(len(max_contour), 2), axis=0)
            # centroid from image moments, shifted back by 1 to undo the padding offset
            M = cv2.moments(max_contour)
            cX = int(M["m10"] / M["m00"]) - 1
            cY = int(M["m01"] / M["m00"]) - 1
            # crop the blob with a one-pixel margin, clamped to the padded image
            blob = large[max(ymin - 1, 0):min(ymax + 1, large.shape[0]),
                         max(xmin - 1, 0):min(xmax + 1, large.shape[1])]
            lstBlob.append(blob)
            lstCenter.append((cY, cX))
            lstMin.append(xmax - xmin)
            lstMax.append(ymax - ymin)
            count = count + 1
        return (count, lstBlob, lstCenter)
    else:
        return (0, [np.zeros((29, 19))], 0, 0)
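detect_blobs_return_old is not called in the rest of this notebook. Note that it returns a 3-tuple (count, blob crops, centers) when a blob is found but a 4-tuple in the no-blob case, so a caller should index the result rather than unpack it directly. A hypothetical call on a single frame (illustrative only):

# task id 18 is one of the two-touch tasks, so up to two blobs may be returned
result = detect_blobs_return_old(df_filtered.Image.iloc[0], 18)
count, blob_crops = result[0], result[1]
print(count, [b.shape for b in blob_crops])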
In [29]:
# decides whether normalization is necessary
# and cuts or pads with zero frames
def normalize_blobs(blobs, new_len=50):
    new_count = new_len - blobs.shape[0]
    if new_count == 0:
        return blobs
    elif new_count > 0:
        temp = np.array([np.zeros((27, 15))] * new_count)
        return np.append(blobs, temp, axis=0)
    else:
        return blobs[0:new_len]
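A quick illustration of the padding/truncation behaviour (a sketch with random toy sequences, not part of the original notebook):

# a 12-frame sequence is padded with 38 zero frames, a 70-frame sequence is cut to 50
short_seq = np.random.randint(0, 255, size=(12, 27, 15))
long_seq = np.random.randint(0, 255, size=(70, 27, 15))
assert normalize_blobs(short_seq, 50).shape == (50, 27, 15)
assert normalize_blobs(long_seq, 50).shape == (50, 27, 15)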
In [30]:
%%time
# normalize all image sequences to a fixed length of 50 frames
df_lstm_norm = df_lstm.copy(deep=True)

new_blobs = []
for index, row in df_lstm.iterrows():
    new_blobs.append(normalize_blobs(row.Blobs, 50))
df_lstm_norm.Blobs = new_blobs

lengths = []
for index, row in df_lstm_norm.iterrows():
    lengths.append(row.Blobs.shape[0])
df_lstm_norm["BlobCount"] = lengths
CPU times: user 1.48 s, sys: 236 ms, total: 1.71 s
Wall time: 1.71 s
In [31]:
df_lstm_norm.BlobCount.describe()
Out[31]:
count    5043.0
mean       50.0
std         0.0
min        50.0
25%        50.0
50%        50.0
75%        50.0
max        50.0
Name: BlobCount, dtype: float64
In [32]:
df_lstm_norm.to_pickle("DataStudyEvaluation/df_lstm_norm50.pkl")
In [33]:
df_lstm_norm.head()
Out[33]:
 | userID | TaskID | VersionID | Blobs | Length | BlobCount | GestureOnly
---|---|---|---|---|---|---|---|
0 | 1 | 0 | 2 | [[[0.0, 1.0, 1.0, 2.0, 1.0, 1.0, 2.0, 0.0, 1.0... | 25 | 50 | 0 |
1 | 1 | 0 | 3 | [[[0.0, 2.0, 191.0, 0.0, 0.0, 1.0, 2.0, 0.0, 1... | 12 | 50 | 0 |
2 | 1 | 0 | 4 | [[[0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 2.0, 1.0, 0.0... | 14 | 50 | 0 |
3 | 1 | 0 | 5 | [[[0.0, 1.0, 0.0, 2.0, 2.0, 0.0, 1.0, 0.0, 3.0... | 11 | 50 | 0 |
4 | 1 | 0 | 6 | [[[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0... | 16 | 50 | 0 |
In [34]:
df_lstm_norm.userID.unique()
Out[34]:
array([ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12])