knuckletouch/python/Step_33_CNN_PreprocessData....

40 KiB

In [1]:
%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from multiprocessing import Pool, cpu_count

import copy

import cv2
In [2]:
# Load the full recording table, then keep only the rows where
# Actual_Data is True and Is_Pause is False.
dfAll = pd.read_pickle("DataStudyEvaluation/AllData.pkl")
keep_rows = dfAll.Actual_Data.eq(True) & dfAll.Is_Pause.eq(False)
df = dfAll[keep_rows]
df.head()
Out[2]:
userID Timestamp Current_Task Task_amount TaskID VersionID RepetitionID Actual_Data Is_Pause Image
56454 12 1553865148939 1 510 17 2 0 True False [0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, ...
56455 12 1553865148981 1 510 17 2 0 True False [0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, ...
56456 12 1553865149021 1 510 17 2 0 True False [0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, ...
56457 12 1553865149060 1 510 17 2 0 True False [0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, ...
56458 12 1553865149099 1 510 17 2 0 True False [0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, ...
In [3]:
# Distinct userID values that remain after filtering.
df["userID"].unique()
Out[3]:
array([12,  5,  1, 10,  6,  3,  7,  8,  9, 11])
In [4]:
%%time
def is_max(df):
    """Mark the rows of one group that carry the group's highest RepetitionID.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows of a single group; must contain a ``RepetitionID`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is left untouched) with two extra columns:
        ``IsMax`` (True where ``RepetitionID`` equals the group maximum) and
        ``MaxRepetition`` (the group maximum repeated on every row).
    """
    max_repetition = df.RepetitionID.max()
    # assign() works on a copy, and a scalar is broadcast to every row, so
    # neither an explicit deep copy nor a hand-built list is needed.
    return df.assign(
        IsMax=df.RepetitionID == max_repetition,
        MaxRepetition=max_repetition,
    )

# Annotate every (userID, TaskID, VersionID) group with its highest
# RepetitionID, processing the groups in parallel.
df_grp = df.groupby(["userID", "TaskID", "VersionID"])
# Leave one core free, but never ask for zero workers: Pool(0) raises
# ValueError, which is what cpu_count() - 1 yields on a single-core machine.
n_workers = max(1, cpu_count() - 1)
# The context manager shuts the worker processes down even if is_max raises.
with Pool(n_workers) as pool:
    result_lst = pool.map(is_max, [grp for _, grp in df_grp])
# Rows come back ordered group by group, not in the original row order.
df = pd.concat(result_lst)
CPU times: user 12.2 s, sys: 2.34 s, total: 14.6 s
Wall time: 14.2 s
In [5]:
# Turn every Image into a 27x15 uint8 matrix (values clipped to 0..255)
# in a single pass over the column.
df["Image"] = df["Image"].apply(
    lambda raw: raw.reshape(27, 15).clip(min=0, max=255).astype(np.uint8)
)
# Sum of all pixel values of each frame.
df["ImageSum"] = df["Image"].apply(lambda img: np.sum(img))
In [6]:
# Persist the filtered, annotated frame to disk as dfFiltered.pkl.
df.to_pickle("DataStudyEvaluation/dfFiltered.pkl")
In [7]:
# Row counts: the complete recording (dfAll) versus the rows kept in df.
print("recorded actual: %s, used data: %s" % (dfAll.shape[0], df.shape[0]))
recorded actual: 608084, used data: 413500
In [8]:
# Reload the filtered frame from dfFiltered.pkl.
df = pd.read_pickle("DataStudyEvaluation/dfFiltered.pkl")
In [9]:
# Preview the first rows of the reloaded frame.
df.head()
Out[9]:
userID Timestamp Current_Task Task_amount TaskID VersionID RepetitionID Actual_Data Is_Pause Image IsMax MaxRepetition ImageSum
178160 1 1553521741802 16 510 0 2 0 True False [[0, 1, 1, 1, 0, 1, 0, 4, 1, 0, 2, 1, 1, 0, 1]... False 1 286
178161 1 1553521741842 16 510 0 2 0 True False [[0, 1, 1, 1, 0, 1, 0, 4, 1, 0, 2, 1, 1, 0, 1]... False 1 319
178162 1 1553521741882 16 510 0 2 0 True False [[0, 1, 1, 1, 0, 1, 0, 4, 1, 0, 2, 1, 1, 0, 1]... False 1 72
178163 1 1553521741922 16 510 0 2 0 True False [[0, 1, 1, 1, 0, 1, 0, 4, 1, 0, 2, 1, 1, 0, 1]... False 1 288
178164 1 1553521741990 16 510 0 2 0 True False [[0, 1, 1, 1, 0, 1, 0, 4, 1, 0, 2, 1, 1, 0, 1]... False 1 308
In [10]:
# Label every row with its input method, derived from TaskID:
#   TaskID <  17 -> knuckle (InputMethod 0, Input "Knuckle")
#   TaskID >= 17 -> finger  (InputMethod 1, Input "Finger")
# A missing TaskID would compare as "not >= 17" and be labelled knuckle
# silently, so fail loudly instead.
assert df['TaskID'].notna().all(), "TaskID must not contain missing values"
is_finger = df['TaskID'] >= 17
# Vectorised over the whole column instead of a row-wise apply.
df['InputMethod'] = is_finger.astype(np.int64)
df['Input'] = np.where(is_finger, "Finger", "Knuckle")
In [11]:
# Sven's new blob detection
def detect_blobs(image, task):
    """Crop the largest blob (or the two largest) out of one frame.

    The frame is copied into the interior of a 29x17 uint8 canvas of ones
    (a 1-pixel border around a 27x15 slot). A binary mask is built in which
    every canvas pixel whose inverted value exceeds 200 (i.e. pixel < 55) is
    set, and the contours of that mask with an area strictly between 8 and
    255 are kept. The contour with the largest area is cropped out of the
    canvas; a second one is cropped too when ``task`` is one of
    [1, 6, 7, 18, 23, 24] (the two-finger / two-knuckle tasks) and at least
    two contours qualified.

    Parameters
    ----------
    image : array-like that fits the 27x15 slot of the canvas
    task : task identifier, compared against the two-touch task list

    Returns
    -------
    tuple
        ``(count, blobs, widths, heights)`` where ``blobs`` are canvas crops
        spanning rows ``ymin - 1 .. ymax`` and columns ``xmin - 1 .. xmax``
        of each contour (clamped to the canvas), ``widths`` holds
        ``xmax - xmin`` and ``heights`` holds ``ymax - ymin``.
        When no contour qualifies the result is
        ``(0, [zeros of shape (29, 19)], 0, 0)``.
        NOTE(review): in that case widths/heights are plain ints rather than
        lists and the placeholder shape differs from the 29x17 canvas --
        confirm no caller relies on either before changing it.
    """
    canvas = np.ones((29, 17), dtype=np.uint8)
    canvas[1:28, 1:16] = np.copy(image)

    _, mask = cv2.threshold(cv2.bitwise_not(canvas), 200, 255, cv2.THRESH_BINARY)
    found, _hierarchy = cv2.findContours(mask, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)

    # Keep plausible blob sizes only, ordered by ascending area.
    candidates = sorted(
        (c for c in found if 8 < cv2.contourArea(c) < 255),
        key=cv2.contourArea,
    )
    if not candidates:
        return (0, [np.zeros((29, 19))], 0, 0)

    two_touch_task = task in [1, 6, 7, 18, 23, 24]
    wanted = 2 if (two_touch_task and len(candidates) > 1) else 1

    blobs, widths, heights = [], [], []
    # Walk the candidates from largest to smaller.
    for contour in reversed(candidates[-wanted:]):
        points = contour.reshape(len(contour), 2)
        xmax, ymax = np.max(points, axis=0)
        xmin, ymin = np.min(points, axis=0)
        row_start = max(ymin - 1, 0)
        row_stop = min(ymax + 1, canvas.shape[0])
        col_start = max(xmin - 1, 0)
        col_stop = min(xmax + 1, canvas.shape[1])
        blobs.append(canvas[row_start:row_stop, col_start:col_stop])
        widths.append(xmax - xmin)
        heights.append(ymax - ymin)
    return (len(blobs), blobs, widths, heights)
In [12]:
%%time
# Run detect_blobs over every (Image, TaskID) pair in parallel.
# cpu_count is the multiprocessing function imported in the first cell; the
# `os` module is never imported in this notebook, so os.cpu_count() would
# raise NameError on a fresh kernel.
n_workers = max(1, cpu_count() - 1)  # never request zero workers
# The context manager shuts the workers down even if detect_blobs raises.
with Pool(n_workers) as pool:
    temp_blobs = pool.starmap(detect_blobs, zip(df.Image, df.TaskID))
CPU times: user 5.65 s, sys: 4.52 s, total: 10.2 s
Wall time: 9.76 s
In [13]:
# Spread the fields of each detect_blobs result tuple over four columns:
# position 0 -> BlobCount, 1 -> BlobImages, 2 -> BlobW, 3 -> BlobH.
for position, column in enumerate(["BlobCount", "BlobImages", "BlobW", "BlobH"]):
    df[column] = [result[position] for result in temp_blobs]
In [14]:
# How many frames yielded 0, 1 or 2 blobs.
df["BlobCount"].value_counts()
Out[14]:
0    334475
1     73449
2      5576
Name: BlobCount, dtype: int64
In [15]:
def _pick_blob(frame, blob_count, blob_index):
    """Return the rows with ``blob_count`` blobs, reduced to blob ``blob_index``.

    The list-valued BlobImages / BlobW / BlobH columns of the returned copy
    hold only the entry at ``blob_index``; ``frame`` itself is not modified.
    """
    picked = frame[frame.BlobCount == blob_count].copy(deep=True)
    for column in ("BlobImages", "BlobW", "BlobH"):
        picked[column] = picked[column].apply(lambda per_blob: per_blob[blob_index])
    return picked


# One row per blob: single-blob frames contribute one row, two-blob frames
# contribute two rows (first blob in dfY, second blob in dfZ). Frames
# without any blob are dropped.
dfX = _pick_blob(df, 1, 0)
dfY = _pick_blob(df, 2, 0)
dfZ = _pick_blob(df, 2, 1)

# DataFrame.append was removed in pandas 2.0; pd.concat gives the same result.
df = pd.concat([dfX, dfY, dfZ])
In [16]:
# Number of rows in df once each blob has its own row.
print("Sample Size not Argumented:", len(df))
Sample Size not Argumented: 84601
In [17]:
# Area measure per blob: product of the BlobW and BlobH columns.
df["BlobArea"] = df["BlobW"].mul(df["BlobH"])
In [18]:
# Summary statistics of BlobArea, rounded to one decimal.
df["BlobArea"].describe().round(1)
Out[18]:
count    84601.0
mean        16.5
std          5.9
min         12.0
25%         12.0
50%         16.0
75%         16.0
max         72.0
Name: BlobArea, dtype: float64
In [19]:
# Same BlobArea summary, split by the Input label (Finger / Knuckle).
df.groupby("Input")["BlobArea"].describe().round(1)
Out[19]:
count mean std min 25% 50% 75% max
Input
Finger 59879.0 17.3 5.8 12.0 12.0 16.0 20.0 56.0
Knuckle 24722.0 14.8 5.7 12.0 12.0 12.0 16.0 72.0
In [20]:
# Sum of all pixel values inside each cropped blob.
df["BlobSum"] = df["BlobImages"].apply(lambda blob: np.sum(blob))
In [21]:
# Summary statistics of BlobSum.
df["BlobSum"].describe()
Out[21]:
count    84601.000000
mean      1238.209170
std        485.150602
min        467.000000
25%        930.000000
50%       1094.000000
75%       1383.000000
max       4275.000000
Name: BlobSum, dtype: float64
In [22]:
# Histogram of BlobSum.
df["BlobSum"].hist()
Out[22]:
<matplotlib.axes._subplots.AxesSubplot at 0x7ff3f0375b38>
In [23]:
# Faint blobs: rows whose BlobSum does not exceed 255, i.e. the pixels were
# only hit "a little".
dfX = df.loc[df["BlobSum"] <= 255]
len(dfX)
Out[23]:
0
In [24]:
# Row count of df at this point (df itself is not modified by the check above).
print("Sample Size argumented:", len(df))
Sample Size argumented: 84601
In [25]:
def pasteToEmpty(blob, shape=(27, 15)):
    """Paste ``blob`` into the top-left corner of a zero-filled canvas.

    Parameters
    ----------
    blob : numpy.ndarray
        2-D array to paste.
    shape : tuple of int, optional
        ``(rows, cols)`` of the canvas; defaults to ``(27, 15)``.

    Returns
    -------
    numpy.ndarray
        Float array of ``shape`` with ``blob`` in its top-left corner and
        zeros elsewhere. Rows/columns of ``blob`` that do not fit on the
        canvas are cut off instead of raising a broadcast ValueError.
    """
    canvas = np.zeros(shape)
    # Clamp to the canvas so an oversized blob cannot break the assignment.
    rows = min(blob.shape[0], shape[0])
    cols = min(blob.shape[1], shape[1])
    canvas[:rows, :cols] = blob[:rows, :cols]
    return canvas
In [26]:
# Place every cropped blob on its own fixed-size canvas.
df["Blobs"] = df["BlobImages"].apply(pasteToEmpty)
In [27]:
# Persist the complete frame, including all blob columns, as df_statistics.pkl.
df.to_pickle("DataStudyEvaluation/df_statistics.pkl")
In [28]:
# Export only the columns listed below to df_blobs_area.pkl.
export_columns = ["userID", "TaskID", "Blobs", "InputMethod"]
df[export_columns].to_pickle("DataStudyEvaluation/df_blobs_area.pkl")

display blobs

In [29]:
# Show one padded blob with every pixel value written on top of it.
data_point = 100
data = df.Blobs.iloc[data_point]
print(df.iloc[data_point])

# Create the figure and its axes in one step. Calling plt.clf() while no
# figure is open only creates an extra, empty default-size figure.
fig, ax = plt.subplots(figsize=(6, 6))
ax.imshow(data, cmap='gray', vmin=0, vmax=255)
# Annotate every cell with its integer pixel value (x = column, y = row).
for (i, j), value in np.ndenumerate(data):
    ax.text(j, i, int(value),
            ha="center", va="center", color="cyan", fontsize=1)
plt.show()
userID                                                           1
Timestamp                                            1553522289862
Current_Task                                                   155
Task_amount                                                    510
TaskID                                                           0
VersionID                                                       11
RepetitionID                                                     0
Actual_Data                                                   True
Is_Pause                                                     False
Image            [[1, 1, 1, 0, 0, 0, 0, 2, 1, 2, 2, 0, 3, 0, 0]...
IsMax                                                         True
MaxRepetition                                                    0
ImageSum                                                       933
InputMethod                                                      0
Input                                                      Knuckle
BlobCount                                                        1
BlobImages       [[2, 2, 4, 5, 2], [1, 5, 11, 13, 5], [4, 9, 71...
BlobW                                                            3
BlobH                                                            4
BlobArea                                                        12
BlobSum                                                        710
Blobs            [[2.0, 2.0, 4.0, 5.0, 2.0, 0.0, 0.0, 0.0, 0.0,...
Name: 191534, dtype: object
<Figure size 432x288 with 0 Axes>