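"""Utilities for inspecting and preprocessing the keyboard/mouse interaction dataset.

The script expects `dataset/clean_data.pkl` to hold a pandas DataFrame with (at least)
the columns Event, Rule, TaskID, PID, and Part, and writes the derived next-action
prediction splits to `dataset/strategy_dataset/`.

Run one function at a time via the positional `func` argument parsed in the
__main__ block below, e.g. (substitute this file's actual name):

    python <this_script>.py split_org_data
"""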
import pickle
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path
import argparse


def view_clean_data():
    # Inspect the original keyboard and mouse interaction dataset.
    with open('dataset/clean_data.pkl', 'rb') as f:
        data = pickle.load(f)
    print(type(data), len(data))
    print(data.keys())
    print('length of data:', len(data))
    print('event', data['Event'], 'length of event', len(data['Event']))
    print('rule', data['Rule'], 'length of rule', len(data['Rule']))

    print('rule unique', data.Rule.unique())
    print('task id unique', data.TaskID.unique())
    print('pid unique', data.PID.unique())
    print('event unique', data.Event.unique())


def split_org_data():
    # Generate train/test data split by user, and expand each action sequence
    # into prefixes for next-action prediction:
    #   original action seq:  a = [a_0, ..., a_n]
    #   expanded inputs:      [a_0], [a_0, a_1], [a_0, a_1, a_2], ...

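    # For example, for a trial whose Event sequence is [e1, e2, e3, e4, e5],
    # the loops below produce the (input, label) pairs
    #   [e1]         -> e3
    #   [e1, e2]     -> e4
    #   [e1, e2, e3] -> e5
    # i.e. the prefix of the first l events is paired with the event at position
    # l + 1, and every input is later right-padded with zeros to length 35.
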
    # split original data into train and test based on user
    with open('dataset/clean_data.pkl', 'rb') as f:
        data = pickle.load(f)

    print('original data keys', data.keys())
    print('len of original data', len(data))
    print('rule unique', data.Rule.unique())
    print('event unique', data.Event.unique())

    data_train = data[data['PID'] <= 11]
    data_test = data[data['PID'] > 11]
    print('train set len', len(data_train))
    print('test set len', len(data_test))

    # split data by task
    train_data_intent = []
    test_data_intent = []
    for i in range(7):
        # 7 different rules, each as an intention
        train_data_intent.append(data_train[data_train['Rule'] == i])
        test_data_intent.append(data_test[data_test['Rule'] == i])

    # generate train set
    max_len = 0  # max len is 35
    for i in range(7):  # 7 tasks/rules
        train_data = []   # [sample] -> event prefix (padded below)
        train_label = []  # [sample] -> event at position l + 1
        for u in range(1, 12):  # training users: PID 1-11
            user_data = train_data_intent[i][train_data_intent[i]['PID'] == u]
            for j in range(1, 6):  # 5 parts == 5 trials
                part_data = user_data[user_data['Part'] == j]
                for l in range(1, len(part_data['Event']) - 1):
                    print(part_data['Event'][:l].tolist())
                    train_data.append(part_data['Event'][:l].tolist())
                    train_label.append(part_data['Event'].iat[l + 1])
                if len(part_data['Event']) > max_len:
                    max_len = len(part_data['Event'])

        for k in range(len(train_data)):
            while len(train_data[k]) < 35:
                train_data[k].append(0)  # pad with 0 up to length 35

        print('x_len', len(train_data), type(train_data[0]), len(train_data[0]))
        print('y_len', len(train_label), type(train_label[0]))

        Path("dataset/strategy_dataset").mkdir(parents=True, exist_ok=True)
        with open('dataset/strategy_dataset/train_label_' + str(i) + '.pkl', 'wb') as f:
            pickle.dump(train_label, f)
        with open('dataset/strategy_dataset/train_data_' + str(i) + '.pkl', 'wb') as f:
            pickle.dump(train_data, f)
    print('max_len', max_len)

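    # At this point dataset/strategy_dataset/ contains, for each task i in 0..6,
    # train_data_i.pkl (a list of zero-padded event prefixes, each of length 35)
    # and train_label_i.pkl (the corresponding target events). The test split
    # below mirrors this, but additionally groups the samples per user and also
    # records each sample's trial (Part) index.
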
    # generate test set
    max_len = 0  # max len is 33 here; the overall max is 35
    for i in range(7):  # 7 tasks/rules
        test_data = []       # [user][sample] -> event prefix (padded below)
        test_label = []      # [user][sample] -> event at position l + 1
        test_action_id = []  # [user][sample] -> trial (Part) index of the sample
        for u in range(12, 17):  # test users: PID 12-16
            user_data = test_data_intent[i][test_data_intent[i]['PID'] == u]
            test_data_user = []
            test_label_user = []
            test_action_id_user = []
            for j in range(1, 6):  # 5 parts == 5 trials
                part_data = user_data[user_data['Part'] == j]

                for l in range(1, len(part_data['Event']) - 1):
                    test_data_user.append(part_data['Event'][:l].tolist())
                    test_label_user.append(part_data['Event'].iat[l + 1])
                    test_action_id_user.append(part_data['Part'].iat[l])

                if len(part_data['Event']) > max_len:
                    max_len = len(part_data['Event'])

            for k in range(len(test_data_user)):
                while len(test_data_user[k]) < 35:
                    test_data_user[k].append(0)  # pad with 0 up to length 35

            test_data.append(test_data_user)
            test_label.append(test_label_user)
            test_action_id.append(test_action_id_user)

        print('x_len', len(test_data), type(test_data[0]), len(test_data[0]))
        print('y_len', len(test_label), type(test_label[0]))
        with open('dataset/strategy_dataset/test_label_' + str(i) + '.pkl', 'wb') as f:
            pickle.dump(test_label, f)
        with open('dataset/strategy_dataset/test_data_' + str(i) + '.pkl', 'wb') as f:
            pickle.dump(test_data, f)
        with open('dataset/strategy_dataset/test_action_id_' + str(i) + '.pkl', 'wb') as f:
            pickle.dump(test_action_id, f)
    print('max_len', max_len)


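# The helpers below read the pickles written by split_org_data(). A minimal
# loading sketch (illustrative only; the paths mirror the dumps above):
#
#     with open('dataset/strategy_dataset/train_data_0.pkl', 'rb') as f:
#         x = pickle.load(f)  # zero-padded event prefixes for task 0
#     with open('dataset/strategy_dataset/train_label_0.pkl', 'rb') as f:
#         y = pickle.load(f)  # matching target events
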
def calc_gt_prob():
    # Print the unique labels present in each task's train set.
    for i in range(7):
        with open('dataset/strategy_dataset/train_label_' + str(i) + '.pkl', 'rb') as f:
            y = pickle.load(f)
        y = np.array(y)
        print('task ', i)
        print('unique train label', np.unique(y))


def plot_gt_dist():
    # Plot the test-set label distribution per task (one subplot per task).
    full_data = []
    for i in range(7):
        with open('dataset/strategy_dataset/' + 'test' + '_label_' + str(i) + '.pkl', 'rb') as f:
            data = pickle.load(f)
        # print(len(data))
        full_data.append(data)

    fig, axs = plt.subplots(7)
    fig.set_figheight(10)
    fig.set_figwidth(16)
    act_name = ["Italic", "Bold", "Underline", "Indent", "Align", "FontSize", "FontFamily"]
    x = np.arange(7)

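    # Event codes appear to run from 1 to 7 (0 is only used for padding), which is
    # why each count is stored at index values[j] - 1 before being plotted against
    # the action names in act_name.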
    width = 0.1
    for i in range(7):
        for u in range(len(full_data[i])):  # 5 test users
            values, counts = np.unique(full_data[i][u], return_counts=True)
            counts_vis = [0] * 7
            for j in range(len(values)):
                counts_vis[values[j] - 1] = counts[j]
            print('task', i, 'actions', values, 'num', counts)

            axs[i].set_title('Intention ' + str(i))
            axs[i].set_xlabel('action id')
            axs[i].set_ylabel('num of actions')
            axs[i].bar(x + u * width, counts_vis, width=width, label='user ' + str(u))
            axs[i].set_xticks(np.arange(len(x)))
            axs[i].set_xticklabels(act_name)
            axs[i].set_ylim([0, 80])

    axs[0].legend(loc='upper right', ncol=1)
    plt.tight_layout()
    plt.savefig('dataset/' + 'test' + '_gt_dist.png')
    plt.show()


def plot_act():
    # Plot each test user's label sequence for every task.
    full_data = []
    for i in range(7):
        with open('dataset/strategy_dataset/' + 'test' + '_label_' + str(i) + '.pkl', 'rb') as f:
            data = pickle.load(f)
        full_data.append(data)

    width = 0.1
    for i in range(7):
        fig, axs = plt.subplots(5)
        fig.set_figheight(10)
        fig.set_figwidth(16)
        act_name = ["Italic", "Bold", "Underline", "Indent", "Align", "FontSize", "FontFamily"]
        for u in range(len(full_data[i])):  # 5 test users
            x = np.arange(len(full_data[i][u]))
            axs[u].set_xlabel('sample index')
            axs[u].set_ylabel('action id')
            axs[u].plot(x, full_data[i][u], label='user ' + str(u))

        axs[0].legend(loc='upper right', ncol=1)
        plt.tight_layout()
        # plt.savefig('test' + '_act.png')
        plt.show()


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("func", type=str,
                        help="select which function to run: view_clean_data, split_org_data, calc_gt_prob, plot_gt_dist, plot_act")
    args = parser.parse_args()

    if args.func == 'view_clean_data':
        view_clean_data()  # view the original keyboard and mouse interaction dataset
    if args.func == 'split_org_data':
        split_org_data()  # split the original keyboard and mouse interaction dataset: users 1-11 for training, the rest for testing
    if args.func == 'calc_gt_prob':
        calc_gt_prob()  # print the unique labels in the train set
    if args.func == 'plot_gt_dist':
        plot_gt_dist()  # plot the label distribution of the test set
    if args.func == 'plot_act':
        plot_act()  # plot the label sequences of the test set