from json import decoder
from operator import mod
import numpy as np
import time, pdb, os, random, math, copy
import pandas as pd
import pickle as pkl
from sklearn.metrics import f1_score

import matplotlib.lines as mlines
from scipy.stats import norm

import torch
from torch import nn, Tensor
from torchinfo import summary
from torch.utils.data import DataLoader, Dataset
from torch.nn import TransformerEncoder, TransformerEncoderLayer


def find_replace(seq, token, word):
    # Replace every occurrence of the token sequence `seq` in `word` with the merged `token`.
    st = 0
    while st < len(word):
        if seq == tuple(word[st:st+len(seq)]):
            word = word[:st] + [token] + word[st+len(seq):]
        st += 1
    return word


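def _demo_find_replace():
    # Hedged usage sketch, not part of the original pipeline: the tokens below are made up.
    # The pair ((0,), (1,)) is merged into a single BPE token wherever it occurs in the word.
    _word = [(0,), (1,), (2,), (0,), (1,)]
    _merged = find_replace(seq=((0,), (1,)), token=((0,), (1,)), word=list(_word))
    # _merged == [((0,), (1,)), (2,), ((0,), (1,))]
    return _merged

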
def generateVocabdict(data):
    vocab = list(data.event.unique())
    vocabdict = dict((x, tuple([idx])) for idx, x in enumerate(vocab))
    vocabdict['OVER'] = tuple([-1])
    return vocabdict


def generateWordsVocab(data, vocabdict):
    # Turn each (user, task, trial) group into a "word": its event-token sequence plus 'OVER'.
    word, words = [], []
    for gkey, gdata in data.groupby(['user', 'task', 'trial']):
        events = list(gdata.event.values) + ['OVER']
        word = [vocabdict[x] for x in list(events)]
        words.append(word)
    vocab = set([])
    for word in words:
        vocab = vocab.union(set(word))
    vocab = set(x for x in list(vocab))
    return words, vocab


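def _demo_vocab_and_words():
    # Hedged sketch of the expected input layout; this tiny DataFrame and its column values
    # are hypothetical, used only to show what generateVocabdict/generateWordsVocab return.
    df = pd.DataFrame({
        'user':  [1, 1, 1, 1],
        'task':  ['copy', 'copy', 'copy', 'copy'],
        'trial': [0, 0, 1, 1],
        'event': ['click', 'scroll', 'click', 'type'],
    })
    vocabdict = generateVocabdict(df)   # {'click': (0,), 'scroll': (1,), 'type': (2,), 'OVER': (-1,)}
    words, vocab = generateWordsVocab(df, vocabdict)
    # words -> one token list per (user, task, trial) group, each ending with the (-1,) 'OVER' token
    return words, vocab

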
def generateBPE(iters, vocab, vocabdict, words, pool, savefileprefix='./', savefreq=50):
    savefilename = None   # set once the first checkpoint is written
    for i in range(iters):
        # Extract all adjacent token pairs from every word in parallel.
        stackeddata = pool.starmap(getPair, [[x] for x in words])
        if len(stackeddata) == 0:
            break
        stackedsubwords = pool.starmap(uniqueSubwords, stackeddata)
        subwords = set([])
        for idx, subword in enumerate(stackedsubwords):
            subwords = subwords.union(subword)

        # Count how often each candidate pair occurs across the corpus.
        paircounter = dict((x, 0) for x in subwords)
        for pairs, word in stackeddata:
            for _, pair in enumerate(pairs):
                subword = []
                for x in list(pair):
                    if len(x) == 1:
                        subword.append(tuple(x))
                    else:
                        subword.append(list(x))
                subword = tuple(map(tuple, subword))
                paircounter[subword] += 1
        if len(paircounter) == 0:
            break
        if max(paircounter.values()) == 1:
            break
        targetpair = max(paircounter, key=paircounter.get)
        prelen = len(vocab)
        vocab.add(targetpair)

        # Merge every non-overlapping occurrence of the selected pair in each word.
        for idx, word in enumerate(words):
            stidx = []
            for st in range(len(word)-len(targetpair)+1):
                if equalTuple(targetpair, tuple(word[st:st+len(targetpair)])):
                    stidx.append(st)
            pre = 0
            updated = []
            for st in stidx:
                if pre <= st:   # skip matches that overlap an earlier replacement
                    updated = updated + word[pre:st] + [targetpair]
                    pre = st + len(targetpair)
            words[idx] = updated + word[pre:]

        if (i+1) % savefreq == 0:
            savefilename = savefileprefix + '%d.pkl' % (i)
            print('saving to =>', savefilename)
            with open(savefilename, 'wb') as f:
                pkl.dump([vocab, vocabdict, words, paircounter], f)
    return savefilename


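def _demo_generate_bpe(df, iters=200):
    # Hedged driver sketch: an assumption about how generateBPE is meant to be invoked.
    # The multiprocessing setup, process count, and save prefix below are illustrative,
    # not taken from the original experiments.
    from multiprocessing import Pool
    vocabdict = generateVocabdict(df)
    words, vocab = generateWordsVocab(df, vocabdict)
    with Pool(processes=4) as pool:
        savefile = generateBPE(iters, vocab, vocabdict, words, pool,
                               savefileprefix='./bpe_', savefreq=50)
    return savefile

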
def sort_by_key_len(d):
    # Order a dict's entries by descending key length (longest BPE sequences first).
    dict_len = {key: len(key) for key in d.keys()}
    import operator
    sorted_key_list = sorted(dict_len.items(), key=operator.itemgetter(1), reverse=True)
    sorted_dict = [{item[0]: d[item[0]]} for item in sorted_key_list]
    return sorted_dict


def getPair(word, N=2):
    # Return all length-N windows (adjacent token pairs by default) of a word, plus the word itself.
    # dtype=object keeps ragged words (mixed merged/unmerged tokens) from breaking np.array.
    word = np.array(word, dtype=object)
    slid = 1
    sub_windows = (
        np.expand_dims(np.arange(N), 0) +
        np.expand_dims(np.arange(0, word.shape[0]-N+1, slid), 0).T
    ).astype(int)
    return word[sub_windows], word


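def _demo_get_pair():
    # Hedged illustration with made-up singleton tokens: for the word
    # [(0,), (1,), (2,)] the N=2 sliding windows are ((0,), (1,)) and ((1,), (2,)).
    pairs, word = getPair([(0,), (1,), (2,)])
    print(pairs.shape)   # (2, 2, 1): two windows, two tokens per window, singleton token ids
    return pairs

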
def flattenTuple(x):
    flatten = []
    for oo in list(x):
        if type(oo) == int:
            flatten.append(tuple([oo]))
        else:
            if len(oo) == 1:
                flatten.append(oo)
            else:
                flatten = flatten + flattenTuple(oo)
    return flatten


def equalTuple(x, y):
    # Two token tuples are considered equal if they contain the same elements
    # and flatten to the same sequence.
    if set(x) == set(y):
        if flattenTuple(x) == flattenTuple(y):
            return True
    return False


def findTuple(target, candidates):
    for x in candidates:
        if equalTuple(target, x):
            return True
    return False


def uniqueSubwords(pairs, ignore):
    # Collect the distinct candidate pairs in a word; `ignore` only exists so the
    # signature matches the (pairs, word) tuples passed through pool.starmap.
    subwords = set([])
    for pair in pairs:
        subword = []
        for x in list(pair):
            if len(x) == 1:
                subword.append(tuple(x))
            else:
                subword.append(list(x))
        subword = tuple(map(tuple, subword))
        subwords.add(subword)
    return subwords


def setup_seed(seed):
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    np.random.seed(seed)
    random.seed(seed)
    torch.backends.cudnn.deterministic = True


def poolsegmentTokenise(gkey, gdata, win_len, vocabdict, mark, BPEvocab=None):
    # Cut one (user, task, trial) group into fixed-length windows, tokenise them,
    # apply the BPE merges, and pad every window to win_len.
    unknowntoken = vocabdict['unknown']
    paddingtoken = vocabdict['padding']
    slid = win_len
    windowTokendict, windowBPEdict = {}, {}
    events = np.array(list(gdata.event.values) + ['OVER'])
    sub_windows = (
        np.expand_dims(np.arange(win_len), 0) +
        np.expand_dims(np.arange(0, len(gdata)-win_len+1, slid), 0).T
    ).astype(int)
    windowOri = events[sub_windows].tolist()
    if len(sub_windows) == 0:
        lastidx = 0
    else:
        lastidx = sub_windows[-1][-1]
    if lastidx < len(gdata)-1:
        # keep the leftover tail that did not fill a full window
        windowOri.append(events[lastidx+1:].tolist())

    # Map raw event names to token ids; unseen events are only allowed outside training.
    windowToken = []
    for windowO in windowOri:
        word = []
        for x in list(windowO):
            if x in vocabdict.keys():
                word.append(vocabdict[x])
            else:
                assert mark != 'train'
                word.append(unknowntoken)
        windowToken.append(word)
    windowTokendict[gkey] = windowToken
    if BPEvocab is None:
        # fall back to a trivial per-token mapping when no learned merges are supplied
        BPEvocab = [{tuple([tuple([x-1])]): x} for x in range(len(vocabdict))]

    # Apply the BPE merges to every window; merging can only shorten a word.
    windowBPE = []
    for word in windowToken:
        prelen = len(word)
        for vocab in BPEvocab:
            seq = list(vocab.keys())[0]
            token = list(vocab.values())[0]
            word = find_replace(seq, token, word)
        assert len(word) <= prelen
        windowBPE.append(word)
    windowBPEdict[gkey] = windowBPE

    # Unwrap singleton tuples so every window is a plain list of integer token ids.
    for key, values in windowBPEdict.items():
        oo = []
        for word in values:
            newword = []
            for x in word:
                if type(x) == tuple:
                    newword.append(x[0])
                else:
                    newword.append(x)
            oo.append(newword)
        windowBPEdict[key] = oo

    # Pad each window to win_len and collect labels (the task id) and next-action targets.
    segments, labels, nextaction = np.array([]), [], []
    for gkey, groupWindowBPE in windowBPEdict.items():
        for idx, windowBPE in enumerate(groupWindowBPE):
            windowBPE = windowBPE + [paddingtoken[0] for x in range(win_len-len(windowBPE))]
            assert len(windowBPE) == win_len
            if idx < len(groupWindowBPE)-1:
                nextaction.append(windowTokendict[gkey][idx+1][0][0])
            else:
                nextaction.append(vocabdict['OVER'][0])
            labels.append(gkey[1])
            if segments.shape[0] == 0:
                segments = np.array([windowBPE])
            else:
                segments = np.concatenate((segments, np.array([windowBPE])), axis=0)
    return segments, labels, unknowntoken[0], paddingtoken[0]


class PositionalEncoding(nn.Module):

    def __init__(self, d_model: int, dropout: float, max_len: int = 5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)
        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        pe = torch.zeros(1, max_len, d_model)
        pe[0, :, 0::2] = torch.sin(position * div_term)
        pe[0, :, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe)

    def forward(self, x: Tensor) -> Tensor:
        x = x + self.pe[:, :x.shape[1]]
        return self.dropout(x)


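def _demo_positional_encoding():
    # Hedged sanity check (illustrative only): the encoding is added along the sequence
    # dimension, so the output keeps the (batch, seq_len, d_model) shape of the input.
    pe = PositionalEncoding(d_model=32, dropout=0.1)
    x = torch.zeros(4, 10, 32)   # batch of 4 hypothetical sequences of length 10
    out = pe(x)
    print(out.shape)             # torch.Size([4, 10, 32])
    return out

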
class SupervisedTransformerv2(nn.Module):

    def __init__(self, **params):
        super().__init__()
        self.params = params
        self.pos_encoder = PositionalEncoding(d_model=params['d_model'], dropout=params['dropout'])
        if 'ntokens' in params.keys():
            self.embedding = nn.Embedding(num_embeddings=params['ntokens'], embedding_dim=params['d_model'])
        self.transformer_encoder = TransformerEncoder(
            TransformerEncoderLayer(d_model=params['d_model'], nhead=params['nhead'],
                                    dim_feedforward=params['d_model']*4,
                                    dropout=params['dropout'], activation='relu', batch_first=True),
            params['n_layers'])
        self.linear = nn.Linear(params['d_model']*params['win_len'], params['nlabels'])

    def forward(self, encoder_input, paddingmask):
        encoder_embed = self.embedding(encoder_input) * math.sqrt(self.params['d_model'])
        encoder_pos = self.pos_encoder(encoder_embed)
        encoder_output = self.transformer_encoder(src=encoder_pos, src_key_padding_mask=paddingmask)
        output = encoder_output.view(encoder_output.shape[0], -1)
        final_output = self.linear(output)
        return final_output


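def _demo_supervised_transformer():
    # Hedged shape check with made-up hyperparameters (not the values used in the experiments):
    # the classifier flattens the encoder states and maps (batch, win_len) token ids to logits.
    params = {'d_model': 32, 'dropout': 0.1, 'nhead': 4, 'n_layers': 2,
              'win_len': 10, 'nlabels': 5, 'ntokens': 100}
    model = SupervisedTransformerv2(**params)
    tokens = torch.randint(0, params['ntokens'], (8, params['win_len']))
    mask = torch.zeros(8, params['win_len'], dtype=torch.bool)   # no padding in this toy batch
    logits = model(tokens, mask)
    print(logits.shape)   # torch.Size([8, 5])
    return logits

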
def evaluate(model, criterion, val_loader, device):
    model = model.to(device)
    model.eval()
    losses, f1s = [], []
    with torch.no_grad():
        for batchdata, batchlabel, batchmask in val_loader:
            predictions = model(batchdata.to(device), batchmask.to(device))
            loss = criterion(predictions, batchlabel.reshape(-1).to(device).long())
            if np.isnan(loss.item()):
                raise ValueError("Loss NaN!")
            losses.append(loss.item())
            pred_label = np.argmax(predictions.detach().cpu().numpy(), axis=1)
            f1 = f1_score(batchlabel.numpy(), pred_label, average='macro')
            f1s.append(f1)
    return np.mean(losses), np.mean(f1s)


class DatasetPadding(Dataset):

    def __init__(self, data, paddingtoken=None, label=None):
        self.data = data
        self.label = label
        if paddingtoken is not None:
            self.mask = data == paddingtoken
        else:
            # no padding token given: expose an all-False mask so __getitem__ still works
            self.mask = np.zeros(np.shape(data), dtype=bool)

    def __getitem__(self, idx):
        if self.label is None:
            return self.data[idx], self.mask[idx]
        return self.data[idx], self.label[idx], self.mask[idx]

    def __len__(self):
        return len(self.data)


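def _demo_dataset_padding():
    # Hedged usage sketch with a toy batch: token id 0 plays the role of the padding token here,
    # which is an assumption for illustration (the real padding id comes from vocabdict['padding']).
    data = np.array([[3, 5, 0, 0], [2, 2, 4, 0]])
    labels = np.array([1, 0])
    dataset = DatasetPadding(data=data, paddingtoken=0, label=labels)
    loader = DataLoader(dataset, batch_size=2, shuffle=False)
    for batchdata, batchlabel, batchmask in loader:
        print(batchmask)   # True where the window was padded
    return loader

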
def fit_transformer(traindata, trainlabel, testdata, testlabel, args, device, paddingtoken, nLabels, savemodel='model', nTokens=None):
    if os.path.exists(savemodel):
        return

    criterion = nn.CrossEntropyLoss(label_smoothing=0.1)
    traindata = traindata.astype(int)

    # assumed: args.optimizer_name holds the name of a torch.optim class (e.g. 'AdamW')
    # so the getattr(torch.optim, ...) call below can resolve it
    params = {'nlabels': nLabels, 'batch_size': args.batch_size, 'd_model': args.d_model,
              'optimizer_name': args.optimizer_name, 'nhead': args.nhead, 'dropout': args.dropout,
              'win_len': traindata.shape[1], 'lr': args.lr, 'n_layers': args.n_layers, 'ntokens': nTokens}
    trainset = DatasetPadding(data=traindata, paddingtoken=paddingtoken, label=trainlabel)
    testset = DatasetPadding(data=testdata, paddingtoken=paddingtoken, label=testlabel)
    trainloader = DataLoader(trainset, batch_size=params['batch_size'], shuffle=True, num_workers=0)
    testloader = DataLoader(testset, batch_size=params['batch_size'], shuffle=True, num_workers=0)

    model = SupervisedTransformerv2(**params).to(device)
    model.train()

    optimizer = getattr(torch.optim, params['optimizer_name'])(model.parameters(),
                                                               lr=params['lr'], betas=(0.9, 0.999), weight_decay=0.01)
    if len(trainloader) >= 20:
        LOG = int(len(trainloader)/20)
    else:
        LOG = 1
    trloss, valoss, trf1, vaf1 = [], [], [], []
    evaloss, evaf1 = 0, 0
    for epoch in range(1, args.epochs+1):
        for batch, (batchdata, batchlabel, batchmask) in enumerate(trainloader):
            predictions = model(batchdata.to(device), batchmask.to(device))
            loss = criterion(predictions, batchlabel.reshape(-1).to(device).long())
            if np.isnan(loss.item()):
                raise ValueError("Loss NaN!")
            loss.requires_grad_(True)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            evaloss += loss.item()
            pred_label = np.argmax(predictions.detach().cpu().numpy(), axis=1)
            f1 = f1_score(batchlabel.numpy(), pred_label, average='macro')
            evaf1 += f1
            if batch % (LOG) == 0 or batch == len(trainloader)-1:
                cur_valoss, cur_vaf1 = evaluate(model, criterion, testloader, device)
                model.train()
                trloss.append(evaloss/LOG)
                valoss.append(cur_valoss)
                trf1.append(evaf1/LOG)
                vaf1.append(cur_vaf1)
                evaloss, evaf1 = 0, 0
                print('Epoch [{}/{}], Batch [{}/{}], Train Loss: {:.4f}, Train F1: {:.4f}, Val Loss: {:.4f}, Val F1: {:.4f}'
                      .format(epoch, args.epochs, batch, len(trainloader), trloss[-1], trf1[-1], valoss[-1], vaf1[-1]))
        # checkpoint a CPU copy so the live model (and optimizer state) stay on `device` for the next epoch
        torch.save([copy.deepcopy(model).cpu(), [trloss, valoss, trf1, vaf1]], savemodel+'%d.pkl' % (epoch))