# EMAKI/utils_Analysis.py


from json import decoder
from operator import mod
import numpy as np
import time, pdb, os, random, math, copy
import pandas as pd
import pickle as pkl
from sklearn.metrics import f1_score
import matplotlib.lines as mlines
from scipy.stats import norm
import torch
from torch import nn, Tensor
from torchinfo import summary
from torch.utils.data import DataLoader, Dataset
from torch.nn import TransformerEncoder, TransformerEncoderLayer

def find_replace(seq, token, word):
    # Replace every occurrence of the subsequence `seq` in the list `word` with the single element `token`.
    st = 0
    while st < len(word):
        if seq == tuple(word[st:st+len(seq)]):
            word = word[:st] + [token] + word[st+len(seq):]
        st += 1
    return word

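# Illustrative sketch (not part of the original module): shows how find_replace merges a
# subword into a token sequence. The token ids below are made up for the example; word
# elements follow the single-element tuple convention used by generateVocabdict.
def _example_find_replace():
    word = [(0,), (2,), (1,), (2,)]
    merged = find_replace(seq=((2,),), token=2, word=word)
    # every occurrence of the subsequence ((2,),) is replaced by the plain id 2
    assert merged == [(0,), 2, (1,), 2]
    return merged
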
def generateVocabdict(data):
    # Map each unique event name to a single-element tuple id; 'OVER' marks the end of a trial.
    vocab = list(data.event.unique())
    vocabdict = dict((x, tuple([idx])) for idx, x in enumerate(vocab))
    vocabdict['OVER'] = tuple([-1])
    return vocabdict

def generateWordsVocab(data, vocabdict):
    # Turn each (user, task, trial) group into a "word": a list of token tuples terminated by 'OVER'.
    word, words = [], []
    for gkey, gdata in data.groupby(['user', 'task', 'trial']):
        events = list(gdata.event.values) + ['OVER']
        word = [vocabdict[x] for x in list(events)]
        words.append(word)
    vocab = set([])
    for word in words:
        vocab = vocab.union(set(word))
    return words, vocab

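# Illustrative sketch (not part of the original module): builds the token dictionary and the
# per-trial words from a tiny hypothetical interaction log; the event names are placeholders,
# but the user/task/trial/event columns match what the functions above expect.
def _example_generate_words():
    log = pd.DataFrame({'user': [0, 0, 0], 'task': [1, 1, 1], 'trial': [0, 0, 0],
                        'event': ['click', 'drag', 'click']})
    vocabdict = generateVocabdict(log)   # {'click': (0,), 'drag': (1,), 'OVER': (-1,)}
    words, vocab = generateWordsVocab(log, vocabdict)
    # one word per (user, task, trial) group, terminated by the 'OVER' token
    assert words == [[(0,), (1,), (0,), (-1,)]]
    return words, vocab
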
def generateBPE(iters, vocab, vocabdict, words, pool, savefileprefix='./', savefreq=50):
    # Byte-pair-encoding style merging: repeatedly add the most frequent adjacent pair to the
    # vocabulary and merge it inside every word. Returns the path of the last saved checkpoint,
    # or None if nothing was saved before the loop stopped.
    savefilename = None
    for i in range(iters):
        stackeddata = pool.starmap(getPair, [[x] for x in words])
        if len(stackeddata) == 0:
            break
        stackedsubwords = pool.starmap(uniqueSubwords, stackeddata)
        subwords = set([])
        for idx, subword in enumerate(stackedsubwords):
            subwords = subwords.union(subword)
        # Count how often each adjacent pair occurs across all words.
        paircounter = dict((x, 0) for x in subwords)
        for pairs, word in stackeddata:
            for _, pair in enumerate(pairs):
                subword = []
                for x in list(pair):
                    if len(x) == 1:
                        subword.append(tuple(x))
                    else:
                        subword.append(list(x))
                subword = tuple(map(tuple, subword))
                paircounter[subword] += 1
        if len(paircounter) == 0:
            break
        if max(paircounter.values()) == 1:
            break
        # Add the most frequent pair to the vocabulary and merge its occurrences in every word.
        targetpair = max(paircounter, key=paircounter.get)
        prelen = len(vocab)
        vocab.add(targetpair)
        for idx, word in enumerate(words):
            stidx = []
            for st in range(len(word)-len(targetpair)+1):
                if equalTuple(targetpair, tuple(word[st:st+len(targetpair)])):
                    stidx.append(st)
            pre = 0
            updated = []
            for st in stidx:
                if pre < st:
                    updated = updated + word[pre:st] + [targetpair]
                    pre = st + len(targetpair)
            words[idx] = updated + word[pre:]
        if (i+1) % savefreq == 0:
            savefilename = savefileprefix + '%d.pkl' % (i)
            print('saving to =>', savefilename)
            with open(savefilename, 'wb') as f:
                pkl.dump([vocab, vocabdict, words, paircounter], f)
    return savefilename

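# Illustrative sketch (not part of the original module): one way generateBPE might be driven.
# A multiprocessing Pool is required because getPair/uniqueSubwords are dispatched via
# pool.starmap; the worker count, iteration budget and file prefix are placeholders.
def _example_run_bpe(data):
    from multiprocessing import Pool
    vocabdict = generateVocabdict(data)
    words, vocab = generateWordsVocab(data, vocabdict)
    with Pool(4) as pool:
        checkpoint = generateBPE(iters=100, vocab=vocab, vocabdict=vocabdict,
                                 words=words, pool=pool,
                                 savefileprefix='./bpe_', savefreq=50)
    # vocab/words are updated in place; checkpoint is the last saved pickle (or None)
    return vocab, words, checkpoint
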
def sort_by_key_len(dict):
    # Return a list of single-entry dicts sorted by key length, longest keys first.
    dict_len = {key: len(key) for key in dict.keys()}
    import operator
    sorted_key_list = sorted(dict_len.items(), key=operator.itemgetter(1), reverse=True)
    sorted_dict = [{item[0]: dict[item[0]]} for item in sorted_key_list]
    return sorted_dict

def getPair(word, N=2):
    # Return all length-N sliding windows over `word` (stride 1) together with the word itself.
    word = np.array(word)
    slid = 1
    sub_windows = (
        np.expand_dims(np.arange(N), 0) +
        np.expand_dims(np.arange(0, word.shape[0]-N+1, slid), 0).T
    ).astype(int)
    return word[sub_windows], word

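# Illustrative sketch (not part of the original module): getPair returns every adjacent pair
# (window length N=2, stride 1) plus the original word, which is what generateBPE counts.
def _example_get_pair():
    pairs, word = getPair([(0,), (1,), (2,)])
    # pairs has shape (2, 2, 1): the windows ((0,), (1,)) and ((1,), (2,))
    assert pairs.shape == (2, 2, 1)
    return pairs, word
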
def flattenTuple(x):
    # Recursively flatten a nested tuple of token ids into a flat list of single-element tuples.
    flatten = []
    for oo in list(x):
        if type(oo) == int:
            flatten.append(tuple([oo]))
        else:
            if len(oo) == 1:
                flatten.append(oo)
            else:
                flatten = flatten + flattenTuple(oo)
    return flatten

def equalTuple(x, y):
    # Two (possibly nested) tuples are equal if they have the same members and flatten to the same sequence.
    if set(x) == set(y):
        if flattenTuple(x) == flattenTuple(y):
            return True
    return False

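# Illustrative sketch (not part of the original module): equalTuple on two merged pairs built
# from hypothetical base token ids.
def _example_equal_tuple():
    a = ((0,), (1,))                              # pair of two base tokens
    b = ((0,), (1,))
    assert flattenTuple(a) == [(0,), (1,)]
    assert equalTuple(a, b) is True
    assert equalTuple(a, ((1,), (0,))) is False   # same members, different order
    return True
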
def findTuple(target, candidates):
    for x in candidates:
        if equalTuple(target, x):
            return True
    return False

def uniqueSubwords(pairs, ignore):
    # Collect the set of distinct adjacent pairs as nested tuples; `ignore` is unused but kept
    # so the function can be dispatched with pool.starmap over (pairs, word) tuples.
    subwords = set([])
    for pair in pairs:
        subword = []
        for x in list(pair):
            if len(x) == 1:
                subword.append(tuple(x))
            else:
                subword.append(list(x))
        subword = tuple(map(tuple, subword))
        subwords.add(subword)
    return subwords

def setup_seed(seed):
    # Seed torch, numpy and random for reproducibility.
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    np.random.seed(seed)
    random.seed(seed)
    torch.backends.cudnn.deterministic = True

def poolsegmentTokenise(gkey, gdata, win_len, vocabdict, mark, BPEvocab=None):
    # Split one (user, task, trial) group into non-overlapping windows of win_len events,
    # tokenise them with vocabdict, optionally apply BPE merges, and pad each window to win_len.
    unknowntoken = vocabdict['unknown']
    paddingtoken = vocabdict['padding']
    slid = win_len
    windowTokendict, windowBPEdict = {}, {}
    events = np.array(list(gdata.event.values) + ['OVER'])
    sub_windows = (
        np.expand_dims(np.arange(win_len), 0) +
        np.expand_dims(np.arange(0, len(gdata)-win_len+1, slid), 0).T
    ).astype(int)
    windowOri = events[sub_windows].tolist()
    if len(sub_windows) == 0:
        lastidx = 0
    else:
        lastidx = sub_windows[-1][-1]
    if lastidx < len(gdata)-1:
        windowOri.append(events[lastidx+1:].tolist())
    # Map event names to token tuples; unseen events are only allowed outside training.
    windowToken = []
    for windowO in windowOri:
        word = []
        for x in list(windowO):
            if x in vocabdict.keys():
                word.append(vocabdict[x])
            else:
                assert mark != 'train'
                word.append(unknowntoken)
        windowToken.append(word)
    windowTokendict[gkey] = windowToken
    if BPEvocab is None:
        # Identity BPE vocabulary: each single token (x-1,) is mapped to the integer id x.
        BPEvocab = [{tuple([tuple([x-1])]): x} for x in range(len(vocabdict))]
    windowBPE = []
    for word in windowToken:
        prelen = len(word)
        for vocab in BPEvocab:
            seq = list(vocab.keys())[0]
            token = list(vocab.values())[0]
            word = find_replace(seq, token, word)
            assert len(word) <= prelen
        windowBPE.append(word)
    windowBPEdict[gkey] = windowBPE
    # Unwrap any remaining single-element tuples into plain integer ids.
    for key, values in windowBPEdict.items():
        oo = []
        for word in values:
            newword = []
            for x in word:
                if type(x) == tuple:
                    newword.append(x[0])
                else:
                    newword.append(x)
            oo.append(newword)
        windowBPEdict[key] = oo
    # Pad each window, record the task label, and track the first action of the next window.
    segments, labels, nextaction = np.array([]), [], []
    for gkey, groupWindowBPE in windowBPEdict.items():
        for idx, windowBPE in enumerate(groupWindowBPE):
            windowBPE = windowBPE + [paddingtoken[0] for x in range(win_len-len(windowBPE))]
            assert len(windowBPE) == win_len
            if idx < len(groupWindowBPE)-1:
                nextaction.append(windowTokendict[gkey][idx+1][0][0])
            else:
                nextaction.append(vocabdict['OVER'][0])
            labels.append(gkey[1])
            if segments.shape[0] == 0:
                segments = np.array([windowBPE])
            else:
                segments = np.concatenate((segments, np.array([windowBPE])), axis=0)
    return segments, labels, unknowntoken[0], paddingtoken[0]

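# Illustrative sketch (not part of the original module): how poolsegmentTokenise might be applied
# per (user, task, trial) group. The window length is a placeholder, and the vocabdict passed in
# must already contain 'unknown', 'padding' and 'OVER' entries in addition to the event names.
def _example_segment(data, vocabdict, win_len=64):
    allsegments, alllabels = [], []
    for gkey, gdata in data.groupby(['user', 'task', 'trial']):
        segments, labels, unk, pad = poolsegmentTokenise(gkey, gdata, win_len,
                                                         vocabdict, mark='train')
        if segments.shape[0]:
            allsegments.append(segments)
            alllabels += labels
    return np.concatenate(allsegments, axis=0), alllabels
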
class PositionalEncoding(nn.Module):
    def __init__(self, d_model: int, dropout: float, max_len: int = 5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)
        # Standard sinusoidal positional encoding, stored as a non-trainable buffer.
        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        pe = torch.zeros(1, max_len, d_model)
        pe[0, :, 0::2] = torch.sin(position * div_term)
        pe[0, :, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe)

    def forward(self, x: Tensor) -> Tensor:
        # x: (batch, seq_len, d_model); add the encoding for the first seq_len positions.
        x = x + self.pe[:, :x.shape[1]]
        return self.dropout(x)

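# Illustrative sketch (not part of the original module): the positional encoding is added to a
# (batch, seq_len, d_model) embedding, so it can be checked in isolation with dummy input.
def _example_positional_encoding():
    pe = PositionalEncoding(d_model=32, dropout=0.0)
    x = torch.zeros(4, 10, 32)        # batch of 4 windows, 10 positions, 32-dim embeddings
    out = pe(x)
    assert out.shape == (4, 10, 32)
    return out
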
class SupervisedTransformerv2(nn.Module):
    def __init__(self, **params):
        super().__init__()
        self.params = params
        self.pos_encoder = PositionalEncoding(d_model=params['d_model'], dropout=params['dropout'])
        if 'ntokens' in params.keys():
            self.embedding = nn.Embedding(num_embeddings=params['ntokens'], embedding_dim=params['d_model'])
        self.transformer_encoder = TransformerEncoder(
            TransformerEncoderLayer(d_model=params['d_model'], nhead=params['nhead'],
                                    dim_feedforward=params['d_model']*4,
                                    dropout=params['dropout'], activation='relu', batch_first=True),
            params['n_layers'])
        # Flatten the encoder output over the whole window and classify with a single linear layer.
        self.linear = nn.Linear(params['d_model']*params['win_len'], params['nlabels'])

    def forward(self, encoder_input, paddingmask):
        encoder_embed = self.embedding(encoder_input) * math.sqrt(self.params['d_model'])
        encoder_pos = self.pos_encoder(encoder_embed)
        encoder_output = self.transformer_encoder(src=encoder_pos, src_key_padding_mask=paddingmask)
        output = encoder_output.view(encoder_output.shape[0], -1)
        final_output = self.linear(output)
        return final_output

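# Illustrative sketch (not part of the original module): a minimal forward pass through the
# classifier with made-up hyperparameters; padded positions are flagged with a boolean mask.
def _example_transformer_forward():
    model = SupervisedTransformerv2(nlabels=3, d_model=32, nhead=4, dropout=0.1,
                                    win_len=10, n_layers=2, ntokens=50)
    tokens = torch.randint(0, 50, (4, 10))          # (batch, win_len) integer token ids
    mask = torch.zeros(4, 10, dtype=torch.bool)     # True would mark padded positions
    logits = model(tokens, mask)
    assert logits.shape == (4, 3)                   # one score per class
    return logits
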
def evaluate(model, criterion, val_loader, device):
    model = model.to(device)
    model.eval()
    losses, f1s = [], []
    with torch.no_grad():
        for batchdata, batchlabel, batchmask in val_loader:
            predictions = model(batchdata.to(device), batchmask.to(device))
            loss = criterion(predictions, batchlabel.reshape(-1).to(device).long())
            if np.isnan(loss.item()):
                raise ValueError("Loss NaN!")
            losses.append(loss.item())
            pred_label = np.argmax(predictions.detach().cpu().numpy(), axis=1)
            f1 = f1_score(batchlabel.numpy(), pred_label, average='macro')
            f1s.append(f1)
    return np.mean(losses), np.mean(f1s)

class DatasetPadding(Dataset):
    def __init__(self, data, paddingtoken=None, label=None):
        self.data = data
        self.label = label
        if paddingtoken is not None:
            # Boolean mask marking padded positions, later passed as src_key_padding_mask.
            self.mask = data == paddingtoken

    def __getitem__(self, idx):
        if self.label is None:
            return self.data[idx], self.mask[idx]
        return self.data[idx], self.label[idx], self.mask[idx]

    def __len__(self):
        return len(self.data)

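# Illustrative sketch (not part of the original module): DatasetPadding yields (tokens, label, mask)
# triples, with mask True exactly at padding positions; the token values below are placeholders.
def _example_dataset_padding():
    data = np.array([[1, 2, 0, 0], [3, 4, 5, 0]])   # two windows padded with token 0
    labels = np.array([0, 1])
    dataset = DatasetPadding(data=data, paddingtoken=0, label=labels)
    tokens, label, mask = dataset[0]
    assert mask.tolist() == [False, False, True, True]
    loader = DataLoader(dataset, batch_size=2, shuffle=False)
    return next(iter(loader))
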
def fit_transformer(traindata, trainlabel, testdata, testlabel, args, device, paddingtoken, nLabels, savemodel='model', nTokens=None):
    if os.path.exists(savemodel):
        return
    criterion = nn.CrossEntropyLoss(label_smoothing=0.1)
    traindata = traindata.astype(int)
    params = {'nlabels': nLabels, 'batch_size': args.batch_size, 'd_model': args.d_model,
              # assumes args carries the optimizer class name (e.g. 'AdamW'); getattr below needs a string
              'optimizer_name': args.optimizer_name, 'nhead': args.nhead, 'dropout': args.dropout,
              'win_len': traindata.shape[1], 'lr': args.lr, 'n_layers': args.n_layers, 'ntokens': nTokens}
    trainset = DatasetPadding(data=traindata, paddingtoken=paddingtoken, label=trainlabel)
    testset = DatasetPadding(data=testdata, paddingtoken=paddingtoken, label=testlabel)
    trainloader = DataLoader(trainset, batch_size=params['batch_size'], shuffle=True, num_workers=0)
    testloader = DataLoader(testset, batch_size=params['batch_size'], shuffle=True, num_workers=0)
    model = SupervisedTransformerv2(**params).to(device)
    model.train()
    optimizer = getattr(torch.optim, params['optimizer_name'])(model.parameters(),
                                                               lr=params['lr'], betas=(0.9, 0.999), weight_decay=0.01)
    # Log roughly 20 times per epoch.
    if len(trainloader) >= 20:
        LOG = int(len(trainloader)/20)
    else:
        LOG = 1
    trloss, valoss, trf1, vaf1 = [], [], [], []
    evaloss, evaf1 = 0, 0
    for epoch in range(1, args.epochs+1):
        for batch, (batchdata, batchlabel, batchmask) in enumerate(trainloader):
            predictions = model(batchdata.to(device), batchmask.to(device))
            loss = criterion(predictions, batchlabel.reshape(-1).to(device).long())
            if np.isnan(loss.item()):
                raise ValueError("Loss NaN!")
            loss.requires_grad_(True)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            evaloss += loss.item()
            pred_label = np.argmax(predictions.detach().cpu().numpy(), axis=1)
            f1 = f1_score(batchlabel.numpy(), pred_label, average='macro')
            evaf1 += f1
            if batch % LOG == 0 or batch == len(trainloader)-1:
                cur_valoss, cur_vaf1 = evaluate(model, criterion, testloader, device)
                model.train()
                trloss.append(evaloss/LOG)
                valoss.append(cur_valoss)
                trf1.append(evaf1/LOG)
                vaf1.append(cur_vaf1)
                evaloss, evaf1 = 0, 0
                print('Epoch [{}/{}], Batch [{}/{}], Train Loss: {:.4f}, Train F1: {:.4f}, Val Loss: {:.4f}, Val F1: {:.4f}'
                      .format(epoch, args.epochs, batch, len(trainloader), trloss[-1], trf1[-1], valoss[-1], vaf1[-1]))
        # Per-epoch checkpoint; saving moves the model to CPU in place, so move it back afterwards
        # to keep training on the target device in the next epoch.
        torch.save([model.cpu(), [trloss, valoss, trf1, vaf1]], savemodel+'%d.pkl' % (epoch))
        model.to(device)

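# Illustrative sketch (not part of the original module): fit_transformer expects an args-like
# object carrying the hyperparameters accessed above. The attribute names below mirror those
# accesses, and the optimizer name is assumed to be a torch.optim class name such as 'AdamW';
# the hyperparameter values and save path are placeholders.
def _example_fit(traindata, trainlabel, testdata, testlabel, paddingtoken, nLabels, nTokens):
    from types import SimpleNamespace
    args = SimpleNamespace(batch_size=32, d_model=64, nhead=4, dropout=0.1,
                           lr=1e-4, n_layers=2, epochs=5, optimizer_name='AdamW')
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    setup_seed(0)
    fit_transformer(traindata, trainlabel, testdata, testlabel, args, device,
                    paddingtoken, nLabels, savemodel='./model_task_', nTokens=nTokens)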