diff --git a/BPE.py b/BPE.py
new file mode 100644
index 0000000..914aae3
--- /dev/null
+++ b/BPE.py
@@ -0,0 +1,127 @@
+import warnings
+warnings.filterwarnings("ignore")
+import copy
+import pandas as pd
+import numpy as np
+import time, pdb, os, random
+import pickle as pkl
+import seaborn as sn
+import itertools
+from itertools import product
+from argparse import ArgumentParser
+from ast import literal_eval
+import torch, gc
+import multiprocessing as mp
+import torch.multiprocessing as torchmp
+from utils_Analysis import setup_seed, flattenTuple, generateBPE, generateVocabdict, generateWordsVocab, fit_transformer, poolsegmentTokenise, sort_by_key_len
+
+if __name__=='__main__':
+    parser = ArgumentParser(description='BPE on datasets')
+    parser.add_argument('--seed', dest='seed', type=int, required=True)
+    parser.add_argument('-m','--modality', dest='modality', choices=['mouse', 'keyboard', 'both'], type=str, required=True)
+    parser.add_argument('--IDTfile', type=str, required=False)
+    parser.add_argument('--keyfile', type=str, required=False)
+    parser.add_argument('--lr', type=float, required=True)
+    parser.add_argument('--n_layers', type=int, required=True)
+    parser.add_argument('--d_model', type=int, required=True)
+    parser.add_argument('--batch_size', type=int, required=True)
+    parser.add_argument('--dropout', type=float, required=True)
+    parser.add_argument('--nhead', type=int, required=True)
+    parser.add_argument('--optimizer_name', type=str, required=True)
+    parser.add_argument('--epochs', type=int, required=True)
+    parser.add_argument('--iteration', type=int, required=True)
+    args = parser.parse_args()
+    setup_seed(args.seed)
+    args.folds = 5
+    device = torch.device('cuda')
+    WINLENS_key = [10, 50, 100]
+    WINLENS_mouse = [20, 100, 200]
+    WINLENS_both = [15, 75, 150]
+    pool = mp.Pool(mp.cpu_count())
+
+    if args.modality=='mouse':
+        WINLENS = WINLENS_mouse
+        with open(args.IDTfile, 'rb') as f:
+            data = pkl.load(f)
+    elif args.modality=='keyboard':
+        WINLENS = WINLENS_key
+        with open(args.keyfile, 'rb') as f:
+            data = pkl.load(f)
+    elif args.modality=='both':
+        WINLENS = WINLENS_both
+        with open(args.IDTfile, 'rb') as f:
+            data = pkl.load(f)
+        with open(args.keyfile, 'rb') as f:
+            keydata = pkl.load(f)
+        data = pd.concat((data, keydata))
+        data.sort_index(inplace=True)
+
+    users = data.user.unique()
+    random.shuffle(users)
+    userInAFold = int(len(users)/args.folds)
+
+    for fold in range(args.folds):
+        print('----- Fold %d -----'%(fold))
+        testuser = users[fold*userInAFold:(fold+1)*userInAFold]
+        trainuser = set(users)-set(testuser)
+        trset = data[data['user'].isin(trainuser)]
+        teset = data[data['user'].isin(testuser)]
+
+        vocabdict = generateVocabdict(trset)
+        words, vocab = generateWordsVocab(trset, vocabdict)
+        vocabdict['unknown'] = tuple([len(vocabdict)])
+        vocabdict['padding'] = tuple([len(vocabdict)])
+        nLabels = data.task.unique().shape[0]
+
+        generateBPE(args.iteration, vocab, vocabdict, words, pool)
+
+        with open('%d.pkl'%(args.iteration-1), 'rb') as f:
+            vocab, _, _, _ = pkl.load(f)
+        flatvocab = set()
+        for x in vocab:
+            flatvocab = flatvocab.union([(tuple(flattenTuple(x)))])
+        BPEvocabdict = dict((x, idx) for idx, x in enumerate(flatvocab))
+        assert len(flatvocab)==len(BPEvocabdict)
+        rankedvocab = sort_by_key_len(BPEvocabdict)
+        rankedvocab = rankedvocab + [{tuple([vocabdict['unknown']]):len(rankedvocab)},{tuple([vocabdict['padding']]):len(rankedvocab)+1}]
+
+        stackeddata = pool.starmap(poolsegmentTokenise, [(gkey, gdata, win_len, vocabdict, 'train', rankedvocab) for (gkey,gdata),win_len in product(trset.groupby(['user', 'task', 'session']), WINLENS)])
+        minlabel = np.inf
+        stackedtrdata, stackedtrlabel = dict(zip(WINLENS,[[] for i in range(len(WINLENS))])), dict(zip(WINLENS,[[] for i in range(len(WINLENS))]))
+        for segments, labels, unknowntoken, paddingtoken in stackeddata:
+            if segments.shape[0]==0:
+                continue
+            assert vocabdict['padding'][0]==paddingtoken
+            assert vocabdict['unknown'][0]==unknowntoken
+            if len(stackedtrdata[segments.shape[1]])==0:
+                stackedtrdata[segments.shape[1]] = segments
+            else:
+                stackedtrdata[segments.shape[1]] = np.concatenate((stackedtrdata[segments.shape[1]], segments), axis=0)
+            stackedtrlabel[segments.shape[1]] = np.array(list(stackedtrlabel[segments.shape[1]]) + labels)
+            if np.min(labels)
[... patch truncated in this excerpt: the rest of BPE.py and the start of utils_Analysis.py are missing; the lines below resume inside what appears to be generateBPE ...]
+    with open(savefilename, 'wb') as f:
+        pkl.dump([vocab, vocabdict, words, paircounter],f)
+    return savefilename
+
+def sort_by_key_len(dict):
+    dict_len = {key: len(key) for key in dict.keys()}
+    import operator
+    sorted_key_list = sorted(dict_len.items(), key=operator.itemgetter(1), reverse=True)
+    sorted_dict = [{item[0]: dict[item[0]]} for item in sorted_key_list]
+    return sorted_dict
+
+def getPair(word, N=2):
+    word = np.array(word)
+    slid = 1
+    sub_windows = (
+        np.expand_dims(np.arange(N), 0)
+        + np.expand_dims(np.arange(0, word.shape[0]-N+1, slid), 0).T
+    ).astype(int)
+    return word[sub_windows], word
+
+def flattenTuple(x):
+    flatten = []
+    for oo in list(x):
+        if type(oo)==int:
+            flatten.append(tuple([oo]))
+        else:
+            if len(oo)==1:
+                flatten.append(oo)
+            else:
+                flatten = flatten + flattenTuple(oo)
+    return flatten
+
+def equalTuple(x, y):
+    if set(x)==set(y):
+        if flattenTuple(x)==flattenTuple(y):
+            return True
+    return False
+
+def findTuple(target, candidates):
+    for x in candidates:
+        if equalTuple(target, x):
+            return True
+    return False
+
+def uniqueSubwords(pairs, ignore):
+    subwords = set([])
+    for pair in pairs:
+        subword = []
+        for x in list(pair):
+            if len(x)==1:
+                subword.append(tuple(x))
+            else:
+                subword.append(list(x))
+        subword = tuple(map(tuple,subword))
+        subwords.add(subword)
+    return subwords
+
+def setup_seed(seed):
+    torch.manual_seed(seed)
+    torch.cuda.manual_seed_all(seed)
+    np.random.seed(seed)
+    random.seed(seed)
+    torch.backends.cudnn.deterministic = True
+
+def poolsegmentTokenise(gkey, gdata, win_len, vocabdict, mark, BPEvocab=None):
+    unknowntoken = vocabdict['unknown']
+    paddingtoken = vocabdict['padding']
+    slid = win_len
+    windowTokendict, windowBPEdict = {}, {}
+    events = np.array(list(gdata.event.values) + ['OVER'])
+    sub_windows = (
+        np.expand_dims(np.arange(win_len), 0)
+        + np.expand_dims(np.arange(0, len(gdata)-win_len+1, slid), 0).T
+    ).astype(int)
+    windowOri = events[sub_windows].tolist()
+    if len(sub_windows)==0:
+        lastidx = 0
+    else:
+        lastidx = sub_windows[-1][-1]
+    if lastidx
[... patch truncated in this excerpt: the rest of poolsegmentTokenise and the definition of the positional-encoding module are missing; the forward method below belongs to that module ...]
+    def forward(self, x: Tensor) -> Tensor:
+        x = x + self.pe[:,:x.shape[1]]
+        return self.dropout(x)
+
+
+class SupervisedTransformerv2(nn.Module):
+    def __init__(self, **params):
+        super().__init__()
+        self.params = params
+        self.pos_encoder = PositionalEncoding(d_model=params['d_model'], dropout=params['dropout'])
+        if 'ntokens' in params.keys():
+            self.embedding = nn.Embedding(num_embeddings=params['ntokens'], embedding_dim=params['d_model'])
+        self.transformer_encoder = TransformerEncoder(TransformerEncoderLayer(d_model=params['d_model'], nhead=params['nhead'],
+                                                                              dim_feedforward=params['d_model']*4,
+                                                                              dropout=params['dropout'], activation='relu', batch_first=True),
+                                                      params['n_layers'])
+        self.linear = nn.Linear(params['d_model']*params['win_len'], params['nlabels'])
+
+    def forward(self, encoder_input, paddingmask):
+        encoder_embed = self.embedding(encoder_input) * math.sqrt(self.params['d_model'])
+        encoder_pos = self.pos_encoder(encoder_embed)
+        encoder_output = self.transformer_encoder(src=encoder_pos, src_key_padding_mask=paddingmask)
+        output = encoder_output.view(encoder_output.shape[0],-1)
+        final_output = self.linear(output)
+        return final_output
+
+def evaluate(model, criterion, val_loader, device):
+    model = model.to(device)
+    model.eval()
+    losses, f1s = [], []
+    with torch.no_grad():
+        for batchdata, batchlabel, batchmask in val_loader:
+            predictions = model(batchdata.to(device), batchmask.to(device))
+            loss = criterion(predictions, batchlabel.reshape(-1).to(device).long())
+            if np.isnan(loss.item()):
+                raise ValueError("Loss NaN!")
+            losses.append(loss.item())
+            pred_label = np.argmax(predictions.detach().cpu().numpy(), axis=1)
+            f1 = f1_score(batchlabel.numpy(), pred_label, average='macro')
+            f1s.append(f1)
+    return np.mean(losses), np.mean(f1s)
+
+class DatasetPadding(Dataset):
+    def __init__(self, data, paddingtoken=None, label=None):
+        self.data = data
+        self.label = label
+        if paddingtoken is not None:
+            self.mask = data==paddingtoken
+    def __getitem__(self, idx):
+        if self.label is None:
+            return self.data[idx], self.mask[idx]
+        return self.data[idx], self.label[idx], self.mask[idx]
+    def __len__(self):
+        return len(self.data)
+
+def fit_transformer(traindata, trainlabel, testdata, testlabel, args, device, paddingtoken, nLabels, savemodel='model', nTokens=None):
+    if os.path.exists(savemodel):
+        return
+
+    criterion = nn.CrossEntropyLoss(label_smoothing=0.1)
+    traindata = traindata.astype(int)
+
+    params = {'nlabels': nLabels, 'batch_size':args.batch_size, 'd_model':args.d_model,
+              'optimizer_name': args.optimizer_name, 'nhead':args.nhead, 'dropout':args.dropout,
+              'win_len': traindata.shape[1], 'lr':args.lr, 'n_layers':args.n_layers, 'ntokens': nTokens}
+    trainset = DatasetPadding(data=traindata, paddingtoken=paddingtoken, label=trainlabel)
+    testset = DatasetPadding(data=testdata, paddingtoken=paddingtoken, label=testlabel)
+    trainloader = DataLoader(trainset, batch_size=params['batch_size'], shuffle=True, num_workers=0)
+    testloader = DataLoader(testset, batch_size=params['batch_size'], shuffle=True, num_workers=0)
+
+    model = SupervisedTransformerv2(**params).to(device)
+    model.train()
+
+    optimizer = getattr(torch.optim, params['optimizer_name'])(model.parameters(),
+                                                               lr=params['lr'], betas=(0.9,0.999), weight_decay=0.01)
+    if len(trainloader)>=20:
+        LOG = int(len(trainloader)/20)
+    else:
+        LOG = 1
+    trloss, valoss, trf1, vaf1 = [], [], [], []
+    evaloss, evaf1 = 0,0
+    for epoch in range(1, args.epochs+1):
+        for batch, (batchdata, batchlabel, batchmask) in enumerate(trainloader):
+            predictions = model(batchdata.to(device), batchmask.to(device))
+            loss = criterion(predictions, batchlabel.reshape(-1).to(device).long())
+            if np.isnan(loss.item()):
+                raise ValueError("Loss NaN!")
+            loss.requires_grad_(True)
+            optimizer.zero_grad()
+            loss.backward()
+            optimizer.step()
+
+            evaloss+=loss.item()
+            pred_label = np.argmax(predictions.detach().cpu().numpy(), axis=1)
+            f1 = f1_score(batchlabel.numpy(), pred_label, average='macro')
+            evaf1 += f1
+            if batch%(LOG)==0 or batch==len(trainloader)-1:
+                cur_valoss, cur_vaf1 = evaluate(model, criterion, testloader, device)
+                model.train()
+                trloss.append(evaloss/LOG)
+                valoss.append(cur_valoss)
+                trf1.append(evaf1/LOG)
+                vaf1.append(cur_vaf1)
+                evaloss, evaf1 = 0,0
+                print('Epoch [{}/{}], Batch [{}/{}], Train Loss: {:.4f}, Train F1: {:.4f}, Val Loss: {:.4f}, Val F1: {:.4f}'
+                      .format(epoch, args.epochs, batch, len(trainloader), trloss[-1], trf1[-1], valoss[-1], vaf1[-1]))
+        torch.save([model.cpu(), [trloss, valoss, trf1, vaf1]], savemodel+'%d.pkl'%(epoch))
+        model.to(device)  # model.cpu() moves the module in place; return it to the training device before the next epoch
\ No newline at end of file
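
Note (not part of the patch): both getPair and poolsegmentTokenise build their window indices with the same expand_dims broadcasting trick. Below is a minimal standalone sketch of that indexing under the same assumptions; the names window_indices and seq are illustrative and do not appear in the patch.

```python
import numpy as np

def window_indices(n_items, win_len, stride):
    # Row i holds the indices [i*stride, i*stride + win_len), i.e. one window per row.
    # getPair uses win_len=2, stride=1 (overlapping pairs); poolsegmentTokenise uses
    # stride=win_len (non-overlapping windows), dropping any leftover tail.
    return (
        np.expand_dims(np.arange(win_len), 0)
        + np.expand_dims(np.arange(0, n_items - win_len + 1, stride), 0).T
    ).astype(int)

seq = np.array(list("abcdefg"))
print(seq[window_indices(len(seq), win_len=3, stride=3)])
# [['a' 'b' 'c']
#  ['d' 'e' 'f']]   -- the trailing 'g' is left over, which is what the lastidx check handles
```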
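Note (not part of the patch): a small sketch of how DatasetPadding's boolean mask is produced and consumed, assuming utils_Analysis.py from this patch is importable; the toy token ids, the padding id 9, and the labels are made up for illustration. SupervisedTransformerv2 forwards this mask as src_key_padding_mask, so attention ignores the padded positions.

```python
import numpy as np
from torch.utils.data import DataLoader
from utils_Analysis import DatasetPadding  # added by this patch

# Toy batch: 4 sequences of 6 token ids, with id 9 acting as the padding token.
toy = np.array([[1, 2, 3, 9, 9, 9],
                [4, 5, 6, 7, 9, 9],
                [1, 1, 2, 2, 3, 3],
                [7, 8, 9, 9, 9, 9]])
labels = np.array([0, 1, 0, 1])

ds = DatasetPadding(data=toy, paddingtoken=9, label=labels)
loader = DataLoader(ds, batch_size=2)
batchdata, batchlabel, batchmask = next(iter(loader))
# batchmask is True exactly where the padding token sits; fit_transformer passes it
# alongside batchdata to the model, which hands it to the TransformerEncoder.
print(batchmask)
```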