from collections import OrderedDict
import logging

from .self_attention import Transformer

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence


def random_embedding(vocab_size, embedding_dim):
    """Randomly initialise an embedding matrix with uniform values in [-scale, scale]."""
    pretrain_emb = np.empty([vocab_size, embedding_dim])
    scale = np.sqrt(3.0 / embedding_dim)
    for index in range(vocab_size):
        pretrain_emb[index, :] = np.random.uniform(-scale, scale, [1, embedding_dim])
    return pretrain_emb


def neg_log_likelihood_loss(outputs, batch_label, batch_size, seq_len):
    """Batch-averaged negative log-likelihood loss (padding index 0 ignored) and the argmax tag sequence."""
    outputs = outputs.view(batch_size * seq_len, -1)
    score = F.log_softmax(outputs, 1)
    loss = nn.NLLLoss(ignore_index=0, reduction="sum")(
        score, batch_label.view(batch_size * seq_len)
    )
    loss = loss / batch_size
    _, tag_seq = torch.max(score, 1)
    tag_seq = tag_seq.view(batch_size, seq_len)
    # print(score[0], tag_seq[0])
    return loss, tag_seq


def mse_loss(outputs, batch_label, batch_size, seq_len, word_seq_length):
    """Batch-averaged sum-of-squares loss over sigmoid scores, with padded positions masked out."""
    # score = torch.nn.functional.softmax(outputs, 1)
    score = torch.sigmoid(outputs)
    mask = torch.zeros_like(score)
    for i, v in enumerate(word_seq_length):
        mask[i, 0:v] = 1
    score = score * mask
    loss = nn.MSELoss(reduction="sum")(
        score.view(batch_size, seq_len), batch_label.view(batch_size, seq_len)
    )
    loss = loss / batch_size
    return loss, score.view(batch_size, seq_len)


class Network(nn.Module):
    """Embedding -> BiLSTM -> (optional self-attention) -> feed-forward scorer, one output per token."""

    def __init__(
        self,
        embedding_type,
        vocab_size,
        embedding_dim,
        dropout,
        hidden_dim,
        embeddings=None,
        attention=True,
    ):
        super().__init__()
        self.logger = logging.getLogger(__name__)

        prelayers = OrderedDict()
        postlayers = OrderedDict()

        if embedding_type in ("w2v", "glove"):
            if embeddings is not None:
                prelayers["embedding_layer"] = nn.Embedding.from_pretrained(embeddings)
            else:
                prelayers["embedding_layer"] = nn.Embedding(vocab_size, embedding_dim)
            prelayers["embedding_dropout_layer"] = nn.Dropout(dropout)
            embedding_dim = 300  # w2v / GloVe vectors are assumed to be 300-dimensional
        elif embedding_type == "bert":
            embedding_dim = 768

        self.lstm = BiLSTM(embedding_dim, hidden_dim // 2, num_layers=1)
        postlayers["lstm_dropout_layer"] = nn.Dropout(dropout)

        if attention:
            # increased complexity with 1024D and 16 heads / 16 layers: for the no-att and att experiments
            # before: for the initial att and pretraining runs: 4 heads and 4 layers, 128D
            # then 128D with 4 heads and 1 layer = results for all IUI
            ### postlayers["position_encodings"] = PositionalEncoding(hidden_dim)
            postlayers["attention_layer"] = Transformer(
                d_model=hidden_dim, n_heads=4, n_layers=1
            )

        postlayers["ff_layer"] = nn.Linear(hidden_dim, hidden_dim // 2)
        postlayers["ff_activation"] = nn.ReLU()
        postlayers["output_layer"] = nn.Linear(hidden_dim // 2, 1)

        self.logger.info(f"prelayers: {prelayers.keys()}")
        self.logger.info(f"postlayers: {postlayers.keys()}")

        self.pre = nn.Sequential(prelayers)
        self.post = nn.Sequential(postlayers)

    def forward(self, x, word_seq_length):
        x = self.pre(x)
        x = self.lstm(x, word_seq_length)  # (seq_len, batch, hidden_dim)
        # MS: printing to inspect model params
        # for p in self.parameters():
        #     print(p.data)
        #     break
        return self.post(x.transpose(1, 0))  # (batch, seq_len, 1)


class BiLSTM(nn.Module):
    """Bidirectional LSTM over packed padded sequences; returns output of shape (seq_len, batch, 2 * lstm_hidden)."""

    def __init__(self, embedding_dim, lstm_hidden, num_layers):
        super().__init__()
        self.net = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=lstm_hidden,
            num_layers=num_layers,
            batch_first=True,
            bidirectional=True,
        )

    def forward(self, x, word_seq_length):
        packed_words = pack_padded_sequence(
            x, word_seq_length, batch_first=True, enforce_sorted=False
        )
        lstm_out, _ = self.net(packed_words)
        lstm_out, _ = pad_packed_sequence(lstm_out)
        return lstm_out
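

# Illustrative usage sketch (not part of the original module): a minimal forward and
# loss pass showing the tensor shapes Network expects. The vocabulary size, dimensions,
# batch size, and sequence lengths below are assumptions, and the Transformer from
# .self_attention is assumed to map (batch, seq, d_model) -> (batch, seq, d_model).
if __name__ == "__main__":
    vocab_size, embedding_dim, hidden_dim = 1000, 300, 128
    model = Network(
        embedding_type="w2v",
        vocab_size=vocab_size,
        embedding_dim=embedding_dim,
        dropout=0.1,
        hidden_dim=hidden_dim,
    )

    batch_size, seq_len = 2, 12
    word_seq_length = torch.tensor([12, 8])  # true (unpadded) length of each sequence
    tokens = torch.randint(1, vocab_size, (batch_size, seq_len))  # padded token ids

    outputs = model(tokens, word_seq_length)  # (batch_size, seq_len, 1)
    labels = torch.rand(batch_size, seq_len)  # per-token regression targets in [0, 1]
    loss, scores = mse_loss(outputs, labels, batch_size, seq_len, word_seq_length)
    print(outputs.shape, loss.item(), scores.shape)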