vlcn/code/core/data/utils.py
2022-03-30 10:46:35 +02:00

81 lines
1.9 KiB
Python

import en_vectors_web_lg, random, re, json
import numpy as np
def tokenize(ques_list, use_glove):
token_to_ix = {
'PAD': 0,
'UNK': 1,
}
spacy_tool = None
pretrained_emb = []
if use_glove:
spacy_tool = en_vectors_web_lg.load()
pretrained_emb.append(spacy_tool('PAD').vector)
pretrained_emb.append(spacy_tool('UNK').vector)
for ques in ques_list:
words = re.sub(
r"([.,'!?\"()*#:;])",
'',
ques.lower()
).replace('-', ' ').replace('/', ' ').split()
for word in words:
if word not in token_to_ix:
token_to_ix[word] = len(token_to_ix)
if use_glove:
pretrained_emb.append(spacy_tool(word).vector)
pretrained_emb = np.array(pretrained_emb)
return token_to_ix, pretrained_emb
def proc_ques(ques, token_to_ix, max_token):
ques_ix = np.zeros(max_token, np.int64)
words = re.sub(
r"([.,'!?\"()*#:;])",
'',
ques.lower()
).replace('-', ' ').replace('/', ' ').split()
q_len = 0
for ix, word in enumerate(words):
if word in token_to_ix:
ques_ix[ix] = token_to_ix[word]
q_len += 1
else:
ques_ix[ix] = token_to_ix['UNK']
if ix + 1 == max_token:
break
return ques_ix, q_len, len(words)
def ans_stat(ans_list):
ans_to_ix, ix_to_ans = {}, {}
for i, ans in enumerate(ans_list):
ans_to_ix[ans] = i
ix_to_ans[i] = ans
return ans_to_ix, ix_to_ans
def shuffle_list(ans_list):
random.shuffle(ans_list)
def qlen_to_key(q_len):
if 1<= q_len <=3:
return '1-3'
if 4<= q_len <=8:
return '4-8'
if 9<= q_len:
return '9-15'
def ans_to_key(ans_idx):
if 0 <= ans_idx <= 99 :
return '0-99'
if 100 <= ans_idx <= 299 :
return '100-299'
if 300 <= ans_idx <= 999 :
return '300-999'