initial commit

commit a82bbc593e
129 changed files with 33981 additions and 0 deletions

models/common/vqa_tools/aokvqa/data_scripts/build_vocab.py (new file)

@@ -0,0 +1,45 @@
import argparse
from collections import Counter
import pathlib

from load_aokvqa import load_aokvqa


parser = argparse.ArgumentParser()
parser.add_argument('--aokvqa-dir', type=pathlib.Path, required=True, dest='aokvqa_dir')
parser.add_argument('--out', type=pathlib.Path, required=True, dest='output_file')
args = parser.parse_args()


## Build vocab from train set: correct choices + choices/direct answers appearing in >= 3 questions

train_set = load_aokvqa(args.aokvqa_dir, 'train')

vocab = []
all_choices = Counter()
direct_answers = Counter()

for i in train_set:
    vocab.append(i['choices'][i['correct_choice_idx']])
    all_choices.update(i['choices'])
    direct_answers.update(set(i['direct_answers']))
vocab += [k for k, v in all_choices.items() if v >= 3]
vocab += [k for k, v in direct_answers.items() if v >= 3]

vocab = sorted(set(vocab))
print(f"Vocab size: {len(vocab)}")

## Save vocabulary output

with open(args.output_file, 'w') as f:
    for v in vocab:
        print(v, file=f)

## Check validation set coverage

val_set = load_aokvqa(args.aokvqa_dir, 'val')

val_acc = [v['choices'][v['correct_choice_idx']] in vocab for v in val_set]
val_acc = sum(val_acc) / len(val_acc) * 100
print(f"Val set coverage: {val_acc:.2f}%")
@@ -0,0 +1,26 @@
from tqdm import tqdm
import argparse
import pathlib

import torch
import clip

parser = argparse.ArgumentParser()
parser.add_argument('--vocab', type=pathlib.Path, required=True, dest='vocab_file')
parser.add_argument('--model-type', type=str, choices=['RN50', 'RN50x4', 'RN50x16', 'RN50x64', 'RN101', 'ViT-B/32', 'ViT-B/16', 'ViT-L/14', 'ViT-L/14@336px'], required=True, dest='model_type')
parser.add_argument('--out', type=pathlib.Path, required=True, dest='output_file')
args = parser.parse_args()

assert args.output_file.suffix == '.pt'

device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load(args.model_type, device=device)

with torch.no_grad():
    # Encode each vocabulary line (one candidate answer per line) with CLIP's text tower.
    vocab = open(args.vocab_file).read().splitlines()
    mc_text = clip.tokenize(vocab).to(device)
    mc_text_features = torch.stack([model.encode_text(mct.unsqueeze(0)).cpu() for mct in tqdm(mc_text)], dim=1)[0]
    mc_text_features = mc_text_features.float()

torch.save(mc_text_features, args.output_file)
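The saved tensor stacks one CLIP text feature per vocabulary line, in file order. A minimal sketch of using it for nearest-neighbor answer selection, assuming a query feature from the same CLIP model; the file name and the `query` variable are placeholders:

import torch

# Minimal sketch. Assumptions: 'vocab_clip.pt' is the --out file from above,
# and `query` is a feature from the same CLIP model, shape (embed_dim,).
feats = torch.load('vocab_clip.pt')               # (num_answers, embed_dim)
feats = feats / feats.norm(dim=-1, keepdim=True)  # L2-normalize rows

def nearest_answer_idx(query, feats):
    query = query / query.norm()
    return (feats @ query).argmax().item()        # cosine-similarity argmax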
@@ -0,0 +1,50 @@
import argparse
import pathlib
from tqdm import tqdm

import torch
from transformers import AutoTokenizer, AutoModel

from load_aokvqa import load_aokvqa


parser = argparse.ArgumentParser()
parser.add_argument('--aokvqa-dir', type=pathlib.Path, required=True, dest='aokvqa_dir')
parser.add_argument('--split', type=str, choices=['train', 'val', 'test'], required=True)
parser.add_argument('--out', type=pathlib.Path, required=True, dest='output_file')
args = parser.parse_args()

assert args.output_file.suffix == '.pt'

## Load dataset

dataset = load_aokvqa(args.aokvqa_dir, args.split)

## Load model

tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/bert-base-nli-mean-tokens')
model = AutoModel.from_pretrained('sentence-transformers/bert-base-nli-mean-tokens')
device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)
model.eval()

def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0]  # First element of model_output contains all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)

## Encoding loop

with torch.no_grad():
    embeddings = {}

    for d in tqdm(dataset):
        encoded_input = tokenizer([d['question']], padding=True, return_tensors='pt')
        encoded_input = {k: v.to(device) for k, v in encoded_input.items()}
        e = mean_pooling(model(**encoded_input), encoded_input['attention_mask'])
        embeddings[d['question_id']] = {
            'question': e[0].cpu()
        }

torch.save(embeddings, args.output_file)
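The output is a dict mapping each question_id to its mean-pooled BERT question embedding. A minimal sketch of reading it back; the file name is a placeholder for the --out path:

import torch

# Minimal sketch: inspect the saved question embeddings.
embeddings = torch.load('bert_features_val.pt')  # placeholder path
qid, rec = next(iter(embeddings.items()))
print(qid, rec['question'].shape)  # a 768-d vector for bert-base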
@@ -0,0 +1,51 @@
from PIL import Image
from tqdm import tqdm
import argparse
import pathlib

import torch
import clip

from load_aokvqa import load_aokvqa, get_coco_path


parser = argparse.ArgumentParser()
parser.add_argument('--aokvqa-dir', type=pathlib.Path, required=True, dest='aokvqa_dir')
parser.add_argument('--coco-dir', type=pathlib.Path, required=True, dest='coco_dir')
parser.add_argument('--split', type=str, choices=['train', 'val', 'test'], required=True)
parser.add_argument('--model-type', type=str, choices=['RN50', 'RN50x4', 'RN50x16', 'RN50x64', 'RN101', 'ViT-B/32', 'ViT-B/16', 'ViT-L/14', 'ViT-L/14@336px'], required=True, dest='model_type')
parser.add_argument('--out', type=pathlib.Path, required=True, dest='output_file')
args = parser.parse_args()

assert args.output_file.suffix == '.pt'

## Load dataset

dataset = load_aokvqa(args.aokvqa_dir, args.split)

## Load model

device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load(args.model_type, device=device)

## Encoding loop

with torch.no_grad():
    embeddings = {}

    for d in tqdm(dataset):
        q = d["question"]
        q_text = clip.tokenize(q).to(device)
        q_text_features = model.encode_text(q_text)

        img = Image.open(get_coco_path(args.split, d['image_id'], args.coco_dir))
        img = preprocess(img).unsqueeze(0).to(device)
        image_features = model.encode_image(img)

        embeddings[d['question_id']] = {
            'question': q_text_features[0].float().cpu(),
            'image': image_features[0].float().cpu(),
        }

torch.save(embeddings, args.output_file)
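Because the question and image features here come from the same CLIP model, their cosine similarity is directly comparable. A minimal sketch; the file name is a placeholder:

import torch
import torch.nn.functional as F

# Minimal sketch: compare a question embedding against its image embedding.
embeddings = torch.load('clip_features_val.pt')  # placeholder path
rec = next(iter(embeddings.values()))
sim = F.cosine_similarity(rec['question'], rec['image'], dim=0)
print(f"question-image CLIP similarity: {sim.item():.3f}")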
@@ -0,0 +1,62 @@
import argparse
import pathlib
from tqdm import tqdm
from PIL import Image

import torch
import torch.nn as nn
from torchvision import models
from torchvision import transforms as T

from load_aokvqa import load_aokvqa, get_coco_path


parser = argparse.ArgumentParser()
parser.add_argument('--aokvqa-dir', type=pathlib.Path, required=True, dest='aokvqa_dir')
parser.add_argument('--coco-dir', type=pathlib.Path, required=True, dest='coco_dir')
parser.add_argument('--split', type=str, choices=['train', 'val', 'test'], required=True)
parser.add_argument('--out', type=pathlib.Path, required=True, dest='output_file')
args = parser.parse_args()

assert args.output_file.suffix == '.pt'

## Load dataset

dataset = load_aokvqa(args.aokvqa_dir, args.split)

## Load model

resnet_preprocess = T.Compose([
    T.Resize(size=224, interpolation=T.InterpolationMode.BICUBIC),
    T.CenterCrop(size=(224, 224)),
    T.ToTensor(),
    T.Normalize(
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225]
    )
])

device = "cuda" if torch.cuda.is_available() else "cpu"

resnet_model = models.resnet50(pretrained=True)
resnet_model = torch.nn.Sequential(
    *list(resnet_model.children())[:-1],
    nn.Flatten()
)  # strip classification layer
resnet_model = resnet_model.to(device)
resnet_model.eval()  # use frozen BatchNorm statistics for feature extraction

## Encoding loop

with torch.no_grad():
    embeddings = {}

    for d in tqdm(dataset):
        img = Image.open(get_coco_path(args.split, d['image_id'], args.coco_dir)).convert('RGB')
        resnet_input = resnet_preprocess(img).unsqueeze(0).to(device)
        resnet_features = resnet_model(resnet_input)
        embeddings[d['question_id']] = {
            'image': resnet_features[0].cpu()
        }

torch.save(embeddings, args.output_file)
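Stripping the classification layer leaves the global-average-pooled output of ResNet-50, so each image maps to a 2048-d vector. A minimal sketch of reading the result back; the file name is a placeholder:

import torch

# Minimal sketch: each record holds one 2048-d ResNet-50 image feature.
embeddings = torch.load('resnet_features_val.pt')  # placeholder path
feat = next(iter(embeddings.values()))['image']
assert feat.shape == (2048,)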