189 lines
4.5 KiB
Text
189 lines
4.5 KiB
Text
|
# Phase 1
|
||
|
# P1: base configuration object. The other objects in this file are built on
# top of it with `${P1} { ... }`, so every key defined here is the default
# unless a later object overrides it.
# All relative paths (data/, ckpt/, config/, logs/) are presumably resolved
# against the directory the program is launched from -- TODO confirm.
P1 {
  use_cpu = false
  visdial_version = 1.0
  train_on_dense = false
  metrics_to_maximize = mrr

  # visdial data
  visdial_image_feats = data/visdial_img_feat.lmdb

  visdial_image_adj_matrices = data/img_adj_matrices
  visdial_question_adj_matrices = data/question_adj_matrices
  visdial_history_adj_matrices = data/history_adj_matrices

  visdial_train = data/visdial_1.0_train.json
  visdial_val = data/visdial_1.0_val.json
  visdial_test = data/visdial_1.0_test.json
  visdial_val_dense_annotations = data/visdial_1.0_val_dense_annotations.json

  visdial_train_09 = data/visdial_0.9_train.json
  visdial_val_09 = data/visdial_0.9_val.json
  visdial_test_09 = data/visdial_0.9_test.json

  visdialconv_val = data/visdial_conv.json
  visdialconv_val_dense_annotations = data/visdialconv_dense_annotations.json

  visdialvispro_val = data/vispro.json
  visdialvispro_val_dense_annotations = data/vispro_dense_annotations.json

  visdial_question_parse_vocab = data/parse_vocab.pkl

  # init
  start_path = ckpt/vdgr_visdial_v1.0_after_warmup_K2.ckpt
  # NOTE: "6conect" is the literal file name; do not "correct" the spelling
  # here unless the file on disk is renamed as well.
  model_config = config/bert_base_6layer_6conect.json

  # visdial training
  freeze_vilbert = false
  visdial_tot_rounds = 11
  num_negative_samples = 1
  sequences_per_image = 2
  batch_size = 8
  lm_loss_coeff = 1
  nsp_loss_coeff = 1
  img_loss_coeff = 1
  # presumably the number of batches accumulated per optimizer step -- TODO confirm
  batch_multiply = 1
  use_trainval = false
  dense_loss = ce
  dense_loss_coeff = 0
  dataloader_text_only = false
  rlv_hst_only = false
  rlv_hst_dense_round = false

  # visdial model
  mask_prob = 0.1
  image_mask_prob = 0.1
  max_seq_len = 256
  num_options = 100
  num_options_dense = 100
  use_embedding = joint

  # visdial evaluation
  eval_visdial_on_test = true
  eval_batch_size = 1
  eval_line_batch_size = 200
  skip_mrr_eval = false
  skip_ndcg_eval = false
  skip_visdial_eval = false
  eval_visdial_every = 1
  # choices = [visdial, visdial_conv, visdial_vispro]
  eval_dataset = visdial

  continue_evaluation = false
  eval_at_start = false
  eval_before_training = false
  initializer = normal
  bert_cased = false

  # restore ckpt
  loads_best_ckpt = false
  loads_ckpt = false
  restarts = false
  resets_max_metric = false
  uses_new_optimizer = false
  sets_new_lr = false
  loads_start_path = false

  # logging, checkpointing and run control
  random_seed = 42
  next_logging_pct = 1.0
  next_evaluating_pct = 50.0
  max_ckpt_to_keep = 1
  num_epochs = 20
  early_stop_epoch = 5
  skip_saving_ckpt = false
  dp_type = apex
  stack_gr_data = false
  master_port = 5122
  # -1 presumably means "no explicit stop epoch" -- TODO confirm
  stop_epochs = -1
  train_each_round = false
  drop_last_answer = false
  # -1 presumably means "use every sample" -- TODO confirm
  num_samples = -1

  # predicting
  predict_split = test
  predict_each_round = false
  predict_dense_round = false
  num_test_dialogs = 8000
  num_val_dialogs = 2064
  save_score = false

  # optimizer
  reset_optim = none
  learning_rate_bert = 5e-6
  learning_rate_gnn = 2e-4
  gnn_weight_decay = 0.01
  use_diff_lr_gnn = true
  min_lr = 0
  decay_method_bert = linear
  decay_method_gnn = linear
  decay_exp = 2
  max_grad_norm = 1.0
  task_optimizer = adam
  warmup_ratio = 0.1

  # directory
  log_dir = logs/vdgr
  data_dir = data
  visdial_output_dir = visdial_output
  bert_cache_dir = transformers

  # keep track of other hparams in bert json
  # 11 classes + hub_node
  v_gnn_edge_dim = 12
  # 47 classes + hub_node
  q_gnn_edge_dim = 48
  num_v_gnn_layers = 2
  num_q_gnn_layers = 2
  num_h_gnn_layers = 2
  num_gnn_attention_heads = 4
  v_gnn_ids = [0, 1, 2, 3, 4, 5]
  t_gnn_ids = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]
}
|
||
|
|
||
|
# Phase 2
|
||
|
# P2_CE: starts from the P1 object (`${P1}`) and merges the keys below on top
# of it. Keys listed here replace the P1 value; keys not listed keep it.
# Compared with P1 this switches to dense-annotation training
# (train_on_dense = true), selects on ndcg instead of mrr, and uses the
# cross-entropy dense loss (dense_loss = ce).
P2_CE = ${P1} {
  # basic
  train_on_dense = true
  use_trainval = true
  metrics_to_maximize = ndcg

  visdial_train_dense = data/visdial_1.0_train_dense.json
  visdial_train_dense_annotations = data/visdial_1.0_train_dense_annotations.json
  visdial_val_dense = data/visdial_1.0_val.json

  tr_graph_idx_mapping = data/tr_dense_mapping.json
  val_graph_idx_mapping = data/val_dense_mapping.json
  test_graph_idx_mapping = data/test_dense_mapping.json

  # Same values as in P1; restated here, no effective change.
  visdial_val = data/visdial_1.0_val.json
  visdial_val_dense_annotations = data/visdial_1.0_val_dense_annotations.json

  # init
  # Points into log_dir (logs/vdgr); presumably the best checkpoint written
  # by a P1 run -- TODO confirm the run/directory name before launching.
  start_path = logs/vdgr/P1_K2_v1.0/epoch_best.ckpt
  # Same value as in P1; restated here.
  rlv_hst_only = false

  # visdial training
  nsp_loss_coeff = 0
  dense_loss_coeff = 1
  # With batch_size = 1 this presumably accumulates 10 batches per optimizer
  # step -- TODO confirm how batch_multiply is consumed.
  batch_multiply = 10
  batch_size = 1

  # visdial model
  # Same value as in P1; restated here.
  num_options_dense = 100

  # visdial evaluation
  # Same value as in P1; restated here.
  eval_batch_size = 1
  eval_line_batch_size = 100
  skip_mrr_eval = true

  # training
  stop_epochs = 3
  dp_type = dp
  # Same value as in P1; restated here. P2_LISTNET overrides it.
  dense_loss = ce

  # optimizer
  learning_rate_bert = 1e-4
}
|
||
|
|
||
|
# P2_LISTNET: identical to P2_CE (and therefore to P1 plus the P2_CE
# overrides) except for the dense loss, which is switched from ce to listnet.
P2_LISTNET = ${P2_CE} {
  dense_loss = listnet
}
|