initial commit
Commit 7be61f8c6d
137 changed files with 33491 additions and 0 deletions
config/config_bert_base.json (Executable file, +23)
@@ -0,0 +1,23 @@
{
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "type_vocab_size": 2,
  "vocab_size": 30522,
  "fusion_layer": 9,
  "encoder_width": 768,
  "cross_module": "ca"
}
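The trailing keys ("fusion_layer", "encoder_width", "cross_module") are not part of the stock BERT schema. A minimal sketch of loading the file, assuming the Hugging Face transformers library, which keeps unknown keys as extra attributes on the config object:

    from transformers import BertConfig

    # Load the base expert config; keys unknown to BertConfig, such as
    # "fusion_layer", are stored as plain attributes rather than rejected.
    config = BertConfig.from_json_file("config/config_bert_base.json")
    print(config.hidden_size)   # 768
    print(config.fusion_layer)  # 9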
config/config_bert_large.json (Executable file, +26)
@@ -0,0 +1,26 @@
{
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522,
  "fusion_layer": 19,
  "encoder_width": 1024,
  "cross_module": "ca"
}
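As a sanity check on the large variant, a back-of-envelope parameter count from the fields above (ignoring biases and LayerNorm weights; the model code itself is not in this file):

    # Rough transformer parameter count: ~4*h^2 per layer for attention,
    # ~2*h*i for the feed-forward block, plus the embedding tables.
    h, i, layers, vocab, pos = 1024, 4096, 24, 30522, 512
    per_layer = 4 * h * h + 2 * h * i
    embeddings = (vocab + pos + 2) * h  # token + position + token-type
    total = layers * per_layer + embeddings
    print(f"~{total / 1e6:.0f}M parameters")  # ~334M, in line with BERT-large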
config/config_lora.json (Executable file, +8)
@@ -0,0 +1,8 @@
{
  "r": 8,
  "lora_alpha": 16,
  "bias": "none",
  "use_rslora": true,
  "lora_dropout": 0.05,
  "task_type": "CAUSAL_LM"
}
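These fields line up one-to-one with peft's LoraConfig. A sketch of turning the file into an adapter config; target_modules is an assumption here, since the JSON does not pin which projections get adapters:

    import json
    from peft import LoraConfig

    with open("config/config_lora.json") as f:
        cfg = json.load(f)

    # target_modules is hypothetical -- the JSON does not specify it.
    lora_config = LoraConfig(target_modules=["q", "v"], **cfg)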
config/ds_config.json (Executable file, +43)
@@ -0,0 +1,43 @@
{
  "train_micro_batch_size_per_gpu": 8,
  "gradient_accumulation_steps": 1,
  "optimizer": {
    "type": "Adam",
    "params": {
      "lr": 1e-4,
      "betas": [0.9, 0.999],
      "weight_decay": 0.02,
      "adam_w_mode": true
    }
  },
  "scheduler": {
    "type": "WarmupCosineLR",
    "params": {}
  },
  "fp16": {
    "enabled": false,
    "auto_cast": false,
    "loss_scale": 0,
    "initial_scale_power": 16,
    "loss_scale_window": 1000,
    "hysteresis": 2,
    "consecutive_hysteresis": false,
    "min_loss_scale": 1
  },
  "zero_optimization": {
    "stage": 0,
    "overlap_comm": true,
    "contiguous_gradients": true,
    "sub_group_size": 1e9,
    "reduce_bucket_size": "auto",
    "stage3_prefetch_bucket_size": "auto",
    "stage3_param_persistence_threshold": "auto",
    "stage3_max_live_parameters": 1e9,
    "stage3_max_reuse_distance": 1e9,
    "stage3_gather_16bit_weights_on_model_save": true
  },
  "wandb": {
    "enabled": true,
    "project": "V2Dial"
  }
}
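A minimal sketch of feeding this file to DeepSpeed; `model` is hypothetical and stands in for the V2Dial model built elsewhere in the repo:

    import deepspeed

    # deepspeed.initialize accepts the JSON path directly and returns the
    # wrapped engine plus the Adam optimizer and WarmupCosineLR scheduler
    # declared above.
    engine, optimizer, _, scheduler = deepspeed.initialize(
        model=model,  # assumption: the model is constructed elsewhere
        model_parameters=model.parameters(),
        config="config/ds_config.json",
    )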
config/v2dial_stage_1.conf (Executable file, +116)
@@ -0,0 +1,116 @@
stage_1 {
    #################################################################################
    # datasets
    preextracted = false

    # webvid
    root_raw_vis_webvid_train = to_fill
    mapping_path_webvid_train = to_fill
    root_raw_vis_webvid_val = to_fill
    mapping_path_webvid_val = to_fill

    # cc3m
    root_raw_vis_cc3m_train = to_fill
    mapping_path_cc3m_train = to_fill
    root_raw_vis_cc3m_val = to_fill
    mapping_path_cc3m_val = to_fill

    # Model
    embed_from_llm = false
    use_lora_llm = true
    use_lora_experts = false
    vit_token_pooling = true
    expert_size = large
    use_decoder_only = false
    lora_config = config/config_lora.json
    use_residuals = false

    use_moes = true
    use_sep_spatial_temp_experts = false

    hidden_size_moe = 1024
    num_moe_layers = 12
    num_moe_modality_layers = 9
    num_moe_attention_head = 8

    bert_config_large = config/config_bert_large.json
    bert_config_base = config/config_bert_base.json
    vit_model = eva_clip_g
    text_dim_base = 768
    text_dim_large = 1024
    vis_dim_base = 768
    vis_dim_large = 1024
    joint_dim = 256
    num_temporal_query_tokens_base = 32
    num_spatial_query_tokens_base = 32
    num_temporal_query_tokens_large = 32
    num_spatial_query_tokens_large = 32
    beit_add_ln = true
    temperature = 0.07
    gradient_checkpointing = false
    masking_prob = 0.15
    image_res = 224
    pre_train = true
    vindlu_path = pretrained/25M-pretrain.pth

    bypass_text_expert = false
    freeze_vit = true
    freeze_llm = true
    tie_embeddings = true
    num_frames = 4

    llm_name = no_llm

    # Training
    pre_train = true # if true, perform mlm
    batch_size_cc3m = 64
    batch_size_webvid = 64
    batch_size_msrvtt = 0
    num_samples_cc3m = -1
    num_samples_webvid = -1
    num_workers = 8
    use_cpu = false
    gradient_checkpointing = false
    masking_prob = 0.15
    image_res = 224
    max_cap_len = 32
    seed = 77
    vindlu_path = pretrained/25M-pretrain.pth

    # Optimizer & Scheduler
    optimizer = adamW
    opt_betas = [0.9, 0.999]
    lr = 1e-4
    min_lr = 1e-5
    # warmup_lr = 1e-6
    min_lr_multi = 0
    # num_warmup_steps = 5000
    warmup_epochs = 1

    weight_decay = 0.01
    clip_grad_value = 1.0
    use_different_lr = false
    diff_lr = 0

    scheduler = linear # cosine / linear

    # scheduler = linear_warmup_cosine_lr # constant_lr / linear_warmup_cosine_lr / linear_warmup_step_lr
    epochs = 10
    fp16 = true
    vit_precision = fp16
    loss_names = [stc, stm, vcc, vcm, mlm]
    loss_weights = [0.0, 0.0, 1.0, 1.0, 1.0]
    accum_grad_every = 1

    # Logging & Checkpointing
    log_dir = logs/stage_1
    log_dir_tokenizer = tokenizers/
    pretrained_path = none
    log_freq = 1
    wandb_project = V2Dial
    master_port = 5000

    # Evaluation
    evaluate = false
    eval_offload = true
    eval_k_test = 128
    stop_key = tot
}
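The .conf files use HOCON syntax. A sketch of reading the stage-1 block, assuming a HOCON parser such as pyhocon (the repo's actual loader is not part of this excerpt):

    from pyhocon import ConfigFactory

    config = ConfigFactory.parse_file("config/v2dial_stage_1.conf")["stage_1"]
    print(config["expert_size"])  # large
    print(config["loss_names"])   # ['stc', 'stm', 'vcc', 'vcm', 'mlm']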
config/v2dial_stage_2.conf (Executable file, +113)
@@ -0,0 +1,113 @@
stage_2 {
    #################################################################################
    # datasets
    preextracted = false
    media_train = [champagne]
    # champagne
    root_raw_vis_champagne_train = to_fill
    mapping_path_champagne_train = to_fill

    root_raw_vis_champagne_val = to_fill
    mapping_path_champagne_val = to_fill

    num_val_samples = 16

    # Model
    embed_from_llm = true # if true, use the embedding and tokenizer from the llm
    use_lora_llm = false
    use_lora_experts = false
    vit_token_pooling = true
    expert_size = large
    use_decoder_only = false
    lora_config = config/config_lora.json
    use_residuals = false
    hidden_size_moe = 1024
    num_moe_layers = 12
    num_moe_modality_layers = 9
    num_moe_attention_head = 8

    bert_config_large = config/config_bert_large.json
    bert_config_base = config/config_bert_base.json
    beit_config = ./config/config_beit.json
    vit_model = eva_clip_g
    text_dim_base = 768
    text_dim_large = 1024
    vis_dim_base = 768
    vis_dim_large = 1024
    joint_dim = 256
    num_temporal_query_tokens_base = 32
    num_spatial_query_tokens_base = 32
    num_temporal_query_tokens_large = 32
    num_spatial_query_tokens_large = 32
    beit_add_ln = true
    temperature = 0.07
    gradient_checkpointing = false
    masking_prob = 0.15
    image_res = 224
    pre_train = true
    vindlu_path = pretrained/25M-pretrain.pth

    bypass_text_expert = false
    freeze_vit = true
    freeze_llm = true
    tie_embeddings = true
    num_frames = 4

    llm_family = flan_t5
    llm_name = google/flan-t5-large

    # Training
    pre_train = true # if true, perform mlm
    batch_size_champagne = 8
    num_samples_champagne = -1
    num_workers = 8
    use_cpu = false
    gradient_checkpointing = false
    masking_prob = 0.15
    image_res = 224
    max_text_len = 50

    seed = 77
    vindlu_path = pretrained/25M-pretrain.pth

    # Optimizer & Scheduler
    optimizer = adamW
    opt_betas = [0.9, 0.999]
    lr = 1e-4
    min_lr = 5e-5
    # warmup_lr = 1e-6
    min_lr_multi = 0
    # num_warmup_steps = 5000
    warmup_epochs = 1

    weight_decay = 0.01
    clip_grad_value = 1.0
    use_different_lr = false
    diff_lr = 0

    scheduler = linear # cosine / linear

    # scheduler = linear_warmup_cosine_lr # constant_lr / linear_warmup_cosine_lr / linear_warmup_step_lr
    epochs = 5
    fp16 = true
    vit_precision = fp16
    loss_names = [stc, stm, vhc, vhm, gen]
    loss_weights = [0.0, 0.0, 0.0, 0.0, 1.0]
    accum_grad_every = 1

    # Logging & Checkpointing
    log_dir = logs/stage_2
    log_dir_tokenizer = tokenizers/
    pretrained_path = to_fill
    log_freq = 1
    wandb_project = V2Dial
    master_port = 5000

    resume = false
    pretrained_path_resume = none

    # Evaluation
    evaluate = false
    eval_offload = true
    eval_k_test = 128
    stop_key = gen
}
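Stage 2 brings in a frozen Flan-T5. A sketch of what freeze_llm = true plausibly amounts to, using standard transformers calls (the repo's own wiring is not shown in this commit):

    from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

    freeze_llm = True  # mirrors freeze_llm = true above
    tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-large")
    llm = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-large")
    if freeze_llm:
        for p in llm.parameters():
            p.requires_grad = False  # LLM weights stay fixed in this stage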
config/v2dial_stage_3.conf (Executable file, +161)
@@ -0,0 +1,161 @@
stage_3 = {

    # data
    media_train = [visdial]
    media_val = [visdial]
    media_test = visdial

    ####### AVSD #######
    root_raw_vis_avsd_train = to_fill
    root_raw_vis_avsd_val = to_fill
    root_raw_vis_avsd_test = to_fill

    anno_avsd_train = to_fill
    anno_avsd_val = to_fill
    anno_avsd_test_dstc_7 = to_fill
    anno_avsd_test_dstc_8 = to_fill
    anno_avsd_test_dstc_10 = to_fill

    dstc = 7
    num_hist_turns_avsd = 10

    ####### VisDial #######
    root_raw_vis_visdial_train = to_fill
    root_raw_vis_visdial_val = to_fill
    root_raw_vis_visdial_test = to_fill
    # anno_visdial_train = /scratch/abdessaied/data/visdial_v1.0/annotations/visdial_1.0_train.json
    anno_visdial_train = to_fill
    anno_visdial_val = to_fill
    anno_visdial_test = to_fill
    num_hist_turns_visdial = 3

    # Model
    embed_from_llm = true # if true, use the embedding and tokenizer from the llm
    use_lora_llm = false
    vit_token_pooling = true
    expert_size = large
    use_decoder_only = false
    lora_config = config/config_lora.json
    use_residuals = false

    use_moes = true
    use_sep_spatial_temp_experts = true
    drop_vis_features = false

    hidden_size_moe = 1024
    num_moe_layers = 12
    num_moe_modality_layers = 9
    num_moe_attention_head = 8

    bert_config_large = config/config_bert_large.json
    bert_config_base = config/config_bert_base.json
    beit_config = ./config/config_beit.json
    vit_model = eva_clip_g
    text_dim_base = 768
    text_dim_large = 1024
    vis_dim_base = 768
    vis_dim_large = 1024
    joint_dim = 256
    num_temporal_query_tokens_base = 32
    num_spatial_query_tokens_base = 32
    num_temporal_query_tokens_large = 32
    num_spatial_query_tokens_large = 32
    beit_add_ln = true
    temperature = 0.07
    gradient_checkpointing = false
    masking_prob = 0.15
    image_res = 224
    pre_train = true
    vindlu_path = pretrained/25M-pretrain.pth

    bypass_text_expert = false
    freeze_vit = true
    freeze_llm = false
    tie_embeddings = true
    num_frames = 4

    llm_family = flan_t5 # bart, flan_t5, llama
    llm_name = google/flan-t5-large

    # Training
    pre_train = true # if true, perform mlm
    batch_size_nextqa = 12
    batch_size_avsd = 6
    batch_size_visdial = 16

    batch_size_test_avsd = 1
    batch_size_test_visdial = 1
    batch_size_test_nextqa = 1

    num_samples_avsd = -1
    num_samples_visdial = -1
    num_samples_nextqa = -1

    num_workers = 8
    use_cpu = false
    gradient_checkpointing = false
    masking_prob = 0.15
    image_res = 224
    max_text_len = 70

    seed = 77
    vindlu_path = pretrained/25M-pretrain.pth

    # Optimizer & Scheduler
    optimizer = adamW
    opt_betas = [0.9, 0.999]
    lr = 1e-4
    min_lr = 5e-5
    # warmup_lr = 1e-6
    min_lr_multi = 0
    # num_warmup_steps = 5000
    warmup_epochs = 1

    weight_decay = 0.01
    clip_grad_value = 1.0
    use_different_lr = false
    diff_lr = 0

    scheduler = linear # cosine / linear

    # scheduler = linear_warmup_cosine_lr # constant_lr / linear_warmup_cosine_lr / linear_warmup_step_lr
    epochs = 12
    fp16 = false
    vit_precision = fp32
    loss_names = [stc, stm, vhc, vhm, gen]
    loss_weights = [0.0, 0.0, 0.0, 0.0, 1.0]
    accum_grad_every = 1

    # Logging & Checkpointing
    log_dir = logs/stage_3
    log_dir_tokenizer = tokenizers/
    pretrained_path = none
    log_freq = 1
    wandb_project = V2Dial
    master_port = 5001

    resume = false
    pretrained_path_resume = none

    best_ckpt_path = to_fill

    # Evaluation
    evaluate = false
    eval_offload = true
    eval_k_test = 128
    stop_key = gen

    output_dir_avsd_7 = output/dstc7
    output_dir_avsd_8 = output/dstc8
    output_dir_avsd_10 = output/dstc10
    output_dir_nextqa = output/nextqa

    # Generation
    beam_depth = 5
    max_generation_length = 20
    min_generation_length = 1
    length_penalty = 0.3
    top_p = 1.0
    temperature = 1.0
}
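The generation block maps naturally onto Hugging Face generate kwargs. A hedged sketch; the prompt and decoding call are illustrative, not taken from the repo:

    from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-large")
    llm = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-large")
    inputs = tokenizer("What is the man in the video doing?", return_tensors="pt")
    outputs = llm.generate(
        **inputs,
        num_beams=5,        # beam_depth
        max_new_tokens=20,  # max_generation_length
        min_new_tokens=1,   # min_generation_length
        length_penalty=0.3,
        top_p=1.0,          # top_p/temperature only matter if sampling
        temperature=1.0,    # is enabled; with pure beam search they are inert
    )
    print(tokenizer.decode(outputs[0], skip_special_tokens=True))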