initial commit

Andreas Bulling 2025-07-10 07:31:58 +02:00
commit 7be61f8c6d
137 changed files with 33491 additions and 0 deletions

config/config_bert_base.json Executable file

@@ -0,0 +1,23 @@
{
"architectures": [
"BertForMaskedLM"
],
"attention_probs_dropout_prob": 0.1,
"hidden_act": "gelu",
"hidden_dropout_prob": 0.1,
"hidden_size": 768,
"initializer_range": 0.02,
"intermediate_size": 3072,
"layer_norm_eps": 1e-12,
"max_position_embeddings": 512,
"model_type": "bert",
"num_attention_heads": 12,
"num_hidden_layers": 12,
"pad_token_id": 0,
"type_vocab_size": 2,
"vocab_size": 30522,
"fusion_layer": 9,
"encoder_width": 768,
"cross_module": "ca"
}
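The last three keys (fusion_layer, encoder_width, cross_module) are not standard BertConfig fields; the model code presumably reads them as extra attributes. A minimal sketch, assuming the file is consumed through Hugging Face's BertConfig (the repository may well use its own config class):

```python
from transformers import BertConfig

# Hedged sketch: BertConfig keeps unknown JSON keys such as fusion_layer,
# encoder_width and cross_module as plain attributes on the config object.
config = BertConfig.from_json_file("config/config_bert_base.json")
print(config.hidden_size, config.num_hidden_layers)  # 768 12
print(config.fusion_layer, config.cross_module)      # 9 ca
```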

config/config_bert_large.json Executable file

@@ -0,0 +1,26 @@
{
"architectures": [
"BertForMaskedLM"
],
"attention_probs_dropout_prob": 0.1,
"gradient_checkpointing": false,
"hidden_act": "gelu",
"hidden_dropout_prob": 0.1,
"hidden_size": 1024,
"initializer_range": 0.02,
"intermediate_size": 4096,
"layer_norm_eps": 1e-12,
"max_position_embeddings": 512,
"model_type": "bert",
"num_attention_heads": 16,
"num_hidden_layers": 24,
"pad_token_id": 0,
"position_embedding_type": "absolute",
"type_vocab_size": 2,
"use_cache": true,
"vocab_size": 30522,
"fusion_layer": 19,
"encoder_width": 1024,
"cross_module": "ca"
}
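The large variant mirrors the base one at scaled dimensions (hidden size 1024, 24 layers, fusion at layer 19 instead of 9). The stage configs below carry both paths together with an expert_size switch, so selection presumably looks something like the following sketch; the dictionary and variable names are illustrative, not taken from the repository:

```python
from transformers import BertConfig

# Illustrative only: pick the expert config according to expert_size from the stage .conf.
BERT_CONFIG_PATHS = {
    "base": "config/config_bert_base.json",    # bert_config_base
    "large": "config/config_bert_large.json",  # bert_config_large
}
expert_size = "large"  # all three stage configs set expert_size = large
expert_config = BertConfig.from_json_file(BERT_CONFIG_PATHS[expert_size])
print(expert_config.fusion_layer)  # 19 for large, 9 for base
```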

config/config_lora.json Executable file

@@ -0,0 +1,8 @@
{
"r": 8,
"lora_alpha": 16,
"bias": "none",
"use_rslora": true,
"lora_dropout": 0.05,
"task_type":"CAUSAL_LM"
}
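These keys line up with peft's LoraConfig arguments. A hedged sketch of turning the JSON into a LoraConfig; note that target_modules is absent from the file, so the projection names below are a hypothetical placeholder rather than the repository's actual choice:

```python
import json

from peft import LoraConfig, get_peft_model

with open("config/config_lora.json") as f:
    lora_kwargs = json.load(f)

# target_modules is not in the JSON; ["q", "v"] is a placeholder, not the repo's setting.
lora_config = LoraConfig(target_modules=["q", "v"], **lora_kwargs)
# model = get_peft_model(model, lora_config)  # `model` would be the LLM or the expert stack
```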

config/ds_config.json Executable file

@@ -0,0 +1,43 @@
{
"train_micro_batch_size_per_gpu": 8,
"gradient_accumulation_steps": 1,
"optimizer":{
"type": "Adam",
"params": {
"lr": 1e-4,
"betas": [0.9, 0.999],
"weight_decay": 0.02,
"adam_w_mode": true
}
},
"scheduler":{
"type": "WarmupCosineLR",
"params":{}
},
"fp16": {
"enabled": false,
"auto_cast": false,
"loss_scale": 0,
"initial_scale_power": 16,
"loss_scale_window": 1000,
"hysteresis": 2,
"consecutive_hysteresis": false,
"min_loss_scale": 1
},
"zero_optimization": {
"stage": 0,
"overlap_comm": true,
"contiguous_gradients": true,
"sub_group_size": 1e9,
"reduce_bucket_size": "auto",
"stage3_prefetch_bucket_size": "auto",
"stage3_param_persistence_threshold": "auto",
"stage3_max_live_parameters": 1e9,
"stage3_max_reuse_distance": 1e9,
"stage3_gather_16bit_weights_on_model_save": true
},
"wandb":{
"enabled": true,
"project": "V2Dial"
}
}
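The scheduler block ships with empty params and several ZeRO fields are set to "auto", so the training script presumably resolves those before handing the dict to DeepSpeed. A minimal sketch under that assumption, with placeholder step counts and bucket sizes and a toy model standing in for V2Dial (normally launched via the deepspeed launcher):

```python
import json

import deepspeed
import torch

# Load the shipped config and fill in the fields DeepSpeed cannot take verbatim:
# the empty WarmupCosineLR params and the "auto" ZeRO bucket sizes (placeholders here).
with open("config/ds_config.json") as f:
    ds_config = json.load(f)
ds_config["scheduler"]["params"] = {"total_num_steps": 10_000, "warmup_num_steps": 1_000}
for key in ("reduce_bucket_size",
            "stage3_prefetch_bucket_size",
            "stage3_param_persistence_threshold"):
    ds_config["zero_optimization"][key] = int(5e8)

model = torch.nn.Linear(768, 768)  # stand-in for the actual model
engine, optimizer, _, scheduler = deepspeed.initialize(
    model=model,
    model_parameters=model.parameters(),
    config=ds_config,
)
```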

config/v2dial_stage_1.conf Executable file

@@ -0,0 +1,116 @@
stage_1 {
#################################################################################
# datasets
preextracted = false
# webvid
root_raw_vis_webvid_train = to_fill
mapping_path_webvid_train = to_fill
root_raw_vis_webvid_val = to_fill
mapping_path_webvid_val = to_fill
# cc3m
root_raw_vis_cc3m_train = to_fill
mapping_path_cc3m_train = to_fill
root_raw_vis_cc3m_val = to_fill
mapping_path_cc3m_val = to_fill
# Model
embed_from_llm = false
use_lora_llm = true
use_lora_experts = false
vit_token_pooling = true
expert_size = large
use_decoder_only = false
lora_config = config/config_lora.json
use_residuals = false
use_moes = true
use_sep_spatial_temp_experts = false
hidden_size_moe = 1024
num_moe_layers = 12
num_moe_modality_layers = 9
num_moe_attention_head = 8
bert_config_large = config/config_bert_large.json
bert_config_base = config/config_bert_base.json
vit_model = eva_clip_g
text_dim_base = 768
text_dim_large = 1024
vis_dim_base = 768
vis_dim_large = 1024
joint_dim = 256
num_temporal_query_tokens_base = 32
num_spatial_query_tokens_base = 32
num_temporal_query_tokens_large = 32
num_spatial_query_tokens_large = 32
beit_add_ln = true
temperature = 0.07
gradient_checkpointing = false
masking_prob = 0.15
image_res = 224
pre_train = true
vindlu_path = pretrained/25M-pretrain.pth
bypass_text_expert = false
freeze_vit = true
freeze_llm = true
tie_embeddings = true
num_frames = 4
llm_name = no_llm
# Training
pre_train = true # if true, perform MLM
batch_size_cc3m = 64
batch_size_webvid = 64
batch_size_msrvtt = 0
num_samples_cc3m = -1
num_samples_webvid = -1
num_workers = 8
use_cpu = false
gradient_checkpointing = false
masking_prob = 0.15
image_res = 224
max_cap_len = 32
seed = 77
vindlu_path = pretrained/25M-pretrain.pth
# Optimizer & Scheduler
optimizer = adamW
opt_betas = [0.9, 0.999]
lr = 1e-4
min_lr = 1e-5
# warmup_lr = 1e-6
min_lr_multi = 0
# num_warmup_steps = 5000
warmup_epochs = 1
weight_decay = 0.01
clip_grad_value = 1.0
use_different_lr = false
diff_lr = 0
scheduler = linear # cosine / linear
# scheduler = linear_warmup_cosine_lr # constant_lr / linear_warmup_cosine_lr / linear_warmup_step_lr
epochs = 10
fp16 = true
vit_precision = fp16
loss_names = [stc, stm, vcc, vcm, mlm]
loss_weights = [0.0, 0.0, 1.0, 1.0, 1.0]
accum_grad_every = 1
# Logging & Checkpointing
log_dir = logs/stage_1
log_dir_tokenizer = tokenizers/
pretrained_path = none
log_freq = 1
wandb_project = V2Dial
master_port = 5000
# Evaluation
evaluate = false
eval_offload = true
eval_k_test = 128
stop_key = tot
}
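The stage files use HOCON-style syntax (bare keys, # comments, unquoted lists), so reading them with pyhocon would look roughly like the sketch below; pyhocon itself is an assumption, not something this commit confirms:

```python
from pyhocon import ConfigFactory

# Hedged sketch: parse the stage-1 block and inspect a few representative keys.
conf = ConfigFactory.parse_file("config/v2dial_stage_1.conf")["stage_1"]
print(conf["expert_size"])   # 'large'
print(conf["loss_names"])    # ['stc', 'stm', 'vcc', 'vcm', 'mlm']
print(conf["loss_weights"])  # [0.0, 0.0, 1.0, 1.0, 1.0]
```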

config/v2dial_stage_2.conf Executable file

@@ -0,0 +1,113 @@
stage_2 {
#################################################################################
# datasets
preextracted = false
media_train = [champagne]
# champagne
root_raw_vis_champagne_train = to_fill
mapping_path_champagne_train = to_fill
root_raw_vis_champagne_val = to_fill
mapping_path_champagne_val = to_fill
num_val_samples = 16
# Model
embed_from_llm = true # if true, use the embedding and tokenizer from the llm
use_lora_llm = false
use_lora_experts = false
vit_token_pooling = true
expert_size = large
use_decoder_only = false
lora_config = config/config_lora.json
use_residuals = false
hidden_size_moe = 1024
num_moe_layers = 12
num_moe_modality_layers = 9
num_moe_attention_head = 8
bert_config_large = config/config_bert_large.json
bert_config_base = config/config_bert_base.json
beit_config = ./config/config_beit.json
vit_model = eva_clip_g
text_dim_base = 768
text_dim_large = 1024
vis_dim_base = 768
vis_dim_large = 1024
joint_dim = 256
num_temporal_query_tokens_base = 32
num_spatial_query_tokens_base = 32
num_temporal_query_tokens_large = 32
num_spatial_query_tokens_large = 32
beit_add_ln = true
temperature = 0.07
gradient_checkpointing = false
masking_prob = 0.15
image_res = 224
pre_train = true
vindlu_path = pretrained/25M-pretrain.pth
bypass_text_expert = false
freeze_vit = true
freeze_llm = true
tie_embeddings = true
num_frames = 4
llm_family = flan_t5
llm_name = google/flan-t5-large
# Training
pre_train = true # if true, perform MLM
batch_size_champagne = 8 # 8
num_samples_champagne = -1
num_workers = 8
use_cpu = false
gradient_checkpointing = false
masking_prob = 0.15
image_res = 224
max_text_len = 50
seed = 77
vindlu_path = pretrained/25M-pretrain.pth
# Optimizer & Scheduler
optimizer = adamW
opt_betas = [0.9, 0.999]
lr = 1e-4
min_lr = 5e-5
# warmup_lr = 1e-6
min_lr_multi = 0
# num_warmup_steps = 5000
warmup_epochs = 1
weight_decay = 0.01
clip_grad_value = 1.0
use_different_lr = false
diff_lr = 0
scheduler = linear # cosine / linear
# scheduler = linear_warmup_cosine_lr # constant_lr / linear_warmup_cosine_lr / linear_warmup_step_lr
epochs = 5
fp16 = true
vit_precision = fp16
loss_names = [stc, stm, vhc, vhm, gen]
loss_weights = [0.0, 0.0, 0.0, 0.0, 1.0]
accum_grad_every = 1
# Logging & Checkpointing
log_dir = logs/stage_2
log_dir_tokenizer = tokenizers/
pretrained_path = to_fill
log_freq = 1
wandb_project = V2Dial
master_port = 5000
resume = false
pretrained_path_resume = none
# Evaluation
evaluate = false
eval_offload = true
eval_k_test = 128
stop_key = gen
}
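Stage 2 switches to an LLM-backed setup (llm_family = flan_t5, llm_name = google/flan-t5-large) with embed_from_llm = true and freeze_llm = true. A hedged sketch of what those flags imply, assuming the LLM is loaded through Hugging Face transformers:

```python
from transformers import AutoTokenizer, T5ForConditionalGeneration

# Sketch only: load the stage-2 LLM and mirror the freeze_llm / embed_from_llm flags.
llm_name = "google/flan-t5-large"
tokenizer = AutoTokenizer.from_pretrained(llm_name)   # embed_from_llm: reuse the LLM tokenizer
llm = T5ForConditionalGeneration.from_pretrained(llm_name)

freeze_llm = True  # freeze_llm = true in the stage-2 block
if freeze_llm:
    for p in llm.parameters():
        p.requires_grad = False

text_embeddings = llm.get_input_embeddings()  # embed text with the LLM's embedding table
```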

config/v2dial_stage_3.conf Executable file

@@ -0,0 +1,161 @@
stage_3 = {
# data
media_train = [visdial]
media_val = [visdial]
media_test = visdial
####### AVSD #######
root_raw_vis_avsd_train = to_fill
root_raw_vis_avsd_val = to_fill
root_raw_vis_avsd_test = to_fill
anno_avsd_train = to_fill
anno_avsd_val = to_fill
anno_avsd_test_dstc_7 = to_fill
anno_avsd_test_dstc_8 = to_fill
anno_avsd_test_dstc_10 = to_fill
dstc = 7
num_hist_turns_avsd = 10
####### VisDial #######
root_raw_vis_visdial_train = to_fill
root_raw_vis_visdial_val = to_fill
root_raw_vis_visdial_test = to_fill
# anno_visdial_train = /scratch/abdessaied/data/visdial_v1.0/annotations/visdial_1.0_train.json
anno_visdial_train = to_fill
anno_visdial_val = to_fill
anno_visdial_test = to_fill
num_hist_turns_visdial = 3
# Model
embed_from_llm = true # if true, use the embedding and tokenizer from the llm
use_lora_llm = false
vit_token_pooling = true
expert_size = large
use_decoder_only = false
lora_config = config/config_lora.json
use_residuals = false
use_moes = true
use_sep_spatial_temp_experts = true
drop_vis_features = false
hidden_size_moe = 1024
num_moe_layers = 12
num_moe_modality_layers = 9
num_moe_attention_head = 8
bert_config_large = config/config_bert_large.json
bert_config_base = config/config_bert_base.json
beit_config = ./config/config_beit.json
vit_model = eva_clip_g
text_dim_base = 768
text_dim_large = 1024
vis_dim_base = 768
vis_dim_large = 1024
joint_dim = 256
num_temporal_query_tokens_base = 32
num_spatial_query_tokens_base = 32
num_temporal_query_tokens_large = 32
num_spatial_query_tokens_large = 32
beit_add_ln = true
temperature = 0.07
gradient_checkpointing = false
masking_prob = 0.15
image_res = 224
pre_train = true
vindlu_path = pretrained/25M-pretrain.pth
bypass_text_expert = false
freeze_vit = true
freeze_llm = false
tie_embeddings = true
num_frames = 4
llm_family = flan_t5 # bart, flan_t5, llama
llm_name = google/flan-t5-large
# Training
pre_train = true # if true, perform MLM
batch_size_nextqa = 12
batch_size_avsd = 6
batch_size_visdial = 16
batch_size_test_avsd = 1
batch_size_test_visdial = 1
batch_size_test_nextqa = 1
num_samples_avsd = -1
num_samples_visdial = -1
num_samples_nextqa = -1
num_workers = 8
use_cpu = false
gradient_checkpointing = false
masking_prob = 0.15
image_res = 224
max_text_len = 70
seed = 77
vindlu_path = pretrained/25M-pretrain.pth
# Optimizer & Scheduler
optimizer = adamW
opt_betas = [0.9, 0.999]
lr = 1e-4
min_lr = 5e-5
# warmup_lr = 1e-6
min_lr_multi = 0
# num_warmup_steps = 5000
warmup_epochs = 1
weight_decay = 0.01
clip_grad_value = 1.0
use_different_lr = false
diff_lr = 0
scheduler = linear # cosine / linear
# scheduler = linear_warmup_cosine_lr # constant_lr / linear_warmup_cosine_lr / linear_warmup_step_lr
epochs = 12
fp16 = false
vit_precision = fp32
loss_names = [stc, stm, vhc, vhm, gen]
loss_weights = [0.0, 0.0, 0.0, 0.0, 1.0]
accum_grad_every = 1
# Logging & Checkpointing
log_dir = logs/stage_3
log_dir_tokenizer = tokenizers/
pretrained_path = none
log_freq = 1
wandb_project = V2Dial
master_port = 5001
resume = false
pretrained_path_resume = none
best_ckpt_path = to_fill
# Evaluation
evaluate = false
eval_offload = true
eval_k_test = 128
stop_key = gen
output_dir_avsd_7 = output/dstc7
output_dir_avsd_8 = output/dstc8
output_dir_avsd_10 = output/dstc10
output_dir_nextqa = output/nextqa
# Generation
beam_depth = 5
max_generation_length = 20
min_generation_length = 1
length_penalty = 0.3
top_p = 1.0
temperature = 1.0
}
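The generation block maps directly onto transformers' generate() arguments. A minimal sketch of that mapping, assuming the Flan-T5 backbone does the decoding; the real model additionally conditions on the visual experts, which is omitted here:

```python
from transformers import AutoTokenizer, T5ForConditionalGeneration

# Sketch: stage-3 generation settings expressed as generate() kwargs.
tok = AutoTokenizer.from_pretrained("google/flan-t5-large")
llm = T5ForConditionalGeneration.from_pretrained("google/flan-t5-large")

inputs = tok("Question: what is the person in the video doing?", return_tensors="pt")
out = llm.generate(
    **inputs,
    num_beams=5,          # beam_depth
    max_new_tokens=20,    # max_generation_length
    min_new_tokens=1,     # min_generation_length
    length_penalty=0.3,
    top_p=1.0,            # only takes effect if sampling were enabled
    temperature=1.0,      # likewise; kept to mirror the config
)
print(tok.decode(out[0], skip_special_tokens=True))
```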