initial commit

Andreas Bulling 2025-07-10 07:31:58 +02:00
commit 7be61f8c6d
137 changed files with 33491 additions and 0 deletions

config/config_bert_base.json Executable file

@@ -0,0 +1,23 @@
{
"architectures": [
"BertForMaskedLM"
],
"attention_probs_dropout_prob": 0.1,
"hidden_act": "gelu",
"hidden_dropout_prob": 0.1,
"hidden_size": 768,
"initializer_range": 0.02,
"intermediate_size": 3072,
"layer_norm_eps": 1e-12,
"max_position_embeddings": 512,
"model_type": "bert",
"num_attention_heads": 12,
"num_hidden_layers": 12,
"pad_token_id": 0,
"type_vocab_size": 2,
"vocab_size": 30522,
"fusion_layer": 9,
"encoder_width": 768,
"cross_module": "ca"
}
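The last three keys (fusion_layer, encoder_width, cross_module) are not standard BertConfig fields; the model code presumably reads them as extra attributes. A minimal sketch, assuming the file is consumed through Hugging Face's BertConfig (the repository may well use its own config class):

```python
from transformers import BertConfig

# Hedged sketch: BertConfig keeps unknown JSON keys such as fusion_layer,
# encoder_width and cross_module as plain attributes on the config object.
config = BertConfig.from_json_file("config/config_bert_base.json")
print(config.hidden_size, config.num_hidden_layers)  # 768 12
print(config.fusion_layer, config.cross_module)      # 9 ca
```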

config/config_bert_large.json Executable file

@@ -0,0 +1,26 @@
{
"architectures": [
"BertForMaskedLM"
],
"attention_probs_dropout_prob": 0.1,
"gradient_checkpointing": false,
"hidden_act": "gelu",
"hidden_dropout_prob": 0.1,
"hidden_size": 1024,
"initializer_range": 0.02,
"intermediate_size": 4096,
"layer_norm_eps": 1e-12,
"max_position_embeddings": 512,
"model_type": "bert",
"num_attention_heads": 16,
"num_hidden_layers": 24,
"pad_token_id": 0,
"position_embedding_type": "absolute",
"type_vocab_size": 2,
"use_cache": true,
"vocab_size": 30522,
"fusion_layer": 19,
"encoder_width": 1024,
"cross_module": "ca"
}
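The large variant mirrors the base one at scaled dimensions (hidden size 1024, 24 layers, fusion at layer 19 instead of 9). The stage configs below carry both paths together with an expert_size switch, so selection presumably looks something like the following sketch; the dictionary and variable names are illustrative, not taken from the repository:

```python
from transformers import BertConfig

# Illustrative only: pick the expert config according to expert_size from the stage .conf.
BERT_CONFIG_PATHS = {
    "base": "config/config_bert_base.json",    # bert_config_base
    "large": "config/config_bert_large.json",  # bert_config_large
}
expert_size = "large"  # all three stage configs set expert_size = large
expert_config = BertConfig.from_json_file(BERT_CONFIG_PATHS[expert_size])
print(expert_config.fusion_layer)  # 19 for large, 9 for base
```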

config/config_lora.json Executable file

@@ -0,0 +1,8 @@
{
"r": 8,
"lora_alpha": 16,
"bias": "none",
"use_rslora": true,
"lora_dropout": 0.05,
"task_type":"CAUSAL_LM"
}
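These keys line up with peft's LoraConfig arguments. A hedged sketch of turning the JSON into a LoraConfig; note that target_modules is absent from the file, so the projection names below are a hypothetical placeholder rather than the repository's actual choice:

```python
import json

from peft import LoraConfig, get_peft_model

with open("config/config_lora.json") as f:
    lora_kwargs = json.load(f)

# target_modules is not in the JSON; ["q", "v"] is a placeholder, not the repo's setting.
lora_config = LoraConfig(target_modules=["q", "v"], **lora_kwargs)
# model = get_peft_model(model, lora_config)  # `model` would be the LLM or the expert stack
```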

config/ds_config.json Executable file

@@ -0,0 +1,43 @@
{
"train_micro_batch_size_per_gpu": 8,
"gradient_accumulation_steps": 1,
"optimizer":{
"type": "Adam",
"params": {
"lr": 1e-4,
"betas": [0.9, 0.999],
"weight_decay": 0.02,
"adam_w_mode": true
}
},
"scheduler":{
"type": "WarmupCosineLR",
"params":{}
},
"fp16": {
"enabled": false,
"auto_cast": false,
"loss_scale": 0,
"initial_scale_power": 16,
"loss_scale_window": 1000,
"hysteresis": 2,
"consecutive_hysteresis": false,
"min_loss_scale": 1
},
"zero_optimization": {
"stage": 0,
"overlap_comm": true,
"contiguous_gradients": true,
"sub_group_size": 1e9,
"reduce_bucket_size": "auto",
"stage3_prefetch_bucket_size": "auto",
"stage3_param_persistence_threshold": "auto",
"stage3_max_live_parameters": 1e9,
"stage3_max_reuse_distance": 1e9,
"stage3_gather_16bit_weights_on_model_save": true
},
"wandb":{
"enabled": true,
"project": "V2Dial"
}
}
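The scheduler block ships with empty params and several ZeRO fields are set to "auto", so the training script presumably resolves those before handing the dict to DeepSpeed. A minimal sketch under that assumption, with placeholder step counts and bucket sizes and a toy model standing in for V2Dial (normally launched via the deepspeed launcher):

```python
import json

import deepspeed
import torch

# Load the shipped config and fill in the fields DeepSpeed cannot take verbatim:
# the empty WarmupCosineLR params and the "auto" ZeRO bucket sizes (placeholders here).
with open("config/ds_config.json") as f:
    ds_config = json.load(f)
ds_config["scheduler"]["params"] = {"total_num_steps": 10_000, "warmup_num_steps": 1_000}
for key in ("reduce_bucket_size",
            "stage3_prefetch_bucket_size",
            "stage3_param_persistence_threshold"):
    ds_config["zero_optimization"][key] = int(5e8)

model = torch.nn.Linear(768, 768)  # stand-in for the actual model
engine, optimizer, _, scheduler = deepspeed.initialize(
    model=model,
    model_parameters=model.parameters(),
    config=ds_config,
)
```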

config/v2dial_stage_1.conf Executable file

@@ -0,0 +1,116 @@
stage_1 {
#################################################################################
# datasets
preextracted = false
# webvid
root_raw_vis_webvid_train = to_fill
mapping_path_webvid_train = to_fill
root_raw_vis_webvid_val = to_fill
mapping_path_webvid_val = to_fill
# cc3m
root_raw_vis_cc3m_train = to_fill
mapping_path_cc3m_train = to_fill
root_raw_vis_cc3m_val = to_fill
mapping_path_cc3m_val = to_fill
# Model
embed_from_llm = false
use_lora_llm = true
use_lora_experts = false
vit_token_pooling = true
expert_size = large
use_decoder_only = false
lora_config = config/config_lora.json
use_residuals = false
use_moes = true
use_sep_spatial_temp_experts = false
hidden_size_moe = 1024
num_moe_layers = 12
num_moe_modality_layers = 9
num_moe_attention_head = 8
bert_config_large = config/config_bert_large.json
bert_config_base = config/config_bert_base.json
vit_model = eva_clip_g
text_dim_base = 768
text_dim_large = 1024
vis_dim_base = 768
vis_dim_large = 1024
joint_dim = 256
num_temporal_query_tokens_base = 32
num_spatial_query_tokens_base = 32
num_temporal_query_tokens_large = 32
num_spatial_query_tokens_large = 32
beit_add_ln = true
temperature = 0.07
gradient_checkpointing = false
masking_prob = 0.15
image_res = 224
pre_train = true
vindlu_path = pretrained/25M-pretrain.pth
bypass_text_expert = false
freeze_vit = true
freeze_llm = true
tie_embeddings = true
num_frames = 4
llm_name = no_llm
# Training
pre_train = true # if true, perform MLM
batch_size_cc3m = 64
batch_size_webvid = 64
batch_size_msrvtt = 0
num_samples_cc3m = -1
num_samples_webvid = -1
num_workers = 8
use_cpu = false
gradient_checkpointing = false
masking_prob = 0.15
image_res = 224
max_cap_len = 32
seed = 77
vindlu_path = pretrained/25M-pretrain.pth
# Optimizer & Scheduler
optimizer = adamW
opt_betas = [0.9, 0.999]
lr = 1e-4
min_lr = 1e-5
# warmup_lr = 1e-6
min_lr_multi = 0
# num_warmup_steps = 5000
warmup_epochs = 1
weight_decay = 0.01
clip_grad_value = 1.0
use_different_lr = false
diff_lr = 0
scheduler = linear # cosine / linear
# scheduler = linear_warmup_cosine_lr # constant_lr / linear_warmup_cosine_lr / linear_warmup_step_lr
epochs = 10
fp16 = true
vit_precision = fp16
loss_names = [stc, stm, vcc, vcm, mlm]
loss_weights = [0.0, 0.0, 1.0, 1.0, 1.0]
accum_grad_every = 1
# Logging & Checkpointing
log_dir = logs/stage_1
log_dir_tokenizer = tokenizers/
pretrained_path = none
log_freq = 1
wandb_project = V2Dial
master_port = 5000
# Evaluation
evaluate = false
eval_offload = true
eval_k_test = 128
stop_key = tot
}
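The stage files use HOCON-style syntax (bare keys, # comments, unquoted lists), so reading them with pyhocon would look roughly like the sketch below; pyhocon itself is an assumption, not something this commit confirms:

```python
from pyhocon import ConfigFactory

# Hedged sketch: parse the stage-1 block and inspect a few representative keys.
conf = ConfigFactory.parse_file("config/v2dial_stage_1.conf")["stage_1"]
print(conf["expert_size"])   # 'large'
print(conf["loss_names"])    # ['stc', 'stm', 'vcc', 'vcm', 'mlm']
print(conf["loss_weights"])  # [0.0, 0.0, 1.0, 1.0, 1.0]
```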

config/v2dial_stage_2.conf Executable file

@@ -0,0 +1,113 @@
stage_2 {
#################################################################################
# datasets
preextracted = false
media_train = [champagne]
# champagne
root_raw_vis_champagne_train = to_fill
mapping_path_champagne_train = to_fill
root_raw_vis_champagne_val = to_fill
mapping_path_champagne_val = to_fill
num_val_samples = 16
# Model
embed_from_llm = true # if true, use the embedding and tokenizer from the llm
use_lora_llm = false
use_lora_experts = false
vit_token_pooling = true
expert_size = large
use_decoder_only = false
lora_config = config/config_lora.json
use_residuals = false
hidden_size_moe = 1024
num_moe_layers = 12
num_moe_modality_layers = 9
num_moe_attention_head = 8
bert_config_large = config/config_bert_large.json
bert_config_base = config/config_bert_base.json
beit_config = ./config/config_beit.json
vit_model = eva_clip_g
text_dim_base = 768
text_dim_large = 1024
vis_dim_base = 768
vis_dim_large = 1024
joint_dim = 256
num_temporal_query_tokens_base = 32
num_spatial_query_tokens_base = 32
num_temporal_query_tokens_large = 32
num_spatial_query_tokens_large = 32
beit_add_ln = true
temperature = 0.07
gradient_checkpointing = false
masking_prob = 0.15
image_res = 224
pre_train = true
vindlu_path = pretrained/25M-pretrain.pth
bypass_text_expert = false
freeze_vit = true
freeze_llm = true
tie_embeddings = true
num_frames = 4
llm_family = flan_t5
llm_name = google/flan-t5-large
# Training
pre_train = true # if true, perform MLM
batch_size_champagne = 8 # 8
num_samples_champagne = -1
num_workers = 8
use_cpu = false
gradient_checkpointing = false
masking_prob = 0.15
image_res = 224
max_text_len = 50
seed = 77
vindlu_path = pretrained/25M-pretrain.pth
# Optimizer & Scheduler
optimizer = adamW
opt_betas = [0.9, 0.999]
lr = 1e-4
min_lr = 5e-5
# warmup_lr = 1e-6
min_lr_multi = 0
# num_warmup_steps = 5000
warmup_epochs = 1
weight_decay = 0.01
clip_grad_value = 1.0
use_different_lr = false
diff_lr = 0
scheduler = linear # cosine / linear
# scheduler = linear_warmup_cosine_lr # constant_lr / linear_warmup_cosine_lr / linear_warmup_step_lr
epochs = 5
fp16 = true
vit_precision = fp16
loss_names = [stc, stm, vhc, vhm, gen]
loss_weights = [0.0, 0.0, 0.0, 0.0, 1.0]
accum_grad_every = 1
# Logging & Checkpointing
log_dir = logs/stage_2
log_dir_tokenizer = tokenizers/
pretrained_path = to_fill
log_freq = 1
wandb_project = V2Dial
master_port = 5000
resume = false
pretrained_path_resume = none
# Evaluation
evaluate = false
eval_offload = true
eval_k_test = 128
stop_key = gen
}
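Stage 2 switches to an LLM-backed setup (llm_family = flan_t5, llm_name = google/flan-t5-large) with embed_from_llm = true and freeze_llm = true. A hedged sketch of what those flags imply, assuming the LLM is loaded through Hugging Face transformers:

```python
from transformers import AutoTokenizer, T5ForConditionalGeneration

# Sketch only: load the stage-2 LLM and mirror the freeze_llm / embed_from_llm flags.
llm_name = "google/flan-t5-large"
tokenizer = AutoTokenizer.from_pretrained(llm_name)   # embed_from_llm: reuse the LLM tokenizer
llm = T5ForConditionalGeneration.from_pretrained(llm_name)

freeze_llm = True  # freeze_llm = true in the stage-2 block
if freeze_llm:
    for p in llm.parameters():
        p.requires_grad = False

text_embeddings = llm.get_input_embeddings()  # embed text with the LLM's embedding table
```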

config/v2dial_stage_3.conf Executable file

@@ -0,0 +1,161 @@
stage_3 = {
# data
media_train = [visdial]
media_val = [visdial]
media_test = visdial
####### AVSD #######
root_raw_vis_avsd_train = to_fill
root_raw_vis_avsd_val = to_fill
root_raw_vis_avsd_test = to_fill
anno_avsd_train = to_fill
anno_avsd_val = to_fill
anno_avsd_test_dstc_7 = to_fill
anno_avsd_test_dstc_8 = to_fill
anno_avsd_test_dstc_10 = to_fill
dstc = 7
num_hist_turns_avsd = 10
####### VisDial #######
root_raw_vis_visdial_train = to_fill
root_raw_vis_visdial_val = to_fill
root_raw_vis_visdial_test = to_fill
# anno_visdial_train = /scratch/abdessaied/data/visdial_v1.0/annotations/visdial_1.0_train.json
anno_visdial_train = to_fill
anno_visdial_val = to_fill
anno_visdial_test = to_fill
num_hist_turns_visdial = 3
# Model
embed_from_llm = true # if true, use the embedding and tokenizer from the llm
use_lora_llm = false
vit_token_pooling = true
expert_size = large
use_decoder_only = false
lora_config = config/config_lora.json
use_residuals = false
use_moes = true
use_sep_spatial_temp_experts = true
drop_vis_features = false
hidden_size_moe = 1024
num_moe_layers = 12
num_moe_modality_layers = 9
num_moe_attention_head = 8
bert_config_large = config/config_bert_large.json
bert_config_base = config/config_bert_base.json
beit_config = ./config/config_beit.json
vit_model = eva_clip_g
text_dim_base = 768
text_dim_large = 1024
vis_dim_base = 768
vis_dim_large = 1024
joint_dim = 256
num_temporal_query_tokens_base = 32
num_spatial_query_tokens_base = 32
num_temporal_query_tokens_large = 32
num_spatial_query_tokens_large = 32
beit_add_ln = true
temperature = 0.07
gradient_checkpointing = false
masking_prob = 0.15
image_res = 224
pre_train = true
vindlu_path = pretrained/25M-pretrain.pth
bypass_text_expert = false
freeze_vit = true
freeze_llm = false
tie_embeddings = true
num_frames = 4
llm_family = flan_t5 # bart, flan_t5, llama
llm_name = google/flan-t5-large
# Training
pre_train = true # if true, perform MLM
batch_size_nextqa = 12
batch_size_avsd = 6
batch_size_visdial = 16
batch_size_test_avsd = 1
batch_size_test_visdial = 1
batch_size_test_nextqa = 1
num_samples_avsd = -1
num_samples_visdial = -1
num_samples_nextqa = -1
num_workers = 8
use_cpu = false
gradient_checkpointing = false
masking_prob = 0.15
image_res = 224
max_text_len = 70
seed = 77
vindlu_path = pretrained/25M-pretrain.pth
# Optimizer & Scheduler
optimizer = adamW
opt_betas = [0.9, 0.999]
lr = 1e-4
min_lr = 5e-5
# warmup_lr = 1e-6
min_lr_multi = 0
# num_warmup_steps = 5000
warmup_epochs = 1
weight_decay = 0.01
clip_grad_value = 1.0
use_different_lr = false
diff_lr = 0
scheduler = linear # cosine / linear
# scheduler = linear_warmup_cosine_lr # constant_lr / linear_warmup_cosine_lr / linear_warmup_step_lr
epochs = 12
fp16 = false
vit_precision = fp32
loss_names = [stc, stm, vhc, vhm, gen]
loss_weights = [0.0, 0.0, 0.0, 0.0, 1.0]
accum_grad_every = 1
# Logging & Checkpointing
log_dir = logs/stage_3
log_dir_tokenizer = tokenizers/
pretrained_path = none
log_freq = 1
wandb_project = V2Dial
master_port = 5001
resume = false
pretrained_path_resume = none
best_ckpt_path = to_fill
# Evaluation
evaluate = false
eval_offload = true
eval_k_test = 128
stop_key = gen
output_dir_avsd_7 = output/dstc7
output_dir_avsd_8 = output/dstc8
output_dir_avsd_10 = output/dstc10
output_dir_nextqa = output/nextqa
# Generation
beam_depth = 5
max_generation_length = 20
min_generation_length = 1
length_penalty = 0.3
top_p = 1.0
temperature = 1.0
}
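The generation block maps directly onto transformers' generate() arguments. A minimal sketch of that mapping, assuming the Flan-T5 backbone does the decoding; the real model additionally conditions on the visual experts, which is omitted here:

```python
from transformers import AutoTokenizer, T5ForConditionalGeneration

# Sketch: stage-3 generation settings expressed as generate() kwargs.
tok = AutoTokenizer.from_pretrained("google/flan-t5-large")
llm = T5ForConditionalGeneration.from_pretrained("google/flan-t5-large")

inputs = tok("Question: what is the person in the video doing?", return_tensors="pt")
out = llm.generate(
    **inputs,
    num_beams=5,          # beam_depth
    max_new_tokens=20,    # max_generation_length
    min_new_tokens=1,     # min_generation_length
    length_penalty=0.3,
    top_p=1.0,            # only takes effect if sampling were enabled
    temperature=1.0,      # likewise; kept to mirror the config
)
print(tok.decode(out[0], skip_special_tokens=True))
```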