Uploaded
commit 04c4625cfe
11 changed files with 1330 additions and 0 deletions
92  hf_bmt/bmt_hf.py  Normal file
@@ -0,0 +1,92 @@
import os, pdb
import json
import torch
import sys
import shutil
import argparse
from collections import OrderedDict
from transformers import AutoConfig, AutoModelForCausalLM


def transform_to_hf(bmt_model, model_size):
    # Map a BMTrain state dict onto the HuggingFace LLaMA/Mistral parameter names.
    model_hf = OrderedDict()

    if 'input_embedding.weight' in bmt_model.keys():
        model_hf['model.embed_tokens.weight'] = bmt_model["input_embedding.weight"].contiguous().float()
        model_hf['model.norm.weight'] = bmt_model["encoder.output_layernorm.weight"].contiguous().float()
        try:
            model_hf['lm_head.weight'] = bmt_model['output_projection.weight'].contiguous().float()
        except KeyError:
            # Tied embeddings: reuse the input embedding as the LM head.
            model_hf['lm_head.weight'] = bmt_model["input_embedding.weight"].contiguous().float()
    else:
        model_hf['model.embed_tokens.weight'] = bmt_model["LLM.input_embedding.weight"].contiguous().float()
        model_hf['model.norm.weight'] = bmt_model["LLM.encoder.output_layernorm.weight"].contiguous().float()
        try:
            model_hf['lm_head.weight'] = bmt_model['LLM.output_projection.weight'].contiguous().float()
        except KeyError:
            model_hf['lm_head.weight'] = bmt_model["LLM.input_embedding.weight"].contiguous().float()

    if model_size == "7b":
        layernum = 32
    elif model_size == "13b" or model_size == "13b-2":
        layernum = 40
    elif model_size == "65b":
        layernum = 80
    else:
        raise ValueError(f"unsupported model size: {model_size}")

    for lnum in range(layernum):
        hf_pfx = f"model.layers.{lnum}"
        if 'input_embedding.weight' in bmt_model.keys():
            bmt_pfx = f"encoder.layers.{lnum}"
        else:
            bmt_pfx = f"LLM.encoder.layers.{lnum}"

        model_hf[f"{hf_pfx}.input_layernorm.weight"] = bmt_model[f"{bmt_pfx}.self_att.layernorm_before_attention.weight"].contiguous().float()

        model_hf[f"{hf_pfx}.self_attn.q_proj.weight"] = bmt_model[f"{bmt_pfx}.self_att.self_attention.project_q.weight"].contiguous().float()
        model_hf[f"{hf_pfx}.self_attn.k_proj.weight"] = bmt_model[f"{bmt_pfx}.self_att.self_attention.project_k.weight"].contiguous().float()
        model_hf[f"{hf_pfx}.self_attn.v_proj.weight"] = bmt_model[f"{bmt_pfx}.self_att.self_attention.project_v.weight"].contiguous().float()
        model_hf[f"{hf_pfx}.self_attn.o_proj.weight"] = bmt_model[f"{bmt_pfx}.self_att.self_attention.attention_out.weight"].contiguous().float()

        model_hf[f"{hf_pfx}.post_attention_layernorm.weight"] = bmt_model[f"{bmt_pfx}.ffn.layernorm_before_ffn.weight"].contiguous().float()

        model_hf[f"{hf_pfx}.mlp.gate_proj.weight"] = bmt_model[f"{bmt_pfx}.ffn.ffn.w_in.w_0.weight"].contiguous().float()
        model_hf[f"{hf_pfx}.mlp.up_proj.weight"] = bmt_model[f"{bmt_pfx}.ffn.ffn.w_in.w_1.weight"].contiguous().float()

        model_hf[f"{hf_pfx}.mlp.down_proj.weight"] = bmt_model[f"{bmt_pfx}.ffn.ffn.w_out.weight"].contiguous().float()

    for key in model_hf:
        model_hf[key] = model_hf[key].bfloat16()
    return model_hf


if __name__ == "__main__":
    parser = argparse.ArgumentParser()

    parser.add_argument("--in_path", type=str)
    parser.add_argument("--output_path", type=str)
    parser.add_argument("--original_mistral_path", type=str)

    args = parser.parse_args()
    os.makedirs(args.output_path, exist_ok=True)
    print("transforming " + args.in_path)

    model_size = "7b"

    ckpt = [name for name in os.listdir(args.in_path) if name.endswith(".pt")]
    bmt_model = torch.load(os.path.join(args.in_path, ckpt[0]))

    hf_state_dict = transform_to_hf(bmt_model, model_size)
    print(f"start saving to {args.output_path}")

    model_config = AutoConfig.from_pretrained(args.original_mistral_path)
    model = AutoModelForCausalLM.from_config(model_config)
    model.load_state_dict(hf_state_dict)

    for param in model.parameters():
        param.data = param.data.to(torch.bfloat16)

    model.save_pretrained(args.output_path, safe_serialization=False)
    for file_name in ["tokenizer_config.json", "special_tokens_map.json", "tokenizer.model", "tokenizer.json"]:
        if os.path.exists(os.path.join(args.in_path, file_name)):
            shutil.copy(os.path.join(args.in_path, file_name), os.path.join(args.output_path, file_name))
    print("saved huggingface checkpoint")
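Note (not part of the commit): a minimal sketch for checking that the exported HuggingFace checkpoint loads; the directory below is a placeholder for whatever --output_path was passed to bmt_hf.py.

# Sanity-check sketch; "out/mistral-7b-hf" is a hypothetical --output_path.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model = AutoModelForCausalLM.from_pretrained("out/mistral-7b-hf", torch_dtype=torch.bfloat16)
tokenizer = AutoTokenizer.from_pretrained("out/mistral-7b-hf")  # uses the copied tokenizer files
print(sum(p.numel() for p in model.parameters()))  # rough size check for the 7b configuration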
108  hf_bmt/hf_2_bmtrain.py  Normal file
@@ -0,0 +1,108 @@
from transformers import LlamaConfig
from transformers import AutoModelForCausalLM
import torch, os
import json
from collections import OrderedDict
import shutil, pdb

import argparse


def initialize():
    # get arguments
    parser = argparse.ArgumentParser("")
    # Output directory for the BMTrain weights.
    parser.add_argument("--out_path", type=str, default=f"/Mistral-{ver}-bmtrain")
    # Path where you downloaded the Mistral-7B HuggingFace weights.
    parser.add_argument("--in_path", type=str, default="/Mistral-7B-v0.1/snapshots/26bca36bde8333b5d7f72e9ed20ccda6a618af24")
    args = parser.parse_args()
    return args


ver = "7b"
# Change these two (or pass --in_path / --out_path on the command line):
# out_path: output directory for the BMTrain weights, e.g.
#   f"/Mistral-{ver}-bmtrain"
# in_path: path where the Mistral-7B HuggingFace weights were downloaded, e.g.
#   "/Mistral-7B-v0.1/snapshots/26bca36bde8333b5d7f72e9ed20ccda6a618af24"


def convert_weights(args):
    hf_config = LlamaConfig.from_pretrained(args.in_path)
    config = {
        'dim_model': hf_config.hidden_size,
        'dim_ff': hf_config.intermediate_size,
        'num_layers': hf_config.num_hidden_layers,
        'num_heads': hf_config.num_attention_heads,
        'num_heads_kv': hf_config.num_key_value_heads,
        'dim_head': hf_config.hidden_size // hf_config.num_attention_heads,
        'norm_eps': hf_config.rms_norm_eps,
    }
    os.makedirs(args.out_path, exist_ok=True)

    with open(os.path.join(args.out_path, "config.json"), 'w') as f:
        json.dump(config, f)

    layernum = config['num_layers']

    model_hf = OrderedDict()
    ckpt_num = None
    if 'v0.1' in args.in_path:
        prefix = "pytorch_model-"
        endtext = ".bin"
    else:
        prefix = "model-"
        endtext = ".safetensors"
    for name in os.listdir(args.in_path):
        if name.startswith(prefix) and name.endswith(endtext):
            ckpt_num = int(name.split(endtext)[0].split('-')[-1])
    for i in range(1, ckpt_num + 1):
        if 'v0.1' in args.in_path:
            part = torch.load(os.path.join(args.in_path, f"pytorch_model-{i:05d}-of-{ckpt_num:05d}.bin"))
        else:
            from safetensors import safe_open
            with safe_open(os.path.join(args.in_path, f"model-{i:05d}-of-{ckpt_num:05d}.safetensors"), framework="pt", device=0) as f:
                part = {}
                for k in f.keys():
                    part[k] = f.get_tensor(k)
        model_hf.update(part)

    out = OrderedDict()

    out["input_embedding.weight"] = model_hf['model.embed_tokens.weight'].contiguous()
    out["encoder.output_layernorm.weight"] = model_hf['model.norm.weight'].contiguous()
    out['output_projection.weight'] = model_hf['lm_head.weight'].contiguous()
    for lnum in range(layernum):
        hf_pfx = f"model.layers.{lnum}"
        bmt_pfx = f"encoder.layers.{lnum}"

        out[f"{bmt_pfx}.self_att.layernorm_before_attention.weight"] = model_hf[f"{hf_pfx}.input_layernorm.weight"].contiguous()

        out[f"{bmt_pfx}.self_att.self_attention.project_q.weight"] = model_hf[f"{hf_pfx}.self_attn.q_proj.weight"].contiguous()
        out[f"{bmt_pfx}.self_att.self_attention.project_k.weight"] = model_hf[f"{hf_pfx}.self_attn.k_proj.weight"].contiguous()
        out[f"{bmt_pfx}.self_att.self_attention.project_v.weight"] = model_hf[f"{hf_pfx}.self_attn.v_proj.weight"].contiguous()
        out[f"{bmt_pfx}.self_att.self_attention.attention_out.weight"] = model_hf[f"{hf_pfx}.self_attn.o_proj.weight"].contiguous()

        out[f"{bmt_pfx}.ffn.layernorm_before_ffn.weight"] = model_hf[f"{hf_pfx}.post_attention_layernorm.weight"].contiguous()

        out[f"{bmt_pfx}.ffn.ffn.w_in.w_0.weight"] = model_hf[f"{hf_pfx}.mlp.gate_proj.weight"].contiguous()
        out[f"{bmt_pfx}.ffn.ffn.w_in.w_1.weight"] = model_hf[f"{hf_pfx}.mlp.up_proj.weight"].contiguous()

        out[f"{bmt_pfx}.ffn.ffn.w_out.weight"] = model_hf[f"{hf_pfx}.mlp.down_proj.weight"].contiguous()

    for key in out:
        out[key] = out[key].half()

    if not os.path.exists(args.out_path):
        os.makedirs(args.out_path)
    torch.save(out, os.path.join(args.out_path, "pytorch_model.pt"))

    for file_name in ["tokenizer_config.json", "special_tokens_map.json", "tokenizer.model", "tokenizer.json"]:
        if os.path.exists(os.path.join(args.in_path, file_name)):
            shutil.copy(os.path.join(args.in_path, file_name), os.path.join(args.out_path, file_name))

    print("BMT weights created successfully")


def main():
    args = initialize()
    convert_weights(args)


if __name__ == "__main__":
    main()
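Note (not part of the commit): a minimal sketch for sanity-checking the converted BMTrain checkpoint; the directory below is a placeholder for the --out_path passed to hf_2_bmtrain.py.

# Sanity-check sketch; "Mistral-7b-bmtrain" is a hypothetical --out_path.
import torch

state = torch.load("Mistral-7b-bmtrain/pytorch_model.pt", map_location="cpu")
print(len(state), "tensors")                                  # embeddings, norms, and per-layer weights
print(state["input_embedding.weight"].dtype)                  # torch.float16, since the script saves with .half()
print(state["encoder.layers.0.self_att.self_attention.project_q.weight"].shape)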
13  hf_bmt/hf_2_bmtrain.sh  Normal file
@@ -0,0 +1,13 @@
IN_PATH="your-path-to-hf-model"
OUT_PATH="your-wanted-path-to-bm-model"

OPTS=""
OPTS+="--in_path ${IN_PATH} "
OPTS+="--out_path ${OUT_PATH}"

CMD="python3 hf_2_bmtrain.py ${OPTS}"

echo "-------final CMD is------"
echo "${CMD}"
echo "-------final CMD end------"
eval ${CMD}