from transformers import AutoTokenizer, AutoModelForCausalLM
import argparse
import os
import json
import logging
from pathlib import Path

import numpy as np
import torch
from tqdm import tqdm
import evaluate
from sentence_transformers import SentenceTransformer, util

# Reduce CUDA memory fragmentation during long generation runs.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:32"

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def initialize():
    parser = argparse.ArgumentParser("")
    parser.add_argument("--model_name_or_path", type=str, default="")
    parser.add_argument("--embedding_model_path", type=str, default="")
    parser.add_argument("--train_data_dir", type=str, default="")
    parser.add_argument("--test_data_dir", type=str, default="")
    parser.add_argument("--prompt_file", type=str, default=None,
                        help="The file for loading the prompt")
    return parser.parse_args()


def get_tokenizer(args):
    # `device_map` is a model argument, not a tokenizer argument, so it is not
    # passed here. Decoder-only models need left padding for batched generation.
    tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path)
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "left"
    return tokenizer


def get_model(args):
    # Place the whole model on GPU 0.
    return AutoModelForCausalLM.from_pretrained(args.model_name_or_path, device_map={"": 0})


def setup_model_and_tokenizer(args):
    tokenizer = get_tokenizer(args)
    model = get_model(args)
    return tokenizer, model


def read_json_file(filename):
    with open(filename, "r") as infile:
        return json.load(infile)


def format_one_action(action):
    return f"- {action}\n"


def format_actions_list(actions):
    return "".join(format_one_action(action) for action in actions)
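# The file passed via --prompt_file is assumed to contain two segments
# separated by a literal '===' delimiter: preprocess_data() splits on '===',
# prepends the first segment to the formatted task fields, and appends the
# second. The example below is illustrative only (the actual prompt text is
# not part of this script); the [SUMMARY] tag is inferred from the output
# parsing in main_loop():
#
#   You are given information about a web task...
#   ===
#   [SUMMARY]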
def preprocess_data(task, args):
    with open(args.prompt_file, "r") as file:
        task_description = file.read().split("===")
    input_str = (
        f"## Website:\n{task['website_en']}\n\n"
        f"## Domain:\n{task['domain_en']}\n\n"
        f"## Sub-domain:\n{task['subdomain_en']}\n\n"
        f"## Actions (Each line is one action):\n{format_actions_list(task['task_subintention'])}\n"
        f"## Sub-intentions summarised from these actions:\n{format_actions_list(task['steps'])}"
    )
    query_inputs = f"{task_description[0]}\n{input_str}{task_description[1]}\n"
    # The reference summary: capitalise the first letter and end with a period.
    summary_str = task["task_description"]
    summary_str = summary_str[0].upper() + summary_str[1:] + "."
    test_prompt = f"User: {query_inputs}\nAgent:"
    return {"task": summary_str, "prompt": test_prompt}


def load_raw_dataset(data, args):
    return [preprocess_data(d, args) for d in tqdm(data)]


def main_loop(args, test_dataset, tokenizer, model, sacrebleu, rouge, meteor, embedding_model, mark):
    results_dir = os.path.join(args.model_name_or_path, "results")
    os.makedirs(results_dir, exist_ok=True)
    global_sacrebleu, global_rouge1, global_rouge2, global_rougeL, global_rougeLsum, \
        global_meteor, global_cosine, global_distance = [], [], [], [], [], [], [], []
    for i, data in tqdm(enumerate(test_dataset), total=len(test_dataset)):
        save_task_response_filename = os.path.join(results_dir, f"{mark}_{i}_insert_mistral.json")
        if os.path.exists(save_task_response_filename):
            # Resume from cached per-example results.
            with open(save_task_response_filename, "r") as f:
                save_dict = json.load(f)
        else:
            prompt = data["prompt"]
            task = data["task"]
            save_dict = {}
            model_inputs = tokenizer([prompt], return_tensors="pt").to("cuda")
            # Greedy decoding: `top_p` is only meaningful with do_sample=True,
            # so it is not passed here.
            generated_ids = model.generate(
                **model_inputs,
                max_new_tokens=1024,
                do_sample=False,
                repetition_penalty=1.2,
            )
            pred = tokenizer.batch_decode(generated_ids)[0]
            # Keep only the text after the [SUMMARY] tag and strip the EOS token.
            response = pred.split("[SUMMARY]")[-1].replace(tokenizer.eos_token, "").strip()
            rouge_calc = rouge.compute(predictions=[response], references=[[task]], use_aggregator=True)
            sacrebleu_calc = sacrebleu.compute(predictions=[response], references=[[task]])
            meteor_calc = meteor.compute(predictions=[response], references=[[task]])
            GT_Embedding = embedding_model.encode(task.lower(), convert_to_tensor=True)
            Prediction_Embedding = embedding_model.encode(response.lower(), convert_to_tensor=True)
            cosine_similarity = util.cos_sim(GT_Embedding, Prediction_Embedding).item()
            euclidean_distance = torch.linalg.norm(GT_Embedding - Prediction_Embedding).item()
            save_dict["prompt"] = prompt
            save_dict["prediction"] = response
            save_dict["task"] = task
            save_dict["sacrebleu"] = sacrebleu_calc
            save_dict["rouge"] = rouge_calc
            save_dict["meteor"] = meteor_calc
            save_dict["cosine_similarity"] = cosine_similarity
            save_dict["euclidean_distance"] = euclidean_distance
            with open(save_task_response_filename, "w") as f:
                json.dump(save_dict, f)
        global_sacrebleu.append(save_dict["sacrebleu"]["score"])
        global_rouge1.append(save_dict["rouge"]["rouge1"])
        global_rouge2.append(save_dict["rouge"]["rouge2"])
        global_rougeL.append(save_dict["rouge"]["rougeL"])
        global_rougeLsum.append(save_dict["rouge"]["rougeLsum"])
        global_meteor.append(save_dict["meteor"]["meteor"])
        global_cosine.append(save_dict["cosine_similarity"])
        global_distance.append(save_dict["euclidean_distance"])
    return (global_sacrebleu, global_rouge1, global_rouge2, global_rougeL,
            global_rougeLsum, global_meteor, global_cosine, global_distance)
def main(mark):
    args = initialize()
    assert "Mind2Web" in args.test_data_dir
    tokenizer, model = setup_model_and_tokenizer(args)
    sacrebleu = evaluate.load("sacrebleu", module_type="metric")
    rouge = evaluate.load("rouge", module_type="metric")
    meteor = evaluate.load("meteor", module_type="metric")
    embedding_model = SentenceTransformer(args.embedding_model_path, device="cuda")
    test_folders_names = ["test_domain", "test_task", "test_website"]
    for name in test_folders_names:
        test_folder_path = Path(os.path.join(args.test_data_dir, name))
        (global_sacrebleu, global_rouge1, global_rouge2, global_rougeL,
         global_rougeLsum, global_meteor, global_cosine, global_distance) = \
            [], [], [], [], [], [], [], []
        for json_file in test_folder_path.rglob("*_with_steps_insert_mistral.json"):
            with json_file.open("r") as f:
                data = json.load(f)
            raw_tasks = load_raw_dataset(data, args)
            (sacrebleu_calc, rouge1_calc, rouge2_calc, rougeL_calc, rougeLsum_calc,
             meteor_calc, cosine_calc, distance_calc) = main_loop(
                args, raw_tasks, tokenizer, model, sacrebleu, rouge, meteor,
                embedding_model, f"test_{name}")
            global_sacrebleu.extend(sacrebleu_calc)
            global_rouge1.extend(rouge1_calc)
            global_rouge2.extend(rouge2_calc)
            global_rougeL.extend(rougeL_calc)
            global_rougeLsum.extend(rougeLsum_calc)
            global_meteor.extend(meteor_calc)
            global_cosine.extend(cosine_calc)
            global_distance.extend(distance_calc)
        # Report cosine similarity, SacreBLEU (rescaled from 0-100 to 0-1),
        # ROUGE-L, and METEOR for this split.
        print(mark, name)
        print("%.3f" % np.mean(global_cosine))
        print("%.3f" % (np.mean(global_sacrebleu) / 100.0))
        print("%.3f" % np.mean(global_rougeL))
        print("%.3f" % np.mean(global_meteor))


if __name__ == "__main__":
    main("test")
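# Example invocation (script name and all paths below are placeholders, not
# files shipped with this script):
#
#   python eval_task_summary.py \
#       --model_name_or_path /path/to/finetuned-model \
#       --embedding_model_path /path/to/sentence-transformer \
#       --test_data_dir /data/Mind2Web/test \
#       --prompt_file prompts/summary_prompt.txt
#
# The script asserts that 'Mind2Web' appears in --test_data_dir, and caches
# per-example results under <model_name_or_path>/results/, so an interrupted
# run can be resumed by re-running the same command.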