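"""Evaluate a causal LM on the Mind2Web task-summarisation test splits.

For each test sample, the script prompts the model to summarise a list of
recorded actions and sub-intentions into a task description, then scores the
generation with SacreBLEU, ROUGE, METEOR, and sentence-embedding cosine
similarity / Euclidean distance. Per-sample results are cached as JSON under
<model_name_or_path>/results/.
"""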
import argparse
import json
import logging
import os
from pathlib import Path

import numpy as np

# Limit the CUDA allocator's split size; this variable is read when CUDA is
# first initialised, so set it before any torch CUDA work happens.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:32"

import evaluate
import torch
from sentence_transformers import SentenceTransformer, util
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def initialize():
    parser = argparse.ArgumentParser(description="Evaluate generated task summaries on Mind2Web.")
    parser.add_argument("--model_name_or_path", type=str, default="")
    parser.add_argument("--embedding_model_path", type=str, default="")
    parser.add_argument("--train_data_dir", type=str, default="")
    parser.add_argument("--test_data_dir", type=str, default="")
    parser.add_argument("--prompt_file", type=str, default=None,
                        help="The file for loading the prompt")
    args = parser.parse_args()
    return args

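# Example invocation (all paths below are placeholders, not taken from the
# original repository):
#
#   python evaluate_summaries.py \
#       --model_name_or_path ./checkpoints/my-finetuned-mistral \
#       --embedding_model_path sentence-transformers/all-MiniLM-L6-v2 \
#       --test_data_dir ./data/Mind2Web \
#       --prompt_file ./prompts/summary_prompt.txt
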
def get_tokenizer(args):
    # device_map is a model-loading option with no effect on tokenizers,
    # so it is not passed here.
    tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path)
    # Causal LMs often ship without a pad token; reuse EOS and left-pad so
    # generation continues directly from the end of the prompt.
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "left"
    return tokenizer

def get_model(args):
    # device_map={"": 0} places the whole model on GPU 0.
    model = AutoModelForCausalLM.from_pretrained(args.model_name_or_path, device_map={"": 0})
    return model


def setup_model_and_tokenizer(args):
    tokenizer = get_tokenizer(args)
    model = get_model(args)
    return tokenizer, model

def read_json_file(filename):
    with open(filename, "r") as infile:
        data = json.load(infile)
    return data

def format_one_action(action):
    return f"- {action}\n"


def format_actions_list(actions):
    return "".join(format_one_action(action) for action in actions)

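# For example, format_actions_list(["click the search box", "type 'shoes'"])
# returns "- click the search box\n- type 'shoes'\n".
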
def preprocess_data(task, args):
    # The prompt file holds the instruction text in two parts separated by
    # '===': everything before the marker precedes the sample, everything
    # after it follows the sample.
    with open(args.prompt_file, "r") as file:
        task_description = file.read().split("===")

    input_str = (
        f"## Website:\n{task['website_en']}\n\n"
        f"## Domain:\n{task['domain_en']}\n\n"
        f"## Sub-domain:\n{task['subdomain_en']}\n\n"
        f"## Actions (Each line is one action):\n{format_actions_list(task['task_subintention'])}\n"
        f"## Sub-intentions summarised from these actions:\n{format_actions_list(task['steps'])}"
    )
    query_inputs = f"{task_description[0]}\n{input_str}{task_description[1]}\n"

    # Normalise the reference summary: capitalise it and end with a period.
    summary_str = task["task_description"]
    summary_str = summary_str[0].upper() + summary_str[1:] + "."

    test_prompt = f"User: {query_inputs}\nAgent:"
    return {"task": summary_str, "prompt": test_prompt}

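# A compatible prompt file might look like the sketch below (the wording is
# illustrative, not from the original repository; the second part presumably
# asks the model to prefix its answer with the "[SUMMARY]" marker that
# main_loop() splits on):
#
#   Summarise the recorded actions below into one task description.
#   ===
#   Reply with the summary only, prefixed by [SUMMARY].
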
def load_raw_dataset(data, args):
    tasks = []
    for d in tqdm(data):
        tasks.append(preprocess_data(d, args))
    return tasks

def main_loop(args, test_dataset, tokenizer, model, sacrebleu, rouge, meteor, embedding_model, mark):
    os.makedirs(os.path.join(args.model_name_or_path, "results"), exist_ok=True)
    global_sacrebleu, global_rouge1, global_rouge2, global_rougeL, global_rougeLsum, global_meteor, global_cosine, global_distance = [], [], [], [], [], [], [], []

    for i, data in enumerate(tqdm(test_dataset)):
        save_task_response_filename = os.path.join(
            args.model_name_or_path, "results", f"{mark}_{i}_insert_mistral.json"
        )
        # Per-sample results are cached on disk; skip generation and scoring
        # for samples that were already evaluated.
        if os.path.exists(save_task_response_filename):
            with open(save_task_response_filename, "r") as f:
                save_dict = json.load(f)
        else:
            prompt = data["prompt"]
            task = data["task"]

            save_dict = {}
            model_inputs = tokenizer([prompt], return_tensors="pt").to("cuda")
            # Greedy decoding (do_sample=False); top_p would be ignored
            # without sampling, so it is omitted.
            generated_ids = model.generate(
                **model_inputs,
                max_new_tokens=1024,
                do_sample=False,
                repetition_penalty=1.2,
            )
            pred = tokenizer.batch_decode(generated_ids)[0]
            # The prompt asks the model to emit its answer after a
            # "[SUMMARY]" marker; keep only that part.
            response = pred.split("[SUMMARY]")[-1].replace("</s>", "").strip()

            rouge_calc = rouge.compute(predictions=[response], references=[[task]], use_aggregator=True)
            sacrebleu_calc = sacrebleu.compute(predictions=[response], references=[[task]])
            meteor_calc = meteor.compute(predictions=[response], references=[[task]])

            gt_embedding = embedding_model.encode(task.lower(), convert_to_tensor=True)
            prediction_embedding = embedding_model.encode(response.lower(), convert_to_tensor=True)
            cosine_similarity = util.cos_sim(gt_embedding, prediction_embedding).item()
            euclidean_distance = torch.sqrt(
                torch.sum(torch.pow(torch.subtract(gt_embedding, prediction_embedding), 2))
            ).item()

            save_dict["prompt"] = prompt
            save_dict["prediction"] = response
            save_dict["task"] = task
            save_dict["sacrebleu"] = sacrebleu_calc
            save_dict["rouge"] = rouge_calc
            save_dict["meteor"] = meteor_calc
            save_dict["cosine_similarity"] = cosine_similarity
            save_dict["euclidean_distance"] = euclidean_distance

            with open(save_task_response_filename, "w") as f:
                json.dump(save_dict, f)

        global_sacrebleu.append(save_dict["sacrebleu"]["score"])
        global_rouge1.append(save_dict["rouge"]["rouge1"])
        global_rouge2.append(save_dict["rouge"]["rouge2"])
        global_rougeL.append(save_dict["rouge"]["rougeL"])
        global_rougeLsum.append(save_dict["rouge"]["rougeLsum"])
        global_meteor.append(save_dict["meteor"]["meteor"])
        global_cosine.append(save_dict["cosine_similarity"])
        global_distance.append(save_dict["euclidean_distance"])

    return global_sacrebleu, global_rouge1, global_rouge2, global_rougeL, global_rougeLsum, global_meteor, global_cosine, global_distance

def main(mark):
    args = initialize()
    assert "Mind2Web" in args.test_data_dir
    tokenizer, model = setup_model_and_tokenizer(args)
    sacrebleu = evaluate.load("sacrebleu", module_type="metric")
    rouge = evaluate.load("rouge", module_type="metric")
    meteor = evaluate.load("meteor", module_type="metric")
    embedding_model = SentenceTransformer(args.embedding_model_path, device="cuda")

    # Mind2Web provides three held-out splits: unseen domains, unseen tasks,
    # and unseen websites.
    test_folders_names = ["test_domain", "test_task", "test_website"]
    for name in test_folders_names:
        test_folder_path = Path(os.path.join(args.test_data_dir, name))
        global_sacrebleu, global_rouge1, global_rouge2, global_rougeL, global_rougeLsum, global_meteor, global_cosine, global_distance = [], [], [], [], [], [], [], []
        for json_file in test_folder_path.rglob("*_with_steps_insert_mistral.json"):
            with json_file.open("r") as f:
                data = json.load(f)
            raw_tasks = load_raw_dataset(data, args)
            sacrebleu_calc, rouge1_calc, rouge2_calc, rougeL_calc, rougeLsum_calc, meteor_calc, cosine_calc, distance_calc = main_loop(
                args, raw_tasks, tokenizer, model, sacrebleu, rouge, meteor, embedding_model, f"test_{name}"
            )

            global_sacrebleu.extend(sacrebleu_calc)
            global_rouge1.extend(rouge1_calc)
            global_rouge2.extend(rouge2_calc)
            global_rougeL.extend(rougeL_calc)
            global_rougeLsum.extend(rougeLsum_calc)
            global_meteor.extend(meteor_calc)
            global_cosine.extend(cosine_calc)
            global_distance.extend(distance_calc)

        print(mark, name)
        print("cosine:    %.3f" % np.mean(global_cosine))
        # SacreBLEU reports scores on a 0-100 scale; divide by 100 to match
        # the other metrics.
        print("sacrebleu: %.3f" % (np.mean(global_sacrebleu) / 100.0))
        print("rougeL:    %.3f" % np.mean(global_rougeL))
        print("meteor:    %.3f" % np.mean(global_meteor))

if __name__ == "__main__":
    main("test")