97 lines
No EOL
4 KiB
Python
97 lines
No EOL
4 KiB
Python
from tqdm import tqdm
|
|
import json
|
|
import os
|
|
from transformers import AutoTokenizer, AutoModelForCausalLM
|
|
|
|
def get_tokenizer(model_name_or_path):
    """Load a tokenizer configured for left-padded causal-LM generation.

    Args:
        model_name_or_path: Hugging Face hub id or local snapshot path.

    Returns:
        The loaded tokenizer with ``pad_token`` set to the EOS token and
        left-side padding (required so generated tokens line up at the
        right edge of a batch for decoder-only models).
    """
    # BUG FIX: `device_map` is a model-loading argument; tokenizers are plain
    # CPU objects and AutoTokenizer.from_pretrained does not place anything
    # on a device, so passing it was misleading no-op clutter.
    tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
    # Decoder-only checkpoints such as Mistral ship without a pad token;
    # reuse EOS so batched calls with padding do not fail.
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = 'left'
    return tokenizer
|
|
|
|
def get_model(model_name_or_path):
    """Load a causal language model with all weights placed on GPU 0."""
    return AutoModelForCausalLM.from_pretrained(
        model_name_or_path,
        device_map={"": 0},
    )
|
|
|
|
def read_json_file(filename):
    """Parse the JSON file at *filename* and return the resulting object."""
    with open(filename) as fp:
        return json.load(fp)
|
|
|
|
if __name__ == "__main__":
    model_name_or_path = "Mistral-7B-v0.1/snapshots/26bca36bde8333b5d7f72e9ed20ccda6a618af24"
    tokenizer = get_tokenizer(model_name_or_path)
    model = get_model(model_name_or_path)

    # Load the system prompts: the train prompt includes the task
    # description, the test prompt excludes it.
    with open("your-path-to-data/train_prompt.txt", "r") as f:
        train_prompt = f.read()
    with open("your-path-to-data/test_prompt.txt", "r") as f:
        test_prompt = f.read()

    for foldername in ['train', 'test_domain', 'test_website', 'test_task']:
        SAVE_PATH = f"your-path-to-data/{foldername}"

        for idx in range(100):
            savejsonfilename = f"{SAVE_PATH}/{foldername}_{idx}_with_steps_insert_mistral.json"
            jsonfilename = f"{SAVE_PATH}/{foldername}_{idx}_with_actions_description_insert.json"
            # Input shards are numbered consecutively; stop at the first gap.
            if not os.path.exists(jsonfilename):
                break

            data = read_json_file(jsonfilename)
            # Resume from a previously written output file if one exists, so
            # an interrupted run picks up where it left off.
            if os.path.exists(savejsonfilename):
                data = read_json_file(savejsonfilename)

            actions_steps = []
            for i in tqdm(range(len(data)), desc="Steps_Creation"):
                if "train" in foldername:  # include task description
                    message = f"""Website: {data[i]["website_en"]}
Domain: {data[i]["domain_en"]}
Sub-domain: {data[i]["subdomain_en"]}
Task: {data[i]["task_description"]}
Actions: {data[i]["task_subintention"]}\n
# OUTPUT #
"""
                    prompt = train_prompt
                else:  # exclude task description
                    message = f"""Website: {data[i]["website_en"]}
Domain: {data[i]["domain_en"]}
Sub-domain: {data[i]["subdomain_en"]}
Actions: {data[i]["task_subintention"]}\n
# OUTPUT #
"""
                    prompt = test_prompt

                # Mistral-7B-v0.1 is a base model without a chat template, so
                # the system/user turns are flattened into one plain string.
                # (The original also built a chat-format `messages` list here
                # that was immediately overwritten — dead code, removed.)
                messages = 'System: ' + prompt + 'User: ' + message

                model_inputs = tokenizer(messages, return_tensors="pt").to("cuda")
                # BUG FIX: `len(input_ids)` is the batch dimension (always 1),
                # so the original assert could never fire. Check the token
                # count on dim 1, and raise instead of assert (asserts are
                # stripped under `python -O`).
                if model_inputs['input_ids'].shape[1] > 4096:
                    raise ValueError("prompt exceeds the 4096-token context limit")
                # do_sample=False selects greedy decoding; sampling knobs such
                # as top_p are ignored in that mode, so none are passed.
                generated_ids = model.generate(
                    **model_inputs,
                    max_new_tokens=1024,
                    do_sample=False,
                    repetition_penalty=1.2,
                )
                json_object = tokenizer.batch_decode(generated_ids)[0]

                # Parse the generated step list: keep everything after the
                # 'Sub-intentions: [' marker, one candidate step per line.
                answer = json_object.split('Sub-intentions: [')[-1].split('\n')
                final_answer = []
                for a in answer:
                    a = a.strip()
                    # Drop the EOS token and anything the model emitted after it.
                    if '</s>' in a:
                        a = a.split('</s>')[0]
                    # Strip leftover JSON punctuation: leading quotes, and any
                    # trailing quotes/commas/closing bracket (equivalent to the
                    # original character-at-a-time while loops).
                    a = a.lstrip('"').rstrip('",]')
                    if a:
                        final_answer.append(a)
                data[i]['steps'] = final_answer

                # Checkpoint after every item so progress survives interruption
                # (pairs with the resume logic above).
                with open(savejsonfilename, 'w') as json_file:
                    json.dump(data, json_file)