Commit 04c4625cfe ("Uploaded")
11 changed files with 1330 additions and 0 deletions
preprocess/convert_dataset.py (new file, 135 lines)

@@ -0,0 +1,135 @@
import os, pdb
import re
import json
from enum import Enum
from tqdm import tqdm
from bs4 import BeautifulSoup

def read_json_file(filename):
    with open(filename, 'r') as infile:
        data = json.load(infile)
    return data


def convert_string(string_or_list):
    # Add escaping symbols to English quotes in string
    if isinstance(string_or_list, str):
        return string_or_list.replace('"', '\\"')
    elif isinstance(string_or_list, list):
        return [convert_string(s) for s in string_or_list]


def is_visible(element):
    bounding_box = element.get('bounding_box_rect')
    return bounding_box != "-1,-1,-1,-1"


def clean_text(text):
    cleaned_text = text.strip()
    cleaned_text = cleaned_text.replace('\n', ' ').replace('\t', ' ')
    cleaned_text = re.sub(' +', ' ', cleaned_text)
    return cleaned_text

def find_semantic_info(element):
    # Prefer the element's own visible text.
    element_text = clean_text(element.get_text(strip=True))
    if element_text:
        return element_text

    # Otherwise fall back to the nearest preceding visible <label>.
    label = element.find_previous(lambda x: x.name == 'label' and is_visible(x))
    if label:
        label_text = clean_text(label.get_text(strip=True))
        if label_text:
            return label_text
    return None

def action_description(ui_element_name, ui_element_text, operation_type, value):
    # Render one Mind2Web action as an English instruction.
    ret_en = ""
    if operation_type == "TYPE":
        if ui_element_text != "":
            ret_en += f'Type text "{value}" into {ui_element_name} with text "{ui_element_text}" on it'
        else:
            ret_en += f'Type text "{value}" into {ui_element_name}'
    elif operation_type == "SELECT":
        if ui_element_text != "":
            ret_en += f'Select "{value}" from {ui_element_name} with text "{ui_element_text}" on it'
        else:
            ret_en += f'Select "{value}" from {ui_element_name}'
    elif operation_type == "CLICK":
        if ui_element_text != "":
            ret_en += f'Click the {ui_element_name} element with text "{ui_element_text}" on it'
        else:
            ret_en += f'Click the {ui_element_name} element'
    return ret_en

def process_one_task(task):
    base_info = {
        "website_en": task["website"],
        "domain_en": task["domain"],
        "subdomain_en": task["subdomain"],
        "annotation_id": task["annotation_id"],
        "task_description": task["confirmed_task"],
        "action_reprs": task["action_reprs"]
    }
    action_descriptions_en = []
    for action_index, action in enumerate(task["actions"]):
        # Each repr looks like "[tag] element text -> OPERATION: value"; keep the element part.
        action_repr = task["action_reprs"][action_index]
        ui_element, _ = action_repr.split(" -> ")
        assert ui_element.count("] ") == 1
        ui_element_name, ui_element_text = ui_element.split("] ")
        ui_element_name = ui_element_name[1:]
        ui_element_text = ui_element_text.strip()

        if ui_element_text == "":
            # The repr carries no element text: recover it from the raw HTML
            # via the annotated target element and patch the repr in place.
            raw_html = action["raw_html"]
            soup2 = BeautifulSoup(raw_html, 'html.parser')
            selected_element2 = soup2.find(attrs={"data_pw_testid_buckeye": action["action_uid"]})

            ui_element_text = find_semantic_info(selected_element2)
            if ui_element_text is not None:
                ui_element_text = clean_text(ui_element_text)
                task["action_reprs"][action_index] = f"[{ui_element_name}] {ui_element_text} -> {task['action_reprs'][action_index].split(' -> ')[1]}"
            else:
                print(f'Warning: {task["annotation_id"]}, can not find semantic info for {action["action_uid"]}')
                # Fall back to an empty text instead of formatting None into the description.
                ui_element_text = ""

        action_description_en = action_description(ui_element_name, ui_element_text, action["operation"]["op"], action["operation"]["value"])
        action_descriptions_en.append(action_description_en)

    base_info["task_subintention"] = action_descriptions_en
    return base_info

if __name__ == "__main__":
    for foldername in ['train', 'test_domain', 'test_website', 'test_task']:
        SAVE_PATH = f"your-path-to-data/{foldername}"

        for idx in range(100):
            savejsonfilename = os.path.join(SAVE_PATH, f'{foldername}_{idx}_with_actions_description_insert.json')
            if os.path.exists(savejsonfilename):
                # This shard has already been converted.
                continue
            else:
                jsonfilename = f"{SAVE_PATH}/{foldername}_{idx}.json"
                if not os.path.exists(jsonfilename):
                    break
                dataset = read_json_file(jsonfilename)

                Mind2Web_with_subintentions = []
                for task in tqdm(dataset):
                    base_info = process_one_task(task)
                    Mind2Web_with_subintentions.append(base_info)
                assert len(Mind2Web_with_subintentions) == len(dataset)

                if 'test' in foldername:
                    # Cross-check against previously saved descriptions for the test splits.
                    with open(os.path.join(SAVE_PATH, f'{foldername}_{idx}_with_actions_description.json'), 'r') as json_file:
                        Mind2Web_with_subintentions_saved = json.load(json_file)

                    for i in range(len(Mind2Web_with_subintentions)):
                        if i >= len(Mind2Web_with_subintentions_saved):
                            break
                        if Mind2Web_with_subintentions[i] != Mind2Web_with_subintentions_saved[i]:
                            for key in Mind2Web_with_subintentions[i].keys():
                                if Mind2Web_with_subintentions[i][key] != Mind2Web_with_subintentions_saved[i][key]:
                                    # Report values that do not appear anywhere in the saved file.
                                    found = False
                                    for j in range(len(Mind2Web_with_subintentions_saved)):
                                        if Mind2Web_with_subintentions[i][key] == Mind2Web_with_subintentions_saved[j][key]:
                                            found = True
                                            break
                                    if not found:
                                        print(found, i, j, jsonfilename)
                with open(savejsonfilename, 'w') as json_file:
                    json.dump(Mind2Web_with_subintentions, json_file)
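Note on the parsing above: process_one_task assumes each entry of action_reprs follows the Mind2Web convention "[tag] element text -> OPERATION: value", which is why it splits on " -> " and "] ". A minimal sketch of that parsing and of action_description, using a made-up repr (the string below is illustrative, not a value from the dataset):

# Illustrative only: a hypothetical Mind2Web-style action repr.
action_repr = "[button] Search -> CLICK"
ui_element, _ = action_repr.split(" -> ")                  # "[button] Search"
ui_element_name, ui_element_text = ui_element.split("] ")  # "[button", "Search"
print(action_description(ui_element_name[1:], ui_element_text.strip(), "CLICK", ""))
# Click the button element with text "Search" on it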
preprocess/create_steps.py (new file, 97 lines)

@@ -0,0 +1,97 @@
from tqdm import tqdm
import json
import os
from transformers import AutoTokenizer, AutoModelForCausalLM

def get_tokenizer(model_name_or_path):
    tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, device_map={"": 0})
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = 'left'
    return tokenizer


def get_model(model_name_or_path):
    model = AutoModelForCausalLM.from_pretrained(model_name_or_path, device_map={"": 0})
    return model


def read_json_file(filename):
    with open(filename, 'r') as infile:
        data = json.load(infile)
    return data

if __name__ == "__main__":
    model_name_or_path = "Mistral-7B-v0.1/snapshots/26bca36bde8333b5d7f72e9ed20ccda6a618af24"
    tokenizer = get_tokenizer(model_name_or_path)
    model = get_model(model_name_or_path)

    # load prompts
    with open("your-path-to-data/train_prompt.txt", "r") as f:
        train_prompt = f.read()
    with open("your-path-to-data/test_prompt.txt", "r") as f:
        test_prompt = f.read()

    for foldername in ['train', 'test_domain', 'test_website', 'test_task']:
        SAVE_PATH = f"your-path-to-data/{foldername}"

        for idx in range(100):
            savejsonfilename = f"{SAVE_PATH}/{foldername}_{idx}_with_steps_insert_mistral.json"
            jsonfilename = f"{SAVE_PATH}/{foldername}_{idx}_with_actions_description_insert.json"
            if not os.path.exists(jsonfilename):
                break

            data = read_json_file(jsonfilename)
            if os.path.exists(savejsonfilename):
                # Resume from a partially processed shard if one exists.
                data = read_json_file(savejsonfilename)
            actions_steps = []
            for i in tqdm(range(len(data)), desc="Steps_Creation"):
                if "train" in foldername:  # include task
                    message = f"""Website: {data[i]["website_en"]}
Domain: {data[i]["domain_en"]}
Sub-domain: {data[i]["subdomain_en"]}
Task: {data[i]["task_description"]}
Actions: {data[i]["task_subintention"]}\n
# OUTPUT #
"""
                    prompt = train_prompt
                else:  # exclude task
                    message = f"""Website: {data[i]["website_en"]}
Domain: {data[i]["domain_en"]}
Sub-domain: {data[i]["subdomain_en"]}
Actions: {data[i]["task_subintention"]}\n
# OUTPUT #
"""
                    prompt = test_prompt

                # The chat-style list is unused; Mistral-7B-v0.1 is a base model,
                # so the prompt is fed as one plain "System: ... User: ..." string.
                messages = [
                    {"role": "system", "content": prompt},
                    {"role": "user", "content": message}
                ]
                messages = 'System: ' + prompt + 'User: ' + message

                model_inputs = tokenizer(messages, return_tensors="pt").to("cuda")
                assert model_inputs['input_ids'].shape[1] <= 4096  # check sequence length, not batch size
                generated_ids = model.generate(**model_inputs, max_new_tokens=1024, do_sample=False, top_p=0.95, repetition_penalty=1.2)
                json_object = tokenizer.batch_decode(generated_ids)[0]

                # The completion is expected to contain 'Sub-intentions: [ ... ]';
                # strip quotes, trailing commas and the closing bracket line by line.
                answer = json_object.split('Sub-intentions: [')[-1].split('\n')
                final_answer = []
                for a in answer:
                    a = a.strip()
                    if '</s>' in a:
                        a = a.split('</s>')[0]
                    if len(a) == 0:
                        continue
                    while a[0] == '"':
                        a = a[1:]
                        if len(a) == 0:
                            break
                    if len(a) == 0:
                        continue
                    while a[-1] in ['"', ',', ']']:
                        a = a[:-1]
                        if len(a) == 0:
                            break
                    if len(a) == 0:
                        continue
                    final_answer.append(a)
                data[i]['steps'] = final_answer
                # Checkpoint after every task so an interrupted run can be resumed.
                with open(savejsonfilename, 'w') as json_file:
                    json.dump(data, json_file)
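Note on the post-processing above: the loop assumes the prompt asks the model to answer with a list headed by "Sub-intentions: [...]"; everything after that header is split into lines and cleaned of quotes, trailing commas, the closing bracket and the </s> token. A compact equivalent of those stripping loops, run on a hypothetical completion (illustrative, not real model output):

# Illustrative only: a made-up completion in the format the parser expects.
completion = 'Sub-intentions: ["Open the search page",\n"Type the destination",\n]</s>'
steps = []
for a in completion.split('Sub-intentions: [')[-1].split('\n'):
    a = a.strip().split('</s>')[0]   # drop the end-of-sequence marker
    a = a.strip('"').rstrip('",]')   # strip quotes, trailing comma / closing bracket
    if a:
        steps.append(a)
print(steps)  # ['Open the search page', 'Type the destination']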