# SummAct/preprocess/convert_dataset.py

import os, pdb
import re
import json
from enum import Enum
from tqdm import tqdm
from bs4 import BeautifulSoup

def read_json_file(filename):
    """Load and return the JSON document stored at ``filename``.

    The file is decoded as UTF-8 explicitly rather than with the platform's
    locale encoding, so non-ASCII text is read the same way on every system.

    Args:
        filename: Path of the JSON file to read.

    Returns:
        The decoded JSON value, exactly as produced by ``json.load``.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    with open(filename, 'r', encoding='utf-8') as infile:
        return json.load(infile)

def convert_string(string_or_list):
    """Backslash-escape every double quote in a string or list of strings.

    Lists are processed element by element (recursively, so nested lists
    work too). Any value that is neither a ``str`` nor a ``list`` yields
    ``None``.
    """
    if isinstance(string_or_list, list):
        return list(map(convert_string, string_or_list))
    if isinstance(string_or_list, str):
        # Splitting on the quote and re-joining with the escaped form is the
        # same as replacing each '"' with '\"'.
        return '\\"'.join(string_or_list.split('"'))
    return None

def is_visible(element):
    """Tell whether ``element`` counts as visible.

    An element is treated as hidden only when its ``bounding_box_rect``
    attribute is exactly the string "-1,-1,-1,-1"; anything else,
    including a missing attribute, counts as visible.
    """
    hidden_marker = "-1,-1,-1,-1"
    return not (element.get('bounding_box_rect') == hidden_marker)

def clean_text(text):
    """Normalize whitespace in ``text``.

    Leading and trailing whitespace is stripped, every newline and tab is
    turned into a space, and each run of consecutive spaces is collapsed to
    a single space.
    """
    trimmed = text.strip()
    # Newlines and tabs become plain spaces so the collapse below sees them.
    spaced = trimmed.translate(str.maketrans({'\n': ' ', '\t': ' '}))
    return re.sub(' +', ' ', spaced)

def find_semantic_info(element):
    """Return a cleaned, human-readable text for ``element``, or ``None``.

    The element's own text is used when it is non-empty. Otherwise the
    text of a ``<label>`` found via ``element.find_previous`` is used,
    considering only labels that ``is_visible`` accepts. ``None`` is
    returned when neither source yields any text.
    """
    def _is_visible_label(node):
        # Only <label> tags that are not flagged as hidden qualify.
        return node.name == 'label' and is_visible(node)

    own_text = clean_text(element.get_text(strip=True))
    if own_text:
        return own_text

    preceding_label = element.find_previous(_is_visible_label)
    if not preceding_label:
        return None

    caption = clean_text(preceding_label.get_text(strip=True))
    return caption if caption else None

def action_discription(ui_element_name, ui_element_text, operation_type, value):
    """Build an English sentence describing a single UI action.

    Args:
        ui_element_name: Name of the target element, inserted verbatim.
        ui_element_text: Text shown on the element. An empty string or
            ``None`` means "no text": the ``with text "..."`` clause is then
            omitted instead of rendering the literal word ``None``.
        operation_type: One of "TYPE", "SELECT" or "CLICK".
        value: Text typed or option selected; not used for "CLICK".

    Returns:
        The description sentence, or an empty string when ``operation_type``
        is none of the three supported operations.
    """
    # None must be treated like "" here: a plain `!= ""` test is true for
    # None and would produce `with text "None" on it`.
    has_text = ui_element_text is not None and ui_element_text != ""

    if operation_type == "TYPE":
        if has_text:
            return f'Type text "{value}" into {ui_element_name} with text "{ui_element_text}" on it'
        return f'Type text "{value}" into {ui_element_name}'

    if operation_type == "SELECT":
        if has_text:
            return f'Select "{value}" from {ui_element_name} with text "{ui_element_text}" on it'
        # NOTE(review): this is the only variant ending with a period; it is
        # kept as-is so previously generated data stays comparable.
        return f'Select "{value}" from {ui_element_name}.'

    if operation_type == "CLICK":
        if has_text:
            return f'Click the {ui_element_name} element with text "{ui_element_text}" on it'
        return f'Click the {ui_element_name} element'

    return ""

def process_one_task(task):
    """Convert one task record into its summary dict with action descriptions.

    For every action, the matching entry of ``task["action_reprs"]`` (of the
    form ``"[name]  text -> operation"``) is split into the element name and
    element text. When the text is empty, it is recovered from the action's
    ``raw_html`` via ``find_semantic_info``; on success the entry in
    ``task["action_reprs"]`` is rewritten in place to include that text.

    Args:
        task: Dict providing the keys "website", "domain", "subdomain",
            "annotation_id", "confirmed_task", "action_reprs" and "actions".
            Each action provides "raw_html", "action_uid" and an
            "operation" dict with "op" and "value".

    Returns:
        A dict with the task metadata, the (possibly updated) "action_reprs"
        list and, under "task_subintention", one English description per
        action.

    Raises:
        AssertionError: If an action repr does not contain exactly one
            ``"]  "`` separator in its element part.
        ValueError: If an action repr does not split into exactly two parts
            on ``" -> "``.
    """
    base_info = {
        "website_en": task["website"],
        "domain_en": task["domain"],
        "subdomain_en": task["subdomain"],
        "annotation_id": task["annotation_id"],
        "task_description": task["confirmed_task"],
        # Same list object as task["action_reprs"], so the in-place rewrites
        # below are visible in the returned dict.
        "action_reprs": task["action_reprs"],
    }
    action_descriptions_en = []
    for action_index, action in enumerate(task["actions"]):
        action_repr = task["action_reprs"][action_index]
        ui_element, operation_repr = action_repr.split(" -> ")
        assert ui_element.count("]  ") == 1
        ui_element_name, ui_element_text = ui_element.split("]  ")
        ui_element_name = ui_element_name[1:]  # drop the leading "["
        ui_element_text = ui_element_text.strip()

        if ui_element_text == "":
            soup = BeautifulSoup(action["raw_html"], 'html.parser')
            selected_element = soup.find(attrs={"data_pw_testid_buckeye": action["action_uid"]})

            # The target element may be absent from the HTML; treat that the
            # same as "no text found" instead of crashing on None.
            recovered_text = None
            if selected_element is not None:
                recovered_text = find_semantic_info(selected_element)

            if recovered_text is not None:
                ui_element_text = clean_text(recovered_text)
                task["action_reprs"][action_index] = f"[{ui_element_name}]  {ui_element_text} -> {operation_repr}"
            else:
                # ui_element_text stays "" so the description omits the text
                # clause rather than embedding the word "None".
                print(f'Warning: {task["annotation_id"]}, can not find semantic info for {action["action_uid"]}')

        action_description_en = action_discription(
            ui_element_name,
            ui_element_text,
            action["operation"]["op"],
            action["operation"]["value"],
        )
        action_descriptions_en.append(action_description_en)

    base_info["task_subintention"] = action_descriptions_en
    return base_info

if __name__ == "__main__":
    # For each dataset split, convert the numbered shard files
    # "<split>_<idx>.json" into "<split>_<idx>_with_actions_description_insert.json".
    # Shards whose output already exists are skipped; the first missing input
    # shard ends the scan of that split.
    for foldername in ['train', 'test_domain', 'test_website', 'test_task']:
        SAVE_PATH = f"your-path-to-data/{foldername}"

        for idx in range(100):
            savejsonfilename = os.path.join(SAVE_PATH, f'{foldername}_{idx}_with_actions_description_insert.json')
            if os.path.exists(savejsonfilename):
                continue

            jsonfilename = f"{SAVE_PATH}/{foldername}_{idx}.json"
            if not os.path.exists(jsonfilename):
                break
            dataset = read_json_file(jsonfilename)

            Mind2Web_with_subintentions = [process_one_task(task) for task in tqdm(dataset)]

            if 'test' in foldername:
                # Diagnostic comparison against a previously saved conversion.
                reference_path = os.path.join(SAVE_PATH, f'{foldername}_{idx}_with_actions_description.json')
                if os.path.exists(reference_path):
                    with open(reference_path, 'r', encoding='utf-8') as json_file:
                        Mind2Web_with_subintentions_saved = json.load(json_file)

                    # zip stops at the shorter list, so extra new records are
                    # not compared.
                    for i, (current, saved) in enumerate(
                        zip(Mind2Web_with_subintentions, Mind2Web_with_subintentions_saved)
                    ):
                        if current == saved:
                            continue
                        for key, current_value in current.items():
                            if current_value == saved[key]:
                                continue
                            # Report only values that match no saved record at all.
                            found = any(
                                current_value == other[key]
                                for other in Mind2Web_with_subintentions_saved
                            )
                            if not found:
                                # Third field is the last saved index scanned.
                                print(found, i, len(Mind2Web_with_subintentions_saved) - 1, jsonfilename)
                else:
                    # Without a reference file the comparison is skipped so
                    # the converted shard is still written below.
                    print(f'Warning: reference file {reference_path} not found, skipping comparison')

            with open(savejsonfilename, 'w', encoding='utf-8') as json_file:
                json.dump(Mind2Web_with_subintentions, json_file)