human-gaze-guided-neural-at.../joint_paraphrase_model/utils/long_sentence_split.py

44 lines
1017 B
Python

import os
import sys
import click
def read(path):
    """Yield the five tab-separated fields of each well-formed line in *path*.

    Every line is stripped of surrounding whitespace and split on tab
    characters. Lines that do not split into exactly five fields are reported
    on stderr and skipped.

    Args:
        path: Path of the tab-separated file to read.

    Yields:
        A ``(b, s, p, a, f)`` tuple of strings for each valid line, in file
        order.
    """
    with open(path) as h:
        for line in h:
            line = line.strip()
            try:
                b, s, p, a, f = line.split("\t")
            except ValueError:
                # A wrong field count is the only failure the unpacking can
                # produce; catching just ValueError keeps Ctrl-C and other
                # unrelated errors from being silently swallowed.
                print(f"skipping line {line}", file=sys.stderr)
                continue
            else:
                yield b, s, p, a, f
@click.command()
@click.argument("path")
def main(path):
    """Split the rows of PATH into long- and short-sentence files.

    Rows are loaded with read(). The average character length of the second
    field is computed over all rows; rows whose second field is longer than
    that average are written to "<stem>_long<ext>", all others to
    "<stem>_short<ext>", next to the input file. The average is printed to
    stdout at the end.

    Raises click.ClickException when PATH contains no valid rows, since no
    average can be computed in that case.
    """
    data = list(read(path))
    if not data:
        # Without this guard the average below divides by zero.
        raise click.ClickException(f"no valid rows found in {path}")

    avg_len = sum(len(x[1]) for x in data) / len(data)

    # os.path.splitext keeps the leading dot in the extension (".tsv"), so it
    # must not be prefixed with another one when building the output names.
    fname, ext = os.path.splitext(path)
    with open(f"{fname}_long{ext}", "w") as lh, open(f"{fname}_short{ext}", "w") as sh:
        for x in data:
            # Strictly longer than average goes to the "long" file.
            out = lh if len(x[1]) > avg_len else sh
            out.write("\t".join(x))
            out.write("\n")

    print(f"avg sentence length {avg_len}")


if __name__ == "__main__":
    main()