import os
import sys
from typing import Iterator, Tuple

import click


def read(path: str) -> Iterator[Tuple[str, str, str, str, str]]:
    """Yield the five tab-separated fields of each well-formed line in *path*.

    Each line is stripped of surrounding whitespace and split on tabs. Lines
    that do not split into exactly five fields are reported on stderr and
    skipped.
    """
    with open(path) as h:
        for line in h:
            # NOTE: strip() also removes leading/trailing tabs, so a line whose
            # first or last field is empty ends up with fewer than five fields
            # and is skipped below.
            line = line.strip()
            try:
                b, s, p, a, f = line.split("\t")
            except ValueError:
                # Wrong number of fields: report it and move on.
                print(f"skipping line {line}", file=sys.stderr)
                continue
            yield b, s, p, a, f


@click.command()
@click.argument("path")
def main(path):
    """Split the rows of PATH into "long" and "short" files.

    A row is "long" when the length of its second field is strictly greater
    than the average length of that field over all valid rows; every other
    row is "short". Output goes to <name>_long<ext> and <name>_short<ext>
    next to the input file, and the average is printed at the end.
    """
    data = list(read(path))
    if not data:
        # Without this guard the average below divides by zero.
        raise click.ClickException(f"no valid rows found in {path}")

    avg_len = sum(len(row[1]) for row in data) / len(data)

    # splitext() keeps the leading dot on the extension ("file.tsv" ->
    # ("file", ".tsv")), so it must not be prefixed with another dot.
    fname, ext = os.path.splitext(path)

    with open(f"{fname}_long{ext}", "w") as lh, \
            open(f"{fname}_short{ext}", "w") as sh:
        for row in data:
            out = lh if len(row[1]) > avg_len else sh
            out.write("\t".join(row) + "\n")

    print(f"avg sentence length {avg_len}")


if __name__ == "__main__":
    main()