44 lines
1017 B
Python
44 lines
1017 B
Python
import os
|
|
import sys
|
|
|
|
import click
|
|
|
|
|
|
def read(path):
    """Yield the five tab-separated fields of every well-formed line in *path*.

    Each line is stripped of surrounding whitespace and split on tab
    characters. A line that does not split into exactly five fields is
    reported on stderr and skipped.

    Args:
        path: Path of the text file to read.

    Yields:
        A 5-tuple of strings ``(b, s, p, a, f)`` for each valid line.
    """
    with open(path) as h:
        for line in h:
            # NOTE: strip() also removes leading/trailing tabs, so a row whose
            # first or last field is empty ends up with fewer than five
            # fields and is skipped below.
            line = line.strip()
            try:
                b, s, p, a, f = line.split("\t")
            except ValueError:
                # Wrong field count is the only failure expected here; a bare
                # except would also swallow KeyboardInterrupt/SystemExit.
                print(f"skipping line {line}", file=sys.stderr)
                continue
            yield b, s, p, a, f
|
|
|
|
|
|
@click.command()
@click.argument("path")
def main(path):
    """Split the rows of PATH into "long" and "short" files.

    Rows are read with ``read``. The average character length of the second
    field is computed over all valid rows; rows whose second field is longer
    than that average are written to ``<name>_long<ext>``, all others to
    ``<name>_short<ext>`` (same directory and extension as PATH). The average
    is printed to stdout at the end.

    Raises:
        click.ClickException: If PATH contains no valid rows, since no
            average can be computed.
    """
    data = list(read(path))
    if not data:
        # Fail with a readable message instead of a ZeroDivisionError, and
        # do so before any output file is created.
        raise click.ClickException(f"no valid rows found in {path}")

    avg_len = sum(len(x[1]) for x in data) / len(data)

    # splitext() returns the extension with its leading dot already, so it
    # can be appended as-is (prepending another "." yields "name_long..ext").
    fname, ext = os.path.splitext(path)
    with open(f"{fname}_long{ext}", "w") as lh, open(f"{fname}_short{ext}", "w") as sh:
        for x in data:
            target = lh if len(x[1]) > avg_len else sh
            target.write("\t".join(x) + "\n")

    print(f"avg sentence length {avg_len}")
|
|
|
|
|
|
if __name__ == "__main__":
    # Invoke the click command only when the file is executed as a script,
    # not when it is imported as a module.
    main()
|