Skip to content

Instantly share code, notes, and snippets.

@jelmervdl
Created June 7, 2023 14:56
Show Gist options
  • Save jelmervdl/b541edc138f2aedf3ad121847bc96d3a to your computer and use it in GitHub Desktop.
Save jelmervdl/b541edc138f2aedf3ad121847bc96d3a to your computer and use it in GitHub Desktop.
#!/usr/bin/env python3
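# Reads tab-separated sentences whose last column is a hardrules score.
# Lines scored 0 are printed to stdout unchanged; lines scored 1 are passed
# (without the score column) to a subprocess, e.g. bicleaner score, and the
# subprocess's output line is printed in their place.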
import sys
import subprocess
from threading import Thread
from queue import SimpleQueue
from itertools import chain
# Queue marker: the next output line has to be read from the child process.
ReadFromChild = object()
# Queue marker: the input is exhausted; the consumer stops when it sees this.
EndOfInput = object()
def fast_forward(stdin, stdout):
    """Copy the leading run of zero-scored lines from stdin to stdout.

    Lines are consumed one at a time. As long as the last tab-separated
    column of a line is exactly "0", the line is written to stdout unchanged.
    The first line with any other value in that column is returned to the
    caller without being written. Returns None when the input runs out before
    such a line is seen.
    """
    for line in stdin:
        # Only the text after the final tab is the score.
        _, _, score = line.rstrip("\r\n").rpartition("\t")
        if score != "0":
            return line
        stdout.write(line)
    return None
def stdin_to_child(out_queue, stdin, child_in):
    """Route input lines to the output queue and, where needed, to the child.

    For every line of `stdin` the last tab-separated column is inspected:

    * "0": the line itself is queued, to be written out verbatim.
    * "1": the line without its score column is written to `child_in`, and
      the ReadFromChild marker is queued so the consumer knows to take the
      next output line from the child instead.
    * anything else: a ValueError is raised, which ends up on the queue.

    When the input is exhausted the EndOfInput marker is queued. Any
    exception raised along the way is queued in its place, so the consumer
    can re-raise it. `child_in` is closed in all cases (for a pipe, that is
    what signals end-of-input to the reader).
    """
    try:
        for line in stdin:
            # Only the last column is the score; rpartition avoids splitting
            # and re-joining all the other columns.
            columns, _, score = line.rstrip("\r\n").rpartition("\t")
            if score == "0":
                out_queue.put(line)
            elif score == "1":
                # Give it to the child, except for the last column
                child_in.write(columns + "\n")
                out_queue.put(ReadFromChild)
            else:
                raise ValueError(f"Unknown input in score column: '{score}'. Only expecting '0' or '1'.")
        out_queue.put(EndOfInput)
    except Exception as e:
        # Hand the failure to the consumer rather than losing it in this thread.
        out_queue.put(e)
    finally:
        child_in.close()
def child_to_stdout(out_queue, child_out, stdout):
    """Drain the queue and write the output lines in input order.

    Each queue item decides what is written next:

    * a str is written to `stdout` as-is;
    * ReadFromChild means the next line is read from `child_out` and written;
    * EndOfInput ends the loop;
    * an Exception instance is re-raised here.

    Raises:
        EOFError: `child_out` hit end-of-file while a line was still expected
            from it. Writing nothing in that case would silently drop a line
            and leave the output misaligned with the input.
        Exception: the queued exception, or a generic one for a queue item of
            an unexpected type.
    """
    while True:
        item = out_queue.get()
        if item is EndOfInput:
            break
        elif item is ReadFromChild:
            scored = child_out.readline()
            # readline() only returns an empty string at end-of-file; an
            # empty line from the child would still be "\n".
            if not scored:
                raise EOFError("Child process stopped producing output before all lines were scored")
            stdout.write(scored)
        elif isinstance(item, str):
            stdout.write(item)
        elif isinstance(item, Exception):
            raise item
        else:
            raise Exception("Unknown value in queue")
def usage(prog):
    """Print the command-line synopsis for `prog` to stderr and return 1."""
    synopsis = f"Usage: {prog} command [arg...]"
    print(synopsis, file=sys.stderr)
    return 1
def main(argv):
    """Copy zero-scored lines to stdout and let a child process score the rest.

    `argv[1:]` is the command line of the child process. Returns the exit
    status for the script: the result of usage() when no command is given,
    0 when every input line had a score of 0 (the child is never started),
    141 when a pipe was closed on us, and otherwise the child's return code.
    """
    import os  # only needed by the broken-pipe handler below

    if len(argv) < 2:
        return usage(argv[0])

    try:
        # Fast-forward while all lines are marked with a score of 0. If we
        # have hit a file that only has these kind of lines, we can skip
        # starting the child process altogether. This happens inside the try
        # so a closed stdout is handled the same way here as later on.
        next_line = fast_forward(sys.stdin, sys.stdout)
        if next_line is None:
            # Flush now, so that a broken pipe surfaces inside this try
            # instead of at interpreter shutdown.
            sys.stdout.flush()
            return 0

        child = subprocess.Popen(argv[1:], stdin=subprocess.PIPE, stdout=subprocess.PIPE, encoding='utf-8')

        # The queue carries, in input order: the line itself when it can be
        # written out verbatim, ReadFromChild when the line has to be read
        # from the child, and finally EndOfInput (or an exception raised by
        # the feeder).
        queue = SimpleQueue()

        # feeder feeds stdin to queue, and lines that need to be scored to
        # child; it closes the child's stdin when it is done.
        # Use chain() to stick next_line back onto the stdin feed.
        # Note: daemon=True will make sure it does not keep the main thread
        # alive when that should stop because of an exception in
        # child_to_stdout().
        feeder = Thread(target=stdin_to_child, args=[queue, chain([next_line], sys.stdin), child.stdin], daemon=True)
        feeder.start()

        # child_to_stdout writes the queued lines and, where the queue says
        # so, the lines produced by the child. It stops at EndOfInput.
        child_to_stdout(queue, child.stdout, sys.stdout)
        # Flush inside the try for the same reason as above.
        sys.stdout.flush()
        feeder.join()
        # Assuming we've processed all output, child should be finished
        # producing output by now.
        child.wait()
        return child.returncode
    except BrokenPipeError:
        # TODO: close stdin? Properly tell the child to stop?
        # The broken pipe may be our stdout or the child's stdin. Try to
        # flush what is still buffered; if stdout itself is the broken pipe,
        # point it at devnull so the flush Python performs at exit cannot
        # fail again and replace our exit status.
        try:
            sys.stdout.flush()
        except BrokenPipeError:
            devnull = os.open(os.devnull, os.O_WRONLY)
            try:
                os.dup2(devnull, sys.stdout.fileno())
            finally:
                os.close(devnull)
        # 128 + 13 (SIGPIPE), the status a shell reports for a SIGPIPE death.
        return 141
# Only run when executed as a script, so importing this module has no side
# effects. The return value of main() becomes the process exit status.
if __name__ == "__main__":
    sys.exit(main(sys.argv))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment