Skip to content

Instantly share code, notes, and snippets.

@mjpost
Created July 31, 2020 18:46
Show Gist options
  • Save mjpost/ed7456f6a987c533102fc121678ed302 to your computer and use it in GitHub Desktop.
Save mjpost/ed7456f6a987c533102fc121678ed302 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python3
import sys
import sacremoses
def main(args):
    """Tokenize tab-separated lines from stdin with Moses, preserving tabs.

    Each input line is split on tab characters, the selected columns are
    tokenized independently, and the columns are re-joined with tabs and
    written to stdout (flushed per line so the script can sit in a pipeline).

    Args:
        args: Namespace-like object with these attributes:
            lang: language code passed to ``sacremoses.MosesTokenizer``.
            penn (optional): if true, use ``penn_tokenize`` (Penn
                Treebank-style) instead of the default ``tokenize``.
            fields (optional): comma-separated, 1-based column numbers
                (as in ``cut -f``) naming the columns to tokenize; all other
                columns are copied through unchanged. When absent or empty,
                every column is tokenized.

    Raises:
        ValueError: if ``fields`` contains a non-integer or a number < 1.
    """
    mt = sacremoses.MosesTokenizer(lang=args.lang)

    # getattr keeps main() usable with a namespace that only carries `lang`.
    tokenize = mt.penn_tokenize if getattr(args, "penn", False) else mt.tokenize

    def tok(s):
        return tokenize(s, return_str=True)

    selected = None  # None means "tokenize every column"
    fields_spec = getattr(args, "fields", None)
    if fields_spec:
        selected = {int(field) for field in fields_spec.split(",")}
        if min(selected) < 1:
            raise ValueError(
                f"field numbers are 1-based, got {fields_spec!r}"
            )

    for line in sys.stdin:
        # Drop the line terminator explicitly so that an untokenized last
        # column is not printed with a doubled newline.
        columns = line.rstrip("\n").split("\t")
        parts = [
            tok(column) if selected is None or index in selected else column
            for index, column in enumerate(columns, start=1)
        ]
        print(*parts, sep="\t", flush=True)
if __name__ == "__main__":
    # Command-line entry point: build the option parser and hand the parsed
    # namespace to main() unchanged.
    import argparse

    cli = argparse.ArgumentParser()
    cli.add_argument("--lang", "-l", default="en")
    cli.add_argument("--penn", "-p", action="store_true")
    cli.add_argument("--fields", "-f", help="fields to tokenize")
    main(cli.parse_args())
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment