Last active
September 14, 2017 01:01
-
-
Save zyocum/f44951db073c5a8f39988d2d63cf4e71 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
"""Tokenize text naively based on unicode character properties""" | |
import json | |
import regex | |
import sys | |
from collections import namedtuple | |
# Matches any character in a Unicode punctuation category (\p{P}).
PUNCT = regex.compile(r'\p{P}')
# Matches any character with the Unicode White_Space property.
SPACE = regex.compile(r'\p{White_Space=Y}')
# A token: its text, half-open [start, end) character offsets into the
# input, and its type ('word', 'punct', or 'space').
Token = namedtuple('Token', ['text', 'start', 'end', 'type'])
def c_type(c, non_punct='_-%$#@'):
    """Classify a single character as 'punct', 'space', or 'word'.

    Characters listed in non_punct are exempted from the 'punct' class
    even if Unicode considers them punctuation (useful for keeping
    e.g. hyphenated or prefixed words intact).
    """
    is_exempt = c in non_punct
    if PUNCT.match(c) and not is_exempt:
        return 'punct'
    return 'space' if SPACE.match(c) else 'word'
def c_types(blob):
    """Yield (character, type) pairs for each character in blob.

    Type is one of 'word', 'punct', or 'space' as determined by c_type.
    """
    # A string is already iterable; the original wrapped it in iter(),
    # which is redundant inside a generator expression.
    return ((c, c_type(c)) for c in blob)
def tokenize(blob):
    """Yield Token namedtuples for maximal runs of same-typed characters.

    Consecutive characters sharing a type ('word'/'punct'/'space') are
    merged into one Token carrying the run's text and its half-open
    [start, end) offsets into blob.
    """
    buf = ''
    lo = hi = 0
    current = None
    for ch, kind in c_types(blob):
        if current is not None and kind != current:
            # Type changed: emit the finished run and start a new one.
            yield Token(buf, lo, hi, current)
            buf, lo = ch, hi
        else:
            # First character, or same type as the run in progress.
            buf += ch
        hi += 1
        current = kind
    # Flush the final run (empty input yields nothing).
    if buf:
        yield Token(buf, lo, hi, current)
if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
    parser.add_argument(
        '-l', '--lower',
        action='store_true',
        help='transduce tokens to lowercase'
    )
    parser.add_argument(
        '-w', '--word', '--words',
        action='store_true',
        help='output word tokens'
    )
    parser.add_argument(
        '-p', '--punct', '--puncts', '--punctuation',
        action='store_true',
        help='output punctuation tokens'
    )
    parser.add_argument(
        '-s', '--space', '--spaces',
        action='store_true',
        help='output whitespace tokens'
    )
    # Remaining flags after popping 'lower' map token type -> enabled.
    options = vars(parser.parse_args())
    lower = options.pop('lower')
    # With no type flags given, behave as though every type was requested.
    if not any(options.values()):
        options = dict.fromkeys(options, True)
    for token in tokenize(sys.stdin.read()):
        if not options[token.type]:
            continue
        if lower:
            token = token._replace(text=token.text.lower())
        # One JSON object per line; keep non-ASCII text readable.
        print(json.dumps(token._asdict(), ensure_ascii=False))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment