Skip to content

Instantly share code, notes, and snippets.

@zyocum
Last active September 14, 2017 01:01
Show Gist options
  • Save zyocum/f44951db073c5a8f39988d2d63cf4e71 to your computer and use it in GitHub Desktop.
Save zyocum/f44951db073c5a8f39988d2d63cf4e71 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python3
"""Tokenize text naively based on unicode character properties"""
import json
import regex
import sys
from collections import namedtuple
# Matches one character in the Unicode general category P (punctuation).
PUNCT = regex.compile(r'\p{P}')
# Matches one character that has the Unicode White_Space property.
SPACE = regex.compile(r'\p{White_Space=Y}')
# A token produced by tokenize(): its text, the start (inclusive) and end
# (exclusive) offsets counted in characters from the beginning of the input,
# and its type label as returned by c_type() ('punct', 'space' or 'word').
Token = namedtuple('Token', ['text', 'start', 'end', 'type'])
def c_type(c, non_punct='_-%$#@'):
    """Return the type label of the character c.

    The label is 'punct' when c matches PUNCT and is not listed in
    non_punct, 'space' when c matches SPACE, and 'word' in every other
    case. Characters in non_punct are therefore never labelled 'punct'.
    """
    counts_as_punct = PUNCT.match(c) and c not in non_punct
    if counts_as_punct:
        return 'punct'
    return 'space' if SPACE.match(c) else 'word'
def c_types(blob):
    """Lazily pair every character of blob with its c_type() label.

    Returns a generator of (character, label) tuples in input order.
    """
    return ((char, c_type(char)) for char in blob)
def tokenize(blob):
    """Yield Token tuples for maximal runs of same-typed characters.

    Consecutive characters that share a c_type() label are merged into a
    single token. Each token carries its text, its start (inclusive) and
    end (exclusive) character offsets into blob, and the shared label.
    An empty blob yields no tokens.
    """
    pending, start, kind = [], 0, None
    for offset, (char, char_kind) in enumerate(c_types(blob)):
        if kind is not None and char_kind != kind:
            # The label changed: the current run ends just before offset.
            yield Token(''.join(pending), start, offset, kind)
            pending, start = [], offset
        pending.append(char)
        kind = char_kind
    # Flush the final run, unless nothing was accumulated.
    tail = ''.join(pending)
    if tail:
        yield Token(tail, start, start + len(pending), kind)
if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
    # Every option is a boolean switch. argparse derives each destination
    # name from the first long flag, so the type switches end up stored as
    # 'word', 'punct' and 'space' -- the same labels used in Token.type.
    switches = (
        (('-l', '--lower'),
         'transduce tokens to lowercase'),
        (('-w', '--word', '--words'),
         'output word tokens'),
        (('-p', '--punct', '--puncts', '--punctuation'),
         'output punctuation tokens'),
        (('-s', '--space', '--spaces'),
         'output whitespace tokens'),
    )
    for flags, description in switches:
        parser.add_argument(*flags, action='store_true', help=description)

    selected = vars(parser.parse_args())
    # What remains after removing 'lower' is the per-type output selection.
    lowercase = selected.pop('lower')
    # if no type options are specified, act as if all are enabled
    if not any(selected.values()):
        selected = dict.fromkeys(selected, True)

    # Emit one JSON object per line for every token of a selected type.
    for token in tokenize(sys.stdin.read()):
        if not selected[token.type]:
            continue
        if lowercase:
            token = token._replace(text=token.text.lower())
        print(json.dumps(token._asdict(), ensure_ascii=False))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment