Last active
September 14, 2017 01:01
-
-
Save zyocum/f44951db073c5a8f39988d2d63cf4e71 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
"""Tokenize text naively based on unicode character properties""" | |
import json | |
import regex | |
import sys | |
from collections import namedtuple | |
# Matches any character in a Unicode punctuation category (\p{P}).
PUNCT = regex.compile(r'\p{P}')
# Matches any character with the Unicode White_Space property.
SPACE = regex.compile(r'\p{White_Space=Y}')
# A token: its text, half-open [start, end) character offsets into the
# input, and its type ('word', 'punct', or 'space').
Token = namedtuple('Token', ['text', 'start', 'end', 'type'])
def c_type(c, non_punct='_-%$#@'):
    """Classify a single character as 'punct', 'space', or 'word'.

    Characters listed in non_punct are exempted from the 'punct' class
    even if Unicode considers them punctuation (useful for keeping
    e.g. hyphenated or prefixed words intact).
    """
    is_exempt = c in non_punct
    if PUNCT.match(c) and not is_exempt:
        return 'punct'
    return 'space' if SPACE.match(c) else 'word'
def c_types(blob):
    """Yield (character, type) pairs for each character in blob.

    Type is one of 'word', 'punct', or 'space' as determined by c_type.
    """
    # A string is already iterable; the original wrapped it in iter(),
    # which is redundant inside a generator expression.
    return ((c, c_type(c)) for c in blob)
def tokenize(blob):
    """Yield Token namedtuples for maximal runs of same-typed characters.

    Consecutive characters sharing a type ('word'/'punct'/'space') are
    merged into one Token carrying the run's text and its half-open
    [start, end) offsets into blob.
    """
    buf = ''
    lo = hi = 0
    current = None
    for ch, kind in c_types(blob):
        if current is not None and kind != current:
            # Type changed: emit the finished run and start a new one.
            yield Token(buf, lo, hi, current)
            buf, lo = ch, hi
        else:
            # First character, or same type as the run in progress.
            buf += ch
        hi += 1
        current = kind
    # Flush the final run (empty input yields nothing).
    if buf:
        yield Token(buf, lo, hi, current)
if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
    parser.add_argument(
        '-l', '--lower',
        action='store_true',
        help='transduce tokens to lowercase'
    )
    parser.add_argument(
        '-w', '--word', '--words',
        action='store_true',
        help='output word tokens'
    )
    parser.add_argument(
        '-p', '--punct', '--puncts', '--punctuation',
        action='store_true',
        help='output punctuation tokens'
    )
    parser.add_argument(
        '-s', '--space', '--spaces',
        action='store_true',
        help='output whitespace tokens'
    )
    # Remaining flags after popping 'lower' map token type -> enabled.
    options = vars(parser.parse_args())
    lower = options.pop('lower')
    # With no type flags given, behave as though every type was requested.
    if not any(options.values()):
        options = dict.fromkeys(options, True)
    for token in tokenize(sys.stdin.read()):
        if not options[token.type]:
            continue
        if lower:
            token = token._replace(text=token.text.lower())
        # One JSON object per line; keep non-ASCII text readable.
        print(json.dumps(token._asdict(), ensure_ascii=False))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment