Portuguese tokenizer
# -*- coding: utf-8 -*-
"""
Script for tokenizing Portuguese text according to the Universal Dependencies
(UD) tokenization standards. This script was not created by the UD team; it
is based on observation of the corpus.
"""
from __future__ import unicode_literals

import argparse
import os

from nltk.tokenize import RegexpTokenizer


def tokenize(text):
    """
    Tokenize the given Portuguese text.

    :param text: text to be tokenized, as a string
    :return: a list of token strings
    """
    tokenizer_regexp = r'''(?ux)
    # the order of the patterns is important!
    # more structured patterns come first
    [a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+(?:\.[a-zA-Z0-9-]+)+|  # emails
    (?:https?://)?\w{2,}(?:\.\w{2,})+(?:/\w+)*|          # URLs
    (?:[\#@]\w+)|                       # hashtags and Twitter user names
    (?:[^\W\d_]\.)+|                    # one-letter abbreviations, e.g. E.U.A.
    (?:[DSds][Rr][Aa]?)\.|              # common abbreviations such as dr., sr., sra., dra.
    (?:\B-)?\d+(?:[:.,]\d+)*(?:-?\w)*|  # numbers in the format 999.999.999,999,
                                        # possibly followed by a hyphen and alphanumerics;
                                        # \B- avoids picking up the -14 in F-14 as a negative number
    \.{3,}|                             # ellipsis or sequences of dots
    \w+|                                # alphanumerics
    -+|                                 # any sequence of dashes
    \S                                  # any non-space character
    '''
    tokenizer = RegexpTokenizer(tokenizer_regexp)

    return tokenizer.tokenize(text)
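

# Rough usage sketch: calling tokenize() on a short sentence. The output shown
# is what the pattern above is expected to produce (abbreviation, time and
# final punctuation kept as separate tokens), not verified output:
#
#     >>> tokenize('O dr. Silva chegou às 10:30.')
#     ['O', 'dr.', 'Silva', 'chegou', 'às', '10:30', '.']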


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument('inputs', nargs='+',
                        help='Files to tokenize (new files with .token extension '
                             'will be generated)')
    args = parser.parse_args()

    for filename in args.inputs:
        print('Tokenizing %s' % filename)
        tokenized_lines = []
        basename, _ = os.path.splitext(filename)
        new_name = basename + '.token'

        with open(filename, 'rb') as f:
            for line in f:
                line = line.decode('utf-8')
                tokens = tokenize(line)
                tokenized_line = ' '.join(tokens)
                tokenized_lines.append(tokenized_line)

        text = '\n'.join(tokenized_lines)
        with open(new_name, 'wb') as f:
            f.write(text.encode('utf-8'))
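
# Command-line usage sketch (the script file name is an assumption):
#
#     python tokenizer_pt.py corpus1.txt corpus2.txt
#
# For each input file, a tokenized copy is written next to it with the same
# base name and a .token extension (e.g. corpus1.token), with one tokenized
# line per input line.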