word tokenizer file
# Some notes on this gist:
# All abstract and base class material has been moved to the top of the file so that import
# resolution and IDEs won't complain unnecessarily.
# _get_models_path is now a default method on the base class.
# tokenize has been promoted to a default method, and downstream signatures were made
# consistent with this; whether this is a great idea is open to debate.
# The mutable default list argument, a subtle bug, has been corrected; see
# https://docs.python-guide.org/writing/gotchas/ and the short illustration just below.
# The `self: object` annotation has been removed, as it is implied and it confuses IDEs.
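# For reference, the gotcha mentioned above in brief (illustrative only, not part of this
# module): a list literal used as a default argument is created once, at function definition
# time, and is then shared by every call.
#
#     def tokenize(self, text, sent_end_chars=['.']):    # BUG: one list shared across calls
#         ...
#
#     def tokenize(self, text, sent_end_chars=None):     # preferred: default to None and
#         sent_end_chars = sent_end_chars or ['.']       # build the list inside the function
#         ...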
"""Tokenize sentences.""" | |
__author__ = ['Patrick J. Burns <patrick@diyclassics.org>', | |
'Kyle P. Johnson <kyle@kyle-p-johnson.com>', 'Anoop Kunchukuttan'] | |
__license__ = 'MIT License. See LICENSE.' | |
import os | |
import re | |
import string | |
from typing import List, Dict, Tuple, Set, Any, Generator | |
from nltk.tokenize.punkt import PunktLanguageVars | |
from nltk.tokenize.punkt import PunktSentenceTokenizer | |
from cltk.tokenize.latin.params import LatinLanguageVars | |
from cltk.utils.file_operations import open_pickle | |
from abc import abstractmethod | |
# Part of the Latin workaround:
# class LatinLanguageVars(PunktLanguageVars):
#     _re_non_word_chars = PunktLanguageVars._re_non_word_chars.replace("'", '')

PUNCTUATION = {'greek':
                   {'external': ('.', ';'),
                    'internal': (',', '·'),
                    'file': 'greek.pickle', },
               }

INDIAN_LANGUAGES = ['bengali', 'hindi', 'marathi', 'sanskrit', 'telugu']
class BaseSentenceTokenizer:
    """Base class for sentence tokenization."""

    def __init__(self, language: str = None):
        """Initialize the tokenizer, with an option for language-specific parameters.
        :param language: language for sentence tokenization
        :type language: str
        """
        if language:
            self.language = language.lower()
    def tokenize(self, text: str, model: object = None):
        """
        Tokenize sentences with a pretrained punkt model; can
        be overridden by language-specific tokenizers.
        :rtype: list
        :param text: text to be tokenized into sentences
        :type text: str
        :param model: tokenizer object to be used  # Should this be set in __init__?
        :type model: object
        """
        if not model:
            model = self.model
        tokenizer = model
        if self.lang_vars:
            tokenizer._lang_vars = self.lang_vars
        return tokenizer.tokenize(text)
    def _get_models_path(self, language):  # pragma: no cover
        return f'~/cltk_data/{language}/model/{language}_models_cltk/tokenizers/sentence'
class BasePunktSentenceTokenizer(BaseSentenceTokenizer):
    """Base class for punkt sentence tokenization."""

    missing_models_message = "BasePunktSentenceTokenizer requires a language model."

    def __init__(self, language: str = None, lang_vars: object = None):
        """
        :param language: language for sentence tokenization
        :type language: str
        """
        self.language = language
        self.lang_vars = lang_vars
        super().__init__(language=self.language)
        if self.language:
            self.models_path = self._get_models_path(self.language)
            try:
                self.model = open_pickle(os.path.join(os.path.expanduser(self.models_path),
                                                      f'{self.language}_punkt.pickle'))
            except FileNotFoundError as err:
                raise type(err)(BasePunktSentenceTokenizer.missing_models_message)
class BaseRegexSentenceTokenizer(BaseSentenceTokenizer):
    """Base class for regex sentence tokenization."""

    def __init__(self, language: str = None, sent_end_chars: List[str] = None):
        """
        :param language: language for sentence tokenization
        :type language: str
        :param sent_end_chars: list of sentence-ending punctuation marks
        :type sent_end_chars: list
        """
        BaseSentenceTokenizer.__init__(self, language)
        if sent_end_chars:
            self.sent_end_chars = sent_end_chars
            self.sent_end_chars_regex = '|'.join(self.sent_end_chars)
            self.pattern = rf'(?<=[{self.sent_end_chars_regex}])\s'
        else:
            # TODO: consider warning and falling back to sensible defaults instead
            raise ValueError('BaseRegexSentenceTokenizer requires sent_end_chars.')
    def tokenize(self, text: str, model: object = None):
        """
        Tokenize sentences with regular expressions.
        :rtype: list
        :param text: text to be tokenized into sentences
        :type text: str
        """
        sentences = re.split(self.pattern, text)
        return sentences
class TokenizeSentence(BasePunktSentenceTokenizer):  # pylint: disable=R0903
    """Tokenize sentences for the language given as argument, e.g.,
    ``TokenizeSentence('greek')``.
    """

    missing_models_message = "TokenizeSentence requires the models to be installed in cltk_data. Please load the correct models."

    def __init__(self, language: str):
        """Lowercase the incoming language name and assemble variables.
        :type language: str
        :param language: Language for sentence tokenization.
        """
        self.language = language.lower()
        # Workaround for Latin: use the old API syntax to load the new sentence tokenizer.
        if self.language == 'latin':
            self.lang_vars = LatinLanguageVars()
            super().__init__(language='latin', lang_vars=self.lang_vars)
        elif self.language not in INDIAN_LANGUAGES:
            self.internal_punctuation, self.external_punctuation, self.tokenizer_path = \
                self._setup_language_variables(self.language)
    def _setup_language_variables(self, lang: str):  # pragma: no cover
        """Check for language availability and the presence of the tokenizer file,
        then read punctuation characters for the language and build the tokenizer
        file path.
        :param lang: The language argument given to the class.
        :type lang: str
        :rtype: (str, str, str)
        """
        assert lang in PUNCTUATION.keys(), \
            'Sentence tokenizer not available for {0} language.'.format(lang)
        internal_punctuation = PUNCTUATION[lang]['internal']
        external_punctuation = PUNCTUATION[lang]['external']
        file = PUNCTUATION[lang]['file']
        tokenizer_path = os.path.join(os.path.expanduser(self._get_models_path(language=lang)),
                                      file)
        assert os.path.isfile(tokenizer_path), \
            'CLTK linguistics data not found for language {0} at {1}.'.format(lang, tokenizer_path)
        return internal_punctuation, external_punctuation, tokenizer_path
    def _setup_tokenizer(self, tokenizer: object):  # pragma: no cover
        """Add tokenizer and punctuation variables.
        :type tokenizer: object
        :param tokenizer: Unpickled tokenizer object.
        :rtype: object
        """
        language_punkt_vars = PunktLanguageVars
        language_punkt_vars.sent_end_chars = self.external_punctuation
        language_punkt_vars.internal_punctuation = self.internal_punctuation
        tokenizer.INCLUDE_ALL_COLLOCS = True
        tokenizer.INCLUDE_ABBREV_COLLOCS = True
        params = tokenizer.get_params()
        return PunktSentenceTokenizer(params)
    def tokenize_sentences(self, untokenized_string: str):
        """Tokenize sentences by reading the trained tokenizer and invoking
        ``PunktSentenceTokenizer()``.
        :type untokenized_string: str
        :param untokenized_string: A string containing one or more sentences.
        :rtype: list of strings
        """
        # load tokenizer
        assert isinstance(untokenized_string, str), \
            'Incoming argument must be a string.'
        if self.language == 'latin':
            self.models_path = self._get_models_path(self.language)
            try:
                self.model = open_pickle(
                    os.path.expanduser(os.path.join(self.models_path, 'latin_punkt.pickle')))
            except FileNotFoundError as err:
                raise type(err)(TokenizeSentence.missing_models_message + self.models_path)
            tokenizer = self.model
            tokenizer._lang_vars = self.lang_vars
        else:
            tokenizer = open_pickle(self.tokenizer_path)
            tokenizer = self._setup_tokenizer(tokenizer)
        # make a list of tokenized sentences
        if self.language == 'latin':
            return tokenizer.tokenize(untokenized_string)
        else:
            tokenized_sentences = [sentence for sentence in
                                   tokenizer.sentences_from_text(untokenized_string,
                                                                 realign_boundaries=True)]
            return tokenized_sentences
    def indian_punctuation_tokenize_regex(self, untokenized_string: str):
        """A trivial tokenizer which just tokenizes on punctuation boundaries.
        This also includes punctuation, namely the purna virama ("|") and
        deergha virama ("॥"), for Indian language scripts.
        :type untokenized_string: str
        :param untokenized_string: A string containing one or more sentences.
        :rtype: list of strings
        """
        # The replace deletes '|' from the punctuation string provided by the library.
        modified_punctuations = string.punctuation.replace("|", "")
        indian_punctuation_pattern = re.compile(
            '([' + modified_punctuations + '\u0964\u0965' + r']|\|+)')
        tok_str = indian_punctuation_pattern.sub(r' \1 ', untokenized_string.replace('\t', ' '))
        return re.sub(r'[ ]+', u' ', tok_str).strip(' ').split(' ')
    def tokenize(self, untokenized_string: str, model=None):
        """Alias for tokenize_sentences(); NLTK's PlaintextCorpusReader needs a
        function called tokenize in functions used as a parameter for sentence
        tokenization.
        :type untokenized_string: str
        :param untokenized_string: A string containing one or more sentences.
        """
        if self.language in INDIAN_LANGUAGES:
            return self.indian_punctuation_tokenize_regex(untokenized_string)
        else:
            return self.tokenize_sentences(untokenized_string)
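A minimal usage sketch of the regex base class defined above, which needs no pickled models. The subclass name and sample text are illustrative only, and running TokenizeSentence itself additionally requires the CLTK punkt models under ~/cltk_data:

class DemoRegexSentenceTokenizer(BaseRegexSentenceTokenizer):
    """Hypothetical subclass that splits on ASCII sentence-final punctuation."""
    def __init__(self):
        super().__init__(language='demo', sent_end_chars=['.', '!', '?'])

print(DemoRegexSentenceTokenizer().tokenize('Veni. Vidi. Vici.'))
# ['Veni.', 'Vidi.', 'Vici.']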
First, thanks! Very helpful.
Second, re: "_get_models_path is now a default method": my thought here was that not all tokenizers are model-based, e.g. the regex tokenizer, so it was introduced in BasePunktSentenceTokenizer. Curious what the advantage of moving it up to the parent class would be?
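For readers skimming the thread, a compressed sketch of the two arrangements under discussion (bodies elided; only the placement of the helper differs):

# As in the gist: the path helper lives on the shared base, available to every subclass,
# even ones (like the regex tokenizer) that never use it.
class BaseSentenceTokenizer:
    def _get_models_path(self, language): ...

# As described in the comment: keep it on the punkt-specific base, since only
# model-backed tokenizers need a models path.
class BasePunktSentenceTokenizer(BaseSentenceTokenizer):
    def _get_models_path(self, language): ...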