
@todd-cook
Last active March 14, 2019 08:15
word tokenizer file
# Some notes on this gist:
# All abstract and base class material has been moved to the top of the file so that import
# resolution works and IDEs won't complain unnecessarily.
# _get_model_path is a default method.
# tokenize has been promoted to a default method, and downstream signatures were made consistent
# with this; whether that is a great idea is open to debate.
# The mutable default list argument, a subtle bug, has been corrected; see
# https://docs.python-guide.org/writing/gotchas/ and the commented example below.
# The 'self: object' annotation has been removed, as it is implied and it confuses IDEs.
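# For illustration, the default-list gotcha mentioned above looks like this
# (a hypothetical helper, not part of this file):
#
#     def add_sentence(sentence, sentences=[]):   # the same list object is reused across calls
#         sentences.append(sentence)
#         return sentences
#
#     add_sentence('a')  # -> ['a']
#     add_sentence('b')  # -> ['a', 'b']   surprising carry-over from the first call
#
# The usual fix is a None default:
#
#     def add_sentence(sentence, sentences=None):
#         if sentences is None:
#             sentences = []
#         sentences.append(sentence)
#         return sentences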
"""Tokenize sentences."""
__author__ = ['Patrick J. Burns <patrick@diyclassics.org>',
              'Kyle P. Johnson <kyle@kyle-p-johnson.com>',
              'Anoop Kunchukuttan']
__license__ = 'MIT License. See LICENSE.'
import os
import re
import string
from typing import List, Dict, Tuple, Set, Any, Generator
from nltk.tokenize.punkt import PunktLanguageVars
from nltk.tokenize.punkt import PunktSentenceTokenizer
from cltk.tokenize.latin.params import LatinLanguageVars
from cltk.utils.file_operations import open_pickle
from abc import abstractmethod
# Part of Latin workaround
# class LatinLanguageVars(PunktLanguageVars):
# _re_non_word_chars = PunktLanguageVars._re_non_word_chars.replace("'",'')
PUNCTUATION = {
    'greek': {
        'external': ('.', ';'),
        'internal': (',', '·'),
        'file': 'greek.pickle',
    },
}
INDIAN_LANGUAGES = ['bengali', 'hindi', 'marathi', 'sanskrit', 'telugu']
class BaseSentenceTokenizer:
    """Base class for sentence tokenization"""

    def __init__(self, language: str = None):
        """Initialize the tokenizer with optional language-specific parameters.
        :param language: language for sentence tokenization
        :type language: str
        """
        if language:
            self.language = language.lower()

    def tokenize(self, text: str, model: object = None):
        """
        Method for tokenizing sentences with pretrained punkt models; can
        be overridden by language-specific tokenizers.
        :rtype: list
        :param text: text to be tokenized into sentences
        :type text: str
        :param model: tokenizer object to be used  # Should be in init?
        :type model: object
        """
        if not model:
            model = self.model
        tokenizer = model
        if self.lang_vars:
            tokenizer._lang_vars = self.lang_vars
        return tokenizer.tokenize(text)

    def _get_models_path(self, language):  # pragma: no cover
        return f'~/cltk_data/{language}/model/{language}_models_cltk/tokenizers/sentence'
class BasePunktSentenceTokenizer(BaseSentenceTokenizer):
    """Base class for punkt sentence tokenization"""

    missing_models_message = "BasePunktSentenceTokenizer requires a language model."

    def __init__(self, language: str = None, lang_vars: object = None):
        """
        :param language: language for sentence tokenization
        :type language: str
        """
        self.language = language
        self.lang_vars = lang_vars
        super().__init__(language=self.language)
        if self.language:
            self.models_path = self._get_models_path(self.language)
            try:
                self.model = open_pickle(os.path.join(os.path.expanduser(self.models_path),
                                                      f'{self.language}_punkt.pickle'))
            except FileNotFoundError as err:
                raise type(err)(BasePunktSentenceTokenizer.missing_models_message)
class BaseRegexSentenceTokenizer(BaseSentenceTokenizer):
    """Base class for regex sentence tokenization"""

    def __init__(self, language: str = None, sent_end_chars: List[str] = None):
        """
        :param language: language for sentence tokenization
        :type language: str
        :param sent_end_chars: list of sentence-ending punctuation marks
        :type sent_end_chars: list
        """
        BaseSentenceTokenizer.__init__(self, language)
        if sent_end_chars:
            self.sent_end_chars = sent_end_chars
            self.sent_end_chars_regex = '|'.join(self.sent_end_chars)
            self.pattern = rf'(?<=[{self.sent_end_chars_regex}])\s'
        else:
            # TODO: consider warning and falling back to sensible defaults instead
            raise Exception('sent_end_chars must be specified for BaseRegexSentenceTokenizer.')

    def tokenize(self, text: str, model: object = None):
        """
        Method for tokenizing sentences with regular expressions.
        :rtype: list
        :param text: text to be tokenized into sentences
        :type text: str
        """
        sentences = re.split(self.pattern, text)
        return sentences
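# A minimal usage sketch of the regex tokenizer (hypothetical arguments, not part of the
# classes above): with sent_end_chars=['.', '!', '?'] the pattern becomes r'(?<=[.|!|?])\s',
# i.e. split on whitespace preceded by a sentence-ending mark (note that the '|' separators
# from the join also end up inside the character class and are matched literally):
#
#     demo = BaseRegexSentenceTokenizer(language='demo', sent_end_chars=['.', '!', '?'])
#     demo.tokenize('One. Two! Three?')
#     # -> ['One.', 'Two!', 'Three?']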
class TokenizeSentence(BasePunktSentenceTokenizer):  # pylint: disable=R0903
    """Tokenize sentences for the language given as argument, e.g.,
    ``TokenizeSentence('greek')``.
    """

    missing_models_message = "TokenizeSentence requires the models to be installed in cltk_data. Please load the correct models."

    def __init__(self, language: str):
        """Lower incoming language name and assemble variables.
        :type language: str
        :param language: Language for sentence tokenization.
        """
        self.language = language.lower()
        # Workaround for Latin: use the old API syntax to load the new sentence tokenizer
        if self.language == 'latin':
            self.lang_vars = LatinLanguageVars()
            super().__init__(language='latin', lang_vars=self.lang_vars)
        elif self.language not in INDIAN_LANGUAGES:
            self.internal_punctuation, self.external_punctuation, self.tokenizer_path = \
                self._setup_language_variables(self.language)

    def _setup_language_variables(self, lang: str):  # pragma: no cover
        """Check for language availability and presence of tokenizer file,
        then read punctuation characters for language and build tokenizer file
        path.
        :param lang: The language argument given to the class.
        :type lang: str
        :rtype: (str, str, str)
        """
        assert lang in PUNCTUATION.keys(), \
            'Sentence tokenizer not available for {0} language.'.format(lang)
        internal_punctuation = PUNCTUATION[lang]['internal']
        external_punctuation = PUNCTUATION[lang]['external']
        file = PUNCTUATION[lang]['file']
        tokenizer_path = os.path.join(os.path.expanduser(self._get_models_path(language=lang)),
                                      file)
        assert os.path.isfile(tokenizer_path), \
            'CLTK linguistics data not found for language {0} at {1}.'.format(lang, tokenizer_path)
        return internal_punctuation, external_punctuation, tokenizer_path
    def _setup_tokenizer(self, tokenizer: object):  # pragma: no cover
        """Add tokenizer and punctuation variables.
        :type tokenizer: object
        :param tokenizer: Unpickled tokenizer object.
        :rtype: object
        """
        language_punkt_vars = PunktLanguageVars
        language_punkt_vars.sent_end_chars = self.external_punctuation
        language_punkt_vars.internal_punctuation = self.internal_punctuation
        tokenizer.INCLUDE_ALL_COLLOCS = True
        tokenizer.INCLUDE_ABBREV_COLLOCS = True
        params = tokenizer.get_params()
        return PunktSentenceTokenizer(params)
    def tokenize_sentences(self, untokenized_string: str):
        """Tokenize sentences by reading trained tokenizer and invoking
        ``PunktSentenceTokenizer()``.
        :type untokenized_string: str
        :param untokenized_string: A string containing one or more sentences.
        :rtype : list of strings
        """
        # load tokenizer
        assert isinstance(untokenized_string, str), \
            'Incoming argument must be a string.'
        if self.language == 'latin':
            self.models_path = self._get_models_path(self.language)
            try:
                self.model = open_pickle(
                    os.path.expanduser(os.path.join(self.models_path, 'latin_punkt.pickle')))
            except FileNotFoundError as err:
                raise type(err)(TokenizeSentence.missing_models_message + self.models_path)
            tokenizer = self.model
            tokenizer._lang_vars = self.lang_vars
        else:
            tokenizer = open_pickle(self.tokenizer_path)
            tokenizer = self._setup_tokenizer(tokenizer)
        # make a list of tokenized sentences
        if self.language == 'latin':
            return tokenizer.tokenize(untokenized_string)
        else:
            tokenized_sentences = [sentence for sentence in
                                   tokenizer.sentences_from_text(untokenized_string,
                                                                 realign_boundaries=True)]
            return tokenized_sentences
    def indian_punctuation_tokenize_regex(self, untokenized_string: str):
        """A trivial tokenizer which just tokenizes on punctuation boundaries.
        This also includes punctuation, namely the purna virama ("।") and
        deergha virama ("॥"), for Indian language scripts.
        :type untokenized_string: str
        :param untokenized_string: A string containing one or more sentences.
        :rtype : list of strings
        """
        # Remove '|' from the punctuation string provided by the standard library;
        # the pipe is handled separately in the pattern below.
        modified_punctuations = string.punctuation.replace("|", "")
        indian_punctuation_pattern = re.compile(
            '([' + modified_punctuations + '\u0964\u0965' + ']|\\|+)')
        tok_str = indian_punctuation_pattern.sub(r' \1 ', untokenized_string.replace('\t', ' '))
        return re.sub(r'[ ]+', u' ', tok_str).strip(' ').split(' ')
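    # A minimal sketch of the Indian-language path (hypothetical input; no pickled model
    # is needed here, since the split is purely punctuation-based):
    #
    #     hindi_tokenizer = TokenizeSentence('hindi')
    #     hindi_tokenizer.indian_punctuation_tokenize_regex('राम घर गया। वह सो गया॥')
    #     # -> ['राम', 'घर', 'गया', '।', 'वह', 'सो', 'गया', '॥']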
    def tokenize(self, untokenized_string: str, model=None):
        """Alias for tokenize_sentences(); NLTK's PlaintextCorpusReader expects the
        callable it uses for sentence tokenization to be named ``tokenize``.
        :type untokenized_string: str
        :param untokenized_string: A string containing one or more sentences.
        """
        if self.language in INDIAN_LANGUAGES:
            return self.indian_punctuation_tokenize_regex(untokenized_string)
        else:
            return self.tokenize_sentences(untokenized_string)
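# A minimal usage sketch of the punkt path (assumes the corresponding CLTK sentence
# tokenizer models, e.g. greek.pickle / latin_punkt.pickle, are already installed under
# ~/cltk_data; greek_text and latin_text are placeholder strings):
#
#     greek_tokenizer = TokenizeSentence('greek')
#     sentences = greek_tokenizer.tokenize(greek_text)
#
#     latin_tokenizer = TokenizeSentence('latin')
#     sentences = latin_tokenizer.tokenize(latin_text)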
@diyclassics

First, thanks! Very helpful.

Second, re: # _get_model_path is default method. My thought here was that not all tokenizers are model-based; e.g. the regex tokenizer. So, it was introduced in BasePunktSentenceTokenizer. Curious what the advantage of moving it up to the parent class would be?
