# andfoy/tokenize_utils.py

## tokenize_utils.py
# -*- coding: utf-8 -*-

"""Basic tokenization and counting of content words in a set of phrases.

Content words are the tokens tagged as nouns, verbs, adverbs or adjectives.
"""

# Standard lib imports
from typing import Tuple, Dict, Union, List

# SpaCy imports
import spacy

# spaCy pipelines, loaded once at import time and keyed by language code
ENGINES = {
    code: spacy.load(model_name)
    for code, model_name in (
        ('es', 'es_core_news_sm'),
        ('pt', 'pt_core_news_sm'),
        ('en', 'en_core_web_sm'),
    )
}

# Type aliases used by the function signatures of this module
FileOrList = Union[str, List[str]]    # path to a phrase file, or phrases
WordCount = Dict[str, int]            # word -> occurrence count
GroupOfWords = Tuple[str, ...]        # words extracted from one phrase
GroupCount = Dict[GroupOfWords, int]  # word group -> occurrence count
# Not enabled; the `lang` parameters are annotated as plain `str`:
# Language = Literal['es', 'pt', 'en']


def tokenize_phrase(phrase: str, lang: str = 'es') -> GroupOfWords:
    """
    Extract the content words of a phrase in a given language.

    The phrase is run through the spaCy pipeline registered for `lang` and
    every token tagged as a verb, noun, adverb or adjective is kept, in
    order of first appearance and lower-cased. A surface form that shows up
    more than once (exact, case-sensitive match) is reported only once.

    Parameters
    ----------
    phrase: str
        Input phrase to tokenize
    lang: str, optional
        Language of the phrase to tokenize, must be available on `ENGINES`.
        Default: 'es'

    Returns
    -------
    tuple[str, ...]
        A tuple containing the lower-cased verbs, nouns, adverbs and
        adjectives detected inside the input phrase.

    Raises
    ------
    KeyError
        If `lang` is not a key of `ENGINES`.

    Examples
    --------
    The exact output depends on the tags assigned by the installed model.

    >>> es_phrase = 'El jugo me llegó chorreado'
    >>> tokenize_phrase(es_phrase)  # doctest: +SKIP
    ('jugo', 'llegó', 'chorreado')

    Other languages

    >>> pt_phrase = 'Não recebi meu desconto'
    >>> words = tokenize_phrase(pt_phrase, lang='pt')  # doctest: +SKIP
    """
    content_tags = {'VERB', 'NOUN', 'ADV', 'ADJ'}
    doc = ENGINES[lang](phrase)
    # Judge every token by its own tag. Indexing the tokens by their text
    # before filtering would let a later occurrence of the same text
    # overwrite the tag of an earlier one and silently drop a valid word.
    # dict.fromkeys collapses repeated surface forms while keeping the
    # order in which they first appear.
    surface_forms = dict.fromkeys(
        token.text for token in doc if token.pos_ in content_tags)
    return tuple(text.lower() for text in surface_forms)


def update_histogram(phrase: str, group_count: GroupCount,
                     word_count: WordCount,
                     lang: str = 'es') -> Tuple[GroupCount, WordCount]:
    """
    Fold one phrase into the group and word histograms.

    The phrase is tokenized with `tokenize_phrase`. When that yields at
    least two words, the resulting tuple is counted once in `group_count`
    and every element of the tuple increments its entry in `word_count`.
    A phrase that yields fewer than two words leaves both histograms
    untouched. Both dictionaries are modified in place.

    Parameters
    ----------
    phrase: str
        Input phrase whose extracted words update the histograms.
    group_count: dict
        Dictionary that maps a tuple of words to its occurrence count.
    word_count: dict
        Dictionary that maps a word to its occurrence count.
    lang: str, optional
        Language of the phrase to tokenize. Default: 'es'

    Returns
    -------
    dict
        The same `group_count` object that was passed in.
    dict
        The same `word_count` object that was passed in.
    """
    group = tokenize_phrase(phrase, lang)
    # Groups need at least two words; shorter results are not counted.
    if len(group) <= 1:
        return group_count, word_count

    group_count[group] = group_count.get(group, 0) + 1
    for token in group:
        word_count[token] = word_count.get(token, 0) + 1
    return group_count, word_count


def compute_histogram(phrases: FileOrList,
                      lang: str = 'es') -> Tuple[GroupCount, WordCount]:
    """
    Compute histograms for word groups and individual words.

    Every phrase is passed to `update_histogram`, which accumulates the
    counts of the word groups and of the individual words extracted from
    it. Each phrase is also printed to standard output as it is processed.

    Parameters
    ----------
    phrases: str or list[str]
        Either a list of phrases, or a string with the path of a text file
        holding one phrase per line. The file is read as UTF-8 and
        surrounding whitespace is stripped from every line.
    lang: str, optional
        Language of the input phrases, forwarded to `update_histogram`.
        Default: 'es'

    Returns
    -------
    dict
        Dictionary that maps a tuple of words to its occurrence count.
    dict
        Dictionary that maps a word to its occurrence count.
    """
    if isinstance(phrases, str):
        # A string is a file path, not a phrase. The encoding is explicit
        # so accented text does not depend on the platform's locale.
        with open(phrases, 'r', encoding='utf-8') as f:
            phrases = [line.strip() for line in f]

    group_count = {}
    word_count = {}
    for phrase in phrases:
        # Progress echo kept so the observable output stays unchanged.
        print(phrase)
        # `lang` must be forwarded; otherwise every phrase would be
        # tokenized with the default Spanish pipeline.
        group_count, word_count = update_histogram(
            phrase, group_count, word_count, lang)
    return group_count, word_count
	# -*- coding: utf-8 -*-

	"""Basic tokenization and counting of content words in a set of phrases.

	Content words are the tokens tagged as nouns, verbs, adverbs or adjectives.
	"""

	# Standard lib imports
	from typing import Tuple, Dict, Union, List

	# SpaCy imports
	import spacy

	# spaCy pipelines, loaded once when this code runs, keyed by language code
	ENGINES = {
	    code: spacy.load(model_name)
	    for code, model_name in (
	        ('es', 'es_core_news_sm'),
	        ('pt', 'pt_core_news_sm'),
	        ('en', 'en_core_web_sm'),
	    )
	}

	# Type aliases used by the function signatures of this module
	FileOrList = Union[str, List[str]]    # path to a phrase file, or phrases
	WordCount = Dict[str, int]            # word -> occurrence count
	GroupOfWords = Tuple[str, ...]        # words extracted from one phrase
	GroupCount = Dict[GroupOfWords, int]  # word group -> occurrence count
	# Not enabled; the `lang` parameters are annotated as plain `str`:
	# Language = Literal['es', 'pt', 'en']


	def tokenize_phrase(phrase: str, lang: str = 'es') -> GroupOfWords:
	    """
	    Extract the content words of a phrase in a given language.

	    The phrase is run through the spaCy pipeline registered for `lang` and
	    every token tagged as a verb, noun, adverb or adjective is kept, in
	    order of first appearance and lower-cased. A surface form that shows up
	    more than once (exact, case-sensitive match) is reported only once.

	    Parameters
	    ----------
	    phrase: str
	        Input phrase to tokenize
	    lang: str, optional
	        Language of the phrase to tokenize, must be available on `ENGINES`.
	        Default: 'es'

	    Returns
	    -------
	    tuple[str, ...]
	        A tuple containing the lower-cased verbs, nouns, adverbs and
	        adjectives detected inside the input phrase.

	    Raises
	    ------
	    KeyError
	        If `lang` is not a key of `ENGINES`.

	    Examples
	    --------
	    The exact output depends on the tags assigned by the installed model.

	    >>> es_phrase = 'El jugo me llegó chorreado'
	    >>> tokenize_phrase(es_phrase)  # doctest: +SKIP
	    ('jugo', 'llegó', 'chorreado')

	    Other languages

	    >>> pt_phrase = 'Não recebi meu desconto'
	    >>> words = tokenize_phrase(pt_phrase, lang='pt')  # doctest: +SKIP
	    """
	    content_tags = {'VERB', 'NOUN', 'ADV', 'ADJ'}
	    doc = ENGINES[lang](phrase)
	    # Judge every token by its own tag. Indexing the tokens by their text
	    # before filtering would let a later occurrence of the same text
	    # overwrite the tag of an earlier one and silently drop a valid word.
	    # dict.fromkeys collapses repeated surface forms while keeping the
	    # order in which they first appear.
	    surface_forms = dict.fromkeys(
	        token.text for token in doc if token.pos_ in content_tags)
	    return tuple(text.lower() for text in surface_forms)


	def update_histogram(phrase: str, group_count: GroupCount,
	                     word_count: WordCount,
	                     lang: str = 'es') -> Tuple[GroupCount, WordCount]:
	    """
	    Fold one phrase into the group and word histograms.

	    The phrase is tokenized with `tokenize_phrase`. When that yields at
	    least two words, the resulting tuple is counted once in `group_count`
	    and every element of the tuple increments its entry in `word_count`.
	    A phrase that yields fewer than two words leaves both histograms
	    untouched. Both dictionaries are modified in place.

	    Parameters
	    ----------
	    phrase: str
	        Input phrase whose extracted words update the histograms.
	    group_count: dict
	        Dictionary that maps a tuple of words to its occurrence count.
	    word_count: dict
	        Dictionary that maps a word to its occurrence count.
	    lang: str, optional
	        Language of the phrase to tokenize. Default: 'es'

	    Returns
	    -------
	    dict
	        The same `group_count` object that was passed in.
	    dict
	        The same `word_count` object that was passed in.
	    """
	    group = tokenize_phrase(phrase, lang)
	    # Groups need at least two words; shorter results are not counted.
	    if len(group) <= 1:
	        return group_count, word_count

	    group_count[group] = group_count.get(group, 0) + 1
	    for token in group:
	        word_count[token] = word_count.get(token, 0) + 1
	    return group_count, word_count


	def compute_histogram(phrases: FileOrList,
	                      lang: str = 'es') -> Tuple[GroupCount, WordCount]:
	    """
	    Compute histograms for word groups and individual words.

	    Every phrase is passed to `update_histogram`, which accumulates the
	    counts of the word groups and of the individual words extracted from
	    it. Each phrase is also printed to standard output as it is processed.

	    Parameters
	    ----------
	    phrases: str or list[str]
	        Either a list of phrases, or a string with the path of a text file
	        holding one phrase per line. The file is read as UTF-8 and
	        surrounding whitespace is stripped from every line.
	    lang: str, optional
	        Language of the input phrases, forwarded to `update_histogram`.
	        Default: 'es'

	    Returns
	    -------
	    dict
	        Dictionary that maps a tuple of words to its occurrence count.
	    dict
	        Dictionary that maps a word to its occurrence count.
	    """
	    if isinstance(phrases, str):
	        # A string is a file path, not a phrase. The encoding is explicit
	        # so accented text does not depend on the platform's locale.
	        with open(phrases, 'r', encoding='utf-8') as f:
	            phrases = [line.strip() for line in f]

	    group_count = {}
	    word_count = {}
	    for phrase in phrases:
	        # Progress echo kept so the observable output stays unchanged.
	        print(phrase)
	        # `lang` must be forwarded; otherwise every phrase would be
	        # tokenized with the default Spanish pipeline.
	        group_count, word_count = update_histogram(
	            phrase, group_count, word_count, lang)
	    return group_count, word_count