Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
# -*- coding: utf-8 -*-
"""Basic tokenization and counting of content words in a set of phrases.

Content words are the tokens tagged as nouns, verbs, adjectives or adverbs.
"""
# Standard lib imports
from typing import Tuple, Dict, Union, List
# SpaCy imports
import spacy
# Language engines: maps a language code to its loaded spaCy pipeline.
# Every pipeline is loaded when this module is imported.
ENGINES = {
    code: spacy.load(model)
    for code, model in (
        ('es', 'es_core_news_sm'),
        ('pt', 'pt_core_news_sm'),
        ('en', 'en_core_web_sm'),
    )
}
# Input typings
# Ordered words extracted from a single phrase; a tuple so that it can be
# used as a dictionary key (see GroupCount).
GroupOfWords = Tuple[str, ...]
# Histogram: group of words -> number of times that group was counted.
GroupCount = Dict[GroupOfWords, int]
# Histogram: single word -> number of times that word was counted.
WordCount = Dict[str, int]
# Disabled alias; the `lang` parameters are annotated as plain `str`.
# Language = Literal['es', 'pt', 'en']
# Either a string pointing to a file of phrases or a list of phrases.
FileOrList = Union[str, List[str]]
def tokenize_phrase(phrase: str, lang: str = 'es') -> GroupOfWords:
    """
    Extract the content words of a phrase in a given language.

    The phrase is processed with the engine registered for `lang` and every
    token tagged as a verb, noun, adverb or adjective is kept, lower-cased
    and in order of appearance. A word that appears several times in the
    phrase is returned once per occurrence.

    Parameters
    ----------
    phrase: str
        Input phrase to tokenize
    lang: str, optional
        Language of the phrase to tokenize, must be available on `ENGINES`.
        Default: 'es'

    Returns
    -------
    tuple[str, ...]
        A tuple containing the lower-cased text of all the verbs, nouns,
        adverbs and adjectives detected inside the input phrase.

    Raises
    ------
    KeyError
        If `lang` is not a key of `ENGINES`.

    Examples
    --------
    The exact output depends on the tags assigned by the loaded model.

    >>> es_phrase = 'El jugo me llegó chorreado'
    >>> tokenize_phrase(es_phrase)  # doctest: +SKIP
    ('jugo', 'llegó', 'chorreado')

    Other languages

    >>> pt_phrase = 'Não recebi meu desconto'
    >>> tokenize_phrase(pt_phrase, lang='pt')  # doctest: +SKIP
    """
    nlp = ENGINES[lang]
    doc = nlp(phrase)
    kept_pos = {'VERB', 'NOUN', 'ADV', 'ADJ'}
    # Filter token by token: indexing tokens by their surface text would
    # merge repeated words and judge them all by the tag of the last one.
    return tuple(token.text.lower() for token in doc
                 if token.pos_ in kept_pos)
def update_histogram(phrase: str, group_count: GroupCount,
                     word_count: WordCount,
                     lang: str = 'es') -> Tuple[GroupCount, WordCount]:
    """
    Fold one phrase into the running histograms.

    The phrase is tokenized with `tokenize_phrase`. Each extracted word
    increments its entry in `word_count`; when more than one word was
    extracted, the whole tuple of words also increments its entry in
    `group_count`. Both dictionaries are modified in place.

    Parameters
    ----------
    phrase: str
        Input phrase, whose extracted words are used to update the
        histogram statistics.
    group_count: dict
        Dictionary that maps a tuple of words to their occurrence count.
    word_count: dict
        Dictionary that maps a word to its occurrence count.
    lang: str, optional
        Language of the phrase to tokenize. Default: 'es'

    Returns
    -------
    dict
        The same `group_count` dictionary, after the update.
    dict
        The same `word_count` dictionary, after the update.
    """
    tokens = tokenize_phrase(phrase, lang)
    for token in tokens:
        word_count[token] = word_count.get(token, 0) + 1
    # A single word does not form a group; it only counts individually.
    if len(tokens) >= 2:
        group_count[tokens] = group_count.get(tokens, 0) + 1
    return group_count, word_count
def compute_histogram(
        phrases: FileOrList, lang: str = 'es'
) -> Tuple[Dict[Tuple[str, ...], int], Dict[str, int]]:
    """
    Compute histograms for word groups and individual words.

    Every phrase is passed to `update_histogram`, which accumulates the
    counts of the words extracted from it, both individually and as a
    group, across all the input phrases.

    Parameters
    ----------
    phrases: str or list[str]
        String pointing to a file that contains phrases (one per line, read
        as UTF-8), or a list of phrases whose histogram is going to be
        computed.
    lang: str, optional
        Language of the input phrases. Default: 'es'

    Returns
    -------
    dict
        Dictionary that maps a tuple of words to their occurrence count.
    dict
        Dictionary that maps a word to its occurrence count.
    """
    if isinstance(phrases, str):
        # A string is treated as a path. The encoding is explicit so that
        # accented text does not depend on the platform default.
        with open(phrases, 'r', encoding='utf-8') as f:
            phrases = [line.strip() for line in f]
    group_count: Dict[Tuple[str, ...], int] = {}
    word_count: Dict[str, int] = {}
    for phrase in phrases:
        # Forward `lang`; otherwise every phrase would be tokenized with
        # the default language of `update_histogram`.
        group_count, word_count = update_histogram(phrase, group_count,
                                                   word_count, lang)
    return group_count, word_count
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.