Created
March 19, 2020 00:32
-
-
Save andfoy/6badf2b2f7a39f632932c546b2aebaed to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*-
"""Basic tokenization and counting of content words in a set of phrases.

Content words are the tokens tagged as verbs, nouns, adverbs or adjectives.
"""
# Standard lib imports | |
from typing import Tuple, Dict, Union, List | |
# SpaCy imports | |
import spacy | |
# Language engines: one spaCy pipeline per supported language code. Every
# pipeline is loaded eagerly, when this module is imported.
ENGINES = {
    lang_code: spacy.load(model_name)
    for lang_code, model_name in (
        ('es', 'es_core_news_sm'),
        ('pt', 'pt_core_news_sm'),
        ('en', 'en_core_web_sm'),
    )
}
# Input typings
# Words kept from a single phrase, as an ordered tuple.
GroupOfWords = Tuple[str, ...]
# Histogram of individual words.
WordCount = Dict[str, int]
# Histogram of whole word groups.
GroupCount = Dict[GroupOfWords, int]
# Either a single string or a list of strings.
FileOrList = Union[str, List[str]]
# Language = Literal['es', 'pt', 'en']
def tokenize_phrase(phrase: str, lang: str = 'es') -> GroupOfWords:
    """
    Extract the content words of a phrase in a given language.

    The phrase is run through the spaCy pipeline registered for `lang` and
    every token tagged as a verb, noun, adverb or adjective is kept,
    lowercased, in order of appearance.

    Parameters
    ----------
    phrase: str
        Input phrase to tokenize
    lang: str, optional
        Language of the phrase to tokenize, must be a key of `ENGINES`.
        Default: 'es'

    Returns
    -------
    tuple[str, ...]
        Lowercased text of every verb, noun, adverb and adjective token
        detected inside the input phrase, in order of appearance. A word
        that occurs several times is reported once per occurrence.

    Raises
    ------
    KeyError
        If `lang` is not a key of `ENGINES`.

    Examples
    --------
    The exact output depends on the installed spaCy model.

    >>> es_phrase = 'El jugo me llegó chorreado'
    >>> tokenize_phrase(es_phrase)  # doctest: +SKIP
    ('jugo', 'llegó', 'chorreado')

    Other languages

    >>> pt_phrase = 'Não recebi meu desconto'
    >>> tokenize_phrase(pt_phrase, lang='pt')  # doctest: +SKIP
    """
    nlp = ENGINES[lang]
    doc = nlp(phrase)
    # Filter on each token's own tag. Staging tokens in a dict keyed by
    # their text would let a later occurrence of the same surface form
    # overwrite the tag of an earlier one and silently drop repeats.
    return tuple(token.text.lower() for token in doc
                 if token.pos_ in {'VERB', 'NOUN', 'ADV', 'ADJ'})
def update_histogram(phrase: str, group_count: GroupCount,
                     word_count: WordCount,
                     lang: str = 'es') -> Tuple[GroupCount, WordCount]:
    """
    Fold one phrase into the running histograms.

    The phrase is tokenized with `tokenize_phrase`. Every word of the
    resulting tuple adds one to its entry in `word_count`, and the tuple as
    a whole adds one to its entry in `group_count` when it holds more than
    one word. Both dictionaries are modified in place.

    Parameters
    ----------
    phrase: str
        Input phrase whose tokenized words update the histograms.
    group_count: dict
        Dictionary that maps a tuple of words to their occurrence count.
    word_count: dict
        Dictionary that maps a word to its occurrence count.
    lang: str, optional
        Language of the phrase to tokenize. Default: 'es'

    Returns
    -------
    dict
        The same `group_count` object, after the update.
    dict
        The same `word_count` object, after the update.
    """
    tokens = tokenize_phrase(phrase, lang)

    # Individual words are always tallied.
    for token in tokens:
        word_count[token] = word_count.get(token, 0) + 1

    # A single word (or nothing at all) does not form a group.
    if len(tokens) >= 2:
        group_count[tokens] = group_count.get(tokens, 0) + 1

    return group_count, word_count
def compute_histogram(phrases: FileOrList,
                      lang: str = 'es') -> Tuple[GroupCount, WordCount]:
    """
    Compute histograms for word groups and individual words.

    Every phrase is passed to `update_histogram`, which tokenizes it in the
    requested language and accumulates the counts. Each phrase is printed
    to standard output before it is processed.

    Parameters
    ----------
    phrases: str or list[str]
        String pointing to a file that contains phrases, or a list of
        phrases whose histogram is going to be computed. A file is read as
        UTF-8, one phrase per line, and each line is stripped of
        surrounding whitespace.
    lang: str, optional
        Language of the input phrases. Default: 'es'

    Returns
    -------
    dict
        Dictionary that maps a tuple of words to their occurrence count.
    dict
        Dictionary that maps a word to its occurrence count.
    """
    if isinstance(phrases, str):
        # A string is a path, not a phrase. The encoding is pinned so that
        # accented text decodes the same way regardless of the locale.
        with open(phrases, 'r', encoding='utf-8') as f:
            phrases = [line.strip() for line in f]

    group_count: GroupCount = {}
    word_count: WordCount = {}
    for phrase in phrases:
        print(phrase)
        # Forward `lang`; without it every phrase is tokenized as Spanish.
        group_count, word_count = update_histogram(
            phrase, group_count, word_count, lang)
    return group_count, word_count
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment