Last active
July 8, 2016 02:29
-
-
Save diogommartins/b20e73c6f5f09b8a7702235346ca4ae3 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import itertools
import multiprocessing
import operator
import string
from collections import Counter
from functools import reduce
# Translation table that deletes every ASCII punctuation character.
# Built once at import time instead of on every call.
_PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)


def remove_punctuation(text):
    """Return *text* with every character in ``string.punctuation`` removed.

    Only the ASCII punctuation listed in ``string.punctuation`` is stripped;
    all other characters (including whitespace) are left untouched.
    """
    return text.translate(_PUNCTUATION_TABLE)
def map_func(text):
    """Count the occurrences of each word in *text* (the "map" step).

    Punctuation is stripped first, the text is split on whitespace, and
    only purely alphabetic tokens are kept. Words are lower-cased so the
    count is case-insensitive.

    Returns a plain ``dict`` mapping each lower-cased word to the number of
    times it appears in *text*.
    """
    words = (
        word.lower()
        for word in remove_punctuation(text).split()
        if word.isalpha()
    )
    # Counter does the tallying; convert back so callers still get a dict.
    return dict(Counter(words))
def reduce_func(a, b):
    """Merge the word counts of *b* into *a* and return *a* (the "reduce" step).

    *a* is updated in place and returned, which lets ``functools.reduce``
    use it as the running accumulator without copying. *b* is not modified.
    """
    for word, count in b.items():
        # Missing words start at 0, so no exception handling is needed.
        a[word] = a.get(word, 0) + count
    return a
def main():
    """Count the words in ``./lorem.txt`` and print them, most frequent first.

    Each non-blank line of the file is counted by ``map_func`` in a pool of
    worker processes. The per-line tables are then merged with
    ``reduce_func`` and printed in descending order of occurrences.
    """
    with open('./lorem.txt') as f:
        # Read the file line by line, dropping lines that are just a newline.
        paragraphs = [line for line in f if line != '\n']

    # Map: build one occurrence table per line, in separate worker processes.
    # The context manager shuts the pool down once the results are in.
    with multiprocessing.Pool() as pool:
        map_values = pool.map(map_func, paragraphs)

    # Reduce: merge all tables into one. The empty-dict initializer keeps an
    # empty input file from raising TypeError.
    reduced_values = reduce(reduce_func, map_values, {})

    # Sort by occurrence count, descending.
    sorted_desc = sorted(reduced_values.items(), key=operator.itemgetter(1), reverse=True)

    # Print the results.
    for word, count in sorted_desc:
        print("{word} tem {count} ocorrencias".format(word=word, count=count))


# Guard the entry point: under the "spawn" start method each worker
# re-imports this module and must not start a pool of its own.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment