Skip to content

Instantly share code, notes, and snippets.

@diogommartins
Last active July 8, 2016 02:29
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save diogommartins/b20e73c6f5f09b8a7702235346ca4ae3 to your computer and use it in GitHub Desktop.
Save diogommartins/b20e73c6f5f09b8a7702235346ca4ae3 to your computer and use it in GitHub Desktop.
import multiprocessing
import operator
import string
from functools import reduce
import itertools
# Translation table that deletes every ASCII punctuation character.
# Built once at import time instead of on every call.
_PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)


def remove_punctuation(text):
    """Return *text* with every character in ``string.punctuation`` removed.

    Only the ASCII punctuation listed in ``string.punctuation`` is stripped;
    all other characters (including whitespace) are left untouched.
    """
    return text.translate(_PUNCTUATION_TABLE)
def map_func(text):
    """Build a word -> occurrence-count table for a single piece of text.

    Punctuation is stripped first, then the text is split on whitespace.
    Tokens that are not purely alphabetic (e.g. containing digits) are
    skipped; the remaining words are lower-cased before being counted.

    Returns a plain ``dict`` mapping each lower-cased word to its count.
    """
    table = {}  # auxiliary hash table: word -> count
    # Split the text into a list of words, with punctuation removed.
    for word in remove_punctuation(text).split():
        if not word.isalpha():
            continue
        word = word.lower()
        table[word] = table.get(word, 0) + 1
    return table
def reduce_func(a, b):
    """Merge the counts of table *b* into table *a* and return *a*.

    For every key in *b*, its value is added to the value already stored
    under the same key in *a* (or stored as-is when the key is new).

    Note that *a* is mutated in place and is the object returned; *b* is
    left unchanged. This makes the function usable as the folding step
    of ``functools.reduce`` over a sequence of count tables.
    """
    for k, v in b.items():
        a[k] = a.get(k, 0) + v
    return a
def main():
    """Count word occurrences in ``./lorem.txt`` and print them, most frequent first.

    Each non-blank line of the file is mapped to a per-line count table in a
    pool of worker processes; the tables are then folded into a single table,
    sorted by count in descending order, and printed one word per line.
    """
    with open('./lorem.txt') as f:
        # Read the file line by line, dropping lines that are just '\n'.
        # (Trailing newlines on the remaining lines are kept; they are
        # discarded later by the whitespace split in map_func.)
        paragraphs = [p for p in f if p != '\n']

    # Create the process pool; the context manager releases the worker
    # processes once the (blocking) map call has returned.
    with multiprocessing.Pool() as pool:
        # map -> build an occurrence table for each line. The work is
        # spread across separate worker processes.
        map_values = pool.map(map_func, paragraphs)

    # Reduce the list of tables into a single table. The empty-dict
    # initializer keeps this working when the file has no non-blank lines.
    reduced_values = reduce(reduce_func, map_values, {})

    # Sort the results in descending order of count.
    sorted_desc = sorted(reduced_values.items(), key=operator.itemgetter(1), reverse=True)

    # Print.
    for word, count in sorted_desc:
        print("{word} tem {count} ocorrencias".format(word=word, count=count))


# Guard the entry point: with the "spawn" start method, worker processes
# re-import this module, and an unguarded Pool creation would run again
# in every child.
if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment