Skip to content

Instantly share code, notes, and snippets.

@dubirajara
Last active December 1, 2018 12:16
Show Gist options
  • Save dubirajara/3b507b1b0b3988d8ca5529e52abcb057 to your computer and use it in GitHub Desktop.
Save dubirajara/3b507b1b0b3988d8ca5529e52abcb057 to your computer and use it in GitHub Desktop.
count words
import re
import string
from collections import Counter
from itertools import chain
from operator import itemgetter

import requests
# Plain-text documents to analyze (three Shakespeare plays).
urls = [
    'https://storage.googleapis.com/apache-beam-samples/shakespeare/kinglear.txt',
    'https://storage.googleapis.com/apache-beam-samples/shakespeare/othello.txt',
    'https://storage.googleapis.com/apache-beam-samples/shakespeare/romeoandjuliet.txt',
]
# Whitespace-separated list of words to ignore when counting.
stop_words_url = 'http://ir.dcs.gla.ac.uk/resources/linguistic_utils/stop_words'


def _fetch_text(url, timeout=30):
    """Download *url* and return the response body as text.

    Args:
        url: Address of the resource to download.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The decoded response body.

    Raises:
        requests.RequestException: On connection problems, timeouts, or an
            HTTP error status.
    """
    # Without a timeout a stalled server would hang the script indefinitely.
    response = requests.get(url, timeout=timeout)
    # Fail loudly instead of counting the words of an HTTP error page.
    response.raise_for_status()
    return response.text


# Lower-cased, whitespace-split words of every document, flattened into one list.
data_url = list(chain.from_iterable(
    _fetch_text(url).lower().split() for url in urls
))
# Stop words used to decide which words are relevant.
stop_words = _fetch_text(stop_words_url).split()
def clean_words(data, stop=None):
    """Strip punctuation from words and drop the irrelevant ones.

    Args:
        data: Iterable of word strings.
        stop: Optional collection of words to discard. Defaults to the
            module-level ``stop_words``.

    Returns:
        A list, in input order, of the words with every
        ``string.punctuation`` character removed, excluding stop words and
        tokens that become empty once punctuation is stripped.
    """
    # A set makes each membership test O(1) instead of scanning a list.
    ignored = frozenset(stop_words if stop is None else stop)
    strip_punctuation = str.maketrans('', '', string.punctuation)
    stripped = (word.translate(strip_punctuation) for word in data)
    # Tokens made only of punctuation (e.g. "--") collapse to "" and are
    # not words, so they must not reach the frequency count.
    return [word for word in stripped if word and word not in ignored]
def word_frequency_counter(words):
    """Count how often each word occurs.

    Args:
        words: Iterable of hashable items (the words to tally).

    Returns:
        A dict mapping each distinct word to its number of occurrences,
        ordered from most to least frequent. Words with equal counts keep
        the order in which they first appeared in ``words``.
    """
    # Counter tallies in a single pass; calling list.count() once per word
    # rescans the whole list every time and is quadratic in its length.
    return dict(Counter(words).most_common())
# Frequency of every relevant word across the downloaded documents,
# ordered from most to least frequent.
dic = word_frequency_counter(clean_words(data_url))

# Opening part of the HTML report: inline CSS, the Bootstrap stylesheet, the
# page heading and the table header. The table rows and the closing tags are
# appended when the module is run as a script. CSS braces are doubled because
# this is an f-string; the caption interpolates the number of distinct words.
table_base = f"""<style>
.i-am-centered {{ margin: auto; max-width: 800px;}}
table .alto {{background-color:gray;}}
</style>
<link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/bootstrap/4.0.0/css/bootstrap.min.css">
<br>
<div class="i-am-centered">
<div class="row">
<div class="col-lg-8 col-md-8 col-sm-8 col-xs-8">
<h2>Shakespeare's Research</h2>
<hr>
<table class="table table-hover table-bordered">
<caption>{len(dic)} Relevant words in three Shakespeare's literary masterpieces.</caption>
<thead><tr><th class="alto" scope="col">Words</th><th class="alto" scope="col">Frequency</th></tr></thead><tbody>"""
if __name__ == '__main__':
    # Build the HTML report: one table row per word (header cell for the
    # word, data cell for its count), then close the markup and write it out.
    # The count cell is a proper <td>...</td> pair; it used to open as <th>
    # and close as </td>, which is invalid markup.
    rows = [
        f'<tr><th> {w} </th><td> {f} </td></tr>'
        for w, f in dic.items()
    ]
    # The template opens three <div> elements, so exactly three are closed.
    # Joining once avoids re-copying the growing string for every row.
    table_base = ''.join(
        (table_base, *rows, '</tbody></table></div></div></div>')
    )
    # Explicit encoding so the output does not depend on the platform default.
    with open("word_frequencies_report.html", "w", encoding="utf-8") as file:
        file.write(table_base)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment