@abdullah-alnahas
Last active February 17, 2019 08:49
Build a wordcloud for your website - Python code.
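The script crawls scrapinghub.com starting from the homepage, cleans the extracted text, and renders it as a word cloud. A minimal setup sketch, assuming a standard Python 3.6+ environment (the gist does not pin package versions):

# Assumed setup, not part of the original gist:
#   pip install requests beautifulsoup4 lxml nltk regex wordcloud
#   python -c "import nltk; nltk.download('punkt')"   # sent_tokenize needs the punkt data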
import re
import time
import string
import regex
from nltk import sent_tokenize
from requests import get
from requests.exceptions import RequestException
from contextlib import closing
from bs4 import BeautifulSoup
from wordcloud import WordCloud, STOPWORDS
def simple_get(url):
    """
    Borrowed from https://realpython.com/python-web-scraping-practical-introduction/
    Attempts to get the content at `url` by making an HTTP GET request.
    If the content-type of the response is some kind of HTML/XML, return the
    raw content (bytes), otherwise return None.
    """
    try:
        with closing(get(url, stream=True)) as resp:
            if is_good_response(resp):
                return resp.content
            else:
                return None
    except RequestException as e:
        print('Error during requests to {0} : {1}'.format(url, str(e)))
        return None
def is_good_response(resp):
    """
    Borrowed from https://realpython.com/python-web-scraping-practical-introduction/
    Returns True if the response seems to be HTML, False otherwise.
    """
    content_type = resp.headers.get('Content-Type', '')
    return (resp.status_code == 200
            and 'html' in content_type.lower())
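# Illustrative use of the two helpers above (assumes the site is reachable):
#   html = simple_get('https://scrapinghub.com')
#   # `html` holds the raw page bytes on success, or None for non-HTML
#   # responses and request errors.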
def get_urls(htmlsoup):
    urls = []
    links = htmlsoup.find_all('a')
    for link in links:
        url = link.get('href', '')
        if 'scrapinghub.com' in url:
            urls.append(url)
    return urls
def clean_line(line,
eol='\n',
minlen=1,
url_re = re.compile(r"""(?i)\b((?:https?:(?:/{1,3}|[a-z0-9%])|[a-z0-9.\-]+[.](?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw)/)(?:[^\s()<>{}\[\]]+|\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\))+(?:\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’])|(?:(?<!@)[a-z0-9]+(?:[.\-][a-z0-9]+)*[.](?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw)\b/?(?!@)))""", re.IGNORECASE|re.UNICODE)
):
""" The url regex is borrowed from https://github.com/rcompton/ryancompton.net/blob/master/assets/praw_drugs/urlmarker.py#L23 """
if not line.strip():
return line
#preprocess
##remove all URLs
line = url_re.sub('', line)
##clean from html tags
line = BeautifulSoup(line, "lxml").get_text()
##remove everything other than Latin chars, numbers and punctuations
line = regex.sub('[^\p{Latin} \p{Number}]', '', line)
##split into sentences
sentences = sent_tokenize(line)
##replace the strange double quotation marks with the normal ones, i.e. “ or ” --> "
sentences = [re.sub('(“|”)', '"', sentence) for sentence in sentences]
##replace the strange single quotation marks with the normal ones, i.e. ‘ or ’ --> '
sentences = [re.sub('(‘|’)', "'", sentence) for sentence in sentences]
##replace multiple spaces by one space
sentences = [re.sub('(\s)+', ' ', sentence) for sentence in sentences]
##lower
sentences = [sentence.strip().lower() + eol for sentence in sentences if len(sentence.split(" ")) > minlen]
return sentences
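# Illustrative call (hypothetical input; the exact split depends on the NLTK
# sentence tokenizer, so treat the output as approximate):
#   clean_line('Visit https://example.com for details. “Nice”!')
#   -> ['visit for details.\n']
#   The URL is stripped, quotes are normalised, and the one-word second
#   sentence is dropped by the `minlen` filter.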
def extract_text(htmlsoup):
    lines = []
    for line in htmlsoup.text.split('\n'):
        lines.append(clean_line(line))
    return lines
def draw_word_cloud(text):
    """
    Generate a word cloud from `text` and save it as scrapinghub_wordcloud.png.
    """
    wordcloud = WordCloud(max_words=1000, stopwords=set(STOPWORDS), random_state=1).generate(text)
    wordcloud.to_file("scrapinghub_wordcloud.png")
def flatten(l, a=None):
    """ Borrowed from https://stackoverflow.com/a/40252152/2558856 with a tiny modification """
    if a is None:
        a = []
    for i in l:
        if isinstance(i, list):
            flatten(i, a)
        else:
            a.append(i)
    return a
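# clean_line() returns a list of sentences per input line, so text_lines below
# ends up nested one level deep; flatten() collapses it, e.g.
#   flatten([['a.\n', 'b.\n'], ['c.\n']]) -> ['a.\n', 'b.\n', 'c.\n']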
if __name__ == '__main__':
    elapsed_time = time.time()
    urls = ['https://scrapinghub.com']
    scraped_urls = set()
    scraped_count = 0
    text_lines = []
    try:
        while urls:
            url = urls.pop(0)
            if url in scraped_urls:
                continue
            scraped_urls.add(url)
            html_content = simple_get(url)
            if not isinstance(html_content, bytes):
                continue
            htmlsoup = BeautifulSoup(html_content, 'lxml')
            urls.extend(get_urls(htmlsoup))
            textual_content = extract_text(htmlsoup)
            text_lines.extend(textual_content)
            scraped_count += 1
            print(f"Done scraping {url} -- count: {scraped_count}")
    except KeyboardInterrupt:
        print("\nInterrupted!\nProducing WordCloud..")
    text_lines = flatten(text_lines)
    text = '\n'.join(text_lines)
    if not text:
        text = 'empty!'
    draw_word_cloud(text)
    with open('scrapinghub.txt', 'w') as outfile:
        outfile.write(text)
    elapsed_time = (time.time() - elapsed_time) / 60
    redcolor = '\033[01;31;47m'
    blackcolor = '\033[01;30;47m'
    nativecolor = '\033[m'
    print("Scraped {} pages from {}scraping{}hub{} within {:5.2f} minutes."
          .format(scraped_count, blackcolor, redcolor, nativecolor, elapsed_time))