Skip to content

Instantly share code, notes, and snippets.

@nerflad
Last active August 18, 2017 01:02
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save nerflad/e8b545b0a42d434715c1c44b42d40421 to your computer and use it in GitHub Desktop.
Save nerflad/e8b545b0a42d434715c1c44b42d40421 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python3
from bs4 import BeautifulSoup
import os, sys
import urllib.request
import time
# Article pages to scrape, in reading order.  Every entry keeps the site's
# search query string ("?rq=...") exactly as it appeared in the original link.
# NOTE(review): the sixth and seventh entries both carry a "part-6" slug;
# presumably the seventh is really part 7 published under a mislabelled URL --
# confirm against the site.
urls = (
    "https://www.notsomoderndrummer.com/not-so-modern-drummer/columns/david-barsalou/mike-clark-words-of-wisdom?rq=mike%20clark%3A%20words%20of%20wisdom",
    "https://www.notsomoderndrummer.com/not-so-modern-drummer/columns/david-barsalou/mike-clark-words-of-wisdom-part-2?rq=mike%20clark%3A%20words%20of%20wisdom",
    "https://www.notsomoderndrummer.com/not-so-modern-drummer/columns/david-barsalou/mike-clark-words-of-wisdom-part-3?rq=mike%20clark%3A%20words%20of%20wisdom",
    "https://www.notsomoderndrummer.com/not-so-modern-drummer/columns/david-barsalou/mike-clark-words-of-wisdom-part-4?rq=mike%20clark%3A%20words%20of%20wisdom",
    "https://www.notsomoderndrummer.com/not-so-modern-drummer/columns/david-barsalou/mike-clark-words-of-wisdom-part-5?rq=mike%20clark%3A%20words%20of%20wisdom",
    "https://www.notsomoderndrummer.com/not-so-modern-drummer/columns/david-barsalou/mike-clark-words-of-wisdom-part-6?rq=mike%20clark%3A%20words%20of%20wisdom",
    "https://www.notsomoderndrummer.com/not-so-modern-drummer/2015/9/15/mike-clark-words-of-wisdom-part-6?rq=mike%20clark%3A%20words%20of%20wisdom",
    "https://www.notsomoderndrummer.com/not-so-modern-drummer/2016/1/22/mike-clark-words-of-wisdom-part8?rq=mike%20clark%3A%20words%20of%20wisdom",
    "https://www.notsomoderndrummer.com/not-so-modern-drummer/2016/4/29/mike-clark-words-of-wisdom-part-9?rq=mike%20clark%3A%20words%20of%20wisdom",
    "https://www.notsomoderndrummer.com/not-so-modern-drummer/2016/8/1/mike-clark-words-of-wisdom-part-10?rq=mike%20clark%3A%20words%20of%20wisdom",
    "https://www.notsomoderndrummer.com/not-so-modern-drummer/2016/11/4/mike-clark-words-of-wisdom-part-11?rq=mike%20clark%3A%20words%20of%20wisdom",
    "https://www.notsomoderndrummer.com/not-so-modern-drummer/2016/12/15/mike-clark-words-of-wisdom-part-12?rq=mike%20clark%3A%20words%20of%20wisdom",
    "https://www.notsomoderndrummer.com/not-so-modern-drummer/2017/3/15/mike-clark-words-of-wisdom-13?rq=mike%20clark%3A%20words%20of%20wisdom",
    "https://www.notsomoderndrummer.com/not-so-modern-drummer/2017/5/16/mike-clark-words-of-wisdom-chapter-14?rq=mike%20clark%3A%20words%20of%20wisdom",
    "https://www.notsomoderndrummer.com/not-so-modern-drummer/2017/6/20/mike-clark-words-of-wisdom-chapter-15?rq=mike%20clark%3A%20words%20of%20wisdom",
)
def bs4_resultset_to_strings(list_):
    """Return a new list holding ``str(item) + "\\n"`` for each item of *list_*.

    *list_* may be any iterable; the input is only iterated, never modified.
    """
    return [str(item) + '\n' for item in list_]
def get_soup_from_url(url):
    """Download *url* and return its body parsed with ``html.parser``.

    The body is decoded as UTF-8 before parsing.  Errors raised by
    ``urllib.request.urlopen`` (e.g. ``urllib.error.HTTPError``) and
    ``UnicodeDecodeError`` from the decode step propagate to the caller.
    """
    # The context manager closes the HTTP response as soon as the body has
    # been read, instead of leaving the connection open until garbage
    # collection.
    with urllib.request.urlopen(url) as response:
        raw = response.read()
    page = raw.decode("utf-8")
    return BeautifulSoup(page, 'html.parser')
def get_blogtext_from_soup(soup):
    """Return the second ``<div class="sqs-block-html">`` element of *soup*.

    NOTE(review): index 1 presumably skips a leading non-article block on
    these pages -- confirm against the site's markup.

    Raises:
        IndexError: if the page has fewer than two matching blocks.
    """
    blocks = soup.find_all('div', class_='sqs-block-html')
    if len(blocks) < 2:
        # Same exception type a bare ``blocks[1]`` would raise, but with a
        # message that says what was missing.
        raise IndexError(
            "expected at least two 'sqs-block-html' divs, found %d" % len(blocks)
        )
    return blocks[1]
def standardify_html(string_list):
    """Wrap the HTML fragments in *string_list* in a complete page skeleton.

    The list is modified in place: the doctype, ``<head>`` (charset and a
    small stylesheet), opening ``<body>`` and page heading are placed in
    front, and the closing ``</body>``/``</html>`` tags are added at the end.
    The same list object is returned.
    """
    preamble = (
        "<!doctype html>\n",
        "<html>\n",
        "<head>\n",
        " <meta charset=\"utf-8\" />\n",
        " <style>\n",
        " div { margin: auto; width: 60%; padding: 2em; background-color: #EEEEEE; }\n",
        " </style>\n",
        "</head>\n",
        "<body>\n",
        " <h2>Wisdom of Clark</h2>\n",
    )
    # Splice the whole preamble in at the front in one step.
    string_list[:0] = preamble
    string_list.extend(("</body>\n", "</html>\n"))
    return string_list
def _fetch_soup_with_retries(url, attempts=3, delay=4):
    """Fetch *url*, retrying on HTTP errors; return the soup, or None on failure.

    Up to *attempts* requests are made, sleeping *delay* seconds between
    consecutive tries.  Only ``urllib.error.HTTPError`` triggers a retry; any
    other exception propagates.
    """
    for attempt in range(1, attempts + 1):
        try:
            return get_soup_from_url(url)
        except urllib.error.HTTPError:
            if attempt == attempts:
                break  # no point sleeping after the final attempt
            print("Retrying URL:", url)
            time.sleep(delay)
    return None


def main():
    """Scrape every page in ``urls`` and write them out as one HTML document.

    The article block of each page is extracted, the fragments are
    concatenated in order, wrapped in a page skeleton and written to
    ``wisdom-of-clark.html`` in the current working directory.  A page that
    still fails after all retries is skipped with a warning on stderr, so a
    failed download never reuses the previous page's content.
    """
    final_html = []
    for url in urls:
        soup = _fetch_soup_with_retries(url)
        if soup is None:
            print("Giving up on URL:", url, file=sys.stderr)
            continue
        final_html += bs4_resultset_to_strings(get_blogtext_from_soup(soup))
    final_html = standardify_html(final_html)
    # The page header declares charset utf-8, so write the file as UTF-8
    # rather than in the platform's locale encoding.
    with open('wisdom-of-clark.html', 'w', encoding='utf-8') as _file:
        _file.writelines(final_html)
# Run the scraper only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment