Skip to content

Instantly share code, notes, and snippets.

@nerflad nerflad/
Last active Aug 18, 2017

What would you like to do?
#!/usr/bin/env python3
import os
import sys
import time
import urllib.error
import urllib.request

from bs4 import BeautifulSoup
# URLs of the blog pages to scrape.
# NOTE(review): the original tuple was truncated in this paste — only the
# first (empty) entry survived. Restore the real blog-post URLs before running.
urls = ("",)
def bs4_resultset_to_strings(list_):
    """Stringify every item of a bs4 ResultSet (or any iterable).

    Each element gets a trailing newline so the result can be written
    directly as lines of an HTML file.

    Returns a new list; the input is not modified.
    """
    return [str(item) + "\n" for item in list_]
def get_soup_from_url(url):
    """Download *url* and return the page parsed as a BeautifulSoup tree.

    Propagates urllib.error.HTTPError (and other URL errors) on failure;
    the caller handles retries.
    """
    # Context manager closes the HTTP response promptly instead of
    # leaking the socket until garbage collection.
    with urllib.request.urlopen(url) as response:
        page = response.read().decode("utf-8")
    return BeautifulSoup(page, 'html.parser')
def get_blogtext_from_soup(soup):
    """Return the second 'sqs-block-html' div found in the parsed page.

    NOTE(review): index 1 skips the first matching block — presumably
    page boilerplate; confirm against the site layout.
    """
    return soup.find_all('div', class_='sqs-block-html')[1]
def standardify_html(string_list):
    """Prepend the standard page header lines to *string_list* in place.

    Mutates *string_list* and also returns it, so callers may use either
    the return value or the mutated argument.
    """
    header = [
        "<!doctype html>\n",
        " <meta charset=\"utf-8\" />\n",
        " <style>\n",
        " div { margin: auto; width: 60%; padding: 2em; background-color: #EEEEEE; }\n",
        " </style>\n",
        " <h2>Wisdom of Clark</h2>\n",
    ]
    # One slice assignment instead of the original's repeated insert(0, ...),
    # which shifted the whole list once per header line (quadratic).
    string_list[:0] = header
    return string_list
def main():
    """Fetch each blog page, extract its text block, and write the combined
    result to 'wisdom-of-clark.html'.

    Each URL is retried up to three times on HTTPError, with a 4-second
    delay between attempts (per the original comment); a URL that never
    succeeds is skipped rather than aborting the whole run.
    """
    final_html = []
    for url in urls:
        soup = None
        for _attempt in range(3):  # retry up to three times with 4 second delay
            try:
                soup = get_soup_from_url(url)
                break
            except urllib.error.HTTPError:
                print("Retrying URL:", url)
                time.sleep(4)
        if soup is None:
            # All retries failed; skip this URL instead of crashing.
            continue
        final_html += bs4_resultset_to_strings(get_blogtext_from_soup(soup))
    final_html = standardify_html(final_html)
    with open('wisdom-of-clark.html', 'w+') as _file:
        for line in final_html:
            _file.write(line)


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.