# This script scrapes the text of the blogs. Please use it judiciously.
##scrapes entry text from all blogs and writes to text file
##horrible code can be blamed on @peteyreplies
from bs4 import BeautifulSoup
import os
import csv
import string
import urllib
import urllib2
import re
import unicodedata
##get list of best of the blogs links
#define a base URL & page variable to increment
# Prefix of the blog-listing URL; the numeric page offset is appended to it.
# It is blank here and must be set before the script can fetch anything.
baseURL = ""
# Offset of the listing page currently being processed.
p = 0
# Custom User-Agent sent with every listing-page request.
headers = {'User-Agent': 'PeteyBlogBot'}

# Listing pages are addressed by offset: 0, 20, 40, ... up to lastOffset
# (80 is enough for a test run; 4600 was the production value as of 5/14/2014).
lastOffset = 4600
pageStep = 20

# Paragraphs containing any of these strings are boilerplate, not entry text.
skipMarkers = (
    "Comments have been closed.",
    "No comments yet!",
    '[email protected]',
)

while p <= lastOffset:
    # Download one listing page, always releasing the connection afterwards.
    doc = urllib2.urlopen(urllib2.Request(baseURL + str(p), None, headers))
    try:
        html = doc.read()
    finally:
        doc.close()
    # Name the parser explicitly so both soups in this script use the same one.
    soup = BeautifulSoup(html, "lxml")
    print(str(p))

    # Collect the entry links: the href of the <a> inside each <h3>.
    links = []
    for heading in soup.find_all("h3"):
        anchor = heading.a
        if anchor is None or not anchor.get('href'):
            # An <h3> without a usable link cannot point at an entry.
            continue
        links.append(str(anchor['href']))
    # TODO: eventually pull entry authors, links, dates, etc. and store them
    # in a dict together with the entry text.

    # Fetch every linked entry and append its cleaned text to the output file.
    for i in links:
        doc2 = urllib2.urlopen(i)
        try:
            html2 = doc2.read()
        finally:
            doc2.close()
        soup2 = BeautifulSoup(html2, "lxml")

        # Select <p> tags by empty id and empty class; per the original
        # author these are the paragraphs that hold the entry's sentences.
        lines = soup2.find_all('p', id='', class_='')
        # Remove the 'see complete archives' outlier which always comes first.
        # NOTE(review): the original only described this step in a comment;
        # confirm the first paragraph is never real entry text.
        lines = lines[1:]

        pieces = []
        for l in lines:
            thisLine = l.getText()
            # NOTE(review): the original marker checks had no branch body;
            # skipping the paragraph is the assumed intent.
            if any(marker in thisLine for marker in skipMarkers):
                continue
            # Decompose accented characters, then drop whatever is not ASCII.
            cleanLine = unicodedata.normalize('NFKD', thisLine).encode('ascii', 'ignore')
            # Strip newlines and backslashes; each paragraph is written with
            # a single leading space as separator.
            pieces.append(' ' + cleanLine.replace('\n', '').replace('\\', ''))

        # One append per entry instead of reopening the file per paragraph.
        if pieces:
            with open('blogtext.txt', 'a') as out_file:
                out_file.write(''.join(pieces))

    # Move on to the next listing page.
    p = p + pageStep
