Navigation Menu

Skip to content

Instantly share code, notes, and snippets.

@thomasjensen
Created January 5, 2012 00:15
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save thomasjensen/1562997 to your computer and use it in GitHub Desktop.
Download blog posts from R-bloggers
from BeautifulSoup import BeautifulSoup
import mechanize
import time
# Scrape every post linked from the R-bloggers front page, following the
# site's numbered pagination until no "next page" link is found.
# Each post's full HTML is saved to a sequentially numbered file; any URL
# that fails to download is appended to errorlog.txt (best-effort: the
# crawl continues past individual failures).
#
# NOTE(review): depends on the Python 2-era `mechanize` and `BeautifulSoup`
# (v3) packages imported above; the hard-coded download directory is
# machine-specific — confirm it exists before running.
url = "http://www.r-bloggers.com/"
br = mechanize.Browser()
page = br.open(url)
count = 0          # running index used to number the saved post files
nextLink = []      # sentinel: any non-None value keeps the crawl loop going

# Start the error log fresh for this run.
with open("errorlog.txt", "w") as errorlog:
    errorlog.write("Pages not downloaded:\n")

while nextLink is not None:
    time.sleep(1)  # be polite to the server between page fetches
    soup = BeautifulSoup(page)

    # Each excerpt paragraph is followed by a "more-link" anchor that
    # points at the full post.
    links = [excerpt.findNext("a", {"class": "more-link"})["href"]
             for excerpt in soup.findAll("p", {"class": "excerpt"})]

    for link in links:
        try:
            site = br.open(str(link)).read()
            filename = ("/Users/thomasjensen/Documents/RBloggersScrape"
                        "/download/post" + str(count) + ".html")
            print(filename)
            with open(filename, "wb") as html:
                html.write(site)
            count += 1
        except Exception:
            # Record the failing URL and move on to the next post.
            with open("errorlog.txt", "a") as error:
                error.write(str(link) + "\n")

    # Pagination: the current page number sits in <span class="current">;
    # the anchor whose title is the next number leads to the next page.
    current = soup.find("span", {"class": "current"}).text
    nextNumber = str(int(current) + 1)
    try:
        nextLink = soup.find("a", title=nextNumber)["href"]
        page = br.open(nextLink)
    except Exception:
        nextLink = None  # no next-page link found: crawl is complete
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment