Skip to content

Instantly share code, notes, and snippets.

@adambard
Created December 1, 2013 06:57
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save adambard/7729366 to your computer and use it in GitHub Desktop.
Save adambard/7729366 to your computer and use it in GitHub Desktop.
import hashlib
import os

import requests
from bs4 import BeautifulSoup
# First page to fetch; main() starts here and follows each page's "next" link.
URL = ("http://www.forbes.com/pictures/emjl45gkke/on-startups/")
# jQuery equivalent of the next-page lookup done in read_page():
# $('.next.prev_next a').attr('href')
def get_page(url):
    """Return the raw body of ``url``, using an on-disk cache under ``tmp/``.

    The cache file is named after the MD5 hex digest of the URL. When the
    file can be read, its contents are returned and no request is made.
    Otherwise the page is downloaded with ``requests.get``, stored in the
    cache, and returned.

    Args:
        url: Address of the page to load (text or bytes).

    Returns:
        The page body as bytes.
    """
    # hashlib works on bytes; encode text URLs explicitly so that non-ASCII
    # URLs hash reliably. ASCII URLs keep the same digest as before.
    key = url if isinstance(url, bytes) else url.encode("utf-8")
    fname = u"tmp/" + hashlib.md5(key).hexdigest()

    try:
        # Binary mode: the cached data is the untouched response body.
        with open(fname, 'rb') as f:
            return f.read()
    except IOError:
        # Cache miss (or unreadable cache file): fall through to the download.
        pass

    resp = requests.get(url)
    body = resp.content

    # Create the cache directory on first use so that the write below does
    # not fail after the download has already succeeded.
    cache_dir = os.path.dirname(fname)
    if not os.path.isdir(cache_dir):
        os.makedirs(cache_dir)

    with open(fname, 'wb') as f:
        f.write(body)
    return body
def read_page(url):
    """Print one page's title and text and return the link to the next page.

    The page body is loaded through ``get_page``. The title is the text of
    the first ``.copy_box h2`` element; the content is the text of every
    ``.copy_box p`` element joined with newlines. The title, a dashed
    underline of the same length, the content and two blank lines are
    printed.

    Args:
        url: Address of the page to read.

    Returns:
        The ``href`` of the first ``.next.prev_next a`` anchor, or ``None``
        when an expected element or attribute is missing. In that case
        ``ERROR`` and the raw page body are printed.
    """
    body = get_page(url)
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed alongside BeautifulSoup.
    soup = BeautifulSoup(body, "html.parser")
    try:
        title = soup.select('.copy_box h2')[0].text
        paragraphs = [p.get_text() for p in soup.select('.copy_box p')]
        # The joined text is UTF-8 encoded before it is printed.
        content = u'\n'.join(paragraphs).encode('utf-8', 'replace')
        print(title)
        print(u"-" * len(title))
        print(content.strip())
        print(u"")
        print(u"")
        # Looked up last: a page without a next link still has its title and
        # text printed before the error branch runs.
        return soup.select('.next.prev_next a')[0]['href']
    except (IndexError, KeyError):
        # IndexError: a selector matched nothing; KeyError: anchor has no href.
        print(u"ERROR")
        print(body)
        return None
def main(start_url=None, max_pages=100):
    """Read pages by following "next" links, then print how many were read.

    Each page is handed to ``read_page``, whose return value becomes the next
    address to visit. The loop ends when ``max_pages`` pages have been read,
    when ``read_page`` returns ``None`` or an empty string, or when a page
    links to itself.

    Args:
        start_url: First page to read. ``None`` (the default) means the
            module-level ``URL``.
        max_pages: Upper bound on the number of pages read (default 100).
    """
    if start_url is None:
        start_url = URL

    n = 0
    url = None
    new_url = start_url
    # ``new_url`` is falsy for both None and '', the two "no next page" values.
    while n < max_pages and new_url and url != new_url:
        url = new_url
        new_url = read_page(url)
        n += 1
    print(u"%d %s" % (n, u"articles."))


if __name__ == u"__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment