Created

Embed URL

HTTPS clone URL

SSH clone URL

You can clone with HTTPS or SSH.

Download Gist
View scrape.py
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58
import hashlib
import os

import requests
from bs4 import BeautifulSoup
 
 
URL = ("http://www.forbes.com/pictures/emjl45gkke/on-startups/")
 
 
# $('.next.prev_next a').attr('href')
 
 
def get_page(url):
    """Return the body of *url* as bytes, caching it on disk under tmp/.

    The cache key is the MD5 hex digest of the URL, so repeated runs
    reuse previously downloaded pages instead of re-fetching them.

    Raises requests.HTTPError if the server returns an error status
    (the error page is deliberately NOT cached).
    """
    # md5 requires bytes; encode the URL explicitly.
    fname = "tmp/" + hashlib.md5(url.encode("utf-8")).hexdigest()

    try:
        # Binary mode: the cached payload is the raw response bytes.
        with open(fname, "rb") as f:
            return f.read()
    except IOError:
        resp = requests.get(url)
        # Don't cache error pages -- a cached 404/500 would poison
        # every future run for this URL.
        resp.raise_for_status()
        body = resp.content
        # The first run crashes without this: tmp/ is never created
        # anywhere else.
        if not os.path.isdir("tmp"):
            os.makedirs("tmp")
        with open(fname, "wb") as f:
            f.write(body)
        return body
 
 
def read_page(url):
    """Print one slide of the gallery at *url*; return the next page's URL.

    Extracts the slide title and paragraph text from the page, prints
    them as a plain-text report (title, underline, body, two blank
    lines), then returns the href of the "next" link so the caller can
    walk the whole gallery.

    Returns None when the expected elements are missing (last page or
    unexpected layout), after dumping the raw body for debugging.
    """
    body = get_page(url)
    # Name the parser explicitly: bare BeautifulSoup(body) picks
    # whichever parser happens to be installed and warns about it.
    soup = BeautifulSoup(body, "html.parser")

    try:
        title = soup.select(".copy_box h2")[0].text
        content = "\n".join(p.get_text() for p in soup.select(".copy_box p"))
        print(title)
        print("-" * len(title))
        print(content.strip())
        print()
        print()
        # href of the "next" link drives the crawl loop in main().
        return soup.select(".next.prev_next a")[0]["href"]
    except IndexError:
        # A selector matched nothing: dump the body to aid debugging
        # and fall through to an implicit None so main() stops.
        print("ERROR")
        print(body)
 
 
def main():
    """Crawl the gallery starting at URL, printing each slide.

    Follows "next" links until the next URL repeats the current one
    (self-loop), read_page() returns no next URL (None or empty), or a
    safety cap of 100 pages is reached -- the cap guards against an
    infinite walk around a cyclic gallery.
    """
    n = 0
    url = None
    new_url = URL
    # Truthiness covers both the None and empty-string stop cases.
    while n < 100 and url != new_url and new_url:
        url = new_url
        new_url = read_page(url)
        n += 1

    print(n, "articles.")


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Something went wrong with that request. Please try again.