Created
December 1, 2013 06:57
-
-
Save adambard/7729366 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import hashlib | |
import requests | |
from bs4 import BeautifulSoup | |
# First page of the gallery; the crawl in main() starts here.
URL = "http://www.forbes.com/pictures/emjl45gkke/on-startups/"

# The link to the following page is what this jQuery expression selects in a
# browser console (read_page uses the same CSS selector):
#   $('.next.prev_next a').attr('href')
def get_page(url):
    """Return the raw body of *url*, going through an on-disk cache.

    The cache file lives in the ``tmp`` directory (relative to the current
    working directory) and is named after the MD5 hex digest of the URL.
    On a cache hit the stored bytes are returned without any network access;
    on a miss the page is downloaded with ``requests``, written to the cache
    and returned.

    Args:
        url: Address of the page to fetch, as text or as a byte string.

    Returns:
        The response body as a byte string.

    Raises:
        requests.exceptions.RequestException: if the download fails or
            times out.
    """
    import os  # local import: only needed for the cache path handling

    cache_dir = "tmp"

    # hashlib works on bytes. Encode text explicitly instead of relying on an
    # implicit ASCII conversion, which breaks on non-ASCII URLs.
    key = url if isinstance(url, bytes) else url.encode("utf-8")
    fname = os.path.join(cache_dir, hashlib.md5(key).hexdigest())

    try:
        # Binary mode: the cache stores the response bytes untouched, so no
        # newline translation or decoding may happen on the way in or out.
        with open(fname, "rb") as f:
            return f.read()
    except IOError:
        # Not cached yet (or unreadable): fall through to the download.
        pass

    # Without a timeout a stalled server would hang the crawl forever.
    body = requests.get(url, timeout=30).content

    # Create the cache directory on first use so the write below cannot fail
    # just because it is missing.
    if not os.path.isdir(cache_dir):
        os.makedirs(cache_dir)
    with open(fname, "wb") as f:
        f.write(body)
    return body
def _to_stdout_text(text):
    """Return *text* in a form ``print`` can emit without an ASCII encode error."""
    # On Python 2 (where ``str`` is ``bytes``) printing unicode to a stream
    # without a known encoding raises UnicodeEncodeError, so hand print
    # UTF-8 bytes there. Elsewhere the text is passed through unchanged.
    if str is bytes:
        return text.encode("utf-8", "replace")
    return text


def read_page(url):
    """Print one gallery page and return the address of the next one.

    The page is fetched through ``get_page`` and parsed with Beautiful Soup.
    The first ``.copy_box h2`` element is printed as the title, underlined
    with one dash per character, followed by the text of every
    ``.copy_box p`` element (one per line) and two blank lines.

    Args:
        url: Address of the page to read.

    Returns:
        The ``href`` of the first ``.next.prev_next a`` element, or ``None``
        when the page has no title or no such link. In both of those cases
        ``ERROR`` and the raw page body are printed first.
    """
    body = get_page(url)
    # Name the parser explicitly so the result does not depend on which
    # optional parser libraries happen to be installed.
    soup = BeautifulSoup(body, "html.parser")

    headings = soup.select('.copy_box h2')
    if headings:
        title = headings[0].text
        content = u"\n".join(p.get_text() for p in soup.select('.copy_box p'))

        # Title and content go through the same conversion so a non-ASCII
        # title cannot fail where the content would have succeeded.
        print(_to_stdout_text(title))
        print(u"-" * len(title))
        print(_to_stdout_text(content).strip())
        print(u"")
        print(u"")

        next_links = soup.select('.next.prev_next a')
        if next_links:
            return next_links[0]['href']
        # NOTE(review): presumably the final page of the gallery has no next
        # link and therefore also ends up in the error report below - confirm.

    # Either the title or the next link is missing: dump the page for
    # inspection and signal the caller to stop.
    print(u"ERROR")
    print(body)
    return None
def main():
    """Follow the gallery from ``URL`` page by page and print a final count.

    Each page is handed to ``read_page``, whose return value becomes the next
    address. The walk ends after 100 pages, or as soon as the next address is
    ``None``, an empty string, or the same as the page just visited. The
    number of pages passed to ``read_page`` is printed at the end.
    """
    page_limit = 100
    visited = 0
    current = None
    upcoming = URL

    while visited < page_limit:
        # Nothing further to follow, or the page points back at itself.
        if upcoming is None or upcoming == '' or upcoming == current:
            break
        current = upcoming
        upcoming = read_page(current)
        visited += 1

    print(u"{0} {1}".format(visited, u"articles."))


# Run the crawl only when executed as a script, not when imported.
if __name__ == u"__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment