# NOTE: the lines below are leftover GitHub gist page text, not part of the script.
# Create a gist now
#
# Instantly share code, notes, and snippets.
#
# What would you like to do?
import hashlib
import os

import requests
from bs4 import BeautifulSoup
# Address of the first article to read. It is empty here, and main() stops
# immediately when the next URL is an empty string, so fill this in before
# running the script.
URL = ""
# Browser-console (jQuery) form of the "next article" lookup that read_page
# performs: $('.next.prev_next a').attr('href')
def get_page(url):
    """Return the raw body of *url*, using an on-disk cache under ``tmp/``.

    The cache file is named after the MD5 hex digest of the URL. On a cache
    hit the stored bytes are returned without touching the network. On a
    miss the page is downloaded with ``requests.get``, written to the cache
    and returned.

    Args:
        url: Address of the page to fetch (text or bytes).

    Returns:
        The response body as bytes.
    """
    # hashlib only accepts bytes, so encode text URLs before hashing.
    key = url if isinstance(url, bytes) else url.encode("utf-8")
    fname = u"tmp/" + hashlib.md5(key).hexdigest()

    try:
        with open(fname, "rb") as f:
            return f.read()
    except IOError:
        # Cache miss (or unreadable cache file): fall through and download.
        pass

    resp = requests.get(url)
    body = resp.content

    # Create the cache directory on first use so the write below cannot fail
    # just because "tmp/" does not exist yet.
    cache_dir = os.path.dirname(fname)
    if not os.path.isdir(cache_dir):
        os.makedirs(cache_dir)

    # Binary mode: resp.content is bytes and must be stored unchanged.
    with open(fname, "wb") as f:
        f.write(body)
    return body
def read_page(url):
    """Print one article and return the link to the next one.

    The page is fetched through ``get_page``. The first ``.copy_box h2``
    element is printed as the title, underlined with dashes, followed by the
    text of every ``.copy_box p`` element joined by newlines and two blank
    lines.

    Args:
        url: Address of the article page.

    Returns:
        The ``href`` of the first ``.next.prev_next a`` element, or ``None``
        when the title or the next link is missing. In that case ``ERROR``
        and the raw page body are printed.
    """
    body = get_page(url)
    # Name the parser explicitly so the result does not depend on which
    # optional parser libraries happen to be installed.
    soup = BeautifulSoup(body, "html.parser")
    try:
        title = soup.select('.copy_box h2')[0].text
        # Keep the content as text, like the title, so both print the same way.
        content = u"\n".join(
            p.get_text() for p in soup.select('.copy_box p')
        )
        print(title)
        print(u"-" * len(title))
        print(content.strip())
        print(u"")
        print(u"")
        return soup.select('.next.prev_next a')[0]['href']
    except IndexError:
        # A [0] lookup found nothing: either the page has no title or it has
        # no "next" link. Dump the body to help diagnose which.
        print(u"ERROR")
        print(body)
        return None
def main(max_articles=100):
    """Follow "next" links starting at ``URL``, printing each article.

    Args:
        max_articles: Upper bound on the number of pages read (default 100).

    The walk also stops when ``read_page`` yields no next link (``None`` or
    an empty string) or when a page links to itself. The number of pages
    read is printed at the end.
    """
    count = 0
    url = None
    new_url = URL
    # The truthiness test covers both None and '' as "no next link".
    while count < max_articles and new_url and new_url != url:
        url = new_url
        new_url = read_page(url)
        count += 1
    print(u"%d articles." % count)
# Run the scraper only when this file is executed as a script.
if __name__ == "__main__":
    main()
# NOTE: leftover GitHub gist page footer, not part of the script:
# Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment