Skip to content

Instantly share code, notes, and snippets.

@simonrw
Created February 1, 2013 16:27
Show Gist options
  • Star 2 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save simonrw/4692366 to your computer and use it in GitHub Desktop.
Save simonrw/4692366 to your computer and use it in GitHub Desktop.
Code for flattening a paginated website
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup
import requests
def analyse_page(url, timeout=30):
    """Fetch one page and return the paragraphs of its gallery as strings.

    Downloads ``url``, looks up the element with ``id="gallery-mode"``,
    takes the second child node of that element and serialises every
    ``<p>`` tag found beneath it with ``str()``.

    Args:
        url: Address of the page to download.
        timeout: Seconds ``requests`` waits on the server before giving up.
            Pass ``None`` to wait indefinitely.

    Returns:
        A list containing one string per ``<p>`` tag found.

    Raises:
        AssertionError: If the response status code is not 200.

    Exceptions raised by ``requests`` itself (connection errors, timeouts)
    are not caught here.
    """
    # Without a timeout a single stalled connection would block forever.
    r = requests.get(url, timeout=timeout)

    # Raised explicitly instead of via an ``assert`` statement so the check
    # is not stripped when Python runs with -O; the exception type is kept
    # so that existing callers see the same failure as before.
    if r.status_code != 200:
        raise AssertionError(
            'Unexpected HTTP status {0} for {1}'.format(r.status_code, url))

    soup = BeautifulSoup(r.content)
    parent_div = soup.find(id='gallery-mode')
    # NOTE(review): index 1 is the second child node of the container;
    # presumably index 0 is leading whitespace - confirm against the page
    # markup. A page without the container makes ``parent_div`` None and
    # this line raises AttributeError.
    content_div_contents = parent_div.contents[1]
    return [str(val) for val in content_div_contents.find_all('p')]
if __name__ == '__main__':
    # Flatten a paginated article into one local HTML file: every page is
    # downloaded in turn and its paragraphs are appended, with a horizontal
    # rule after each page.
    url_root = 'http://whatculture.com/film/quentin-tarantino-definitive-guide.php/'
    npages = 179

    # Start with the document header; the charset declared here is the one
    # the output file is encoded with below.
    extra_html = [
        '<!doctype html>',
        '<html>',
        '<head>',
        '<meta charset="UTF-8">',
        '</head>',
        '<body>',
    ]

    # Page URLs are the root followed by a 1-based page number.
    # ``range`` and the parenthesised ``print`` run on Python 2 and 3 alike.
    for i in range(1, npages + 1):
        print("Analysing page {0}".format(i))
        page_url = url_root + str(i)
        extra_html.extend(analyse_page(page_url))
        extra_html.append('<hr />')

    extra_html.extend([
        '</body>',
        '</html>',
    ])

    document = '\n'.join(extra_html)
    # On Python 2 the joined ``str`` is already a byte string; on Python 3
    # it is text and must be encoded to match the declared UTF-8 charset
    # instead of relying on the platform's default encoding.
    if not isinstance(document, bytes):
        document = document.encode('utf-8')
    with open('/tmp/index.html', 'wb') as outfile:
        outfile.write(document)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment