@davish
Created January 6, 2013 00:53
Python program to scrape a hypertext e-book (e.g. Eloquent JavaScript) into a previously created subdirectory book/ of the current working directory. The one dependency is BeautifulSoup4. Import the module and call the scrape() function with the URL, the id of the div containing the table of contents, and the CSS class of the div containing the page content.
# Copyright (c) Davis Haupt
# Licensed under the MIT License
from bs4 import BeautifulSoup
import urllib

def getContent(url, content_div):
    """Fetch a page and return a (content div, page title) tuple."""
    soup = BeautifulSoup(urllib.urlopen(url))
    return (soup.find("div", content_div), soup.title.string)

def save(s, n):
    """Write the string s to the file n.html."""
    f = open(n + '.html', 'w')
    f.write(s)
    f.close()

def scrape(url, table_of_contents_div, content_div):
    """
    Download every chapter linked from the table of contents at url
    and build a local copy of the book via build_html().
    """
    soup = BeautifulSoup(urllib.urlopen(url))
    contents = []
    for link in soup.find("div", id=table_of_contents_div).find_all('a'):
        page = link.get("href")
        p = getContent(url + page, content_div)
        title = p[1]
        c = p[0]
        # Strip any slash from the title so it can be used as a filename.
        hasslash = title.find('/')
        if hasslash != -1:
            title = title[:hasslash] + title[hasslash + 1:]
        contents.append((title, c))
        print page
    build_html(contents)
    return "ok"

def build_html(pages):
    """
    Takes a list of tuples where first elements are titles and second
    elements are page contents. Saves each page to book/<title>.html and
    writes an index page linking to all of them.
    """
    links = []
    for p in pages:
        k = p[0]  # chapter title
        v = p[1]  # chapter content
        links.append('<a href="%s.html">%s</a>' % (k, k))
        save(str(v), "book/" + k)
        print k
    table = '<br>'.join(links)
    save(table, "book/index")
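
For reference, a minimal usage sketch. The module name scraper, the URL, and the div id / CSS class below are illustrative assumptions, not values from the gist; note that scrape() expects the book/ subdirectory to already exist.

import os
import scraper  # hypothetical module name, assuming this gist is saved as scraper.py

# scrape() writes its output into book/, which must exist beforehand.
if not os.path.isdir('book'):
    os.mkdir('book')

# The URL, div id, and CSS class are placeholders; inspect the target
# e-book's table-of-contents page to find the real values.
scraper.scrape('http://example.com/book/', 'toc', 'chapter-content')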