Created
January 6, 2013 00:53
-
-
Save davish/4464564 to your computer and use it in GitHub Desktop.
Python program to scrape data from a hypertext e-book (e.g. Eloquent JavaScript) into a previously created subdirectory book/ of the current working directory. The one dependency is BeautifulSoup4. Import the module and call the scrape() function with the url, the id of the div containing the table of contents, and the css class of the div containing the page content.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Copyright (c) Davis Haupt
# Licensed under the MIT License
from bs4 import BeautifulSoup
import urllib
def getContent(url, content_div):
    """Fetch a page and extract its main content.

    url         -- absolute url of the page to fetch
    content_div -- css class of the <div> that holds the page body

    Returns a tuple (content, title): the first matching <div> (a bs4
    Tag, or None if no div has that class) and the page's <title>
    string.
    """
    page = urllib.urlopen(url)
    try:
        soup = BeautifulSoup(page)
    finally:
        # Close the HTTP response explicitly: the handle returned by
        # urllib.urlopen is not a context manager in Python 2, so the
        # original code leaked one connection per chapter.
        page.close()
    return (soup.find("div", content_div), soup.title.string)
def save(s, n):
    """Write string s to the file n + '.html' in text mode.

    Uses a with-block so the file handle is closed even if the write
    raises (the original open/write/close leaked the handle on error).
    """
    with open(n + '.html', 'w') as f:
        f.write(s)
def scrape(url, table_of_contents_div, content_div):
    """Scrape every chapter linked from a book's table of contents.

    url                   -- base url of the book; chapter hrefs in the
                             table of contents are appended to it
    table_of_contents_div -- id of the <div> holding the chapter links
    content_div           -- css class of the <div> holding each
                             chapter's body (passed to getContent)

    Saves each chapter plus an index page under book/ via build_html().
    Returns "ok" on completion.
    """
    soup = BeautifulSoup(urllib.urlopen(url))
    contents = []
    for link in soup.find("div", id=table_of_contents_div).find_all('a'):
        page = link.get("href")
        c, title = getContent(url + page, content_div)
        # Strip the first '/' from the title so it can be used as a
        # filename.  (The original `if hasslash != -1` was missing its
        # colon -- a syntax error -- fixed here; replace(..., 1) removes
        # exactly the first slash, matching the original slicing.)
        title = title.replace('/', '', 1)
        contents.append((title, c))
        # Parenthesized print works on both Python 2 and 3; the
        # original bare `print page` statement was Python 2 only.
        print(page)
    build_html(contents)
    return "ok"
def build_html(pages):
    """
    Takes a list of tuples where first elements are titles and second
    elements are page contents.  Writes each page to book/<title>.html
    and a table-of-contents page to book/index.html that links to all
    of them (entries separated by <br>).

    Assumes the book/ subdirectory already exists (see module
    description); titles must already be filename-safe.
    """
    links = []
    for title, content in pages:
        links.append('<a href="%s.html">%s</a>' % (title, title))
        # str() because content is a bs4 Tag, not a plain string.
        save(str(content), "book/" + title)
        # Parenthesized print works on both Python 2 and 3; the
        # original bare `print k` statement was Python 2 only.
        print(title)
    table = '<br>'.join(links)
    save(table, "book/index")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment