Created
January 6, 2013 00:53
-
-
Save davish/4464564 to your computer and use it in GitHub Desktop.
Python program to scrape data from a hypertext e-book (e.g. Eloquent JavaScript) into a previously created subdirectory book/ of the current working directory. The one dependency is BeautifulSoup4. Import the module and call the scrape() function with the url, the id of the div containing the table of contents, and the css class of the div containing the page content.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Copyright (c) Davis Haupt
# Licensed under the MIT License
from bs4 import BeautifulSoup
import urllib
def getContent(url, content_div):
    """Fetch a page and extract its main content.

    url         -- absolute url of the page to fetch
    content_div -- css class of the <div> that holds the page body

    Returns a tuple (content, title): the first matching <div> (a bs4
    Tag, or None if no div has that class) and the page's <title>
    string.
    """
    page = urllib.urlopen(url)
    try:
        soup = BeautifulSoup(page)
    finally:
        # Close the HTTP response explicitly: the handle returned by
        # urllib.urlopen is not a context manager in Python 2, so the
        # original code leaked one connection per chapter.
        page.close()
    return (soup.find("div", content_div), soup.title.string)
def save(s, n):
    """Write string s to the file n + '.html' in text mode.

    Uses a with-block so the file handle is closed even if the write
    raises (the original open/write/close leaked the handle on error).
    """
    with open(n + '.html', 'w') as f:
        f.write(s)
def scrape(url, table_of_contents_div, content_div):
    """Scrape every chapter linked from a book's table of contents.

    url                   -- base url of the book; chapter hrefs in the
                             table of contents are appended to it
    table_of_contents_div -- id of the <div> holding the chapter links
    content_div           -- css class of the <div> holding each
                             chapter's body (passed to getContent)

    Saves each chapter plus an index page under book/ via build_html().
    Returns "ok" on completion.
    """
    soup = BeautifulSoup(urllib.urlopen(url))
    contents = []
    for link in soup.find("div", id=table_of_contents_div).find_all('a'):
        page = link.get("href")
        c, title = getContent(url + page, content_div)
        # Strip the first '/' from the title so it can be used as a
        # filename.  (The original `if hasslash != -1` was missing its
        # colon -- a syntax error -- fixed here; replace(..., 1) removes
        # exactly the first slash, matching the original slicing.)
        title = title.replace('/', '', 1)
        contents.append((title, c))
        # Parenthesized print works on both Python 2 and 3; the
        # original bare `print page` statement was Python 2 only.
        print(page)
    build_html(contents)
    return "ok"
def build_html(pages):
    """
    Takes a list of tuples where first elements are titles and second
    elements are page contents.  Writes each page to book/<title>.html
    and a table-of-contents page to book/index.html that links to all
    of them (entries separated by <br>).

    Assumes the book/ subdirectory already exists (see module
    description); titles must already be filename-safe.
    """
    links = []
    for title, content in pages:
        links.append('<a href="%s.html">%s</a>' % (title, title))
        # str() because content is a bs4 Tag, not a plain string.
        save(str(content), "book/" + title)
        # Parenthesized print works on both Python 2 and 3; the
        # original bare `print k` statement was Python 2 only.
        print(title)
    table = '<br>'.join(links)
    save(table, "book/index")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment