# gazs/scifiarchive.py

## scifiarchive.py
import urllib, re
from BeautifulSoup import BeautifulSoup, Tag, NavigableString

# Scrape every story linked from the archive index and save each one as a
# standalone HTML file (title as <h1>, author bio as <h2>, then the story body)
# in the current working directory.
#
# NOTE: this script targets Python 2 (urllib.urlopen, BeautifulSoup 3).
archive_base = "http://lexal.net/scifi/scifiction/"


def _fetch_soup(url):
    """Download *url* and return it parsed as a BeautifulSoup document.

    The HTTP response is always closed after it has been handed to the
    parser, so connections do not accumulate over the course of the scrape.
    """
    response = urllib.urlopen(url)
    try:
        return BeautifulSoup(response)
    finally:
        response.close()


# archive_base already ends in "/", so the page name is appended without a
# second slash (the previous code requested ".../scifiction//archive.html").
archive_soup = _fetch_soup(archive_base + "archive.html")

for link in archive_soup.findAll("a", {"href": re.compile('originals_archive')}):
    href = link["href"]

    # The output file is named after the third "/"-separated segment of the
    # link; skip links too short to have one instead of dying on IndexError.
    segments = href.split("/")
    if len(segments) < 3:
        print("Skipping %s: link has no third path segment" % href)
        continue

    novella = _fetch_soup(archive_base + href)

    # Elements of interest on a story page:
    #   span.storytitle, span.storybio, td.bodytext
    title_span = novella.find("span", {"class": "storytitle"})
    bio_span = novella.find("span", {"class": "storybio"})
    bodytext = novella.find("td", {"class": "bodytext"})
    if title_span is None or bio_span is None or bodytext is None:
        # find() returns None when nothing matches; skip this page rather
        # than abort the whole run with an AttributeError.
        print("Skipping %s: story title, bio or body text not found" % href)
        continue

    storytitle = title_span.text
    storybio = bio_span.text
    # Single pre-formatted argument: prints "title bio" exactly as before.
    print("%s %s" % (storytitle, storybio))

    # Build a fresh document holding only the heading, the bio and the body.
    soup = BeautifulSoup()
    titletag = Tag(soup, "h1")
    titletag.insert(0, NavigableString(storytitle))
    biotag = Tag(soup, "h2")
    biotag.insert(0, NavigableString(storybio))
    soup.insert(0, titletag)
    soup.insert(1, biotag)
    soup.insert(2, bodytext)

    # "with" guarantees the file is closed even if the write fails.
    with open(segments[2] + ".html", "w") as outfile:
        outfile.write(str(soup))