Skip to content

Instantly share code, notes, and snippets.

@gazs
Created November 17, 2010 08:44
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save gazs/703151 to your computer and use it in GitHub Desktop.
Save gazs/703151 to your computer and use it in GitHub Desktop.
import urllib, re
from BeautifulSoup import BeautifulSoup, Tag, NavigableString
# Download every story linked from the archive index at archive_base and save
# each one as a small standalone HTML file: <h1> title, <h2> author bio, then
# the story's body cell.
archive_base = "http://lexal.net/scifi/scifiction/"

# archive_base already ends in "/", so the page name is appended without a
# leading slash (the original produced ".../scifiction//archive.html").
archive_soup = BeautifulSoup(urllib.urlopen(archive_base + "archive.html"))

for link in archive_soup.findAll("a", {"href": re.compile('originals_archive')}):
    novella = BeautifulSoup(urllib.urlopen(archive_base + link['href']))

    # Elements pulled from each story page:
    #   span.storytitle, span.storybio, td.bodytext
    title_node = novella.find("span", {"class": "storytitle"})
    bio_node = novella.find("span", {"class": "storybio"})
    bodytext = novella.find("td", {"class": "bodytext"})

    # find() returns None when nothing matches; skip such pages instead of
    # aborting the whole run with an AttributeError on `.text`.
    if title_node is None or bio_node is None or bodytext is None:
        print("skipping %s: missing title, bio or body" % link['href'])
        continue

    storytitle = title_node.text
    storybio = bio_node.text
    # Progress line; built as one string so it prints the same
    # "title bio" text whether `print` is a statement or a function.
    print("%s %s" % (storytitle, storybio))

    # Assemble the output document in a fresh, empty soup.
    soup = BeautifulSoup()
    titletag = Tag(soup, "h1")
    titletag.insert(0, NavigableString(storytitle))
    biotag = Tag(soup, "h2")
    biotag.insert(0, NavigableString(storybio))
    soup.insert(0, titletag)
    soup.insert(1, biotag)
    soup.insert(2, bodytext)

    # The output name is the third "/"-separated component of the href.
    # NOTE(review): assumes every matching href has at least three
    # components; a shorter one raises IndexError here.
    out_name = link["href"].split("/")[2] + ".html"

    # `with` guarantees the file is closed even if the write fails, and
    # avoids shadowing the builtin `file`.
    with open(out_name, "w") as out_file:
        out_file.write(str(soup))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment