Skip to content

Instantly share code, notes, and snippets.

Created Nov 17, 2010
What would you like to do?
import urllib, re
from BeautifulSoup import BeautifulSoup, Tag, NavigableString
# Scrape every story linked from the archive index page and, for each one,
# assemble a minimal HTML document (title as <h1>, bio as <h2>, followed by
# the story body cell) and save it to a local ".html" file.
#
# NOTE(review): archive_base is empty here -- presumably the site root URL was
# removed before this script was published; confirm and fill it in before
# running, otherwise the URLs below are just "/archive.html" and bare hrefs.
archive_base = ""
archive_soup = BeautifulSoup(urllib.urlopen(archive_base + "/archive.html"))

# Only anchors whose href matches 'originals_archive' are followed.
for link in archive_soup.findAll("a", {"href": re.compile('originals_archive')}):
    novella = BeautifulSoup(urllib.urlopen(archive_base + link['href']))

    # Elements pulled from each story page:
    #   span.storytitle
    #   span.storybio
    #   td.bodytext
    storytitle = novella.find("span", {"class": "storytitle"}).text
    storybio = novella.find("span", {"class": "storybio"}).text
    bodytext = novella.find("td", {"class": "bodytext"})

    # Progress output. A single parenthesised argument prints the same
    # "title bio" line under the Python 2 print statement.
    print("%s %s" % (storytitle, storybio))

    # Build a fresh document: <h1>title</h1><h2>bio</h2> + the body cell.
    soup = BeautifulSoup()
    titletag = Tag(soup, "h1")
    titletag.insert(0, NavigableString(storytitle))
    biotag = Tag(soup, "h2")
    biotag.insert(0, NavigableString(storybio))
    soup.insert(0, titletag)
    soup.insert(1, biotag)
    soup.insert(2, bodytext)

    # The output name is the third "/"-separated component of the href.
    out_name = link["href"].split("/")[2] + ".html"

    # Previously the file was opened but the assembled document was never
    # written and the handle was never closed (and it shadowed the `file`
    # builtin). The with-block writes the markup and guarantees the close.
    with open(out_name, "w") as out:
        out.write(str(soup))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment