Skip to content

Instantly share code, notes, and snippets.

@leroux
Last active January 1, 2016 02:29
Show Gist options
  • Save leroux/8079928 to your computer and use it in GitHub Desktop.
Save leroux/8079928 to your computer and use it in GitHub Desktop.
Hacky Worm Scraper
# Worm Scraper by Arc
# Use pandoc to convert into something nicer.
import re
from bs4 import BeautifulSoup
from urllib2 import urlopen
# Entry point of the crawl: the first chapter of the first arc.
url = "http://parahumans.wordpress.com/category/stories-arcs-1-10/arc-1-gestation/1-01/"
arc = ""    # First word of the chapter title that opened the current arc.
arcN = 0    # Number of arcs started so far; used in the output file name.
f = None    # Output file for the current arc; opened when an arc starts.

# Matches the link text of the "Next Chapter" navigation anchor.
# Compiled once because the pattern never changes between pages.
NEXT_CHAPTER_RE = re.compile('.*Next.Chapter.*')

try:
    # Keep traversing until a post with no "Next Chapter" link is hit.
    while True:
        html = urlopen(url.encode('utf-8').strip()).read()
        soup = BeautifulSoup(html)

        # Demote the post title from <h1> to <h2> in the output document.
        title = soup.find("h1", class_="entry-title")
        title.name = "h2"
        # Single parenthesised argument: prints the same on Python 2 and 3.
        print(title.text)

        # The first word of the title identifies the arc. Use .text rather
        # than .string, which is None when the heading has several children.
        maybeArc = title.text.split()[0]
        # Interludes stay in the file of the arc they follow; any other
        # change of first word starts a new arc and a new output file.
        # NOTE(review): if the very first page were an interlude, f would
        # still be None at the writes below -- not the case for the start URL.
        if maybeArc != arc and maybeArc not in ("Interlude", "Interlude:"):
            arc = maybeArc
            arcN += 1
            if f is not None:
                f.close()
            f = open('Worm ' + str(arcN) + ' - ' + arc + '.html', 'a')

        content = soup.find("div", class_="entry-content")

        # Look up the next-chapter link BEFORE links are stripped below.
        nextChapter_tags = soup.find_all("a", text=NEXT_CHAPTER_RE)
        if nextChapter_tags:
            url = nextChapter_tags[0]['href']

        # Remove all links (includes last/next chapter links).
        for tag in content.find_all("a", href=True):
            tag.decompose()

        # Remove the "share this" block, which not every post has.
        share = content.find("div", id="jp-post-flair")
        if share is not None:
            share.decompose()

        f.write(title.prettify().encode('utf-8').strip() + '\n')
        f.write(content.prettify().encode('utf-8').strip() + '\n')
        f.write("<hr>")

        if not nextChapter_tags:
            break
finally:
    # Close the last arc's file on normal completion and on any error.
    if f is not None:
        f.close()
# Worm Scraper
# Dumps into an html file.
# Use pandoc to convert into something nicer.
import re
from bs4 import BeautifulSoup
from urllib2 import urlopen
# Entry point of the crawl: the first chapter of the first arc.
url = "http://parahumans.wordpress.com/category/stories-arcs-1-10/arc-1-gestation/1-01/"

# Matches the link text of the "Next Chapter" navigation anchor.
# Compiled once because the pattern never changes between pages.
NEXT_CHAPTER_RE = re.compile('.*Next.Chapter.*')

# The context manager closes worm.html on normal completion and on any
# error raised while fetching or parsing a page.
with open('worm.html', 'w') as f:
    f.write("<h1>Worm</h1>\n")

    # Keep traversing until a post with no "Next Chapter" link is hit.
    while True:
        html = urlopen(url.encode('utf-8').strip()).read()
        soup = BeautifulSoup(html)

        # Demote the post title from <h1> to <h2>; <h1> is the book title.
        title = soup.find("h1", class_="entry-title")
        title.name = "h2"
        # Single parenthesised argument: prints the same on Python 2 and 3.
        print(title.text)

        content = soup.find("div", class_="entry-content")

        # Look up the next-chapter link BEFORE links are stripped below.
        nextChapter_tags = soup.find_all("a", text=NEXT_CHAPTER_RE)
        if nextChapter_tags:
            url = nextChapter_tags[0]['href']

        # Remove all links (includes last/next chapter links).
        for tag in content.find_all("a", href=True):
            tag.decompose()

        # Remove the "share this" block, which not every post has.
        share = content.find("div", id="jp-post-flair")
        if share is not None:
            share.decompose()

        f.write(title.prettify().encode('utf-8').strip() + '\n')
        f.write(content.prettify().encode('utf-8').strip() + '\n')
        f.write("<hr>")

        if not nextChapter_tags:
            break
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment