# leroux/scrape-worm-separate.py

## scrape-worm-separate.py
# Worm Scraper by Arc

# Use pandoc to convert into something nicer.

import re

from bs4 import BeautifulSoup
from urllib2 import urlopen

# Scrape the serial chapter by chapter, starting at `url` and following each
# page's "Next Chapter" link, writing one HTML file per arc
# ("Worm <n> - <arc>.html").  Interludes are written to the file of the arc
# that is currently open.
url = "http://parahumans.wordpress.com/category/stories-arcs-1-10/arc-1-gestation/1-01/"
#endUrl = "http://parahumans.wordpress.com/2013/11/19/interlude-end/"

# Link-text pattern identifying the "Next Chapter" anchor; compiled once
# instead of on every page.
nextChapterRe = re.compile('.*Next.Chapter.*')

# First title word of the arc currently being written, and the number of
# arcs started so far (used to number the output files).
arc = ""
arcN = 0

# Output file of the current arc; None until the first arc is seen.
f = None

try:
    # Keep traversing until post with no "Next Chapter" is hit.
    while True:
        html = urlopen(url.encode('utf-8').strip()).read()
        soup = BeautifulSoup(html)

        # Demote the post title from <h1> to <h2> for the output document.
        title = soup.find("h1", class_="entry-title")
        title.name = "h2"
        print(title.text)

        # The first word of the title identifies the arc.  .text is used
        # rather than .string because .string is None when the heading has
        # more than one child node.
        maybeArc = title.text.split()[0]
        if maybeArc != arc and maybeArc not in ("Interlude", "Interlude:"):
            arc = maybeArc
            arcN = arcN + 1
            if f is not None:
                f.close()
            # Append mode: an existing file of the same name is extended,
            # not truncated.
            f = open('Worm ' + str(arcN) + ' - ' + arc + '.html', 'a')

        content = soup.find("div", class_="entry-content")

        # Read the next chapter's URL before the links are stripped below.
        nextChapter_tags = soup.find_all("a", text=nextChapterRe)
        nextUrl = nextChapter_tags[0]['href'] if nextChapter_tags else None

        # Remove all links (includes last/next chapter links).
        for tag in content.find_all("a", href=True):
            tag.decompose()

        # Remove "share this" -- only if the page actually has that block.
        shareDiv = content.find("div", id="jp-post-flair")
        if shareDiv is not None:
            shareDiv.decompose()

        f.write(title.prettify().encode('utf-8').strip() + '\n')
        f.write(content.prettify().encode('utf-8').strip() + '\n')
        f.write("<hr>")

        if nextUrl is None:
            break
        url = nextUrl
finally:
    # Close whichever arc file is still open, on normal completion and
    # when a fetch or parse step raises.
    if f is not None:
        f.close()

## scrape-worm.py
# Worm Scraper

# Dumps into an html file.
# Use pandoc to convert into something nicer.

import re

from bs4 import BeautifulSoup
from urllib2 import urlopen

# Scrape the serial chapter by chapter, starting at `url` and following each
# page's "Next Chapter" link, dumping every chapter into worm.html.
url = "http://parahumans.wordpress.com/category/stories-arcs-1-10/arc-1-gestation/1-01/"
#endUrl = "http://parahumans.wordpress.com/2013/11/19/interlude-end/"

# Link-text pattern identifying the "Next Chapter" anchor; compiled once
# instead of on every page.
nextChapterRe = re.compile('.*Next.Chapter.*')

# The context manager closes worm.html even when a fetch or parse step raises.
with open('worm.html', 'w') as f:
    f.write("<h1>Worm</h1>\n")

    # Keep traversing until post with no "Next Chapter" is hit.
    while True:
        html = urlopen(url.encode('utf-8').strip()).read()
        soup = BeautifulSoup(html)

        # Demote the post title from <h1> to <h2>, below the document's <h1>.
        title = soup.find("h1", class_="entry-title")
        title.name = "h2"
        print(title.text)

        content = soup.find("div", class_="entry-content")

        # Read the next chapter's URL before the links are stripped below.
        nextChapter_tags = soup.find_all("a", text=nextChapterRe)
        nextUrl = nextChapter_tags[0]['href'] if nextChapter_tags else None

        # Remove all links (includes last/next chapter links).
        for tag in content.find_all("a", href=True):
            tag.decompose()

        # Remove "share this" -- only if the page actually has that block.
        shareDiv = content.find("div", id="jp-post-flair")
        if shareDiv is not None:
            shareDiv.decompose()

        f.write(title.prettify().encode('utf-8').strip() + '\n')
        f.write(content.prettify().encode('utf-8').strip() + '\n')
        f.write("<hr>")

        if nextUrl is None:
            break
        url = nextUrl
	# Worm Scraper by Arc

	# Use pandoc to convert into something nicer.

	import re

	from bs4 import BeautifulSoup
	from urllib2 import urlopen

	# Scrape the serial chapter by chapter, starting at `url` and following
	# each page's "Next Chapter" link, writing one HTML file per arc
	# ("Worm <n> - <arc>.html").  Interludes are written to the file of the
	# arc that is currently open.
	url = "http://parahumans.wordpress.com/category/stories-arcs-1-10/arc-1-gestation/1-01/"
	#endUrl = "http://parahumans.wordpress.com/2013/11/19/interlude-end/"

	# Link-text pattern identifying the "Next Chapter" anchor.  The `*`
	# quantifiers are required: without them the pattern demands an extra
	# character on each side and misses a link whose text is exactly
	# "Next Chapter".
	nextChapterRe = re.compile('.*Next.Chapter.*')

	# First title word of the arc currently being written, and the number of
	# arcs started so far (used to number the output files).
	arc = ""
	arcN = 0

	# Output file of the current arc; None until the first arc is seen.
	f = None

	try:
		# Keep traversing until post with no "Next Chapter" is hit.
		while True:
			html = urlopen(url.encode('utf-8').strip()).read()
			soup = BeautifulSoup(html)

			# Demote the post title from <h1> to <h2> for the output document.
			title = soup.find("h1", class_="entry-title")
			title.name = "h2"
			print(title.text)

			# The first word of the title identifies the arc.  .text is used
			# rather than .string because .string is None when the heading
			# has more than one child node.
			maybeArc = title.text.split()[0]
			if maybeArc != arc and maybeArc not in ("Interlude", "Interlude:"):
				arc = maybeArc
				arcN = arcN + 1
				if f is not None:
					f.close()
				# Append mode: an existing file of the same name is extended,
				# not truncated.
				f = open('Worm ' + str(arcN) + ' - ' + arc + '.html', 'a')

			content = soup.find("div", class_="entry-content")

			# Read the next chapter's URL before the links are stripped below.
			nextChapter_tags = soup.find_all("a", text=nextChapterRe)
			nextUrl = nextChapter_tags[0]['href'] if nextChapter_tags else None

			# Remove all links (includes last/next chapter links).
			for tag in content.find_all("a", href=True):
				tag.decompose()

			# Remove "share this" -- only if the page actually has that block.
			shareDiv = content.find("div", id="jp-post-flair")
			if shareDiv is not None:
				shareDiv.decompose()

			f.write(title.prettify().encode('utf-8').strip() + '\n')
			f.write(content.prettify().encode('utf-8').strip() + '\n')
			f.write("<hr>")

			if nextUrl is None:
				break
			url = nextUrl
	finally:
		# Close whichever arc file is still open, on normal completion and
		# when a fetch or parse step raises.
		if f is not None:
			f.close()
	# Worm Scraper

	# Dumps into an html file.
	# Use pandoc to convert into something nicer.

	import re

	from bs4 import BeautifulSoup
	from urllib2 import urlopen

	# Scrape the serial chapter by chapter, starting at `url` and following
	# each page's "Next Chapter" link, dumping every chapter into worm.html.
	url = "http://parahumans.wordpress.com/category/stories-arcs-1-10/arc-1-gestation/1-01/"
	#endUrl = "http://parahumans.wordpress.com/2013/11/19/interlude-end/"

	# Link-text pattern identifying the "Next Chapter" anchor.  The `*`
	# quantifiers are required: without them the pattern demands an extra
	# character on each side and misses a link whose text is exactly
	# "Next Chapter".
	nextChapterRe = re.compile('.*Next.Chapter.*')

	# The context manager closes worm.html even when a fetch or parse step
	# raises.
	with open('worm.html', 'w') as f:
		f.write("<h1>Worm</h1>\n")

		# Keep traversing until post with no "Next Chapter" is hit.
		while True:
			html = urlopen(url.encode('utf-8').strip()).read()
			soup = BeautifulSoup(html)

			# Demote the post title from <h1> to <h2>, below the document's <h1>.
			title = soup.find("h1", class_="entry-title")
			title.name = "h2"
			print(title.text)

			content = soup.find("div", class_="entry-content")

			# Read the next chapter's URL before the links are stripped below.
			nextChapter_tags = soup.find_all("a", text=nextChapterRe)
			nextUrl = nextChapter_tags[0]['href'] if nextChapter_tags else None

			# Remove all links (includes last/next chapter links).
			for tag in content.find_all("a", href=True):
				tag.decompose()

			# Remove "share this" -- only if the page actually has that block.
			shareDiv = content.find("div", id="jp-post-flair")
			if shareDiv is not None:
				shareDiv.decompose()

			f.write(title.prettify().encode('utf-8').strip() + '\n')
			f.write(content.prettify().encode('utf-8').strip() + '\n')
			f.write("<hr>")

			if nextUrl is None:
				break
			url = nextUrl