quicklystarfish/MoR-cover.jpg

## ffn_audiobook_generator.py
#!/usr/bin/env python2.7
# requires OS X 10.6+

import urllib2
import subprocess

# third-party packages, available on PyPI
import lxml.cssselect
import lxml.etree
import mutagen

selector = lxml.cssselect.CSSSelector

def get_document(url):
    """Retrieves a URL and parses the response as an HTML document.

    url -- address to fetch with urllib2.urlopen.

    Returns the root element produced by lxml's HTML parser.  Every
    non-ASCII byte of the response is dropped before parsing.
    """

    response = urllib2.urlopen(url)
    try:
        data = response.read()
    finally:
        # Release the connection deterministically, even if read() fails,
        # instead of leaving it to the garbage collector.
        response.close()

    # "ignore" silently discards anything that is not plain ASCII.
    ascii_text = data.decode("ascii", "ignore")

    return lxml.etree.fromstring(ascii_text,
                                 lxml.etree.HTMLParser(encoding="UTF-8"))

def get_chapters(story_id):
    """Yields (number, title) for each chapter listed on a story's first page.

    story_id -- fanfiction.net story id used to build the page URL.

    The chapter list is read from the first element matching
    "select[name*=chapter]".  Each non-blank text node inside it is split
    at the first ". " into a chapter number and a title.

    Raises IndexError (on first iteration) when the page contains no
    matching element, and ValueError when a label does not start with an
    integer.
    """
    print("Retrieving chapter index...")

    document = get_document("http://www.fanfiction.net/s/{}/{}/"
                            .format(story_id, 1))
    chapter_selector = selector("select[name*=chapter]")(document)[0]

    for label in lxml.etree.ElementTextIterator(chapter_selector):
        # The iterator yields every text node, including whitespace that
        # may sit between <option> tags; such nodes describe no chapter
        # and would make int() fail.
        if not label.strip():
            continue

        n, _, title = label.partition(". ")
        yield (int(n), title)

def get_chapter_text(story_id, chapter):
    """Returns the text of one chapter as a single string.

    story_id -- fanfiction.net story id.
    chapter  -- chapter number within the story.

    The text is taken from the first element matching "#storytext".
    Each <hr> inside it is followed by the words "section-break", the
    <i>, <b> and <a> tags are stripped (their text is kept), and the
    remaining text nodes are joined with blank lines.
    """
    print("Retrieving text of chapter {}/{}".format(story_id, chapter))

    url = "http://www.fanfiction.net/s/{}/{}/".format(story_id, chapter)
    document = get_document(url)
    story_body = selector("#storytext")(document)[0]

    # Put a marker in front of whatever text follows each rule.
    marker = "section-break\n\n"
    for rule in selector("hr")(story_body):
        rule.tail = marker + (getattr(rule, "tail", None) or "")

    # Merge inline markup into the surrounding text so it does not split
    # a sentence into separate text nodes.
    lxml.etree.strip_tags(story_body, "i", "b", "a")

    pieces = lxml.etree.ElementTextIterator(story_body)
    return "\n\n".join(pieces)

def get_text_by_chapter(story_id, chapters):
    """Yields (number, title, body) for each chapter in a story.

    story_id -- fanfiction.net story id.
    chapters -- iterable of (number, title) pairs.

    The body starts with a "Chapter <number>: <title>" heading followed
    by the chapter text, which is fetched lazily as each item is consumed.
    """

    template = "Chapter {}: {}\n\n{}"

    for number, chapter_title in chapters:
        chapter_body = get_chapter_text(story_id, number)
        yield number, chapter_title, template.format(number, chapter_title,
                                                     chapter_body)

def say(s, *a):
    """Executes the say command with the specified data and arguments.

    s  -- text written to the command's standard input.
    *a -- extra command-line arguments appended after "say".

    Raises subprocess.CalledProcessError when the command exits with a
    non-zero status.
    """

    command = ["say"] + list(a)
    process = subprocess.Popen(command, stdin=subprocess.PIPE)
    process.communicate(s)

    # Surface a failed synthesis here rather than letting the caller go on
    # to work with a missing or incomplete output file.
    if process.returncode != 0:
        raise subprocess.CalledProcessError(process.returncode, command)

import mutagen.m4a

def dump_story(story_id, story_title="", story_author="",
               cover=None, comment=None):
    """Generates a sequence of m4a files for a story.

    One file is synthesized per chapter with the say command and then
    tagged with its track number and chapter name.

    story_id     -- fanfiction.net story id used to fetch the chapters.
    story_title  -- optional title; spoken before chapter 1, stored as the
                    album tag and used as the file name prefix (the story
                    id is the prefix when the title is empty).
    story_author -- optional author; spoken before chapter 1 and stored as
                    the artist tag.
    cover        -- optional value stored as-is under the "covr" tag.
    comment      -- optional text for the comment tag; when None, a
                    default comment naming the source URL and this script
                    is written instead.
    """

    # Resolve the default once, before the loop, so that every chapter --
    # including the first -- receives the same comment tag.
    if comment is None:
        comment = ("Source: http://www.fanfiction.net/s/"
                   "{}. Generated using Mac OS X 10.6's "
                   "speech synthesis by a script available at "
                   "https://gist.github.com/982770."
                   .format(story_id))

    chapters = list(get_chapters(story_id))
    chapter_texts = get_text_by_chapter(story_id, chapters)

    for n, title, text in chapter_texts:
        if n == 1:
            # Spoken preamble for the first chapter, built back to front:
            # title, then author, then the source notice, then the text.
            text = "Generated from the text at fanfiction.net/s/{}\n\n".format(story_id) + text
            if story_author:
                text = "by " + story_author + "\n\n" + text

            if story_title:
                text = story_title + "\n\n" + text

        filename = "{}-{:03d}-{}.m4a".format(story_title or story_id, n, title)
        # A "/" in a title would be read as a directory separator and send
        # the output to a path that does not exist.
        filename = filename.replace("/", "-")

        print("Writing {}".format(filename))
        say(text, "-o", filename)
        print("Writing meta info")

        info = mutagen.m4a.M4A(filename)
        info["trkn"] = (n, len(chapters))

        info["\xa9nam"] = "Chapter {}: {}".format(n, title)

        if story_title:
            info["\xa9alb"] = story_title

        if story_author:
            info["\xa9ART"] = story_author

        if cover:
            info["covr"] = cover

        info["\xa9cmt"] = comment

        info.save()

if __name__ == "__main__":
    # Read the cover art inside a with-block so the file handle is closed
    # as soon as its bytes are in memory.
    with open("MoR-cover.jpg", "rb") as cover_file:
        cover_data = cover_file.read()

    cover = mutagen.m4a.M4ACover(cover_data, mutagen.m4a.M4ACover.FORMAT_JPEG)
    dump_story(5782108, "Harry Potter and the Methods of Rationality", "Eliezer Yudkowsky", cover)

## MoR-cover.jpg

      
    Raw
  

              MoR-cover.jpg
	#!/usr/bin/env python2.7
	# requires OS X 10.6+

	import urllib2
	import subprocess

	# available in pypi
	import lxml.cssselect
	import lxml.etree
	import mutagen

	selector = lxml.cssselect.CSSSelector

	def get_document(url):
	"""Retrieves a URL and parses the response as an HTML document."""

	data = urllib2.urlopen(url).read()

	return lxml.etree.fromstring(data.decode("ascii", "ignore"), lxml.etree.HTMLParser(encoding="UTF-8"))

	def get_chapters(story_id):
	print "Retrieving chapter index..."

	document = get_document("http://www.fanfiction.net/s/{}/{}/"
	.format(story_id, 1))
	chapter_selector = selector("select[name*=chapter]")(document)[0]

	for label in lxml.etree.ElementTextIterator(chapter_selector):
	n, _, title = label.partition(". ")
	yield (int(n), title)

	def get_chapter_text(story_id, chapter):
	print "Retrieving text of chapter {}/{}".format(story_id, chapter)

	document = get_document("http://www.fanfiction.net/s/{}/{}/"
	.format(story_id, chapter))

	text_element = selector("#storytext")(document)[0]

	for hr in selector("hr")(text_element):
	if getattr(hr, "tail", None):
	hr.tail = "section-break\n\n" + hr.tail
	else:
	hr.tail = "section-break\n\n"

	lxml.etree.strip_tags(text_element, "i", "b", "a")
	return "\n\n".join(lxml.etree.ElementTextIterator(text_element))

	def get_text_by_chapter(story_id, chapters):
	"""Yields (number, title, body) for each chapter in a story."""

	for n, title in chapters:
	yield n, title, "Chapter {}: {}\n\n{}".format(n, title, get_chapter_text(story_id, n))

	def say(s, *a):
	"""Executes the say command with the specified data and arguments."""

	subprocess.Popen(["say"] + list(a), stdin=subprocess.PIPE).communicate(s)

	import mutagen.m4a

	def dump_story(story_id, story_title="", story_author="",
	cover=None, comment=None):
	"""Generates a sequence of m4a files for a story."""

	chapters = list(get_chapters(story_id))
	methods_text = get_text_by_chapter(story_id, chapters)

	for n, title, text in methods_text:
	if n == 1:
	text = "Generated from the text at fanfiction.net/s/{}\n\n".format(story_id) + text
	if story_author:
	text = "by " + story_author + "\n\n" + text

	if story_title:
	text = story_title + "\n\n" + text


	filename = "{}-{:03d}-{}.m4a".format(story_title or story_id, n, title)

	print "Writing", filename
	say(text, "-o", filename)
	print "Writing meta info"

	info = mutagen.m4a.M4A(filename)
	info["trkn"] = (n, len(chapters))

	info["\xa9nam"] = "Chapter {}: {}".format(n, title)

	if story_title:
	info["\xa9alb"] = story_title

	if story_author:
	info["\xa9ART"] = story_author

	if cover:
	info["covr"] = cover

	if comment is not None:
	info["\xa9cmt"] = comment
	else:
	comment = ("Source: http://www.fanfiction.net/s/"
	"{}. Generated using Mac OS X 10.6's "
	"speech synthesis by a script available at "
	"https://gist.github.com/982770."
	.format(story_id))

	info.save()

	if __name__ == "__main__":
	cover = mutagen.m4a.M4ACover(open("MoR-cover.jpg", "rb").read(), mutagen.m4a.M4ACover.FORMAT_JPEG)
	dump_story(5782108, "Harry Potter and the Methods of Rationality", "Eliezer Yudkowsky", cover)