Created
May 20, 2011 11:41
-
-
Save quicklystarfish/982770 to your computer and use it in GitHub Desktop.
Audiobook generator using OS X 10.6+'s speech synthesis.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python2.7 | |
# requires OS X 10.6+ | |
import urllib2 | |
import subprocess | |
# available in pypi | |
import lxml.cssselect | |
import lxml.etree | |
import mutagen | |
# Shorthand used below: selector(css_expression)(element) gives the elements
# under `element` that match the CSS expression.
selector = lxml.cssselect.CSSSelector
def get_document(url):
    """Retrieve a URL and parse the response as an HTML document.

    The response body is decoded as ASCII with undecodable bytes dropped
    before being handed to lxml's HTML parser.

    url -- the address to fetch.

    Returns the parsed document as produced by lxml.etree.fromstring.
    """
    response = urllib2.urlopen(url)
    try:
        data = response.read()
    finally:
        # Release the connection even if the read fails part-way through.
        response.close()
    text = data.decode("ascii", "ignore")
    return lxml.etree.fromstring(text, lxml.etree.HTMLParser(encoding="UTF-8"))
def get_chapters(story_id):
    """Yield (number, title) for each chapter listed on a story's first page.

    The chapter list is read from the first <select> element whose name
    contains "chapter". Each label is split at the first ". " into the
    chapter number and its title.

    story_id -- the story id used to build the page URL.

    Raises IndexError if the page has no matching <select> element.
    """
    print("Retrieving chapter index...")
    document = get_document(
        "http://www.fanfiction.net/s/{}/{}/".format(story_id, 1))
    chapter_select = selector("select[name*=chapter]")(document)[0]
    for label in lxml.etree.ElementTextIterator(chapter_select):
        # Whitespace-only text nodes between the options are not chapter
        # labels, and int() would raise ValueError on them.
        if not label.strip():
            continue
        number, _, title = label.partition(". ")
        yield (int(number), title)
def get_chapter_text(story_id, chapter):
    """Fetch one chapter page and return the story text as a single string.

    Every <hr> inside the "#storytext" element gets the marker
    "section-break" inserted after it, the <i>, <b> and <a> tags are
    stripped (their text is kept), and the remaining text fragments are
    joined with blank lines.
    """
    print("Retrieving text of chapter {}/{}".format(story_id, chapter))
    url = "http://www.fanfiction.net/s/{}/{}/".format(story_id, chapter)
    page = get_document(url)
    story_node = selector("#storytext")(page)[0]

    for rule in selector("hr")(story_node):
        # Prefix whatever already follows the rule (possibly nothing).
        rule.tail = "section-break\n\n" + (getattr(rule, "tail", None) or "")

    lxml.etree.strip_tags(story_node, "i", "b", "a")
    fragments = lxml.etree.ElementTextIterator(story_node)
    return "\n\n".join(fragments)
def get_text_by_chapter(story_id, chapters):
    """Generate a (number, title, body) triple for every chapter given.

    chapters is an iterable of (number, title) pairs. Each body starts with
    a "Chapter N: Title" heading followed by the chapter's text, which is
    fetched lazily as the generator is consumed.
    """
    template = "Chapter {}: {}\n\n{}"
    for number, heading in chapters:
        chapter_text = get_chapter_text(story_id, number)
        yield number, heading, template.format(number, heading, chapter_text)
def say(s, *a):
    """Execute the say command with the specified data and arguments.

    s -- the text to speak; it is written to the command's standard input.
    a -- extra command-line arguments appended after "say"
         (for example "-o", filename).

    Raises subprocess.CalledProcessError if the command exits with a
    non-zero status, so a failed synthesis is reported here instead of
    showing up later as a missing output file.
    """
    command = ["say"] + list(a)
    process = subprocess.Popen(command, stdin=subprocess.PIPE)
    process.communicate(s)
    if process.returncode != 0:
        raise subprocess.CalledProcessError(process.returncode, command)
import mutagen.m4a | |
def dump_story(story_id, story_title="", story_author="",
               cover=None, comment=None):
    """Generate a sequence of tagged m4a files, one per chapter of a story.

    story_id     -- story id used to build the fanfiction.net page URLs.
    story_title  -- optional; spoken in the introduction, used as the file
                    name prefix and stored as the album tag.
    story_author -- optional; spoken in the introduction and stored as the
                    artist tag.
    cover        -- optional value stored under the "covr" tag.
    comment      -- optional text for the comment tag; when None, a default
                    naming the source URL and this script is used.

    Files are written relative to the current directory and are named
    "<title or id>-<NNN>-<chapter title>.m4a".
    """
    if comment is None:
        # Build the default once, before the loop, so that every chapter
        # (including the first) is tagged with it.
        comment = ("Source: http://www.fanfiction.net/s/"
                   "{}. Generated using Mac OS X 10.6's "
                   "speech synthesis by a script available at "
                   "https://gist.github.com/982770."
                   .format(story_id))

    chapters = list(get_chapters(story_id))
    chapter_texts = get_text_by_chapter(story_id, chapters)

    for n, title, text in chapter_texts:
        if n == 1:
            # Spoken introduction, prepended in reverse so it reads:
            # title, author, source, then the chapter itself.
            # NOTE(review): the nesting of the author/title lines under the
            # first-chapter check is reconstructed -- confirm it matches
            # the intended layout.
            text = "Generated from the text at fanfiction.net/s/{}\n\n".format(story_id) + text
            if story_author:
                text = "by " + story_author + "\n\n" + text
            if story_title:
                text = story_title + "\n\n" + text

        # Titles come from the web page; a "/" would be treated as a
        # directory separator and make the output path invalid.
        filename = "{}-{:03d}-{}.m4a".format(
            story_title or story_id, n, title).replace("/", "-")

        print("Writing " + filename)
        say(text, "-o", filename)

        print("Writing meta info")
        info = mutagen.m4a.M4A(filename)
        info["trkn"] = (n, len(chapters))
        info["\xa9nam"] = "Chapter {}: {}".format(n, title)
        if story_title:
            info["\xa9alb"] = story_title
        if story_author:
            info["\xa9ART"] = story_author
        if cover:
            info["covr"] = cover
        info["\xa9cmt"] = comment
        info.save()
if __name__ == "__main__":
    # Read the cover art inside a context manager so the file handle is
    # closed before the long-running download and synthesis begins.
    with open("MoR-cover.jpg", "rb") as cover_file:
        cover_data = cover_file.read()
    cover = mutagen.m4a.M4ACover(cover_data, mutagen.m4a.M4ACover.FORMAT_JPEG)
    dump_story(5782108, "Harry Potter and the Methods of Rationality", "Eliezer Yudkowsky", cover)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment