Created
October 10, 2015 22:41
-
-
Save DanielOaks/ccda1d19b456e60c1607 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3
# Given an ePub file from FicSave, generates LaTeX files
# Written by Daniel Oaks, released under the ISC license
# docopt parses this text: the usage patterns must be indented under
# "Usage:", the sections separated by blank lines, and each option separated
# from its description by at least two spaces.
__doc__ = """Make-LaTeX.
Given an ePub file from FicSave, generates LaTeX files.

Usage:
    make-latex <name> [options]
    make-latex -h | --help
    make-latex -v | --version

Options:
    --silent      Suppresses mundane printouts, prints what's important
    --verbose     Printout more information than normal
    -h --help     Show help
    -v --version  Show version
"""
import re | |
import os | |
import zipfile | |
from docopt import docopt | |
# We should probably create a class that provides a simplified interface to both lxml and pq's
# attributes from a single, unified object, but that's a little overkill for this project right now
from pyquery import PyQuery as pq | |
import lxml, lxml.html | |
def lx(element):
    """Return a given element as an lxml element.

    Args:
        element: An ``lxml.html.HtmlElement`` (returned unchanged), a PyQuery
            object (serialized with ``__html__()`` and re-parsed), or a
            string of markup (parsed directly).

    Returns:
        The element itself, or the first fragment returned by
        ``lxml.html.fragments_fromstring`` for the markup.

    Raises:
        TypeError: If *element* is none of the supported types. Previously
            the function fell off the end and returned ``None``, which only
            failed later at the caller's attribute access.
    """
    # isinstance instead of exact type comparison so that subclasses of the
    # supported types are accepted as well.
    if isinstance(element, lxml.html.HtmlElement):
        return element
    if isinstance(element, pq):
        return lxml.html.fragments_fromstring(element.__html__())[0]
    if isinstance(element, str):
        return lxml.html.fragments_fromstring(element)[0]
    raise TypeError(
        'lx() expects an HtmlElement, PyQuery object or str, got {}'.format(
            type(element).__name__))
# text parser
def _node_text(content):
    """Return *content* as text: strings as-is, other nodes via Parse()."""
    if isinstance(content, str):
        return content
    return Parse(content)


def _squash(text):
    """Lower-case *text* and drop spaces and hyphens for loose matching."""
    return text.lower().replace(' ', '').replace('-', '')


def StripAn(contents, chap_name=''):
    """Strip the leading chapter heading and a trailing author's note.

    *contents* is modified in place and also returned.

    Leading junk: if the squashed chapter name occurs in the combined text of
    the first 20 nodes, every node up to and including the first node that
    contains the name on its own is removed.

    Trailing note: if ``a/n`` (case-insensitive) occurs in the combined text
    of the last 50 nodes, the last node containing it and everything after it
    are removed.

    Each removed node is echoed with ``print`` (truncated to 50 characters
    for the trailing note).

    Args:
        contents: Mutable sequence of text strings and/or nodes that
            ``Parse`` can convert to text.
        chap_name: Chapter title to look for. An empty title (the default)
            disables the leading strip; the empty string is a substring of
            everything, so it used to delete the first node unconditionally.

    Returns:
        The same *contents* object.
    """
    # strip initial junk
    # Hyphens are dropped from the needle too: the haystack never contains
    # any after squashing, so a hyphenated title could otherwise never match.
    needle = _squash(chap_name)
    if needle:
        head_window = ''.join(_node_text(node) for node in contents[:20])
        if needle in _squash(head_window):
            dropped = []
            for node in contents:
                piece = _node_text(node)
                dropped.append(piece)
                if needle in _squash(piece):
                    for removed in dropped:
                        print(chap_name, removed.replace('\n', ' '))
                    del contents[:len(dropped)]
                    break
            # No single node holds the title (it only matched across the
            # concatenated window): leave contents untouched instead of
            # deleting every node and running off the end of the list.

    # strip author's note at end of chapter
    tail_window = ''.join(_node_text(node) for node in contents[-50:])
    if 'a/n' in tail_window.lower():
        dropped = []
        for node in reversed(contents):
            piece = _node_text(node)
            dropped.append(piece)
            if 'a/n' in piece.lower():
                for removed in dropped:
                    print(chap_name, removed[:50].replace('\n', ' '))
                del contents[-len(dropped):]
                break
        # As above: nothing is removed when no single node contains 'a/n'.

    return contents
# Matches an \emph{...} group whose body is empty or whitespace-only.
# \s* (not \s+) so that a completely empty \emph{} is matched as well; the
# braces are escaped so they can never be read as a repetition quantifier.
RE_EMPTY_EMPH = re.compile(r'\\emph\{\s*\}')
# RE_FINAL_VSPACE = re.compile(r'\n\\vspace\{1.2cm\}\s+\Z')
def FixQuotes(in_string):
    """Convert straight quotes in *in_string* to LaTeX-style quotes.

    Double quotes alternate between an opening pair of backticks and a
    closing pair of apostrophes. A single quote opens a quotation (and
    becomes a backtick) only when it is at the start of the string or follows
    whitespace; the next single quote closes it. Any other single quote is
    kept as an apostrophe.

    Args:
        in_string: Text containing ``"`` and ``'`` characters.

    Returns:
        The text with the quote characters replaced.
    """
    pieces = []
    previous = ''
    in_single_quote = False
    in_double_quote = False

    for character in in_string:
        if character == '"':
            pieces.append("''" if in_double_quote else '``')
            in_double_quote = not in_double_quote
        elif character == "'":
            if in_single_quote:
                in_single_quote = False
                pieces.append("'")
            elif previous == '' or previous.isspace():
                # Start of string and any whitespace count as an opening
                # position, not just a literal space, so a paragraph that
                # begins with a quotation gets a proper opening quote.
                in_single_quote = True
                pieces.append('`')
            else:
                # mid-word apostrophe, e.g. "don't"
                pieces.append("'")
        else:
            pieces.append(character)
        previous = character

    return ''.join(pieces)
def Parse(html, is_base=False, chap_name=''):
    """Convert an HTML node and its children into LaTeX-flavoured text.

    Text children are appended with newlines turned into spaces; element
    children are converted recursively. The result is then wrapped or
    post-processed according to the node's tag (em, strong, p, br).

    Args:
        html: Anything ``pq()`` accepts; it is wrapped in a PyQuery object.
        is_base: When true, the node's contents are first filtered through
            ``StripAn`` and the final text is stripped and terminated with a
            single newline.
        chap_name: Chapter title forwarded to ``StripAn``; only used when
            *is_base* is true.

    Returns:
        The converted text, or a fixed vertical-space command followed by a
        blank line when the converted text contains the hyphen-run separator.
    """
    html = pq(html)
    tag = lx(html).tag

    # a <br> starts with a paragraph break; everything else starts empty
    text = '\n\n' if tag == 'br' else ''

    html_contents = html.contents()
    if is_base:
        html_contents = StripAn(html_contents, chap_name)

    for content in html_contents:
        if isinstance(content, str):
            # NOTE(review): the second replace reads as space -> space here,
            # which is a no-op; it presumably targeted a non-breaking or
            # doubled space in the original file -- confirm.
            text += content.replace('\n', ' ').replace(' ', ' ')
        else:
            text += Parse(content)

    if tag == 'em':
        text = '\\emph{{{}}}'.format(text)
    if tag == 'strong':
        text = '{{\\bfseries {} }}'.format(text)
    if tag == 'p':
        # The replacement passes below are order-dependent; keep the order.
        # Backslashes are written as '\\' throughout: "\l" and "\e" are not
        # valid Python escapes and only worked as literal backslashes by
        # accident of the parser's leniency.
        text = FixQuotes(
            text.strip()
            .replace('...', '{\\ldots}')
            .replace("'' }", "} ''")
            .replace('`` ', '``')
            .replace(" ''", "''")
        )
        # move quote marks to the intended side of \emph{...} and {\ldots}
        text = (
            text.replace('``\\emph{', '\\emph{``')
            .replace("}''", "''}")
            .replace("}'", "'}")
            .replace("{\\ldots''}", "{\\ldots}''")
            .replace("{\\ldots'}", "{\\ldots}'")
        )
        text = (
            text.replace("``\\emph{ ", "\\emph{ ``")
            .replace("`\\emph{ ", "\\emph{ `")
            .replace("``{\\ldots}\\emph{", "\\emph{``{\\ldots}")
            .replace("`{\\ldots}{\\emph", "{\\emph`{\\ldots}")
            .replace('} .', '}.')
            .replace('} ,', '},')
            .replace('} !', '}!')
        )
        text = text.replace('} ,', '},').replace('`` ', '``')
        # escape LaTeX's alignment character
        text = text.replace('&', '\\&')
        text = text + '\n\n'

    if is_base:
        text = text.strip() + '\n'

    # a run of hyphens is a scene separator: emit vertical space instead
    if '----------------' in text:
        return '\\vspace{1.2cm}\n\n'

    # NOTE(review): as shown here the first replace removes every plain
    # space and the second is a no-op. The first literal presumably held an
    # invisible character and the second a non-breaking or doubled space in
    # the original file -- confirm against the source and spell them as
    # explicit \u escapes.
    text = text.replace(' ', '').replace(' ', ' ').replace('\\emph{ ', ' \\emph{').replace(' }', '} ')
    text = RE_EMPTY_EMPH.sub('', text)  # remove empty \emph{} blocks
    return text
# application
if __name__ == '__main__':
    args = docopt(__doc__)

    # <name> is a folder holding fic.epub; all output goes below it
    folder_name = args['<name>']
    epub_name = os.path.join(folder_name, 'fic.epub')
    working_folder = os.path.join(folder_name, 'working')

    # extract epub file
    with zipfile.ZipFile(epub_name) as epub:
        epub.extractall(working_folder)

    # get chapters
    chapter_html_path_template = os.path.join(working_folder, 'OEBPS', 'Chapter{}.html')
    chapter_latex_folder = os.path.join(folder_name, 'latex')
    os.makedirs(chapter_latex_folder, exist_ok=True)
    chapter_latex_path_template = os.path.join(chapter_latex_folder, 'chapter_{}.tex')

    # Chapters are numbered from 1; stop at the first missing file. After
    # the loop chapter_number is one past the last chapter converted.
    chapter_number = 0
    while True:
        chapter_number += 1
        chapter_html_path = chapter_html_path_template.format(chapter_number)
        chapter_latex_path = chapter_latex_path_template.format(chapter_number)

        # make sure chapter exists
        if not os.path.exists(chapter_html_path):
            break

        # open chapter
        # Decode explicitly instead of relying on the locale: the XML
        # declaration stripped below names utf-8. The -sig variant also
        # drops a leading byte-order mark if one is present.
        with open(chapter_html_path, 'r', encoding='utf-8-sig') as chapter_html_file:
            chapter_html = chapter_html_file.read()

        # dumb optional-linebreak character
        # NOTE(review): the literal below reads as a plain space here, which
        # would remove every space from the markup; it presumably held an
        # invisible character in the original file -- confirm and spell it
        # as an explicit \u escape.
        chapter_html = chapter_html.replace(' ', '')

        # need to strip encoding declaration below or lxml yells at us
        # str.lstrip() takes a *set of characters*, not a prefix, so it could
        # also eat the start of the markup that follows; remove exactly one
        # leading <?xml ...?> declaration instead.
        chapter_html = re.sub(r'\A\s*<\?xml[^>]*\?>', '', chapter_html, count=1)
        chapter_html = chapter_html.replace('xmlns="http://www.w3.org/1999/xhtml"', '')

        chapter_pq = pq(chapter_html).remove_namespaces()

        # parse out title and output text
        # The title is whatever follows the first colon of the <h2> heading.
        # partition() keeps colons inside the title and falls back to the
        # whole heading when there is no colon, instead of raising IndexError.
        heading = Parse(chapter_pq('h2'))
        _, colon, after_colon = heading.partition(':')
        chapter_title = (after_colon if colon else heading).strip()
        chapter_text = Parse(chapter_pq('#storytext'), is_base=True, chap_name=chapter_title)

        # print output text
        # written as UTF-8 so the output does not depend on the locale
        with open(chapter_latex_path, 'w', encoding='utf-8') as latex_file:
            latex_file.write('\\chapter{{ {} }}'.format(chapter_title))
            latex_file.write('\n\n')
            latex_file.write(chapter_text)

    # throw out all chapters tex file
    # NOTE(review): the \input paths point at ./chapters/ while the chapter
    # files above are written to the 'latex' folder -- presumably the folder
    # is renamed or copied into the LaTeX project; confirm.
    all_chapters_filename = os.path.join(chapter_latex_folder, 'all_chapters.tex')
    with open(all_chapters_filename, 'w', encoding='utf-8') as all_chapters:
        all_chapters.writelines(
            '\\input{{./chapters/chapter_{}.tex}}\n'.format(i)
            for i in range(1, chapter_number)
        )
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment