Skip to content

Instantly share code, notes, and snippets.

@DanielOaks
Created October 10, 2015 22:41
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save DanielOaks/ccda1d19b456e60c1607 to your computer and use it in GitHub Desktop.
Save DanielOaks/ccda1d19b456e60c1607 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python3
# Given an ePub file from FicSave, generates LaTeX files
# Written by Daniel Oaks, released under the ISC license
# NOTE: docopt parses this text at runtime, so its layout is part of the
# program: the usage patterns are indented under "Usage:" and closed by a
# blank line, and every option is separated from its description by at least
# two spaces (with a single space docopt reads the first word of the
# description as the option's argument).
__doc__ = """Make-LaTeX.

Given an ePub file from FicSave, generates LaTeX files.

Usage:
    make-latex <name> [options]
    make-latex -h | --help
    make-latex -v | --version

Options:
    --silent      Suppresses mundane printouts, prints what's important
    --verbose     Printout more information than normal
    -h --help     Show help
    -v --version  Show version
"""
import re
import os
import zipfile
from docopt import docopt
# We should probably create a class that provides a simplified interface to both lxml and pq's
# attributes from a single, unified object, but that's a little overkill for this project right now
from pyquery import PyQuery as pq
import lxml, lxml.html
def lx(element):
    """Return a given element as an lxml element.

    Accepts an ``lxml.html.HtmlElement`` (returned unchanged), a PyQuery
    object (re-parsed from its ``__html__()`` markup) or a string of HTML
    (parsed as a fragment). For the latter two, the first parsed fragment is
    returned. Any other type yields ``None``.
    """
    # isinstance() rather than an exact type comparison, so subclasses of
    # HtmlElement, PyQuery and str are handled as well.
    if isinstance(element, lxml.html.HtmlElement):
        return element
    if isinstance(element, pq):
        return lxml.html.fragments_fromstring(element.__html__())[0]
    if isinstance(element, str):
        return lxml.html.fragments_fromstring(element)[0]
    return None
# text parser
def StripAn(contents, chap_name=''):
    """Strip the leading chapter header and trailing author's note in place.

    ``contents`` is a mutable sequence whose items are either strings or
    nodes that ``Parse`` can render to text.

    Leading junk: if the chapter name occurs in the combined text of the
    first 20 items, every item up to and including the first one that
    contains the name by itself is deleted. The comparison ignores case,
    spaces and hyphens. An empty ``chap_name`` matches any text, so the
    first item is then always dropped.

    Trailing note: if ``a/n`` (any case) occurs in the combined text of the
    last 50 items, every item from the end back to and including the last
    one that contains ``a/n`` by itself is deleted.

    If the marker only shows up in the combined text (for example because it
    is split across two nodes), nothing is deleted for that end. Each deleted
    item is echoed with ``print``. Returns the same ``contents`` object.
    """

    def node_text(node):
        # Strings are used verbatim; anything else is rendered through Parse.
        return node if isinstance(node, str) else Parse(node)

    def squash(value):
        # Normalised form used to spot the title. Both the title and the
        # text go through it, so hyphenated chapter names can match.
        return value.lower().replace(' ', '').replace('-', '')

    # strip initial junk
    title_key = squash(chap_name)
    head_text = ''.join(node_text(node) for node in contents[:20])
    if title_key in squash(head_text):
        seen = []
        for node in contents:
            seen.append(node_text(node))
            if title_key in squash(seen[-1]):
                for text in seen:
                    print(chap_name, text.replace('\n', ' '))
                # Delete only once a matching node is known to exist, so a
                # miss can never empty the sequence or index past its end.
                del contents[:len(seen)]
                break

    # strip author's note at end of chapter
    tail_text = ''.join(node_text(node) for node in contents[-50:])
    if 'a/n' in tail_text.lower():
        seen = []
        for node in reversed(contents):
            seen.append(node_text(node))
            if 'a/n' in seen[-1].lower():
                for text in seen:
                    print(chap_name, text[:50].replace('\n', ' '))
                del contents[-len(seen):]
                break
    return contents
# Matches an \emph{...} group whose body is empty or whitespace-only. The body
# uses \s* rather than \s+ because Parse rewrites '\emph{ ' to ' \emph{'
# before applying this pattern, which leaves a completely empty '\emph{}'.
RE_EMPTY_EMPH = re.compile(r'\\emph\{\s*\}')
# RE_FINAL_VSPACE = re.compile(r'\n\\vspace\{1.2cm\}\s+\Z')
def FixQuotes(in_string):
    """Return ``in_string`` with straight quotes turned into LaTeX quotes.

    Double quotes alternate between an opening pair of backticks and a
    closing pair of apostrophes. A single quote opens (becomes a backtick)
    only when it starts the string or follows whitespace; the next single
    quote closes it. Any other single quote, such as an apostrophe inside a
    word, is copied through unchanged.
    """
    pieces = []
    previous = ''
    single_open = False
    double_open = False
    for character in in_string:
        if character == '"':
            pieces.append("''" if double_open else '``')
            double_open = not double_open
        elif character == "'":
            if single_open:
                single_open = False
                pieces.append("'")
            elif previous == '' or previous.isspace():
                # Start of the string counts as a word boundary too, so a
                # paragraph that begins with a quote gets an opening mark.
                single_open = True
                pieces.append('`')
            else:
                pieces.append("'")
        else:
            pieces.append(character)
        previous = character
    return ''.join(pieces)
def Parse(html, is_base=False, chap_name=''):
    """Render an HTML node (and, recursively, its children) as LaTeX text.

    ``html`` is anything PyQuery accepts. ``is_base`` marks the outermost
    story container: its child nodes are first passed through ``StripAn``
    with ``chap_name``, and the final text is stripped and terminated with a
    single newline.

    Tag handling: ``br`` contributes a blank line, ``em`` is wrapped in an
    emph group, ``strong`` in a bfseries group, and ``p`` gets quote fixing,
    ellipsis conversion, punctuation clean-ups, LaTeX escaping and a
    trailing blank line. If the resulting text contains a run of sixteen
    hyphens, the whole node is replaced by a 1.2cm vspace command.
    """
    html = pq(html)
    tag = lx(html).tag
    text = ''
    if tag == 'br':
        text = '\n\n'
    html_contents = html.contents()
    if is_base:
        html_contents = StripAn(html_contents, chap_name)
    for content in html_contents:
        if isinstance(content, str):
            # NOTE(review): the second replace() reads as space -> space here,
            # which is a no-op; presumably the search string was originally a
            # double or special space - confirm against the original file.
            text += content.replace('\n', ' ').replace(' ', ' ')
        else:
            text += Parse(content)
    if tag == 'em':
        text = '\\emph{{{}}}'.format(text)
    if tag == 'strong':
        text = '{{\\bfseries {} }}'.format(text)
    if tag == 'p':
        # The replace chains below are order-dependent; they move quote marks
        # and punctuation across the boundaries of emph/ldots groups.
        text = FixQuotes(text.strip().replace('...', '{\\ldots}').replace("'' }", "} ''").replace('`` ', '``').replace(" ''", "''"))
        # Backslashes are spelled '\\' throughout: '\l' and '\e' are invalid
        # escape sequences in a normal string literal.
        text = text.replace('``\\emph{', '\\emph{``').replace("}''", "''}").replace("}'", "'}").replace("{\\ldots''}", "{\\ldots}''").replace("{\\ldots'}", "{\\ldots}'")
        text = text.replace("``\\emph{ ", "\\emph{ ``").replace("`\\emph{ ", "\\emph{ `").replace("``{\\ldots}\\emph{", "\\emph{``{\\ldots}").replace("`{\\ldots}{\\emph", "{\\emph`{\\ldots}").replace('} .', '}.').replace('} ,', '},').replace('} !', '}!')
        text = text.replace('} ,', '},').replace('`` ', '``')
        # Escape the characters LaTeX treats specially in running text. None
        # of them is produced by the markup generated above, so escaping at
        # this point cannot damage a generated command.
        for special in '&%$#_':
            text = text.replace(special, '\\' + special)
        text = text + '\n\n'
    if is_base:
        text = text.strip() + '\n'
    if '----------------' in text:
        # A long run of hyphens is a scene break: emit vertical space instead.
        return '\\vspace{1.2cm}\n\n'
    # NOTE(review): as it reads here, the first replace() removes every space
    # and the second is a no-op. The search strings were presumably special
    # whitespace characters (and a double space) that are indistinguishable
    # from a plain space in this copy - confirm before relying on or changing
    # them.
    text = text.replace(' ', '').replace(' ', ' ').replace('\\emph{ ', ' \\emph{').replace(' }', '} ')
    text = RE_EMPTY_EMPH.sub('', text)  # remove empty \emph{} blocks
    return text
# application
def main():
    """Convert ``<name>/fic.epub`` into one LaTeX file per chapter.

    The ePub is extracted to ``<name>/working``. Chapters are read from
    ``OEBPS/Chapter1.html``, ``Chapter2.html`` and so on, until the next
    number is missing. Each one is written to ``<name>/latex/chapter_N.tex``,
    followed by an ``all_chapters.tex`` that inputs every chapter in order.
    """
    args = docopt(__doc__)
    folder_name = args['<name>']
    epub_name = os.path.join(folder_name, 'fic.epub')
    working_folder = os.path.join(folder_name, 'working')

    # extract epub file
    with zipfile.ZipFile(epub_name) as epub:
        epub.extractall(working_folder)

    # get chapters
    chapter_html_path_template = os.path.join(working_folder, 'OEBPS', 'Chapter{}.html')
    chapter_latex_folder = os.path.join(folder_name, 'latex')
    os.makedirs(chapter_latex_folder, exist_ok=True)
    chapter_latex_path_template = os.path.join(chapter_latex_folder, 'chapter_{}.tex')

    chapter_number = 0
    while True:
        chapter_number += 1
        chapter_html_path = chapter_html_path_template.format(chapter_number)
        chapter_latex_path = chapter_latex_path_template.format(chapter_number)

        # make sure chapter exists
        if not os.path.exists(chapter_html_path):
            break

        # open chapter (the files declare themselves as utf-8, so do not
        # depend on the platform's default encoding)
        with open(chapter_html_path, 'r', encoding='utf-8') as chapter_html_file:
            chapter_html = chapter_html_file.read()

        # dumb optional-linebreak character
        chapter_html = chapter_html.replace('&#13;', '')
        # need to strip encoding declaration below or lxml yells at us.
        # str.lstrip() takes a set of characters rather than a prefix and
        # could eat into the markup that follows, so remove the declaration
        # itself with an anchored pattern.
        chapter_html = re.sub(r'\A\s*<\?xml[^>]*\?>', '', chapter_html, count=1)
        chapter_html = chapter_html.replace('xmlns="http://www.w3.org/1999/xhtml"', '')
        chapter_pq = pq(chapter_html).remove_namespaces()

        # parse out title and output text: the title is whatever follows the
        # first colon of the heading, or the whole heading when it has none
        heading = Parse(chapter_pq('h2'))
        _, colon, after_colon = heading.partition(':')
        chapter_title = (after_colon if colon else heading).strip()
        chapter_text = Parse(chapter_pq('#storytext'), is_base=True, chap_name=chapter_title)

        # print output text
        with open(chapter_latex_path, 'w', encoding='utf-8') as latex_file:
            latex_file.write('\\chapter{{ {} }}'.format(chapter_title))
            latex_file.write('\n\n')
            latex_file.write(chapter_text)

    # throw out all chapters tex file
    # NOTE(review): the chapter files are written to the 'latex' folder while
    # the generated \input lines point at './chapters/'; presumably the folder
    # is copied or renamed inside the LaTeX project - confirm.
    all_chapters_filename = os.path.join(chapter_latex_folder, 'all_chapters.tex')
    with open(all_chapters_filename, 'w', encoding='utf-8') as all_chapters:
        # chapter_number is the first chapter that did NOT exist
        for i in range(1, chapter_number):
            all_chapters.write('\\input{{./chapters/chapter_{}.tex}}\n'.format(i))


# application
if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment