# DanielOaks/ficsave-to-latex.py

## ficsave-to-latex.py
#!/usr/bin/env python3
# Given an ePub file from FicSave, generates LaTeX files
# Written by Daniel Oaks, released under the ISC license
__doc__ = """Make-LaTeX.

Given an ePub file from FicSave, generates LaTeX files.

Usage:
  make-latex <name> [options]
  make-latex -h | --help
  make-latex -v | --version

Options:
  --silent        Suppresses mundane printouts, prints what's important
  --verbose       Printout more information than normal
  -h --help       Show help
  -v --version    Show version
"""

import re
import os
import zipfile
from docopt import docopt

# We should probably create a class that provides a simplified interface to both lxml and pq's
# attributes from a single, unified object, but that's a little overkill for this project right now
from pyquery import PyQuery as pq
import lxml, lxml.html


def lx(element):
    """Return a given element as an lxml element.

    * An ``lxml.html.HtmlElement`` is returned unchanged.
    * A PyQuery object is serialised with ``__html__()`` and re-parsed; the
      first parsed fragment is returned.
    * A string is parsed as HTML; the first parsed fragment is returned.

    Any other type yields ``None``.
    """
    # isinstance() instead of an exact type() comparison, so subclasses of the
    # supported types are converted as well rather than falling through.
    if isinstance(element, lxml.html.HtmlElement):
        return element
    if isinstance(element, pq):
        return lxml.html.fragments_fromstring(element.__html__())[0]
    if isinstance(element, str):
        return lxml.html.fragments_fromstring(element)[0]
    return None


# text parser
def StripAn(contents, chap_name=''):
    """Strip the leading chapter header and a trailing author's note.

    ``contents`` is a mutable sequence whose items are either strings or
    elements that ``Parse`` can render to text. It is modified in place and
    also returned.

    Leading junk: if the chapter name occurs in the combined text of the
    first 20 items, every item up to and including the first single item
    that contains the name is removed. Case, spaces and hyphens are ignored
    on both sides of the comparison.

    Trailing note: if ``'a/n'`` occurs (case-insensitively) in the combined
    text of the last 50 items, every item from the end back to and including
    the last single item that contains ``'a/n'`` is removed.

    If the marker only shows up in the combined text (split across items),
    nothing is removed for that step. An empty ``chap_name`` disables the
    leading strip. Each removed item is echoed with ``print``.
    """
    def as_text(item):
        # strings are used verbatim, anything else is rendered by Parse
        return item if isinstance(item, str) else Parse(item)

    def squash(value):
        # same normalisation for the chapter name and the text it is matched in
        return value.lower().replace(' ', '').replace('-', '')

    # strip initial junk
    wanted = squash(chap_name)
    # an empty name is a substring of everything, so it must not trigger a strip
    if wanted and wanted in squash(''.join(as_text(item) for item in contents[:20])):
        seen = []
        matched = False
        for item in contents:
            seen.append(as_text(item))
            if wanted in squash(seen[-1]):
                matched = True
                break
        # only delete when a single item really carries the name; otherwise
        # the list would be emptied looking for a match that never comes
        if matched:
            for text in seen:
                print(chap_name, text.replace('\n', ' '))
            del contents[:len(seen)]

    # strip author's note at end of chapter
    tail = ''.join(as_text(item) for item in contents[-50:])
    if 'a/n' in tail.lower():
        seen = []
        matched = False
        for item in reversed(contents):
            seen.append(as_text(item))
            if 'a/n' in seen[-1].lower():
                matched = True
                break
        if matched:
            for text in seen:
                print(chap_name, text[:50].replace('\n', ' '))
            del contents[len(contents) - len(seen):]

    return contents


# Matches an \emph{...} group that holds nothing but optional whitespace.
# \s* (not \s+) so that a completely empty \emph{} is matched too; the braces
# are escaped so they are unambiguously literal.
RE_EMPTY_EMPH = re.compile(r'\\emph\{\s*\}')
# RE_FINAL_VSPACE = re.compile(r'\n\\vspace\{1.2cm\}\s+\Z')

def FixQuotes(in_string):
    """Convert straight quotes in ``in_string`` to LaTeX quote marks.

    Double quotes alternate between the opening mark (two backticks) and the
    closing mark (two apostrophes).

    A single quote becomes an opening backtick when no single quote is open
    and it sits at the start of the string, after whitespace, or directly
    after an opening double quote. While a single quote is open, the next
    single quote closes it. Every other single quote (for example the
    apostrophe in "don't") is copied through unchanged.

    Returns the converted string.
    """
    pieces = []
    previous = ''  # raw character immediately before the current one
    in_single_quote = False
    in_double_quote = False

    for character in in_string:
        if character == '"':
            pieces.append("''" if in_double_quote else '``')
            in_double_quote = not in_double_quote
        elif character == "'":
            if in_single_quote:
                in_single_quote = False
                pieces.append("'")
            elif (previous == '' or previous.isspace()
                    or (previous == '"' and in_double_quote)):
                in_single_quote = True
                pieces.append('`')
            else:
                pieces.append(character)
        else:
            pieces.append(character)
        # track every character, quotes included, so the "what came before"
        # test above always looks at the true neighbour
        previous = character

    return ''.join(pieces)


def Parse(html, is_base=False, chap_name=''):
    """Render an HTML node (and, recursively, its children) as LaTeX text.

    html      -- anything ``pq()`` accepts; it is wrapped in a PyQuery object.
    is_base   -- True for the top-level story container: its children are
                 first passed through ``StripAn`` and the final text is
                 stripped and terminated with a single newline.
    chap_name -- chapter title forwarded to ``StripAn`` when ``is_base``.

    Tag handling: ``br`` yields a blank line, ``em`` becomes ``\\emph{...}``,
    ``strong`` becomes ``{\\bfseries ... }``, and ``p`` gets ellipsis/quote
    clean-up, escaping of LaTeX special characters and a trailing blank line.
    Any node whose rendered text contains a run of 16 dashes is replaced by a
    ``\\vspace{1.2cm}`` break.

    Returns the LaTeX text as a string.
    """
    html = pq(html)
    tag = lx(html).tag

    # a line break becomes a paragraph break
    text = '\n\n' if tag == 'br' else ''

    html_contents = html.contents()
    if is_base:
        html_contents = StripAn(html_contents, chap_name)

    for content in html_contents:
        if isinstance(content, str):
            text += content.replace('\n', ' ').replace('  ', ' ')
        else:
            text += Parse(content)

    if tag == 'em':
        text = '\\emph{{{}}}'.format(text)

    if tag == 'strong':
        text = '{{\\bfseries {} }}'.format(text)

    if tag == 'p':
        # clean-up applied before the straight quotes are converted
        text = text.strip()
        for old, new in (
            ('...', '{\\ldots}'),
            ("'' }", "} ''"),
            ('`` ', '``'),
            (" ''", "''"),
        ):
            text = text.replace(old, new)

        text = FixQuotes(text)

        # Order matters: these move quote marks in or out of \emph{} and
        # {\ldots} groups and tighten punctuation after a closing brace.
        # Backslashes are written doubled; "\l" and "\e" are invalid escapes.
        for old, new in (
            ('``\\emph{', '\\emph{``'),
            ("}''", "''}"),
            ("}'", "'}"),
            ("{\\ldots''}", "{\\ldots}''"),
            ("{\\ldots'}", "{\\ldots}'"),
            ('``\\emph{ ', '\\emph{ ``'),
            ('`\\emph{ ', '\\emph{ `'),
            ('``{\\ldots}\\emph{', '\\emph{``{\\ldots}'),
            ('`{\\ldots}{\\emph', '{\\emph`{\\ldots}'),
            ('} .', '}.'),
            ('} ,', '},'),
            ('} !', '}!'),
            ('`` ', '``'),
            # Escape LaTeX special characters. None of them occur in the
            # commands generated above, so escaping at this point is safe.
            ('&', '\\&'),
            ('%', '\\%'),
            ('#', '\\#'),
            ('$', '\\$'),
            ('_', '\\_'),
        ):
            text = text.replace(old, new)
        text = text + '\n\n'

    if is_base:
        text = text.strip() + '\n'

    # a row of dashes is a scene break
    # NOTE(review): this check also runs for the base node, so dashes in a bare
    # text child of the base would replace the whole chapter -- confirm intent.
    if '----------------' in text:
        return '\\vspace{1.2cm}\n\n'

    text = text.replace('    ', '').replace('  ', ' ')
    text = text.replace('\\emph{ ', ' \\emph{').replace(' }', '} ')
    return RE_EMPTY_EMPH.sub('', text)  # remove empty \emph{} blocks


# application
if __name__ == '__main__':
    # Command-line entry point: unpack <name>/fic.epub into <name>/working,
    # convert every OEBPS/ChapterN.html (N = 1, 2, ...) into
    # <name>/latex/chapter_N.tex, then write an all_chapters.tex index.
    args = docopt(__doc__)

    folder_name = args['<name>']
    epub_name = os.path.join(folder_name, 'fic.epub')
    working_folder = os.path.join(folder_name, 'working')

    # extract epub file
    with zipfile.ZipFile(epub_name) as epub:
        epub.extractall(working_folder)

    # get chapters
    chapter_html_path_template = os.path.join(working_folder, 'OEBPS', 'Chapter{}.html')
    chapter_latex_folder = os.path.join(folder_name, 'latex')
    os.makedirs(chapter_latex_folder, exist_ok=True)
    chapter_latex_path_template = os.path.join(chapter_latex_folder, 'chapter_{}.tex')
    chapter_number = 0
    while True:
        chapter_number += 1
        chapter_html_path = chapter_html_path_template.format(chapter_number)
        chapter_latex_path = chapter_latex_path_template.format(chapter_number)

        # make sure chapter exists; the first missing number ends the loop
        if not os.path.exists(chapter_html_path):
            break

        # open chapter ('utf-8-sig' also accepts a leading byte-order mark)
        with open(chapter_html_path, 'r', encoding='utf-8-sig') as chapter_html_file:
            chapter_html = chapter_html_file.read()

        # dumb optional-linebreak character
        chapter_html = chapter_html.replace('&#13;', '')
        # Need to strip the XML declaration or lxml yells at us. A regex is
        # used because str.lstrip() removes a *set of characters*, not a
        # prefix, and could also eat the start of the markup that follows.
        chapter_html = re.sub(r'\A\s*<\?xml[^>]*\?>', '', chapter_html, count=1)
        chapter_html = chapter_html.replace('xmlns="http://www.w3.org/1999/xhtml"', '')
        chapter_pq = pq(chapter_html).remove_namespaces()

        # parse out title and output text
        # Keep everything after the first colon; a heading without a colon
        # is used whole instead of raising IndexError.
        chapter_heading = Parse(chapter_pq('h2'))
        chapter_title = chapter_heading.split(':', 1)[-1].strip()
        chapter_text = Parse(chapter_pq('#storytext'), is_base=True, chap_name=chapter_title)

        # print output text
        with open(chapter_latex_path, 'w', encoding='utf-8') as latex_file:
            latex_file.write('\\chapter{{ {} }}'.format(chapter_title))
            latex_file.write('\n\n')
            latex_file.write(chapter_text)

    # throw out all chapters tex file
    # chapter_number is one past the last chapter found, hence range(1, n).
    # NOTE(review): the \input paths point at ./chapters/ while the files are
    # written to the 'latex' folder -- presumably the folder is renamed or
    # copied when building the book; confirm.
    all_chapters_filename = os.path.join(chapter_latex_folder, 'all_chapters.tex')
    with open(all_chapters_filename, 'w', encoding='utf-8') as all_chapters:
        for i in range(1, chapter_number):
            all_chapters.write('\\input{{./chapters/chapter_{}.tex}}\n'.format(i))
	#!/usr/bin/env python3
	# Given an ePub file from FicSave, generates LaTeX files
	# Written by Daniel Oaks, released under the ISC license
	__doc__ = """Make-LaTeX.

	Given an ePub file from FicSave, generates LaTeX files.

	Usage:
	make-latex <name> [options]
	make-latex -h \| --help
	make-latex -v \| --version

	Options:
	--silent Suppresses mundane printouts, prints what's important
	--verbose Printout more information than normal
	-h --help Show help
	-v --version Show version
	"""

	import re
	import os
	import zipfile
	from docopt import docopt

	# We should probably create a class that provides a simplified interface to both lxml and pq's
	# attributes from a single, unified object, but that's a little overkill for this project right now
	from pyquery import PyQuery as pq
	import lxml, lxml.html


	def lx(element):
	    """Return a given element as an lxml element.

	    * An ``lxml.html.HtmlElement`` is returned unchanged.
	    * A PyQuery object is serialised with ``__html__()`` and re-parsed; the
	      first parsed fragment is returned.
	    * A string is parsed as HTML; the first parsed fragment is returned.

	    Any other type yields ``None``.
	    """
	    # isinstance() instead of an exact type() comparison, so subclasses of
	    # the supported types are converted as well rather than falling through.
	    if isinstance(element, lxml.html.HtmlElement):
	        return element
	    if isinstance(element, pq):
	        return lxml.html.fragments_fromstring(element.__html__())[0]
	    if isinstance(element, str):
	        return lxml.html.fragments_fromstring(element)[0]
	    return None


	# text parser
	def StripAn(contents, chap_name=''):
	    """Strip the leading chapter header and a trailing author's note.

	    ``contents`` is a mutable sequence whose items are either strings or
	    elements that ``Parse`` can render to text. It is modified in place and
	    also returned.

	    Leading junk: if the chapter name occurs in the combined text of the
	    first 20 items, every item up to and including the first single item
	    that contains the name is removed. Case, spaces and hyphens are ignored
	    on both sides of the comparison.

	    Trailing note: if ``'a/n'`` occurs (case-insensitively) in the combined
	    text of the last 50 items, every item from the end back to and including
	    the last single item that contains ``'a/n'`` is removed.

	    If the marker only shows up in the combined text (split across items),
	    nothing is removed for that step. An empty ``chap_name`` disables the
	    leading strip. Each removed item is echoed with ``print``.
	    """
	    def as_text(item):
	        # strings are used verbatim, anything else is rendered by Parse
	        return item if isinstance(item, str) else Parse(item)

	    def squash(value):
	        # same normalisation for the chapter name and the text it is matched in
	        return value.lower().replace(' ', '').replace('-', '')

	    # strip initial junk
	    wanted = squash(chap_name)
	    # an empty name is a substring of everything, so it must not trigger a strip
	    if wanted and wanted in squash(''.join(as_text(item) for item in contents[:20])):
	        seen = []
	        matched = False
	        for item in contents:
	            seen.append(as_text(item))
	            if wanted in squash(seen[-1]):
	                matched = True
	                break
	        # only delete when a single item really carries the name; otherwise
	        # the list would be emptied looking for a match that never comes
	        if matched:
	            for text in seen:
	                print(chap_name, text.replace('\n', ' '))
	            del contents[:len(seen)]

	    # strip author's note at end of chapter
	    tail = ''.join(as_text(item) for item in contents[-50:])
	    if 'a/n' in tail.lower():
	        seen = []
	        matched = False
	        for item in reversed(contents):
	            seen.append(as_text(item))
	            if 'a/n' in seen[-1].lower():
	                matched = True
	                break
	        if matched:
	            for text in seen:
	                print(chap_name, text[:50].replace('\n', ' '))
	            del contents[len(contents) - len(seen):]

	    return contents


	# Matches an \emph{...} group that holds nothing but optional whitespace.
	# \s* (not \s+) so that a completely empty \emph{} is matched too; the braces
	# are escaped so they are unambiguously literal.
	RE_EMPTY_EMPH = re.compile(r'\\emph\{\s*\}')
	# RE_FINAL_VSPACE = re.compile(r'\n\\vspace\{1.2cm\}\s+\Z')

	def FixQuotes(in_string):
	    """Convert straight quotes in ``in_string`` to LaTeX quote marks.

	    Double quotes alternate between the opening mark (two backticks) and the
	    closing mark (two apostrophes).

	    A single quote becomes an opening backtick when no single quote is open
	    and it sits at the start of the string, after whitespace, or directly
	    after an opening double quote. While a single quote is open, the next
	    single quote closes it. Every other single quote (for example the
	    apostrophe in "don't") is copied through unchanged.

	    Returns the converted string.
	    """
	    pieces = []
	    previous = ''  # raw character immediately before the current one
	    in_single_quote = False
	    in_double_quote = False

	    for character in in_string:
	        if character == '"':
	            pieces.append("''" if in_double_quote else '``')
	            in_double_quote = not in_double_quote
	        elif character == "'":
	            if in_single_quote:
	                in_single_quote = False
	                pieces.append("'")
	            elif (previous == '' or previous.isspace()
	                    or (previous == '"' and in_double_quote)):
	                in_single_quote = True
	                pieces.append('`')
	            else:
	                pieces.append(character)
	        else:
	            pieces.append(character)
	        # track every character, quotes included, so the "what came before"
	        # test above always looks at the true neighbour
	        previous = character

	    return ''.join(pieces)


	def Parse(html, is_base=False, chap_name=''):
	    """Render an HTML node (and, recursively, its children) as LaTeX text.

	    html      -- anything ``pq()`` accepts; it is wrapped in a PyQuery object.
	    is_base   -- True for the top-level story container: its children are
	                 first passed through ``StripAn`` and the final text is
	                 stripped and terminated with a single newline.
	    chap_name -- chapter title forwarded to ``StripAn`` when ``is_base``.

	    Tag handling: ``br`` yields a blank line, ``em`` becomes ``\\emph{...}``,
	    ``strong`` becomes ``{\\bfseries ... }``, and ``p`` gets ellipsis/quote
	    clean-up, escaping of LaTeX special characters and a trailing blank line.
	    Any node whose rendered text contains a run of 16 dashes is replaced by a
	    ``\\vspace{1.2cm}`` break.

	    Returns the LaTeX text as a string.
	    """
	    html = pq(html)
	    tag = lx(html).tag

	    # a line break becomes a paragraph break
	    text = '\n\n' if tag == 'br' else ''

	    html_contents = html.contents()
	    if is_base:
	        html_contents = StripAn(html_contents, chap_name)

	    for content in html_contents:
	        if isinstance(content, str):
	            # collapse double spaces (two-space literal, not a single space)
	            text += content.replace('\n', ' ').replace('  ', ' ')
	        else:
	            text += Parse(content)

	    if tag == 'em':
	        text = '\\emph{{{}}}'.format(text)

	    if tag == 'strong':
	        text = '{{\\bfseries {} }}'.format(text)

	    if tag == 'p':
	        # clean-up applied before the straight quotes are converted
	        text = text.strip()
	        for old, new in (
	            ('...', '{\\ldots}'),
	            ("'' }", "} ''"),
	            ('`` ', '``'),
	            (" ''", "''"),
	        ):
	            text = text.replace(old, new)

	        text = FixQuotes(text)

	        # Order matters: these move quote marks in or out of \emph{} and
	        # {\ldots} groups and tighten punctuation after a closing brace.
	        # Backslashes are written doubled; "\l" and "\e" are invalid escapes.
	        for old, new in (
	            ('``\\emph{', '\\emph{``'),
	            ("}''", "''}"),
	            ("}'", "'}"),
	            ("{\\ldots''}", "{\\ldots}''"),
	            ("{\\ldots'}", "{\\ldots}'"),
	            ('``\\emph{ ', '\\emph{ ``'),
	            ('`\\emph{ ', '\\emph{ `'),
	            ('``{\\ldots}\\emph{', '\\emph{``{\\ldots}'),
	            ('`{\\ldots}{\\emph', '{\\emph`{\\ldots}'),
	            ('} .', '}.'),
	            ('} ,', '},'),
	            ('} !', '}!'),
	            ('`` ', '``'),
	            # Escape LaTeX special characters. None of them occur in the
	            # commands generated above, so escaping at this point is safe.
	            ('&', '\\&'),
	            ('%', '\\%'),
	            ('#', '\\#'),
	            ('$', '\\$'),
	            ('_', '\\_'),
	        ):
	            text = text.replace(old, new)
	        text = text + '\n\n'

	    if is_base:
	        text = text.strip() + '\n'

	    # a row of dashes is a scene break
	    # NOTE(review): this check also runs for the base node, so dashes in a
	    # bare text child of the base would replace the whole chapter -- confirm.
	    if '----------------' in text:
	        return '\\vspace{1.2cm}\n\n'

	    # drop four-space runs, then collapse double spaces; replacing single
	    # spaces here would delete every space in the output
	    text = text.replace('    ', '').replace('  ', ' ')
	    text = text.replace('\\emph{ ', ' \\emph{').replace(' }', '} ')
	    return RE_EMPTY_EMPH.sub('', text)  # remove empty \emph{} blocks


	# application
	if __name__ == '__main__':
	    # Command-line entry point: unpack <name>/fic.epub into <name>/working,
	    # convert every OEBPS/ChapterN.html (N = 1, 2, ...) into
	    # <name>/latex/chapter_N.tex, then write an all_chapters.tex index.
	    args = docopt(__doc__)

	    folder_name = args['<name>']
	    epub_name = os.path.join(folder_name, 'fic.epub')
	    working_folder = os.path.join(folder_name, 'working')

	    # extract epub file
	    with zipfile.ZipFile(epub_name) as epub:
	        epub.extractall(working_folder)

	    # get chapters
	    chapter_html_path_template = os.path.join(working_folder, 'OEBPS', 'Chapter{}.html')
	    chapter_latex_folder = os.path.join(folder_name, 'latex')
	    os.makedirs(chapter_latex_folder, exist_ok=True)
	    chapter_latex_path_template = os.path.join(chapter_latex_folder, 'chapter_{}.tex')
	    chapter_number = 0
	    while True:
	        chapter_number += 1
	        chapter_html_path = chapter_html_path_template.format(chapter_number)
	        chapter_latex_path = chapter_latex_path_template.format(chapter_number)

	        # make sure chapter exists; the first missing number ends the loop
	        if not os.path.exists(chapter_html_path):
	            break

	        # open chapter ('utf-8-sig' also accepts a leading byte-order mark)
	        with open(chapter_html_path, 'r', encoding='utf-8-sig') as chapter_html_file:
	            chapter_html = chapter_html_file.read()

	        # dumb optional-linebreak character (the entity, not a plain space)
	        chapter_html = chapter_html.replace('&#13;', '')
	        # Need to strip the XML declaration or lxml yells at us. A regex is
	        # used because str.lstrip() removes a *set of characters*, not a
	        # prefix, and could also eat the start of the markup that follows.
	        chapter_html = re.sub(r'\A\s*<\?xml[^>]*\?>', '', chapter_html, count=1)
	        chapter_html = chapter_html.replace('xmlns="http://www.w3.org/1999/xhtml"', '')
	        chapter_pq = pq(chapter_html).remove_namespaces()

	        # parse out title and output text
	        # Keep everything after the first colon; a heading without a colon
	        # is used whole instead of raising IndexError.
	        chapter_heading = Parse(chapter_pq('h2'))
	        chapter_title = chapter_heading.split(':', 1)[-1].strip()
	        chapter_text = Parse(chapter_pq('#storytext'), is_base=True, chap_name=chapter_title)

	        # print output text
	        with open(chapter_latex_path, 'w', encoding='utf-8') as latex_file:
	            latex_file.write('\\chapter{{ {} }}'.format(chapter_title))
	            latex_file.write('\n\n')
	            latex_file.write(chapter_text)

	    # throw out all chapters tex file
	    # chapter_number is one past the last chapter found, hence range(1, n).
	    # NOTE(review): the \input paths point at ./chapters/ while the files are
	    # written to the 'latex' folder -- presumably the folder is renamed or
	    # copied when building the book; confirm.
	    all_chapters_filename = os.path.join(chapter_latex_folder, 'all_chapters.tex')
	    with open(all_chapters_filename, 'w', encoding='utf-8') as all_chapters:
	        for i in range(1, chapter_number):
	            all_chapters.write('\\input{{./chapters/chapter_{}.tex}}\n'.format(i))