Skip to content

Instantly share code, notes, and snippets.

@MercuriXito
Last active March 21, 2020 14:04
Show Gist options
  • Save MercuriXito/6031e9be887c63e50ed7718367ba7d3f to your computer and use it in GitHub Desktop.
Save MercuriXito/6031e9be887c63e50ed7718367ba7d3f to your computer and use it in GitHub Desktop.
LaTeX2Markdown (adapted from https://github.com/ajtulloch/LaTeX2Markdown). Added features: (1) convert cross-references; (2) convert verbatim environments to code blocks.
import os, sys, json
import re
class BibtexParser:
    """Simple parser for BibTeX files.

    The file is read once on construction.  Entries are extracted on the
    first call to :meth:`getitem` and cached afterwards.  Every entry is a
    dict with the keys ``"bibtype"``, ``"refname"`` and ``"content"``, where
    ``"content"`` maps field names to their values.

    Only fields written on a single line in the braced form
    ``key = {value}`` (or ``key = {{value}}``) are recognised; other lines
    of an entry are ignored.
    """

    # One complete entry: "@type{refname," + key/value lines + closing "}".
    _item_re = re.compile(r"""(@(?P<bibtype>\w+)\{          # type
                    (?P<refname>\w+),\n?                    # name
                    (?P<content>(.+?[=|:].+?,?\n)+)         # key-values
                    \}){1}?""", flags=re.VERBOSE)

    # One "key = {value}" line inside an entry.
    _key_re = re.compile(r"""
                    ^(?P<key>.+)=.*?\{{1,2}(?P<value>.+?)\}{1,2},?$
                    """, flags=re.DOTALL | re.VERBOSE | re.M)

    # BibTeX separates authors with the word "and" surrounded by whitespace.
    _author_sep_re = re.compile(r"\s+and\s+")

    def __init__(self, path, encoding="utf-8"):
        """Read the BibTeX file at ``path`` using ``encoding``."""
        self.path = path
        self.encoding = encoding
        self.string = self._read_file(self.path)

    def _read_file(self, path):
        """Return the whole content of ``path`` as one string."""
        with open(path, "r", encoding=self.encoding) as f:
            return f.read()

    def _extract_keyvalue(self, content):
        """Return a dict of the ``key = {value}`` lines found in ``content``.

        Lines that do not match the key/value pattern are skipped.  When a
        key occurs more than once the last occurrence wins.
        """
        fields = {}
        for line in content.split("\n"):
            matchobj = self._key_re.match(line)
            if not matchobj:
                continue
            fields[matchobj.group("key").strip()] = matchobj.group("value").strip()
        return fields

    def _extract_file(self):
        """Return the list of entries found in the file content.

        ``finditer`` yields each entry exactly once and resumes right after
        the previous match, so text before or between entries (comments,
        blank lines) cannot make an entry show up twice.
        """
        return [
            {
                "bibtype": matchobj.group("bibtype"),
                "refname": matchobj.group("refname"),
                "content": self._extract_keyvalue(matchobj.group("content")),
            }
            for matchobj in self._item_re.finditer(self.string)
        ]

    def tojson(self):
        """Return all entries serialised as a JSON string."""
        return json.dumps(self.getitem())

    def getitem(self):
        """Return the parsed entries, parsing the file on first use."""
        try:
            return self._item
        except AttributeError:
            # First access: nothing cached yet.
            self._item = self._extract_file()
            return self._item

    @classmethod
    def _format_authors(cls, author_str):
        """Turn a BibTeX author field into ``"A, B and C"``.

        Names written as ``"Last, First"`` are reordered to ``"First Last"``;
        names without a comma are kept as written.
        """
        names = []
        for raw_name in cls._author_sep_re.split(author_str.strip()):
            last, comma, first = raw_name.partition(",")
            if comma:
                names.append("{} {}".format(first.strip(), last.strip()))
            else:
                names.append(raw_name.strip())
        if len(names) == 1:
            return names[0]
        return "{} and {}".format(", ".join(names[:-1]), names[-1])

    def toplaintext(self, oneitem, markdown_style=True):
        """Render one entry as ``"Authors. **Title**. Year\\n"``.

        ``oneitem`` is an entry as returned by :meth:`getitem`; its
        ``"content"`` dict must hold ``"author"``, ``"title"`` and ``"year"``
        (a missing field raises ``KeyError``).  ``markdown_style`` is accepted
        for compatibility and does not change the output.
        """
        content = oneitem["content"]
        author = self._format_authors(content["author"])
        title = content["title"]
        year = content["year"]
        return "{}. **{}**. {}\n".format(author, title, year)


if __name__ == "__main__":
    pass
import re
import os, argparse
from collections import defaultdict
from BibParser import BibtexParser
#------------------------------------------------------------------------------
# Basic configuration - modify this to change output formatting
# Formatting table used by LaTeX2Markdown, keyed by LaTeX environment or
# sectioning command name.  Fields read by the converter:
#   markdown_heading  - Markdown prefix placed at the start of the heading
#   pretty_name       - label printed after the heading prefix
#   show_count        - when True, a running number is printed after the label
#   line_indent_char  - prefix added to every content line of a block
#   list_heading      - text that replaces "\item" in list environments
# Sectioning commands (chapter/section/subsection) carry no line_indent_char
# because only their heading is formatted.
_block_configuration = {
"chapter": {
"markdown_heading": "##",
"pretty_name": "",
"show_count": False
},
"enumerate": {
"line_indent_char": "",
"list_heading": "1. ",
"markdown_heading": "",
"pretty_name": "",
"show_count": False
},
"exer": {
"line_indent_char": "> ",
"markdown_heading": "####",
"pretty_name": "Exercise",
"show_count": True
},
"itemize": {
"line_indent_char": "",
"list_heading": "* ",
"markdown_heading": "",
"pretty_name": "",
"show_count": False
},
"lem": {
"line_indent_char": "> ",
"markdown_heading": "####",
"pretty_name": "Lemma",
"show_count": True
},
# NOTE(review): a Markdown indented code block needs four leading spaces;
# the indent below is a single space - confirm it was not collapsed when
# this file was copied.
"lstlisting": {
"line_indent_char": " ",
"markdown_heading": "",
"pretty_name": "",
"show_count": False
},
"proof": {
"line_indent_char": "",
"markdown_heading": "####",
"pretty_name": "Proof",
"show_count": False
},
"prop": {
"line_indent_char": "> ",
"markdown_heading": "####",
"pretty_name": "Proposition",
"show_count": True
},
"section": {
"markdown_heading": "###",
"pretty_name": "",
"show_count": False
},
"subsection": {
"markdown_heading": "####",
"pretty_name": "",
"show_count": False
},
"thm": {
"line_indent_char": "> ",
"markdown_heading": "####",
"pretty_name": "Theorem",
"show_count": True
}
}
#------------------------------------------------------------------------------
class LaTeX2Markdown(object):
    r"""Initialise with a LaTeX string - see the main routine for examples of
    reading this string from an existing .tex file.

    To modify the outputted markdown, modify the _block_configuration variable
    before initializing the LaTeX2Markdown instance, or pass your own table
    as ``block_configuration``.

    Parameters:
        latex_string: the LaTeX source to convert.
        block_configuration: formatting table keyed by environment/command
            name; ``None`` (the default) selects the module-level
            ``_block_configuration``.
        block_counter: mapping from block name to the next number to print;
            ``None`` (the default) starts every block type at 1.  Pass a
            shared mapping to continue numbering across several instances.
    """

    # Everything in the main matter.
    _main_re = re.compile(r"""\\begin\{document\}
                    (?P<main>.*)
                    \\end\{document\}""",
                    flags=re.DOTALL | re.VERBOSE)

    # All our block materials.
    _block_re = re.compile(r"""\\begin\{(?P<block_name>exer|proof|thm|lem|prop)\} # block name
                    (\[(?P<block_title>.*?)\])?        # Optional block title
                    (?P<block_contents>.*?)            # Non-greedy block contents
                    \\end\{(?P=block_name)\}""",       # closing block
                    flags=re.DOTALL | re.VERBOSE)

    # All our list blocks.
    _lists_re = re.compile(r"""\\begin\{(?P<block_name>enumerate|itemize)\} # list name
                    (\[.*?\])?                         # Optional enumerate settings i.e. (a)
                    (?P<block_contents>.*?)            # Non-greedy list contents
                    \\end\{(?P=block_name)\}""",       # closing list
                    flags=re.DOTALL | re.VERBOSE)

    # All our headers.
    _header_re = re.compile(r"""\\(?P<header_name>chapter|section|subsection) # Header
                    \{(?P<header_contents>.*?)\}""",   # Header title
                    flags=re.DOTALL | re.VERBOSE)

    # 'Auxiliary blocks' - these need special treatment for future use,
    # e.g. pygments highlighting instead of code blocks in Markdown.
    _aux_block_re = re.compile(r"""\\begin\{(?P<block_name>lstlisting)\} # block name
                    (?P<block_contents>.*?)            # Non-greedy block contents
                    \\end\{(?P=block_name)\}""",       # closing block
                    flags=re.DOTALL | re.VERBOSE)

    # "\item" plus the whitespace after it; the word boundary keeps commands
    # that merely start with "\item" (e.g. "\itemsep") untouched.
    _list_item_re = re.compile(r"\\item\b\s*")

    # Inline clean-ups applied in this order after the structural rewrites.
    _inline_rules = (
        # Fix \\ formatting for line breaks in align blocks
        (re.compile(r" \\\\"), r" \\\\\\\\"),
        # Convert align* block to align - this fixes formatting
        (re.compile(r"align\*"), r"align"),
        # Fix emph, textbf, texttt formatting
        (re.compile(r"\\emph\{(.*?)\}"), r"*\1*"),
        (re.compile(r"\\textbf\{(.*?)\}"), r"**\1**"),
        (re.compile(r"\\texttt\{(.*?)\}"), r"`\1`"),
        # Fix \% formatting
        (re.compile(r"\\%"), r"%"),
        # Fix argmax, etc.
        (re.compile(r"\\arg(max|min)"), r"\\text{arg\1}"),
        # Throw away content in IGNORE/END block
        (re.compile(r"% LaTeX2Markdown IGNORE(.*?)\% LaTeX2Markdown END",
                    flags=re.DOTALL), ""),
    )

    def __init__(self, latex_string,
                 block_configuration=None,
                 block_counter=None):
        # Resolve defaults per instance: a counter created in the signature
        # would be shared by every instance and keep counting across
        # unrelated documents.
        if block_configuration is None:
            block_configuration = _block_configuration
        if block_counter is None:
            block_counter = defaultdict(lambda: 1)
        self._block_configuration = block_configuration
        self._latex_string = latex_string
        self._block_counter = block_counter

    def _replace_header(self, matchobj):
        """Creates a header string for a section/subsection/chapter match.

        For example, "### 2 - Integral Calculus\n" """
        header_name = matchobj.group('header_name')
        header_contents = matchobj.group('header_contents')

        header = self._format_block_name(header_name)
        block_config = self._block_configuration[header_name]

        parts = [header]
        # If we have a count, separate the title from the count with a dash
        if block_config.get("show_count"):
            parts.append("-")
        parts.append(header_contents)
        # Empty parts are dropped so that no doubled spaces are emitted.
        return " ".join(part for part in parts if part) + "\n"

    def _replace_block(self, matchobj):
        """Create a string that replaces an entire block.

        The string consists of a header (e.g. ### Exercise 1)
        and a block, containing the LaTeX code.

        The block may be optionally indented, blockquoted, etc.
        These settings are customizable through the block configuration."""
        block_name = matchobj.group('block_name')
        block_contents = matchobj.group('block_contents')
        # Block title may not exist, so use .get method
        block_title = matchobj.groupdict().get('block_title')

        # We have to format differently for lists
        if block_name in {"itemize", "enumerate"}:
            formatted_contents = self._format_list_contents(block_name,
                                                            block_contents)
        else:
            formatted_contents = self._format_block_contents(block_name,
                                                             block_contents)

        header = self._format_block_name(block_name, block_title)
        return "{header}\n\n{block_contents}".format(
            header=header,
            block_contents=formatted_contents)

    def _format_block_contents(self, block_name, block_contents):
        """Format the contents of a block with configuration parameters
        provided in the self._block_configuration attribute"""
        block_config = self._block_configuration[block_name]
        line_indent_char = block_config["line_indent_char"]

        return "".join(
            line_indent_char + line.strip() + "\n"
            for line in block_contents.strip().split("\n"))

    def _format_list_contents(self, block_name, block_contents):
        r"""To format a list, we must remove the \item declaration in the
        LaTeX source. All else is as in the _format_block_contents method."""
        block_config = self._block_configuration[block_name]
        list_heading = block_config["list_heading"]

        # A function replacement keeps any backslash in a user-supplied
        # list heading from being read as a regex escape.
        def heading(_matchobj):
            return list_heading

        return "".join(
            self._list_item_re.sub(heading, line.strip()) + "\n"
            for line in block_contents.strip().split("\n"))

    def _format_block_name(self, block_name, block_title=None):
        """Format the Markdown header associated with a block.

        Due to the optional block_title, we split the string construction
        into two parts.  The counter of ``block_name`` is advanced on every
        call, whether or not the count is shown."""
        block_config = self._block_configuration[block_name]
        pretty_name = block_config["pretty_name"]
        show_count = block_config["show_count"]
        markdown_heading = block_config["markdown_heading"]

        block_count = self._block_counter[block_name] if show_count else ""
        self._block_counter[block_name] += 1

        parts = [markdown_heading, pretty_name, str(block_count)]
        output_str = " ".join(part for part in parts if part)

        if block_title:
            output_str = "{output_str} ({block_title})".format(
                output_str=output_str,
                block_title=block_title)

        return output_str.strip()

    def _latex_to_markdown(self):
        """Main function, returns the formatted Markdown as a string.

        Uses a lot of custom regexes to fix a lot of content - you may have
        to add or remove some regexes to suit your own needs."""
        # Get main content, skipping preamble and closing tags.  A string
        # without a document environment is converted as a whole.
        main_match = self._main_re.search(self._latex_string)
        if main_match:
            output = main_match.group("main")
        else:
            output = self._latex_string

        # Reformat, lists, blocks, and headers.
        output = self._lists_re.sub(self._replace_block, output)
        output = self._block_re.sub(self._replace_block, output)
        output = self._header_re.sub(self._replace_header, output)
        output = self._aux_block_re.sub(self._replace_block, output)

        for pattern, replacement in self._inline_rules:
            output = pattern.sub(replacement, output)

        return output.strip()

    def to_markdown(self):
        """Return the Markdown rendering of the LaTeX string."""
        return self._latex_to_markdown()

    def to_latex(self):
        """Return the LaTeX string this instance was created with."""
        return self._latex_string
#------------------------------------------------------------------------------
class EntireSub:
    """Substitutions that work on the entire document text.

    Currently this turns every ``verbatim`` environment into a fenced
    Markdown code block.
    """

    # A few key words whose presence marks a code block as Python.
    _python_tokens = ("torch", "numpy", "torchvision", "import os")

    def __init__(self, tex_str):
        """Store ``tex_str`` and compile the verbatim pattern."""
        self._codes_re = re.compile(r"""\\begin\{(?P<name>verbatim)\}
                    (?P<content>.*?)
                    \\end\{(?P<endname>verbatim)\}""",
                    flags=re.DOTALL | re.VERBOSE | re.MULTILINE)
        self.tex_str = tex_str

    def _replace_verbatim(self, matchobj):
        """Return the fenced code block that replaces one verbatim match.

        The fence is tagged ``python`` when the content contains one of the
        detection tokens and is left untagged otherwise.
        """
        content = matchobj.group("content")
        is_python = any(token in content for token in self._python_tokens)
        lang_token = "python" if is_python else ""
        # The captured content starts right after "\begin{verbatim}" and ends
        # right before "\end{verbatim}", so it normally carries the line
        # breaks around the code; drop them to avoid blank first and last
        # lines inside the fence.
        code = content.strip("\n")
        return "```{}\n{}\n```\n".format(lang_token, code)

    def _sub(self):
        """Apply all whole-document substitutions and return the result."""
        return self._codes_re.sub(self._replace_verbatim, self.tex_str)

    def to_markdown(self):
        """Return the text with verbatim environments converted."""
        return self._sub()
def matchlist(reg, string):
    """Return all non-overlapping matches of ``reg`` in ``string`` as a list.

    ``reg`` is a compiled pattern.  The match objects are produced in order
    of appearance and their positions refer to ``string`` itself.  Scanning
    with ``finditer`` also terminates for patterns that can match the empty
    string, and avoids copying the remaining text after every match.
    """
    return list(reg.finditer(string))
# sub reference
def subcrossref(string, root):
    r"""Substitute the cross references in tex.

    Looks for ``\bibliography{name}`` in ``string``, parses ``name.bib``
    located in directory ``root``, replaces every ``\cite{key}`` with its
    number in square brackets and appends a numbered reference list.

    Citations are numbered in order of first appearance, so the output is
    the same on every run.  A cited key that is missing from the bibliography
    is reported on stdout and rendered as ``[?]``.

    Returns ``string`` unchanged when it contains no ``\bibliography``
    command.
    """
    bibtexref_re = re.compile(r"\\bibliography\{(?P<bibname>\w+)\}")
    cite_re = re.compile(r"\\cite\{(?P<citename>\w+)\}")

    # find the path of bibtex and parse it
    bib_match = bibtexref_re.search(string)
    if not bib_match:
        print("No bibliography found!")
        return string

    # os.path.join copes with an empty root (file in the current directory);
    # plain concatenation would point at the filesystem root instead.
    bibpath = os.path.join(root, bib_match.group("bibname") + ".bib")
    bib = BibtexParser(bibpath)
    entries = {ref["refname"]: ref for ref in bib.getitem()}

    # number only the entries that are actually cited, by first appearance
    numbers = {}
    missing = set()
    for cite_match in cite_re.finditer(string):
        citename = cite_match.group("citename")
        if citename in numbers or citename in missing:
            continue
        if citename in entries:
            numbers[citename] = len(numbers) + 1
        else:
            missing.add(citename)
            print("Citation '{}' not found in {}".format(citename, bibpath))

    # generate text for references, ordered by citation number
    ordered = sorted(numbers.items(), key=lambda pair: pair[1])
    reftext = "\n".join(
        "{}. {}".format(number, bib.toplaintext(entries[citename]))
        for citename, number in ordered)

    # substitute the citations
    def subcite(matchobj):
        return "[{}]".format(numbers.get(matchobj.group("citename"), "?"))

    string = cite_re.sub(subcite, string)

    # add references text
    string += "\n\n## Reference\n***\n\n{}\n".format(reftext)
    return string
#------------------------------------------------------------------------------
def options(argv=None):
    """Parse the command line and return the resulting namespace.

    Parameters:
        argv: list of argument strings to parse; ``None`` (the default)
            parses the arguments of the running process.

    Returns:
        A namespace with ``infile``, ``outfile`` and ``encoding``
        (``"utf-8"`` unless ``-e``/``--encoding`` is given).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("infile", type=str, help="Input TeX file")
    parser.add_argument("outfile", type=str, help="Output Markdown file")
    parser.add_argument("-e", "--encoding", type=str, default="utf-8",
                        help="Encoding for reading and writing files")
    return parser.parse_args(argv)
def main():
    """Convert the TeX file named on the command line to Markdown.

    Steps: read the input file, convert blocks/lists/headers with
    LaTeX2Markdown, convert verbatim environments with EntireSub, resolve
    citations with subcrossref (the .bib file is looked up next to the
    input file) and write the result to the output file.
    """
    opt = options()
    input_file, output_file, encoding = opt.infile, opt.outfile, opt.encoding

    # Directory of the input file; a bare file name lives in the current
    # directory.  os.path.dirname understands every separator the platform
    # accepts, unlike splitting on os.sep.
    fileroot = os.path.dirname(input_file) or os.curdir

    # AMS convert
    with open(input_file, 'r', encoding=encoding) as f:
        latex_string = f.read()
    tex_str = LaTeX2Markdown(latex_string).to_markdown()

    # Other convert (based on entire files)
    tex_str = EntireSub(tex_str).to_markdown()

    # cross-ref
    tex_str = subcrossref(tex_str, fileroot)

    with open(output_file, 'w', encoding=encoding) as f_out:
        f_out.write(tex_str)


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment