Last active
March 21, 2020 14:04
-
-
Save MercuriXito/6031e9be887c63e50ed7718367ba7d3f to your computer and use it in GitHub Desktop.
LaTeX2Markdown (adapted from https://github.com/ajtulloch/LaTeX2Markdown). Added features: (1) convert cross-references; (2) convert verbatim blocks to code blocks.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os, sys, json | |
import re | |
class BibtexParser:
    """Simple regex-based parser for BibTeX files.

    The file at ``path`` is read once in the constructor. Entries are
    recognised by regular expressions, so only entries shaped like ::

        @type{refname,
          key = {value},
          ...
        }

    are picked up; fields whose value is not wrapped in braces are skipped
    by the key/value pattern.
    """

    def __init__(self, path, encoding="utf-8"):
        """Read the BibTeX file and compile the parsing patterns.

        Args:
            path: Location of the ``.bib`` file.
            encoding: Text encoding used to read the file.
        """
        self.path = path
        self.encoding = encoding
        self.string = self._read_file(self.path)
        # One complete entry: "@type{refname," + key/value lines + "}".
        self._item_re = re.compile(r"""(@(?P<bibtype>\w+)\{          # type
                                   (?P<refname>\w+),\n?              # name
                                   (?P<content>(.+?[=|:].+?,?\n)+)   # key-values
                                   \}){1}?""", flags=re.VERBOSE)
        # One "key = {value}" (or "{{value}}") line inside an entry.
        self._key_re = re.compile(r"""
            ^(?P<key>.+)=.*?\{{1,2}(?P<value>.+?)\}{1,2},?$
            """, flags=re.DOTALL | re.VERBOSE | re.M)

    def _read_file(self, path):
        """Return the whole content of ``path`` as one string."""
        with open(path, "r", encoding=self.encoding) as f:
            return f.read()

    def _extract_keyvalue(self, content):
        """Turn the key/value lines of one entry into a ``dict``.

        Lines that do not match the key/value pattern are ignored. Keys and
        values are stripped of surrounding whitespace.
        """
        fields = {}
        for line in content.split("\n"):
            matchobj = self._key_re.match(line)
            if not matchobj:
                continue
            key = matchobj.group("key").strip()
            fields[key] = matchobj.group("value").strip()
        return fields

    def _extract_file(self):
        """Parse every entry of the file.

        Returns:
            A list of dicts with the keys ``bibtype``, ``refname`` and
            ``content`` (the parsed key/value mapping), in file order.
        """
        # finditer walks the non-overlapping matches by their real end
        # positions; the previous hand-rolled cursor used ``lastindex`` (a
        # group number, not an offset) and could revisit or skip entries.
        return [
            {
                "bibtype": matchobj.group("bibtype"),
                "refname": matchobj.group("refname"),
                "content": self._extract_keyvalue(matchobj.group("content")),
            }
            for matchobj in self._item_re.finditer(self.string)
        ]

    def tojson(self):
        """Return the parsed entries serialised as a JSON string."""
        return json.dumps(self.getitem())

    def getitem(self):
        """Return the parsed entries, parsing the file on first use only."""
        try:
            return self._item
        except AttributeError:
            self._item = self._extract_file()
            return self._item

    def toplaintext(self, oneitem, markdown_style=True):
        """Render one parsed entry as a single reference line.

        The result has the form ``"Authors. **Title**. Year\\n"``. Parts whose
        field is absent from the entry are left out instead of raising.

        Args:
            oneitem: One element of the list returned by :meth:`getitem`.
            markdown_style: Accepted for compatibility; the output is always
                rendered with a bold Markdown title.

        Returns:
            The formatted reference, terminated by a newline.
        """
        def format_name(raw):
            # "Last, First" -> "First Last"; "Last, Jr, First" -> "First Last Jr".
            # A name without a comma is already in display order.
            parts = [part.strip() for part in raw.split(",")]
            parts = [part for part in parts if part]
            if len(parts) < 2:
                return " ".join(parts)
            last, *middle, first = parts
            return " ".join([first, last] + middle)

        def authorparse(author_str):
            # Split only on the standalone word "and" so that names which
            # merely contain those letters (e.g. "Alexander") stay intact.
            raw_names = re.split(r"\s+and\s+", author_str.strip())
            names = [format_name(raw) for raw in raw_names]
            names = [name for name in names if name]
            if len(names) < 2:
                return "".join(names)
            return "{} and {}".format(", ".join(names[:-1]), names[-1])

        content = oneitem["content"]
        pieces = []
        author = authorparse(content.get("author", ""))
        if author:
            pieces.append(author)
        title = content.get("title")
        if title:
            pieces.append("**{}**".format(title))
        year = content.get("year")
        if year:
            pieces.append(str(year))
        return ". ".join(pieces) + "\n"
if __name__ == "__main__":
    # Nothing to do when executed directly; the guard body is a no-op.
    pass
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re | |
import os, argparse | |
from collections import defaultdict | |
from BibParser import BibtexParser | |
#------------------------------------------------------------------------------ | |
# Basic configuration - modify this to change output formatting | |
# Per-block formatting rules, keyed by LaTeX environment / sectioning name.
#   markdown_heading  - Markdown prefix placed in front of the block header
#   pretty_name       - human-readable label shown in the header
#   show_count        - whether the running block number is shown
#   line_indent_char  - prefix added to every content line (environments only)
#   list_heading      - replacement for "\item" (list environments only)
_block_configuration = {
    "chapter": {
        "markdown_heading": "##",
        "pretty_name": "",
        "show_count": False
    },
    "enumerate": {
        "line_indent_char": "",
        "list_heading": "1. ",
        "markdown_heading": "",
        "pretty_name": "",
        "show_count": False
    },
    "exer": {
        "line_indent_char": "> ",
        "markdown_heading": "####",
        "pretty_name": "Exercise",
        "show_count": True
    },
    "itemize": {
        "line_indent_char": "",
        "list_heading": "* ",
        "markdown_heading": "",
        "pretty_name": "",
        "show_count": False
    },
    "lem": {
        "line_indent_char": "> ",
        "markdown_heading": "####",
        "pretty_name": "Lemma",
        "show_count": True
    },
    "lstlisting": {
        # NOTE(review): Markdown indented code blocks need four leading
        # spaces; confirm this single space was not collapsed from a wider
        # indent when the file was copied.
        "line_indent_char": " ",
        "markdown_heading": "",
        "pretty_name": "",
        "show_count": False
    },
    "proof": {
        "line_indent_char": "",
        "markdown_heading": "####",
        "pretty_name": "Proof",
        "show_count": False
    },
    "prop": {
        "line_indent_char": "> ",
        "markdown_heading": "####",
        "pretty_name": "Proposition",
        "show_count": True
    },
    "section": {
        "markdown_heading": "###",
        "pretty_name": "",
        "show_count": False
    },
    "subsection": {
        "markdown_heading": "####",
        "pretty_name": "",
        "show_count": False
    },
    "thm": {
        "line_indent_char": "> ",
        "markdown_heading": "####",
        "pretty_name": "Theorem",
        "show_count": True
    }
}
#------------------------------------------------------------------------------ | |
class LaTeX2Markdown(object):
    r"""Convert a LaTeX document, given as a string, to Markdown.

    Initialise with a LaTeX string - see the main routine for an example of
    reading this string from an existing .tex file.

    To modify the outputted markdown, pass your own ``block_configuration``
    or modify the module-level ``_block_configuration`` variable before
    initializing the LaTeX2Markdown instance.
    """

    def __init__(self, latex_string,
                 block_configuration=None,
                 block_counter=None):
        r"""Store the input and precompile the regexes.

        Args:
            latex_string: The LaTeX source to convert.
            block_configuration: Mapping from block name to its formatting
                options. ``None`` selects the module-level
                ``_block_configuration``, looked up when the instance is
                created.
            block_counter: Mapping from block name to the next number to
                display. ``None`` creates a fresh counter starting at 1 for
                every block name, so numbering never leaks from one instance
                into the next.
        """
        if block_configuration is None:
            block_configuration = _block_configuration
        if block_counter is None:
            block_counter = defaultdict(lambda: 1)

        self._block_configuration = block_configuration
        self._latex_string = latex_string
        self._block_counter = block_counter

        # Precompile the regexes

        # Select everything in the main matter
        self._main_re = re.compile(r"""\\begin{document}
                                    (?P<main>.*)
                                    \\end{document}""",
                                   flags=re.DOTALL | re.VERBOSE)

        # Select all our block materials.
        self._block_re = re.compile(r"""\\begin{(?P<block_name>exer|proof|thm|lem|prop)} # block name
                                    (\[(?P<block_title>.*?)\])? # Optional block title
                                    (?P<block_contents>.*?) # Non-greedy block contents
                                    \\end{(?P=block_name)}""",  # closing block
                                    flags=re.DOTALL | re.VERBOSE)

        # Select all our list blocks
        self._lists_re = re.compile(r"""\\begin{(?P<block_name>enumerate|itemize)} # list name
                                    (\[.*?\])? # Optional enumerate settings i.e. (a)
                                    (?P<block_contents>.*?) # Non-greedy list contents
                                    \\end{(?P=block_name)}""",  # closing list
                                    flags=re.DOTALL | re.VERBOSE)

        # Select all our headers
        self._header_re = re.compile(r"""\\(?P<header_name>chapter|section|subsection) # Header
                                    {(?P<header_contents>.*?)}""",  # Header title
                                     flags=re.DOTALL | re.VERBOSE)

        # Select all our 'auxiliary blocks' - these need special treatment
        # for future use - e.g. pygments highlighting instead of code blocks
        # in Markdown
        self._aux_block_re = re.compile(r"""\\begin{(?P<block_name>lstlisting)} # block name
                                    (?P<block_contents>.*?) # Non-greedy block contents
                                    \\end{(?P=block_name)}""",  # closing block
                                        flags=re.DOTALL | re.VERBOSE)

    def _replace_header(self, matchobj):
        r"""Create a header string for a section/subsection/chapter match.

        For example ``"### Integral Calculus\n"``, or
        ``"### 2 - Integral Calculus\n"`` when the header type is configured
        to show its count.
        """
        header_name = matchobj.group('header_name')
        header_contents = matchobj.group('header_contents')

        header = self._format_block_name(header_name)

        block_config = self._block_configuration[header_name]

        # If we have a count, separate the title from the count with a dash
        separator = "-" if block_config.get("show_count") else ""

        # Skip empty pieces so no doubled spaces end up in the heading.
        pieces = [piece for piece in (header, separator) if piece]
        pieces.append(header_contents)
        return " ".join(pieces) + "\n"

    def _replace_block(self, matchobj):
        r"""Create the string that replaces an entire block.

        The string consists of a header (e.g. ``#### Exercise 1``), a blank
        line, and the formatted block contents. How the contents are
        indented or blockquoted is taken from the block configuration.
        """
        block_name = matchobj.group('block_name')
        block_contents = matchobj.group('block_contents')
        # Block title may not exist, so use .get method
        block_title = matchobj.groupdict().get('block_title')

        # We have to format differently for lists
        if block_name in {"itemize", "enumerate"}:
            formatted_contents = self._format_list_contents(block_name,
                                                            block_contents)
        else:
            formatted_contents = self._format_block_contents(block_name,
                                                             block_contents)

        header = self._format_block_name(block_name, block_title)

        return "{header}\n\n{block_contents}".format(
            header=header,
            block_contents=formatted_contents)

    def _format_block_contents(self, block_name, block_contents):
        r"""Format the contents of a block.

        Every line is stripped of surrounding whitespace and prefixed with
        the block's configured ``line_indent_char``.
        """
        block_config = self._block_configuration[block_name]
        line_indent_char = block_config["line_indent_char"]

        return "".join(
            line_indent_char + line.strip() + "\n"
            for line in block_contents.strip().split("\n"))

    def _format_list_contents(self, block_name, block_contents):
        r"""Format the contents of a list environment.

        Every ``\item`` declaration of the LaTeX source is replaced by the
        configured ``list_heading``. All else is as in the
        ``_format_block_contents`` method, except that no line prefix is
        added.
        """
        block_config = self._block_configuration[block_name]
        list_heading = block_config["list_heading"]

        return "".join(
            line.strip().replace(r"\item", list_heading) + "\n"
            for line in block_contents.strip().split("\n"))

    def _format_block_name(self, block_name, block_title=None):
        r"""Format the Markdown header associated with a block.

        The header is built from the configured Markdown heading, the pretty
        name, the running count (only when ``show_count`` is set) and the
        optional ``block_title`` in parentheses. The counter of the block is
        advanced on every call, whether or not the count is shown.
        """
        block_config = self._block_configuration[block_name]
        pretty_name = block_config["pretty_name"]
        show_count = block_config["show_count"]
        markdown_heading = block_config["markdown_heading"]

        block_count = self._block_counter[block_name] if show_count else ""
        self._block_counter[block_name] += 1

        pieces = [markdown_heading, pretty_name, str(block_count)]
        if block_title:
            pieces.append("({})".format(block_title))

        # Joining only the non-empty pieces avoids doubled or dangling spaces.
        return " ".join(piece for piece in pieces if piece)

    def _latex_to_markdown(self):
        r"""Main function, returns the formatted Markdown as a string.

        Uses a lot of custom regexes to fix a lot of content - you may have
        to add or remove some regexes to suit your own needs.
        """
        # Get main content, skipping preamble and closing tags. A string
        # without a document environment is converted as a whole.
        main_match = self._main_re.search(self._latex_string)
        output = main_match.group("main") if main_match else self._latex_string

        # Reformat, lists, blocks, and headers.
        output = self._lists_re.sub(self._replace_block, output)
        output = self._block_re.sub(self._replace_block, output)
        output = self._header_re.sub(self._replace_header, output)
        output = self._aux_block_re.sub(self._replace_block, output)

        # Fix \\ formatting for line breaks in align blocks
        output = re.sub(r" \\\\", r" \\\\\\\\", output)
        # Convert align* block to align - this fixes formatting
        output = re.sub(r"align\*", r"align", output)

        # Fix emph, textbf, texttt formatting
        output = re.sub(r"\\emph{(.*?)}", r"*\1*", output)
        output = re.sub(r"\\textbf{(.*?)}", r"**\1**", output)
        output = re.sub(r"\\texttt{(.*?)}", r"`\1`", output)

        # Fix \% formatting
        output = re.sub(r"\\%", r"%", output)
        # Fix argmax, etc.
        output = re.sub(r"\\arg(max|min)", r"\\text{arg\1}", output)

        # Throw away content in IGNORE/END block
        output = re.sub(r"% LaTeX2Markdown IGNORE(.*?)\% LaTeX2Markdown END",
                        "", output, flags=re.DOTALL)
        return output.strip()

    def to_markdown(self):
        r"""Return the Markdown rendering of the LaTeX input."""
        return self._latex_to_markdown()

    def to_latex(self):
        r"""Return the original, unmodified LaTeX string."""
        return self._latex_string
#------------------------------------------------------------------------------ | |
class EntireSub:
    """Substitutions that operate on the entire file content.

    Currently this converts every ``verbatim`` environment into a fenced
    Markdown code block.
    """

    def __init__(self, tex_str):
        """Store the text to convert and compile the verbatim pattern."""
        self._codes_re = re.compile(r"""\\begin\{(?P<name>verbatim)\}
                                    (?P<content>.*?)
                                    \\end\{(?P<endname>verbatim)\}""",
                                    flags=re.DOTALL | re.VERBOSE | re.MULTILINE)
        self.tex_str = tex_str

    def _replace_verbatim(self, matchobj):
        """Return the fenced code block that replaces one verbatim match.

        The fence is tagged ``python`` when the content contains one of a
        few tell-tale tokens, and is left untagged otherwise.
        """
        # The newlines next to \begin{verbatim} / \end{verbatim} belong to
        # the LaTeX markup, not to the code, so they are dropped to avoid
        # blank first and last lines inside the fence.
        content = matchobj.group("content").strip("\n")
        # a few key words to detect python
        detected_tokens = ("torch", "numpy", "torchvision", "import os")
        is_python = any(token in content for token in detected_tokens)
        lang_token = "python" if is_python else ""
        return "```{}\n{}\n```\n".format(lang_token, content)

    def _sub(self):
        """Apply all whole-file substitutions and return the result."""
        return self._codes_re.sub(self._replace_verbatim, self.tex_str)

    def to_markdown(self):
        """Return the stored text with verbatim blocks converted."""
        return self._sub()
def matchlist(reg, string):
    """Return every non-overlapping match of ``reg`` in ``string``.

    Args:
        reg: A compiled regular expression.
        string: The text to scan.

    Returns:
        A list of match objects in order of appearance. Their positions
        refer to ``string`` itself.
    """
    # finditer always makes progress, even for zero-width matches, whereas
    # repeatedly re-searching a sliced copy of the string never advances
    # past an empty match.
    return list(reg.finditer(string))
# sub reference | |
def subcrossref(string, root):
    r"""Substitute the cross references in a TeX string.

    The bibliography file named by ``\bibliography{name}`` is parsed, every
    single-key ``\cite{key}`` is replaced by a numeric marker such as
    ``[1]``, and a "Reference" section listing the cited entries is
    appended.

    Args:
        string: The text in which to replace the citations.
        root: Directory containing the ``.bib`` file; an empty string means
            the current working directory.

    Returns:
        The converted text. When no ``\bibliography`` command is present
        the input is returned unchanged.
    """
    _bibtexref_re = re.compile(r"\\bibliography\{(?P<bibname>\w+)\}")
    _cite_re = re.compile(r"\\cite\{(?P<citename>\w+)\}")

    # find the path of bibtex and parse it
    matchobj = _bibtexref_re.search(string)
    if not matchobj:
        print("No bibliography finded!")
        return string

    # os.path.join keeps the path relative when root is empty; plain
    # concatenation with os.sep would point at the filesystem root.
    bibpath = os.path.join(root, matchobj.group("bibname") + ".bib")
    bib = BibtexParser(bibpath)
    entries = {ref["refname"]: ref for ref in bib.getitem()}

    # Number the cited keys in order of first appearance. A dict keeps
    # insertion order, so the numbering is the same on every run.
    cited = dict.fromkeys(
        match.group("citename") for match in _cite_re.finditer(string))
    numbering = {}
    for citename in cited:
        if citename in entries:
            numbering[citename] = len(numbering) + 1
        else:
            print("Citation '{}' not found in {}".format(citename, bibpath))

    # generate text for references
    reftext = "\n".join(
        "{}. {}".format(idx, bib.toplaintext(entries[citename]))
        for citename, idx in numbering.items())

    # substitute the citations
    def subcite(matchobj):
        citename = matchobj.group("citename")
        if citename not in numbering:
            # Unknown key: leave the original \cite{...} in place.
            return matchobj.group(0)
        return "[{}]".format(numbering[citename])

    string = _cite_re.sub(subcite, string)

    # add references text
    string += "\n\n## Reference\n***\n\n{}\n".format(reftext)
    return string
#------------------------------------------------------------------------------ | |
def options(argv=None):
    """Parse the command-line arguments.

    Args:
        argv: Argument list to parse. ``None`` (the default) makes argparse
            read the arguments of the running process.

    Returns:
        The parsed namespace with the attributes ``infile``, ``outfile``
        and ``encoding``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("infile", type=str, help="Input TeX file")
    parser.add_argument("outfile", type=str, help="Ouput Markdown file")
    parser.add_argument("-e", "--encoding", type=str, default="utf-8",
                        help="Encoding for reading and writing files")
    return parser.parse_args(argv)
def main():
    """Convert the input TeX file to Markdown and write the output file.

    The conversion runs in three stages: the LaTeX2Markdown block/header
    conversion, the whole-file substitutions of EntireSub, and the
    cross-reference substitution.
    """
    opt = options()
    input_file, output_file, encoding = opt.infile, opt.outfile, opt.encoding
    # The bibliography is looked up next to the input file. Fall back to the
    # current directory for a bare file name so that the root is never empty.
    fileroot = os.path.dirname(input_file) or os.curdir

    # AMS convert
    with open(input_file, 'r', encoding=encoding) as f:
        latex_string = f.read()
    tex_str = LaTeX2Markdown(latex_string).to_markdown()

    # Other convert (based on entire files)
    tex_str = EntireSub(tex_str).to_markdown()

    # cross-ref
    tex_str = subcrossref(tex_str, fileroot)

    with open(output_file, 'w', encoding=encoding) as f_out:
        f_out.write(tex_str)
# Run the converter only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment