Skip to content

Instantly share code, notes, and snippets.

@mbroedl
Created July 5, 2019 12:42
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 2 You must be signed in to fork a gist
  • Save mbroedl/63ab97f6b88432954670d4158854ad15 to your computer and use it in GitHub Desktop.
Save mbroedl/63ab97f6b88432954670d4158854ad15 to your computer and use it in GitHub Desktop.
Inverse pandoc citeproc from docx to markdown
#!/bin/env python
'''
Due to changes introduced with pandoc 2, this script currently does little more than invert (some) citations and re-wrap lines into roughly semantic units.
I previously had some pandoc filters that also converted track changes to CriticMarkup, could accept or reject them, and merged comments to footnotes or html comments;
due to the change in pandoc filters they don't work at the moment, so that functionality is not used for now (but it is implemented in the script).
Usage:
`pandoc -s test.md --bibliography library.bib -M link-citations -t docx -o test.docx`
`pandoc -s test.docx --wrap=none --track-changes=all --atx-headers --reference-location=section -t markdown | ./invert-citeproc.py > test.rev.md`
**Note:** With earlier versions of pandoc, the script was able to read diacritics from the references section.
As empty anchors seem to be stripped by now, this is not possible anymore, and references are currently ignored.
That earlier workflow added an intermediate pipe stage which moved lines starting with `[]{#ref-}` to the front, so that they were read first by the following postprocessing stage.
The usage to revert back to markdown used to be as follows:
`pandoc -s test.docx --wrap=none --track-changes=all --atx-headers --reference-location=section -t markdown | python -c 'import sys; sys.stdout.writelines(sorted(sys.stdin.readlines(), key=lambda x: not x.startswith("[]{#ref-")))' | ./invert-citeproc.py > test.rev.md`
**Caveats:**
* diacritics don't work as of now, as the name is guessed from the citekey
* some citation styles may not work
* there are contractions when using page references, which vary by CSL used
'''
import sys
import re
import os
# How tracked changes are resolved: "accept" keeps insertions and drops
# deletions, "reject" does the opposite, any other value leaves them alone.
mode = "accept"
# mode = "reject"
# mode = "all"

# What happens to comments: 'after-line' moves them into an HTML comment next
# to the commented line, 'footnote' turns them into footnotes, 'remove'
# deletes them together with the highlight markers.
commentMode = 'after-line'
# commentMode = 'remove'
# commentMode = 'footnote'
# removeHighlightThreshold = 0.9 # TODO not yet implemented

# Tag placed at the start of every emitted comment; the COMMENTATOR
# environment variable, when set, replaces the default "CMT".
if 'COMMENTATOR' in os.environ:
    commentPrefix = '@{}:'.format(os.environ['COMMENTATOR'])
else:
    commentPrefix = '@CMT:'

# Module-wide state shared by the processing functions below.
commentid = 0  # running number used for footnote-style comments
citationMap = {}  # citekey -> author name taken from the reference list
failedCitations = []  # reference entries that still contain change markup
def splitSemantics(matchobj):
    """Put each sentence-like unit of the matched text on its own line.

    Intended as a ``re.sub`` callback. A whitespace character that directly
    follows one of ``. ? ! ; :`` is replaced by a newline, except when

    * the preceding characters look like an abbreviation (``e.g.``, ``Dr.``
      or a single capital initial such as `` J.``), or
    * the whitespace character is itself a newline.

    Returns the whole match (group 0) with those replacements applied.
    """
    sentence_gap = re.compile(
        r"(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<! [A-Z]\.)(?!\n)(?<=[.?!;:])\s"
    )
    matched_text = matchobj.group(0)
    return sentence_gap.sub("\n", matched_text)
def moveComment(matchobj, line):
    """Relocate one CriticMarkup comment according to ``commentMode``.

    Used by ``processLine`` as a ``re.sub`` callback for the pattern
    ``(==})({>>)(.*?)(<<})(.*)(?=\\n)``.

    Args:
        matchobj: match whose group 1 is the closing highlight marker
            ``==}``, group 3 the comment text and group 5 the remainder of
            the line after the comment.
        line: the complete text being processed; it is searched for the
            highlighted passage the comment belongs to.

    Returns:
        The replacement text. In ``'footnote'`` mode the comment becomes a
        ``[^cmt-N]`` reference and its body is appended to the global
        ``comments`` list. In ``'after-line'`` mode the comment is emitted
        as an HTML comment on a separate line next to the commented line.
        For any other mode the function returns ``None`` implicitly.
    """
    global commentid
    comment_text = matchobj.group(3)
    if commentMode == 'footnote':
        cmt = '[^cmt-' + str(commentid) + ']'
        commentid += 1
        comments.append(cmt + ': ' + re.sub(' (¶ )+', '\n', comment_text))
        return matchobj.group(1) + cmt + matchobj.group(5)
    elif commentMode == 'after-line':
        # The comment text is data, not a pattern: escape it so that
        # characters such as "(", "?" or "[" at the start of a comment
        # neither break the regex nor prevent the highlight being found.
        highlight = re.search(
            r'({==|^)((?:(?!(?:==}|{==)).)*?)==}{>>'
            + re.escape(comment_text[:5]),
            line,
            re.M
        )
        rtrn = matchobj.group(1) + matchobj.group(5)
        this_line = ''
        if not highlight:
            cmt = '<!-- {} [ERROR] {}-->'.format(commentPrefix, comment_text)
        else:
            # + 6 presumably accounts for the `{==` and `==}` markers.
            len_highlight = len(highlight.group(2)) + 6
            # The highlight match never spans a newline, so exactly the
            # physical line that contains it is selected here.
            this_line = next(
                candidate for candidate in line.split('\n')
                if highlight.group(0) in candidate
            )
            len_this_line = len(this_line) - len(comment_text)
            if len_highlight > 1.0 * len_this_line:
                label = '[PARAGRAPH] '
                # WON'T WORK
            elif len_highlight > len_this_line * 0.70:
                label = '[LINE] '
            else:
                label = highlight.group(2)
            # Quote the highlighted words unless a bracketed label is used.
            if len(label.strip()) > 0 and label[0] != '[':
                label = '>>' + label + '<< '
            cmt = ('<!-- ' + commentPrefix + ' ' + label
                   + re.sub(' (¶ )+', '\n', comment_text) + ' -->')
        if this_line and this_line.startswith('==}'):
            rtrn = cmt + '\n' + rtrn
        else:
            rtrn = rtrn + '\n' + cmt
        # comments.append(cmt)
        return rtrn
def recreateCitation(matchobj):
    """Turn one rendered citation back into pandoc ``@citekey`` syntax.

    Used by ``processLine`` as a ``re.sub`` callback. The match groups are:

    1. preface before the bracket (only when group 2 is ``(``)
    2. ``(``, ``;`` or ``,``
    3. preface inside the bracket, if any
    4. year
    5. citation key
    6. page number etc., if any
    7. closing bracket, if any

    The author name is looked up in the global ``citationMap``; when it is
    missing there (or stored as ``None``/empty) it is guessed from the
    citekey by cutting everything from the first digit and capitalising.

    Returns:
        The reconstructed citation text.

    Raises:
        ValueError: if a continuation (``;`` or ``,``) carries a preface,
            which the calling regex is not expected to produce.
    """
    citekey = matchobj.group(5)
    # TODO note that guessing
    author = (citationMap.get(citekey)
              or re.sub('[0-9]+.*', '', citekey).capitalize())

    def removeAuthor(author, string):
        # An empty author cannot be located: '' is "in" every string and
        # str.split('') raises ValueError, so leave the text untouched.
        if author and author in string:
            return string.split(author, 1)[0]
        return string

    if matchobj.group(2) in [';', ',']:
        if matchobj.group(1) != '':
            raise ValueError('Regex Mistake in `' + matchobj.group(0) + '`')
        citation = '; '
        nobracket = False
    else:
        citation = removeAuthor(author, matchobj.group(1))
        # The author standing in the running text means an in-text
        # citation, which is written without surrounding brackets.
        nobracket = bool(author) and author in matchobj.group(1)
        if not nobracket:
            citation += '['
    citation += removeAuthor(author, matchobj.group(3))
    citation += '@' + citekey
    citation += matchobj.group(6)
    if matchobj.group(7) == ')' and not nobracket:
        citation += ']'
    return citation
def rememberCitation(matchobj):
    """Record a reference-list entry and remove it from the text.

    Used by ``processLine`` as a ``re.sub`` callback. Group 1 of the match
    is the citekey and group 2 the first alphabetic word of the entry, which
    is stored as the author name in the global ``citationMap``. Group 2 is
    optional in the calling pattern; when it did not match, nothing is
    stored, so that no ``None`` author ends up in the map.

    Entries that still contain CriticMarkup markers are additionally
    appended to the global ``failedCitations`` list.

    Returns:
        An empty string, i.e. the matched entry is deleted.
    """
    citekey, author = matchobj.group(1), matchobj.group(2)
    if author is not None:
        citationMap[citekey] = author
    entry = matchobj.group(0)
    if re.search(r"(\{[+\-=>]{2}|[<+\-=]{2}\})", entry) is not None:
        failedCitations.append(entry)
    return ''
def processLine(line):
    """Convert one chunk of pandoc markdown output back to source form.

    The text is passed through a fixed sequence of regex substitutions:
    reference entries are harvested and removed, tracked changes are
    accepted or rejected according to ``mode``, rendered citations are
    turned back into ``@citekey`` form, sentences are put on separate
    lines, comments are moved according to ``commentMode`` and a number of
    typographic clean-ups are applied. The order of the steps matters.

    Resets the global ``comments`` list; footnote-style comments collected
    while processing are appended to the returned text.

    Args:
        line: the text to process (``__main__`` passes one input line).

    Returns:
        The processed text.
    """
    global comments
    comments = []
    line = re.sub(
        r"\\--",
        "--",
        line
    )
    # remove references
    line = re.sub(
        r"\[\]{#ref-([^ ]+) .anchor}\W*([A-Za-z]+)?(?:(?!(?:\n\n.|\[\^.)).)*",
        rememberCitation,
        line
    )
    # line = re.sub(
    # r"^(#.*){#.*}$",
    # "\1",
    # line
    # )
    line = re.sub(
        r"{>>author:[^<]+<<}",
        "",
        line
    )
    if mode == "accept":
        # drop deletions entirely, keep insertions without their markers
        line = re.sub(
            r"({--(?:(?!--}).)*--}|{\+\+|\+\+})",
            "",
            line
        )
        # replace paragraph breaks
        line = re.sub(
            r"\[\]{.paragraph-insertion [^}]*}",
            "\n\n",
            line
        )
    elif mode == "reject":
        # drop insertions entirely, keep deletions without their markers
        line = re.sub(
            r"({\+\+(?:(?!\+\+}).)*\+\+}|{--|--})",
            "",
            line
        )
        # replace paragraph breaks
        line = re.sub(
            r"\[\]{.paragraph-insertion [^}]*}",
            "",
            line
        )
    if commentMode == "remove":
        line = re.sub(
            r"({>>(?:(?!<<}).)*<<}|{==|==})",
            "",
            line
        )
    line = re.sub(
        # r"\(.*?, \[[0-9]{4}[a-z]?\]\(#ref-([^)]+?(?:[0-9]{4})?[^)]*?)\)(, [^)]+)?\)",
        r"(?:((?:[\w,-_;.&]+ ){,6})([\(;,]))([^\(]*?(?:{==)?)\[([([0-9]{4}[a-z]?)\](?:\(\\l\)\[a\])?\(#ref-([\w_-]+)\)(.*?)((?:{==.*==})?\)|(?=;)|(?=, \[))",
        recreateCitation,
        line
    )
    line = re.sub(
        r"((?:^|[+\-<]{2}}|\]|\))(.*?)(?:\[|\(|{[+\->]{2}|$))",
        splitSemantics,
        line
    )
    # Move comments until nothing changes any more; moveComment gets the
    # text as it was before the current pass.
    while True:
        oldline = line[:]
        line = re.sub(
            r"(==})({>>)(.*?)(<<})(.*)(?=\n)",
            lambda x: moveComment(x, line),
            line
        )
        if oldline == line:
            break
    # line = re.sub(
    # r"[A-Z][a-z]+(?:(?:,| &| and|) \w+)+ \(\[[0-9]{4}[a-z]?\]\(#ref-((?:[^)0-9]+?)(?:[0-9]{4})?[^)0-9]*?)\)(?:, ([^)]+))?\)",
    # "@\1\2",
    # line
    # )
    # fix italics
    line = re.sub(r"(?<=\W)\*(\S+(?:\s+[^*\s]+)*?)\*(?=\W)", r"_\1_", line)
    ## TODO doesn't work with BOLD!! i.e. bold needs to be protected
    # fix empty additions
    line = re.sub(r" +{\+\+ +\+\+}", " ", line)
    # remove highlights
    if commentMode == 'after-line':
        line = re.sub(r"({==|==})", "", line)
    # fix empty space before punctuation
    line = re.sub(r" (?=[.;:,])", "", line)
    # fix extra whitespace
    line = re.sub(r" +", " ", line)
    # fix em-dashes
    line = re.sub(r"---", "—", line)
    # fix empty lines
    line = re.sub(r"\n{2,}", "\n\n", line)
    # add line break before 2nd (and deeper) level heading; only the start
    # of a run of "#" is matched so that "####" is not torn into two "##"
    line = re.sub(r"(?<!#)##", "\n##", line)
    # fix escaped brackets
    line = re.sub(r"\\\[", "[", line)
    line = re.sub(r"\\\]", "]", line)
    # fix protected spaces
    line = re.sub(r"[\u202F\u00A0]", " ", line)
    # fix page numbers
    line = re.sub(r"(?<=p\.)\s(?=\d)", "", line)
    line = re.sub(r"\((p\.\d+)\)", r"[\1]", line)
    # remove header ids; multi-line anchors are needed because the heading
    # step above has put a newline in front of every "##" heading
    line = re.sub(r"(?<=^)(#.*) {#.*$", r"\1", line, flags=re.M)
    # fix footnotes
    line = re.sub(r"^(\[\^\w+\]:)\n", r"\1 ", line)
    # failed comments
    line = re.sub(r"{>>", "<!-- {} [START] -->".format(commentPrefix), line)
    line = re.sub(r"<<}", "<!-- {} [END] -->".format(commentPrefix), line)
    # put every HTML comment on a line of its own
    line = re.sub(r"(\S)\s*(<!--)", r"\1\n\2", line)
    line = re.sub(r"(-->)\s*(\S)", r"\1\n\2", line)
    if comments:
        emptylines = '\n' if commentMode == 'footnote' else ''
        line = line + emptylines + '\n'.join(comments) + emptylines + '\n'
    return line
if __name__ == "__main__":
    # Filter stdin to stdout, collapsing consecutive blank results into one.
    previous_blank = False
    for raw in sys.stdin:
        processed = processLine(raw)
        is_blank = not processed.strip()
        if is_blank and previous_blank:
            continue
        previous_blank = is_blank
        # sys.stderr.write("DEBUG: got line: " + line)
        sys.stdout.write(processed)
    # Reference entries that still carried change markup are reported in a
    # trailing HTML comment.
    if failedCitations:
        sys.stdout.write('\n<!--NOTES IN THE CITATIONS-->\n<!--\n')
        sys.stdout.write('\n'.join(failedCitations))
        sys.stdout.write('\n-->\n')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment