Last active
December 27, 2018 18:27
-
-
Save bpj/ebdc1e8f88409d13931ebf4b638265b0 to your computer and use it in GitHub Desktop.
Remove unnecessary backslashes in Markdown which has been reformatted by Pandoc
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env perl | |
use utf8; | |
use v5.14; | |
use strict; | |
use warnings; | |
use warnings qw(FATAL utf8); | |
use open qw(:std :utf8); | |
use charnames qw(:full :short); | |
# This regex can be used to remove unnecessary backslashes in
# Markdown which has been reformatted by Pandoc.
# Substrings which are captured in the (first and only) capture group
# should be *kept*. Backslashes which are matched outside the capture
# group should be discarded!
# If you think this regex is too aggressive, or not aggressive enough,
# in what it keeps, please comment out/uncomment those alternatives
# which you want to omit or apply, just as some more aggressive
# variants are commented out already.
# It should be trivial to use this pattern in Python with the
# so-called "new" `regex` module <https://pypi.org/project/regex/>
# like this: `regex.sub(pattern,(lambda m: m.group(1) or ""),text)`
#
# NOTE: the code alternative must NOT put a \K after the leading run of
# backslash pairs. \K only trims the overall match, not the capture
# group, so the pairs would be part of the capture but not of the
# replaced span, and every substitution would emit them a second time.
# Matching the pairs normally makes them part of both, so they are
# replaced by themselves.
my $backslash_regex = qr{(?mx)
    (   # KEEP CAPTURED SUBSTRINGS
        # FIRST MATCH DELIMITED AND FENCED CODE
        # not preceded by an escaping backslash; any complete pairs of
        # backslashes in front of the code are literal backslashes and
        # are kept together with the code
        (?<! \\ ) (?: \\ \\ )*
        (?msx:
            # code in backticks
            (?P<BackTicks> \`+ ) .+? (?P=BackTicks)
        |   # code in tildes
            # can't be more precise than this since a code block
            # may be the first thing in a list item
            (?P<Tildes> \~{3,} ) .+? (?P=Tildes)
        )
    |   # backslash itself
        \\ \\
    |   # any backslashes followed by letters (TeX, including XeTeX)
        \\ \pL
    |   # hard space and hard line break
        \\ (?: \N{SPACE} | \r | \n )
    |   # inline style marker
        \\ [_*~^`]
    |   # things which otherwise might be an ordered list marker
        ^ \s* [A-Za-z0-9]+ \\ [\.\)]        # 1. a. i. 1) a) i)
    |   ^ \s* \\? \( [A-Za-z0-9]+ \\? \)    # (1) (a) (i)
    # | # a more aggressive, reformatting-safe version of
    #   # things which otherwise might be an ordered list marker
    #   [A-Za-z0-9]+ \\ [.)]
    |   # things which otherwise might be an unordered/definition list marker
        ^ \s* \\ [-+*~:] \s+
    # | # square brackets are mostly harmless
    #   # if you run with -shortcut_reference_links
    #   # OTHERWISE UNCOMMENT THIS ALTERNATIVE
    #   \\ \[ | \\ \]
    |   # things which otherwise might be an attribute block
        (?<= \] | \` ) \\ \{
    |   # things which otherwise might be an inline link
        (?<= \] ) \\ \(
    |   # things which otherwise might be HTML or raw links
        \\ \< | \\ \>
    )
    |   # any other backslash: matched outside the capture, so discarded
        \\
};
# Read the input from `<>` in a single read: with the input record
# separator undefined there is no line splitting.
my $document = do {
    local $/ = undef;
    <>;
};
# Replace every match by what the capture group kept; a match without
# a capture is a lone unnecessary backslash and is replaced by the
# empty string, i.e. removed.
$document =~ s{$backslash_regex}{ defined($1) ? $1 : "" }eg;
# Write the cleaned-up document to STDOUT.
print $document;
__END__ |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment