bpj/pandoc-backslashes.pl

## pandoc-backslashes.pl
#!/usr/bin/env perl

use utf8;
use v5.14;
use strict;
use warnings;
use warnings  qw(FATAL utf8);
use open      qw(:std :utf8);
use charnames qw(:full :short);

# This regex can be used to remove unnecessary backslashes in
# Markdown which has been reformatted by Pandoc.
# Substrings which are captured in the first (outermost) capture group
# should be *kept*. Backslashes which are matched outside that capture
# group should be discarded!

# If you think this regex is too aggressive, or not aggressive enough,
# in what it keeps, please comment out/uncomment those alternatives
# which you want to omit or apply, just as some more aggressive
# variants are commented out already.

# It should be trivial to use this pattern in Python with the
# so-called "new" `regex` module <https://pypi.org/project/regex/>
# like this: `regex.sub(pattern,(lambda m: m.group(1) or ""),text)`

# Compiled pattern used to strip superfluous backslashes from Markdown.
# Everything that must survive is matched inside capture group 1 (the
# outermost group); a backslash matched only by the final, uncaptured
# alternative is meant to be replaced by the empty string.
#
# The code-span alternative intentionally has no "keep" reset between the
# leading backslash pairs and the code itself: such a reset shrinks the
# replaced text to the code alone while group 1 still holds the pairs, so
# substituting group 1 back in would duplicate the pairs on every run.
# Letting the pairs be part of both the match and the capture keeps them
# exactly once.
my $backslash_regex = qr{(?mx)
    (   # KEEP CAPTURED SUBSTRINGS

        # FIRST MATCH DELIMITED AND FENCED CODE
        # not preceded by an escaping backslash; escaped backslashes
        # (pairs) directly in front of the code are kept with it
        (?<! \\ ) (?: \\ \\ )*
        (?msx:
            # code in backticks
            (?P<BackTicks> \`+ ) .+? (?P=BackTicks)
        |   # code in tildes
            # can't be more precise than this since a code block
            # may be the first thing in a list item
            (?P<Tildes> \~{3,} ) .+? (?P=Tildes)
        )
    |   # backslash itself
        \\ \\
    |   # any backslashes followed by letters (TeX, including XeTeX)
        \\ \pL
    |   # hard space and hard line break
        \\ (?: \N{SPACE} | \r | \n )
    |   # inline style marker
        \\ [_*~^`]
    |   # things which otherwise might be an ordered list marker
        ^ \s* [A-Za-z0-9]+ \\ [\.\)]       # 1. a. i. 1) a) i)
    |   ^ \s* \\? \( [A-Za-z0-9]+ \\? \)   # (1) (a) (i)
    # |   # a more aggressive, reformatting-safe version of
    #     # things which otherwise might be an ordered list marker
    #     [A-Za-z0-9]+ \\ [.)]
    |   # things which otherwise might be an unordered/definition list marker
        ^ \s* \\ [-+*~:] \s+
    # |   # square brackets are mostly harmless
    #     # if you run with -shortcut_reference_links
    #     # OTHERWISE UNCOMMENT THIS ALTERNATIVE
    #     \\ \[ | \\ \]
    |   # things which otherwise might be an attribute block
        (?<= \] | \` ) \\ \{
    |   # things which otherwise might be an inline link
        (?<= \] ) \\ \(
    |   # things which otherwise might be HTML or raw links
        \\ \< | \\ \>
    )
    |   \\  # any other backslash: matched outside group 1, so discarded
};

# Read all input (the files named on the command line, or STDIN when there
# are none) as one string; an undefined input record separator turns off
# line-by-line reading for the duration of the block.
my $text = do {
    local $/ = undef;
    <>;
};

# Replace every match with the contents of capture group 1 when that group
# took part in the match, and with the empty string otherwise, so that
# only the backslashes matched outside the group disappear.
$text =~ s{$backslash_regex}{ defined($1) ? $1 : "" }eg;

# Write the filtered document to standard output.
print $text;

__END__
	#!/usr/bin/env perl

	use utf8;
	use v5.14;
	use strict;
	use warnings;
	use warnings qw(FATAL utf8);
	use open qw(:std :utf8);
	use charnames qw(:full :short);

	# This regex can be used to remove unnecessary backslashes in
	# Markdown which has been reformatted by Pandoc.
	# Substrings which are captured in the (first and only) capture group
	# should be kept. Backslashes which are matched outside the capture
	# group should be discarded!

	# If you think this regex is too aggressive, or not aggressive enough,
	# in what it keeps please comment out/uncomment those alternatives
	# which you want to omit or apply, just as some more aggressive
	# variants are commented out already.

	# It should be trivial to use this pattern in Python with the
	# so-called "new" `regex` module <https://pypi.org/project/regex/>
	# like this: `regex.sub(pattern,(lambda m: m.group(1) or ""),text)`

	my $backslash_regex = qr{(?mx)
	( # KEEP CAPTURED SUBSTRINGS

	# FIRST MATCH DELIMITED AND FENCED CODE
	# not preceded by an escaping backslash
	(?<! \\ ) (?: \\ \\ )* \K
	(?msx:
	# code in backticks
	(?P<BackTicks> \`+ ) .+? (?P=BackTicks)
	\| # code in tildes
	# can't be more precise than this since a code block
	# may be the first thing in a list item
	(?P<Tildes> \~{3,} ) .+? (?P=Tildes)
	)
	\| # backslash itself
	\\ \\
	\| # any backslashes followed by letters (TeX, including XeTeX)
	\\ \pL
	\| # hard space and hard line break
	\\ (?: \N{SPACE} \| \r \| \n )
	\| # inline style marker
	\\ [_*~^`]
	\| # things which otherwise might be an ordered list marker
	^ \s* [A-Za-z0-9]+ \\ [\.\)] # 1. a. i. 1) a) i)
	\| ^ \s* \\? \( [A-Za-z0-9]+ \\? \) # (1) (a) (i)
	# \| # a more aggressive, reformatting-safe version of
	# # things which otherwise might be an ordered list marker
	# [A-Za-z0-9]+ \\ [.)]
	\| # things which otherwise might be an unordered/definition list marker
	^ \s* \\ [-+*~:] \s+
	# \| # square brackets are mostly harmless
	# # if you run with -shortcut_reference_links
	# # OTHERWISE UNCOMMENT THIS ALTERNATIVE
	# \\ \[ \| \\ \]
	\| # things which otherwise might be an attribute block
	(?<= \] \| \` ) \\ \{
	\| # things which otherwise might be an inline link
	(?<= \] ) \\ \(
	\| # things which otherwise might be HTML or raw links
	\\ \< \| \\ \>
	)
	\| \\
	};

	# slurp the whole document content/STDIN
	my $text = do { local $/; <>; };

	# keep capture contents, discard the rest:
	# return the capture if it is defined, else return the empty string
	# could also be written `defined($1) ? $1 : ""`
	$text =~ s{$backslash_regex}{ $1 // "" }eg;

	# print contents to STDOUT
	print $text;

	__END__