bpj/pandoc-class2style.pl

## pandoc-class2style.pl
#!/usr/bin/env perl

# pandoc-class2style.pl - filter to translate single span/div classes into LaTeX commands and attribute lists

# POD documentation below the code!

use utf8;
use autodie 2.29;
use 5.010001;
use strict;
use warnings;
use warnings qw(FATAL utf8);

use Carp qw[ carp croak ];

use Pandoc::Elements 0.33;
use Pandoc::Walker 0.27 qw[ action transform  ];

# Coerce a value into a hashref.
#
#   _hashify $value, $default, %opt
#
# * An undefined $value counts as an empty hash.
# * A hashref is used as it is.
# * Any other value is paired with $default: normally $default is the key
#   and $value the value, but with a true $opt{key} the roles are swapped
#   so that $value is the key and $default the value.
#
# A shallow copy of the resulting hash is returned, unless $opt{clone} was
# passed with a false value, in which case the hashref itself is returned.
sub _hashify ($$;%) {
    my ( $value, $default, %opt ) = @_;
    $value //= {};
    unless ( 'HASH' eq ref $value ) {
        my ( $k, $v ) = $opt{key} ? ( $value, $default ) : ( $default, $value );
        $value = { $k => $v };
    }
    ## cloning is the default; only an explicit false 'clone' turns it off
    my $want_clone = exists $opt{clone} ? $opt{clone} : 1;
    return $want_clone ? +{%$value} : $value;
}

# Coerce a value into an arrayref.
#
#   _listify $value, %opt
#
# * An undefined $value becomes an empty list.
# * An arrayref is used as it is.
# * Any other value becomes the only item of a new list.
#
# A shallow copy of the resulting list is returned, unless $opt{clone} was
# passed with a false value, in which case the arrayref itself is returned.
sub _listify ($;%) {
    my ( $value, %opt ) = @_;
    my $list
      = !defined $value       ? []
      : 'ARRAY' eq ref $value ? $value
      :                         [$value];
    ## cloning is the default; only an explicit false 'clone' turns it off
    return $list if exists $opt{clone} and not $opt{clone};
    return [@$list];
}

# These formats require HTML-style output.  Add more as needed!
my @html_formats = qw[ html html5 epub ];

# The first command line argument is taken as the output format name.
# Fall back to the empty string when there is none, so that the string
# comparisons below don't warn about an uninitialized value and the
# generic "some other format" handlers get used.
my $out_format = shift( @ARGV ) // '';

# Read the *whole* input, not just its first line, so that JSON which
# spans several lines (e.g. pretty-printed by another tool) is not truncated.
my $json = do { local $/; <> };
my $doc  = pandoc_json( $json );

my $meta = $doc->meta;  # Metadata
## $meta->value('foo') = get value of metadata field 'foo' as plain Perl data structure

# produce html-style output if an html output format is selected
# or the metadata field 'class2style_html' has a true value
my ( $to_html ) = ( grep { $_ eq $out_format } @html_formats );
$to_html ||= $meta->value( 'class2style_html' );
# produce docx-style output if docx output format is selected
# or the metadata field 'class2style_docx' has a true value
my $to_docx
  = ( ( 'docx' eq $out_format ) || $meta->value( 'class2style_docx' ) ) // 0;
# produce latex-style output if latex output format is selected
my $to_latex = 'latex' eq $out_format;

# The element handlers further down are chosen with the precedence
# latex > docx > html, while the 'style' definitions are chosen and
# normalized by looking at $to_html first.  Make the three modes mutually
# exclusive, in the precedence order of the handlers, so that the
# definitions which get loaded always are of the kind the selected
# handlers expect (e.g. with '-t docx -M class2style_html').
$to_docx = 0 if $to_latex;
$to_html = 0 if $to_latex || $to_docx;

# keep existing classes if the metadata field 'class2style_keep' has a true value
my $keep_classes = $meta->value('class2style_keep') // 0;

# Get the 'style' definitions from the metadata field 'class2style',
# which must be a mapping from format names to definitions.
my $style4class = $meta->value( 'class2style' ) // +{};
croak "Metadata-->class2style must be mapping"
  unless 'HASH' eq ref $style4class;

# Pick the definitions which apply to this run:
#   html-style output: those for the output format, else those for 'html'
#   docx-style output: those for 'docx'
#   anything else:     those named after the output format
{
    my $by_format = $style4class;
    $style4class
      = $to_html ? $by_format->{$out_format} // $by_format->{html} // []
      : $to_docx ? $by_format->{docx} // []
      :            $by_format->{$out_format} // [];
}

# The definitions may be a single mapping or a list of strings and/or
# mappings.  Make it a list, with each string STRING expanded into
# the single-pair mapping { STRING => STRING }.
$style4class = _listify( $style4class );
for my $def ( @$style4class ) {
    $def = +{ $def => $def } unless 'HASH' eq ref $def;
}

# Merge the list of mappings into one mapping from class to 'style'.
# Later list items override earlier ones with the same key.
{
    my %merged;
    for my $def ( @$style4class ) {
        @merged{ keys %$def } = values %$def;
    }
    $style4class = \%merged;
}

# For html-style output each 'style' should be a mapping with
# attribute--value pairs.  A string becomes the value of the key 'class'.
if ( $to_html ) {
    $_ = _hashify( $_, 'class' ) for values %$style4class;
}

## Alternative interface: pick up classes ending in a dot
## and use them as command/environment/style names
my $class_re = qr/(?<!\S)(\pL+)\.(?!\S)/;


# Perform different actions depending on output format/style.
#
# %actions maps element selectors (element names joined with '|') to
# handlers.  Exactly one set of handlers is installed, chosen in the order
# latex, docx, html, other.  Every handler receives the matched element and
# the compiled action; it uses the latter to recurse into the content of the
# element.  A handler returns early with nothing when no 'style' applies
# to the element.
my %actions = $to_latex ? (
    # wrap inline elements in a command
    'Span|Code' => sub {    # { for poor editor
        my ( $elem, $action ) = @_;
        my @commands = get_styles( $elem );
        return unless @commands;
        my $is_code = $elem->name =~ /Code/;
        ## recurse into child elements
        unless ( $is_code ) {
            transform( $elem->content, $action, $action );
        }
        ## replace a span with its content, but keep a code element itself
        my @ret = $is_code ? $elem : @{ $elem->content };
        ## step thru commands in reverse order to keep left--right sequence:
        ## the command for the leftmost class is added last = outermost
        for my $com ( reverse @commands ) {
            no warnings qw[ uninitialized numeric ]; # in case there is no AFTER
            ## COM becomes {before=>COM, after=>''} unless COM is a hash
            ## (_hashify returns a copy, so the definition itself is not modified)
            $com = _hashify $com, 'before';
            ## CONTENT becomes \BEFORE{CONTENT}AFTER
            ## wrap before in \ and { unless any
            $com->{before} =~ s/^(?!\\)/\\/;
            $com->{before} =~ s/\{?$/\{/;
            ## prepend } to after unless any
            $com->{after} =~ s/^(?!\})/\}/;
            unshift @ret, RawInline latex => $com->{before}; # prepend BEFORE
            push @ret, RawInline latex => $com->{after};    # append AFTER
        }
        return \@ret;
    },
    # Wrap block elements in an environment
    'Div|CodeBlock' => sub {
        my ( $elem, $action ) = @_;
        my @envs = get_styles( $elem );
        return unless @envs;
        my $is_code = $elem->name =~ /Code/;
        ## recurse into child elements
        unless ( $is_code ) {
            transform( $elem->content, $action, $action );
        }
        ## the element itself is kept, between the \begin and \end blocks
        my @ret = $elem;
        ## reverse order so that the leftmost class becomes the outermost environment
        for my $env ( reverse @envs ) {
            no warnings qw[ uninitialized numeric ]; # in case there are no arguments
            ## $env becomes {name=>ENVNAME, args=>''} unless $env is a hash
            $env = _hashify $env, 'name';
            ## prepend \begin{NAME}ARGS to block
            unshift @ret, RawBlock latex => "\\begin\{$env->{name}\}$env->{args}";
            ## append \end{NAME}
            push @ret, RawBlock latex => "\\end\{$env->{name}\}";
        }
        return \@ret;
    },
  )
  : $to_docx ? (
    # add "custom-style" attributes to elements,
    # possibly after removing existing classes
    'Span|Div' => sub {
        my ( $elem, $action ) = @_;
        my @styles = get_styles( $elem );
        return unless @styles;
        transform( $elem->content, $action, $action );
        $elem->class("") unless $keep_classes;
        ## ['foo', 'bar', 'baz'] becomes 'FooBarBaz'
        ## since docx named styles aren't additive
        my $style = join "", map { ; ucfirst $_ } @styles;
        $elem->add_attribute( 'custom-style' => $style );
        return $elem;
    },
    # For DOCX code elements need to be wrapped in a container element
    'Code|CodeBlock' => sub {
        ## index 0 = inline container, index 1 = block container
        state $wrap  = [ \&Span, \&Div ];
        my ( $elem, $action ) = @_;
        my @styles = get_styles( $elem );
        return unless @styles;
        ## delete existing classes?
        $elem->class("") unless $keep_classes;
        my $style = join "", map { ; ucfirst $_ } @styles;
        my $type = $elem->is_block || 0;
        ## The content of a span/div is a *list* of elements,
        ## so the code element must be passed inside an array reference.
        return $wrap->[$type]->( attributes + { 'custom-style' => $style }, [$elem] );
    },
  )

  # HTML output format
  # Add attributes to elements,
  # possibly after removing existing classes
  ### XXX: we used to wrap a new element for each style;
  ### now the attributes are added to the element itself.
  : $to_html ? (
    'Span|Div|Code|CodeBlock' => sub {
        my ( $elem, $action ) = @_;
        my @styles = get_styles( $elem );
        return unless @styles;
        ## recurse into child elements
        unless ( $elem->name =~ /Code/ ) {
            transform( $elem->content, $action, $action );
        }
        ## non-reference styles are classes.
        ## treat them specially for efficiency.
        my @classes = grep { !ref $_ } @styles;
        ## We turn them into a single 'style'
        push @styles, +{ class => "@classes" } if @classes;
        ## delete existing classes?
        $elem->class("") unless $keep_classes;
        ## loop through the styles, in source order since we aren't wrapping
        for my $style ( grep { ref $_ } @styles ) {
            while ( my @args = each %$style ) { # each key--value pair
                $elem->add_attribute(@args);    # add them to attributes
            }
        }
        return $elem;
    },
  )

  # some other $out_format
  : (
    'Span|Div|Code|CodeBlock' => sub {
        my ( $elem, $action ) = @_;
        my $classes = $elem->class;
        ## Just remove trailing dots from classes;
        ## nothing to do if there are no dotted classes
        return unless $classes =~ s/$class_re/$1/g;
        $elem->class( $classes );
        ## recurse into child elements
        unless ( $elem->name =~ /Code/ ) {
            transform( $elem->content, $action, $action );
        }
        return $elem;
    },
  );

# Compile the selector => handler table into a single action.
my $action = action \%actions;

# Walk the document with the action.  The action is also passed along as an
# extra argument, so that each handler receives it as its second argument
# and can apply it recursively to the content of the element it handles.
$doc->transform( $action, $action );

# Print the modified document as JSON.
print $doc->to_json;

# Return the list of 'styles' which apply to an element.
#
# The class attribute of $elem is split on whitespace and the classes are
# looked at from left to right:
#
# * A class with a defined entry in %$style4class contributes that entry,
#   or every item of it if the entry is an array.
# * Any other class contributes its name minus the trailing dot,
#   if it is a "dotted class" as matched by $class_re.
# * All other classes contribute nothing.
sub get_styles {
    my ( $elem ) = @_;
    my @styles;
    for my $class ( $elem->class =~ /\S+/g ) {
        my $declared = $style4class->{$class};
        if ( defined $declared ) {
            push @styles, @{ _listify( $declared ) };
        }
        else {
            ## the match returns an empty list for an undotted class
            push @styles, $class =~ /$class_re/g;
        }
    }
    return @styles;
}

__END__


=encoding UTF-8

=head1 NAME

pandoc-class2style.pl - filter to translate single pandoc classes into attribute lists or LaTeX commands

=head1 VERSION

1.000

=head1 SYNOPSIS

pandoc -F pandoc-class2style.pl ...

=head1 DESCRIPTION

B<< pandoc-class2style.pl >> is a L<< Pandoc|http://pandoc.org/ >> filter which lets you use spans (or divs) with a single class in your source document and have the necessary LaTeX markup, DOCX custom styles, or HTML attributes of your choice injected during conversion. You still have to wrap the 'special' text in a span or div but since you only need to mark each span with a class with as few letters as you want the source becomes much less cluttered. It also becomes I<< much >> easier to produce multiple formats from the same Markdown source.

You declare a mapping from short classes to LaTeX commands or environments, DOCX custom styles or HTML attributes in your YAML metadata as follows:

    ---
    class2style:
      latex:
        u:    uline
        uu:   uuline
        grc:  textgreek[variant=ancient]
        he:   texthebrew
        la:   textlatin
        sc:   textsc
        blue: textcolor{blue}
      docx:
        - u:   Underlined
          uu:  DoubleUnderlined
          grc: Greek
          he:  Hebrew
          la:  Latin
          sc:  SmallCaps
        - blue
      html:
        u:
          class: uline
        uu:
          class: uuline
        grc:
          lang: grc
        he:
          lang: he
          dir: rtl
        la:
          lang: la
        sc:
          class: small-caps
    lang: en
    otherlangs:
    - grc
    - he
    - la
    mainfont: FreeSerif # or any other font you prefer
    xcolor: hyperref, svgnames
    ...

    [Underlined]{.u} [Double underlined]{.uu}

    [Ἑλληνιστής]{.grc}

    [עִבְרִית‎]{.he}

    [Lingua Romanica]{.la .sc}

    [I'm *blue*!]{.blue}

Running pandoc with this filter gives the following outputs for the above:

C<< pandoc -F pandoc-class2style.pl c2stest.md -t latex >>:

    \uline{Underlined} \uuline{Double underlined}

    \textgreek[variant=ancient]{Ἑλληνιστής}

    \texthebrew{עִבְרִית‎}

    \textlatin{\textsc{Lingua Romanica}}

    \textcolor{blue}{I'm \emph{blue}!}

C<< pandoc -F pandoc-class2style.pl c2stest.md -t html5 >>:

    <p><span class="uline">Underlined</span>
    <span class="uuline">Double underlined</span></p>
    <p><span lang="grc">Ἑλληνιστής</span></p>
    <p><span lang="he" dir="rtl">עִבְרִית‎</span></p>
    <p><span class="small-caps" lang="la">Lingua Romanica</span></p>
    <p><span class="blue">I'm <em>blue</em>!</span></p>

Finally I can't show the DOCX output here, but it is as if the Markdown had been like this:

    [Underlined]{custom-style="Underlined"}
    [Double underlined]{custom-style="DoubleUnderlined"}

    [Ἑλληνιστής]{custom-style="Greek"}

    [עִבְרִית‎]{custom-style="Hebrew"}

    [Lingua Romanica]{custom-style="LatinSmallCaps"}

    [I'm *blue*!]{custom-style="Blue"}

=head2 Note on the terms I<< 'style' >>, I<< CSS style >>, I<<< C<< custom-style >> >>> and I<< DOCX style >>

I originally had three different filters for each of LaTeX, HTML and DOCX with essentially the same interface. When I combined them to make maintenance and configuration easier it was a bit of a problem what to call the combined filter. In the end I decided to use I<< style >> as the most general term, qualified as follows:

The word I<< 'style' >> in scare quotes means any of the AST modifications performed by this filter in order to affect how elements with certain classes are rendered in any of the supported output formats. It does thus not necessarily refer to a DOCX style as applied through Pandoc's C<< custom-style >> attribute. In particular it does not refer to the HTML C<< style >> attribute. It is best practice to avoid that attribute and apply CSS styles through tag, class, id and attribute selectors in a separate style sheet. When talking about CSS the phrase I<< CSS style >> is used.

Similarly the word I<<< C<< custom-style >> >>>, hyphenated but sometimes without code formatting is used when talking about the C<< custom-style >> attribute which tells Pandoc's docx writer to apply a particular named DOCX style to the contents of a span or div. Finally the phrase I<< DOCX style >> is used for the named styles which you can define, modify and apply to text elements in a word processor.

=head2 Divs and spans

In LaTeX mode 'styles' applied to spans become commands and 'styles' applied to divs become environments. This is not configurable. I have experimented with configuring this in the past and my experience wasn't good. If you really want to try to use a command as an environment you can try the L<<< I<< environ >> package|http://texdoc.net/pkg/environ >>>.

Similarly DOCX C<< custom-style >>s become character styles for spans and paragraph styles for divs. This is part of L<<< Pandoc's built-in C<< custom-style >> feature|http://pandoc.org/MANUAL.html#custom-styles-in-docx-output >>>.

Also note what is said about namespaces below!

=head3 Multiple 'styles' per spanE<0x2f>div

If you apply several classes with associated styles to the same span or div they are combined.

In LaTeX mode the commands and environments are nested. The left-to-right order of the classes in the source is preserved, so that C<< [foo]{.bar .baz} >> becomes C<< \bar{\baz{foo}} >> but C<< [foo]{.baz .bar} >> becomes C<< \baz{\bar{foo}} >>. Similarly environments are nested with the one corresponding to the leftmost class becoming outermost and the one corresponding to the rightmost class becoming innermost.

Because DOCX named styles aren't additive things become a little more complicated. Multiple class 'styles' become concatenated with the first letter of each component style capitalized, as seen in the C<< LatinSmallCaps >> example. You will need to define each such combined style in your reference-docx. At least you can let your C<< SmallCaps >> style inherit from the built-in C<< Small Caps >> style and your C<< LatinSmallCaps >> style inherit from your C<< SmallCaps >> style so that changes in the ancestor styles get reflected in the descendant styles.

=head2 One 'style' per class

Note that since there can only be one 'style' per class and output format you need to use a separate class for each LaTeX command or environment or for each DOCX character or paragraph style.

=head3 Namespaces

The one-style-per-class behavior is consistent with how things work in LaTeX where commands and environments share a namespace, and DOCX where character and paragraph styles also share the same namespace. If this bothers you when producing HTML remember that nothing stops you from defining HTML 'styles' with the same attributes, including classes, corresponding to different input classes. You can even use the YAML anchor--reference syntax to reduce typing, file size and errors:

    class2style:
      latex:
        he: texthebrew
        he-block: hebrew
      docx:
        he: Hebrew
        he-block: HebrewPara
      html:
        he: &hebrew
          lang: he
          dir:  rtl
        he-block: *hebrew

Here C<< *hebrew >> is a reference which causes the value of the key C<< html--E<0x3e>he-block >> to be the same as the value of the key C<< html--E<0x3e>he >> which is marked with the anchor C<< &hebrew >>.

I don't know which of Pandoc andE<0x2f>or LibreOffice andE<0x2f>or Word imposes the limitation that DOCX paragraph and character styles can't have the same name, which is a little strange given the separation between those two kinds of styles.

The LaTeX namespace limitation is due to the fact that the LaTeX implementation of an environment C<< foo >> involves defining the commands C<< \foo >> and C<< \endfoo >>. Why it was called C<< \foo >> and not C<< \beginfoo >> is anybody's guess...

=head2 Applying 'styles' to Markdown output.

By default 'styles' are only applied when the output format is one of C<< latex >>, C<< docx >>, C<< html >>, C<< html5 >> or C<< epub >>. You can override this by setting one of the metadata variables C<< class2style_html >> or C<< class2style_docx >> to a true value on the command line.

In fact you can run with any output format and make this filter behave as if the output format had been C<< html >> or C<< docx >>. Just say:

    $ pandoc -F pandoc-class2style.pl -t markdown -M class2style_html ...

    $ pandoc -F pandoc-class2style.pl -t markdown -M class2style_docx ...

There is no similar variable for LaTeX because Markdown markup inside the wrapped spans and divs will be broken if latex-mode output is converted to Markdown.

=head3 Both HTML and DOCX attributes at the same time

When applying 'styles' to markdown output you may wish to assign both HTML attributes and DOCX C<< custom-style >> attributes at the same time. There is an easy workaround for this: just include a "custom-style" attribute in your C<< class2style--E<0x3e>html--E<0x3e>CLASS >> metadata mapping and run with the C<< -M class2style_html >> switch on the command line.

    class2style:
      html:
        sc:
          class: 'small-caps'
          'custom-style': 'Small Caps'

=head2 Keeping the original classes

By default the existing classes of a span or div element which gets new attributes associated with it are deleted. This is so that you don't get any duplicated attributes if you first run the filter when producing Markdown output and then at a later time run the filter on the same document again, e.g. to also apply 'styles' to elements added later. This behavior can be overridden by passing the switch C<< -M class2style_keep >> on the command line.

=head2 The (un)limits of LaTeX code injection

Sometimes you need to pass extra arguments to a LaTeX command or environment. If those arguments come before the main argument (the one containing the span content) you can generally include them in your command string, as in the C<< blue: textcolor{blue} >> example; anything you put as C<< COMMAND >> in your C<< CLASS: COMMAND >> metadata field will be put into the frame C<< \...{ >> and prepended to the span content as a raw latex string. In the rare cases where you need to put arguments after the span content argument you can replace C<< COMMAND >> with a mapping with the two keys C<< before >> and C<< after >>:

    CLASS:
      before: BEFORE
      after: AFTER

In this case C<< BEFORE >> will be put into the same C<< \...{ >> frame before the content and C<< AFTER >> will be put into a C<< }... >> frame after the content, giving you C<< \BEFORE{CONTENT}AFTER >>.

With environments (i.e. divs) you always need a mapping with the two keys C<< name >> and C<< args >> to pass arguments, with the value of C<< name >> being the environment name and the value of C<< args >> being the argument string:

    ---
    class2style:
      latex:
        grc-block:
          name: greek
          args: '[variant=ancient]'
    ...

    <div class="grc-block">

    | Ἄφοβον ὁ θεός,
    | ἀνύποπτον ὁ θάνατος
    | καὶ τἀγαθὸν μὲν εὔκτητον,
    | τὸ δὲ δεινὸν εὐεκκαρτέρητον

    </div>

which thus becomes

    \begin{greek}[variant=ancient]

    Ἄφοβον ὁ θεός,\\
    ἀνύποπτον ὁ θάνατος\\
    καὶ τἀγαθὸν μὲν εὔκτητον,\\
    τὸ δὲ δεινὸν εὐεκκαρτέρητον

    \end{greek}

In all these cases you may need to quote your values so that they don't confuse the YAML parser or Pandoc's Markdown parser which both will have a go at the values before the filter sees them. You may even have to wrap values containing LaTeX code both in outer single quotes for YAML and in inner backticks for Pandoc to ensure that they come intact to the filter:

    class2style:
      latex:
        foo: '`framebox[1.1\width]`'

In fact you can write e.g. C<< '`\uline{`' >>. No extra backslash or opening brace will be added if you do, but then the twofold quoting is absolutely necessary.

=head3 No per-element arguments

Note that you will have to declare a separate class for each combination of command or environment and extra arguments. I have experimented with specifying custom arguments as attributes to a span or div in the past and in general it leads to cluttered Markdown source and complicated filter code with concomitant risk for errors. Even though the one class--one combination of command and arguments approach might mean more declarations in your metadata it keeps the body of your document cleaner. If the volume of the metadata declarations bothers you remember that you can put metadata blocks anywhere, and that they are less in the way at the end of the file.

=head2 Code

This filter also works on inline code and code blocks.

=head2 The "list of strings and mappings"

As you may have noticed the value of the C<< docx >> key in our initial example is a list of strings and mappings. This can be done with any output format. String list items will be expanded into a single-element mapping C<< STRING: STRING >>, and then the list of mappings will be flattened into a single mapping, with later elements overriding earlier elements with the same key.

=head2 The "dotted class" shortcut

Finally you can in some cases forgo the metadata declaration and instead append a period at the end of a class name. This will result in a command, environment, HTML class or DOCX style where the name is equal to the class name without the trailing period.

    [Framed]{.fbox.}

    \fbox{Framed}

    <p><span class="fbox">Framed</span></p>

    [Framed]{custom-style="Fbox"}

=head1 PREREQUISITES

In addition to L<< Pandoc|http://pandoc.org/ >> this filter requires the following perl modules:

=over

=item *

Carp

=item *

Pandoc::Elements 0.33

=item *

Pandoc::Walker 0.27

=item *

autodie 2.29

=item *

perl 5.010001

=item *

strict

=item *

warnings

=back

=head2 New to Perl?

This filter requires perl (minimum version as given above) and the Perl modules listed above to function. If you haven't used Perl before information on how to getE<0x2f>install perl andE<0x2f>or Perl modules can be found at the URLs below, which lead to the official information on these topics.

Don't worry! If your operating system is Linux or Mac you probably already have a new enough version of perl installed. If you don't or if your operating system is Windows it is easy to install a recent version, and once you have perl installed installing modules is very easy. Just follow the instructions linked to below.

Getting perl
L<< https:E<0x2f>E<0x2f>www.perl.orgE<0x2f>get.html|https://www.perl.org/get.html >>

(For Windows I recommend Strawberry Perl as module installation is easier there.)

Installing Perl modules
L<< http:E<0x2f>E<0x2f>www.cpan.orgE<0x2f>modulesE<0x2f>INSTALL.html|http://www.cpan.org/modules/INSTALL.html >>

=head1 AUTHOR

Benct Philip Jonsson (bpjonsson@gmail.com, L<< https:E<0x2f>E<0x2f>github.comE<0x2f>bpj|https://github.com/bpj >>)

=head1 COPYRIGHT

Copyright 2017- Benct Philip Jonsson

=head1 LICENSE

This is free software; you can redistribute it andE<0x2f>or modify it under the same terms as the Perl 5 programming language system itself. See L<< http:E<0x2f>E<0x2f>dev.perl.orgE<0x2f>licensesE<0x2f>|http://dev.perl.org/licenses/ >>.

=cut

## pandoc-class2style.pod

      
    Raw
  

              pandoc-class2style.pod
            
          
    NAME


pandoc-class2style.pl - filter to translate single pandoc classes into attribute lists or LaTeX commands

VERSION


1.000

SYNOPSIS


pandoc -F pandoc-class2style.pl ...

DESCRIPTION


pandoc-class2style.pl is a Pandoc filter which lets you use spans (or divs) with a single class in your source document and have the necessary LaTeX markup, DOCX custom styles, or HTML attributes of your choice injected during conversion. You still have to wrap the 'special' text in a span or div but since you only need to mark each span with a class with as few letters as you want the source becomes much less cluttered. It also becomes much easier to produce multiple formats from the same Markdown source.

You declare a mapping from short classes to LaTeX commands or environments, DOCX custom styles or HTML attributes in your YAML metadata as follows:

---
class2style:
  latex:
    u:    uline
    uu:   uuline
    grc:  textgreek[variant=ancient]
    he:   texthebrew
    la:   textlatin
    sc:   textsc 
    blue: textcolor{blue}
  docx:
    - u:   Underlined
      uu:  DoubleUnderlined
      grc: Greek
      he:  Hebrew
      la:  Latin
      sc:  SmallCaps
    - blue
  html:
    u:
      class: uline
    uu:
      class: uuline
    grc:
      lang: grc
    he:
      lang: he
      dir: rtl
    la:
      lang: la
    sc:
      class: small-caps
lang: en
otherlangs:
- grc
- he
- la
mainfont: FreeSerif # or any other font you prefer
xcolor: hyperref, svgnames
...

[Underlined]{.u} [Double underlined]{.uu}

[Ἑλληνιστής]{.grc}

[עִבְרִית‎]{.he}

[Lingua Romanica]{.la .sc}

[I'm *blue*!]{.blue}

Running pandoc with this filter gives the following outputs for the above:

pandoc -F pandoc-class2style.pl c2stest.md -t latex:

\uline{Underlined} \uuline{Double underlined}

\textgreek[variant=ancient]{Ἑλληνιστής}

\texthebrew{עִבְרִית‎}

\textlatin{\textsc{Lingua Romanica}}

\textcolor{blue}{I'm \emph{blue}!}

pandoc -F pandoc-class2style.pl c2stest.md -t html5:

<p><span class="uline">Underlined</span>
<span class="uuline">Double underlined</span></p>
<p><span lang="grc">Ἑλληνιστής</span></p>
<p><span lang="he" dir="rtl">עִבְרִית‎</span></p>
<p><span class="small-caps" lang="la">Lingua Romanica</span></p>
<p><span class="blue">I'm <em>blue</em>!</span></p>

Finally I can't show the DOCX output here, but it is as if the Markdown had been like this:

[Underlined]{custom-style="Underlined"}
[Double underlined]{custom-style="DoubleUnderlined"}

[Ἑλληνιστής]{custom-style="Greek"}

[עִבְרִית‎]{custom-style="Hebrew"}

[Lingua Romanica]{custom-style="LatinSmallCaps"}

[I'm *blue*!]{custom-style="Blue"}

Note on the terms 'style', CSS style custom-style and DOCX style


I originally had three different filters for each of LaTeX, HTML and DOCX with essentially the same interface. When I combined them to make maintenance and configuration easier it was a bit of a problem what to call the combined filter. In the end I decided to use style as the most general term, qualified as follows:

The word 'style' in scare quotes means any of the AST modifications performed by this filter in order to affect how elements with certain classes are rendered in any of the supported output formats. It does thus not necessarily refer to a DOCX style as applied through Pandoc's custom-style attribute. In particular it does not refer to the HTML style attribute. It is best practice to avoid that attribute and apply CSS styles through tag, class, id and attribute selectors in a separate style sheet. When talking about CSS the phrase CSS style is used.

Similarly the word custom-style, hyphenated but sometimes without code formatting is used when talking about the custom-style attribute which tells Pandoc's docx writer to apply a particular named DOCX style to the contents of a span or div. Finally the phrase DOCX style is used for the named styles which you can define, modify and apply to text elements in a word processor.

Divs and spans


In LaTeX mode 'styles' applied to spans become commands and 'styles' applied to divs become environments. This is not configurable. I have experimented with configuring this in the past and my experience wasn't good. If you really want to try to use a command as an environment you can try the environ package.

Similarly DOCX custom-styles become character styles for spans and paragraph styles for divs. This is part of Pandoc's built-in custom-style feature.

Also note what is said about namespaces below!

Multiple 'styles' per span/div


If you apply several classes with associated styles to the same span or div they are combined.

In LaTeX mode the commands and environments are nested. The left-to-right order of the classes in the source is preserved, so that [foo]{.bar .baz} becomes \bar{\baz{foo}} but [foo]{.baz .bar} becomes \baz{\bar{foo}}. Similarly environments are nested with the one corresponding to the leftmost class becoming outermost and the one corresponding to the rightmost class becoming innermost.

Because DOCX named styles aren't additive things become a little more complicated. Multiple class 'styles' become concatenated with the first letter of each component style capitalized, as seen in the LatinSmallCaps example. You will need to define each such combined style in your reference-docx. At least you can let your SmallCaps style inherit from the built-in Small Caps style and your LatinSmallCaps style inherit from your SmallCaps style so that changes in the ancestor styles get reflected in the descendant styles.

One 'style' per class


Note that since there can only be one 'style' per class and output format you need to use a separate class for each LaTeX command or environment or for each DOCX character or paragraph style.

Namespaces


The one-style-per-class behavior is consistent with how things work in LaTeX where commands and environments share a namespace, and DOCX where character and paragraph styles also share the same namespace. If this bothers you when producing HTML remember that nothing stops you from defining HTML 'styles' with the same attributes, including classes, corresponding to different input classes. You can even use the YAML anchor--reference syntax to reduce typing, file size and errors:

class2style:
  latex:
    he: texthebrew
    he-block: hebrew
  docx:
    he: Hebrew
    he-block: HebrewPara
  html:
    he: &hebrew
      lang: he
      dir:  rtl
    he-block: *hebrew

Here *hebrew is a reference which causes the value of the key html-->he-block to be the same as the value of the key html-->he which is marked with the anchor &hebrew.

I don't know which of Pandoc and/or LibreOffice and/or Word imposes the limitation that DOCX paragraph and character styles can't have the same name, which is a little strange given the separation between those two kinds of styles.

The LaTeX namespace limitation is due to the fact that the LaTeX implementation of an environment foo involves defining the commands \foo and \endfoo. Why it was called \foo and not \beginfoo is anybody's guess...

Applying 'styles' to Markdown output.


By default 'styles' are only applied when the output format is one of latex, docx, html, html5 or epub. You can override this by setting one of the metadata variables class2style_html or class2style_docx to a true value on the command line.

In fact you can run with any output format and make this filter behave as if the output format had been html or docx. Just say:

$ pandoc -F pandoc-class2style.pl -t markdown -M class2style_html ...

$ pandoc -F pandoc-class2style.pl -t markdown -M class2style_docx ...

There is no similar variable for LaTeX because Markdown markup inside the wrapped spans and divs will be broken if latex-mode output is converted to Markdown.

Both HTML and DOCX attributes at the same time


When applying 'styles' to markdown output you may wish to assign both HTML attributes and DOCX custom-style attributes at the same time. There is an easy workaround for this: just include a "custom-style" attribute in your class2style-->html-->CLASS metadata mapping and run with the -M class2style_html switch on the command line.

class2style:
  html:
    sc:
      class: 'small-caps'
      'custom-style': 'Small Caps'

Keeping the original classes


By default the existing classes of a span or div element which gets new attributes associated with it are deleted. This is so that you don't get any duplicated attributes if you first run the filter when producing Markdown output and then at a later time run the filter on the same document again, e.g. to also apply 'styles' to elements added later. This behavior can be overridden by passing the switch -M class2style_keep on the command line.

The (un)limits of LaTeX code injection


Sometimes you need to pass extra arguments to a LaTeX command or environment. If those arguments come before the main argument (the one containing the span content) you can generally include them in your command string as in the Blue: textcolor{blue} example; anything you put as COMMAND in your CLASS: COMMAND metadata field will be put into the frame \...{ and prepended to the span content as a raw latex string. In the rare cases where you need to put arguments after the span content argument you can replace COMMAND with a mapping with the two keys before and after:

CLASS:
  before: BEFORE
  after: AFTER

In this case BEFORE will be put into the same \...{ frame before the content and AFTER will be put into a }... frame after the content, giving you \BEFORE{CONTENT}AFTER.

With environments (i.e. divs) you always need a mapping with the two keys name and args to pass arguments, with the value of name being the environment name and the value of args being the argument string:

---
class2style:
  latex:
    grc-block:
      name: greek
      args: '[variant=ancient]'
...

<div class="grc-block">

| Ἄφοβον ὁ θεός,
| ἀνύποπτον ὁ θάνατος
| καὶ τἀγαθὸν μὲν εὔκτητον,
| τὸ δὲ δεινὸν εὐεκκαρτέρητον

</div>

which thus becomes

\begin{greek}[variant=ancient]

Ἄφοβον ὁ θεός,\\
ἀνύποπτον ὁ θάνατος\\
καὶ τἀγαθὸν μὲν εὔκτητον,\\
τὸ δὲ δεινὸν εὐεκκαρτέρητον

\end{greek}

In all these cases you may need to quote your values so that they don't confuse the YAML parser or Pandoc's Markdown parser which both will have a go at the values before the filter sees them. You may even have to wrap values containing LaTeX code both in outer single quotes for YAML and in inner backticks for Pandoc to ensure that they come intact to the filter:

class2style:
  latex:
    foo: '`framebox[1.1\width]`'

In fact you can write e.g. '`\uline{`'. No extra backslash or opening brace will be added if you do, but then the twofold quoting is absolutely necessary.

No per-element arguments


Note that you will have to declare a separate class for each combination of command or environment and extra arguments. I have experimented with specifying custom arguments as attributes to a span or div in the past, and in general it leads to cluttered Markdown source and complicated filter code with concomitant risk of errors. Even though the one class--one combination of command and arguments approach might mean more declarations in your metadata, it keeps the body of your document cleaner. If the volume of the metadata declarations bothers you, remember that you can put metadata blocks anywhere, and that they are less in the way at the end of the file.

Code


This filter also works on inline code and code blocks.

The "list of strings and mappings"


As you may have noticed the value of the docx key in our initial example is a list of strings and mappings. This can be done with any output format. String list items will be expanded into a single-element mapping STRING: STRING, and then the list of mappings will be flattened into a single mapping, with later elements overriding earlier elements with the same key.

The "dotted class" shortcut


Finally, you can in some cases forgo the metadata declaration and instead append a period at the end of a class name. This will result in a command, environment, HTML class or DOCX style where the name is equal to the class name without the trailing period.

[Framed]{.fbox.}

\fbox{Framed}

<p><span class="fbox">Framed</span></p>

[Framed]{custom-style="Fbox"}

PREREQUISITES


In addition to Pandoc this filter requires the following perl modules:


Carp


Pandoc::Elements 0.33


Pandoc::Walker 0.27


autodie 2.29


perl 5.010001


strict


warnings


New to Perl?


This filter requires perl (minimum version as given above) and the Perl modules listed above to function. If you haven't used Perl before, information on how to get/install perl and/or Perl modules can be found at the URLs below, which lead to the official information on these topics.

Don't worry! If your operating system is Linux or Mac you probably already have a new enough version of perl installed. If you don't or if your operating system is Windows it is easy to install a recent version, and once you have perl installed installing modules is very easy. Just follow the instructions linked to below.

Getting perl https://www.perl.org/get.html

(For Windows I recommend Strawberry Perl as module installation is easier there.)

Installing Perl modules http://www.cpan.org/modules/INSTALL.html

AUTHOR


Benct Philip Jonsson (bpjonsson@gmail.com, https://github.com/bpj)

COPYRIGHT


Copyright 2017- Benct Philip Jonsson

LICENSE


This is free software; you can redistribute it and/or modify it under the same terms as the Perl 5 programming language system itself. See http://dev.perl.org/licenses/.