## kbauer/html2bbcode.py

## html2bbcode.py
#!/usr/bin/env python3

#### USAGE
##
##     python3 <scriptname> INPUTFILE [ EXTRARULE ... ]
##
## Output is written to stdout.
##
## INPUTFILE must be an HTML file. XML should work too, but will likely
## require redefining all replacement rules.
##
## EXTRARULE are additional replacement rules in the format of the
## variable TAGRULES, which can also redefine predefined rules. Each
## rule is a separate shell argument and must be passed as a JSON
## string, e.g.
##
##     python3 <scriptname> myfile.html \
##         '[["h2","h3"],["[SIZE=4][B]","[/B][/SIZE]"]]' \
##         > myfile.bbcode
##
## Uses the form [LIST][*]...[*]...[/LIST] for lists. Some recent
## implementations of BBCODE sadly use the more verbose
## [LIST][LI]...[/LI][LI]...[/LI][/LIST] syntax; For these you need to
## specify a corresponding EXTRARULE.
##
## When tags are encountered for which no rule is defined, the tags
## themselves are omitted (their content is still converted) and they
## are reported on stderr *after* the bbcode output. The ordering of
## "stdout before stderr" is enforced by flushing the streams.
##
## Useful in combination with a markdown converter, e.g.
##
##     pandoc --from=markdown --to=html --output=myfile.html myfile.md
##     python3 <scriptname> myfile.html > myfile.bbcode


#### TAGRULES.
##
## Format:
##
## TAGRULE = [['tag1','tag2', ...], RULE]
## RULE, either:
##    'ignore'
##    [BEFORE_TEXT,AFTER_TEXT]
##
## In the strings inside RULE, attributes of the html tag are
## accessible as {NAME}, see e.g. the tag rule for 'a'.
# Built-in replacement rules. Each tag should appear only once: main()
# flattens this list into a dict in order, so a later entry for the same
# tag silently replaces an earlier one.
TAGRULES = [
    [['h1'], ['\n\n[B][size=5]', '[/size][/B]\n']],
    [['p', 'div'], ['', '\n']],
    [['head'], 'ignore'],
    [['html', 'body'], ['', '']],
    [['strong', 'b'], ['[B]', '[/B]']],
    [['em', 'i'], ['[I]', '[/I]']],
    [['pre'], ['[CODE]\n', '\n[/CODE]']],
    [['code'], ['[font=Courier New]', '[/font]']],
    [['a'], ['[URL={href}]', '[/URL]']],  # {href} is filled from the tag's attributes
    [['u'], ['[U]', '[/U]']],
    [['ul'], ['[LIST]', '[/LIST]']],
    [['li'], ['[*]', '']],
]

#### CODE
from lxml import html
from lxml import etree
from collections import namedtuple
import sys
import json


def main(inpath, tagrules_override):
    """Convert the HTML document at *inpath* to BBCode and print it.

    The document is parsed with ``lxml.html`` and walked depth-first. For
    every element the BEFORE text of its rule is emitted, then the
    element's text and children, then the AFTER text. The stripped result
    is written to stdout followed by a newline; afterwards every tag that
    had no rule is reported once on stderr.

    Args:
        inpath: Input location, passed unchanged to ``lxml.html.parse``.
        tagrules_override: Iterable of additional TAGRULE entries
            (``[[tag, ...], RULE]``). They are applied after TAGRULES and
            therefore replace predefined rules for the same tag.

    Raises:
        Exception: ``('Invalid rule', rule)`` when a rule is neither the
            string ``'ignore'`` nor a two-element list/tuple.
    """

    class _Attributes(dict):
        """Attribute mapping that renders attributes the tag lacks as ''."""

        def __missing__(self, key):
            return ''

    tree = html.parse(inpath)
    root = tree.getroot()

    unknowntags = []
    out = []
    ## Normalize tagrules: flatten [[tags...], rule] entries into a
    ## tag -> rule mapping; later entries win.
    tagrules = {}
    for taglist, rule in TAGRULES + list(tagrules_override):
        for tag in taglist:
            tagrules[tag] = rule

    def _recur(node):
        tag = node.tag
        if tag not in tagrules:
            if tag == etree.Comment:
                tagrules[tag] = 'ignore'
            else:
                # Unknown tag: drop the markup but keep the content, and
                # remember the tag so it is reported exactly once.
                tagrules[tag] = ('', '')
                unknowntags.append(tag)

        rule = tagrules[tag]
        if rule == 'ignore':
            return
        # Validate before unpacking so that malformed rules (numbers,
        # arbitrary strings, wrong length) all fail the same way.
        if not isinstance(rule, (list, tuple)) or len(rule) != 2:
            raise Exception('Invalid rule', rule)

        before, after = rule
        # A rule may reference an attribute this particular tag does not
        # carry (e.g. {href} on <a name="x">); substitute '' instead of
        # aborting the whole conversion with a KeyError.
        attributes = _Attributes(node.attrib)
        out.append(before.format_map(attributes))
        if node.text:
            out.append(node.text)
        for child in node:
            _recur(child)
            if child.tail:  # The text after each tag e.g. '<a><br/>AFTER</a>'
                out.append(child.tail)
        out.append(after.format_map(attributes))

    # NOTE(review): presumably lxml can hand back no root element for an
    # empty document; in that case just emit empty output. Confirm.
    if root is not None:
        _recur(root)
    outstring = ''.join(out).strip()

    # Flush stdout before writing to stderr so the report reliably
    # follows the bbcode output.
    sys.stdout.write(outstring)
    sys.stdout.write('\n')
    sys.stdout.flush()
    for utag in unknowntags:
        sys.stderr.write('Unknown tag ' + repr(utag) + '\n')
    sys.stderr.flush()


def tagrules_from_shell(args):
    '''
    Lazily decode tagrules given as shell arguments.

    Yields one decoded value per element of args, in order. Each
    element is expected to be JSON text describing a tagrule list;
    decoding happens during iteration, so malformed JSON only raises
    once that element is reached.
    '''
    yield from map(json.loads, args)


if __name__ == '__main__':
    # Fail with a usage hint instead of an IndexError traceback when
    # INPUTFILE is missing; sys.exit() with a string prints it to stderr
    # and exits with status 1.
    if len(sys.argv) < 2:
        sys.exit('Usage: python3 ' + sys.argv[0] + ' INPUTFILE [ EXTRARULE ... ]')
    # Remaining arguments are JSON-encoded extra tagrules.
    main(sys.argv[1], tagrules_from_shell(sys.argv[2:]))
	#!/usr/bin/env python3

	#### USAGE
	##
	## python3 <scriptname> INPUTFILE [ EXTRARULE ... ]
	##
	## Output is written to stdout.
	##
	## INPUTFILE must be an HTML file. XML should work too, but will likely
	## require redefining all replacement rules.
	##
	## EXTRARULE are additional replacement rules in the format of the
	## variable TAGRULES, which can also redefine predefined rules. Each
	## rule is a separate shell argument and must be passed as a JSON
	## string, e.g.
	##
	## python3 <scriptname> myfile.html \
	## '[["h2","h3"],["[SIZE=4][B]","[/B][/SIZE]"]]' \
	## > myfile.bbcode
	##
	## Uses the form [LIST][*]...[*]...[/LIST] for lists. Some recent
	## implementations of BBCODE sadly use the more verbose
	## [LIST][LI]...[/LI][LI]...[/LI][/LIST] syntax; For these you need to
	## specify a corresponding EXTRARULE.
	##
	## When tags are encountered for which no rule is defined, the tags
	## themselves are omitted (their content is still converted) and they
	## are reported on stderr after the bbcode output. The ordering of
	## "stdout before stderr" is enforced by flushing the streams.
	##
	## Useful in combination with a markdown converter, e.g.
	##
	## pandoc --from=markdown --to=html --output=myfile.html myfile.md
	## python3 <scriptname> myfile.html > myfile.bbcode


	#### TAGRULES.
	##
	## Format:
	##
	## TAGRULE = [['tag1','tag2', ...], RULE]
	## RULE, either:
	## 'ignore'
	## [BEFORE_TEXT,AFTER_TEXT]
	##
	## In the strings inside RULE, attributes of the html tag are
	## accessible as {NAME}, see e.g. the tag rule for 'a'.
	# Built-in replacement rules. Each tag should appear only once: main()
	# flattens this list into a dict in order, so a later entry for the same
	# tag silently replaces an earlier one.
	TAGRULES = [
	    [['h1'], ['\n\n[B][size=5]', '[/size][/B]\n']],
	    [['p', 'div'], ['', '\n']],
	    [['head'], 'ignore'],
	    [['html', 'body'], ['', '']],
	    [['strong', 'b'], ['[B]', '[/B]']],
	    [['em', 'i'], ['[I]', '[/I]']],
	    [['pre'], ['[CODE]\n', '\n[/CODE]']],
	    [['code'], ['[font=Courier New]', '[/font]']],
	    [['a'], ['[URL={href}]', '[/URL]']],  # {href} is filled from the tag's attributes
	    [['u'], ['[U]', '[/U]']],
	    [['ul'], ['[LIST]', '[/LIST]']],
	    [['li'], ['[*]', '']],
	]

	#### CODE
	from lxml import html
	from lxml import etree
	from collections import namedtuple
	import sys
	import json


	def main(inpath, tagrules_override):
	    """Convert the HTML document at *inpath* to BBCode and print it.

	    The document is parsed with ``lxml.html`` and walked depth-first. For
	    every element the BEFORE text of its rule is emitted, then the
	    element's text and children, then the AFTER text. The stripped result
	    is written to stdout followed by a newline; afterwards every tag that
	    had no rule is reported once on stderr.

	    Args:
	        inpath: Input location, passed unchanged to ``lxml.html.parse``.
	        tagrules_override: Iterable of additional TAGRULE entries
	            (``[[tag, ...], RULE]``). They are applied after TAGRULES and
	            therefore replace predefined rules for the same tag.

	    Raises:
	        Exception: ``('Invalid rule', rule)`` when a rule is neither the
	            string ``'ignore'`` nor a two-element list/tuple.
	    """

	    class _Attributes(dict):
	        """Attribute mapping that renders attributes the tag lacks as ''."""

	        def __missing__(self, key):
	            return ''

	    tree = html.parse(inpath)
	    root = tree.getroot()

	    unknowntags = []
	    out = []
	    ## Normalize tagrules: flatten [[tags...], rule] entries into a
	    ## tag -> rule mapping; later entries win.
	    tagrules = {}
	    for taglist, rule in TAGRULES + list(tagrules_override):
	        for tag in taglist:
	            tagrules[tag] = rule

	    def _recur(node):
	        tag = node.tag
	        if tag not in tagrules:
	            if tag == etree.Comment:
	                tagrules[tag] = 'ignore'
	            else:
	                # Unknown tag: drop the markup but keep the content, and
	                # remember the tag so it is reported exactly once.
	                tagrules[tag] = ('', '')
	                unknowntags.append(tag)

	        rule = tagrules[tag]
	        if rule == 'ignore':
	            return
	        # Validate before unpacking so that malformed rules (numbers,
	        # arbitrary strings, wrong length) all fail the same way.
	        if not isinstance(rule, (list, tuple)) or len(rule) != 2:
	            raise Exception('Invalid rule', rule)

	        before, after = rule
	        # A rule may reference an attribute this particular tag does not
	        # carry (e.g. {href} on <a name="x">); substitute '' instead of
	        # aborting the whole conversion with a KeyError.
	        attributes = _Attributes(node.attrib)
	        out.append(before.format_map(attributes))
	        if node.text:
	            out.append(node.text)
	        for child in node:
	            _recur(child)
	            if child.tail:  # The text after each tag e.g. '<a><br/>AFTER</a>'
	                out.append(child.tail)
	        out.append(after.format_map(attributes))

	    # NOTE(review): presumably lxml can hand back no root element for an
	    # empty document; in that case just emit empty output. Confirm.
	    if root is not None:
	        _recur(root)
	    outstring = ''.join(out).strip()

	    # Flush stdout before writing to stderr so the report reliably
	    # follows the bbcode output.
	    sys.stdout.write(outstring)
	    sys.stdout.write('\n')
	    sys.stdout.flush()
	    for utag in unknowntags:
	        sys.stderr.write('Unknown tag ' + repr(utag) + '\n')
	    sys.stderr.flush()


	def tagrules_from_shell(args):
	    '''
	    Reads tagrules from a shell-argument list.

	    Yields one decoded value per element of args, in order. Each
	    tagrule must parse as a JSON list; decoding happens lazily while
	    the generator is consumed.
	    '''
	    for arg in args:
	        yield json.loads(arg)



	if __name__ == '__main__':
	    # Fail with a usage hint instead of an IndexError traceback when
	    # INPUTFILE is missing; sys.exit() with a string prints it to stderr
	    # and exits with status 1.
	    if len(sys.argv) < 2:
	        sys.exit('Usage: python3 ' + sys.argv[0] + ' INPUTFILE [ EXTRARULE ... ]')
	    # Remaining arguments are JSON-encoded extra tagrules.
	    main(sys.argv[1], tagrules_from_shell(sys.argv[2:]))