Skip to content

Instantly share code, notes, and snippets.

@kbauer
Created November 20, 2017 13:38
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save kbauer/946288e1f171c2b149abf6cee99d9334 to your computer and use it in GitHub Desktop.
Save kbauer/946288e1f171c2b149abf6cee99d9334 to your computer and use it in GitHub Desktop.
A simple python3 script that takes html input and converts it to (an approximation in) BBCODE. Usage described in header comment. Also useful with markdown, preprocessed to html by pandoc. Allows customizing replacement rules for html-tags, which should also allow converting xml documents to some degree.
#!/usr/bin/env python3
#### USAGE
##
## python3 <scriptname> INPUTFILE [ EXTRARULE ... ]
##
## Output is written to stdout.
##
## INPUTFILE must be an HTML file. XML should work too, but will likely
## require redefining all replacement rules.
##
## EXTRARULE are additional replacement rules in the format of the
## variable TAGRULES, which can also redefine predefined rules. Each
## rule is a separate shell argument and must be passed as a JSON
## string, e.g.
##
## python3 <scriptname> myfile.html \
## '[["h2","h3"],["[SIZE=4][B]","[/B][/SIZE]"]]' \
## > myfile.bbcode
##
## Uses the form [LIST][*]...[*]...[/LIST] for lists. Some recent
## implementations of BBCODE sadly use the more verbose
## [LIST][LI]...[/LI][LI]...[/LI][/LIST] syntax; For these you need to
## specify a corresponding EXTRARULE.
##
## When tags are encountered for which no rule is defined, they will
## be omitted and reported on stderr *after* the bbcode output; The
## ordering of "stdout before stderr" is enforced by flushing the
## streams.
##
## Useful in combination with a markdown converter, e.g.
##
## pandoc --from=markdown --to=html myfile.md -o myfile.html
## python3 <scriptname> myfile.html > myfile.bbcode
#### TAGRULES.
##
## Format:
##
## TAGRULE = [['tag1','tag2', ...], RULE]
## RULE, either:
## 'ignore'
## [BEFORE_TEXT,AFTER_TEXT]
##
## In the strings inside RULE, attributes of the html tag are
## accessible as {NAME}, see e.g. the tag rule for 'a'.
## Default replacement rules, one [TAGS, RULE] entry per line.
## When a tag is listed more than once, the entry that comes last wins,
## so every tag is listed exactly once here.
TAGRULES = [
    [['h1'], ['\n\n[B][size=5]', '[/size][/B]\n']],
    [['p', 'div'], ['', '\n']],
    [['head'], 'ignore'],
    [['html', 'body'], ['', '']],
    [['strong', 'b'], ['[B]', '[/B]']],
    [['em', 'i'], ['[I]', '[/I]']],
    [['pre'], ['[CODE]\n', '\n[/CODE]']],
    [['code'], ['[font=Courier New]', '[/font]']],
    ## {href} is filled in from the attributes of the <a> tag.
    [['a'], ['[URL={href}]', '[/URL]']],
    [['u'], ['[U]', '[/U]']],
    [['ul'], ['[LIST]', '[/LIST]']],
    ## List items have no closing marker in the [*] list syntax.
    [['li'], ['[*]', '']],
]
#### CODE
from lxml import html
from lxml import etree
from collections import namedtuple
import sys
import json
def main(inpath, tagrules_override):
    '''
    Convert the HTML file at inpath to BBCODE and write it to stdout.

    inpath is handed to lxml's html.parse. tagrules_override is an
    iterable of [TAGS, RULE] entries in the same format as TAGRULES;
    its entries are applied after TAGRULES and therefore take precedence.

    The converted text is stripped of surrounding whitespace, written
    to stdout followed by a newline, and stdout is flushed. Afterwards
    every tag that had no rule is reported once on stderr.

    Raises Exception('Invalid rule', rule) for a rule that is neither
    a two-element sequence nor the string 'ignore'.
    '''
    from collections import defaultdict

    tree = html.parse(inpath)
    root = tree.getroot()
    unknowntags = []
    out = []

    ## Normalize tagrules: flatten [TAGS, RULE] entries into tag -> rule.
    ## Later entries overwrite earlier ones, so overrides win.
    tagrules = {}
    for taglist, rule in TAGRULES + list(tagrules_override):
        for tag in taglist:
            tagrules[tag] = rule

    def _recur(node):
        if node.tag not in tagrules:
            if node.tag == etree.Comment:
                tagrules[node.tag] = 'ignore'
            else:
                ## Unknown tags are dropped but their content is kept.
                ## Registering the rule here means each unknown tag is
                ## reported only once.
                tagrules[node.tag] = ('', '')
                unknowntags.append(node.tag)
        rule = tagrules[node.tag]
        if len(rule) == 2:
            ## A tag may lack an attribute the rule refers to (e.g. an
            ## <a> without href); substitute an empty string instead
            ## of failing with KeyError.
            attrs = defaultdict(str, node.attrib)
            out.append(rule[0].format_map(attrs))
            if node.text:
                out.append(node.text)
            for child in node:
                _recur(child)
                if child.tail:  # The text after each tag e.g. '<a><br/>AFTER</a>'
                    out.append(child.tail)
            out.append(rule[1].format_map(attrs))
        elif rule == 'ignore':
            pass
        else:
            raise Exception('Invalid rule', rule)

    ## NOTE(review): getroot() presumably returns None for a document
    ## without any element (e.g. an empty file) -- confirm. In that case
    ## only the trailing newline is written.
    if root is not None:
        _recur(root)

    outstring = ''.join(out).strip()
    sys.stdout.write(outstring)
    sys.stdout.write('\n')
    ## Flush so that the bbcode output precedes the stderr report.
    sys.stdout.flush()
    for utag in unknowntags:
        sys.stderr.write('Unknown tag ' + repr(utag) + '\n')
    sys.stderr.flush()
def tagrules_from_shell(args):
    '''
    Lazily decode tag rules passed as shell arguments.

    Every element of args is decoded with json.loads, in order, and
    the decoded value is yielded. Nothing is decoded until the
    generator is consumed.
    '''
    yield from map(json.loads, args)
if __name__ == '__main__':
    ## Without an INPUTFILE argument sys.argv[1] would fail with a bare
    ## IndexError; print a usage line on stderr and exit non-zero instead.
    if len(sys.argv) < 2:
        sys.exit('usage: python3 ' + sys.argv[0] + ' INPUTFILE [ EXTRARULE ... ]')
    main(sys.argv[1], tagrules_from_shell(sys.argv[2:]))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment