Skip to content

Instantly share code, notes, and snippets.

@AKST
Last active August 29, 2015 13:56
Show Gist options
  • Save AKST/8844071 to your computer and use it in GitHub Desktop.
Save AKST/8844071 to your computer and use it in GitHub Desktop.
Transforms wikitext into basic HTML (mainly p & h2 tags) and removes citations & metadata. Source available under the Apache 2.0 license.
# Copyright 2014 Angus Thomsen
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Line prefixes that cause a whole line to be discarded as metadata rather
# than rendered.  NOTE: because '[[' is listed, a line that *begins* with a
# wiki link is dropped as well.
metadata_indicators = ['|', '{{', '}}', '[[', '*']


def is_meta_data(line):
    """Return True when *line* starts with one of ``metadata_indicators``.

    The previous ``bool(filter(...))`` form only worked on Python 2, where
    ``filter`` returns a list.  On Python 3 ``filter`` returns an iterator
    object, which is always truthy, so every line was reported as metadata.
    """
    # str.startswith accepts a tuple of prefixes on both Python 2 and 3.
    # The tuple is built per call so later edits to the list are honoured.
    return line.startswith(tuple(metadata_indicators))
def is_content(line):
    """Return True for a non-empty line that is not metadata.

    Empty (falsy) lines are rejected first; anything else is content unless
    ``is_meta_data`` claims it.
    """
    if not line:
        return False
    return not is_meta_data(line)
def is_header(line):
    """Return True when *line* both begins and ends with ``==``."""
    if not line.startswith('=='):
        return False
    return line.endswith('==')
def strip_citations(raw):
    """Remove ``{{...}}`` templates and ``[[``/``]]`` link markers from *raw*.

    * Everything between ``{{`` and its matching ``}}`` is dropped, including
      nested templates.  An unterminated ``{{`` drops the rest of the string.
    * ``[[`` and ``]]`` are removed but the text between them is kept.
    * Single brackets/braces and an unmatched ``}}`` are left untouched.

    Args:
        raw: one line of wikitext.

    Returns:
        The line with templates and double-bracket markers removed.
    """
    kept = []      # output characters; joined once at the end
    depth = 0      # how many '{{' are currently open
    pos = 0
    end = len(raw)
    while pos < end:
        # Only *adjacent* characters can form a marker, so look at a
        # two-character window instead of remembering earlier brackets.
        pair = raw[pos:pos + 2]
        if pair == '{{':
            depth += 1
            pos += 2
        elif depth and pair == '}}':
            depth -= 1
            pos += 2
        elif depth:
            # Inside a template: discard, and never touch text already kept.
            pos += 1
        elif pair == '[[' or pair == ']]':
            pos += 2
        else:
            kept.append(raw[pos])
            pos += 1
    return ''.join(kept)
def parse(line):
    """Convert one wikitext line into a single HTML element.

    Header lines (as decided by ``is_header``) become ``<hN>`` where N is the
    number of ``=`` signs wrapping the title, capped at 6, so ``== T ==``
    yields ``<h2>`` and ``=== T ===`` yields ``<h3>``.  Previously exactly two
    ``=`` were always removed, leaving stray ``=`` inside an ``<h2>`` for
    deeper headings.  The title text is emitted as-is.

    Any other line becomes a ``<p>`` whose text has been passed through
    ``strip_citations``.
    """
    if is_header(line):
        leading = len(line) - len(line.lstrip('='))
        trailing = len(line) - len(line.rstrip('='))
        # is_header guarantees at least two '=' on each side; use the
        # smaller run so unbalanced markup never eats title characters.
        level = min(leading, trailing, 6)
        return '<h{0}>{1}</h{0}>'.format(level, line[level:-level])
    return '<p>{0}</p>'.format(strip_citations(line))
def wikiToHtml(wiki_text):
    """Render *wiki_text* as newline-separated HTML elements.

    The text is split into lines, lines rejected by ``is_content`` are
    dropped, and each remaining line is converted with ``parse``.

    ``splitlines()`` is used rather than ``split('\\n')`` so that CRLF input
    does not leave a trailing carriage return on every line (which made
    blank lines count as content and stopped headers from being recognised).
    """
    rendered = []
    for line in wiki_text.splitlines():
        if is_content(line):
            rendered.append(parse(line))
    return '\n'.join(rendered)
if __name__ == '__main__':
    import sys

    # Usage: <script> INPUT_FILE OUTPUT_FILE
    # Bail out early when either path is missing.
    if len(sys.argv) < 3:
        raise Exception('please provide input & output files (in that order)')

    # Read the whole wikitext source, then write the converted HTML.
    with open(sys.argv[1], 'r') as source_file:
        wikitext = source_file.read()
    with open(sys.argv[2], 'w') as target_file:
        target_file.write(wikiToHtml(wikitext))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment