Skip to content

Instantly share code, notes, and snippets.

@javiercantero
Created March 3, 2016 11:28
Show Gist options
  • Save javiercantero/3c4a05c7193a3068f9ea to your computer and use it in GitHub Desktop.
Save javiercantero/3c4a05c7193a3068f9ea to your computer and use it in GitHub Desktop.
A simple script to transform Wikipedia articles from inline references style to list-defined references style
#!/usr/bin/python3
#
# inline2ldr
#
# Extract all the inline citations from a Wikipedia article's text and
# paste them into the {{reflist}} template. The tool automatically
# generates reference names ("autoref0001", ...) for those without one.
#
# Usage: inline2ldr < article-orig.txt > article.txt
#
# ####################################################################
#
# inline2ldr - transform Wikipedia inline into list-defined references
# Copyright (C) 2016 Javier Cantero
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
# ####################################################################
import sys
import re
# Patterns are raw strings so that \s, \w and \| reach the regex engine
# verbatim instead of being treated as (invalid) string escapes.

# Inline reference wrapping a single {{cite ...}} template.
#   group(2): optional ref name, bare or double-quoted (quotes included)
#   group(3): the {{cite ...}} template text
REF_REGEX = r'<ref(\s+name=([\w-]+|"[^"]+"))?\s*>\s*({{cite[^}]+}})\s*</ref>'
# A {{cite <type>|field|...}} template with at least one field.
CITE_REGEX = r'{{cite\s+(\S[^|]+)(\s*\|\s*[^}|]+)+\s*}}'
# The {{reflist}} / {{Reflist}} template, optionally with one \w+ parameter.
REFLIST_REGEX = r'{{[Rr]eflist(\|\w+)?}}'
# Self-closing named reference; group(1) is the name, bare or double-quoted
# (quotes included). The bare form is one or more word characters or hyphens.
REFNAME_REGEX = r'<ref\s+name=([\w-]+|"[^"]+")\s*/>'
class CiteException(Exception):
    """Error raised when a {{cite}} template cannot be processed.

    The description given to the constructor is kept in ``value``;
    ``str()`` of the exception is ``repr(value)``.
    """

    def __init__(self, value):
        # Forward the description to Exception so that ``args`` (and with
        # it repr() and pickling) carries it instead of being empty.
        super().__init__(value)
        self.value = value

    def __str__(self):
        # repr() of the stored value, so a string description is shown quoted.
        return repr(self.value)
class Cite(object):
    """Validates and normalizes ``{{cite ...}}`` templates."""

    def __init__(self):
        # Compiled once per instance; normalize() reuses it for every cite.
        self.r = re.compile(CITE_REGEX, flags=re.M)

    def normalize(self, cite):
        """Return *cite* rewritten in a compact, whitespace-trimmed form.

        The result has the shape ``{{cite <type>|<name>=<value>|...}}``
        with the type, every field name and every field value stripped of
        surrounding whitespace.

        Raises:
            CiteException: if *cite* does not match CITE_REGEX, or if any
                field lacks an ``=`` separator.
        """
        # The slice below assumes the text starts with "{{cite" and ends
        # with "}}". match() does not anchor the end, so trailing whitespace
        # would pass validation and then corrupt the slice; strip it first.
        cite = cite.strip()
        if self.r.match(cite) is None:
            raise CiteException('Error cite format')

        # Drop the leading "{{cite" (6 characters) and the trailing "}}".
        cite_type, *cite_fields = cite[6:-2].split('|')
        pieces = ["{{cite %s" % (cite_type.strip())]
        for cite_field in cite_fields:
            # Split on the first "=" only: values (e.g. URLs) may contain "=".
            field_name, separator, field_value = cite_field.partition('=')
            if not separator:
                raise CiteException('Error cite field format')
            pieces.append("|%s=%s" % (field_name.strip(), field_value.strip()))
        pieces.append("}}")
        return "".join(pieces)
def _strip_ref_quotes(ref_name):
    """Return *ref_name* without its surrounding double quotes, if it has them."""
    if ref_name[0] == '"' and ref_name[-1] == '"':
        return ref_name[1:-1]
    return ref_name


def main():
    """Convert the article on stdin to list-defined references on stdout.

    Every ``<ref>...{{cite ...}}...</ref>`` matched by REF_REGEX is replaced
    by ``{{r|name}}`` and its normalized cite is collected; unnamed refs
    get generated names ("autoref0001", ...). Self-closing
    ``<ref name=... />`` tags are replaced by ``{{r|name}}`` as well. The
    collected refs are written into a ``{{reflist|30em|refs=...}}`` block
    that replaces the first match of REFLIST_REGEX, or is appended to the
    article when there is none.

    Duplicated names and cites that fail normalization are reported on
    stderr and left untouched in the text. A summary count is written to
    stderr at the end.
    """
    article = sys.stdin.read()

    # find references and store them in a dict (ref name = key)
    # if the ref has no name, then generate one
    refs = {}
    refs_order = []
    c = Cite()
    autoref_count = 1
    r = re.compile(REF_REGEX, flags=re.M)
    # finditer() keeps scanning the string it was given, so rebinding
    # `article` inside the loop does not disturb the iteration.
    for m in r.finditer(article):
        if m.group(2) is None:
            ref_name = 'autoref%04d' % autoref_count
            autoref_count += 1
        else:
            ref_name = _strip_ref_quotes(m.group(2))

        if ref_name in refs:
            sys.stderr.write("Error: Duplicated reference %s\n" % (ref_name))
            continue

        ref_cite = m.group(3)
        try:
            ref_cite = c.normalize(ref_cite)
        except CiteException as ex:
            sys.stderr.write("Error: unknown cite %s (%s)\n" % (ref_cite, ex))
            continue

        refs[ref_name] = ref_cite
        refs_order.append(ref_name)
        # replace inline refs by the {{r}} template
        article = article.replace(m.group(0), '{{r|%s}}' % (ref_name), 1)

    # replace the remainder <ref> tags by the {{r}} template
    r = re.compile(REFNAME_REGEX, flags=re.M)
    article = r.sub(
        lambda m: '{{r|%s}}' % (_strip_ref_quotes(m.group(1))),
        article,
    )

    entries = [
        '<ref name="%s">%s</ref>\n' % (ref_name, refs[ref_name])
        for ref_name in refs_order
    ]
    new_reflist = "{{reflist|30em|refs=\n" + "".join(entries) + "}}\n"

    # find {{reflist}} and add found references
    reflist_r = re.compile(REFLIST_REGEX, flags=re.M)
    # A function replacement inserts new_reflist literally. Passing it as a
    # replacement string would make re interpret any backslash found in a
    # citation as an escape or group reference.
    article, replaced = reflist_r.subn(lambda _m: new_reflist, article, count=1)
    if not replaced:
        article += "\n\n" + new_reflist

    print(article)
    sys.stderr.write("Inline references found: %d\n" % (len(refs)))
# Run the conversion only when executed as a script, so that importing this
# module does not read stdin as a side effect.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment