
@bnewbold
Created August 13, 2021 02:39
Quick bulk extraction of Wikipedia references from an XML dump, using mwxml and wikiciteparser
#!/usr/bin/env python3
import sys
import json

import mwxml
import mwtypes
import mwtypes.files
import mwparserfromhell
from wikiciteparser.parser import parse_citation_template


def extract_revision(revision: mwtypes.Revision) -> dict:
    meta = {}
    meta["revision_id"] = revision.id
    refs = []
    # Parse the revision wikitext and run every template through
    # wikiciteparser; non-citation templates yield an empty result and are skipped.
    wikicode = mwparserfromhell.parse(revision.text)
    for tmpl in wikicode.filter_templates():
        parsed = parse_citation_template(tmpl)
        if parsed:
            refs.append(parsed)
    meta["refs"] = refs
    return meta


def run(xml_file_path: str):
    """
    Iterate over all revisions of all pages in the main (article) namespace
    from the dump. If we are processing one of the 'current' dumps, there
    will be only one Revision per Page.
    """
    dump = mwxml.Dump.from_file(mwtypes.files.reader(xml_file_path))
    site_name = dump.site_info.dbname
    for page in dump.pages:
        # Skip pages outside namespace 0 and redirects.
        if (page.namespace not in [0, "0"]) or page.redirect:
            # print(f"SKIPPED: [{page.namespace}] {page.title} redirect={page.redirect}", file=sys.stderr)
            continue
        for revision in page:
            # Skip revisions whose text was deleted or is empty.
            if revision.deleted.text or not revision.text:
                continue
            meta = extract_revision(revision)
            meta["site_name"] = site_name
            meta["page_title"] = page.title
            # One JSON object per revision, printed as a single line (JSON Lines).
            print(json.dumps(meta))


if __name__ == "__main__":
    if len(sys.argv) != 2:
        print("I need a single argument: XML dump file path", file=sys.stderr)
        sys.exit(-1)
    run(sys.argv[1])
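
For reference, the snippet below is a minimal sketch of what parse_citation_template returns for a single hand-written {{cite journal}} reference, separate from the dump-processing loop above. The example wikitext is made up, and the printed field names are illustrative only: the keys in the parsed dict are determined by wikiciteparser's template handling and vary by citation template type.

#!/usr/bin/env python3
# Sketch: run one citation template through the same parsing path as the
# script above. Output keys depend on wikiciteparser, so treat them as
# illustrative rather than a fixed schema.
import json
import mwparserfromhell
from wikiciteparser.parser import parse_citation_template

wikitext = (
    "Some article text.<ref>{{cite journal |last=Doe |first=Jane "
    "|title=An Example Paper |journal=Journal of Examples |year=2020}}</ref>"
)

wikicode = mwparserfromhell.parse(wikitext)
for tmpl in wikicode.filter_templates():
    parsed = parse_citation_template(tmpl)
    if parsed:
        # One dict per recognized citation template, mirroring the
        # per-revision "refs" entries emitted by the script above.
        print(json.dumps(parsed, indent=2))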