Skip to content

Instantly share code, notes, and snippets.

@kashav
Created October 13, 2021 20:52
Show Gist options
  • Save kashav/b9b58e77520395e1d42bc9bb7e69345a to your computer and use it in GitHub Desktop.
Save kashav/b9b58e77520395e1d42bc9bb7e69345a to your computer and use it in GitHub Desktop.
Fetch all revisions for a Wikipedia article
#!/usr/bin/env python3
# Mostly stolen from https://stackoverflow.com/questions/45193005
import mwclient
import json
import os
import time
import sys
def main(argv):
    """Dump every revision of a Wikipedia article as JSON lines.

    Each output line is a JSON object with two keys: ``info`` (the record
    yielded by ``page.revisions()``, with its ``timestamp`` reformatted as
    a ``YYYY-MM-DDTHH:MM:SS`` string) and ``content`` (the record yielded
    by ``page.revisions(prop="content")``). One progress line holding the
    revision index and timestamp is printed to stderr per revision.

    Args:
        argv: Command-line vector, ``[prog, article]`` or
            ``[prog, article, outfile]``. Without ``outfile`` the records
            are written to stdout.

    Returns:
        0 on success, 1 when the article argument is missing.
    """
    if len(argv) < 2:
        # Guard against an empty argv so the usage path cannot itself crash.
        prog = argv[0] if argv else "<script>"
        # Usage goes to stderr; stdout is reserved for the JSON records.
        print(f"usage: {prog} <article> [outfile]", file=sys.stderr)
        return 1

    article = argv[1]
    # Honour the outfile even when extra trailing arguments are present.
    to_file = len(argv) > 2
    outfile = open(argv[2], "w", encoding="utf-8") if to_file else sys.stdout
    try:
        site = mwclient.Site("en.wikipedia.org")
        page = site.pages[article]
        # NOTE(review): metadata and content come from two independent
        # revision listings that are paired purely by position; presumably
        # they could drift apart if the article is edited while this runs.
        # Confirm, or fetch both with a single query.
        revisions = zip(page.revisions(), page.revisions(prop="content"))
        for i, (info, content) in enumerate(revisions):
            # Replace the timestamp with a string so json.dumps can emit it.
            info["timestamp"] = time.strftime(
                "%Y-%m-%dT%H:%M:%S", info["timestamp"]
            )
            print(i, info["timestamp"], file=sys.stderr)
            outfile.write(json.dumps({"info": info, "content": content}) + "\n")
    finally:
        # Close the file even if fetching or serialising fails part-way,
        # but never close the interpreter's stdout.
        if to_file:
            outfile.close()
    return 0
# Run as a script: hand main()'s return value to the shell as the exit status.
if __name__ == "__main__":
    # sys.exit is used instead of the bare exit() helper, which is injected
    # by the site module and is absent when the interpreter runs without it.
    sys.exit(main(sys.argv))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment