# Source: GitHub gist apergos/966c9e251128d3514d03d76625af0d33
# (created June 8, 2021 09:13)
"""
Generate a table of revision ids and times for plotting wiki project growth.

The table lists rev id number vs time, sampled every 10 million revision ids,
in a form suitable for feeding to gnuplot to chart project growth.
"""
import getopt
import json
import sys
import time
import requests
def usage(message=None):
    """
    Display a usage message on stderr, preceded by an optional message
    describing an error, then exit.

    Args:
        message: optional text describing what went wrong; it is written
            on its own line before the usage text.

    Raises:
        SystemExit: always, with exit status 1, so that callers never carry
            on with missing or invalid arguments.
    """
    if message:
        sys.stderr.write(message + "\n")
    usage_message = """Usage: $0 --domain <fqdn> [--startrev <num>] [--endrev <num>]
         [--dryrun] [--verbose]
or: $0 --help

Retrieve revision information in a format that can be fed to gnuplot to generate
a graph showing the increase in the number of revisions over time.

Arguments:

--domain (-d): fqdn of wiki from which to retrieve data. e.g. en.wikipedia.org
--startrev (-s): starting rev_id; default 1
--endrev (-e): ending rev_id; default the current maximum revid
--dryrun (-D): write the commands that would have been run instead of running them
--verbose (-v): write some progress messages some day
--help (-h): show this help message
"""
    sys.stderr.write(usage_message)
    sys.exit(1)
def get_default_opts():
    """
    Build the option dict with every known option set to its default value.

    Returns:
        dict: 'domain' and 'end_rev' are None (not yet supplied),
        'start_rev' is the string '1' (rev ids are kept as strings, the
        same form command-line values arrive in), and 'dryrun' and
        'verbose' are False.  A fresh dict is returned on every call.
    """
    return {
        'domain': None,
        'start_rev': '1',
        'end_rev': None,
        'dryrun': False,
        'verbose': False,
    }
def check_opts(args):
    """
    Whine if mandatory args were not supplied, or if numeric ones aren't
    numeric, and fill in the default ending rev id when none was given.

    Args:
        args: option dict as produced by get_default_opts() and updated
            from the command line.  Modified in place: when 'end_rev' is
            unset it is replaced by the value get_max_rev() returns for
            the domain.

    Raises:
        SystemExit: if the ending rev id is needed but cannot be retrieved.
            Validation failures are reported through usage().
    """
    if not args.get('domain'):
        usage("The argument 'domain' must be specified and may not be empty.")
    # str() so that a value already converted to int still validates
    if not str(args['start_rev']).isdigit():
        usage("The 'startrev' argument must be a number.")
    if args['end_rev'] and not str(args['end_rev']).isdigit():
        usage("The 'endrev' argument must be a number.")
    if not args['end_rev']:
        args['end_rev'] = get_max_rev(args['domain'])
        if args['end_rev'] is None:
            # without this check the failure would only surface later as an
            # obscure TypeError when the value is converted with int()
            sys.stderr.write(
                "Could not determine the maximum rev id for %s; "
                "specify --endrev explicitly.\n" % args['domain'])
            sys.exit(1)
def process_opts():
    """
    Get command-line args and values, falling back to defaults where
    needed, and whining about bad args.

    Options are read from sys.argv[1:].  After parsing, the resulting
    dict is passed to check_opts() so that what is returned has been
    validated and has 'end_rev' filled in.

    Returns:
        dict: the option dict from get_default_opts(), updated with any
        values given on the command line.
    """
    try:
        (options, remainder) = getopt.gnu_getopt(
            sys.argv[1:], "d:e:s:Dvh",
            ["domain=", "startrev=", "endrev=",
             "dryrun", "verbose", "help"])
    except getopt.GetoptError as err:
        usage("Unknown option specified: " + str(err))

    args = get_default_opts()
    for (opt, val) in options:
        if opt in ["-d", "--domain"]:
            args['domain'] = val
        elif opt in ["-e", "--endrev"]:
            args['end_rev'] = val
        elif opt in ["-s", "--startrev"]:
            args['start_rev'] = val
        elif opt in ["-D", "--dryrun"]:
            args['dryrun'] = True
        elif opt in ["-v", "--verbose"]:
            args['verbose'] = True
        elif opt in ["-h", "--help"]:
            usage('Help for this script\n')
        else:
            usage("Unknown option specified: <%s>" % opt)

    if remainder:
        usage("Unknown option(s) specified: {opt}".format(opt=remainder[0]))

    # validate, and resolve the default ending rev id, before anything
    # downstream tries to convert the values to int
    check_opts(args)
    return args
def get_revids_url(domain, revids):
    """
    Return the url and params that will let us get info about the
    specified list of revisions via the mediawiki api for the given domain.

    Args:
        domain: host name of the wiki; 'https://' and '/w/api.php' are
            added around it.
        revids: iterable of revision ids; each is converted with str(), so
            ints and numeric strings are both accepted.

    Returns:
        tuple: (url, params) where params is a dict of query parameters
        asking for the ids and timestamp of each revision, as json.
    """
    url = 'https://' + domain + '/w/api.php'
    params = {
        'action': 'query',
        'prop': 'revisions',
        # the api takes multiple values as one pipe-separated string
        'revids': '|'.join(str(revid) for revid in revids),
        'rvprop': 'ids|timestamp',
        'format': 'json',
    }
    return url, params
def get_session():
    """
    Get an open session for making requests.

    Returns:
        requests.Session: a new session whose default headers carry this
        script's User-Agent and ask for json responses.
    """
    sess = requests.Session()
    # NOTE(review): the User-Agent value begins with a space; a leading
    # script name / contact identifier may be missing -- confirm.
    sess.headers.update(
        {"User-Agent": " (atg: when dumps are going to be infeasible ;-)",
         "Accept": "application/json"})
    return sess
def get_revinfo_from_json(content):
    """
    Given json output from the mediawiki api for revision info, get the
    revids and timestamps out of the results and return them.

    Args:
        content: json text (str or bytes) of the api response.

    Returns:
        tuple: (revid_timestamp, badrevs) where revid_timestamp maps each
        revid found to its timestamp and badrevs is the list of keys from
        the response's 'badrevids' entry (empty if there is none).  When
        the response has no 'pages' entry the dict is empty but badrevs is
        still returned, so the result can always be unpacked.
        None if the content can't be parsed or lacks the expected layout.
    """
    try:
        query = json.loads(content)['query']
        badrevs = list(query.get('badrevids', {}))
        revid_timestamp = {}
        # no 'pages' entry means every requested rev was bad
        for page in query.get('pages', {}).values():
            for revision in page.get('revisions', []):
                revid_timestamp[revision['revid']] = revision['timestamp']
    except (ValueError, KeyError, TypeError, AttributeError):
        # malformed json, or json that is not shaped like an api response
        return None
    return revid_timestamp, badrevs
def get_maxrev_url(domain):
    """
    Return the url and params to retrieve the max rev id for the given
    domain via the mw api.

    Args:
        domain: host name of the wiki; 'https://' and '/w/api.php' are
            added around it.

    Returns:
        tuple: (url, params) where params requests a single entry from the
        'allrevisions' list with direction 'older', as json.
    """
    url = 'https://' + domain + '/w/api.php'
    params = {
        'action': 'query',
        'list': 'allrevisions',
        'arvlimit': '1',
        'arvdir': 'older',
        'format': 'json',
    }
    return url, params
def get_revid_from_json(content):
    """
    Given a json response from the mw api that should contain a rev id,
    extract and return it.

    Args:
        content: json text (str or bytes) of an 'allrevisions' response.

    Returns:
        The 'revid' value of the single revision in the response, or None
        if the content can't be parsed, lacks the expected layout, or does
        not hold exactly one revision.
    """
    try:
        revisions = []
        # each 'allrevisions' entry describes a page and carries that
        # page's revisions in its own 'revisions' list
        for entry in json.loads(content)['query']['allrevisions']:
            revisions.extend(entry['revisions'])
        if len(revisions) != 1:
            return None
        return revisions[0]['revid']
    except (ValueError, KeyError, TypeError, IndexError):
        return None
def get_max_rev(domain):
    """
    Get the max rev id for the specified domain and return it.

    Args:
        domain: host name of the wiki to query.

    Returns:
        The rev id extracted by get_revid_from_json(), or None if the
        request fails, the response status is not 200, or no rev id can be
        extracted from the response.
    """
    url, params = get_maxrev_url(domain)
    sess = get_session()
    try:
        response = sess.get(url, params=params, timeout=5)
    except requests.RequestException as err:
        # timeouts, dns failures etc. are reported the same way as a bad
        # status instead of ending the script with a traceback
        sys.stderr.write("failed to get revid for %s (%s)\n" % (url, err))
        return None
    finally:
        # the session is only used for this one request
        sess.close()
    if response.status_code != 200:
        sys.stderr.write("failed to get revid for %s\n" % url)
        return None
    return get_revid_from_json(response.content)
def get_revinfo(revids, domain):
    """
    Get revision information for the specified revids at the given domain.

    Args:
        revids: list of revision ids to ask about in one request.
        domain: host name of the wiki to query.

    Returns:
        Whatever get_revinfo_from_json() produces for the response body
        (normally a (revinfo, badrevs) pair), or None if we can't get a
        good response: the request fails, the status is not 200, or the
        body can't be parsed.
    """
    url, params = get_revids_url(domain, revids)
    sess = get_session()
    try:
        response = sess.get(url, params=params, timeout=5)
    except requests.RequestException as err:
        sys.stderr.write("failed to get revid for %s (%s)\n" % (url, err))
        return None
    finally:
        # the session is only used for this one request
        sess.close()
    if response.status_code != 200:
        sys.stderr.write("failed to get revid for %s\n" % url)
        return None
    # hand the parse result back untouched rather than unpacking it here:
    # the parser returns None for unusable content, which cannot be unpacked
    return get_revinfo_from_json(response.content)
def display_revinfo(revinfo):
    """
    Print rev id and timestamp for each revision, one revision per line,
    in ascending rev id order.

    Args:
        revinfo: dict mapping rev id to timestamp.
    """
    for revid in sorted(revinfo):
        print(revid, revinfo[revid])
def do_revrange(revrange, domain, batch_size=10):
    """
    Get and display revinfo for a list of revisions, in batches.

    Args:
        revrange: sequence or iterable of revision ids (ints or strings).
        domain: host name of the wiki to query.
        batch_size: number of revision ids sent per api request.

    Returns:
        list: the rev ids the api reported as bad, collected across all
        batches, so that the caller can retry with different ids.  Empty
        when nothing needs to be redone.
    """
    redo = []
    revids = [str(revid) for revid in revrange]
    for start in range(0, len(revids), batch_size):
        batch = revids[start:start + batch_size]
        result = get_revinfo(batch, domain)
        if not result:
            # no usable response for this batch; skip it rather than
            # crash trying to unpack it
            sys.stderr.write(
                "no revision info retrieved for revs %s\n" % ",".join(batch))
            continue
        revinfo, badrevs = result
        if revinfo:
            display_revinfo(revinfo)
        redo.extend(badrevs)
    return redo
def do_main():
    """
    Entry point: process the command-line options, then print rev id and
    timestamp for every 10 millionth revision id from the starting rev id
    up to the ending rev id.

    Rev ids the api reports as bad are retried with the next higher rev id
    until a good one is found or the ending rev id is passed.

    Raises:
        SystemExit: if no ending rev id is available.
    """
    args = process_opts()
    if args['verbose']:
        # stderr, so that stdout carries nothing but the data table
        sys.stderr.write("running with arguments: %s\n" % args)
    if args['end_rev'] is None:
        sys.stderr.write(
            "No ending rev id available; specify --endrev explicitly.\n")
        sys.exit(1)

    start_rev = int(args['start_rev'])
    end_rev = int(args['end_rev'])
    redo = do_revrange(range(start_rev, end_rev, 10000000), args['domain'])
    while redo:
        # try the successor of each bad rev id; successors past the ending
        # rev id are dropped, which guarantees that the loop terminates
        successors = [int(rev) + 1 for rev in redo]
        retry = [str(rev) for rev in successors if rev <= end_rev]
        redo = do_revrange(retry, args['domain'])
# run only when executed as a script, not when imported as a module
if __name__ == '__main__':
    do_main()