Skip to content

Instantly share code, notes, and snippets.

@milimetric
Created November 29, 2016 05:17
Show Gist options
  • Save milimetric/a8447734a6f650c73b7a16d44ec2574a to your computer and use it in GitHub Desktop.
Save milimetric/a8447734a6f650c73b7a16d44ec2574a to your computer and use it in GitHub Desktop.
# NOTE: required for the following to work:
# !pip install pymysql\n",
# !git clone https://gerrit.wikimedia.org/r/p/operations/mediawiki-config\n",
# !cd mediawiki-config && git pull origin master"
import pymysql
import ipaddress
import os
connection = pymysql.connect(
host='analytics-store.eqiad.wmnet',
read_default_file='/etc/mysql/conf.d/research-client.cnf',
charset='utf8mb4',
db='commonswiki',
cursorclass=pymysql.cursors.DictCursor
)
LABS_NETWORKS = [
ipaddress.IPv4Network(net)
for net in ['10.68.0.0/24', '10.68.16.0/21', '10.68.32.0/24', '10.68.48.0/24']
]
"""
-- NOTE: couldn't get this python to run because I can't install pymysql without pip on stat1003
-- So here's the alternative SQL:
set @start = '20161001000000';
set @end = '20161001000100';
select count(*) as total_edits,
sum(if(
inet_aton(cuc_ip) between inet_aton('10.68.0.0') and inet_aton('10.68.0.255')
or inet_aton(cuc_ip) between inet_aton('10.68.16.0') and inet_aton('10.68.16.255')
or inet_aton(cuc_ip) between inet_aton('10.68.17.0') and inet_aton('10.68.17.255')
or inet_aton(cuc_ip) between inet_aton('10.68.18.0') and inet_aton('10.68.18.255')
or inet_aton(cuc_ip) between inet_aton('10.68.32.0') and inet_aton('10.68.32.255')
or inet_aton(cuc_ip) between inet_aton('10.68.48.0') and inet_aton('10.68.48.255')
, 1, 0)) as labs_edits
from enwiki.cu_changes
where cuc_timestamp > @start and cuc_timestamp < @end
;
"""
def get_edit_counts(dbname, startts, endts):
"""
Returns number of labs edits (and total edits) for given date range in given db
"""
connection.ping(True)
labs_edits = 0
total_edits = 0
with connection.cursor() as cur:
cur.execute("USE %s" % dbname)
cur.execute("""
SELECT cuc_ip FROM cu_changes
WHERE cuc_timestamp > %s AND cuc_timestamp < %s
""", (startts, endts))
for row in cur:
total_edits += 1
#if total_edits % 1000000 == 0:
#print('processed %s edits so far' % total_edits)
try:
ip = ipaddress.IPv4Address(row['cuc_ip'].decode('utf-8'))
except ipaddress.AddressValueError:
# IPV4 address
continue
for network in LABS_NETWORKS:
if ip in network:
labs_edits += 1
continue
def get_public_open_wikis():
"""
Return list of dbnames of public, non-closed wikis.
Requires a mediawiki-config git clone in current directory
"""
def get_dblist(name):
"""
Returns a set containing dbs in a given dblist
"""
path = os.path.join('mediawiki-config', 'dblists', name + '.dblist')
with open(path, encoding='utf-8') as f:
return set([l.strip() for l in f])
return get_dblist('all') - get_dblist('closed') - get_dblist('private')
def calc_labs_edits(starttime, endttime):
"""
Calculate the number of all / labs edits for all open wikis in a given time period
Return a dict of 'dbname' => { 'all': <all-edits-count>, 'labs': <labs-edit-count>}
"""
stats = {}
for dbname in get_public_open_wikis():
try:
stats[dbname] = get_edit_counts(dbname, starttime, endttime)
except pymysql.InternalError as e:
print('skipping %s' % (dbname))
return stats
calc_labs_edits('20161001000000', '20161001000100')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment