Last active
August 29, 2015 14:03
-
-
Save legoktm/98a4ea1b217ce9463f7d to your computer and use it in GitHub Desktop.
thingy
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
# SUL audit thingy | |
# GPL v2 / MIT License | |
import calendar | |
import datetime | |
from collections import defaultdict, OrderedDict | |
import os | |
import time | |
import MySQLdb | |
import MySQLdb.cursors | |
# okay, I stole this from pywikibot | |
class Timestamp(datetime.datetime): | |
"""Class for handling Mediawiki timestamps. | |
This inherits from datetime.datetime, so it can use all of the methods | |
and operations of a datetime object. To ensure that the results of any | |
operation are also a Timestamp object, be sure to use only Timestamp | |
objects (and datetime.timedeltas) in any operation. | |
Use Timestamp.fromISOformat() and Timestamp.fromtimestampformat() to | |
create Timestamp objects from Mediawiki string formats. | |
Use Site.getcurrenttime() for the current time; this is more reliable | |
than using Timestamp.utcnow(). | |
""" | |
mediawikiTSFormat = "%Y%m%d%H%M%S" | |
ISO8601Format = "%Y-%m-%dT%H:%M:%SZ" | |
@classmethod | |
def fromISOformat(cls, ts): | |
"""Convert an ISO 8601 timestamp to a Timestamp object.""" | |
return cls.strptime(ts, cls.ISO8601Format) | |
@classmethod | |
def fromtimestampformat(cls, ts): | |
"""Convert the internal MediaWiki timestamp format to a Timestamp object.""" | |
return cls.strptime(ts, cls.mediawikiTSFormat) | |
def toISOformat(self): | |
"""Convert the Timestamp object to an ISO 8601 timestamp""" | |
return self.strftime(self.ISO8601Format) | |
def totimestampformat(self): | |
"""Convert the Timestamp object to the internal MediaWiki timestamp format.""" | |
return self.strftime(self.mediawikiTSFormat) | |
def __str__(self): | |
"""Return a string format recognized by the API""" | |
return self.toISOformat() | |
# This function I didn't steal from pywikibot, it's from | |
# http://ruslanspivak.com/2011/07/20/how-to-convert-python-utc-datetime-object-to-unix-timestamp/ | |
def to_unix(self): | |
return calendar.timegm(self.utctimetuple()) | |
def __add__(self, other): | |
newdt = datetime.datetime.__add__(self, other) | |
if isinstance(newdt, datetime.datetime): | |
return Timestamp(newdt.year, newdt.month, newdt.day, newdt.hour, | |
newdt.minute, newdt.second, newdt.microsecond, | |
newdt.tzinfo) | |
else: | |
return newdt | |
def __sub__(self, other): | |
newdt = datetime.datetime.__sub__(self, other) | |
if isinstance(newdt, datetime.datetime): | |
return Timestamp(newdt.year, newdt.month, newdt.day, newdt.hour, | |
newdt.minute, newdt.second, newdt.microsecond, | |
newdt.tzinfo) | |
else: | |
return newdt | |
class SULAuditer: | |
def get_db(self, dbname): | |
if not dbname in self.db_cache: | |
self.db_cache[dbname] = MySQLdb.connect( | |
db=dbname, | |
read_default_file=os.path.expanduser('~/sul.my.cnf'), | |
cursorclass=MySQLdb.cursors.DictCursor | |
) | |
return self.db_cache[dbname] | |
def close_db(self, dbname): | |
if dbname in self.db_cache: | |
db = self.db_cache.pop(dbname) | |
db.close() | |
def __init__(self): | |
self.db_cache = {} | |
# self.cdb = self.get_db('centralauth') | |
self.local_accounts = defaultdict(int) | |
self.local_attached = defaultdict(int) | |
self.local_not_attached = defaultdict(int) | |
self.local_with_email = defaultdict(int) | |
self.now_utc = Timestamp.utcnow().to_unix() | |
@property | |
def wikis(self): | |
# return ['enwikivoyage'] | |
if not hasattr(self, '_wikis'): | |
with open(os.path.expanduser('~/wikis.csv')) as f: | |
self._wikis = f.read().splitlines() | |
return self._wikis | |
def handle_local_user_info(self, res): | |
AA_DEPLOY = 20130430225551 # SELECT MIN(aa_lastlogin) on enwiki | |
#AA_DEPLOY = 20140702101508 | |
for row in res: | |
#print row | |
# Okay, so if a user hasn't logged in for a VERY long time, they're not in AA. | |
if row['aa_lastlogin']: | |
# yay, they're in AA. | |
touched_ts = row['aa_lastlogin'] | |
elif row['user_touched'] and (int(row['user_touched']) < AA_DEPLOY): | |
# If their user_touched is before AA was deployed, use it. | |
touched_ts = row['user_touched'] | |
else: | |
# Their user_touched is after AA was deployed, but they've never logged in. | |
# So use the oldest timestamp that we know they haven't logged in since. | |
touched_ts = str(AA_DEPLOY) | |
#print touched_ts | |
touched = Timestamp.fromtimestampformat(touched_ts) | |
months = (self.now_utc - touched.to_unix()) / (60 * 60 * 24 * 30) # Okay, estimate a month is 30 days. | |
months += 1 # Touched in the past month (0) is "1 month" | |
self.local_accounts[months] += 1 | |
if row['lu_attached_method']: | |
self.local_attached[months] += 1 | |
else: | |
self.local_not_attached[months] += 1 | |
if row['user_email']: | |
self.local_with_email[months] += 1 | |
def get_bulk_local_user_info(self, dbname, limit=2, last=''): | |
cur = self.get_db(dbname).cursor() | |
t = time.time() | |
cur.execute(""" | |
SELECT | |
user_id, | |
user_name, | |
user_touched, | |
aa_lastlogin, | |
user_id, | |
user_email, | |
lu_attached_method | |
FROM user | |
LEFT JOIN centralauth.localuser AS localuser | |
ON user.user_name=localuser.lu_name AND lu_wiki=%s | |
LEFT JOIN accountaudit_login | |
ON user.user_id=accountaudit_login.aa_user | |
WHERE user_id > %s | |
ORDER BY user_id | |
LIMIT %s""", (dbname, last, limit)) | |
res = cur.fetchall() | |
f = time.time() - t | |
cur.close() | |
#print res | |
self.handle_local_user_info(res) | |
if res: | |
last_name = res[-1]['user_id'] | |
else: | |
last_name = 0 | |
print 'Fetched up til %s, took %s seconds' % (last_name, f) | |
return len(res), last_name | |
def run(self): | |
limit = 5000 | |
for dbname in self.wikis: | |
print 'Starting on %s...' % dbname | |
count, last_name = self.get_bulk_local_user_info(dbname, limit) | |
while count == limit: | |
count, last_name = self.get_bulk_local_user_info(dbname, limit, last_name) | |
self.close_db(dbname) | |
class TableCreator: | |
def clean_dict(self, data): | |
need = [1, 2, 3, 4, 5, 6, 9, 12, 18, 24, 30, 36] | |
l = [] | |
for m in need: | |
#print l | |
l.append(data[m]) | |
# Ugh hack. | |
if type(l[0]) == int: | |
l.insert(0, sum(l)) | |
else: | |
l.insert(0, '-') | |
return l | |
def do_it(self, audit): | |
mapper = OrderedDict([ | |
('total', audit.local_accounts), | |
('attached accounts', audit.local_attached), | |
('non-attached accounts', audit.local_not_attached), | |
('... with e-mail', audit.local_with_email), | |
('... who do not clash with another account', defaultdict(lambda: '-')), | |
('... who clash with a global account', defaultdict(lambda: '-')), | |
('... global clash but appear to be merge-able', defaultdict(lambda: '-')), | |
('... who clash with 1 or more local accounts', defaultdict(lambda: '-')), | |
('... local clash but appear to be merge-able', defaultdict(lambda: '-')), | |
]) | |
text = u""" | |
{| class="wikitable" style="text-align: center;" | |
|- | |
! rowspan="2" colspan="2" | Group !! rowspan="2" | Total !! colspan="12" | Accounts touched in last … !! rowspan="2" | Group name | |
|- | |
! 1mo !! 2mo !! 3mo !! 4mo !! 5mo !! 6mo !! 9mo !! 12mo !! 18mo !! 24mo !! 30mo !! 36mo | |
|- | |
! rowspan="9" | Local accounts""" | |
for desc in mapper: | |
cleaned = self.clean_dict(mapper[desc]) | |
cleaned.insert(0, "''%s''" % desc) | |
text += "\n|" + ' || '.join(str(foo) for foo in cleaned) + '\n|-' | |
text = text[:-2] # Trim the last |- | |
text += '|}' | |
print text | |
if __name__ == '__main__': | |
audit = SULAuditer() | |
audit.run() | |
tc = TableCreator() | |
tc.do_it(audit) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment