Skip to content

Instantly share code, notes, and snippets.

@dpriskorn
Last active May 22, 2023 20:55
Show Gist options
  • Save dpriskorn/2aa823a2c82651d2bf657a5cc90583c3 to your computer and use it in GitHub Desktop.
Save dpriskorn/2aa823a2c82651d2bf657a5cc90583c3 to your computer and use it in GitHub Desktop.
wcd-prototype reference graph by pt [[Usuário:Danilo.mac]]
#!/usr/bin/env python3
"""
@ Author: [[Usuário:Danilo.mac]]
@ License: GNU General Public License 3.0 (GPL V3) and Creative Commons Attribution/Share-Alike (CC-BY-SA)
Description: Searches for references in the Portuguese Wikipedia history dumps and
builds a ranking of the users who referenced the most articles.
"""
import bz2, re, mariadb, os, time, json, pickle
from collections import defaultdict, Counter
# year -> count of pages unreferenced since that year (only updated by Page.end,
# which is never called in this file — NOTE(review): appears to be legacy)
norefsince = Counter()
# eight years before the current year; NOTE(review): unused in this file
maxyear = str(int(time.ctime()[-4:]) - 8)
# year -> Counter mapping userid -> number of "referencing" edits credited
users = defaultdict(Counter)
# year -> {userid -> list of up to 5 sample revision ids}
revids = {}
# NOTE(review): the two flags below are never read in this file
somerefpage = False
texttest = True
print(time.strftime('%Y-%m-%d %H:%M:%S'))
# namespaces scanned: 0 (articles) and 102 — presumably "Anexo" on ptwiki, TODO confirm
namespaces = ('0', '102')
# extracts simple XML elements (<ns>/<id>/<timestamp>/<text>/<sha1>/<title>) from the raw dump bytes
reTag = re.compile(br'(?s)<(ns|id|timestamp|text|sha1|title)[^<>]*>([^<]*)</\1>')
#reRef = re.compile(r'&lt;[Rr][Ee][Ff]')
# inline references: HTML-escaped <ref ...>, {{Harvref|, or {{Sfn|
reRef = re.compile(r'&lt;[Rr][Ee][Ff][ &]|\{\{[Hh]arv[Rr]ef\||\{\{[Ss]fn\|')
# references section: {{Referências}}, escaped <references>, or a "== Referências ==" heading
reRefSec = re.compile(r'\{\{[Rr]eferências[\|\}]|&lt;references[ &]|\n== ?Referências ?==\n')
# Pick the newest ptwiki dump whose full-history bz2 job has finished.
# Leaves `ddir` (the chosen dump directory) and `dumps` (its .bz2 part files,
# sorted by starting page id) bound for the rest of the script.
base = '/public/dumps/public/ptwiki/'
dated = sorted((d for d in os.listdir(base) if d.isdigit()), reverse=True)
for ddir in (base + d for d in dated):
    with open(ddir + '/dumpstatus.json') as f:
        dstatus = json.load(f)
    job_status = dstatus['jobs']['metahistorybz2dump']['status']
    if job_status != 'done':
        print('pulando %s, status = %s' % (ddir, job_status))
        continue
    dumps = [ddir + '/' + a for a in os.listdir(ddir)
             if re.match(r'ptwiki-.*-pages-meta-history.*\.bz2', a)]
    dumps.sort(key=lambda d: int(re.search(r'\.xml-p(\d+)p', d).group(1)))
    print(ddir)
    break
else:
    # no directory had a finished full-history job
    raise Exception('dumps não encontrados nos diretórios')
start = int(time.time())
def runtime():
t = int(time.time()) - start
return (t > 3600 and '{}h '.format(t // 3600) or '') + ('{}min '.format(t % 3600 // 60)) + ('{}s'.format(t % 60))
def query(sql):
    """Run *sql* against the ptwiki replica and return all rows.

    Opens a fresh connection per call; the try/finally guarantees the
    connection is closed even when execute/fetchall raises (the original
    leaked it on error).
    """
    connection = mariadb.connect(db='ptwiki_p', host='ptwiki.labsdb',
                                 default_file=os.path.expanduser('~/replica.my.cnf'))
    try:
        cursor = connection.cursor()
        cursor.execute(sql)
        return cursor.fetchall()
    finally:
        connection.close()
class Page(object):
    """Tracks one article's revision stream to find the edit that first added references.

    ``revision`` is the detection logic actually used by refsInHistory.
    ``revision_old`` and ``end`` look like an earlier approach: they rely on
    attributes (``firstref``, ``norefsince``, ``revid``) that ``__init__``
    never sets, and ``end`` is not called anywhere in this file —
    NOTE(review): likely legacy code kept for reference.
    """

    def __init__(self, title):
        self.title = title
        self.isopen = True      # still scanning: no qualifying referencing edit found yet
        self.firstedit = True   # the next revision seen is the page's first edit
        self.lastnoref = ''     # timestamp of the last edit without references

    def revision(self, revid, ts, userid, text, sha1):
        """Process one revision (fed in dump order); credit the first edit that adds refs.

        ts is a compact YYYYMMDDHHMMSS string; userid is 0 for anonymous edits
        (refsInHistory resets it to 0 between revisions).
        """
        if not self.isopen:
            return
        ref = bool(reRef.search(text) or reRefSec.search(text))
        if not ref:
            # still unreferenced; a later referencing edit won't be the first edit
            self.firstedit = False
            return
        if self.firstedit or not userid:
            # references present from the very first edit, or added anonymously:
            # nobody gets credit — stop scanning this page
            self.isopen = False
            return
        year = ts[0:4]
        users[year][userid] += 1
        # keep at most 5 sample revision ids per user/year; once full, overwrite
        # a pseudo-random slot chosen by revid % 5
        l = revids.setdefault(year, {}).setdefault(userid, [])
        if len(l) >= 5:
            l[revid % 5] = revid
        else:
            l.append(revid)
        self.isopen = False

    def revision_old(self, revid, ts, userid, text, sha1):
        """Earlier detection variant (legacy — see class docstring).

        NOTE(review): reads self.firstref before any code path assigns it;
        would raise AttributeError on first use as written.
        """
        ref = bool(reRef.search(text) or reRefSec.search(text))
        if not ref:  # no reference in this revision
            self.lastnoref = max(ts, self.lastnoref)
            if self.firstref[0]:  # the previous edit had a reference
                self.firstref = ('', '', None)
                self.norefsince = self.lastnoref
            elif not self.norefsince:
                self.norefsince = ts
        # has refs, the timestamp precedes the recorded first reference, and it is not the first edit
        elif (not self.firstref[0] or self.firstref[0] > ts) and not self.firstedit:
            self.firstref = self.previeusref = (ts, userid, sha1)
            self.revid = revid
        self.firstedit = False

    def end(self):
        """Finalize a page (legacy; pairs with revision_old). Never called in this file."""
        if not self.isopen:
            return
        print('firstref = %r, norefsince = %r, revid = %r' % (self.firstref, self.norefsince, self.revid))
        if not self.firstref[0]:
            self.isopen = False
            norefsince[self.norefsince[0:4]] += 1
            return
        year = self.firstref[0][0:4]
        users[year][self.firstref[1]] += 1
        # `if True or ...` short-circuits before evaluating `getrevid`, which is
        # undefined at module level — NOTE(review): leftover debugging guard
        if True or self.firstref[1] in getrevid:
            l = revids[year][self.firstref[1]]
            if len(l) >= 5:
                l[self.revid % 5] = self.revid
            else:
                l.append(self.revid)
        self.isopen = False
def gen(dumps):
    """Stream (tag, value) string pairs from the bz2 history dump files.

    Reads each dump in ~20 MB chunks into one shared bytearray buffer, yields
    every element matched by the module-level reTag pattern, then discards the
    consumed prefix and refills; a read that does not grow the buffer means
    the file is exhausted. Appends a per-file timing line to refsgraf.log.
    """
    buf = bytearray()
    for dump in dumps:
        print(dump)
        f = bz2.BZ2File(dump)
        buf.extend(f.read(20000000))
        while True:
            for match in reTag.finditer(buf):
                tag, value = match.group(1).decode(), match.group(2).decode()
                yield tag, value
            # NOTE(review): original indentation was lost; the prefix deletion is
            # placed after the finditer loop (deleting inside it would shift the
            # buffer under the iterator). Raises NameError if a chunk contains no
            # match at all — unlikely with 20 MB reads, but TODO confirm.
            del buf[0:match.end()]
            l = len(buf)
            buf.extend(f.read(20000000))
            if len(buf) == l:
                break
        f.close()
        with open('refsgraf.log', 'a') as logfile:
            logfile.write('%s em %s\n' % (dump, runtime()))
def refsInHistory():
    """Scan the full-history dump, feeding each eligible article's revisions to Page.

    Eligible page ids come from the replica DB: non-redirect pages in
    namespaces 0 and 102, excluding disambiguations. Pickles (users, revids)
    to refusers.pkl at the end and returns the number of pages processed.
    """
    pages = {int(i[0]) for i in query("SELECT page_id FROM page WHERE page_namespace IN (0, 102) AND NOT page_is_redirect AND page_id NOT IN (SELECT cl_from FROM categorylinks WHERE cl_to = 'Desambiguação')")}
    c, p = 0, 0        # c: revisions seen, p: pages processed
    pagens = False     # True while waiting for the page <id> that follows an accepted <ns>
    page = None
    ts = None
    userid = 0
    hourlog = 1        # next whole-hour threshold for periodic progress output
    log10 = 1          # next power-of-ten page count for progress output
    with open('refsgraf.log', 'a') as logfile:
        logfile.write(time.strftime('%d-%m-%Y %H:%M:%S ') + ddir + '\n')
    for tag, value in gen(dumps):
        if tag == 'ns':
            # <ns> marks the start of a new page; report progress on the previous one.
            # NOTE(review): original indentation lost — the nesting of the two
            # progress checks under `if page:` is a best-effort reconstruction.
            if page:
                if p >= log10:
                    log10 = log10 * 10
                    print('%d páginas em %s' % (p, runtime()))
                if hourlog < (int(time.time()) - start) / 3600:
                    hourlog += 1
                    print('%d páginas em %s' % (p, runtime()))
            if value in namespaces:
                pagens = True
            page = None
        elif tag == 'title':
            title = value
        elif pagens and tag == 'id':  # page id
            pagens = False
            pageid = int(value)
            if pageid not in pages:
                continue
            page = Page(title)
            p += 1
        elif page:
            if tag == 'timestamp':
                # '2023-05-22T20:55:00Z' -> '20230522205500'
                ts = value[0:4] + value[5:7] + value[8:10] + value[11:13] + value[14:16] + value[17:19]
            elif not ts and tag == 'id':  # revision id (an <id> seen before the timestamp)
                revid = int(value)
            elif ts and tag == 'id':  # contributor user id (an <id> seen after the timestamp)
                userid = int(value)
            elif tag == 'text':
                text = value
            elif tag == 'sha1':
                # <sha1> closes a revision: hand the accumulated fields to the page
                c += 1
                page.revision(revid, ts, userid, text, value)
                ts = None
                userid = 0  # reset so anonymous edits (no contributor id) stay 0
    with open('refusers.pkl', 'wb') as f:
        pickle.dump((users, dict(revids)), f)
    return p
def mkrank():
    """Build the wikitable ranking of the top referencing users per year.

    Loads (users, revids) from refusers.pkl when refsInHistory did not run in
    this process, resolves user names via the replica DB, and writes the
    wikitext to refrank.txt. Skips the current (incomplete) year.
    """
    global users, revids
    if not users:
        with open('refusers.pkl', 'rb') as f:
            users, revids = pickle.load(f)
    rank = {year: users[year].most_common(10) for year in users}  # top 10 per year
    del users
    ids = {user[0] for year in rank for user in rank[year]}
    connection = mariadb.connect(db='ptwiki_p', host='ptwiki.labsdb', default_file=os.path.expanduser('~/replica.my.cnf'))
    c = connection.cursor()
    # ids only contains ints (produced by int()), so %d interpolation is injection-safe here
    sql = '''SELECT user_id, user_name FROM user WHERE user_id IN (%s)''' % ','.join('%d' % id for id in ids)
    c.execute(sql)
    ids = {int(l[0]): l[1].decode() for l in c}  # rebind: userid -> user name
    connection.close()
    table = ['{|class=wikitable\n|-\n!Posição||Usuário||Número de<br>referenciações||Exemplos']
    for year in sorted(rank.keys(), reverse=True):
        if year == time.strftime('%Y'):
            continue
        table.append('|-\n!colspan=4 style="font-size:x-large"|%s' % year)
        for r, user in enumerate(rank[year]):
            # user is (userid, count); list up to 5 sample diffs or 'none'
            # (the genexp's `r` has its own scope and does not clobber the enumerate index)
            revs = user[0] in revids[year] and ', '.join('{{dif|%d}}' % r for r in revids[year][user[0]]) or 'none'
            table.append('|-\n|%dº||%s||%d||%s' % (r + 1, ids[user[0]], user[1], revs))
    table.append('|}')
    text = '''Ranking de referenciadores.
A pesquisa foi feita no dump dos históricos, que contém todas edições de todos artigos, foram consideradas referenciações as edições que adicionaram referências ou seção de referência e que não foram a primeira edição do artigo.'''
    with open('refrank.txt', 'w') as f:
        f.write(text + '\n\n' + '\n'.join(table))
    print('feito')
if __name__ == "__main__":
    # Full run: scan the history dump, then render the ranking.
    p = refsInHistory()
    print('%s páginas em %s' % (p, runtime()))
    mkrank()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment