wcd-prototype reference graph for the Portuguese Wikipedia, by [[Usuário:Danilo.mac]]
#!/usr/bin/env python3
"""
@ Author: [[Usuário:Danilo.mac]]
@ License: GNU General Public License 3.0 (GPL V3) and Creative Commons Attribution/Share-Alike (CC-BY-SA)

Description: Searches the Portuguese Wikipedia full-history dump for
references and builds a ranking of the users who referenced the most articles.
"""
import bz2, re, mariadb, os, time, json, pickle
from collections import defaultdict, Counter

norefsince = Counter()          # year -> pages unreferenced since that year (legacy path)
maxyear = str(int(time.strftime('%Y')) - 8)  # currently unused
users = defaultdict(Counter)    # year -> Counter mapping userid -> referencing edits
revids = {}                     # year -> userid -> up to 5 example revision ids
somerefpage = False             # currently unused
texttest = True                 # currently unused
print(time.strftime('%Y-%m-%d %H:%M:%S'))
namespaces = ('0', '102')       # articles and "Anexo" pages
# NB: text inside the XML dump is entity-escaped, so wiki markup such as
# <ref> appears in the stream as &lt;ref ...&gt;; reRef and reRefSec match
# that escaped form, while reTag matches the real (unescaped) XML tags.
reTag = re.compile(br'(?s)<(ns|id|timestamp|text|sha1|title)[^<>]*>([^<]*)</\1>')
#reRef = re.compile(r'&lt;[Rr][Ee][Ff]')
reRef = re.compile(r'&lt;[Rr][Ee][Ff][ &]|\{\{[Hh]arv[Rr]ef\||\{\{[Ss]fn\|')
reRefSec = re.compile(r'\{\{[Rr]eferências[\|\}]|&lt;references[ &]|\n== ?Referências ?==\n')
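# Quick sanity sketch of the patterns above (made-up samples of escaped dump
# text, not real dump content):
#   bool(reRef.search('texto &lt;ref name="a"&gt;fonte&lt;/ref&gt;'))  # True
#   bool(reRefSec.search('corpo\n== Referências ==\nfim'))            # True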
for ddir in sorted([d for d in os.listdir('/public/dumps/public/ptwiki/') if d.isdigit()], reverse=True):
    ddir = '/public/dumps/public/ptwiki/' + ddir
    with open(ddir + '/dumpstatus.json') as f:
        dstatus = json.load(f)
    if dstatus['jobs']['metahistorybz2dump']['status'] != 'done':
        print('skipping %s, status = %s' % (ddir, dstatus['jobs']['metahistorybz2dump']['status']))
        continue
    dumps = [ddir + '/' + a for a in os.listdir(ddir) if re.match(r'ptwiki-.*-pages-meta-history.*\.bz2', a)]
    dumps.sort(key=lambda d: int(re.search(r'\.xml-p(\d+)p', d).group(1)))
    print(ddir)
    break
else:
    raise Exception('no dumps found in the directories')
start = int(time.time())

def runtime():
    t = int(time.time()) - start
    return ('{}h '.format(t // 3600) if t > 3600 else '') + '{}min '.format(t % 3600 // 60) + '{}s'.format(t % 60)
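# Examples of runtime() output: 3723 s elapsed -> '1h 2min 3s'; 59 s -> '0min 59s'.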
def query(sql):
    """Run a read-only query against the ptwiki replica and return all rows."""
    connection = mariadb.connect(db='ptwiki_p', host='ptwiki.labsdb',
                                 default_file=os.path.expanduser('~/replica.my.cnf'))
    c = connection.cursor()
    c.execute(sql)
    r = c.fetchall()
    connection.close()
    return r
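# Usage sketch (a hypothetical query; credentials come from ~/replica.my.cnf
# as above):
#   n_articles = query("SELECT COUNT(*) FROM page WHERE page_namespace = 0")[0][0]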
class Page(object):
    def __init__(self, title):
        self.title = title
        self.isopen = True      # still waiting for the first referencing edit
        self.firstedit = True
        self.lastnoref = ''     # last edit without a reference
        # The attributes below are used only by the legacy
        # revision_old()/end() path, which refsInHistory() no longer calls.
        self.firstref = ('', '', None)
        self.norefsince = ''
        self.revid = None

    def revision(self, revid, ts, userid, text, sha1):
        if not self.isopen:
            return
        ref = bool(reRef.search(text) or reRefSec.search(text))
        if not ref:
            self.firstedit = False
            return
        # Pages whose very first edit already had references, and referencing
        # edits made anonymously (userid 0), are not credited to anyone.
        if self.firstedit or not userid:
            self.isopen = False
            return
        year = ts[0:4]
        users[year][userid] += 1
        # Keep at most 5 example revision ids per user and year; once full,
        # replace a pseudo-random slot (revid % 5).
        l = revids.setdefault(year, {}).setdefault(userid, [])
        if len(l) >= 5:
            l[revid % 5] = revid
        else:
            l.append(revid)
        self.isopen = False
    def revision_old(self, revid, ts, userid, text, sha1):
        # Legacy variant kept for reference; refsInHistory() no longer calls it.
        ref = bool(reRef.search(text) or reRefSec.search(text))
        if not ref:  # no reference in this revision
            self.lastnoref = max(ts, self.lastnoref)
            if self.firstref[0]:  # the previous revision had a reference
                self.firstref = ('', '', None)
                self.norefsince = self.lastnoref
            elif not self.norefsince:
                self.norefsince = ts
        # has a reference, its timestamp is older than the earliest reference
        # seen so far, and it is not the first edit of the page
        elif (not self.firstref[0] or self.firstref[0] > ts) and not self.firstedit:
            self.firstref = self.previousref = (ts, userid, sha1)
            self.revid = revid
        self.firstedit = False

    def end(self):
        # Legacy closer for the revision_old() path; also no longer called.
        if not self.isopen:
            return
        print('firstref = %r, norefsince = %r, revid = %r' % (self.firstref, self.norefsince, self.revid))
        if not self.firstref[0]:
            self.isopen = False
            norefsince[self.norefsince[0:4]] += 1
            return
        year = self.firstref[0][0:4]
        users[year][self.firstref[1]] += 1
        l = revids.setdefault(year, {}).setdefault(self.firstref[1], [])
        if len(l) >= 5:
            l[self.revid % 5] = self.revid
        else:
            l.append(self.revid)
        self.isopen = False
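# Minimal sketch of the crediting flow, with hypothetical revision data
# escaped as it would appear in the dump text:
#   pg = Page('Exemplo')
#   pg.revision(10, '20200101120000', 7, 'sem referências', 'h1')  # no ref yet
#   pg.revision(11, '20200102120000', 42, '&lt;ref&gt;fonte&lt;/ref&gt;', 'h2')
#   # -> users['2020'][42] == 1, revids['2020'][42] == [11], pg.isopen is False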
def gen(dumps):
    """Stream decoded (tag, value) pairs from the bz2 history dump files."""
    buf = bytearray()
    for dump in dumps:
        print(dump)
        f = bz2.BZ2File(dump)
        buf.extend(f.read(20000000))
        while True:
            # Consume every complete tag in the buffer, then trim the consumed
            # prefix in one step; resizing the bytearray while finditer is
            # still iterating over it is unsafe.
            end = 0
            for match in reTag.finditer(buf):
                tag, value = match.group(1).decode(), match.group(2).decode()
                yield tag, value
                end = match.end()
            del buf[0:end]
            l = len(buf)
            buf.extend(f.read(20000000))
            if len(buf) == l:  # nothing more to read: end of this dump file
                break
        f.close()
        with open('refsgraf.log', 'a') as logfile:
            logfile.write('%s in %s\n' % (dump, runtime()))
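# gen() yields the tags of each page and revision in document order, e.g.
# ('title', 'Algum Artigo'), ('ns', '0'), ('id', '123'),
# ('timestamp', '2020-01-02T12:00:00Z'), ..., ('sha1', '...') per revision.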
def refsInHistory():
    # Articles and "Anexo" pages (namespaces 0 and 102), excluding redirects
    # and disambiguation pages.
    pages = {int(i[0]) for i in query("SELECT page_id FROM page WHERE page_namespace IN (0, 102) AND NOT page_is_redirect AND page_id NOT IN (SELECT cl_from FROM categorylinks WHERE cl_to = 'Desambiguação')")}
    c, p = 0, 0
    pagens = False
    page = None
    ts = None
    userid = 0
    hourlog = 1
    log10 = 1
    with open('refsgraf.log', 'a') as logfile:
        logfile.write(time.strftime('%d-%m-%Y %H:%M:%S ') + ddir + '\n')
    for tag, value in gen(dumps):
        if tag == 'ns':
            # ns marks the start of a new page: log progress and reset state
            if page:
                if p >= log10:
                    log10 = log10 * 10
                    print('%d pages in %s' % (p, runtime()))
                if hourlog < (int(time.time()) - start) / 3600:
                    hourlog += 1
                    print('%d pages in %s' % (p, runtime()))
            if value in namespaces:
                pagens = True
            page = None
        elif tag == 'title':
            title = value
        elif pagens and tag == 'id':  # page id
            pagens = False
            pageid = int(value)
            if pageid not in pages:
                continue
            page = Page(title)
            p += 1
        elif page:
            if tag == 'timestamp':
                ts = value[0:4] + value[5:7] + value[8:10] + value[11:13] + value[14:16] + value[17:19]
            elif not ts and tag == 'id':  # revision id (comes before the timestamp)
                revid = int(value)
            elif ts and tag == 'id':  # user id (comes after the timestamp)
                userid = int(value)
            elif tag == 'text':
                text = value
            elif tag == 'sha1':
                # sha1 is the last tag of a revision: process it, then reset
                c += 1
                page.revision(revid, ts, userid, text, value)
                ts = None
                userid = 0
    with open('refusers.pkl', 'wb') as f:
        pickle.dump((users, dict(revids)), f)
    return p
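# refsInHistory() also checkpoints users/revids to refusers.pkl, so mkrank()
# below can be re-run later in a fresh interpreter without re-reading the dumps.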
def mkrank():
    global users, revids
    if not users:
        # Re-running the ranking step alone: reload the saved checkpoint
        with open('refusers.pkl', 'rb') as f:
            users, revids = pickle.load(f)
    rank = {year: users[year].most_common(10) for year in users}
    del users
    ids = {user[0] for year in rank for user in rank[year]}
    connection = mariadb.connect(db='ptwiki_p', host='ptwiki.labsdb', default_file=os.path.expanduser('~/replica.my.cnf'))
    c = connection.cursor()
    sql = '''SELECT user_id, user_name FROM user WHERE user_id IN (%s)''' % ','.join('%d' % uid for uid in ids)
    c.execute(sql)
    ids = {int(row[0]): row[1].decode() for row in c}
    connection.close()
    # The wikitext output stays in Portuguese: it is posted to the Portuguese
    # Wikipedia. The header reads "Position||User||Number of references||Examples".
    table = ['{|class=wikitable\n|-\n!Posição||Usuário||Número de<br>referenciações||Exemplos']
    for year in sorted(rank.keys(), reverse=True):
        if year == time.strftime('%Y'):
            continue  # skip the current, still incomplete, year
        table.append('|-\n!colspan=4 style="font-size:x-large"|%s' % year)
        for r, user in enumerate(rank[year]):
            revs = ', '.join('{{dif|%d}}' % rev for rev in revids[year][user[0]]) if user[0] in revids[year] else 'none'
            table.append('|-\n|%dº||%s||%d||%s' % (r + 1, ids[user[0]], user[1], revs))
    table.append('|}')
    # Page intro (Portuguese): "Ranking of referencers. The search was done on
    # the history dump, which contains every edit of every article; an edit
    # counts as a referencing if it added references or a references section
    # and was not the article's first edit."
    text = '''Ranking de referenciadores.
A pesquisa foi feita no dump dos históricos, que contém todas as edições de todos os artigos. Foram consideradas referenciações as edições que adicionaram referências ou uma seção de referências e que não foram a primeira edição do artigo.'''
    with open('refrank.txt', 'w') as f:
        f.write(text + '\n\n' + '\n'.join(table))
    print('done')
if __name__ == "__main__":
    p = refsInHistory()
    print('%s pages in %s' % (p, runtime()))
    mkrank()
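# A full run leaves three artifacts: progress lines in refsgraf.log, the
# users/revids checkpoint in refusers.pkl, and the wikitable ranking in
# refrank.txt, ready to paste into a Portuguese Wikipedia page.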