wcd-prototype visualization by [[Usuário:Danilo.mac]]
#!/usr/bin/env python3
"""
@ Author: [[Usuário:Danilo.mac]]
@ License: GNU General Public License 3.0 (GPL V3) and Creative Commons Attribution/Share-Alike (CC-BY-SA)
Description: Searches for references in the history dump of the Portuguese
Wikipedia, generates a graph of the evolution of referencing and updates the
graph on Commons.
"""
import bz2, re, os, time, mariadb, json, pickle

print(time.strftime('%Y-%m-%d %H:%M:%S'))

namespaces = ('0', '102')

reTag = re.compile(br'(?s)<(ns|id|timestamp|text)[^<>]*>([^<]*)</\1>')
#reRef = re.compile(r'<[Rr][Ee][Ff]')
# Text inside the dump's <text> elements is XML entity-escaped, so footnote
# tags appear as '&lt;ref'; the patterns accept both the literal and the
# escaped form.
reRef = re.compile(r'(?:<|&lt;)[Rr][Ee][Ff][ &]|\{\{[Hh]arv[Rr]ef\||\{\{[Ss]fn\|')
reRefSec = re.compile(r'\{\{[Rr]eferências[\|\}]|(?:<|&lt;)references[ &]')
reHttp = re.compile(r'https?://')
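# Illustrative matches (not exhaustive): reRef matches '&lt;ref ' and '{{sfn|',
# reRefSec matches '{{Referências}}' and '&lt;references ', and reHttp matches
# the scheme of any external http(s) link.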
# Find the most recent dump directory whose full-history job has finished
for ddir in sorted([d for d in os.listdir('/public/dumps/public/ptwiki/') if d.isdigit()], reverse=True):
    ddir = '/public/dumps/public/ptwiki/' + ddir
    with open(ddir + '/dumpstatus.json') as f:
        dstatus = json.load(f)
    if dstatus['jobs']['metahistorybz2dump']['status'] != 'done':
        print('skipping %s, status = %s' % (ddir, dstatus['jobs']['metahistorybz2dump']['status']))
        continue
    dumps = [ddir + '/' + a for a in os.listdir(ddir) if re.match(r'ptwiki-.*-pages-meta-history.*\.bz2', a)]
    dumps.sort(key=lambda d: int(re.search(r'\.xml-p(\d+)p', d).group(1)))
    print(ddir)
    break
else:
    raise Exception('dumps not found in the directories')

start = int(time.time())

def runtime():
    # Elapsed time since start, formatted as 'Xh Ymin Zs'
    t = int(time.time()) - start
    return (t > 3600 and '{}h '.format(t // 3600) or '') + ('{}min '.format(t % 3600 // 60)) + ('{}s'.format(t % 60))

def query(sql):
    connection = mariadb.connect(db='ptwiki_p', host='ptwiki.labsdb', default_file=os.path.expanduser('~/replica.my.cnf'))
    c = connection.cursor()
    c.execute(sql)
    r = c.fetchall()
    connection.close()
    return r
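# Example of a hypothetical call: query("SELECT COUNT(*) FROM page") would
# return a list of tuples such as [(1234567,)].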
# month ('YYMM') -> (pages created, net change in pages with any reference,
# net change in pages with footnotes)
months = {}

class Page(object):
    def __init__(self):
        self.isopen = True
        self.months = {}

    def revision(self, month, text):
        ref = bool(reRef.search(text))  # bool(footnotes)
        http = bool(reHttp.search(text))  # bool(external links)
        section = bool(reRefSec.search(text))  # bool(references section)
        self.months[month] = (http or section or ref, ref)

    def end(self):
        # Fold this page's per-month state into the global months dict
        if not self.isopen:
            return
        last = [0, 0]
        created = 1
        for month in sorted(self.months):
            refs = [int(x) for x in self.months[month]]
            old = months.get(month, (0, 0, 0))
            months[month] = (old[0] + created, old[1] + refs[0] - last[0], old[2] + refs[1] - last[1])
            last = refs
            created = 0
        self.isopen = False
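# Worked example: a page created in 2005-01 without references that gains a
# footnote in 2007-03 adds (1, 0, 0) to months['0501'] and (0, 1, 1) to
# months['0703'].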
def gen(dumps):
    # Streams the .bz2 history dumps in 20 MB chunks, yielding (tag, value)
    # pairs for every <ns>, <id>, <timestamp> and <text> element.
    buf = bytearray()
    for dump in dumps:
        print(dump)
        f = bz2.BZ2File(dump)
        buf.extend(f.read(20000000))
        while True:
            pos = 0
            for match in reTag.finditer(buf):
                tag, value = match.group(1).decode(), match.group(2).decode()
                pos = match.end()
                yield tag, value
            # Drop the consumed prefix only after the scan: resizing the
            # bytearray while finditer is still iterating over it would
            # invalidate the match positions.
            del buf[:pos]
            prev_len = len(buf)
            buf.extend(f.read(20000000))
            if len(buf) == prev_len:  # end of this dump file
                break
        f.close()
        with open('refsgraf.log', 'a') as logfile:
            logfile.write('%s in %s\n' % (dump, runtime()))
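# refsInHistory walks the dump page by page: <ns> marks the start of a page,
# the first <id> after an accepted namespace identifies it, and every revision
# contributes its <timestamp> month and the reference state of its <text>.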
def refsInHistory():
    # Articles and ns-102 pages, excluding redirects and disambiguation pages
    pages = {int(i[0]) for i in query("SELECT page_id FROM page WHERE page_namespace IN (0, 102) AND NOT page_is_redirect AND page_id NOT IN (SELECT cl_from FROM categorylinks WHERE cl_to = 'Desambiguação')")}
    c, p = 0, 0
    pagens = False
    page = None
    month = None
    hourlog = 0
    with open('refsgraf.log', 'a') as logfile:
        logfile.write(time.strftime('%d-%m-%Y %H:%M:%S ') + ddir + '\n')
    for tag, value in gen(dumps):
        if tag == 'ns':
            # ns marks the start of a new page, time to save the previous page's data
            if page:
                page.end()
            if hourlog < (int(time.time()) - start) / 3600:
                hourlog += 1
                with open('refsgraf.log', 'a') as logfile:
                    logfile.write('%d pages in %s\n' % (p, runtime()))
            if value in namespaces:
                pagens = True
            page = None
        elif pagens and tag == 'id':
            pagens = False
            pageid = int(value)
            if pageid not in pages:
                continue
            page = Page()
            p += 1
        elif page:
            if tag == 'timestamp':
                month = value[2:4] + value[5:7]  # 'YYMM', e.g. '2023-05-22T20:56:00Z' -> '2305'
            elif tag == 'text':
                c += 1
                page.revision(month, value)
    if page:
        page.end()
    return p
def graf():
    import matplotlib as mpl
    mpl.use('Agg')
    import matplotlib.pyplot as plt
    from matplotlib.ticker import FuncFormatter
    import datetime
    data = (0, 0, 0)
    X, anyref, refs = [], [], []
    for month in sorted(months):
        data = (data[0] + months[month][0], data[1] + months[month][1], data[2] + months[month][2])
        # last day of the month: first day of the following month minus one day
        X.append(datetime.date.fromordinal(datetime.date(2000 + int(month[:2]) + int(month[2:] == '12'),
                 int(month[2:]) + (month[2:] == '12' and -11 or 1), 1).toordinal() - 1))
        anyref.append(float(data[1]) * 100 / data[0])
        refs.append(float(data[2]) * 100 / data[0])
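    # Hypothetical example: month '2305' yields date(2023, 6, 1) - 1 day =
    # 2023-05-31, and if 80 of the 100 pages existing by then have any
    # reference, anyref gets the value 80.0 for that month.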
    fig = plt.figure(figsize=(12, 4))
    plt.plot(X, anyref, 'b-', linewidth=2, label='Ligações externas, seção de referência ou notas')  # external links, references section or footnotes
    plt.plot(X, refs, 'g-', linewidth=2, label='Notas de rodapé')  # footnotes
    plt.ylim(top=100)
    plt.legend(loc='best')
    plt.gca().yaxis.set_major_formatter(FuncFormatter(lambda n, p: str(int(n)) + '%'))
    plt.gca().yaxis.grid(linestyle='-', color='#dddddd', zorder=-1)
    fig.savefig('refs.png', bbox_inches='tight')
    with open('refsgraf.log', 'a') as logfile:
        logfile.write('%.2f%%, %.2f%%\n' % (refs[-1], anyref[-1]))
def upload():
    import pywikibot
    site = pywikibot.Site('commons', 'commons')
    site.login()
    useddump = re.search(r'ptwiki-20\d+-pages-meta-history', dumps[0]).group(0)
    page = pywikibot.FilePage(site, 'File:Ptwiki references in articles.png')
    site.upload(page, source_filename='refs.png', comment='update to %s' % useddump, ignore_warnings=True, report_success=True)

if __name__ == "__main__":
    p = refsInHistory()
    with open('refsgraf.log', 'a') as logfile:
        logfile.write('TOTAL: %d in %s\n' % (p, runtime()))
    print('months = ', months)
    if months:
        with open('refs.pkl', 'wb') as f:
            pickle.dump(months, f)
        graf()
        upload()
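# Usage (assumed): run on Toolforge, where /public/dumps/public/ptwiki/ is
# mounted and ~/replica.my.cnf holds the replica database credentials:
#   python3 refsgraf.py
# The file name refsgraf.py is a guess, based on the refsgraf.log file the
# script writes to.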