wcd-prototype visualization by [[Usuário:Danilo.mac]]
#!/usr/bin/env python3
"""
@ Autor: [[Usuário:Danilo.mac]]
@ Licença: GNU General Public License 3.0 (GPL V3) e Creative Commons Attribution/Share-Alike (CC-BY-SA)
Descrição: Busca de referencias no dump dos históricos da Wikipédia lusófona, gera um gráfico da evolução
das referências e atualiza o gráfico no Commons.
"""
import bz2, re, os, time, mariadb, json, pickle
print(time.strftime('%Y-%m-%d %H:%M:%S'))
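# The dump XML is scanned with regular expressions rather than a full XML
# parser: reTag captures <ns>, <id>, <timestamp> and <text> elements, and the
# patterns below detect footnotes (<ref>, {{Harvref}}, {{sfn}}), a references
# section ({{Referências}}, <references>) and external links in the escaped
# wikitext of each revision.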
namespaces = ('0', '102')  # main (article) and Anexo namespaces
reTag = re.compile(br'(?s)<(ns|id|timestamp|text)[^<>]*>([^<]*)</\1>')
#reRef = re.compile(r'&lt;[Rr][Ee][Ff]')
reRef = re.compile(r'&lt;[Rr][Ee][Ff][ &]|\{\{[Hh]arv[Rr]ef\||\{\{[Ss]fn\|')
reRefSec = re.compile(r'\{\{[Rr]eferências[\|\}]|&lt;references[ &]')
reHttp = re.compile(r'https?://')
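# Walk the dated dump directories from newest to oldest and use the first one
# whose full-history bz2 job is reported as finished in dumpstatus.json.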
for ddir in sorted([d for d in os.listdir('/public/dumps/public/ptwiki/') if d.isdigit()], reverse=True):
    ddir = '/public/dumps/public/ptwiki/' + ddir
    with open(ddir + '/dumpstatus.json') as f:
        dstatus = json.load(f)
    if dstatus['jobs']['metahistorybz2dump']['status'] != 'done':
        print('skipping %s, status = %s' % (ddir, dstatus['jobs']['metahistorybz2dump']['status']))
        continue
    dumps = [ddir + '/' + a for a in os.listdir(ddir) if re.match(r'ptwiki-.*-pages-meta-history.*\.bz2', a)]
    dumps.sort(key=lambda d: int(re.search(r'\.xml-p(\d+)p', d).group(1)))
    print(ddir)
    break
else:
    raise Exception('no dumps found in the directories')
start = int(time.time())
def runtime():
    t = int(time.time()) - start
    return (t > 3600 and '{}h '.format(t // 3600) or '') + ('{}min '.format(t % 3600 // 60)) + ('{}s'.format(t % 60))

def query(sql):
    connection = mariadb.connect(db='ptwiki_p', host='ptwiki.labsdb', default_file=os.path.expanduser('~/replica.my.cnf'))
    c = connection.cursor()
    c.execute(sql)
    r = c.fetchall()
    connection.close()
    return r
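# Global accumulator keyed by month 'YYMM'; each value is a delta tuple:
# (articles created, net change in articles with any kind of reference,
# net change in articles with footnotes). graf() later sums these deltas
# into cumulative totals.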
months = {}
class Page(object):
    def __init__(self):
        self.isopen = True
        self.months = {}
    def revision(self, month, text):
        ref = bool(reRef.search(text))  # footnotes
        http = bool(reHttp.search(text))  # external links
        section = bool(reRefSec.search(text))  # references section
        self.months[month] = (http or section or ref, ref)
    def end(self):
        if not self.isopen:
            return
        last = [0, 0]
        created = 1
        for month in sorted(self.months):
            # record deltas relative to this page's state in the previous month
            refs = [int(x) for x in self.months[month]]
            old = months.get(month, (0, 0, 0))
            months[month] = (old[0] + created, old[1] + refs[0] - last[0], old[2] + refs[1] - last[1])
            last = refs
            created = 0
        self.isopen = False
def gen(dumps):
    # Stream the bz2 dumps in 20 MB chunks and yield (tag, value) pairs.
    buf = bytearray()
    for dump in dumps:
        print(dump)
        f = bz2.BZ2File(dump)
        buf.extend(f.read(20000000))
        while True:
            for match in reTag.finditer(buf):
                tag, value = match.group(1).decode(), match.group(2).decode()
                yield tag, value
            del buf[0:match.end()]  # drop the part of the buffer already consumed
            l = len(buf)
            buf.extend(f.read(20000000))
            if len(buf) == l:
                break
        f.close()
        with open('refsgraf.log', 'a') as logfile:
            logfile.write('%s in %s\n' % (dump, runtime()))
def refsInHistory():
    # Pages in the tracked namespaces that are neither redirects nor in the disambiguation category
    pages = {int(i[0]) for i in query("SELECT page_id FROM page WHERE page_namespace IN (0, 102) AND NOT page_is_redirect AND page_id NOT IN (SELECT cl_from FROM categorylinks WHERE cl_to = 'Desambiguação')")}
    c, p = 0, 0
    pagens = False
    page = None
    month = None
    hourlog = 0
    with open('refsgraf.log', 'a') as logfile:
        logfile.write(time.strftime('%d-%m-%Y %H:%M:%S ') + ddir + '\n')
    for tag, value in gen(dumps):
        if tag == 'ns':
            # ns marks the start of a new page, time to save the previous one's data
            if page:
                page.end()
            if hourlog < (int(time.time()) - start) / 3600:
                hourlog += 1
                with open('refsgraf.log', 'a') as logfile:
                    logfile.write('%d pages in %s\n' % (p, runtime()))
            if value in namespaces:
                pagens = True
            page = None
        elif pagens and tag == 'id':
            pagens = False
            pageid = int(value)
            if pageid not in pages:
                continue
            page = Page()
            p += 1
        elif page:
            if tag == 'timestamp':
                month = value[2:4] + value[5:7]
            elif tag == 'text':
                c += 1
                page.revision(month, value)
    if page:
        page.end()
    return p
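# graf() accumulates the monthly deltas into running totals and plots, per
# month, the percentage of existing articles with any kind of reference and
# the percentage with footnotes.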
def graf():
    import matplotlib as mpl
    mpl.use('Agg')
    import matplotlib.pyplot as plt
    from matplotlib.ticker import FuncFormatter
    import datetime
    data = (0, 0, 0)
    X, anyref, refs = [], [], []
    for month in sorted(months):
        data = (data[0] + months[month][0], data[1] + months[month][1], data[2] + months[month][2])
        X.append(datetime.date.fromordinal(datetime.date(2000 + int(month[:2]) + int(month[2:] == '12'),
                 int(month[2:]) + (month[2:] == '12' and -11 or 1), 1).toordinal() - 1))  # last day of the month
        anyref.append(float(data[1]) * 100 / data[0])
        refs.append(float(data[2]) * 100 / data[0])
    fig = plt.figure(figsize=(12, 4))
    plt.plot(X, anyref, 'b-', linewidth=2, label=u'Ligações externas, seção de referência ou notas')
    plt.plot(X, refs, 'g-', linewidth=2, label=u'Notas de rodapé')
    plt.ylim(top=100)
    plt.legend(loc='best')
    plt.gca().yaxis.set_major_formatter(FuncFormatter(lambda n, p: str(int(n)) + '%'))
    plt.gca().yaxis.grid(linestyle='-', color='#dddddd', zorder=-1)
    fig.savefig('refs.png', bbox_inches='tight')
    with open('refsgraf.log', 'a') as logfile:
        logfile.write('%.2f%%, %.2f%%' % (refs[-1], anyref[-1]))
def upload():
    import pywikibot
    site = pywikibot.Site('commons', 'commons')
    site.login()
    useddump = re.search(r'ptwiki-20\d+-pages-meta-history', dumps[0]).group(0)
    page = pywikibot.Page(site, 'File:Ptwiki references in articles.png')
    site.upload(page, source_filename='refs.png', comment='update to %s' % useddump, ignore_warnings=True, report_success=True)
if __name__ == "__main__":
    p = refsInHistory()
    with open('refsgraf.log', 'a') as logfile:
        logfile.write('TOTAL: %d in %s\n' % (p, runtime()))
    print('months = ', months)
    if months:
        with open('refs.pkl', 'wb') as f:
            pickle.dump(months, f)
        graf()
        upload()
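
# A minimal sketch for inspecting the saved data without rescanning the dumps
# (assumes refs.pkl was written by a previous run of this script):
#
#   import pickle
#   with open('refs.pkl', 'rb') as f:
#       months = pickle.load(f)  # {'YYMM': (created, any-reference delta, footnote delta)}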