wcd-prototype visualization by [[Usuário:Danilo.mac]]
#!/usr/bin/env python3
"""
@ Author: [[Usuário:Danilo.mac]]
@ License: GNU General Public License 3.0 (GPL V3) and Creative Commons Attribution/Share-Alike (CC-BY-SA)
Description: Searches for references in the history dump of the Portuguese
Wikipedia, generates a graph of the evolution of referencing and updates the
graph on Commons.
"""
import bz2, re, os, time, mariadb, json, pickle

print(time.strftime('%Y-%m-%d %H:%M:%S'))

namespaces = ('0', '102')

reTag = re.compile(br'(?s)<(ns|id|timestamp|text)[^<>]*>([^<]*)</\1>')
#reRef = re.compile(r'<[Rr][Ee][Ff]')
# Text inside the dump's <text> elements is XML entity-escaped, so footnote
# tags appear as '&lt;ref'; the patterns accept both the literal and the
# escaped form.
reRef = re.compile(r'(?:<|&lt;)[Rr][Ee][Ff][ &]|\{\{[Hh]arv[Rr]ef\||\{\{[Ss]fn\|')
reRefSec = re.compile(r'\{\{[Rr]eferências[\|\}]|(?:<|&lt;)references[ &]')
reHttp = re.compile(r'https?://')
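# Illustrative matches (not exhaustive): reRef matches '&lt;ref ' and '{{sfn|',
# reRefSec matches '{{Referências}}' and '&lt;references ', and reHttp matches
# the scheme of any external http(s) link.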
# Find the most recent dump directory whose full-history job has finished
for ddir in sorted([d for d in os.listdir('/public/dumps/public/ptwiki/') if d.isdigit()], reverse=True):
    ddir = '/public/dumps/public/ptwiki/' + ddir
    with open(ddir + '/dumpstatus.json') as f:
        dstatus = json.load(f)
    if dstatus['jobs']['metahistorybz2dump']['status'] != 'done':
        print('skipping %s, status = %s' % (ddir, dstatus['jobs']['metahistorybz2dump']['status']))
        continue
    dumps = [ddir + '/' + a for a in os.listdir(ddir) if re.match(r'ptwiki-.*-pages-meta-history.*\.bz2', a)]
    dumps.sort(key=lambda d: int(re.search(r'\.xml-p(\d+)p', d).group(1)))
    print(ddir)
    break
else:
    raise Exception('dumps not found in the directories')

start = int(time.time())

def runtime():
    # Elapsed time since start, formatted as 'Xh Ymin Zs'
    t = int(time.time()) - start
    return (t > 3600 and '{}h '.format(t // 3600) or '') + ('{}min '.format(t % 3600 // 60)) + ('{}s'.format(t % 60))

def query(sql):
    connection = mariadb.connect(db='ptwiki_p', host='ptwiki.labsdb', default_file=os.path.expanduser('~/replica.my.cnf'))
    c = connection.cursor()
    c.execute(sql)
    r = c.fetchall()
    connection.close()
    return r
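# Example of a hypothetical call: query("SELECT COUNT(*) FROM page") would
# return a list of tuples such as [(1234567,)].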
# month ('YYMM') -> (pages created, net change in pages with any reference,
# net change in pages with footnotes)
months = {}

class Page(object):
    def __init__(self):
        self.isopen = True
        self.months = {}

    def revision(self, month, text):
        ref = bool(reRef.search(text))  # bool(footnotes)
        http = bool(reHttp.search(text))  # bool(external links)
        section = bool(reRefSec.search(text))  # bool(references section)
        self.months[month] = (http or section or ref, ref)

    def end(self):
        # Fold this page's per-month state into the global months dict
        if not self.isopen:
            return
        last = [0, 0]
        created = 1
        for month in sorted(self.months):
            refs = [int(x) for x in self.months[month]]
            old = months.get(month, (0, 0, 0))
            months[month] = (old[0] + created, old[1] + refs[0] - last[0], old[2] + refs[1] - last[1])
            last = refs
            created = 0
        self.isopen = False
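# Worked example: a page created in 2005-01 without references that gains a
# footnote in 2007-03 adds (1, 0, 0) to months['0501'] and (0, 1, 1) to
# months['0703'].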
def gen(dumps):
    # Streams the .bz2 history dumps in 20 MB chunks, yielding (tag, value)
    # pairs for every <ns>, <id>, <timestamp> and <text> element.
    buf = bytearray()
    for dump in dumps:
        print(dump)
        f = bz2.BZ2File(dump)
        buf.extend(f.read(20000000))
        while True:
            pos = 0
            for match in reTag.finditer(buf):
                tag, value = match.group(1).decode(), match.group(2).decode()
                pos = match.end()
                yield tag, value
            # Drop the consumed prefix only after the scan: resizing the
            # bytearray while finditer is still iterating over it would
            # invalidate the match positions.
            del buf[:pos]
            prev_len = len(buf)
            buf.extend(f.read(20000000))
            if len(buf) == prev_len:  # end of this dump file
                break
        f.close()
        with open('refsgraf.log', 'a') as logfile:
            logfile.write('%s in %s\n' % (dump, runtime()))
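# refsInHistory walks the dump page by page: <ns> marks the start of a page,
# the first <id> after an accepted namespace identifies it, and every revision
# contributes its <timestamp> month and the reference state of its <text>.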
def refsInHistory():
    # Articles and ns-102 pages, excluding redirects and disambiguation pages
    pages = {int(i[0]) for i in query("SELECT page_id FROM page WHERE page_namespace IN (0, 102) AND NOT page_is_redirect AND page_id NOT IN (SELECT cl_from FROM categorylinks WHERE cl_to = 'Desambiguação')")}
    c, p = 0, 0
    pagens = False
    page = None
    month = None
    hourlog = 0
    with open('refsgraf.log', 'a') as logfile:
        logfile.write(time.strftime('%d-%m-%Y %H:%M:%S ') + ddir + '\n')
    for tag, value in gen(dumps):
        if tag == 'ns':
            # ns marks the start of a new page, time to save the previous page's data
            if page:
                page.end()
            if hourlog < (int(time.time()) - start) / 3600:
                hourlog += 1
                with open('refsgraf.log', 'a') as logfile:
                    logfile.write('%d pages in %s\n' % (p, runtime()))
            if value in namespaces:
                pagens = True
            page = None
        elif pagens and tag == 'id':
            pagens = False
            pageid = int(value)
            if pageid not in pages:
                continue
            page = Page()
            p += 1
        elif page:
            if tag == 'timestamp':
                month = value[2:4] + value[5:7]  # 'YYMM', e.g. '2023-05-22T20:56:00Z' -> '2305'
            elif tag == 'text':
                c += 1
                page.revision(month, value)
    if page:
        page.end()
    return p
def graf():
    import matplotlib as mpl
    mpl.use('Agg')
    import matplotlib.pyplot as plt
    from matplotlib.ticker import FuncFormatter
    import datetime
    data = (0, 0, 0)
    X, anyref, refs = [], [], []
    for month in sorted(months):
        data = (data[0] + months[month][0], data[1] + months[month][1], data[2] + months[month][2])
        # last day of the month: first day of the following month minus one day
        X.append(datetime.date.fromordinal(datetime.date(2000 + int(month[:2]) + int(month[2:] == '12'),
                 int(month[2:]) + (month[2:] == '12' and -11 or 1), 1).toordinal() - 1))
        anyref.append(float(data[1]) * 100 / data[0])
        refs.append(float(data[2]) * 100 / data[0])
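    # Hypothetical example: month '2305' yields date(2023, 6, 1) - 1 day =
    # 2023-05-31, and if 80 of the 100 pages existing by then have any
    # reference, anyref gets the value 80.0 for that month.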
    fig = plt.figure(figsize=(12, 4))
    plt.plot(X, anyref, 'b-', linewidth=2, label='Ligações externas, seção de referência ou notas')  # external links, references section or footnotes
    plt.plot(X, refs, 'g-', linewidth=2, label='Notas de rodapé')  # footnotes
    plt.ylim(top=100)
    plt.legend(loc='best')
    plt.gca().yaxis.set_major_formatter(FuncFormatter(lambda n, p: str(int(n)) + '%'))
    plt.gca().yaxis.grid(linestyle='-', color='#dddddd', zorder=-1)
    fig.savefig('refs.png', bbox_inches='tight')
    with open('refsgraf.log', 'a') as logfile:
        logfile.write('%.2f%%, %.2f%%\n' % (refs[-1], anyref[-1]))
def upload():
    import pywikibot
    site = pywikibot.Site('commons', 'commons')
    site.login()
    useddump = re.search(r'ptwiki-20\d+-pages-meta-history', dumps[0]).group(0)
    page = pywikibot.FilePage(site, 'File:Ptwiki references in articles.png')
    site.upload(page, source_filename='refs.png', comment='update to %s' % useddump, ignore_warnings=True, report_success=True)

if __name__ == "__main__":
    p = refsInHistory()
    with open('refsgraf.log', 'a') as logfile:
        logfile.write('TOTAL: %d in %s\n' % (p, runtime()))
    print('months = ', months)
    if months:
        with open('refs.pkl', 'wb') as f:
            pickle.dump(months, f)
        graf()
        upload()
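# Usage (assumed): run on Toolforge, where /public/dumps/public/ptwiki/ is
# mounted and ~/replica.my.cnf holds the replica database credentials:
#   python3 refsgraf.py
# The file name refsgraf.py is a guess, based on the refsgraf.log file the
# script writes to.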