wcd-prototype reference graph, by ptwiki user [[Usuário:Danilo.mac]]
#!/usr/bin/env python3
"""
@ Autor: [[Usuário:Danilo.mac]]
@ Licença: GNU General Public License 3.0 (GPL V3) e Creative Commons Attribution/Share-Alike (CC-BY-SA)
Descrição: Busca de referencias no dump dos históricos da Wikipédia lusófona e gera um ranking dos usuários
que mais referenciaram artigos.
"""
import bz2, re, mariadb, os, time, json, pickle
from collections import defaultdict, Counter
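# Global accumulators shared by the Page class and refsInHistory():
# - users maps year -> Counter of userid -> number of referencing edits
# - revids maps year -> {userid: up to five sample revision ids, used as examples in the ranking}
# - norefsince counts pages by the year since which they have had no reference (only used by the
#   older Page.end() path); maxyear, somerefpage and texttest appear unused in this version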
norefsince = Counter()
maxyear = str(int(time.ctime()[-4:]) - 8)
users = defaultdict(Counter)
revids = {}
somerefpage = False
texttest = True
print(time.strftime('%Y-%m-%d %H:%M:%S'))
namespaces = ('0', '102')
reTag = re.compile(br'(?s)<(ns|id|timestamp|text|sha1|title)[^<>]*>([^<]*)</\1>')
#reRef = re.compile(r'&lt;[Rr][Ee][Ff]')
reRef = re.compile(r'&lt;[Rr][Ee][Ff][ &]|\{\{[Hh]arv[Rr]ef\||\{\{[Ss]fn\|')
reRefSec = re.compile(r'\{\{[Rr]eferências[\|\}]|&lt;references[ &]|\n== ?Referências ?==\n')
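# In the dump XML the wikitext is escaped, so '<' appears as '&lt;': reRef matches inline reference
# markup such as '&lt;ref ', '&lt;ref&gt;', '{{Harvref|' and '{{sfn|', while reRefSec matches a
# references section: '{{Referências}}', '&lt;references /&gt;' or a '== Referências ==' heading.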
for ddir in sorted([d for d in os.listdir('/public/dumps/public/ptwiki/') if d.isdigit()], reverse=True):
    ddir = '/public/dumps/public/ptwiki/' + ddir
    with open(ddir + '/dumpstatus.json') as f:
        dstatus = json.load(f)
    if dstatus['jobs']['metahistorybz2dump']['status'] != 'done':
        print('skipping %s, status = %s' % (ddir, dstatus['jobs']['metahistorybz2dump']['status']))
        continue
    dumps = [ddir + '/' + a for a in os.listdir(ddir) if re.match(r'ptwiki-.*-pages-meta-history.*\.bz2', a)]
    dumps.sort(key=lambda d: int(re.search(r'\.xml-p(\d+)p', d).group(1)))
    print(ddir)
    break
else:
    raise Exception('no dumps found in the directories')
start = int(time.time())
def runtime():
    t = int(time.time()) - start
    return (t > 3600 and '{}h '.format(t // 3600) or '') + ('{}min '.format(t % 3600 // 60)) + ('{}s'.format(t % 60))
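# runtime() formats the elapsed wall-clock time since `start`, e.g. 75 s -> '1min 15s' and
# 8142 s -> '2h 15min 42s'.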
def query(sql):
    connection = mariadb.connect(db='ptwiki_p', host='ptwiki.labsdb', default_file=os.path.expanduser('~/replica.my.cnf'))
    c = connection.cursor()
    c.execute(sql)
    r = c.fetchall()
    connection.close()
    return r
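# Page tracks one article while its revisions stream past in chronological order. revision()
# credits the first registered user who adds a reference (or a references section) in an edit
# that is not the article's first edit; revision_old() and end() belong to an older variant of
# this logic and are not called from refsInHistory().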
class Page(object):
    def __init__(self, title):
        self.title = title
        self.isopen = True
        self.firstedit = True
        self.lastnoref = ''  # last edit without a reference
    def revision(self, revid, ts, userid, text, sha1):
        if not self.isopen:
            return
        ref = bool(reRef.search(text) or reRefSec.search(text))
        if not ref:
            self.firstedit = False
            return
        if self.firstedit or not userid:
            self.isopen = False
            return
        year = ts[0:4]
        users[year][userid] += 1
        l = revids.setdefault(year, {}).setdefault(userid, [])
        if len(l) >= 5:
            l[revid % 5] = revid
        else:
            l.append(revid)
        self.isopen = False
    def revision_old(self, revid, ts, userid, text, sha1):
        ref = bool(reRef.search(text) or reRefSec.search(text))
        if not ref:  # no reference
            self.lastnoref = max(ts, self.lastnoref)
            if self.firstref[0]:  # the previous edit had a reference
                self.firstref = ('', '', None)
                self.norefsince = self.lastnoref
            elif not self.norefsince:
                self.norefsince = ts
        # has a reference, the timestamp is earlier than the other reference, and it is not the first edit
        elif (not self.firstref[0] or self.firstref[0] > ts) and not self.firstedit:
            self.firstref = self.previeusref = (ts, userid, sha1)
            self.revid = revid
        self.firstedit = False
    def end(self):
        if not self.isopen:
            return
        print('firstref = %r, norefsince = %r, revid = %r' % (self.firstref, self.norefsince, self.revid))
        if not self.firstref[0]:
            self.isopen = False
            norefsince[self.norefsince[0:4]] += 1
            return
        year = self.firstref[0][0:4]
        users[year][self.firstref[1]] += 1
        if True or self.firstref[1] in getrevid:  # 'True or' short-circuits, so the undefined name getrevid is never evaluated
            l = revids[year][self.firstref[1]]
            if len(l) >= 5:
                l[self.revid % 5] = self.revid
            else:
                l.append(self.revid)
        self.isopen = False
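# gen() streams the bz2 full-history dump files in ~20 MB chunks and yields (tag, value) pairs
# for every XML element matched by reTag (ns, id, timestamp, text, sha1, title), so a dump never
# has to be held in memory at once.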
def gen(dumps):
    buf = bytearray()
    for dump in dumps:
        print(dump)
        f = bz2.BZ2File(dump)
        buf.extend(f.read(20000000))
        while True:
            for match in reTag.finditer(buf):
                tag, value = match.group(1).decode(), match.group(2).decode()
                yield tag, value
            del buf[0:match.end()]  # drop the part of the buffer that has already been parsed
            l = len(buf)
            buf.extend(f.read(20000000))
            if len(buf) == l:  # nothing left to read in this file
                break
        f.close()
        with open('refsgraf.log', 'a') as logfile:
            logfile.write('%s in %s\n' % (dump, runtime()))
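# refsInHistory() walks every revision of every non-redirect, non-disambiguation page in
# namespaces 0 and 102, reassembles (revid, timestamp, userid, text, sha1) for each revision from
# the tag stream, feeds it to a Page object and pickles the resulting counters to refusers.pkl.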
def refsInHistory():
    pages = {int(i[0]) for i in query("SELECT page_id FROM page WHERE page_namespace IN (0, 102) AND NOT page_is_redirect AND page_id NOT IN (SELECT cl_from FROM categorylinks WHERE cl_to = 'Desambiguação')")}
    c, p = 0, 0
    pagens = False
    page = None
    ts = None
    userid = 0
    hourlog = 1
    log10 = 1
    with open('refsgraf.log', 'a') as logfile:
        logfile.write(time.strftime('%d-%m-%Y %H:%M:%S ') + ddir + '\n')
    for tag, value in gen(dumps):
        if tag == 'ns':
            # ns marks the start of a new page, time to save the previous page's data
            if page:
                if p >= log10:
                    log10 = log10 * 10
                    print('%d pages in %s' % (p, runtime()))
                if hourlog < (int(time.time()) - start) / 3600:
                    hourlog += 1
                    print('%d pages in %s' % (p, runtime()))
            if value in namespaces:
                pagens = True
            page = None
        elif tag == 'title':
            title = value
        elif pagens and tag == 'id':  # pageid
            pagens = False
            pageid = int(value)
            if pageid not in pages:
                continue
            page = Page(title)
            p += 1
        elif page:
            if tag == 'timestamp':
                ts = value[0:4] + value[5:7] + value[8:10] + value[11:13] + value[14:16] + value[17:19]
            elif not ts and tag == 'id':
                revid = int(value)
            elif ts and tag == 'id':  # userid
                userid = int(value)
            elif tag == 'text':
                text = value
            elif tag == 'sha1':
                c += 1
                page.revision(revid, ts, userid, text, value)
                ts = None
                userid = 0
    with open('refusers.pkl', 'wb') as f:
        pickle.dump((users, dict(revids)), f)
    return p
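# mkrank() turns the pickled counters into a Portuguese wikitext table with the ten users who
# referenced the most articles in each complete year (the current year is skipped), each with up
# to five example diffs, and writes it to refrank.txt.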
def mkrank():
    global users, revids
    if not users:
        with open('refusers.pkl', 'rb') as f:
            users, revids = pickle.load(f)
    rank = {year: users[year].most_common(10) for year in users}
    del users
    ids = {user[0] for year in rank for user in rank[year]}
    connection = mariadb.connect(db='ptwiki_p', host='ptwiki.labsdb', default_file=os.path.expanduser('~/replica.my.cnf'))
    c = connection.cursor()
    sql = '''SELECT user_id, user_name FROM user WHERE user_id IN (%s)''' % ','.join('%d' % id for id in ids)
    c.execute(sql)
    ids = {int(l[0]): l[1].decode() for l in c}
    connection.close()
    # build the wikitext ranking table
    table = ['{|class=wikitable\n|-\n!Posição||Usuário||Número de<br>referenciações||Exemplos']
    for year in sorted(rank.keys(), reverse=True):
        if year == time.strftime('%Y'):
            continue
        table.append('|-\n!colspan=4 style="font-size:x-large"|%s' % year)
        for r, user in enumerate(rank[year]):
            revs = user[0] in revids[year] and ', '.join('{{dif|%d}}' % d for d in revids[year][user[0]]) or 'none'
            table.append('|-\n|%dº||%s||%d||%s' % (r + 1, ids[user[0]], user[1], revs))
    table.append('|}')
    text = '''Ranking de referenciadores.
A pesquisa foi feita no dump dos históricos, que contém todas edições de todos artigos, foram consideradas referenciações as edições que adicionaram referências ou seção de referência e que não foram a primeira edição do artigo.'''
    with open('refrank.txt', 'w') as f:
        f.write(text + '\n\n' + '\n'.join(table))
    print('done')
if __name__ == "__main__":
    p = refsInHistory()
    print('%s pages in %s' % (p, runtime()))
    mkrank()
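# Presumably meant to run on Wikimedia Toolforge, where /public/dumps/public/ptwiki/, the
# ptwiki.labsdb replica and ~/replica.my.cnf are available; run it with `python3 <this script>`.
# Progress goes to refsgraf.log, intermediate counters to refusers.pkl and the final wikitext
# ranking to refrank.txt.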