Skip to content

Instantly share code, notes, and snippets.

@boamaod
Created August 14, 2019 08:15
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save boamaod/2e2ad993059d10ee42b8ca43fc0c7f41 to your computer and use it in GitHub Desktop.
Dirty code to gather data for visualisation of Wikipedia contest results
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#
# Copyright 2018 Märt Põder <tramm@wikimedia.ee>
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
# MA 02110-1301, USA.
#
#
import requests
import simplejson
import json
from html import unescape
from urllib.parse import urlencode
# Authenticated requests session; created by login() further down the script.
session = None

# Example API calls kept for reference:
# https://et.wikipedia.org/w/api.php?action=query&list=usercontribs&ucend=20180223140000&uclimit=5000&ucuser=M%C3%A4rt%20P%C3%B5der
# https://et.wikipedia.org/w/api.php?action=query&format=json&prop=revisions&titles=Vikipeedia:Sajast%20kasvab%20miljon&rvlimit=5000&rvend=20180223140000&rvprop=timestamp|user|comment
# https://et.wikipedia.org/w/api.php?action=query&format=json&ucnamespace=0&list=usercontribs&ucend=20180223140000&uclimit=5000&ucuser=Neptuunium&ucprop=title|sizediff|ids
# Wikis involved: et.wikipedia.org, fiu-vro.wikipedia.org,
# et.wikisource.org, commons.wikimedia.org

# Self-joined participants ("lihtsurelikud" = "ordinary mortals"): wiki user
# names of public (avalik=1), non-seeded (seed=0) registrants taken from the
# local signup dump.
lihtsurelikud = set()
# encoding is pinned: the dump holds Estonian names and the platform default
# codec is not guaranteed to be UTF-8.
with open('osalejad-red.json', 'r', encoding='utf-8') as f:
    inimesed = json.load(f)
    for inimene in inimesed:
        if inimene["seed"] == "0" and inimene["avalik"] == "1":
            # Pre-escape "&" because these names are later interpolated into
            # raw query strings (see get_created_and_modded).
            lihtsurelikud.add(unescape(inimene["vikikasutaja"]).replace("&", "%26"))

# Campaign start in MediaWiki timestamp format
# (presumably 2018-02-23 14:00 UTC — confirm against the contest rules).
campaign_start = "20180223140000"
# Cache of resolved redirects: original title -> target title.
redirects = {}

# Invited participants (manually seeded; the commented-out line below is the
# full invitation roster, kept for reference).
kutsutud = {'Tunguuz', "Mare Kõiva", "Puumarju", "Birgylorenz", "Sillu12",
            "Utvikipedist", "Ulvarkaart", "Nobenäpp"}
#kutsutud = {'Tunguuz', 'Antimust', "Punnivinn", "Alevtiina", "Andi.hektor", "Pkarro", "Notaator", "Sirkin23", "Urvastemiis", "Hans Krämer", "Martk83", "MKunnus", "Mare Kõiva", "Puumarju", "Birgylorenz", "Sillu12", "Endla", "AK720", "Tambetm", "Maakaru", "Velmaja", "Ullike", "Utvikipedist", "Proosamanna", "Nadosdelatsebenik", "Puik", "Taivop", "Fideelia", "Hsoosalu", "Vingianodepina", "Berkvaher", "MV", "Päevakoer", "Morel", "Mat Petheny", "Els.heinsalu", "Dequodlibet", "AnniAet", "Katlakytja", "Trtrlp", "Andrus Kallastu", "Kristaaru", "Curious NW", "DoktorFaustus123", "Marie Krause", "Ulvarkaart", "Nobenäpp" }
# Seed articles of the campaign on et.wikipedia; used both to discover the
# invited participants and to be reported on their own.
articles_vikipeedia = {
    'Kultuuriajalugu',
    'VOSK',
    'Disputatsioon',
    "Pommtsüklon",
    "Pikaealisuse piirkond",
    "Andres Põldroo",
    "Teet Velling",
}
# Extra articles credited by hand to the self-joined participants.
articles_lihtsurelik_add = {
    'Tuule kõrts',
    'Krüptopeo manifest',
    'Krüptoanarhismi manifest',
    'Küberruumi sõltumatuse deklaratsioon',
    # An invisible trailing U+200E (left-to-right mark) pasted in from the
    # wiki was stripped from this title; it would have made the API title
    # lookup miss the real page.
    'Avatud ligipääsu geriljamanifest',
    'Paar palwid Eesti ärksamaile poegadele ja tütardele',
    'Kirjutamismaraton',
    'Tõe ja õiguse pärast',
    'Vikiandmed',
}
def login(user, password):
    """Log in to et.wikipedia as *user* and return the authenticated session.

    Uses the two-step MediaWiki bot login: fetch a login token, then POST the
    credentials together with that token.  The caller stores the returned
    requests session in the module-level `session`.
    """
    s = requests.session()
    # Step 1: obtain a login token bound to this session's cookies.
    r = s.get('https://et.wikipedia.org/w/api.php',
              params={'action': 'query', 'format': 'json',
                      'meta': 'tokens', 'type': 'login'})
    login_token = r.json()['query']['tokens']['logintoken']
    # Step 2: post the credentials.  lgname travels form-encoded in the body
    # (the original interpolated it raw into the URL, which breaks for user
    # names containing '&' or other reserved characters).
    s.post('https://et.wikipedia.org/w/api.php',
           params={'action': 'login', 'format': 'json'},
           data={'lgname': user,
                 'lgpassword': password,
                 'lgtoken': login_token})
    return s
def is_article(article):
    """Gather statistics for *article* on et.wikipedia.

    Returns [length, revs, users, revsize]:
      length  -- current page length in bytes (0 when the page is missing),
      revs    -- number of revisions made since `campaign_start`,
      users   -- number of distinct editors among those revisions,
      revsize -- accumulated absolute size change over those revisions.

    Uses the module-level `session` and `campaign_start` globals.
    """
    api = 'https://et.wikipedia.org/w/api.php'
    # Current page info; params= percent-encodes the title safely (spaces,
    # '&', non-ASCII), unlike the original raw interpolation.
    r = session.get(api, params={'action': 'query', 'format': 'json',
                                 'prop': 'info', 'titles': article})
    info_pages = r.json()['query']['pages']
    length = 0  # default when the page does not exist
    for page in info_pages:
        if "missing" not in info_pages[page]:
            length = info_pages[page]["length"]
    # Revision history from now back to the campaign start (newest first).
    r2 = session.get(api, params={
        'action': 'query', 'format': 'json', 'prop': 'revisions',
        'titles': article, 'rvend': campaign_start, 'rvlimit': 5000,
        'rvprop': 'timestamp|user|comment|ids|sha1|size', 'rvdir': 'older',
    })
    revisions = []
    revsize = 0
    prevsize = -1
    editors = set()
    rev_pages = r2.json()['query']['pages']
    for page in rev_pages:
        if "missing" in rev_pages[page] or "revisions" not in rev_pages[page]:
            continue
        revisions = rev_pages[page]["revisions"]
        for revision in revisions:
            if prevsize == -1:
                # Newest revision seeds both the baseline and the counter.
                prevsize = revision["size"]
                revsize = revision["size"]
            else:
                # NOTE(review): prevsize is deliberately left at the newest
                # size (matches the original), so every older revision adds
                # |size - newest size| rather than successive diffs — confirm
                # this is the intended metric.
                revsize += abs(revision["size"] - prevsize)
            editors.add(revision["user"])
    return [length, len(revisions), len(editors), revsize]
def get_parent_categories(title):
    """Return the list of category titles the page *title* belongs to.

    Returns an empty list when the page is missing.  Uses the module-level
    `session` global.
    """
    r = session.get('https://et.wikipedia.org/w/api.php',
                    params={'action': 'query', 'format': 'json',
                            'prop': 'categories', 'titles': title})
    pages = r.json()['query']['pages']
    cats = []
    for page in pages:
        if "missing" in pages[page]:
            break
        # .get with a default: uncategorised pages have no "categories" key,
        # which raised KeyError in the original.
        for cat in pages[page].get("categories", []):
            cats.append(cat["title"])
    return cats
def is_creator(username, pageid):
    """Return True if *username* created page *pageid* during the campaign.

    A page counts as created by the user when it has no revisions from before
    `campaign_start` and the oldest revision made during the campaign was
    authored by the user (case-insensitive comparison).

    Uses the module-level `session` and `campaign_start` globals.
    """
    api = 'https://et.wikipedia.org/w/api.php'
    # Any revision strictly older than campaign_start means the page already
    # existed before the campaign, so this user cannot be its creator.
    r = session.get(api, params={
        'action': 'query', 'format': 'json', 'prop': 'revisions',
        'pageids': pageid, 'rvstart': campaign_start, 'rvlimit': 5000,
        'rvprop': 'timestamp|user|comment|ids|sha1', 'rvdir': 'older',
    })
    if "revisions" in r.json()['query']['pages'][str(pageid)]:
        return False
    # Oldest campaign-era revision ([-1] with rvdir=older); its author is the
    # page creator.
    r2 = session.get(api, params={
        'action': 'query', 'format': 'json', 'prop': 'revisions',
        'pageids': pageid, 'rvend': campaign_start, 'rvlimit': 5000,
        'rvprop': 'timestamp|user|comment|ids|sha1', 'rvdir': 'older',
    })
    oldest = r2.json()['query']['pages'][str(pageid)]["revisions"][-1]
    return username.lower() == oldest["user"].lower()
def get_users_from_articles(articles):
    """Return the set of users who created one of *articles* during the campaign.

    A user is credited when the oldest revision made since `campaign_start`
    is also the very first revision of the page ever (same author and same
    SHA-1) — i.e. the page did not exist before the campaign.

    Uses the module-level `session` and `campaign_start` globals.
    """
    api = 'https://et.wikipedia.org/w/api.php'
    users = set()
    for article in articles:
        # Revisions from now back to the campaign start; params= handles the
        # percent-encoding of the title.
        r = session.get(api, params={
            'action': 'query', 'format': 'json', 'prop': 'revisions',
            'titles': article, 'rvend': campaign_start, 'rvlimit': 5000,
            'rvprop': 'timestamp|user|comment|ids|sha1',
        })
        pages = r.json()['query']['pages']
        for page in pages:
            if "missing" in pages[page]:
                continue
            if "revisions" not in pages[page]:
                # "JAMA" ≈ "trouble": no campaign-era revisions came back.
                print("JAMA!", article)
                continue
            oldest_campaign_rev = pages[page]["revisions"][-1]
            page_creator = oldest_campaign_rev["user"]
            page_sha1 = oldest_campaign_rev["sha1"]
            # Full history (no rvend): the last entry is the page's first
            # revision ever — assuming at most 5000 revisions total.
            r2 = session.get(api, params={
                'action': 'query', 'format': 'json', 'prop': 'revisions',
                'titles': article, 'rvlimit': 5000,
                'rvprop': 'timestamp|user|comment|ids|sha1', 'rvdir': 'older',
            })
            first_rev = r2.json()['query']['pages'][page]["revisions"][-1]
            if (first_rev["user"].lower() == page_creator.lower()
                    and first_rev["sha1"] == page_sha1):
                # The user created a listed article during the campaign, so
                # they count as a campaign participant.
                users.add(page_creator)
            break  # only one page is expected per title
    return users
def resolve_redirect(title):
    """Resolve *title* through Wikipedia redirects.

    Returns the redirect target when *title* is a redirect, otherwise the
    title itself.  Resolved targets are memoised in the module-level
    `redirects` cache (the identity case is deliberately not cached, matching
    the original, so `title in redirects` can be used as an "is a redirect"
    test by callers).
    """
    if title in redirects:
        return redirects[title]
    # '&redirects' flag parameter: an empty value is enough for MediaWiki to
    # treat it as present; params= also percent-encodes the title.
    r = session.get('https://et.wikipedia.org/w/api.php',
                    params={'action': 'query', 'format': 'json',
                            'titles': title, 'redirects': ''})
    pages = r.json()['query']['pages']
    for page in pages:
        if "missing" in pages[page]:
            continue
        target = pages[page]["title"]
        if target != title:
            redirects[title] = target
            return target
    return title
def get_created_and_modded(users):
    """Classify each user's mainspace contributions during the campaign.

    For every user name in *users*, fetch their contributions since
    `campaign_start` and split the touched article titles into articles the
    user created versus articles they merely modified, accumulating absolute
    byte deltas for each class.  Prints one semicolon-separated line per
    user: user;created;created_bytes;modded;modded_bytes.

    Returns [created_articles, modded_articles] — two sets of titles
    aggregated across all users.  Uses the module-level `session`,
    `campaign_start` and `redirects` globals; user names are expected to be
    pre-escaped ('&' -> '%26') because they are interpolated into a raw
    query string below.
    """
    created_articles = set()
    modded_articles = set()
    for user in users:
        print(user, end='')
        created_bytes = 0
        modded_bytes = 0
        # All mainspace (ucnamespace=0) contributions since campaign start.
        r = session.get('https://et.wikipedia.org/w/api.php?action=query&format=json&ucnamespace=0&list=usercontribs&ucend=%s&uclimit=5000&ucuser=%s&ucprop=title|sizediff|ids' % (campaign_start, user))
        not_created = set()
        created = set()
        for item in r.json()['query']['usercontribs']:
            title = resolve_redirect(item['title'])
            pageid = item["pageid"]
            sizediff = item['sizediff']
            if item['title'] in redirects:
                # Contribution hit a redirect: re-fetch the revision history
                # of the resolved target page and recompute the size delta
                # from there instead of trusting the redirect's own sizediff.
                print("#REDIRECT", item['title'], "->", title)
                # alternative bulk endpoint, kept for reference:
                # api.php?action=query&list=allrevisions&arvdir=newer&arvlimit=5000
                r7 = session.get('https://et.wikipedia.org/w/api.php?action=query&format=json&rvend=%s&prop=revisions&titles=%s&rvlimit=5000&rvprop=timestamp|user|comment|ids|sha1|size&rvdir=older' % (campaign_start, title))
                revisions = []
                revsize = 0
                prevsize = -1
                json = r7.json()  # NOTE(review): shadows the imported json module
                # NOTE(review): rebinds the *users* parameter; the outer for
                # loop already holds its iterator so iteration is unaffected,
                # but the original argument name is lost from here on.
                users = set()
                for page in json['query']['pages']:
                    if "missing" in json['query']['pages'][page]:
                        print("JAMA2");
                        continue;
                    if "revisions" not in json['query']['pages'][page]:
                        print("JAMA3");
                        continue;
                    else:
                        # NOTE(review): pageid is overwritten with the page-id
                        # key (a string); is_creator() below str()s it anyway.
                        pageid=page
                        revisions = json['query']['pages'][page]["revisions"]
                        for revision in revisions:
                            if prevsize==-1:
                                # Newest revision seeds the size baseline.
                                prevsize=revision["size"]
                                revsize=revision["size"]
                            else:
                                # NOTE(review): prevsize is never advanced, so
                                # this accumulates |size - newest size| per
                                # older revision, not successive diffs —
                                # confirm intent (same metric as is_article).
                                revsize += abs(revision["size"] - prevsize)
                            users.add(revision["user"])
                    # Replace the contribution's sizediff with the recomputed
                    # total for the target page.
                    sizediff = revsize
                    break # not needed (just one page), but just in case
            if title not in created_articles:
                if title in created or is_creator(user, pageid):
                    if title not in created:
                        # First time we see this creation: register the title.
                        created.add(title)
                        created_bytes += abs(sizediff)
                    else:
                        # Further edits by the creator also count as creation bytes.
                        created_bytes += abs(sizediff)
            if title not in created:
                # Not created by this user -> counts as a modification.
                modded_bytes += abs(sizediff)
                not_created.add(title)
        print("", len(created), created_bytes, len(not_created), modded_bytes, sep=";")
        modded_articles.update(not_created)
        created_articles.update(created)
    return [created_articles, modded_articles]
# --- Script body -----------------------------------------------------------
# Log in as the analysis bot (the password was redacted when this gist was
# published) and echo who the API thinks we are.
session = login("Märt Põder@AndmeKaeveBot", "/censored/")
r = session.get('https://et.wikipedia.org/w/api.php?action=query&format=json&meta=userinfo')
print(r.json())
print("=======================")
# Self-joined participants ("ISELIITUNUD" = self-joined).
print("\n\nISELIITUNUD:\n")
print("user", "newart", "bytes", "modart", "bytes", sep=";")
created_ise, modded_ise = get_created_and_modded(lihtsurelikud)
print(len(lihtsurelikud), lihtsurelikud)
print(len(created_ise), created_ise)
print(len(modded_ise), modded_ise)
# Add the manually curated creations of self-joined participants.
created_ise.update(articles_lihtsurelik_add)
# "UUED ISELIITUNUTE ARTIKLID" = new articles by self-joined participants.
print("\n\nUUED ISELIITUNUTE ARTIKLID:\n")
print("size", "revs", "users", "revsize", "art", sep=";")
for art in created_ise:
    size, revs, users, revsize = is_article(art)
    print(size, revs, users, revsize, art, sep=";")
# "MUUDETUD ... ARTIKLID" = modified articles by self-joined participants.
print("\n\nMUUDETUD ISELIITUNUTE ARTIKLID:\n")
print("size", "revs", "users", "revsize", "art", sep=";")
for art in modded_ise:
    size, revs, users, revsize = is_article(art)
    print(size, revs, users, revsize, art, sep=";")
print("=======================")
# Invited participants: the manual seed set plus everyone who created one of
# the campaign's seed articles.
more_kutsutud = get_users_from_articles(articles_vikipeedia)
kutsutud.update(more_kutsutud)
print(len(kutsutud), kutsutud)
# "KUTSUTUD" = invited participants.
print("\n\nKUTSUTUD:\n")
print("user", "newart", "bytes", "modart", "bytes", sep=";")
created_kutsutud, modded_kutsutud = get_created_and_modded(kutsutud)
print("=======================")
print(len(kutsutud), kutsutud)
print(len(created_kutsutud), created_kutsutud)
print(len(modded_kutsutud), modded_kutsutud)
print(len(articles_vikipeedia), articles_vikipeedia)
# Seed articles always count among the invited participants' creations.
created_kutsutud.update(articles_vikipeedia)
# "UUED KUTSUTUTE ARTIKLID" = new articles by invited participants.
print("\n\nUUED KUTSUTUTE ARTIKLID:\n")
print("size", "revs", "users", "revsize", "art", sep=";")
for art in created_kutsutud:
    size, revs, users, revsize = is_article(art)
    print(size, revs, users, revsize, art, sep=";")
# "MUUDETUD KUTSUTUTE ARTIKLID" = modified articles by invited participants.
print("\n\nMUUDETUD KUTSUTUTE ARTIKLID:\n")
print("size", "revs", "users", "revsize", "art", sep=";")
for art in modded_kutsutud:
    size, revs, users, revsize = is_article(art)
    print(size, revs, users, revsize, art, sep=";")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment