Created
August 14, 2019 08:15
-
-
Save boamaod/2e2ad993059d10ee42b8ca43fc0c7f41 to your computer and use it in GitHub Desktop.
Dirty code to gather data for visualisation of Wikipedia contest results
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#
# Copyright 2018 Märt Põder <tramm@wikimedia.ee>
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
# MA 02110-1301, USA.
#
#
import json
import sys
from html import unescape
from urllib.parse import urlencode

import requests
import simplejson
# Shared requests session. It is rebound to the logged-in session returned by
# login() further down, before any of the API helper functions is called.
session = None

# Example API queries kept for reference:
# https://et.wikipedia.org/w/api.php?action=query&list=usercontribs&ucend=20180223140000&uclimit=5000&ucuser=M%C3%A4rt%20P%C3%B5der
# https://et.wikipedia.org/w/api.php?action=query&format=json&prop=revisions&titles=Vikipeedia:Sajast%20kasvab%20miljon&rvlimit=5000&rvend=20180223140000&rvprop=timestamp|user|comment
# https://et.wikipedia.org/w/api.php?action=query&format=json&ucnamespace=0&list=usercontribs&ucend=20180223140000&uclimit=5000&redirects&ucuser=Neptuunium&ucprop=title|sizediff|ids
# https://et.wikipedia.org/w/api.php?action=query&format=json&ucnamespace=0&list=usercontribs&ucend=20180223140000&uclimit=5000&ucuser=Neptuunium&ucprop=title|sizediff|ids

# https://et.wikipedia.org
# https://fiu-vro.wikipedia.org
# https://et.wikisource.org
# https://commons.wikimedia.org

# User names loaded from the participants file ("lihtsurelikud" is Estonian
# for "mere mortals"; presumably the self-registered participants).
lihtsurelikud = set()

# The file holds non-ASCII user names, so decode it explicitly as UTF-8
# instead of relying on the platform's default encoding.
with open('osalejad-red.json', 'r', encoding='utf-8') as f:
    inimesed = json.load(f)

for inimene in inimesed:
    # Keep only entries whose "seed" field is "0" and whose "avalik"
    # (Estonian: "public") field is "1".
    if inimene["seed"] == "0" and inimene["avalik"] == "1":
        # HTML entities are decoded, then "&" is pre-escaped as "%26" because
        # get_created_and_modded() pastes the name straight into a URL.
        lihtsurelikud.add(unescape(inimene["vikikasutaja"]).replace("&", "%26"))

# Timestamp (YYYYMMDDHHMMSS) passed to the API as rvend / rvstart / ucend.
campaign_start = "20180223140000"

# Cache filled by resolve_redirect(): requested title -> title the API returned.
redirects = {}

#users = {'Kamma', 'Kruusamägi', 'Pseudacorus'}

# Hand-maintained user names ("kutsutud" is Estonian for "invited"); extended
# later with the result of get_users_from_articles().
kutsutud = {
    'Tunguuz',
    "Mare Kõiva",
    "Puumarju",
    "Birgylorenz",
    "Sillu12",
    "Utvikipedist",
    "Ulvarkaart",
    "Nobenäpp",
}
#kutsutud = {'Tunguuz', 'Antimust', "Punnivinn", "Alevtiina", "Andi.hektor", "Pkarro", "Notaator", "Sirkin23", "Urvastemiis", "Hans Krämer", "Martk83", "MKunnus", "Mare Kõiva", "Puumarju", "Birgylorenz", "Sillu12", "Endla", "AK720", "Tambetm", "Maakaru", "Velmaja", "Ullike", "Utvikipedist", "Proosamanna", "Nadosdelatsebenik", "Puik", "Taivop", "Fideelia", "Hsoosalu", "Vingianodepina", "Berkvaher", "MV", "Päevakoer", "Morel", "Mat Petheny", "Els.heinsalu", "Dequodlibet", "AnniAet", "Katlakytja", "Trtrlp", "Andrus Kallastu", "Kristaaru", "Curious NW", "DoktorFaustus123", "Marie Krause", "Ulvarkaart", "Nobenäpp" }

# Article titles passed to get_users_from_articles() and later merged into
# the "invited" group's created-articles report.
articles_vikipeedia = {
    'Kultuuriajalugu',
    'VOSK',
    'Disputatsioon',
    "Pommtsüklon",
    "Pikaealisuse piirkond",
    "Andres Põldroo",
    "Teet Velling",
}

# Article titles merged by hand into the first group's created-articles report.
articles_lihtsurelik_add = {
    'Tuule kõrts',
    'Krüptopeo manifest',
    'Krüptoanarhismi manifest',
    'Küberruumi sõltumatuse deklaratsioon',
    'Avatud ligipääsu geriljamanifest',
    'Paar palwid Eesti ärksamaile poegadele ja tütardele',
    'Kirjutamismaraton',
    'Tõe ja õiguse pärast',
    'Vikiandmed',
}
def login(user, password):
    """Log in to the et.wikipedia.org API and return the session used.

    A login token is fetched first and then posted together with the
    credentials to ``action=login``.

    Args:
        user: Login name, sent as ``lgname``.
        password: Password, sent as ``lgpassword``.

    Returns:
        The ``requests`` session that performed the login.  The session is
        returned even when the API does not report success; in that case a
        warning is written to stderr so the failure is not silent.
    """
    api_url = 'https://et.wikipedia.org/w/api.php'
    s = requests.session()

    token_reply = s.get(
        api_url,
        params={"action": "query", "format": "json", "meta": "tokens", "type": "login"},
    ).json()
    login_token = token_reply['query']['tokens']['logintoken']

    # Credentials travel in the POST body so that spaces, "@", "&" and
    # non-ASCII characters in the user name are encoded correctly.
    login_reply = s.post(
        api_url,
        params={"action": "login", "format": "json"},
        data={"lgname": user, "lgpassword": password, "lgtoken": login_token},
    ).json()

    result = login_reply.get("login", {}).get("result")
    if result != "Success":
        print(
            "WARNING: login as %r did not succeed (result: %r)" % (user, result),
            file=sys.stderr,
        )
    return s
def is_article(article):
    """Collect size and revision statistics for one article title.

    Two API requests are made through the module-level ``session``: one for
    ``prop=info`` and one for the revisions bounded by ``rvend=campaign_start``
    (``rvdir=older``).

    Args:
        article: Page title; it is passed as a request parameter, so
            characters such as "&", "+" or "#" are escaped properly.

    Returns:
        ``[length, revision_count, user_count, revsize]`` where ``length`` is
        the page's reported length (0 when the page is missing),
        ``revision_count`` and ``user_count`` describe the fetched revisions,
        and ``revsize`` is the size-based figure computed below.
    """
    api_url = 'https://et.wikipedia.org/w/api.php'

    info = session.get(
        api_url,
        params={"action": "query", "format": "json", "prop": "info", "titles": article},
    ).json()
    length = 0  # also covers a response that lists no pages at all
    for page in info['query']['pages'].values():
        # Missing pages carry no "length" entry and count as 0.
        length = page.get("length", 0)

    history = session.get(
        api_url,
        params={
            "action": "query",
            "format": "json",
            "prop": "revisions",
            "titles": article,
            "rvend": campaign_start,
            "rvlimit": 5000,
            "rvprop": "timestamp|user|comment|ids|sha1|size",
            "rvdir": "older",
        },
    ).json()

    revisions = []
    revsize = 0
    prevsize = -1  # sentinel: no revision seen yet
    users = set()
    for page in history['query']['pages'].values():
        if "missing" in page or "revisions" not in page:
            continue
        revisions = page["revisions"]
        for revision in revisions:
            if prevsize == -1:
                prevsize = revision["size"]
                revsize = revision["size"]
            else:
                # NOTE(review): prevsize is never advanced, so every delta is
                # measured against the first revision returned; confirm
                # whether deltas between consecutive revisions were intended.
                revsize += abs(revision["size"] - prevsize)
            users.add(revision["user"])
    return [length, len(revisions), len(users), revsize]
def get_parent_categories(title):
    """Return the titles of the categories the page *title* belongs to.

    Args:
        title: Page title; passed as a request parameter so special
            characters are escaped properly.

    Returns:
        A list of category titles.  The list is empty when the page is
        missing or has no categories.
    """
    api_url = 'https://et.wikipedia.org/w/api.php'
    reply = session.get(
        api_url,
        params={
            "action": "query",
            "format": "json",
            "prop": "categories",
            "titles": title,
            # Without an explicit limit the API returns only its default
            # number of categories per request.
            "cllimit": "max",
        },
    ).json()

    cats = []
    for page in reply['query']['pages'].values():
        if "missing" in page:
            break
        # A page without categories has no "categories" key at all.
        cats.extend(cat["title"] for cat in page.get("categories", []))
    return cats
def is_creator(username, pageid):
    """Tell whether *username* created page *pageid* at or after campaign_start.

    Args:
        username: User name to compare (case-insensitively) with the author
            of the page's oldest revision since ``campaign_start``.
        pageid: Page id as an int or a string.

    Returns:
        ``False`` when the page already has a revision at or before
        ``campaign_start`` or when no revision can be found for it; otherwise
        whether the oldest revision since ``campaign_start`` is by *username*.
    """
    api_url = 'https://et.wikipedia.org/w/api.php'
    page_key = str(pageid)
    common = {
        "action": "query",
        "format": "json",
        "prop": "revisions",
        "pageids": pageid,
        "rvstart": campaign_start,
        "rvlimit": 1,  # a single revision answers both questions
        "rvprop": "timestamp|user|ids",
    }

    # Walking back in time from campaign_start: any hit means the page
    # existed before the campaign.
    earlier = session.get(api_url, params=dict(common, rvdir="older")).json()
    if "revisions" in earlier['query']['pages'][page_key]:
        return False

    # Walking forward from campaign_start: the first hit is the oldest
    # revision in the campaign window, regardless of how many follow.
    later = session.get(api_url, params=dict(common, rvdir="newer")).json()
    revisions = later['query']['pages'][page_key].get("revisions")
    if not revisions:
        # E.g. the page is missing; nothing to attribute.
        return False
    return username.lower() == revisions[0]["user"].lower()
def get_users_from_articles(articles):
    """Return the users who created any of *articles* since campaign_start.

    For every title, the oldest revision within the window bounded by
    ``rvend=campaign_start`` is compared with the oldest revision of the full
    history.  When both have the same author (case-insensitively) and the
    same sha1, the page was created inside the window and its author is
    collected.

    Args:
        articles: Iterable of page titles; each is passed as a request
            parameter so special characters are escaped properly.

    Returns:
        A set of user names.
    """
    api_url = 'https://et.wikipedia.org/w/api.php'
    users = set()
    for article in articles:
        base = {
            "action": "query",
            "format": "json",
            "prop": "revisions",
            "titles": article,
            "rvlimit": 5000,
            "rvprop": "timestamp|user|comment|ids|sha1",
        }
        windowed = session.get(api_url, params=dict(base, rvend=campaign_start)).json()
        for page_id, page in windowed['query']['pages'].items():
            if "missing" in page:
                continue
            if "revisions" not in page:
                print("JAMA!", article)
                continue
            oldest_in_window = page["revisions"][-1]

            complete = session.get(api_url, params=dict(base, rvdir="older")).json()
            all_revisions = complete['query']['pages'].get(page_id, {}).get("revisions")
            if not all_revisions:
                # No history came back for this page; nothing to compare.
                print("JAMA!", article)
                continue
            oldest_overall = all_revisions[-1]

            same_author = oldest_overall["user"].lower() == oldest_in_window["user"].lower()
            if same_author and oldest_overall["sha1"] == oldest_in_window["sha1"]:
                # Created an article from the campaign list during the
                # campaign, so presumably a participant in the campaign?
                users.add(oldest_in_window["user"])
            break  # a single title yields a single page
    return users
def resolve_redirect(title):
    """Return the title that *title* resolves to once redirects are followed.

    Results that differ from the requested title are remembered in the
    module-level ``redirects`` dict and served from there on later calls.

    Args:
        title: Page title; passed as a request parameter so characters such
            as "&", "+" or "#" are escaped properly.

    Returns:
        The title reported by the API when it differs from *title*,
        otherwise *title* itself (also when the page is missing).
    """
    if title in redirects:
        return redirects[title]

    api_url = 'https://et.wikipedia.org/w/api.php'
    reply = session.get(
        api_url,
        params={"action": "query", "format": "json", "titles": title, "redirects": 1},
    ).json()

    for page in reply['query']['pages'].values():
        if "missing" in page:
            continue
        page_title = page["title"]
        if page_title != title:
            redirects[title] = page_title
            return page_title
    return title
# get_created_and_modded(users): for every user name in `users`, fetch that
# user's contributions (list=usercontribs, ucnamespace=0, ucend=campaign_start)
# and sort the touched titles into articles the user created and other
# articles the user edited.
# Prints one ";"-separated line per user: the user name, the number of created
# titles, created_bytes, the number of other titles, modded_bytes.
# Returns [created_articles, modded_articles], the unions over all users.
# NOTE(review): indentation was lost in this copy of the file, so the exact
# nesting of the statements below cannot be read off the text. The comments
# describe the most plausible reading; verify against the original script.
def get_created_and_modded(users): | |
created_articles = set() | |
modded_articles = set() | |
#print(users) | |
for user in users: | |
# The user name is printed without a newline; the counts printed after the
# contributions loop complete the same output line.
print(user, end='') | |
created_bytes = 0 | |
modded_bytes = 0 | |
# NOTE(review): `user` is pasted into the URL unescaped; the participant
# loader near the top of the file pre-escapes "&" as "%26" for that reason,
# which also means such names reach is_creator() in escaped form.
r = session.get('https://et.wikipedia.org/w/api.php?action=query&format=json&ucnamespace=0&list=usercontribs&ucend=%s&uclimit=5000&ucuser=%s&ucprop=title|sizediff|ids' % (campaign_start, user)) | |
# Per-user title sets; merged into the overall sets after the user's
# contributions have been processed.
not_created = set() | |
created = set() | |
for item in r.json()['query']['usercontribs']: | |
#print(item) | |
# Attribute the contribution to the redirect target, if any.
title = resolve_redirect(item['title']) | |
pageid = item["pageid"] | |
# Per-edit size delta; overwritten further down by revsize when the
# revision history of `title` is available.
sizediff = item['sizediff'] | |
if item['title'] in redirects: | |
print("#REDIRECT", item['title'], "->", title) | |
# api.php?action=query&list=allrevisions&arvdir=newer&arvlimit=5000 | |
# api.php?action=query&format=json&ucnamespace=0&rvend=%s&prop=revisions&titles=%s&rvlimit=5000&rvprop=timestamp|user|comment|ids|sha1' % (campaign_start, article)) | |
# r2 = session.get('https://et.wikipedia.org/w/api.php?action=query&format=json&arvnamespace=0&list=allrevisions&arvend=%s&arvlimit=5000&arvuser=%s&arvprop=title|sizediff|ids' % (campaign_start, user)) | |
# https://et.wikipedia.org/w/api.php?action=query&format=json&rvend=20180223140000&prop=revisions&titles=Enoshima&rvlimit=5000&rvprop=timestamp|user|comment|ids|sha1|size&rvdir=older | |
# Revision history of the (resolved) title, bounded by rvend=campaign_start.
# NOTE(review): requested once per contribution item, so several edits to
# the same title repeat the same request; `title` is not URL-escaped.
r7 = session.get('https://et.wikipedia.org/w/api.php?action=query&format=json&rvend=%s&prop=revisions&titles=%s&rvlimit=5000&rvprop=timestamp|user|comment|ids|sha1|size&rvdir=older' % (campaign_start, title)) | |
revisions = [] | |
revsize = 0 | |
prevsize = -1 | |
json = r7.json() | |
# NOTE(review): rebinds the name of the parameter that the outer loop is
# iterating. The running loop keeps its original iterable, but the parameter
# is shadowed from here on; this set is filled below and not read afterwards.
users = set() | |
for page in json['query']['pages']: | |
if "missing" in json['query']['pages'][page]: | |
print("JAMA2"); | |
continue; | |
if "revisions" not in json['query']['pages'][page]: | |
print("JAMA3"); | |
continue; | |
else: | |
# From here on use the page key of the revisions response (a string)
# instead of the pageid reported by usercontribs.
pageid=page | |
revisions = json['query']['pages'][page]["revisions"] | |
for revision in revisions: | |
# NOTE(review): prevsize is set from the first revision and never updated,
# so every later delta is taken against that first revision; confirm intent.
if prevsize==-1: | |
prevsize=revision["size"] | |
revsize=revision["size"] | |
else: | |
revsize += abs(revision["size"] - prevsize) | |
#print(revision) | |
users.add(revision["user"]) | |
# The figure derived from the whole revision list replaces the per-edit delta.
sizediff = revsize | |
break # not needed (just one page), but just in case | |
# Classification of the title as created vs. merely edited by this user.
# NOTE(review): with indentation lost it is unclear whether the final
# `if title not in created:` sits inside or next to
# `if title not in created_articles:`; the two readings differ for titles
# already recorded as created while processing an earlier user. The `else`
# presumably pairs with the inner `if title not in created:`, in which case
# created_bytes grows by abs(sizediff) for every contribution item of a
# created title.
if title not in created_articles: | |
if title in created or is_creator(user, pageid): | |
if title not in created: | |
#print ("LÕI:", user, item['title'], item['sizediff'], item["pageid"], item["revid"]) | |
created.add(title) | |
created_bytes += abs(sizediff) | |
else: | |
created_bytes += abs(sizediff) | |
if title not in created: | |
#print("MODIS:", user, item['title'], item['sizediff'], item["pageid"], item["revid"]) | |
modded_bytes += abs(sizediff) | |
not_created.add(title) | |
# Completes the output line started by print(user, end='') above; the empty
# first field makes the line read "user;created;bytes;modified;bytes".
print("", len(created), created_bytes, len(not_created), modded_bytes, sep=";") | |
# Fold this user's titles into the overall result sets.
modded_articles.update(not_created) | |
created_articles.update(created) | |
return [created_articles, modded_articles] | |
def _dump(collection):
    """Print the size of *collection* followed by the collection itself."""
    print(len(collection), collection)


def _report_articles(heading, titles):
    """Print *heading*, the column header and one is_article() row per title."""
    print(heading)
    print("size", "revs", "users", "revsize", "art", sep=";")
    for name in titles:
        length, rev_count, user_count, churn = is_article(name)
        print(length, rev_count, user_count, churn, name, sep=";")


# Log in and show what the API reports about the current user.
session = login("Märt Põder@AndmeKaeveBot", "/censored/")
print(session.get('https://et.wikipedia.org/w/api.php?action=query&format=json&meta=userinfo').json())

# --- First group: user names loaded from the participants file -------------
print("=======================")
print("\n\nISELIITUNUD:\n")
print("user", "newart", "bytes", "modart", "bytes", sep=";")
created_ise, modded_ise = get_created_and_modded(lihtsurelikud)

for group in (lihtsurelikud, created_ise, modded_ise):
    _dump(group)

# The hand-picked titles are merged in only after the raw set was printed.
created_ise.update(articles_lihtsurelik_add)
_report_articles("\n\nUUED ISELIITUNUTE ARTIKLID:\n", created_ise)
_report_articles("\n\nMUUDETUD ISELIITUNUTE ARTIKLID:\n", modded_ise)

# --- Second group: the kutsutud set plus creators of the listed articles ---
print("=======================")
kutsutud.update(get_users_from_articles(articles_vikipeedia))
_dump(kutsutud)

print("\n\nKUTSUTUD:\n")
print("user", "newart", "bytes", "modart", "bytes", sep=";")
created_kutsutud, modded_kutsutud = get_created_and_modded(kutsutud)

print("=======================")
for group in (kutsutud, created_kutsutud, modded_kutsutud, articles_vikipeedia):
    _dump(group)

# As above, the listed titles join the created set after it was printed.
created_kutsutud.update(articles_vikipeedia)
_report_articles("\n\nUUED KUTSUTUTE ARTIKLID:\n", created_kutsutud)
_report_articles("\n\nMUUDETUD KUTSUTUTE ARTIKLID:\n", modded_kutsutud)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment