#!/usr/bin/env python
# -*- coding: utf-8 -*-
import urllib2
import json
import os
import csv

API_URL = "http://wiki.piratenbrandenburg.de/api.php"

# delete the cache files "application_list" and "content" manually to force a re-download
CATEGORIES = ["Kategorie:Landesparteitag_2013.2_Satzungsänderungsantrag",
              "Kategorie:Landesparteitag_2013.2_Grundsatzprogrammantrag",
              "Kategorie:Landesparteitag_2013.2_Wahlprogrammantrag",
              "Kategorie:Landesparteitag_2013.2_Positionspapier",
              "Kategorie:Landesparteitag_2013.2_Sonstiger_Antrag"]
# the MediaWiki API accepts at most 50 page IDs per query for regular users
MAX_PAGEIDS = 50

def get_json(endpoint):
    """Fetch the given API endpoint and return the raw JSON response body."""
    url = ''.join([
        API_URL,
        '?',
        endpoint,
        '&format=json',
    ])
    print url
    return urllib2.urlopen(url).read()
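
# For example, get_json("action=query&list=categorymembers&cmtitle=Kategorie:Foo")
# requests:
#   http://wiki.piratenbrandenburg.de/api.php?action=query&list=categorymembers&cmtitle=Kategorie:Foo&format=json
# ("Kategorie:Foo" is a placeholder; see CATEGORIES for the real names.)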

def get_category(category, query_continue=""):
    """Return all member pages of a category, following API continuation."""
    data = get_json("action=query&list=categorymembers&cmtitle=%s&cmcontinue=%s" % (category, query_continue))
    json_data = json.loads(data)
    if "query" not in json_data:
        print category
        print json_data
        raise KeyError("no 'query' element in the API response for %s" % category)
    pages = json_data["query"]["categorymembers"]
    if "query-continue" in json_data:
        pages += get_category(category, json_data["query-continue"]["categorymembers"]["cmcontinue"])
    return pages
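
# Note: "query-continue" is the classic MediaWiki continuation element; wikis
# running MediaWiki >= 1.21 with the newer "continue=" style return a top-level
# "continue" element instead, which this recursion would not follow.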

def list_applications(categories):
    """Return the application list from the local cache, downloading it if missing."""
    if os.path.isfile("application_list"):
        with open('application_list', 'r') as f:
            return json.load(f)
    return download_applications(categories)

def download_applications(categories):
    applications = _list_applications(categories)
    with open('application_list', 'w+') as f:
        json.dump(applications, f)
    return applications

def _list_applications(categories):
    applications = {}
    for category in categories:
        applications[category] = get_category(category)
    return applications

def get_raw_pageid(pageid):
    """Fetch the latest revision content for one page ID or a "|"-joined list of IDs."""
    data = get_json("action=query&prop=revisions&rvprop=content&pageids=%s" % pageid)
    json_data = json.loads(data)
    pages = json_data["query"]["pages"]
    content = []
    for page_id in pages:
        content += pages[page_id]["revisions"]
    return content

def chunks(l, n):
    """Yield successive n-sized chunks from list l."""
    for i in xrange(0, len(l), n):
        yield l[i:i+n]
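
# e.g. list(chunks([1, 2, 3, 4, 5], 2)) == [[1, 2], [3, 4], [5]]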

def get_pageid(pageids):
    # request the pages in batches of MAX_PAGEIDS, as the API caps pageids per call
    pages = []
    for chunk in chunks(pageids, MAX_PAGEIDS):
        pages += get_raw_pageid("|".join(str(i) for i in chunk))
    return pages
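
# With the limit of 50, a category holding 120 applications is fetched in
# three requests (50 + 50 + 20 page IDs).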

def _list_content(applications):
    content = {}
    for category, members in applications.iteritems():
        pageids = [application["pageid"] for application in members]
        content[category] = get_pageid(pageids)
    return content

def download_content(applications):
    content = _list_content(applications)
    with open('content', 'w+') as f:
        json.dump(content, f)
    return content

def list_content(applications):
    """Return the page content from the local cache, downloading it if missing."""
    if os.path.isfile("content"):
        with open('content', 'r') as f:
            return json.load(f)
    return download_content(applications)

def parse_content(content):
    applications = {}
    for category in content.iterkeys():
        applications_for_category = []
        for application_content in content[category]:
            application = mediawiki_template(application_content["*"])
            #if application["Eingereicht"] != "":
            # normalize the template fields: "Antragsteller" becomes "autor",
            # "Antragstitel" and "Titel" both become "titel"
            if "Antragsteller" in application:
                application["autor"] = application.get("Antragsteller")
            if "Antragstitel" in application:
                application["titel"] = application.get("Antragstitel")
            if "Titel" in application:
                application["titel"] = application.get("Titel")
            applications_for_category.append(application)
        applications_for_category.sort(key=lambda a: a.get("titel"))
        applications[category] = applications_for_category
    return applications

def mediawiki_template(mw_string):
    """Parse the first MediaWiki template in mw_string into a dict."""
    # take the part between "{{" and "}}" and split it into "|key=value" fields
    strings = mw_string.split("{{")[1].split("}}")[0].split("\n|")
    # drop the template name ("Antragsfabrikat")
    strings = strings[1:]
    mw_hash = {}
    for string in strings:
        keyval = string.split("=", 1)
        if len(keyval) != 2:
            raise SyntaxError("MediaWiki parsing error: %s" % keyval)
        key, val = [s.strip() for s in keyval]
        mw_hash[key] = val
    return mw_hash
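
# A minimal sketch of the expected wiki source (the field names come from
# parse_content; the exact template layout is an assumption):
#
#   {{Antragsfabrikat
#   |Antragstitel = Beispielantrag
#   |Antragsteller = Max Mustermann
#   |text = Es möge beschlossen werden ...
#   }}
#
# which mediawiki_template() turns into
#   {"Antragstitel": "Beispielantrag", "Antragsteller": "Max Mustermann",
#    "text": "Es möge beschlossen werden ..."}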

def filter_content(content):
    """Simple filter that converts a few HTML tags to plain text."""
    content = content.replace("<sup>1</sup>", "¹")
    content = content.replace("<sup>2</sup>", "²")
    content = content.replace("<sup>3</sup>", "³")
    # catch the well-formed <br> variants as well as the malformed ones
    # (<br\> and <br\n>) that presumably appear in the wiki source
    content = content.replace("<br>", "\n")
    content = content.replace("<br\\>", "\n")
    content = content.replace("<br\\n>", "\n")
    content = content.replace("<br />", "\n")
    content = content.replace("<br/>", "\n")
    return content
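
# e.g. filter_content("Zeile 1<br/>Zeile 2<sup>1</sup>") == "Zeile 1\nZeile 2¹"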

def write_content(applications, positions={}):
    open_position = list(positions.keys())
    for category in applications:
        f = open(category, 'w+')
        writer = csv.writer(f, delimiter=',', quotechar='"', quoting=csv.QUOTE_ALL)
        writer.writerow(("Number", "Title", "Text", "Reason",
                         "Submitter (First Name)", "Sachgebiet"))
        for a in applications[category]:
            try:
                #number = applications_position.index(a.get("titel")) + 1
                open_position.remove(a.get("titel"))
            except ValueError:
                if "titel" not in a:
                    a["titel"] = ""
                print '"' + a.get("titel") + '" not found in the Antragsbuch'
                continue
            if "autor" not in a:
                a["autor"] = ""
            if "text" not in a:
                a["text"] = ""
            if "begruendung" not in a:
                a["begruendung"] = ""
            writer.writerow((positions[a.get("titel")],
                             a.get("titel").encode('utf8'),
                             "<pre>" + filter_content(a.get("text").encode('utf8')) + "</pre>",
                             "<pre>" + filter_content(a.get("begruendung").encode('utf8')) + "</pre>",
                             a.get("autor").encode('utf8'),
                             ""))  # Sachgebiet stays empty
        f.flush()
        f.close()
    if open_position:
        print "\nApplications from the Antragsbuch that were not found:"
        for a in open_position:
            print '"' + a + '"'

#def write_participants(applications):
#    f = open("participants", 'w+')
#    authors = []
#    writer = csv.writer(f, delimiter=',', quotechar='"', quoting=csv.QUOTE_ALL)
#    writer.writerow((
#        "title", "first name", "last name", "gender", "email", "group id",
#        "structure level", "committee", "about me", "comment", "is active"))
#    for category in applications:
#        for a in applications[category]:
#            if not "autor" in a:
#                a["autor"] = ""
#            if a.get("autor") in authors:
#                continue
#            authors.append(a.get("autor"))
#            writer.writerow(("",
#                             a.get("autor").encode('utf8'),
#                             "", "", "", "", "", "", "", "", ""))
#    f.flush()
#    f.close()

def get_application_positions(filename):
    """Read the agenda order file and map each title to its position label."""
    f = open(filename, 'r')
    lines = {}
    for l in f.readlines():
        line = l.decode('utf8').strip().split(" - ")
        lines[line[1]] = line[0]
    f.close()
    # maps Titel => position label (e.g. "WPXX")
    return lines
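
# A minimal sketch of the expected file format, inferred from the " - " split
# (the real "reihenfolge-lpt2013.2" file is not part of this gist):
#
#   WP01 - Beispielantrag
#   WP02 - Noch ein Antrag
#
# yields {"Beispielantrag": "WP01", "Noch ein Antrag": "WP02"}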

if __name__ == '__main__':
    #download_applications(CATEGORIES)
    applications = list_applications(CATEGORIES)
    #download_content(applications)
    content = list_content(applications)
    applications = parse_content(content)
    # one title per line, in the order given by the agenda (Tagesordnung)
    positions = get_application_positions("reihenfolge-lpt2013.2")
    #write_participants(applications)
    write_content(applications, positions)
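
# Usage sketch (assuming the agenda file "reihenfolge-lpt2013.2" sits next to
# the script): a plain "python" run caches the raw wiki data in the files
# "application_list" and "content", then writes one CSV file per category,
# named after the category.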