Skip to content

Instantly share code, notes, and snippets.

Last active October 9, 2015 10:27
Show Gist options
  • Save hubx/3488254 to your computer and use it in GitHub Desktop.
Save hubx/3488254 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import csv
import io
import json
import os
import urllib2
# Base URL of the wiki's api.php endpoint; get_json() prepends it to every
# query. NOTE(review): it is empty here and must be filled in before running.
API_URL = ""
# Upper bound on the number of page ids sent in one request by get_pageid().
# NOTE(review): this constant is used but was never defined in the file; 50 is
# the MediaWiki limit for multi-value parameters of ordinary (non-bot)
# clients -- confirm against the target wiki.
MAX_PAGEIDS = 50
# Downloads are cached in the files "content" and "application_list";
# delete both manually to force a fresh download.
# NOTE(review): the list literal was left unclosed in the source, so any
# further category names that belonged here have to be re-added.
CATEGORIES = ["Kategorie:Landesparteitag_2013.2_Satzungsänderungsantrag",
              ]
def get_json(endpoint):
    """ Fetch one API query and return the raw response body.

    endpoint is the query string without the leading "?", e.g.
    "action=query&list=categorymembers&...". The returned text is passed to
    json.loads() by the callers.
    """
    # NOTE(review): the list passed to join() was truncated in the source.
    # It is reconstructed as base URL + "?" + query + "&format=json"; the
    # format parameter is needed because every caller parses the reply as
    # JSON -- confirm against the original script.
    url = ''.join([
        API_URL,
        '?',
        endpoint,
        '&format=json',
    ])
    # Echo the request URL so the progress of a long download is visible.
    print(url)
    response = urllib2.urlopen(url)
    try:
        return response.read()
    finally:
        # Release the connection even if reading fails.
        response.close()
def get_category(category, query_continue=""):
    """ Return the list of member entries of a wiki category.

    Requests list=categorymembers for the category and keeps requesting
    further batches as long as the reply carries a "query-continue" token.
    query_continue is the cmcontinue token to start from ("" for the first
    batch).
    """
    members = []
    token = query_continue
    while True:
        raw = get_json(
            "action=query&list=categorymembers&cmtitle=%s&cmcontinue=%s"
            % (category, token))
        reply = json.loads(raw)
        if "query" not in reply:
            # Dump the unexpected reply for debugging; the lookup below
            # then fails with KeyError.
            print(category)
            print(reply)
        members += reply["query"]["categorymembers"]
        if "query-continue" not in reply:
            return members
        token = reply["query-continue"]["categorymembers"]["cmcontinue"]
def list_applications(categories):
    """ Return the category -> member list mapping, using the cache if present.

    If the file "application_list" exists in the working directory, its JSON
    content is returned and nothing is downloaded; otherwise the lists are
    downloaded (and cached) via download_applications().
    """
    if os.path.isfile("application_list"):
        # The with-block closes the cache file as soon as it has been parsed.
        with open('application_list', 'r') as f:
            return json.load(f)
    return download_applications(categories)
def download_applications(categories):
    """ Download the member lists of all categories and cache them.

    The result of _list_applications() is written as JSON to the file
    "application_list" in the working directory and also returned.
    """
    applications = _list_applications(categories)
    # The with-block guarantees the cache is flushed and closed, so a later
    # run never reads a partially written file.
    with open('application_list', 'w+') as f:
        json.dump(applications, f)
    return applications
def _list_applications(categories):
applications = {}
for category in categories:
pages = get_category(category)
applications[category] = pages
return applications
def get_raw_pageid(pageid):
    """ Return the revision entries of the requested page(s).

    pageid is inserted verbatim into the pageids parameter; get_pageid()
    passes several ids joined with "|". The "revisions" lists of all pages
    in the reply are concatenated into one flat list.
    """
    query = "action=query&prop=revisions&rvprop=content&pageids=%s" % pageid
    reply = json.loads(get_json(query))
    revisions = []
    for page in reply["query"]["pages"].values():
        revisions += page["revisions"]
    return revisions
def chunks(l, n):
    """ Yield successive slices of l, each holding at most n items.

    The last slice is shorter when len(l) is not a multiple of n; an empty
    l yields nothing.
    """
    # range() instead of xrange() keeps this helper usable on Python 3 too.
    for i in range(0, len(l), n):
        yield l[i:i+n]
def get_pageid(pageids):
    """ Return the revision entries for all given page ids.

    The ids are requested in batches of at most MAX_PAGEIDS, each batch
    joined with "|" into a single get_raw_pageid() call.
    """
    revisions = []
    for batch in chunks(pageids, MAX_PAGEIDS):
        id_list = "|".join(map(str, batch))
        revisions.extend(get_raw_pageid(id_list))
    return revisions
def _list_content(applications):
pageids = {}
content = {}
for category in applications.iterkeys():
for application in applications[category]:
if category in pageids:
pageids[category] += [application["pageid"]]
pageids[category] = [application["pageid"]]
content[category] = get_pageid(pageids[category])
return content
def download_content(applications):
    """ Download the content of all applications and cache it.

    The result of _list_content() is written as JSON to the file "content"
    in the working directory and also returned.
    """
    content = _list_content(applications)
    # Actually write the cache: the source only created an empty file, which
    # list_content() would then fail to parse on the next run.
    with open('content', 'w+') as f:
        json.dump(content, f)
    return content
def list_content(applications):
    """ Return the category -> page content mapping, using the cache if present.

    If the file "content" exists in the working directory, its JSON content
    is returned and nothing is downloaded; otherwise the content is
    downloaded (and cached) via download_content().
    """
    if os.path.isfile("content"):
        # The with-block closes the cache file as soon as it has been parsed.
        with open('content', 'r') as f:
            return json.load(f)
    return download_content(applications)
def parse_content(content):
    """ Turn downloaded page revisions into per-category application dicts.

    content maps a category name to a list of revision entries whose "*"
    key holds the page's wiki text. Each text is parsed with
    mediawiki_template(); the template fields "Antragsteller" and
    "Antragstitel"/"Titel" are additionally exposed under the normalized
    keys "autor" and "titel". Returns a dict mapping each category to its
    applications, sorted by "titel".
    """
    applications = {}
    for category in content:
        applications_for_category = []
        for application_content in content[category]:
            application = mediawiki_template(application_content["*"])
            # NOTE: a filter on the template's "Eingereicht" field used to
            # sit here and is disabled.
            if "Antragsteller" in application:
                application["autor"] = application["Antragsteller"]
            if "Antragstitel" in application:
                application["titel"] = application["Antragstitel"]
            # "Titel" is checked last, so it wins when both title fields exist.
            if "Titel" in application:
                application["titel"] = application["Titel"]
            # Keep the parsed application; without this every category
            # ended up as an empty list.
            applications_for_category.append(application)
        # Applications without a title sort first instead of comparing
        # None against strings.
        applications_for_category.sort(key=lambda a: a.get("titel") or "")
        applications[category] = applications_for_category
    return applications
def mediawiki_template(mw_string):
    """ Parse the first template of a wiki text into a dict.

    Only the text between the first "{{" and the following "}}" is used.
    It is split at newline + "|"; the first piece (the template name) is
    dropped and every remaining piece must look like "key=value". Keys and
    values are stripped of surrounding whitespace. Raises SyntaxError for a
    piece without "=".
    """
    template_body = mw_string.split("{{")[1].split("}}")[0]
    # Element 0 is the template name, not a parameter.
    fields = template_body.split("\n|")[1:]
    parsed = {}
    for field in fields:
        name, separator, value = field.partition("=")
        if not separator:
            raise SyntaxError("Mediawiki parsing Error %s" % [field])
        parsed[name.strip()] = value.strip()
    return parsed
def filter_content(content):
    """ Replace a fixed set of HTML snippets with plain-text equivalents.

    <sup>1</sup>..<sup>3</sup> become superscript digits and the listed
    spellings of the <br> tag become newlines. Everything else is returned
    unchanged.
    """
    replacements = (
        ("<sup>1</sup>", "¹"),
        ("<sup>2</sup>", "²"),
        ("<sup>3</sup>", "³"),
        ("<br>", "\n"),
        ("<br\\>", "\n"),
        ("<br\\n>", "\n"),
        ("<br />", "\n"),
        ("<br/>", "\n"),
    )
    for markup, plain in replacements:
        content = content.replace(markup, plain)
    return content
def write_content(applications, positions=[]):
open_position = []
for category in applications:
f = open(category,'w+')
writer = csv.writer(f,delimiter=',', quotechar='"', quoting=csv.QUOTE_ALL)
writer.writerow( ("Number","Title","Text","Reason",
"Submitter (First Name)","Sachgebiet"))
for a in applications[category]:
#number = applications_position.index(a.get("titel")) + 1
except ValueError:
if not "titel" in a:
a["titel"] = ""
print '"' + a.get("titel") + '" im Antragsbuch nicht gefunden'
if not "autor" in a:
a["autor"] = ""
if not "text" in a:
a["text"] = ""
if not "begruendung" in a:
a["begruendung"] = ""
writer.writerow( ( positions[a.get("titel")],
"") ) #sachgebiet
if open_position != []:
print "\nAnträge aus dem Antragsbuch, die nicht gefunden wurden: "
for a in open_position:
print '"' + a + '"'
#def write_participants(applications):
# f = open("participants",'w+')
# authors = []
# writer = csv.writer(f,delimiter=',', quotechar='"', quoting=csv.QUOTE_ALL)
# writer.writerow( (
# "title", "first name", "last name", "gender", "email", "group id",
# "structure level", "committee", "about me", "comment", "is active"))
# for category in applications:
# for a in applications[category]:
# if not "autor" in a:
# a["autor"] = ""
# if a.get("autor") in authors:
# continue
# authors.append(a.get("autor"))
# writer.writerow( ( "",
# a.get("autor").encode('utf8'),
# "",
# "",
# "",
# "",
# "",
# "",
# "",
# "",
# "") ) #sachgebiet
# f.flush()
# f.close()
def get_application_positions(filename):
    """ Read the ordering file and return a title -> number mapping.

    Every non-blank line of the UTF-8 file must have the form
    "<number> - <title>" (e.g. "WP01 - Some title"). The line is split at
    the first " - " only, so titles that contain " - " stay intact. Blank
    lines are skipped. Raises ValueError for a line without the separator.
    """
    lines = {}
    # io.open decodes to text on both Python 2 and 3 and the with-block
    # closes the file.
    with io.open(filename, 'r', encoding='utf8') as f:
        for lineno, raw_line in enumerate(f, 1):
            stripped = raw_line.strip()
            if not stripped:
                continue
            number, separator, title = stripped.partition(" - ")
            if not separator:
                raise ValueError("%s:%d: expected '<number> - <title>'"
                                 % (filename, lineno))
            # title => number (e.g. WPXX)
            lines[title.strip()] = number.strip()
    return lines
if __name__ == '__main__':
    # Fetch (or load from cache) the category members and their page text,
    # then parse the text into application dicts.
    category_members = list_applications(CATEGORIES)
    raw_content = list_content(category_members)
    parsed_applications = parse_content(raw_content)
    # Ordering file: one title per line, given in agenda order.
    agenda_positions = get_application_positions("reihenfolge-lpt2013.2")
    write_content(parsed_applications, agenda_positions)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment