Last active
December 25, 2015 10:49
-
-
Save aniversarioperu/6964023 to your computer and use it in GitHub Desktop.
Extraer proyectos de ley y nombres de los autores, presentados durante el año 2013 (hasta octubre)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# -*- coding: utf-8 -*- | |
import socks | |
import cookielib | |
import socket | |
from bs4 import BeautifulSoup | |
import requests | |
import sys | |
import re | |
from os import listdir | |
import codecs | |
import urllib2 | |
# Route ALL socket traffic through a local Tor SOCKS5 proxy (Tor's default
# port 9050) by monkey-patching the global socket factory.  Every urllib2 /
# requests call made later in this script therefore goes through Tor.
socks.setdefaultproxy(socks.PROXY_TYPE_SOCKS5, "127.0.0.1", 9050)
socket.socket = socks.socksocket
def extract_doc_links(soup):
    """Collect absolute URLs of bill ("proyecto de ley") document pages.

    :param soup: parsed (BeautifulSoup) Congress search-results page.
    :return: list of absolute URL strings pointing to each bill's
             "document" view.
    """
    our_links = []
    for link in soup.find_all("a"):
        # Bill anchors end with a code like "01234/2013-CR"; skip the rest.
        if re.search(r"201[0-9]-CR$", link.get_text()):
            href = link.get("href")
            # Keep only links to the bill's "...document" view.
            if href.endswith("ocument"):
                our_link = "http://www2.congreso.gob.pe" + "/" + href
                # hrefs start with "/", so the join yields "//Sicr...".
                # Collapse the double slash.  NOTE(review): this also swaps
                # the path segment "Sicr" -> "Sirc"; presumably intentional
                # for this server -- confirm against a live URL.
                our_link = re.sub(r"//Sicr", "/Sirc", our_link)
                our_links.append(our_link)
    return our_links
def parse_names(string):
    """Parse a comma-separated string of congresspeople names.

    Each comma-separated item looks like "FAMILYNAME  Given Names", with the
    family name separated from the rest by a run of two or more whitespace
    characters.  Only the family-name part of each item is kept.

    :param string: comma-separated names, e.g. "GARCIA  Juan,PEREZ  Ana".
    :return: list of family names (one per comma-separated item).
    """
    # Raw string: "\s" in a plain literal is an invalid escape on Python 3.
    # Drop everything from the first 2-space run onward in each item.
    return [re.sub(r"\s{2}.+", "", item) for item in string.split(",")]
def get_authors_from_project(document_link):
    """Fetch a bill's page over Tor and return its authors' family names.

    input: link to the bill ("proyecto") document page
    output: list of author family names; empty list when the page has no
            "NomCongre" field (previously this fell off the end and
            returned None, crashing the caller's iteration)

    Requests are tunneled through the Tor SOCKS proxy configured at module
    level; approach from:
    http://stackoverflow.com/questions/10967631/how-to-make-http-request-through-a-tor-socks-proxy-using-python
    """
    cj = cookielib.CookieJar()
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
    opener.addheaders = [('Accept-Charset', 'utf-8')]
    request = urllib2.Request(document_link)
    # Ask intermediaries not to serve a cached copy.
    request.add_header('Cache-Control', 'max-age=0')
    response = opener.open(request)
    try:
        proyecto = BeautifulSoup(response.read().decode("utf-8"))
    finally:
        # Close the socket explicitly instead of relying on refcounting.
        response.close()
    # The author list is embedded in a hidden <input name="NomCongre"> field.
    for i in proyecto.find_all("input"):
        if i.get("name") == "NomCongre":
            return parse_names(i.get("value"))
    # BUG FIX: no NomCongre field -> return an empty list, not None.
    return []
## ------------------------------------------------ | |
def main(): | |
all_links = [] | |
for file in listdir("."): | |
if file.startswith("PA"): | |
print file | |
f = open(file, "r") | |
html_doc = f.read() | |
f.close() | |
soup = BeautifulSoup(html_doc) | |
all_links += extract_doc_links(soup) | |
print "Numero de proyectos de ley: %i " % len(all_links) | |
# Save names to file | |
f = codecs.open("all_authors.csv", "w", "utf-8") | |
f.write("Congresista\n") | |
f.close() | |
f = codecs.open("all_authors.csv", "a", "utf-8") | |
for link in all_links: | |
authors = get_authors_from_project(link) | |
for author in authors: | |
f.write(author + "\n") | |
f.close() | |
if __name__ == "__main__": | |
main() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment