@aniversarioperu
Last active December 25, 2015
Extract bills (proyectos de ley) and the names of their authors, presented during 2013 (through October)
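In outline: the script reads locally saved search-result pages from the Congress website (files whose names start with "PA"), extracts the links to the individual bill documents, fetches each document page through a local Tor SOCKS proxy, and writes the authors' family names to all_authors.csv.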
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import socks
import cookielib
import socket
from bs4 import BeautifulSoup
import re
from os import listdir
import codecs
import urllib2
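# Route all TCP connections through a local Tor SOCKS5 proxy (Tor's
# default port is 9050). Monkey-patching socket.socket makes urllib2
# pick up the proxy as well, since SocksiPy does not hook into
# urllib2 directly.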
socks.setdefaultproxy(socks.PROXY_TYPE_SOCKS5, "127.0.0.1", 9050)
socket.socket = socks.socksocket
def extract_doc_links(soup):
    """Collect links to bill documents from a saved search-result page."""
    our_links = []
    for link in soup.find_all("a"):
        # bill links end with a code like "2013-CR"
        if re.search("201[0-9]-CR$", link.get_text()):
            href = link.get("href")
            if href and href.endswith("ocument"):
                our_link = "http://www2.congreso.gob.pe" + "/" + href
                # the joined URL contains "//Sicr"; rewrite it to the
                # "/Sirc" path from which the documents are served
                our_link = re.sub("//Sicr", "/Sirc", our_link)
                our_links.append(our_link)
    return our_links
def parse_names(string):
    """
    Parse a string of comma-separated names. Return only the family
    names as a list; everything after the first double space in each
    item (the given names) is dropped.
    """
    names = []
    for i in string.split(","):
        i = re.sub(r"\s{2}.+", "", i)
        names.append(i)
    return names
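# A quick sketch of parse_names on a hypothetical "NomCongre" value (the
# double space separating family names from given names is an assumption
# based on the regex above):
#   parse_names(u"Abugattas Majluf  Daniel,Leon Rivera  Jose")
#   -> [u'Abugattas Majluf', u'Leon Rivera']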
def get_authors_from_project(document_link):
    """
    input: link to a bill's document page
    output: list of author family names
    Requests go through Tor; found help here:
    http://stackoverflow.com/questions/10967631/how-to-make-http-request-through-a-tor-socks-proxy-using-python
    """
    cj = cookielib.CookieJar()
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
    opener.addheaders = [('Accept-Charset', 'utf-8')]
    request = urllib2.Request(document_link)
    request.add_header('Cache-Control', 'max-age=0')
    response = opener.open(request)
    proyecto = BeautifulSoup(response.read().decode("utf-8"), "html.parser")
    del request
    del response
    # the authors are stored in a hidden <input name="NomCongre"> field
    for i in proyecto.find_all("input"):
        if i.get("name") == "NomCongre":
            return parse_names(i.get("value"))
    # no author field found on this page
    return []
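# A sketch of the expected call (the URL shape is an assumption based on
# what extract_doc_links produces above):
#   get_authors_from_project("http://www2.congreso.gob.pe/Sirc/.../Document")
#   -> list of family names from the page's NomCongre field, or [] if absent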
## ------------------------------------------------
def main():
    all_links = []
    # the saved search-result pages are expected in the current
    # directory as files whose names start with "PA"
    for fname in listdir("."):
        if fname.startswith("PA"):
            print fname
            f = open(fname, "r")
            html_doc = f.read()
            f.close()
            soup = BeautifulSoup(html_doc, "html.parser")
            all_links += extract_doc_links(soup)
    print "Numero de proyectos de ley: %i" % len(all_links)  # number of bills
    # save one author family name per line
    f = codecs.open("all_authors.csv", "w", "utf-8")
    f.write("Congresista\n")
    for link in all_links:
        authors = get_authors_from_project(link)
        for author in authors:
            f.write(author + "\n")
    f.close()

if __name__ == "__main__":
    main()
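To run the script, start a local Tor client listening on 127.0.0.1:9050 and place the saved Congress search-result pages (file names beginning with "PA") in the working directory; the authors' family names are written to all_authors.csv, one per line under the "Congresista" header. The code is Python 2 (cookielib, urllib2, print statements) and depends on the SocksiPy (socks) and BeautifulSoup 4 packages.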