Skip to content

Instantly share code, notes, and snippets.

@jonatasleon
Last active August 29, 2015 14:16
Show Gist options
  • Save jonatasleon/2b1007b1816c3a3c2353 to your computer and use it in GitHub Desktop.
Save jonatasleon/2b1007b1816c3a3c2353 to your computer and use it in GitHub Desktop.
Recolhe informações diretamente do site do IBGE. Para acessar alguma cidade, o arquivo cities.txt deve conter o estado e a cidade na seguinte forma: sao-paulo|guaratingueta ou acre|acrelandia, onde cada cidade deve estar em uma linha. Adaptado a partir de https://unknownsec.wordpress.com/2014/10/09/coleta-de-dados-do-ibge-python-beatifulsoap-e-urll…
#!/usr/bin/env python
# -*- encoding:utf-8 -*-
import urllib2
from collections import Iterable
from bs4 import BeautifulSoup
URL_HOME = 'http://cidades.ibge.gov.br/xtras/home.php'  # IBGE landing page listing the state (UF) links
URL_UF = 'http://cidades.ibge.gov.br/xtras/'  # base joined with the relative state-page path scraped from home
URL_MUN = 'http://cidades.ibge.gov.br/xtras/'  # base joined with the relative municipality (perfil.php) path
FILE_CITIES = 'cities.txt'  # input file: one "estado|cidade" entry per line

def getCities():
    """Read the list of requested cities from FILE_CITIES.

    Returns a list with one entry per line of the file (trailing
    newline removed). The file is created empty if it does not exist,
    preserving the original 'a+' behavior.
    """
    # 'a+' keeps create-if-missing, but where the read cursor starts is
    # implementation-dependent (Python 3's io layer starts at EOF, which
    # would make the loop below read nothing) — seek(0) guarantees we
    # read from the beginning. 'with' closes the file even on error.
    with open(FILE_CITIES, 'a+') as f:
        f.seek(0)
        return [line.splitlines()[0] for line in f]
def searchData(cities):
    """Scrape cidades.ibge.gov.br for basic data on the given cities.

    cities: list of "estado|cidade" strings (as read from FILE_CITIES);
        each entry is matched as a substring of the municipality page URL.

    Returns a list of tuples
        (codigo, pop2014, pop2010, prefeito, municipio, area, densidade)
    with each field as the page's display text, or the single-element
    list ['Nenhuma cidade buscada'] when no city was requested.
    """
    if not cities:
        return ['Nenhuma cidade buscada']
    html = urllib2.urlopen(URL_HOME).read()
    soup = BeautifulSoup(html)
    result = []
    for link in soup.find_all('a'):
        pagina = link.get('href')
        # href may be None; the Iterable check also guards against that.
        if not (isinstance(pagina, Iterable) and "uf.php" in pagina):
            continue
        paginaUf = pagina.split('xtras/')[1]
        htmlUf = urllib2.urlopen(URL_UF + paginaUf).read()
        # Separate name: the original clobbered the outer `soup` here.
        soupUf = BeautifulSoup(htmlUf)
        for linkMun in soupUf.find_all('a'):
            paginaMun = linkMun.get('href')
            # href can be None here too; the original relied on the
            # resulting TypeError being swallowed by a bare except.
            if paginaMun is None:
                continue
            if "perfil.php" not in paginaMun or "/estadosat/" in paginaMun:
                continue
            for city in cities:
                if city == "" or city not in paginaMun:
                    continue
                try:
                    htmlMun = urllib2.urlopen(URL_MUN + paginaMun).read()
                    soupMun = BeautifulSoup(htmlMun)
                    # Query the 'valor' cells once instead of six times.
                    valores = soupMun.find_all('td', {'class': 'valor'})
                    pop2014 = valores[0].get_text()
                    pop2010 = valores[1].get_text()
                    area = valores[2].get_text()
                    dens = valores[3].get_text()
                    cod = valores[4].get_text()
                    prefeito = valores[6].get_text()
                    mun = soupMun.find('title').get_text().split("|")[3].strip()
                    result.append((cod, pop2014, pop2010, prefeito, mun, area, dens))
                except (IndexError, AttributeError, urllib2.URLError):
                    # Page layout changed or the fetch failed: skip this
                    # city (best-effort, as in the original bare except).
                    pass
    return result
# Entry point (Python 2): read the requested cities and print one
# report per municipality found.
cities = getCities()
# NOTE(review): each `city` is the tuple built by searchData:
# (cod, pop2014, pop2010, prefeito, mun, area, dens). When cities.txt is
# empty, searchData returns ['Nenhuma cidade buscada'] (a string), so the
# index accesses below would print single characters — confirm intended.
for city in searchData(cities):
print "Nome do município: ", city[4]
print "Código: ", city[0]
print "População 2014: ", city[1]
print "População 2010: ", city[2]
print "Área da unidade territorial(km²): ", city[5]
print "Densidade demográfica(hab/km²): ", city[6]
print "Prefeito : ", city[3]
print
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment