Skip to content

Instantly share code, notes, and snippets.

@marcelcaraciolo
Created March 16, 2013 15:38
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save marcelcaraciolo/5176932 to your computer and use it in GitHub Desktop.
Save marcelcaraciolo/5176932 to your computer and use it in GitHub Desktop.
from urllib2 import urlopen
from BeautifulSoup import BeautifulSoup
from time import sleep
# Root URL of the scraped site; page paths are appended to it to build full URLs.
BASE_URL = "http://www.cidades.com.br/"
def make_soup(url):
    """Download *url* and parse the response body with BeautifulSoup.

    Args:
        url: Address passed straight to ``urlopen``.

    Returns:
        A ``BeautifulSoup`` object built from the downloaded markup.
    """
    response = urlopen(url)
    try:
        html = response.read()
    finally:
        # Python 2 urllib2 responses are not context managers, so close the
        # connection explicitly, even when read() raises.
        response.close()
    return BeautifulSoup(html)
def captura_cidades(link):
    """Return the city names listed for the state page at *link*.

    The state page is fetched first; the ``href`` of the first anchor inside
    its first right-aligned paragraph is appended to ``BASE_URL`` to reach
    the city listing, which is then scraped.

    Args:
        link: URL of a state page, as accepted by ``make_soup``.

    Returns:
        A list with the text of every second anchor (counting only anchors
        that have text) found in the listing's ``width="432"`` table cell.
    """
    state_page = make_soup(link)
    link_cidades = state_page.find('p', {'align': 'right'}).find('a')['href']

    listing_page = make_soup(BASE_URL + link_cidades)
    anchors = listing_page.find('td', {'width': '432'}).findAll('a')
    names = [anchor.string for anchor in anchors if anchor.string is not None]
    # Only every other textual anchor is kept; presumably the links alternate
    # between a city and some other entry -- TODO confirm against the page.
    return names[::2]
def get_state_links(url):
    """Collect ``(state name, href)`` pairs from the states index at *url*.

    Args:
        url: Address of the page holding the states table.

    Returns:
        A list of tuples, one per anchor inside the ``width="387"`` table:
        the text of the anchor's ``h2`` element and the anchor's ``href``.
    """
    table = make_soup(url).find('table', {'width': '387'})
    estados_links = []
    for anchor in table.findAll('a'):
        # The state name is the text of the heading nested in the anchor.
        nome = anchor.find('h2').string
        estados_links.append((nome, anchor['href']))
    return estados_links
if __name__ == '__main__':
    # Scrape the states index, then the city listing of the first three
    # states only, and print one "state --> cities" line per state.
    estados = get_state_links(BASE_URL + "estados.htm")

    data = {}
    for estado, link in estados[:3]:
        data[estado] = captura_cidades(link)
        # Pause one second after each state's requests.
        sleep(1)

    for estado, cidades in data.items():
        # Space-separated, exactly like printing the three values in a row.
        print('%s %s %s' % (estado, '-->', cidades))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment