# scrape e-Compras GDF (https://www.compras.df.gov.br/)
import os

import requests
from bs4 import BeautifulSoup

baseurl = 'https://www.compras.df.gov.br/publico/'
basepath = '/Users/thiagomarzagao/Desktop/HTML/'
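
# NOTE: the script writes into the item_resultados/ and Mapa_Lances/
# subdirectories of basepath without creating them, so they must exist
# beforehand; only the per-auction lot directories under lotes/ are
# created on the fly below.
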
primeiro_id = 0 # ID of the first auction
ultimo_id = 48355 # ID of the last auction (as of 12/18/14)

def get_keys(soup):
    # each lot link carries an onclick handler of the form
    # "detalhamento(ID_Forn, ID_Lote[, ID_Ordem])"; strip the JavaScript
    # wrapper and split the arguments into a list
    keys = []
    hrefs = [href for href in soup.find_all('a')][1:-2][::2]
    for href in hrefs:
        key = href.get('onclick')
        key = key.replace('detalhamento(', '')
        key = key.replace(')', '')
        key = key.replace(',', ' ')
        key = key.split()
        keys.append(key)
    return keys
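
# Rebuild each key into a Mapa_Lances.asp detail URL; the third argument
# (ID_Ordem) is optional and only present for some lots.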
def get_urls(keys):
    urls = []
    for key in keys:
        piece1 = 'https://www.compras.df.gov.br/publico/Mapa_Lances.asp?'
        piece2 = 'ID_Forn=' + key[0]
        piece3 = '&ID_Lote=' + key[1]
        url = piece1 + piece2 + piece3
        if len(key) == 3:
            piece4 = '&ID_Ordem=' + key[2]
            url += piece4
        urls.append(url)
    return urls
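
# A single Session reuses the underlying connection across the thousands of
# requests below. verify=False disables TLS certificate validation
# (presumably the site's certificate fails verification), so urllib3 will
# emit InsecureRequestWarning messages during the crawl.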
session = requests.Session()

# for i in range(primeiro_id, ultimo_id + 1):  # full crawl; range() excludes its endpoint
for i in ['10033']:  # test run on a single auction ID
    print()
    print('id:', i)

    # scrape the primary page (contains dates and a summary)
    endurl = 'item_resultados.asp?id={}'.format(i)
    url = baseurl + endurl
    html = session.get(url, verify=False)

    # skip auctions that don't exist; the substrings are accent-free
    # fragments of the site's Portuguese error messages
    if 'Favor tentar novamente.' in html.text:
        continue
    if 'mero de Licita' in html.text and 'o Encontrada' in html.text:
        continue

    # save the primary page
    destination = basepath + 'item_resultados/item_resultados_{}.html'.format(i)
    with open(destination, mode='wb') as output:
        output.write(html.content)
    print('item_resultados')

    # scrape the secondary page (contains bids and bidders)
    endurl = 'Mapa_Lances.asp?id={}'.format(i)
    url = baseurl + endurl
    html = session.get(url, verify=False)

    # save the secondary page
    destination = basepath + 'Mapa_Lances/Mapa_Lances_{}.html'.format(i)
    with open(destination, mode='wb') as output:
        output.write(html.content)
    print('Mapa_Lances')

    # scrape and save the lots (if any)
    if 'onClick="detalhamento' in html.text:
        lotpath = os.path.join(basepath, 'lotes', str(i))
        if not os.path.exists(lotpath):
            os.makedirs(lotpath)
        else:
            continue  # lot directory already exists: auction scraped before
        soup = BeautifulSoup(html.text, 'html.parser')
        keys = get_keys(soup)
        urls = get_urls(keys)
        for n, url in enumerate(urls):
            html = session.get(url, verify=False)
            destination = os.path.join(lotpath, 'pagina_{}.html'.format(n))
            with open(destination, mode='wb') as output:
                output.write(html.content)
            print('pagina', n)