Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
# scrape e-Compras GDF (https://www.compras.df.gov.br/)
import os
import requests
from bs4 import BeautifulSoup
# root URL of the public section of the procurement portal; every page
# scraped below is reached by appending a relative path to this.
baseurl = 'https://www.compras.df.gov.br/publico/'
# local directory tree where the raw HTML pages are saved
basepath = '/Users/thiagomarzagao/Desktop/HTML/'
# NOTE: these two bounds are only used by the full-range loop that is
# currently commented out in the script body below.
primeiro_id = 0 # ID of the first auction
ultimo_id = 48355 # ID of the last auction (as of 12/18/14)
def get_keys(soup):
    """Extract the detalhamento() argument lists from a Mapa_Lances page.

    Every other <a> tag -- skipping the first anchor and the last two,
    which are navigation links -- carries an onclick handler of the form
    'detalhamento(<ID_Forn>,<ID_Lote>[,<ID_Ordem>])'.  Returns a list of
    lists of those arguments, as strings (2 or 3 elements each).
    """
    # Fix: the original wrapped find_all() in a redundant identity list
    # comprehension ([href for href in ...]); find_all() already returns
    # a sliceable list.
    anchors = soup.find_all('a')[1:-2][::2]
    keys = []
    for anchor in anchors:
        onclick = anchor.get('onclick')
        # 'detalhamento(1,2,3)' -> '1 2 3' -> ['1', '2', '3']
        # (comma -> space + split() also tolerates 'detalhamento(1, 2, 3)')
        args = onclick.replace('detalhamento(', '').replace(')', '').replace(',', ' ')
        keys.append(args.split())
    return keys
def get_urls(keys):
    """Build the Mapa_Lances detail URL for each key from get_keys().

    Each key is [ID_Forn, ID_Lote] or [ID_Forn, ID_Lote, ID_Ordem]; the
    optional third element becomes the ID_Ordem query parameter.
    """
    def build(key):
        url = ('https://www.compras.df.gov.br/publico/Mapa_Lances.asp?'
               'ID_Forn={0}&ID_Lote={1}'.format(key[0], key[1]))
        if len(key) == 3:
            url += '&ID_Ordem={0}'.format(key[2])
        return url
    return [build(key) for key in keys]
session = requests.Session()
#for i in range(primeiro_id, ultimo_id):
for i in ['10033']:
print ''
print 'id:', i
# raspa pagina primaria (contem datas e resumo)
endurl = 'item_resultados.asp?id={}'.format(i)
url = baseurl + endurl
html = session.get(url, verify = False)
# checa se licitacao existe
if 'Favor tentar novamente.' in html.text:
continue
if 'mero de Licita' and 'o Encontrada' in html.text:
continue
# salva pagina principal
destination = basepath + 'item_resultados/item_resultados_{}.html'.format(i)
with open(destination, mode = 'wb') as output:
for line in html:
output.write(str(line))
print 'item_resultados'
# raspa pagina secundaria (contem lances e licitantes)
endurl = 'Mapa_Lances.asp?id={}'.format(i)
url = baseurl + endurl
html = session.get(url, verify = False)
# salva pagina secundaria
destination = basepath + 'Mapa_Lances/Mapa_Lances_{}.html'.format(i)
with open(destination, mode = 'wb') as output:
for line in html:
output.write(str(line))
print 'Mapa_Lances'
# raspa e salva lotes (se existirem)
if 'onClick="detalhamento' in html.text:
lotpath = basepath + 'lotes/' + i
if not os.path.exists(lotpath):
os.makedirs(lotpath)
else:
continue
soup = BeautifulSoup(html.text)
keys = get_keys(soup)
urls = get_urls(keys)
for n, url in enumerate(urls):
html = session.get(url, verify = False)
destination = lotpath + 'pagina_{}.html'.format(str(n))
with open(destination, mode = 'wb') as output:
for line in html:
output.write(str(line))
print 'pagina', n
@cocol1zo

This comment has been minimized.

Copy link

cocol1zo commented Aug 17, 2019

Hola Thiago, buenas tardes, gracias por tu código,
he intentado hacer lo mismo con esta web: http://prodapp2.seace.gob.pe/seacebus-uiwd-pub/buscadorPublico/buscadorPublico.xhtml
pero no puedo, me puedes ayudar por favor, te agradezco.
saludos desde Perú.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.