Skip to content

Instantly share code, notes, and snippets.

Created January 19, 2015 13:07
Show Gist options
  • Save thiagomarzagao/0288f7ec358caf40a554 to your computer and use it in GitHub Desktop.
Save thiagomarzagao/0288f7ec358caf40a554 to your computer and use it in GitHub Desktop.
# scrape e-Compras GDF (site URL missing from this copy)
import os
import requests
from bs4 import BeautifulSoup
# Prefix that every page name (e.g. 'item_resultados.asp?id=...') is appended to.
# NOTE(review): empty in this copy -- the site URL has to be filled in before
# the requests below can succeed.
baseurl = ''
# Local directory prefix under which the downloaded HTML pages are written.
basepath = '/Users/thiagomarzagao/Desktop/HTML/'
# Bounds for the (currently commented-out) loop over every auction ID.
primeiro_id = 0 # ID of the first auction
ultimo_id = 48355 # ID of the last auction (as of 12/18/14)
def get_keys(soup):
    """Extract the argument lists of the ``detalhamento(...)`` onclick handlers.

    The first anchor and the last two anchors of the page are dropped, and
    then every other remaining anchor is inspected (same selection as the
    original ``[1:-2][::2]`` slice).

    Args:
        soup: parsed page; must provide ``find_all('a')`` returning objects
            with a ``get(attribute_name)`` method (e.g. a BeautifulSoup tree).

    Returns:
        A list with one entry per selected anchor that has an ``onclick``
        attribute. Each entry is the list of argument strings found inside
        ``detalhamento(...)``, e.g. ``['12', '34']`` or ``['1', '2', '3']``.
    """
    keys = []
    anchors = list(soup.find_all('a'))[1:-2][::2]
    for anchor in anchors:
        onclick = anchor.get('onclick')
        if onclick is None:
            # An anchor without an onclick handler carries no key; skip it
            # instead of failing on None.replace(...).
            continue
        # Strip the call wrapper and turn the comma-separated arguments
        # into whitespace-separated tokens.
        arguments = onclick.replace('detalhamento(', '')
        arguments = arguments.replace(')', '')
        arguments = arguments.replace(',', ' ')
        # Previously the parsed key was discarded, so the function always
        # returned an empty list.
        keys.append(arguments.split())
    return keys
def get_urls(keys):
    """Build one detail-page URL per key.

    Args:
        keys: iterable of sequences of strings. Element 0 is used as
            ``ID_Forn``, element 1 as ``ID_Lote`` and, when the key has
            exactly three elements, element 2 as ``ID_Ordem``.

    Returns:
        A list of URL strings, in the same order as ``keys``.
    """
    urls = []
    for key in keys:
        # NOTE(review): the URL prefix is empty in this copy of the script;
        # it has to be restored for the resulting URLs to be fetchable.
        piece1 = ''
        piece2 = 'ID_Forn=' + key[0]
        piece3 = '&ID_Lote=' + key[1]
        url = piece1 + piece2 + piece3
        if len(key) == 3:
            piece4 = '&ID_Ordem=' + key[2]
            url += piece4
        # Previously the finished URL was discarded, so the function always
        # returned an empty list.
        urls.append(url)
    return urls
# Main scraping loop: for each auction ID, download the summary page and the
# bids page, and (when the bids page links to lot details) the lot pages.
#
# NOTE(review): this copy has lost its indentation, and several statements
# that open a block (the two `if` existence checks and every
# `for line in html:`) are not followed by a visible body. Restore them from
# the original script before running.
#
# One Session is reused for every request below.
session = requests.Session()
# Full run over every auction ID (disabled; the loop below is a single-ID test run).
#for i in range(primeiro_id, ultimo_id):
for i in ['10033']:
print ''
print 'id:', i
# scrape the primary page (contains dates and summary)
endurl = 'item_resultados.asp?id={}'.format(i)
url = baseurl + endurl
# NOTE(review): verify = False turns off TLS certificate verification.
html = session.get(url, verify = False)
# check whether the auction exists
if 'Favor tentar novamente.' in html.text:
# NOTE(review): `'mero de Licita' and ...` -- the left operand is a non-empty
# string literal and therefore always truthy, so only the
# `'o Encontrada' in html.text` test decides this condition. If both
# substrings are meant to be required, each needs its own `in html.text`.
if 'mero de Licita' and 'o Encontrada' in html.text:
# save the main page
destination = basepath + 'item_resultados/item_resultados_{}.html'.format(i)
with open(destination, mode = 'wb') as output:
# Iterates over the response object itself (presumably chunks of the body
# that were written to `output` -- the write statement is not visible here).
for line in html:
print 'item_resultados'
# scrape the secondary page (contains bids and bidders)
endurl = 'Mapa_Lances.asp?id={}'.format(i)
url = baseurl + endurl
html = session.get(url, verify = False)
# save the secondary page
destination = basepath + 'Mapa_Lances/Mapa_Lances_{}.html'.format(i)
with open(destination, mode = 'wb') as output:
for line in html:
print 'Mapa_Lances'
# scrape and save the lots (if there are any)
if 'onClick="detalhamento' in html.text:
# String concatenation: works because i is a str in the test loop above;
# with the commented-out range() loop i would be an int and this would
# raise TypeError.
lotpath = basepath + 'lotes/' + i
if not os.path.exists(lotpath):
soup = BeautifulSoup(html.text)
keys = get_keys(soup)
urls = get_urls(keys)
for n, url in enumerate(urls):
html = session.get(url, verify = False)
# NOTE(review): lotpath has no trailing separator, so this yields
# '.../lotes/<id>pagina_<n>.html' rather than a file inside a per-auction
# directory -- confirm which was intended.
destination = lotpath + 'pagina_{}.html'.format(str(n))
with open(destination, mode = 'wb') as output:
for line in html:
print 'pagina', n
Copy link

Hola Thiago, buenas tardes, gracias por tu código,
he intentado hacer lo mismo con esta web:
pero no puedo, me puedes ayudar por favor, te agradezco.
saludos desde Perú.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment