Created January 19, 2015 13:07
# scrape e-Compras GDF (https://www.compras.df.gov.br/)

import os
import requests
from bs4 import BeautifulSoup

baseurl = 'https://www.compras.df.gov.br/publico/'
basepath = '/Users/thiagomarzagao/Desktop/HTML/'

primeiro_id = 0 # ID of the first auction
ultimo_id = 48355 # ID of the last auction (as of 12/18/14)

def get_keys(soup):
    # extract the (ID_Forn, ID_Lote[, ID_Ordem]) keys from the onclick handlers
    keys = []
    hrefs = [href for href in soup.find_all('a')][1:-2][::2]
    for href in hrefs:
        key = href.get('onclick')
        key = key.replace('detalhamento(', '')
        key = key.replace(')', '')
        key = key.replace(',', ' ')
        key = key.split()
        keys.append(key)
    return keys

def get_urls(keys):
    # build the Mapa_Lances.asp URL for each key
    urls = []
    for key in keys:
        piece1 = 'https://www.compras.df.gov.br/publico/Mapa_Lances.asp?'
        piece2 = 'ID_Forn=' + key[0]
        piece3 = '&ID_Lote=' + key[1]
        url = piece1 + piece2 + piece3
        if len(key) == 3:
            piece4 = '&ID_Ordem=' + key[2]
            url += piece4
        urls.append(url)
    return urls

session = requests.Session()

#for i in range(primeiro_id, ultimo_id):
for i in ['10033']:
    print ''
    print 'id:', i

    # scrape the primary page (contains dates and summary)
    endurl = 'item_resultados.asp?id={}'.format(i)
    url = baseurl + endurl
    html = session.get(url, verify = False)

    # check whether the auction exists
    if 'Favor tentar novamente.' in html.text:
        continue
    if 'mero de Licita' in html.text and 'o Encontrada' in html.text:
        continue

    # save the primary page
    destination = basepath + 'item_resultados/item_resultados_{}.html'.format(i)
    with open(destination, mode = 'wb') as output:
        output.write(html.content)
    print 'item_resultados'

    # scrape the secondary page (contains bids and bidders)
    endurl = 'Mapa_Lances.asp?id={}'.format(i)
    url = baseurl + endurl
    html = session.get(url, verify = False)

    # save the secondary page
    destination = basepath + 'Mapa_Lances/Mapa_Lances_{}.html'.format(i)
    with open(destination, mode = 'wb') as output:
        output.write(html.content)
    print 'Mapa_Lances'

    # scrape and save the lots, if any
    if 'onClick="detalhamento' in html.text:
        lotpath = basepath + 'lotes/' + i + '/'
        if not os.path.exists(lotpath):
            os.makedirs(lotpath)
        else:
            continue
        soup = BeautifulSoup(html.text, 'html.parser')
        keys = get_keys(soup)
        urls = get_urls(keys)
        for n, url in enumerate(urls):
            html = session.get(url, verify = False)
            destination = lotpath + 'pagina_{}.html'.format(n)
            with open(destination, mode = 'wb') as output:
                output.write(html.content)
            print 'pagina', n
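Note that the script only creates the per-auction lotes/<id> folders itself; the item_resultados, Mapa_Lances, and lotes subdirectories under basepath are assumed to exist already. A minimal setup sketch (not part of the original gist), reusing the same basepath:

import os

basepath = '/Users/thiagomarzagao/Desktop/HTML/'
for folder in ['item_resultados', 'Mapa_Lances', 'lotes']:
    path = os.path.join(basepath, folder)
    if not os.path.exists(path):
        os.makedirs(path)

Also, because the requests are made with verify = False, requests will likely emit an InsecureRequestWarning on every call; urllib3's disable_warnings() is one way to silence that if the noise becomes a problem.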
Hello Thiago, good afternoon, and thank you for your code.
I've tried to do the same with this site: http://prodapp2.seace.gob.pe/seacebus-uiwd-pub/buscadorPublico/buscadorPublico.xhtml
but I can't get it to work. Could you please help me? I'd really appreciate it.
Greetings from Peru.