Skip to content

Instantly share code, notes, and snippets.

@thiagomarzagao
Created January 19, 2015 13:09
Show Gist options
  • Save thiagomarzagao/f5b78bd8a4bc874b9846 to your computer and use it in GitHub Desktop.
Save thiagomarzagao/f5b78bd8a4bc874b9846 to your computer and use it in GitHub Desktop.
'''
Parse e-Compras GDF auction pages (HTML) and export them as JSON files.
'''
import os
import re
import json
import socket
from bs4 import BeautifulSoup
class Auction(object):
    '''
    General helpers used to parse the HTML of an auction (licitacao).
    '''

    # (marker, folder) pairs: the first marker found inside the host name
    # selects the root folder of the scraped data on that machine.
    _BASEPATHS = (
        ('cade', 'C:/Users/thiago.marzagao/Dropbox/CADE/RASPAGENS/GDF_V2/'),
        ('MacBook', '/Users/thiagomarzagao/Dropbox/CADE/RASPAGENS/GDF_V2/'),
        ('ThinkPad', '/home/thiago/Dropbox/CADE/RASPAGENS/GDF_V2/'),
    )

    def get_paths(self, hostname=None):
        '''
        Return the data folders for the computer in use.

        hostname: host name to match against the known machines; defaults
        to socket.getfqdn().

        Returns the tuple (inpath1, inpath2, inpath3, outpath), i.e. the
        'item_resultados', 'Mapa_Lances' and 'lotes' HTML folders followed
        by the JSON output folder.

        Raises RuntimeError when the host name matches no known machine
        (this used to surface as an UnboundLocalError).
        '''
        if hostname is None:
            hostname = socket.getfqdn()
        for marker, candidate in self._BASEPATHS:
            if marker in hostname:
                basepath = candidate
                break
        else:
            raise RuntimeError(
                'no data folder configured for host {!r}'.format(hostname))
        inpath1 = basepath + 'CONTEUDO/HTML/item_resultados/'
        inpath2 = basepath + 'CONTEUDO/HTML/Mapa_Lances/'
        inpath3 = basepath + 'CONTEUDO/HTML/lotes/'
        outpath = basepath + 'CONTEUDO/JSON/'
        return inpath1, inpath2, inpath3, outpath

    def get_header(self, soup):
        '''
        Return the basic auction info (id, dates, summary) as a list of
        stripped strings.

        The cells are looked up inside the <td> at index 116 of the page,
        so this is tied to the exact layout of the scraped pages.
        '''
        regex = re.compile('tribuchet-11-bold-claro|tribuchet-13-verde-escuro')
        container = soup.find_all('td')[116]
        cells = container.find_all('td', class_=regex)
        return [cell.text.strip() for cell in cells]

    def get_registration(self, header):
        '''
        Return the dates and hours of the registration period.

        Both timestamps are cut out of header[3] at fixed character
        offsets.
        '''
        period = header[3]
        return {'data_inicio': period[3:13],
                'hora_inicio': period[14:22],
                'data_fim': period[27:37],
                'hora_fim': period[38:46]}

    def get_opening(self, header):
        '''
        Return the date and hour at which bidding opens.

        Both values are cut out of header[5] at fixed character offsets.
        '''
        moment = header[5]
        return {'data_abertura': moment[:10],
                'hora_abertura': moment[10:19]}

    def get_general_info(self, html):
        '''
        Return the basic auction info as a Python dict with the keys
        'resumo', 'inscricao' and 'abertura'.
        '''
        # NOTE(review): no parser is named here, so bs4 picks whichever
        # one is installed; naming one may shift the hard-coded cell index
        # used by get_header - confirm before changing.
        soup = BeautifulSoup(html)
        header = self.get_header(soup)
        return {'resumo': header[1],
                'inscricao': self.get_registration(header),
                'abertura': self.get_opening(header)}

    def to_number(self, as_string):
        '''
        Convert a string such as '1.234,56' to float: dots are dropped
        and the comma becomes the decimal point.

        Raises ValueError when the result is not a valid float literal.
        '''
        return float(as_string.replace('.', '').replace(',', '.'))

    def fix_problem_field(self, field):
        '''
        Split a field into (quantity, unit).

        Only the first two whitespace-separated tokens are used. Returns
        ('invalid_value', 'invalid_value') when a token is missing or the
        quantity is not a number.
        '''
        tokens = field.split()[:2]
        try:
            return self.to_number(tokens[0]), tokens[1]
        except (IndexError, ValueError):
            # IndexError: fewer than two tokens; ValueError: bad number
            return 'invalid_value', 'invalid_value'

    def to_json(self, auction, i):
        '''
        Export the auction data to 'licitacao_id_<i>.json' in the JSON
        output folder.
        '''
        fname = self.get_paths()[3] + 'licitacao_id_{}.json'.format(i)
        # text mode: json.dump writes str, which a binary handle rejects
        # on Python 3; the default output contains no raw newlines, so the
        # bytes written on Python 2 are unchanged.
        with open(fname, mode='w') as destination:
            json.dump(auction, destination)
class SingleAuction(Auction):
    '''
    Helpers specific to the HTML of auctions whose items are bid on
    individually.
    '''

    # Code expected in the second cell of the first data row of an item
    # table. Raw string: the pattern is unchanged, but no longer relies on
    # invalid string escapes such as '\d'.
    _ITEM_CODE = re.compile(
        r'\d\.\d\.\d{2}\.\d{2}\.\d{2}\.\d{2}\.\d{4}\.\d{6}-\d{2}')

    def parse_tables(self, soup):
        '''
        Return the tables contained in the HTML.

        Each table is a list of rows; each row is the stripped text of its
        <td> cells followed by 'winner' (the row contains an <img>) or ''.
        The last element of every table is the marker 'item' or 'bids'.
        The first table and the last five tables of the page are ignored,
        as is the first <tr> of each table.
        '''
        tables = []
        for raw_table in soup.find_all('table')[1:-5]:
            first_cell = raw_table.find('td')
            # a table without any <td> used to raise AttributeError here
            if first_cell is None or not first_cell.text:
                continue
            table = []
            for row in raw_table.find_all('tr')[1:]:
                text = [cell.text.strip() for cell in row.find_all('td')]
                text.append('winner' if row.find_all('img') else '')
                table.append(text)
            if not table:
                continue
            if self._ITEM_CODE.match(table[0][1]):
                table.append('item')
            else:
                table.append('bids')
            tables.append(table)
        return tables

    def organize_tables_data(self, tables):
        '''
        Return the tables as a Python dict.

        Every 'item' table yields one entry, keyed by its position in
        `tables`, with the item row (winner marker removed) and the rows
        of the table that follows it when that one is a 'bids' table
        (None otherwise).
        '''
        data = {}
        for i, table in enumerate(tables):
            if table[-1] != 'item':
                continue
            bids = None
            if i + 1 < len(tables) and tables[i + 1][-1] == 'bids':
                bids = tables[i + 1][:-1]
            data[i] = {'item': table[0][:-1], 'bids': bids}
        return data

    def get_item(self, pair):
        '''
        Return the data of the auctioned item, read from pair['item'].
        '''
        row = pair['item']
        return {'codigo': row[1],
                'qtde_cotada': self.to_number(row[2]),
                'descricao': row[3],
                'est_preco_uni': self.to_number(row[4]),
                'est_preco_tot': self.to_number(row[5])}

    def get_bid(self, bid):
        '''
        Return the data of a single bid row.

        'marca' is only present when the row has exactly 8 elements;
        'vencedor' is True when the row ends with the 'winner' marker.
        '''
        lance = {}
        lance['participante'] = bid[0]
        lance['qtde'], lance['unidade'] = self.fix_problem_field(bid[1])
        lance['preco_uni'] = self.to_number(bid[2])
        lance['preco_tot'] = self.to_number(bid[3])
        lance['diferenca'] = self.to_number(bid[4])
        lance['preco_neg'] = bid[5]
        if len(bid) == 8:
            lance['marca'] = bid[6]
        lance['vencedor'] = bid[-1] == 'winner'
        return lance

    def get_item_bids(self, data):
        '''
        Return the item/bids sets.

        Keys are the same as in `data`; 'lances' maps the position of each
        bid to its parsed data, or is None when the item has no bids.
        '''
        item_bids = {}
        for key, pair in data.items():
            lances = None
            item = self.get_item(pair)
            if pair['bids']:
                lances = {}
                for i, bid in enumerate(pair['bids']):
                    lances[i] = self.get_bid(bid)
            item_bids[key] = {'item': item, 'lances': lances}
        return item_bids

    def export_data(self, html):
        '''
        Return the data of the auction parsed from `html`.
        '''
        # NOTE(review): no parser is named, so bs4 picks whichever one is
        # installed; the table slice in parse_tables may depend on it.
        soup = BeautifulSoup(html)
        tables = self.parse_tables(soup)
        return self.get_item_bids(self.organize_tables_data(tables))
class LotsAuction(Auction):
    '''
    Helpers specific to the HTML of auctions whose items are grouped
    into lots.
    '''

    def get_htmls(self, i):
        '''
        Return the raw HTML of every bid/lot file of auction `i`.

        `i` is the auction id as a string; it names the sub-folder of the
        'lotes' folder that is read. Files are read in sorted name order
        so that the result does not depend on the arbitrary order of
        os.listdir.
        '''
        path = self.get_paths()[2] + i + '/'
        htmls = []
        for fname in sorted(os.listdir(path)):
            with open(path + fname, mode='rb') as source:
                htmls.append(source.read())
        return htmls

    def get_bidder_cnpj(self, field):
        '''
        Return the (company name, CNPJ) of the bidder.

        The CNPJ is the last whitespace-separated token of the field with
        its parentheses removed; the name is everything before it, minus
        the two characters that precede the CNPJ digits.

        Raises IndexError when the field is empty.
        '''
        field = field.strip()
        cnpj = field.split()[-1].replace('(', '').replace(')', '')
        # rindex: the CNPJ is the LAST token, so search from the right; a
        # left-to-right search truncates names that contain the same text
        cut = field.rindex(cnpj)
        return field[:cut - 2], cnpj

    def get_info(self, bid):
        '''
        Return the general data of the bid/lot: lot number, lot price,
        bidder name and CNPJ.
        '''
        bidder, cnpj = self.get_bidder_cnpj(bid[5])
        return {'num_lote': int(bid[2]),
                'preco_lote': self.to_number(bid[3]),
                'participante': bidder,
                'cnpj': cnpj}

    def get_item(self, bid, i):
        '''
        Return the data of one item of the bid/lot.

        The item occupies the five consecutive fields bid[i] .. bid[i+4].
        '''
        quantity, unit = self.fix_problem_field(bid[i + 1])
        return {'descricao': bid[i],
                'quantidade': quantity,
                'unidade': unit,
                'preco_uni': self.to_number(bid[i + 2]),
                'preco_tot': self.to_number(bid[i + 3]),
                'marca': bid[i + 4]}

    def get_bids(self, htmls):
        '''
        Return, for each HTML document, the text of every element whose
        class is 'nota_tit'.
        '''
        bids = []
        for html in htmls:
            # NOTE(review): no parser is named, so bs4 picks whichever
            # one is installed.
            soup = BeautifulSoup(html)
            bids.append(
                [field.text for field in soup.find_all(class_='nota_tit')])
        return bids

    def organize_bids(self, bids):
        '''
        Organize the bids/lots of an auction.

        Returns a dict keyed by the position of each bid; every value
        holds 'lance_info' (see get_info) and 'lance_itens', a dict of the
        items found from field 11 onwards, five fields per item. The items
        dict is empty when the first of those fields carries the "lot has
        no bids" message.
        '''
        no_bids = u'Este Lote n\u00E3o possui Lances'
        data = {}
        for n, bid in enumerate(bids):
            info = self.get_info(bid)
            fields = bid[11:]
            items = {}
            # the message check does not change between items: do it once
            if fields and no_bids not in fields[0]:
                for k, idx in enumerate(range(0, len(fields), 5)):
                    items[k] = self.get_item(fields, idx)
            data[n] = {'lance_info': info, 'lance_itens': items}
        return data

    def export_data(self, i):
        '''
        Return the data of auction `i`, read from its lot files.
        '''
        return self.organize_bids(self.get_bids(self.get_htmls(i)))
def do_it():
    '''
    Run the whole thing: convert every auction found in both HTML input
    folders to a JSON file.

    A file is processed when its name contains 'licitacao' and it exists
    under the same name in both input folders. Files are matched by name,
    not by their position in the two directory listings, so one extra
    file in either folder no longer makes every later file get skipped.
    Pages holding the 'Service Unavailable' message are skipped.
    '''
    base = Auction()
    paths = base.get_paths()
    path1, path2, outpath = paths[0], paths[1], paths[3]
    try:
        wait_for_user = raw_input  # Python 2
    except NameError:
        wait_for_user = input
    in_path2 = set(os.listdir(path2))
    n = 0
    for fname in sorted(os.listdir(path1)):
        if 'licitacao' not in fname or fname not in in_path2:
            continue
        n += 1
        # single-argument form prints the same text on Python 2 and 3
        print('{} {}'.format(fname, n))
        # drop the 13-character prefix and the extension to get the id
        i = fname[13:].replace('.html', '')
        with open(path1 + fname, mode='rb') as buffer1:
            html1 = buffer1.read()
        # the files are read as bytes, hence the bytes literals
        if b'<h1>Service Unavailable</h1>' in html1:
            continue
        data = {'info_geral': base.get_general_info(html1)}
        with open(path2 + fname, mode='rb') as buffer2:
            html2 = buffer2.read()
        if b'onclick="detalhamento' in html2:
            parser = LotsAuction()
            data['lances'] = parser.export_data(i)
        else:
            parser = SingleAuction()
            data['objetos'] = parser.export_data(html2)
        parser.to_json(data, i)
        # get_paths is a method; the bare call used here before raised
        # NameError right after the first file was written
        fullpath = outpath + 'licitacao_id_{}.json'.format(i)
        if not os.path.exists(fullpath):
            print('FILE NOT FOUND!')
            print('press any key to continue')
            wait_for_user()
# Module-level call: the whole conversion starts as soon as this file is
# executed (it also runs if the file is imported).
do_it()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment