'''
parse (HTML -> JSON) e-Compras GDF content
'''
import os
import re
import json
import socket

from bs4 import BeautifulSoup
class Auction(object):
    '''
    General helpers shared by the e-Compras GDF auction parsers.
    '''
    def get_paths(self):
        '''
        Return the four data paths (item results, bid maps, lots and
        JSON output), picking the base path that matches the machine
        the script is running on.

        Raises RuntimeError on an unknown host (bug fix: previously
        `basepath` was left unbound and the code crashed later with an
        opaque NameError).
        '''
        fqdn = socket.getfqdn()
        if 'cade' in fqdn:
            basepath = 'C:/Users/thiago.marzagao/Dropbox/CADE/RASPAGENS/GDF_V2/'
        elif 'MacBook' in fqdn:
            basepath = '/Users/thiagomarzagao/Dropbox/CADE/RASPAGENS/GDF_V2/'
        elif 'ThinkPad' in fqdn:
            basepath = '/home/thiago/Dropbox/CADE/RASPAGENS/GDF_V2/'
        else:
            raise RuntimeError('unknown machine: {}'.format(fqdn))
        inpath1 = basepath + 'CONTEUDO/HTML/item_resultados/'
        inpath2 = basepath + 'CONTEUDO/HTML/Mapa_Lances/'
        inpath3 = basepath + 'CONTEUDO/HTML/lotes/'
        outpath = basepath + 'CONTEUDO/JSON/'
        return inpath1, inpath2, inpath3, outpath
    def get_header(self, soup):
        '''
        Return the auction's basic info (id, dates, summary) as a list
        of stripped strings.
        '''
        regex = re.compile('tribuchet-11-bold-claro|tribuchet-13-verde-escuro')
        # td #116 holds the header block -- layout-dependent, so this
        # breaks if e-Compras GDF ever changes its HTML structure
        tds = soup.find_all('td')[116]
        content = tds.find_all('td', class_ = regex)
        header = [element.text.strip() for element in content]
        return header
    def get_registration(self, header):
        '''
        Return the registration period's dates and hours, sliced from
        fixed character positions of the 4th header field.
        '''
        start_date = header[3][3:13]
        start_hour = header[3][14:22]
        end_date = header[3][27:37]
        end_hour = header[3][38:46]
        registration = {'data_inicio': start_date,
                        'hora_inicio': start_hour,
                        'data_fim': end_date,
                        'hora_fim': end_hour}
        return registration
    def get_opening(self, header):
        '''
        Return the bids' opening date and hour, sliced from fixed
        character positions of the 6th header field.
        '''
        opening_date = header[5][:10]
        opening_hour = header[5][10:19]
        opening = {'data_abertura': opening_date,
                   'hora_abertura': opening_hour}
        return opening
    def get_general_info(self, html):
        '''
        Return the auction's basic info as a Python dict.
        '''
        # NOTE(review): no explicit parser passed to BeautifulSoup;
        # kept as is to preserve whatever parser the original runs used
        soup = BeautifulSoup(html)
        header = self.get_header(soup)
        general_info = {'resumo': header[1],
                        'inscricao': self.get_registration(header),
                        'abertura': self.get_opening(header)}
        return general_info
    def to_number(self, as_string):
        '''
        Convert a Brazilian-formatted number string ('1.234,56') to
        float. Raises ValueError on malformed input.
        '''
        as_string = as_string.replace('.', '')  # drop thousands separators
        as_string = as_string.replace(',', '.')  # decimal comma -> point
        return float(as_string)
    def fix_problem_field(self, field):
        '''
        Split a field into (quantity, unit). Returns
        ('invalid_value', 'invalid_value') when the field cannot be
        parsed (bug fix: bare `except:` narrowed to the two exceptions
        the parsing can actually raise, so e.g. KeyboardInterrupt is no
        longer swallowed).
        '''
        splitted = field.split()[:2]
        try:
            quantity = self.to_number(splitted[0])
            unity = splitted[1]
            return quantity, unity
        except (IndexError, ValueError):
            return 'invalid_value', 'invalid_value'
    def to_json(self, auction, i):
        '''
        Dump the auction's data to a JSON file named after its id.
        '''
        fname = self.get_paths()[3] + 'licitacao_id_{}.json'.format(i)
        # binary mode works because json.dump writes (byte) str under
        # Python 2, which this script targets
        with open(fname, mode = 'wb') as destination:
            json.dump(auction, destination)
class SingleAuction(Auction):
    '''
    Parser for auctions whose items are bid on individually
    (not grouped in lots).
    '''

    # matches an item code, e.g. '1.2.34.56.78.90.1234.567890-12'
    # (bug fix: raw string -- the original used bare '\d' escapes --
    # and compiled once instead of per call)
    ITEM_CODE = re.compile(r'\d\.\d\.\d{2}\.\d{2}\.\d{2}\.\d{2}\.\d{4}\.\d{6}-\d{2}')

    def parse_tables(self, soup):
        '''
        Return the tables contained in the HTML.

        Each table is a list of rows; every row is a list of cell
        strings plus a trailing 'winner'/'' marker (rows containing an
        <img> mark the winning bid). A final 'item' or 'bids' tag is
        appended to each table, telling which kind of table it is.
        '''
        tables = []
        raw_tables = soup.find_all('table')[1:-5]
        for raw_table in raw_tables:
            # skip tables whose first cell is empty
            if not raw_table.find('td').text:
                continue
            table = []
            for row in raw_table.find_all('tr')[1:]:
                text = [cell.text.strip() for cell in row.find_all('td')]
                if row.find_all('img'):
                    text.append('winner')
                else:
                    text.append('')
                table.append(text)
            if not table:
                continue
            # tag the table: first row carrying an item code -> 'item'
            if self.ITEM_CODE.match(table[0][1]):
                table.append('item')
            else:
                table.append('bids')
            tables.append(table)
        return tables
    def organize_tables_data(self, tables):
        '''
        Return the tables as a dict mapping table index to a dict with
        the item's first row and its (possibly None) bid rows.
        '''
        data = {}
        for i, table in enumerate(tables):
            bids = None
            if table[-1] == 'item':
                # an item's bids, if any, come in the very next table
                if i + 1 < len(tables):
                    if tables[i+1][-1] == 'bids':
                        bids = tables[i+1][:-1]
            # NOTE(review): 'bids' tables are also stored here as if
            # they were items -- preserved as is, but confirm intent
            data[i] = {'item': table[0][:-1], 'bids': bids}
        return data
    def get_item(self, pair):
        '''
        Return the auctioned item's data (code, quantity, description,
        estimated unit/total prices) as a dict.
        '''
        fields = pair['item']
        item = {}
        item['codigo'] = fields[1]
        item['qtde_cotada'] = self.to_number(fields[2])
        item['descricao'] = fields[3]
        item['est_preco_uni'] = self.to_number(fields[4])
        item['est_preco_tot'] = self.to_number(fields[5])
        return item
    def get_bid(self, bid):
        '''
        Return a single bid's data as a dict.
        '''
        lance = {}
        lance['participante'] = bid[0]
        lance['qtde'], lance['unidade'] = self.fix_problem_field(bid[1])
        lance['preco_uni'] = self.to_number(bid[2])
        lance['preco_tot'] = self.to_number(bid[3])
        lance['diferenca'] = self.to_number(bid[4])
        lance['preco_neg'] = bid[5]
        # 8-field rows carry the brand ('marca') as an extra column
        if len(bid) == 8:
            lance['marca'] = bid[6]
        # last field is the 'winner'/'' marker added by parse_tables
        lance['vencedor'] = bid[-1] == 'winner'
        return lance
    def get_item_bids(self, data):
        '''
        Return the item-bids pairs, with every bid parsed into a dict
        (or None when the item received no bids).
        '''
        item_bids = {}
        for key in data:
            pair = data[key]
            new_pair = {'item': self.get_item(pair)}
            if pair['bids']:
                lances = {}
                for i, bid in enumerate(pair['bids']):
                    lances[i] = self.get_bid(bid)
                new_pair['lances'] = lances
            else:
                new_pair['lances'] = None
            item_bids[key] = new_pair
        return item_bids
    def export_data(self, html):
        '''
        Return the auction's data parsed from its HTML.
        '''
        soup = BeautifulSoup(html)
        tables = self.parse_tables(soup)
        data = self.organize_tables_data(tables)
        return self.get_item_bids(data)
class LotsAuction(Auction):
    '''
    Parser for auctions whose items are grouped in lots.
    '''
    def get_htmls(self, i):
        '''
        Return the HTML of every bid/lot of the auction with id *i*.
        '''
        path = self.get_paths()[2] + i + '/'
        htmls = []
        # sorted() makes the output ordering deterministic across runs
        # (os.listdir order is arbitrary); also: renamed the loop var,
        # which shadowed the `file` builtin
        for fname in sorted(os.listdir(path)):
            with open(path + fname, mode = 'rb') as source:
                htmls.append(source.read())
        return htmls
    def get_bidder_cnpj(self, field):
        '''
        Return the bidder's corporate name and CNPJ. The field is
        assumed to end with the CNPJ in parentheses -- TODO confirm
        against the scraped pages.
        '''
        field = field.strip()
        cnpj = field.split()[-1]
        cnpj = cnpj.replace('(', '')
        cnpj = cnpj.replace(')', '')
        # the name is everything before ' (' + the CNPJ
        i = field.index(cnpj)
        bidder = field[:i-2]
        return bidder, cnpj
    def get_info(self, bid):
        '''
        Return the bid/lot's general data (lot number, lot price,
        bidder name and CNPJ) as a dict.
        '''
        info = {}
        info['num_lote'] = int(bid[2])
        info['preco_lote'] = self.to_number(bid[3])
        # parse the bidder field once instead of twice
        bidder, cnpj = self.get_bidder_cnpj(bid[5])
        info['participante'] = bidder
        info['cnpj'] = cnpj
        return info
    def get_item(self, bid, i):
        '''
        Return the data of a single item of the bid/lot; its five
        fields start at index *i*.
        '''
        item_bid = {}
        item_bid['descricao'] = bid[i]
        # split quantity/unit once instead of twice
        quantity, unity = self.fix_problem_field(bid[i+1])
        item_bid['quantidade'] = quantity
        item_bid['unidade'] = unity
        item_bid['preco_uni'] = self.to_number(bid[i+2])
        item_bid['preco_tot'] = self.to_number(bid[i+3])
        item_bid['marca'] = bid[i+4]
        return item_bid
    def get_bids(self, htmls):
        '''
        Return the bids/lots of the auction, one list of text fields
        per bid HTML.
        '''
        bids = []
        for html in htmls:
            soup = BeautifulSoup(html)
            fields = soup.find_all(class_ = 'nota_tit')
            bids.append([field.text for field in fields])
        return bids
    def organize_bids(self, bids):
        '''
        Organize the bids/lots into a dict: general info plus the
        parsed items of each bid.
        '''
        data = {}
        for n, bid in enumerate(bids):
            new_bid = {}
            new_bid['lance_info'] = self.get_info(bid)
            items = {}
            item_fields = bid[11:]  # item data starts at field 11
            # lots w/o bids carry a placeholder message instead of item
            # fields; the check is invariant, so it was hoisted out of
            # the loop (the original re-sliced and re-tested every pass)
            has_items = item_fields and \
                u'Este Lote n\u00E3o possui Lances' not in item_fields[0]
            if has_items:
                # each item spans five consecutive fields
                for k, idx in enumerate(range(0, len(item_fields), 5)):
                    items[k] = self.get_item(item_fields, idx)
            new_bid['lance_itens'] = items
            data[n] = new_bid
        return data
    def export_data(self, i):
        '''
        Return the auction's data parsed from its per-lot HTML files.
        '''
        htmls = self.get_htmls(i)
        bids = self.get_bids(htmls)
        return self.organize_bids(bids)
def do_it(): | |
''' | |
roda a coisa toda | |
''' | |
auction = Auction() | |
path1, path2 = auction.get_paths()[:2] | |
n = 0 | |
for file1, file2 in zip(os.listdir(path1), os.listdir(path2)): | |
if 'licitacao' in file1: | |
if file1 == file2: | |
n += 1 | |
print file1, n | |
i = file1[13:].replace('.html', '') | |
with open(path1 + file1, mode = 'rb') as buffer1: | |
html1 = buffer1.read() | |
if '<h1>Service Unavailable</h1>' in html1: | |
continue | |
general_info = auction.get_general_info(html1) | |
data = {} | |
data['info_geral'] = general_info | |
with open(path2 + file2, mode = 'rb') as buffer2: | |
html2 = buffer2.read() | |
if 'onclick="detalhamento' in html2: | |
auction = LotsAuction() | |
data['lances'] = auction.export_data(i) | |
else: | |
auction = SingleAuction() | |
data['objetos'] = auction.export_data(html2) | |
auction.to_json(data, i) | |
fullpath = get_paths()[3] + 'licitacao_id_{}.json'.format(i) | |
if not os.path.exists(fullpath): | |
print 'FILE NOT FOUND!' | |
print 'press any key to continue' | |
raw_input() | |
# run only when executed as a script, not when imported as a module
if __name__ == '__main__':
    do_it()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment