'''
parse (HTML -> JSON) e-Compras GDF content
'''

import os
import re
import json
import socket

from bs4 import BeautifulSoup


class Auction(object):
    '''
    general-purpose methods used to parse the auction HTML
    '''
    def get_paths(self):
        '''
        adjusts the path to the data according to the machine in use
        '''
        if 'cade' in socket.getfqdn():
            basepath = 'C:/Users/thiago.marzagao/Dropbox/CADE/RASPAGENS/GDF_V2/'
        elif 'MacBook' in socket.getfqdn():
            basepath = '/Users/thiagomarzagao/Dropbox/CADE/RASPAGENS/GDF_V2/'
        elif 'ThinkPad' in socket.getfqdn():
            basepath = '/home/thiago/Dropbox/CADE/RASPAGENS/GDF_V2/'
        inpath1 = basepath + 'CONTEUDO/HTML/item_resultados/'
        inpath2 = basepath + 'CONTEUDO/HTML/Mapa_Lances/'
        inpath3 = basepath + 'CONTEUDO/HTML/lotes/'
        outpath = basepath + 'CONTEUDO/JSON/'
        return inpath1, inpath2, inpath3, outpath

    def get_header(self, soup):
        '''
        returns basic auction info (id, dates, summary)
        '''
        regex = re.compile('tribuchet-11-bold-claro|tribuchet-13-verde-escuro')
        # position-dependent: assumes the e-Compras GDF page layout,
        # where the header cells sit inside the 117th <td> of the page
        tds = soup.find_all('td')[116]
        content = tds.find_all('td', class_=regex)
        header = [element.text.strip() for element in content]
        return header

    def get_registration(self, header):
        '''
        returns the dates and times of the registration period
        '''
        start_date = header[3][3:13]
        start_hour = header[3][14:22]
        end_date = header[3][27:37]
        end_hour = header[3][38:46]
        registration = {'data_inicio': start_date,
                        'hora_inicio': start_hour,
                        'data_fim': end_date,
                        'hora_fim': end_hour}
        return registration

    def get_opening(self, header):
        '''
        returns the bid-opening date and time
        '''
        opening_date = header[5][:10]
        opening_hour = header[5][10:19]
        opening = {'data_abertura': opening_date,
                   'hora_abertura': opening_hour}
        return opening

    def get_general_info(self, html):
        '''
        returns basic auction info as a Python dict
        '''
        soup = BeautifulSoup(html)
        header = self.get_header(soup)
        summary = header[1]
        registration = self.get_registration(header)
        opening = self.get_opening(header)
        general_info = {'resumo': summary,
                        'inscricao': registration,
                        'abertura': opening}
        return general_info

    def to_number(self, as_string):
        '''
        converts a string to float, fixing thousands separators and decimal commas
        '''
        as_string = as_string.replace('.', '')
        as_string = as_string.replace(',', '.')
        return float(as_string)
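
    # example (assuming the site's Brazilian number formatting):
    # to_number('1.234,56') -> 1234.56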

    def fix_problem_field(self, field):
        '''
        splits quantity and unit
        '''
        splitted = field.split()[:2]
        try:
            quantity = self.to_number(splitted[0])
            unity = splitted[1]
            return quantity, unity
        except (IndexError, ValueError):
            return 'invalid_value', 'invalid_value'
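
    # example (hypothetical field): fix_problem_field('10,00 unidade') -> (10.0, 'unidade')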

    def to_json(self, auction, i):
        '''
        exports the auction data to a JSON file
        '''
        fname = self.get_paths()[3] + 'licitacao_id_{}.json'.format(i)
        with open(fname, mode='wb') as destination:
            json.dump(auction, destination)


class SingleAuction(Auction):
    '''
    methods specific to parsing the HTML of auctions
    whose items are listed individually
    '''
    def parse_tables(self, soup):
        '''
        returns the tables contained in the HTML
        '''
        # pattern that identifies the item code in a table's first row
        regex = r'\d\.\d\.\d{2}\.\d{2}\.\d{2}\.\d{2}\.\d{4}\.\d{6}-\d{2}'
        tables = []
        raw_tables = soup.find_all('table')[1:-5]
        for raw_table in raw_tables:
            if raw_table.find('td').text:
                table = []
                rows = raw_table.find_all('tr')[1:]
                for row in rows:
                    columns = row.find_all('td')
                    text = [cell.text.strip() for cell in columns]
                    # an <img> in the row marks the winning bid
                    if row.find_all('img'):
                        text.append('winner')
                    else:
                        text.append('')
                    table.append(text)
                if not table:
                    continue
                # tag each table as an 'item' table or a 'bids' table
                if re.match(regex, table[0][1]):
                    table.append('item')
                else:
                    table.append('bids')
                tables.append(table)
        return tables

    def organize_tables_data(self, tables):
        '''
        returns the tables as a Python dict
        '''
        data = {}
        for i, table in enumerate(tables):
            bids = None
            if table[-1] == 'item':
                # the 'bids' table, if any, comes right after its 'item' table
                if i + 1 < len(tables):
                    if tables[i + 1][-1] == 'bids':
                        bids = tables[i + 1][:-1]
                data[i] = {'item': table[0][:-1], 'bids': bids}
        return data
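
    # resulting shape: {table index: {'item': [item row cells],
    #                                 'bids': [bid rows, each ending in the 'winner'/'' flag] or None}}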

    def get_item(self, pair):
        '''
        returns the data of the auctioned item
        '''
        item = {}
        item['codigo'] = pair['item'][1]
        item['qtde_cotada'] = self.to_number(pair['item'][2])
        item['descricao'] = pair['item'][3]
        item['est_preco_uni'] = self.to_number(pair['item'][4])
        item['est_preco_tot'] = self.to_number(pair['item'][5])
        return item

    def get_bid(self, bid):
        '''
        returns the data of the bid
        '''
        lance = {}
        lance['participante'] = bid[0]
        lance['qtde'], lance['unidade'] = self.fix_problem_field(bid[1])
        lance['preco_uni'] = self.to_number(bid[2])
        lance['preco_tot'] = self.to_number(bid[3])
        lance['diferenca'] = self.to_number(bid[4])
        lance['preco_neg'] = bid[5]
        # rows with an extra cell (besides the 'winner'/'' flag) carry the brand
        if len(bid) == 8:
            lance['marca'] = bid[6]
        if bid[-1] == 'winner':
            lance['vencedor'] = True
        else:
            lance['vencedor'] = False
        return lance

    def get_item_bids(self, data):
        '''
        returns the item-bids pairs
        '''
        item_bids = {}
        for key in data:
            new_pair = {}
            pair = data[key]
            new_pair['item'] = self.get_item(pair)
            if pair['bids']:
                lances = {}
                for i, bid in enumerate(pair['bids']):
                    lances[i] = self.get_bid(bid)
                new_pair['lances'] = lances
            else:
                new_pair['lances'] = None
            item_bids[key] = new_pair
        return item_bids

    def export_data(self, html):
        '''
        returns the auction data
        '''
        soup = BeautifulSoup(html)
        tables = self.parse_tables(soup)
        data = self.organize_tables_data(tables)
        item_bids = self.get_item_bids(data)
        return item_bids


class LotsAuction(Auction):
    '''
    methods specific to parsing the HTML of auctions
    whose items are grouped into lots
    '''
    def get_htmls(self, i):
        '''
        returns the HTML of each bid/lot of the auction
        '''
        path = self.get_paths()[2] + i + '/'
        htmls = []
        for file in os.listdir(path):
            with open(path + file, mode='rb') as source:
                html = source.read()
            htmls.append(html)
        return htmls

    def get_bidder_cnpj(self, field):
        '''
        returns the bidder's corporate name and CNPJ
        '''
        field = field.strip()
        cnpj = field.split()[-1]
        cnpj = cnpj.replace('(', '')
        cnpj = cnpj.replace(')', '')
        i = field.index(cnpj)
        bidder = field[:i - 2]
        return bidder, cnpj
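
    # example (hypothetical bidder, assuming the CNPJ comes last, in parentheses):
    # get_bidder_cnpj('ACME LTDA (00.000.000/0001-00)')
    # -> ('ACME LTDA', '00.000.000/0001-00')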

    def get_info(self, bid):
        '''
        returns general data of the bid/lot
        '''
        info = {}
        info['num_lote'] = int(bid[2])
        info['preco_lote'] = self.to_number(bid[3])
        info['participante'] = self.get_bidder_cnpj(bid[5])[0]
        info['cnpj'] = self.get_bidder_cnpj(bid[5])[1]
        return info

    def get_item(self, bid, i):
        '''
        returns the data of each item of the bid/lot
        '''
        item_bid = {}
        item_bid['descricao'] = bid[i]
        item_bid['quantidade'] = self.fix_problem_field(bid[i + 1])[0]
        item_bid['unidade'] = self.fix_problem_field(bid[i + 1])[1]
        item_bid['preco_uni'] = self.to_number(bid[i + 2])
        item_bid['preco_tot'] = self.to_number(bid[i + 3])
        item_bid['marca'] = bid[i + 4]
        return item_bid

    def get_bids(self, htmls):
        '''
        returns the bids/lots of each auction
        '''
        bids = []
        for html in htmls:
            soup = BeautifulSoup(html)
            fields = soup.find_all(class_='nota_tit')
            bids.append([field.text for field in fields])
        return bids

    def organize_bids(self, bids):
        '''
        organizes the bids/lots of each auction
        '''
        data = {}
        for n, bid in enumerate(bids):
            new_bid = {}
            new_bid['lance_info'] = self.get_info(bid)
            items = {}
            # the item fields start at position 11 and come in groups of five
            for k, idx in enumerate(range(len(bid[11:]))[::5]):
                # lots without bids carry a placeholder message instead of items
                if u'Este Lote n\u00E3o possui Lances' in bid[11:][0]:
                    continue
                items[k] = self.get_item(bid[11:], idx)
            new_bid['lance_itens'] = items
            data[n] = new_bid
        return data
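
    # resulting shape: {lot_index: {'lance_info': {...},
    #                               'lance_itens': {item_index: {...}}}}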

    def export_data(self, i):
        '''
        returns the auction data
        '''
        htmls = self.get_htmls(i)
        bids = self.get_bids(htmls)
        data = self.organize_bids(bids)
        return data


def do_it():
    '''
    runs the whole thing
    '''
    auction = Auction()
    path1, path2 = auction.get_paths()[:2]
    n = 0
    for file1, file2 in zip(os.listdir(path1), os.listdir(path2)):
        if 'licitacao' in file1:
            if file1 == file2:
                n += 1
                print file1, n
                i = file1[13:].replace('.html', '')
                with open(path1 + file1, mode='rb') as buffer1:
                    html1 = buffer1.read()
                if '<h1>Service Unavailable</h1>' in html1:
                    continue
                general_info = auction.get_general_info(html1)
                data = {}
                data['info_geral'] = general_info
                with open(path2 + file2, mode='rb') as buffer2:
                    html2 = buffer2.read()
                # the 'detalhamento' handler marks auctions organized into lots
                if 'onclick="detalhamento' in html2:
                    auction = LotsAuction()
                    data['lances'] = auction.export_data(i)
                else:
                    auction = SingleAuction()
                    data['objetos'] = auction.export_data(html2)
                auction.to_json(data, i)
                fullpath = auction.get_paths()[3] + 'licitacao_id_{}.json'.format(i)
                if not os.path.exists(fullpath):
                    print 'FILE NOT FOUND!'
                    print 'press any key to continue'
                    raw_input()


if __name__ == '__main__':
    do_it()