'''
parse (HTML -> JSON) e-Compras GDF content
'''
import os
import re
import json
import socket

from bs4 import BeautifulSoup
class Auction(object):
    '''
    General helpers shared by the e-Compras GDF auction parsers.
    '''
    def get_paths(self):
        '''
        Return the four data paths (item results, bid maps, lots and
        JSON output), picking the base path that matches the machine
        the script is running on.

        Raises RuntimeError on an unknown host (bug fix: previously
        `basepath` was left unbound and the code crashed later with an
        opaque NameError).
        '''
        fqdn = socket.getfqdn()
        if 'cade' in fqdn:
            basepath = 'C:/Users/thiago.marzagao/Dropbox/CADE/RASPAGENS/GDF_V2/'
        elif 'MacBook' in fqdn:
            basepath = '/Users/thiagomarzagao/Dropbox/CADE/RASPAGENS/GDF_V2/'
        elif 'ThinkPad' in fqdn:
            basepath = '/home/thiago/Dropbox/CADE/RASPAGENS/GDF_V2/'
        else:
            raise RuntimeError('unknown machine: {}'.format(fqdn))
        inpath1 = basepath + 'CONTEUDO/HTML/item_resultados/'
        inpath2 = basepath + 'CONTEUDO/HTML/Mapa_Lances/'
        inpath3 = basepath + 'CONTEUDO/HTML/lotes/'
        outpath = basepath + 'CONTEUDO/JSON/'
        return inpath1, inpath2, inpath3, outpath
    def get_header(self, soup):
        '''
        Return the auction's basic info (id, dates, summary) as a list
        of stripped strings.
        '''
        regex = re.compile('tribuchet-11-bold-claro|tribuchet-13-verde-escuro')
        # td #116 holds the header block -- layout-dependent, so this
        # breaks if e-Compras GDF ever changes its HTML structure
        tds = soup.find_all('td')[116]
        content = tds.find_all('td', class_ = regex)
        header = [element.text.strip() for element in content]
        return header
    def get_registration(self, header):
        '''
        Return the registration period's dates and hours, sliced from
        fixed character positions of the 4th header field.
        '''
        start_date = header[3][3:13]
        start_hour = header[3][14:22]
        end_date = header[3][27:37]
        end_hour = header[3][38:46]
        registration = {'data_inicio': start_date,
                        'hora_inicio': start_hour,
                        'data_fim': end_date,
                        'hora_fim': end_hour}
        return registration
    def get_opening(self, header):
        '''
        Return the bids' opening date and hour, sliced from fixed
        character positions of the 6th header field.
        '''
        opening_date = header[5][:10]
        opening_hour = header[5][10:19]
        opening = {'data_abertura': opening_date,
                   'hora_abertura': opening_hour}
        return opening
    def get_general_info(self, html):
        '''
        Return the auction's basic info as a Python dict.
        '''
        # NOTE(review): no explicit parser passed to BeautifulSoup;
        # kept as is to preserve whatever parser the original runs used
        soup = BeautifulSoup(html)
        header = self.get_header(soup)
        general_info = {'resumo': header[1],
                        'inscricao': self.get_registration(header),
                        'abertura': self.get_opening(header)}
        return general_info
    def to_number(self, as_string):
        '''
        Convert a Brazilian-formatted number string ('1.234,56') to
        float. Raises ValueError on malformed input.
        '''
        as_string = as_string.replace('.', '')  # drop thousands separators
        as_string = as_string.replace(',', '.')  # decimal comma -> point
        return float(as_string)
    def fix_problem_field(self, field):
        '''
        Split a field into (quantity, unit). Returns
        ('invalid_value', 'invalid_value') when the field cannot be
        parsed (bug fix: bare `except:` narrowed to the two exceptions
        the parsing can actually raise, so e.g. KeyboardInterrupt is no
        longer swallowed).
        '''
        splitted = field.split()[:2]
        try:
            quantity = self.to_number(splitted[0])
            unity = splitted[1]
            return quantity, unity
        except (IndexError, ValueError):
            return 'invalid_value', 'invalid_value'
    def to_json(self, auction, i):
        '''
        Dump the auction's data to a JSON file named after its id.
        '''
        fname = self.get_paths()[3] + 'licitacao_id_{}.json'.format(i)
        # binary mode works because json.dump writes (byte) str under
        # Python 2, which this script targets
        with open(fname, mode = 'wb') as destination:
            json.dump(auction, destination)
class SingleAuction(Auction):
    '''
    Parser for auctions whose items are bid on individually
    (not grouped in lots).
    '''

    # matches an item code, e.g. '1.2.34.56.78.90.1234.567890-12'
    # (bug fix: raw string -- the original used bare '\d' escapes --
    # and compiled once instead of per call)
    ITEM_CODE = re.compile(r'\d\.\d\.\d{2}\.\d{2}\.\d{2}\.\d{2}\.\d{4}\.\d{6}-\d{2}')

    def parse_tables(self, soup):
        '''
        Return the tables contained in the HTML.

        Each table is a list of rows; every row is a list of cell
        strings plus a trailing 'winner'/'' marker (rows containing an
        <img> mark the winning bid). A final 'item' or 'bids' tag is
        appended to each table, telling which kind of table it is.
        '''
        tables = []
        raw_tables = soup.find_all('table')[1:-5]
        for raw_table in raw_tables:
            # skip tables whose first cell is empty
            if not raw_table.find('td').text:
                continue
            table = []
            for row in raw_table.find_all('tr')[1:]:
                text = [cell.text.strip() for cell in row.find_all('td')]
                if row.find_all('img'):
                    text.append('winner')
                else:
                    text.append('')
                table.append(text)
            if not table:
                continue
            # tag the table: first row carrying an item code -> 'item'
            if self.ITEM_CODE.match(table[0][1]):
                table.append('item')
            else:
                table.append('bids')
            tables.append(table)
        return tables
    def organize_tables_data(self, tables):
        '''
        Return the tables as a dict mapping table index to a dict with
        the item's first row and its (possibly None) bid rows.
        '''
        data = {}
        for i, table in enumerate(tables):
            bids = None
            if table[-1] == 'item':
                # an item's bids, if any, come in the very next table
                if i + 1 < len(tables):
                    if tables[i+1][-1] == 'bids':
                        bids = tables[i+1][:-1]
            # NOTE(review): 'bids' tables are also stored here as if
            # they were items -- preserved as is, but confirm intent
            data[i] = {'item': table[0][:-1], 'bids': bids}
        return data
    def get_item(self, pair):
        '''
        Return the auctioned item's data (code, quantity, description,
        estimated unit/total prices) as a dict.
        '''
        fields = pair['item']
        item = {}
        item['codigo'] = fields[1]
        item['qtde_cotada'] = self.to_number(fields[2])
        item['descricao'] = fields[3]
        item['est_preco_uni'] = self.to_number(fields[4])
        item['est_preco_tot'] = self.to_number(fields[5])
        return item
    def get_bid(self, bid):
        '''
        Return a single bid's data as a dict.
        '''
        lance = {}
        lance['participante'] = bid[0]
        lance['qtde'], lance['unidade'] = self.fix_problem_field(bid[1])
        lance['preco_uni'] = self.to_number(bid[2])
        lance['preco_tot'] = self.to_number(bid[3])
        lance['diferenca'] = self.to_number(bid[4])
        lance['preco_neg'] = bid[5]
        # 8-field rows carry the brand ('marca') as an extra column
        if len(bid) == 8:
            lance['marca'] = bid[6]
        # last field is the 'winner'/'' marker added by parse_tables
        lance['vencedor'] = bid[-1] == 'winner'
        return lance
    def get_item_bids(self, data):
        '''
        Return the item-bids pairs, with every bid parsed into a dict
        (or None when the item received no bids).
        '''
        item_bids = {}
        for key in data:
            pair = data[key]
            new_pair = {'item': self.get_item(pair)}
            if pair['bids']:
                lances = {}
                for i, bid in enumerate(pair['bids']):
                    lances[i] = self.get_bid(bid)
                new_pair['lances'] = lances
            else:
                new_pair['lances'] = None
            item_bids[key] = new_pair
        return item_bids
    def export_data(self, html):
        '''
        Return the auction's data parsed from its HTML.
        '''
        soup = BeautifulSoup(html)
        tables = self.parse_tables(soup)
        data = self.organize_tables_data(tables)
        return self.get_item_bids(data)
class LotsAuction(Auction):
    '''
    Parser for auctions whose items are grouped in lots.
    '''
    def get_htmls(self, i):
        '''
        Return the HTML of every bid/lot of the auction with id *i*.
        '''
        path = self.get_paths()[2] + i + '/'
        htmls = []
        # sorted() makes the output ordering deterministic across runs
        # (os.listdir order is arbitrary); also: renamed the loop var,
        # which shadowed the `file` builtin
        for fname in sorted(os.listdir(path)):
            with open(path + fname, mode = 'rb') as source:
                htmls.append(source.read())
        return htmls
    def get_bidder_cnpj(self, field):
        '''
        Return the bidder's corporate name and CNPJ. The field is
        assumed to end with the CNPJ in parentheses -- TODO confirm
        against the scraped pages.
        '''
        field = field.strip()
        cnpj = field.split()[-1]
        cnpj = cnpj.replace('(', '')
        cnpj = cnpj.replace(')', '')
        # the name is everything before ' (' + the CNPJ
        i = field.index(cnpj)
        bidder = field[:i-2]
        return bidder, cnpj
    def get_info(self, bid):
        '''
        Return the bid/lot's general data (lot number, lot price,
        bidder name and CNPJ) as a dict.
        '''
        info = {}
        info['num_lote'] = int(bid[2])
        info['preco_lote'] = self.to_number(bid[3])
        # parse the bidder field once instead of twice
        bidder, cnpj = self.get_bidder_cnpj(bid[5])
        info['participante'] = bidder
        info['cnpj'] = cnpj
        return info
    def get_item(self, bid, i):
        '''
        Return the data of a single item of the bid/lot; its five
        fields start at index *i*.
        '''
        item_bid = {}
        item_bid['descricao'] = bid[i]
        # split quantity/unit once instead of twice
        quantity, unity = self.fix_problem_field(bid[i+1])
        item_bid['quantidade'] = quantity
        item_bid['unidade'] = unity
        item_bid['preco_uni'] = self.to_number(bid[i+2])
        item_bid['preco_tot'] = self.to_number(bid[i+3])
        item_bid['marca'] = bid[i+4]
        return item_bid
    def get_bids(self, htmls):
        '''
        Return the bids/lots of the auction, one list of text fields
        per bid HTML.
        '''
        bids = []
        for html in htmls:
            soup = BeautifulSoup(html)
            fields = soup.find_all(class_ = 'nota_tit')
            bids.append([field.text for field in fields])
        return bids
    def organize_bids(self, bids):
        '''
        Organize the bids/lots into a dict: general info plus the
        parsed items of each bid.
        '''
        data = {}
        for n, bid in enumerate(bids):
            new_bid = {}
            new_bid['lance_info'] = self.get_info(bid)
            items = {}
            item_fields = bid[11:]  # item data starts at field 11
            # lots w/o bids carry a placeholder message instead of item
            # fields; the check is invariant, so it was hoisted out of
            # the loop (the original re-sliced and re-tested every pass)
            has_items = item_fields and \
                u'Este Lote n\u00E3o possui Lances' not in item_fields[0]
            if has_items:
                # each item spans five consecutive fields
                for k, idx in enumerate(range(0, len(item_fields), 5)):
                    items[k] = self.get_item(item_fields, idx)
            new_bid['lance_itens'] = items
            data[n] = new_bid
        return data
    def export_data(self, i):
        '''
        Return the auction's data parsed from its per-lot HTML files.
        '''
        htmls = self.get_htmls(i)
        bids = self.get_bids(htmls)
        return self.organize_bids(bids)
def do_it(): | |
''' | |
roda a coisa toda | |
''' | |
auction = Auction() | |
path1, path2 = auction.get_paths()[:2] | |
n = 0 | |
for file1, file2 in zip(os.listdir(path1), os.listdir(path2)): | |
if 'licitacao' in file1: | |
if file1 == file2: | |
n += 1 | |
print file1, n | |
i = file1[13:].replace('.html', '') | |
with open(path1 + file1, mode = 'rb') as buffer1: | |
html1 = buffer1.read() | |
if '<h1>Service Unavailable</h1>' in html1: | |
continue | |
general_info = auction.get_general_info(html1) | |
data = {} | |
data['info_geral'] = general_info | |
with open(path2 + file2, mode = 'rb') as buffer2: | |
html2 = buffer2.read() | |
if 'onclick="detalhamento' in html2: | |
auction = LotsAuction() | |
data['lances'] = auction.export_data(i) | |
else: | |
auction = SingleAuction() | |
data['objetos'] = auction.export_data(html2) | |
auction.to_json(data, i) | |
fullpath = get_paths()[3] + 'licitacao_id_{}.json'.format(i) | |
if not os.path.exists(fullpath): | |
print 'FILE NOT FOUND!' | |
print 'press any key to continue' | |
raw_input() | |
# run only when executed as a script, not when imported as a module
if __name__ == '__main__':
    do_it()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment