@linuxsoares
Created February 19, 2019 17:37
Crawler to fetch aircraft data from ANAC

main.py:
from dataclasses import dataclass

import unidecode
from bs4 import BeautifulSoup
from scrapy.http import FormRequest
from scrapy.spiders import Spider


@dataclass
class Registry:
    """Fields scraped from ANAC's RAB (Registro Aeronáutico Brasileiro) query result."""
    proprietario: str = ''
    # CPF/CNPJ appears on the page for both owner and operator; with the
    # label-based lookup in parse1 the last occurrence wins.
    cpf_cnpj: str = ''
    operador: str = ''
    fabricante: str = ''
    ano_de_fabricacao: str = ''
    modelo: str = ''
    tipo_icao: str = ''
    tipo_de_habilitacao_para_pilotos: str = ''
    classe_da_aeronave: str = ''
    peso_maximo_de_decolagem: str = ''
    numero_maximo_de_passageiros: str = ''
    tipo_de_voo_autorizado: str = ''
    categoria_de_registro: str = ''
    numero_dos_certificados_cm__ca: str = ''
    situacao_no_rab: str = ''
    data_da_compra_transferencia: str = ''
    data_de_validade_do_ca: str = ''
    data_de_validade_da_iam: str = ''
    situacao_de_aeronavegabilidade: str = ''
    motivos: str = ''


class GitSpider(Spider):
    name = "github"
    allowed_domains = ["sistemas.anac.gov.br"]
    start_urls = ["https://sistemas.anac.gov.br/aeronaves/cons_rab.asp"]

    def parse(self, response):
        # Submit the RAB search form: 'txmtc' is the registration-mark field
        # and 'enviar' is the submit button.
        formdata = {
            'txmtc': 'Ptosp',
        }
        yield FormRequest.from_response(
            response,
            formdata=formdata,
            clickdata={'name': 'enviar'},
            callback=self.parse1,
        )

    def parse1(self, response):
        data = response.xpath('//table[@width="98%"]')
        soup = BeautifulSoup(data[1].extract(), 'html.parser')
        registry = Registry()
        get_data = False
        attr = ''
        # The result table alternates label/value <div>s. Each label is
        # normalized into a Registry field name; when it matches, the next
        # <div> is stored as that field's value.
        for item in soup.find_all('div'):
            item = (item.text.replace(':', '').strip()
                    .replace('(', '').replace(')', '')
                    .replace('-', '').replace('/', '_')
                    .replace(' ', '_').lower())
            item = unidecode.unidecode(item)
            if get_data:
                setattr(registry, attr, item.replace('_', ' ').capitalize())
                get_data = False
            if hasattr(registry, item):
                get_data = True
                attr = item
        print(registry)
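The matching in parse1 works because each visible label is normalized into a valid Registry attribute name before the hasattr check. A small standalone sketch of that same normalization chain (not part of the gist, shown only to illustrate the mapping):

import unidecode

def normalize(label):
    # Same chain used in parse1: drop punctuation, join words with '_',
    # lowercase, and strip accents so the label matches a Registry field name.
    label = (label.replace(':', '').strip()
             .replace('(', '').replace(')', '')
             .replace('-', '').replace('/', '_')
             .replace(' ', '_').lower())
    return unidecode.unidecode(label)

print(normalize('Ano de Fabricação:'))             # -> ano_de_fabricacao
print(normalize('Data da Compra/Transferência'))   # -> data_da_compra_transferencia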
Makefile:

help: ## This help
	@awk 'BEGIN {FS = ":.*?## "} /^[a-zA-Z_-]+:.*?## / {printf "\033[36m%-20s\033[0m %s\n", $$1, $$2}' $(MAKEFILE_LIST) | sort

clean: ## Clean local environment
	@find . -name "*.pyc" | xargs rm -rf
	@find . -name "*.pyo" | xargs rm -rf
	@find . -name "__pycache__" -type d | xargs rm -rf
	@rm -f .coverage
	@rm -rf htmlcov/
	@rm -f coverage.xml
	@rm -f *.log

flake8: ## Run flake8 command
	@flake8 --show-source .

fix-python-import: ## Organize python imports
	@isort -rc .

check-python-import: ## Check python imports
	@isort --check -rc .

lint: clean flake8 check-python-import ## Run code lint

requirements-dev: ## Install development dependencies
	@pip install -U -r requirements.txt

outdated: ## Show outdated dependencies
	@pip list --outdated --format=columns

run:
	scrapy runspider main.py
requirements.txt:

beautifulsoup4==4.7.1
Scrapy==1.6.0
unidecode==1.0.23