Skip to content

Instantly share code, notes, and snippets.

@knezenk
Created December 5, 2021 20:38
Show Gist options
  • Save knezenk/7dd99c6d28ff82c5be6ba08c6b94fd36 to your computer and use it in GitHub Desktop.
Save knezenk/7dd99c6d28ff82c5be6ba08c6b94fd36 to your computer and use it in GitHub Desktop.
def start_requests(self):
    """Kick off the crawl: one request per vehicle record from the database."""
    for registro in tqdm(self.veiculos):
        # Carry the DB id and URL into the callback via cb_kwargs.
        dados = {
            "Id": str(registro["_id"]),
            "Url": registro["Url"],
        }
        # HERE IT ISSUES THE REQUEST FOR EACH DIFFERENT SITE COMING FROM MY DB:
        yield scrapy.Request(
            dados["Url"],
            callback=self.parse,
            cb_kwargs=dados,
        )
def parse(self, response, Url, Id=None):
    """Collect candidate URLs for one vehicle site.

    Tries the site's XML sitemap first (``self.get_xml_links``); if that
    raises, falls back to harvesting every ``<a href>`` on the main page,
    skipping anything listed in ``self.listaIgnores``.

    Args:
        response: Scrapy Response for the vehicle's main URL.
        Url: the vehicle's URL as stored in the database (from cb_kwargs).
        Id: the vehicle's database id (from cb_kwargs). Accepting it here
            fixes the TypeError previously raised on every callback:
            start_requests sends cb_kwargs={"Id": ..., "Url": ...} but the
            old signature only declared Url.
    """
    # Analysis metadata for this crawl (presumably persisted to Mongo
    # further down — TODO confirm, the method appears truncated here).
    mongo_dict = {
        "url": response.url,
        "created_at": datetime.now(),
        "resolve_time": response.meta.get("download_latency"),
    }
    documents = []
    urls = []
    try:
        urls = self.get_xml_links(Url)
    except Exception:
        # Fix: print_exc() already writes the traceback; the old
        # print(traceback.print_exc()) additionally printed "None".
        traceback.print_exc()
        print("INICIANDO A EXTRAÇÃO GERAL PELA PAGINA PRINCIPAL")
        # Fallback: grab every href on the main page.
        # NOTE(review): loop placement inside the except block is
        # reconstructed from the mangled paste — confirm against original.
        for link in response.css("a::attr(href)").extract():
            url = response.urljoin(link)
            if str(url) not in self.listaIgnores:
                urls.append(url)
    # Deduplicate whichever source the URLs came from.
    urls = list(set(urls))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment