Skip to content

Instantly share code, notes, and snippets.

@joffilyfe
Last active March 4, 2021 20:34
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save joffilyfe/eb49c8c013b7c705c0837b8e71fdedf8 to your computer and use it in GitHub Desktop.
Save joffilyfe/eb49c8c013b7c705c0837b8e71fdedf8 to your computer and use it in GitHub Desktop.
import logging
from typing import List
import scrapy
# Module-level logger, named after this module.
LOGGER = logging.getLogger(__name__)
# Substrings that mark a link as "do not follow": this list is the default
# pattern list of should_ignore_link(), which skips any href containing one
# of these entries anywhere in its text.
IGNORE_PATTERNS_LIST = ["pressreleases", "blog", "@", "#", "javascript"]
def should_ignore_link(
    link, ignore_pattern_list: List[str] = IGNORE_PATTERNS_LIST
) -> bool:
    """Tell whether a link should be ignored based on the pattern list.

    Args:
        link: The link text (typically an ``href`` value) to inspect.
        ignore_pattern_list: Substrings to look for; defaults to the
            module-level ``IGNORE_PATTERNS_LIST``.

    Returns:
        True if at least one pattern occurs as a substring of ``link``,
        False otherwise (including when the pattern list is empty).
    """
    return any(pattern in link for pattern in ignore_pattern_list)
class ScieloSpider(scrapy.Spider):
    """Spider that crawls new.scielo.br by following the anchors of each page."""

    name = "scielospider"
    start_urls = ["https://new.scielo.br"]
    download_timeout = 360

    def parse(self, response):
        """Yield follow-up requests for every eligible ``<a>`` on the page.

        A link is skipped when its ``href`` is empty, when it matches one of
        the ignore patterns (see ``should_ignore_link``), or when it looks
        like an absolute URL that does not mention ``new.scielo.br``.

        Args:
            response: The downloaded page handed over by Scrapy.

        Yields:
            Requests created with ``response.follow`` whose callback is this
            same method, so the crawl continues recursively.
        """
        # Responses whose URL mentions "pdf" are not scanned for links.
        if "pdf" in response.url:
            return

        for next_page in response.selector.xpath("//a"):
            link = next_page.attrib.get("href", "").strip()
            if not link or should_ignore_link(link):
                continue

            # Group the "looks absolute" checks explicitly: `and` binds
            # tighter than `or`, so without this grouping every href that
            # contains "//" or "http" is dropped, including in-site ones.
            looks_absolute = "//" in link or "http" in link or "www." in link
            if looks_absolute and "new.scielo.br" not in link:
                continue

            # Log the resolved URL; concatenating the start URL with the raw
            # href is wrong for absolute and path-relative links.
            LOGGER.info("Follow '%s'.", response.urljoin(link))
            yield response.follow(next_page, self.parse)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment