joffilyfe/new_scielo_spider.py

## new_scielo_spider.py
import logging
from typing import List

import scrapy

# Module-level logger, named after this module.
LOGGER = logging.getLogger(__name__)

# Default substrings that mark a link as ignorable. This list is the default
# ``ignore_pattern_list`` of ``should_ignore_link``, which tests each entry
# with a plain ``in`` check against the link text.
IGNORE_PATTERNS_LIST = ["pressreleases", "blog", "@", "#", "javascript"]


def should_ignore_link(
    link, ignore_pattern_list: List[str] = IGNORE_PATTERNS_LIST
) -> bool:
    """Determine whether a link must be ignored, based on a pattern list.

    Args:
        link: The link text to inspect.
        ignore_pattern_list: Substrings to look for inside ``link``.

    Returns:
        ``True`` if at least one pattern occurs in ``link``, else ``False``.
    """
    return any(pattern in link for pattern in ignore_pattern_list)


class ScieloSpider(scrapy.Spider):
    """Spider that walks ``new.scielo.br`` by following anchors recursively.

    Every response is scanned for ``<a>`` elements and each acceptable
    ``href`` is scheduled with ``parse`` as its callback again.
    """

    name = "scielospider"
    start_urls = ["https://new.scielo.br"]
    # Scrapy per-spider download timeout, in seconds.
    download_timeout = 360

    def parse(self, response):
        """Yield follow-up requests for the links found in ``response``.

        Responses whose URL contains ``"pdf"`` are not parsed at all.
        An anchor is skipped when its ``href`` is empty, when
        ``should_ignore_link`` flags it, or when it looks like an absolute
        URL that does not mention ``new.scielo.br``.
        """
        if "pdf" in response.url:
            return

        for next_page in response.selector.xpath("//a"):
            link = next_page.attrib.get("href", "").strip()

            if not link or should_ignore_link(link):
                continue

            # The three "absolute URL" hints are grouped before the domain
            # test. Without the grouping, ``and`` binds tighter than ``or``
            # and every link containing "//" or "http" is dropped, even
            # when it points at new.scielo.br.
            looks_absolute = "//" in link or "http" in link or "www." in link
            # NOTE(review): this is a substring test; an off-site URL that
            # merely mentions the domain (e.g. in a query string) passes.
            if looks_absolute and "new.scielo.br" not in link:
                continue

            # Log the resolved target rather than gluing the site root to
            # the raw href, which is wrong for nested or absolute links.
            LOGGER.info("Follow '%s'.", response.urljoin(link))

            yield response.follow(next_page, self.parse)
	import logging
	from typing import List

	import scrapy

	# Module-level logger, named after this module.
	LOGGER = logging.getLogger(__name__)

	# Default substrings that mark a link as ignorable. This list is the default
	# ``ignore_pattern_list`` of ``should_ignore_link``, which tests each entry
	# with a plain ``in`` check against the link text.
	IGNORE_PATTERNS_LIST = ["pressreleases", "blog", "@", "#", "javascript"]


	def should_ignore_link(
	    link, ignore_pattern_list: List[str] = IGNORE_PATTERNS_LIST
	) -> bool:
	    """Determine whether a link must be ignored, based on a pattern list.

	    Args:
	        link: The link text to inspect.
	        ignore_pattern_list: Substrings to look for inside ``link``.

	    Returns:
	        ``True`` if at least one pattern occurs in ``link``, else ``False``.
	    """
	    return any(pattern in link for pattern in ignore_pattern_list)


	class ScieloSpider(scrapy.Spider):
	    """Spider that walks ``new.scielo.br`` by following anchors recursively.

	    Every response is scanned for ``<a>`` elements and each acceptable
	    ``href`` is scheduled with ``parse`` as its callback again.
	    """

	    name = "scielospider"
	    start_urls = ["https://new.scielo.br"]
	    # Scrapy per-spider download timeout, in seconds.
	    download_timeout = 360

	    def parse(self, response):
	        """Yield follow-up requests for the links found in ``response``.

	        Responses whose URL contains ``"pdf"`` are not parsed at all.
	        An anchor is skipped when its ``href`` is empty, when
	        ``should_ignore_link`` flags it, or when it looks like an absolute
	        URL that does not mention ``new.scielo.br``.
	        """
	        if "pdf" in response.url:
	            return

	        for next_page in response.selector.xpath("//a"):
	            link = next_page.attrib.get("href", "").strip()

	            if not link or should_ignore_link(link):
	                continue

	            # The three "absolute URL" hints are grouped before the domain
	            # test. Without the grouping, ``and`` binds tighter than ``or``
	            # and every link containing "//" or "http" is dropped, even
	            # when it points at new.scielo.br.
	            looks_absolute = "//" in link or "http" in link or "www." in link
	            # NOTE(review): this is a substring test; an off-site URL that
	            # merely mentions the domain (e.g. in a query string) passes.
	            if looks_absolute and "new.scielo.br" not in link:
	                continue

	            # Log the resolved target rather than gluing the site root to
	            # the raw href, which is wrong for nested or absolute links.
	            LOGGER.info("Follow '%s'.", response.urljoin(link))

	            yield response.follow(next_page, self.parse)