

@mabelvj
Created January 16, 2019 01:23
El Mundo Today Spider: crawls the elmundotoday.com archive for a given year (and optionally a month), yielding each article's title and publication date.
# -*- coding: utf-8 -*-
import scrapy
from scrapy.exceptions import CloseSpider
from datetime import datetime
from urllib.parse import urljoin


class EmtSpider(scrapy.Spider):
    name = 'emt'
    allowed_domains = ['www.elmundotoday.com']
    start_urls = ['https://www.elmundotoday.com/']
    # Let redirects and 404s reach the parse callback (by default Scrapy's
    # middlewares swallow them), so the spider can stop cleanly once it
    # runs past the last archive page.
    handle_httpstatus_list = [300, 301, 302, 404]

    def __init__(self, year=None, month=None, *args, **kwargs):
        """
        year: int (optional)
            If not set, defaults to the current year.
        month: int (optional)
            If set, the search is restricted to that month of the year;
            otherwise the whole year is crawled.
        """
        super(EmtSpider, self).__init__(*args, **kwargs)
        if not year:
            year = datetime.now().year
        # Archive URLs follow the pattern /<year>/ and /<year>/<month>/.
        # The trailing slash matters: without it, urljoin would replace
        # the last path segment instead of appending to it.
        self.__start_urls = [urljoin(u, '%d/' % int(year))
                             for u in self.start_urls]
        if month:
            self.__start_urls = [urljoin(u, '%02d/' % int(month))
                                 for u in self.__start_urls]

    def start_requests(self, number_of_pages=50):
        # Request the archive index and its paginated pages
        # (page/2 ... page/<number_of_pages>).
        for i in range(1, number_of_pages + 1):
            if i > 1:
                request_url = self.__start_urls[0] + 'page/%d' % i
            else:
                request_url = self.__start_urls[0]
            yield scrapy.Request(url=request_url, callback=self.parse,
                                 dont_filter=True)

    def parse(self, response):
        # A redirect or 404 means we have paged past the end of the archive.
        if response.status in (300, 301, 302, 404):
            raise CloseSpider('No more urls to go')
        entries = response.xpath(
            "//div[@class='td-ss-main-content']//a[@rel='bookmark']")
        titles = entries.xpath("text()").extract()
        post_urls = entries.xpath("@href").extract()
        for title, post_url in zip(titles, post_urls):
            yield scrapy.Request(url=post_url, callback=self.parse_post,
                                 dont_filter=True,
                                 meta={'title': title}, priority=1)

    def parse_post(self, response):
        # The publication date lives in a schema.org meta tag on each post.
        date = response.xpath(
            "//meta[@itemprop='datePublished']/@content").extract_first()
        return {'title': response.meta['title'], 'datePublished': date}
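
A minimal usage sketch (the invocation below is an assumption; the gist ships only the spider class, and the filename emt_spider.py is hypothetical). Save the code to a file and run it with Scrapy's runspider command, passing year and month as spider arguments:

    scrapy runspider emt_spider.py -a year=2018 -a month=5 -o emt.json

Each scraped item is a dict like {'title': '...', 'datePublished': '2018-05-03T10:00:00+00:00'} (field values illustrative), which the -o flag collects into a JSON feed.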