Skip to content

Instantly share code, notes, and snippets.

@anibalpacheco
Last active June 6, 2020 06:31
Show Gist options
  • Save anibalpacheco/f06b169295704e9e0f99 to your computer and use it in GitHub Desktop.
la diaria frontpage title spider
# -*- coding: utf-8 -*-
import re
import operator
import cssutils
import logging
from datetime import date, datetime, timedelta
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from scrapy.http import Request
from titulos import local_settings
from titulos.items import TituloItem, TituloLoader
# When True (the default), only today's edition is scraped.
today_only = getattr(local_settings, 'TODAY_ONLY', True)

# Optional 'YYYY-MM-DD' setting restricting the scrape to a single date.
try:
    specific_date_only = datetime.strptime(
        getattr(local_settings, 'SPECIFIC_DATE_ONLY'), '%Y-%m-%d').date()
except (AttributeError, TypeError, ValueError):
    # Setting absent (AttributeError) or not a valid 'YYYY-MM-DD' string
    # (TypeError/ValueError): no specific-date restriction.
    specific_date_only = None
class CompartidoSpider(CrawlSpider):
    """Scrape la diaria frontpage titles from the archived PDF editions.

    Crawls the shared archive listing, locates per-edition PDF files,
    pushes each one through the pdftohtml service and extracts the
    frontpage title from the resulting HTML based on CSS font sizes.
    """

    name = "compartido"
    allowed_domains = ["compartido.ladiaria", "pdftohtml.ladiaria"]
    start_urls = (
        'http://compartido.ladiaria/Archivo/PublicacionesCompletas/',
        #'http://compartido.ladiaria/Archivo/PublicacionesCompletas/2006/200604_ABRIL/',
        #'http://compartido.ladiaria/Archivo/PublicacionesCompletas/2006/200606_JUNIO/',
        #'http://compartido.ladiaria/Archivo/PublicacionesCompletas/2008/200803_MARZO/',
        #'http://compartido.ladiaria/Archivo/PublicacionesCompletas/2014/2014_01ENERO/',
        #'http://compartido.ladiaria/Archivo/PublicacionesCompletas/2015_04ABRIL/',
        #'http://compartido.ladiaria/Archivo/PublicacionesCompletas/2015_08AGOSTO/',
    )

    # Comment rules or set RULES=False if your SPECIFIC_DATE_ONLY pdf is on the
    # base path.
    # Change rule to a more precise year-month, etc. where your target pdf is.
    if not today_only and getattr(local_settings, 'RULES', True):
        rules = (
            Rule(LinkExtractor(allow=('/20', )), callback='parse_internal'),
        )

    # Dates for which an edition PDF was found; checked for gaps in close().
    # NOTE(review): class-level mutable — fine for a single spider instance,
    # shared across instances otherwise.
    pdfdates = []
    # Editions from this date on use the new frontpage design (biggest font
    # holds the title; older editions use the second biggest).
    new_style_date = date(2008, 3, 20)

    def __init__(self, yesterday=False, *args, **kwargs):
        """Accept -a yesterday=1 on the command line to target yesterday."""
        super(CompartidoSpider, self).__init__(*args, **kwargs)
        self.yesterday = yesterday == u'1'

    def parse_internal(self, response):
        """
        @url http://compartido.ladiaria/Archivo/PublicacionesCompletas/2014/2014_01ENERO/
        @returns requests 22
        """
        dates, requests = self.find_pdf_urls(response)
        if not dates:
            # No PDFs at this level: descend into every linked sub-directory.
            le = LinkExtractor()
            for url in le.extract_links(response):
                requests.append(Request(url.url))
        else:
            self.pdfdates.extend(dates)
        return requests

    def parse_start_url(self, response):
        """Handle the archive root page like any internal listing page."""
        dates, requests = self.find_pdf_urls(response)
        self.pdfdates.extend(dates)
        return requests

    def find_pdf_urls(self, response):
        """
        Return a tuple containing the dates for the pdf found and the list of
        requests to process with those pdf.

        If today_only it scrapes only today's pdf (or for a specific date).
        """
        dates, requests = [], []
        le = LinkExtractor(
            deny_extensions=[], restrict_xpaths='//tr[position()>3]')
        for url in le.extract_links(response):
            # Filename pattern: ladiaria_YYYYMMDD.pdf (years 2000-2029).
            # The dot before "pdf" is escaped: a bare "." also matched any
            # character there.
            m = re.match(
                r'.*ladiaria_(20[012]\d)([01]\d)([0123]\d)\.pdf$', url.url)
            if not m:
                continue
            pdfdate = date(*(int(x) for x in m.groups()))
            if not today_only or pdfdate == date.today() or (
                self.yesterday and pdfdate == date.today() - timedelta(1)
            ) or specific_date_only and pdfdate == specific_date_only:
                # HEAD request triggers the pdftohtml conversion service.
                request = Request(
                    'http://pdftohtml.ladiaria/?url=' + url.url,
                    callback=self.parse_pages, method='HEAD')
                request.meta['pdfdate'] = pdfdate
                requests.append(request)
                dates.append(pdfdate)
        return dates, requests

    def parse_pages(self, response):
        """
        @url http://pdftohtml.ladiaria/?url=http://compartido.ladiaria/Archivo/PublicacionesCompletas/2014/2014_01ENERO/ladiaria_20140131.pdf
        @returns requests 1
        """
        # salida-1.html is the HTML rendering of the first (front) page.
        request = Request(
            re.sub(r'/salida.html$', '/salida-1.html', response.url),
            callback=self.parse_frontpage)
        request.meta['pdfdate'] = response.meta.get('pdfdate')
        return request

    def parse_frontpage(self, response):
        """
        Parses the newspaper frontpage in HTML, it detects first the font
        sizes using cssutils.

        Based on the biggest font (or the second biggest for the old style
        design) it scrapes the frontpage title.
        """
        sheet = cssutils.parseString(
            response.xpath("//style/text()").extract()[0])
        # Map pdftohtml font classes (".ftNN") to their pixel size.
        fontsizes = {}
        for rule in sheet:
            name = rule.selectorList.selectorText
            if name.startswith('.ft'):
                fontsizes[name] = int(
                    re.match(
                        r'(\d+)px', rule.style['font-size']).groups()[0])
        sorted_fontsizes = sorted(
            fontsizes.items(), key=operator.itemgetter(1))
        pdfdate = response.meta['pdfdate']
        old_style = pdfdate < self.new_style_date
        # Old design: title uses the second biggest font; new: the biggest.
        # [1:] strips the leading "." from the CSS selector text.
        tl = TituloLoader(
            TituloItem(), response.css(
                'p[class="%s"]' %
                sorted_fontsizes[-2 if old_style else -1][0][1:]))
        tl.add_xpath('title', ".//text()")
        tl.add_value('pdfdate', pdfdate)
        # Flag for manual review: the chosen font size ties with the next
        # smaller one, so the title pick is ambiguous.
        tl.add_value(
            'check',
            sorted_fontsizes[-2 if old_style else -1][1] ==
            sorted_fontsizes[-3 if old_style else -2][1])
        return tl.load_item()

    def close(self, reason=None):
        """
        Check if in all the week days we found a title, or if we found one in
        a weekend.

        ``reason`` is accepted (and ignored) because Scrapy passes the close
        reason when shutting the spider down.
        """
        if today_only:
            first_diaria = date.today()
        else:
            first_diaria = specific_date_only or date(2006, 3, 20)
        for single_date in (
                first_diaria + timedelta(n) for n in range(
                    (date.today() - first_diaria).days + 1)):
            if single_date.isoweekday() < 6:
                # Monday-Friday: an edition should exist.
                if single_date not in self.pdfdates:
                    logging.error(
                        "No hay diaria %s", single_date.strftime(u'%Y-%m-%d'))
            else:
                # Saturday/Sunday: an edition should NOT exist.
                if single_date in self.pdfdates:
                    logging.error(
                        "Salio sab o dom %s",
                        single_date.strftime(u'%Y-%m-%d'))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment