Skip to content

Instantly share code, notes, and snippets.

@dasdachs
Created October 14, 2018 23:57
Show Gist options
  • Save dasdachs/2118c71a73a83fea31a0d05474c63f6c to your computer and use it in GitHub Desktop.
Save dasdachs/2118c71a73a83fea31a0d05474c63f6c to your computer and use it in GitHub Desktop.
Check built docs for meta tags with scrapy
# -*- coding: utf-8 -*-
# To run it install Scrapy
# https://doc.scrapy.org/en/latest/intro/install.html
# And then do the following
# $ scrapy startproject [your_project_name]
# $ cd your_project_name
# $ scrapy genspider docs [docs_domain.tld]
#
# Erase the content of your_project_name/spider/docs.py
# and replace it with this code and run
#
# $ scrapy crawl docs -o check_meta_tags.csv
#
# Now you have a CSV file with all the endpoints and their meta tags
import re
import scrapy
from scrapy.linkextractors import LinkExtractor
class DocsSpider(scrapy.Spider):
    """Crawl a Read the Docs site and record each page's robots meta tag.

    Starting from the per-version ``genindex.html`` pages, follow every
    link and yield one item per page containing the docs version (parsed
    from the URL path), the version-relative URI, and the raw
    ``<meta name="robots">`` tag (``None`` when the page has no such tag).
    """

    # Matches the version segment between "/en/" and the next "/",
    # e.g. "latest" in ".../en/latest/genindex.html".
    VERSION_RE = re.compile(r'(?<=en/)[\w\d\.]+(?=/)')

    name = 'docs'
    allowed_domains = ['astro-docs.readthedocs.io']
    # The start_urls are hardcoded for the test I made.
    # Please replace them with your own or the official Astropy docs URLs.
    start_urls = [
        'https://astro-docs.readthedocs.io/en/latest/genindex.html',
        'https://astro-docs.readthedocs.io/en/v3.0.x/genindex.html',
        'https://astro-docs.readthedocs.io/en/v2.0.x_a/genindex.html',
    ]

    def parse(self, response):
        """Follow every link found on an index page and check its meta tags."""
        for link in LinkExtractor().extract_links(response):
            yield scrapy.Request(link.url, callback=self.check_meta)

    def check_meta(self, response):
        """Yield the docs version, relative URI, and robots meta tag of a page.

        Pages whose URL does not contain an ``/en/<version>/`` segment
        (e.g. redirect targets) are skipped rather than crashing on a
        ``None`` regex match.
        """
        match = self.VERSION_RE.search(response.url)
        if match is None:
            # No version segment in the URL -- nothing meaningful to report.
            return
        version = match.group()
        meta = response.xpath('//meta[contains(@name, "robots")]').extract_first()
        # Everything after the version segment, e.g. "/genindex.html".
        uri = response.url.split(version)[1]
        yield {
            "docs_version": version,
            "uri": uri,
            "meta_tag": meta,
        }
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment