Skip to content

Instantly share code, notes, and snippets.

@dasdachs
Created October 14, 2018 23:57
Show Gist options
  • Save dasdachs/2118c71a73a83fea31a0d05474c63f6c to your computer and use it in GitHub Desktop.
Save dasdachs/2118c71a73a83fea31a0d05474c63f6c to your computer and use it in GitHub Desktop.
Check built docs for meta tags with scrapy
# -*- coding: utf-8 -*-
# To run it install Scrapy
# https://doc.scrapy.org/en/latest/intro/install.html
# And then do the following
# $ scrapy startproject [your_project_name]
# $ cd your_project_name
# $ scrapy genspider docs [docs_domain.tld]
#
# Erase the content of your_project_name/spider/docs.py
# and replace it with this code and run
#
# $ scrapy crawl docs -o check_meta_tags.csv
#
# Now you have a CSV file with all the endpoints and their meta tags
import re
import scrapy
from scrapy.linkextractors import LinkExtractor
class DocsSpider(scrapy.Spider):
    """Crawl a Read the Docs site and record each page's robots meta tag.

    Starting from the per-version ``genindex.html`` pages, follow every
    link and yield one item per page containing the docs version (parsed
    from the URL path), the version-relative URI, and the raw
    ``<meta name="robots">`` tag (``None`` when the page has no such tag).
    """

    # Matches the version segment between "/en/" and the next "/",
    # e.g. "latest" in ".../en/latest/genindex.html".
    VERSION_RE = re.compile(r'(?<=en/)[\w\d\.]+(?=/)')

    name = 'docs'
    allowed_domains = ['astro-docs.readthedocs.io']
    # The start_urls are hardcoded for the test I made.
    # Please replace them with your own or the official Astropy docs URLs.
    start_urls = [
        'https://astro-docs.readthedocs.io/en/latest/genindex.html',
        'https://astro-docs.readthedocs.io/en/v3.0.x/genindex.html',
        'https://astro-docs.readthedocs.io/en/v2.0.x_a/genindex.html',
    ]

    def parse(self, response):
        """Follow every link found on an index page and check its meta tags."""
        for link in LinkExtractor().extract_links(response):
            yield scrapy.Request(link.url, callback=self.check_meta)

    def check_meta(self, response):
        """Yield the docs version, relative URI, and robots meta tag of a page.

        Pages whose URL does not contain an ``/en/<version>/`` segment
        (e.g. redirect targets) are skipped rather than crashing on a
        ``None`` regex match.
        """
        match = self.VERSION_RE.search(response.url)
        if match is None:
            # No version segment in the URL -- nothing meaningful to report.
            return
        version = match.group()
        meta = response.xpath('//meta[contains(@name, "robots")]').extract_first()
        # Everything after the version segment, e.g. "/genindex.html".
        uri = response.url.split(version)[1]
        yield {
            "docs_version": version,
            "uri": uri,
            "meta_tag": meta,
        }
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment