#!/usr/bin/env python2
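"""Check cloud-images.ubuntu.com for URLs that do not return HTTP 200.

Directory listings are crawled recursively; downloadable files are checked
with HEAD requests. Results land in all.jl, and anything that did not return
200 is additionally written to failures.jl by SeparateFailuresItemPipeline.
Archived images under cloud-images-archive are skipped.
"""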
import json
import os
import re

import scrapy
from scrapy.crawler import CrawlerProcess

# Map Ubuntu version numbers to codenames so that, e.g., 14.04 and trusty are
# treated as the same release tree when crawling the releases/ listings.
RELEASE_MAP = {
    '8.04': 'hardy',
    '10.04': 'lucid',
    '10.10': 'maverick',
    '11.04': 'natty',
    '11.10': 'oneiric',
    '12.04': 'precise',
    '12.10': 'quantal',
    '13.04': 'raring',
    '13.10': 'saucy',
    '14.04': 'trusty',
    '14.10': 'utopic',
    '15.04': 'vivid',
    '15.10': 'wily',
}


class CloudImages403CheckingSpider(scrapy.Spider):
    name = 'cloud_images_403_checker'
    start_urls = ['http://cloud-images.ubuntu.com']
    handle_httpstatus_list = [200, 403, 404]

    def parse(self, response):
        if 'cloud-images-archive' in response.url:
            # We don't care about archived images
            return
        if response.status != 200:
            # We have a directory that is not returning 200
            yield {'url': response.url, 'status': response.status}
        for href in response.xpath('//a/@href'):
            url = href.extract()
            if url.startswith('http') or url.startswith('?'):
                # We don't want to go to a different domain or sort columns
                continue
            elif 'server' in url:
                # server is just a symlink to directories we will already
                # handle (and will nest to infinite depth)
                continue
            elif 'releases' in response.url and 'releases' in url:
                # releases contains a link to releases which nests infinitely
                continue
            if url.endswith('/'):
                # It's another listing page
                if response.url.endswith('releases/'):
                    # Deduplicate release names (so we don't treat, e.g.,
                    # 14.04 and trusty as completely separate trees)
                    for release, rewrite in RELEASE_MAP.items():
                        url = re.sub(
                            r'{}(\.\d)?'.format(release), rewrite, url)
                yield scrapy.Request(response.urljoin(url), self.parse)
            else:
                # It's a downloadable file; handle that differently
                yield scrapy.Request(response.urljoin(url),
                                     self.parse_downloadable,
                                     method='HEAD')

    def parse_downloadable(self, response):
        # Just write out the url and status code for the downloadable
        yield {'url': response.url, 'status': response.status}


class SeparateFailuresItemPipeline(object):

    def __init__(self):
        self._file = open('failures.jl', 'wb')

    def process_item(self, item, spider):
        if item['status'] != 200:
            # If the request wasn't OK, shove it in a separate file
            # (one JSON object per line, so append a newline)
            line = json.dumps(dict(item)) + '\n'
            self._file.write(line)
            self._file.flush()
        return item


if __name__ == '__main__':
    process = CrawlerProcess({
        'FEED_FORMAT': 'jsonlines',
        'FEED_URI': 'file://{}/all.jl'.format(os.getcwd()),
        'ITEM_PIPELINES': {
            'cloud_images_checker.SeparateFailuresItemPipeline': 500,
        },
    })
    process.crawl(CloudImages403CheckingSpider)
    process.start()
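
# Usage sketch: the ITEM_PIPELINES setting above loads
# 'cloud_images_checker.SeparateFailuresItemPipeline', which assumes this
# script is saved as cloud_images_checker.py. Run it under Python 2 with
# Scrapy installed, e.g.:
#
#     python2 cloud_images_checker.py
#
# Results should end up in all.jl (every checked URL) and failures.jl (only
# the non-200 ones) in the current working directory.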