Skip to content

Instantly share code, notes, and snippets.

@mhz-tamb
Created May 22, 2017 11:58
Show Gist options
  • Save mhz-tamb/01a2d83598337e74999da88605c49485 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python3
from argparse import ArgumentParser
from urllib.parse import urlparse
from scrapy.crawler import CrawlerProcess
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
# Parse the target domain from the command line. This runs at import time
# because the spider class below reads `domain` in its class attributes.
parser = ArgumentParser(description='Crawl a site and export per-page SEO metadata to CSV.')
# required=True turns a missing -d into a clear usage error; without it,
# args.domain is None and urlparse(None) raises a TypeError later.
parser.add_argument('-d', '--domain', required=True,
                    help='start URL of the site to crawl, e.g. https://example.com')
args = parser.parse_args()
# NOTE(review): assumes the value includes a scheme (http:// or https://);
# without one, urlparse puts the host into .path, leaving .netloc empty,
# so the spider would get no allowed domain — confirm with callers/usage.
domain = urlparse(args.domain)
class SiteSpider(CrawlSpider):
    """Crawl every internal link of the target site and collect basic SEO
    metadata (URL, HTTP status, first h1, title, meta keywords/description)
    for each visited page.

    Reads the module-level `domain` (a urlparse result) for its start URL
    and allowed-domains restriction.
    """

    name = 'SiteSpider'
    start_urls = [domain.geturl()]
    # Restrict the crawl to the host given on the command line.
    allowed_domains = [domain.netloc]
    # allow=() (no pattern) extracts every link on a page; follow=True keeps
    # the spider recursing through extracted links after parsing them.
    rules = (
        Rule(LinkExtractor(allow=()), callback='parse_item', follow=True),
    )
    # NOTE(review): `data` is never used in this file — kept for
    # compatibility in case external code reads it; consider removing.
    data = []

    def parse_item(self, response):
        """Return one metadata dict per crawled page; Scrapy routes these
        items into the configured CSV feed export."""
        return {
            'url': response.url,
            'status': response.status,
            'h1': response.css('h1::text').extract_first(),
            'title': response.css('title::text').extract_first(),
            'keywords': response.css('meta[name=keywords]::attr(content)').extract_first(),
            'description': response.css('meta[name=description]::attr(content)').extract_first(),
        }
# Configure a crawler that streams scraped items into "<host>.csv",
# then run the spider; start() blocks until the crawl finishes.
feed_settings = {
    'FEED_FORMAT': 'csv',
    'FEED_URI': domain.netloc + '.csv',
}
process = CrawlerProcess(feed_settings)
process.crawl(SiteSpider)
process.start()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment