@JasonCrowe
Created February 22, 2019 20:14
Spider to crawl an entire site and save each page to a SQLite database for further parsing.
# -*- coding: utf-8 -*-
import scrapy
import dataset

# Open (or create) the SQLite database; the in-memory URL is handy for testing.
# db = dataset.connect('sqlite:///:memory:')
db = dataset.connect('sqlite:///database.db')


class FullSiteSpider(scrapy.Spider):
    name = 'full_site'
    allowed_domains = ['allglassparts.com']
    start_urls = ['http://allglassparts.com/']

    def parse(self, response):
        # Only product pages contain the ProductDetails div; store those pages.
        prod_page = response.xpath('//div[@id="ProductDetails"]').extract_first()
        if prod_page is not None:
            db['pages'].insert({'page_source': response.text, 'url': response.url})
        # Follow every link on the page; Scrapy's built-in dupefilter
        # prevents already-seen URLs from being requested again.
        for href in response.xpath('//a/@href').getall():
            yield scrapy.Request(response.urljoin(href), self.parse)
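
To use the saved pages, run the spider and then read the rows back out of the database. This is a minimal sketch: the file name full_site_spider.py is an assumption, and parsel (which ships with Scrapy) stands in for whatever HTML parser you prefer.

# Run the spider directly from its file, no project scaffolding needed:
#   scrapy runspider full_site_spider.py

import dataset
from parsel import Selector

db = dataset.connect('sqlite:///database.db')
for row in db['pages']:
    # Re-parse each stored page offline and extract whatever fields you need.
    sel = Selector(text=row['page_source'])
    title = sel.xpath('//title/text()').get()
    print(row['url'], title)

One caveat: dataset's insert appends unconditionally, so re-running the crawl will duplicate rows. Using db['pages'].upsert({'page_source': response.text, 'url': response.url}, ['url']) keyed on the URL keeps one row per page instead.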