Gist sidharthshah/797064c4761679b57322, created November 3, 2015
# -*- coding: utf-8 -*-
import os
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from hashlib import md5
from pymongo import MongoClient

# MongoDB keeps a log of crawled URLs so each page is fetched only once.
client = MongoClient('localhost', 27017)
db = client.skoovbot
collection = db.crawl_log
collection.create_index("url")  # ensure_index() is deprecated in modern pymongo

BASE_DIR = "flipkart_pages"

# Make sure the output directory exists
if not os.path.exists(BASE_DIR):
    os.mkdir(BASE_DIR)

# Tab-separated log mapping each crawled URL to the file it was saved in
log_file = open(os.path.join(BASE_DIR, "ecom.log"), "w")


class ECommerceSpider(CrawlSpider):
    name = "ecom"
    allowed_domains = ["ecom.com"]
    start_urls = (
        <set of urls here>
    )
    rules = (
        Rule(LinkExtractor(allow=(r"<allow regexes here>",), deny=(r"<deny regexes here>",)),
             callback="parse_custom", follow=True),
    )

    def is_already_crawled(self, url):
        return collection.count_documents({"url": url}) != 0

    def parse_custom(self, response):
        # Strip the query string so URLs that differ only in parameters
        # are treated as the same page.
        normalized_url = response.url
        if response.url.find("?") != -1:
            normalized_url = response.url.split("?")[0]
        if not self.is_already_crawled(normalized_url):
            # Save the raw page body under an MD5 hash of the normalized URL.
            filename = os.path.join(BASE_DIR, md5(normalized_url.encode("utf-8")).hexdigest())
            with open(filename, 'wb') as f:
                f.write(response.body)
            rec = "%s\t%s\n" % (normalized_url, filename)
            log_file.write(rec)
            # Record the URL in MongoDB (insert() is deprecated; use insert_one())
            collection.insert_one({"url": normalized_url})
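
For reference, a minimal sketch of one way to run this spider in-process with Scrapy's CrawlerProcess API, assuming the code above is saved as a standalone script and the start_urls and regex placeholders have been filled in (running scrapy crawl ecom from inside a Scrapy project works as well):

if __name__ == "__main__":
    from scrapy.crawler import CrawlerProcess

    # Start the crawl in this process; settings can be passed as a dict.
    process = CrawlerProcess(settings={"LOG_LEVEL": "INFO"})
    process.crawl(ECommerceSpider)
    process.start()   # blocks until the crawl finishes
    log_file.close()  # flush the URL-to-file log opened at module level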