@sidharthshah
Created November 3, 2015 01:28
# -*- coding: utf-8 -*-
# Scrapy CrawlSpider that saves each crawled page to disk and records the URL
# in MongoDB so the same page is never fetched twice.
import os
from hashlib import md5

from pymongo import MongoClient
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

client = MongoClient("localhost", 27017)
db = client.skoovbot
collection = db.crawl_log
# Index the url field so the duplicate check below stays fast.
collection.create_index("url")

BASE_DIR = "flipkart_pages"

# Make sure the output directory exists.
if not os.path.exists(BASE_DIR):
    os.mkdir(BASE_DIR)

# Tab-separated log mapping each crawled URL to the file it was saved in.
log_file = open(os.path.join(BASE_DIR, "ecom.log"), "w")


class ECommerceSpider(CrawlSpider):
    name = "ecom"
    allowed_domains = ["ecom.com"]
    start_urls = (
        <set of urls here>
    )

    rules = (
        Rule(
            LinkExtractor(allow=(r<allow regexes here>,), deny=(r'<deny regexes here>',)),
            callback="parse_custom",
            follow=True,
        ),
    )

    def is_already_crawled(self, url):
        return collection.find_one({"url": url}) is not None

    def parse_custom(self, response):
        # Normalize by stripping the query string, if any.
        normalized_url = response.url.split("?")[0]

        if not self.is_already_crawled(normalized_url):
            # Use the MD5 of the normalized URL as a stable filename.
            filename = os.path.join(BASE_DIR, md5(normalized_url.encode("utf-8")).hexdigest())
            with open(filename, "wb") as f:
                f.write(response.body)

            log_file.write("%s\t%s\n" % (normalized_url, filename))

            # Record the URL so it is skipped on future passes.
            collection.insert_one({"url": normalized_url})
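
To try the spider, the quickest route is Scrapy's built-in command: scrapy runspider ecom_spider.py (assuming the snippet above is saved under that hypothetical filename, with real start URLs and regexes filled in). It can also be launched programmatically; here is a minimal sketch using Scrapy's stock CrawlerProcess, where the module name ecom_spider and the USER_AGENT value are illustrative assumptions, not part of the original gist:

# run_ecom.py -- minimal launcher sketch; assumes the gist is saved as ecom_spider.py
from scrapy.crawler import CrawlerProcess

from ecom_spider import ECommerceSpider

process = CrawlerProcess(settings={
    "USER_AGENT": "Mozilla/5.0",  # illustrative setting, not from the original gist
})
process.crawl(ECommerceSpider)
process.start()  # blocks until the crawl finishes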