Gist sidharthshah/797064c4761679b57322, created November 3, 2015
# -*- coding: utf-8 -*-
import os
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from hashlib import md5
from pymongo import MongoClient

# MongoDB keeps a log of crawled URLs so each page is fetched only once.
client = MongoClient('localhost', 27017)
db = client.skoovbot
collection = db.crawl_log
collection.create_index("url")  # ensure_index() is deprecated in modern pymongo

BASE_DIR = "flipkart_pages"

# Make sure the output directory exists
if not os.path.exists(BASE_DIR):
    os.mkdir(BASE_DIR)

# Tab-separated log mapping each crawled URL to the file it was saved in
log_file = open(os.path.join(BASE_DIR, "ecom.log"), "w")


class ECommerceSpider(CrawlSpider):
    name = "ecom"
    allowed_domains = ["ecom.com"]
    start_urls = (
        <set of urls here>
    )
    rules = (
        Rule(LinkExtractor(allow=(r"<allow regexes here>",), deny=(r"<deny regexes here>",)),
             callback="parse_custom", follow=True),
    )

    def is_already_crawled(self, url):
        return collection.count_documents({"url": url}) != 0

    def parse_custom(self, response):
        # Strip the query string so URLs that differ only in parameters
        # are treated as the same page.
        normalized_url = response.url
        if response.url.find("?") != -1:
            normalized_url = response.url.split("?")[0]
        if not self.is_already_crawled(normalized_url):
            # Save the raw page body under an MD5 hash of the normalized URL.
            filename = os.path.join(BASE_DIR, md5(normalized_url.encode("utf-8")).hexdigest())
            with open(filename, 'wb') as f:
                f.write(response.body)
            rec = "%s\t%s\n" % (normalized_url, filename)
            log_file.write(rec)
            # Record the URL in MongoDB (insert() is deprecated; use insert_one())
            collection.insert_one({"url": normalized_url})
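
For reference, a minimal sketch of one way to run this spider in-process with Scrapy's CrawlerProcess API, assuming the code above is saved as a standalone script and the start_urls and regex placeholders have been filled in (running scrapy crawl ecom from inside a Scrapy project works as well):

if __name__ == "__main__":
    from scrapy.crawler import CrawlerProcess

    # Start the crawl in this process; settings can be passed as a dict.
    process = CrawlerProcess(settings={"LOG_LEVEL": "INFO"})
    process.crawl(ECommerceSpider)
    process.start()   # blocks until the crawl finishes
    log_file.close()  # flush the URL-to-file log opened at module level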