@harveyslash
Created November 23, 2016 06:33
import os
import urllib.request

import scrapy

# Category name -> Amazon search-results URL template.  "PAGENUM" is a
# placeholder that is replaced with the page number for each request.
categories = {
    "art&photography": "https://www.amazon.com/s/ref=sr_pg_2?fst=as%3Aoff&rh=n%3A283155%2Cn%3A%211000%2Cn%3A1&page=PAGENUM&bbn=1000&sort=featured-rank&ie=UTF8&qid=1479879286",
    "Biographies & Memoirs": "https://www.amazon.com/s/ref=sr_pg_2?fst=as%3Aoff&rh=n%3A283155%2Cn%3A%211000%2Cn%3A2&page=PAGENUM&bbn=1000&sort=featured-rank&ie=UTF8&qid=1479879374",
    "Business & Money": "https://www.amazon.com/s/ref=sr_pg_3?fst=as%3Aoff&rh=n%3A283155%2Cn%3A%211000%2Cn%3A3&page=PAGENUM&bbn=1000&sort=featured-rank&ie=UTF8&qid=1479879412",
    "Calendars": "https://www.amazon.com/s/ref=sr_pg_2?fst=as%3Aoff&rh=n%3A283155%2Cn%3A%211000%2Cn%3A3248857011&page=PAGENUM&bbn=1000&sort=featured-rank&ie=UTF8&qid=1479879444",
}


class QuotesSpider(scrapy.Spider):
    name = "datasetAmazon"
    # Request meta: do not follow redirects; treat 301/302/303 as handled statuses.
    meta = {"dont_redirect": True, "handle_httpstatus_list": [301, 302, 303]}
    pageCountStart = 1
    pageCountEnd = 100

    def start_requests(self):
        for category, rawURL in categories.items():
            # One output directory per category, named after the category key.
            if not os.path.exists(category):
                os.makedirs(category)
            # range() excludes the end value, so this crawls pages 1..99.
            for i in range(self.pageCountStart, self.pageCountEnd):
                finalURL = rawURL.replace("PAGENUM", str(i))
                self.logger.info("Requesting %s", finalURL)
                # Carry the category with the request: Scrapy schedules
                # requests asynchronously, so a module-level "current
                # directory" variable would not be reliable in the callback.
                yield scrapy.Request(
                    url=finalURL,
                    callback=self.parse,
                    meta=dict(self.meta, category=category),
                )

    def parse(self, response):
        category = response.meta["category"]
        # Book cover thumbnails carry the "s-access-image cfMarker" CSS classes.
        listOfBooks = response.css("img.s-access-image::attr(src)").extract()
        for src in listOfBooks:
            # Download each cover image into the category's directory,
            # keeping the last path segment of the URL as the file name.
            urllib.request.urlretrieve(src, os.path.join(category, src.split("/")[-1]))
            self.logger.info("Saved %s", src)
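
To try the spider, it can be dropped into a Scrapy project and started with `scrapy crawl datasetAmazon`, or run stand-alone as sketched below. This runner is illustrative and not part of the original gist: the module name `amazon_dataset_spider` and the settings values are assumptions.

# run_spider.py -- minimal stand-alone runner (illustrative, not from the gist)
from scrapy.crawler import CrawlerProcess

from amazon_dataset_spider import QuotesSpider  # hypothetical module name

if __name__ == "__main__":
    process = CrawlerProcess(settings={
        # Assumed settings: identify the client and throttle requests.
        "USER_AGENT": "Mozilla/5.0 (compatible; dataset-spider)",
        "DOWNLOAD_DELAY": 1.0,
    })
    process.crawl(QuotesSpider)
    process.start()  # blocks until the crawl finishes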