@harveyslash
Created November 23, 2016 06:33
import os
import urllib.request

import scrapy

# Category name -> Amazon search-results URL template.  "PAGENUM" is a
# placeholder that is replaced with the page number for each request.
categories = {
    "art&photography": "https://www.amazon.com/s/ref=sr_pg_2?fst=as%3Aoff&rh=n%3A283155%2Cn%3A%211000%2Cn%3A1&page=PAGENUM&bbn=1000&sort=featured-rank&ie=UTF8&qid=1479879286",
    "Biographies & Memoirs": "https://www.amazon.com/s/ref=sr_pg_2?fst=as%3Aoff&rh=n%3A283155%2Cn%3A%211000%2Cn%3A2&page=PAGENUM&bbn=1000&sort=featured-rank&ie=UTF8&qid=1479879374",
    "Business & Money": "https://www.amazon.com/s/ref=sr_pg_3?fst=as%3Aoff&rh=n%3A283155%2Cn%3A%211000%2Cn%3A3&page=PAGENUM&bbn=1000&sort=featured-rank&ie=UTF8&qid=1479879412",
    "Calendars": "https://www.amazon.com/s/ref=sr_pg_2?fst=as%3Aoff&rh=n%3A283155%2Cn%3A%211000%2Cn%3A3248857011&page=PAGENUM&bbn=1000&sort=featured-rank&ie=UTF8&qid=1479879444",
}


class QuotesSpider(scrapy.Spider):
    name = "datasetAmazon"
    # Request meta: do not follow redirects; treat 301/302/303 as handled statuses.
    meta = {"dont_redirect": True, "handle_httpstatus_list": [301, 302, 303]}
    pageCountStart = 1
    pageCountEnd = 100

    def start_requests(self):
        for category, rawURL in categories.items():
            # One output directory per category, named after the category key.
            if not os.path.exists(category):
                os.makedirs(category)
            # range() excludes the end value, so this crawls pages 1..99.
            for i in range(self.pageCountStart, self.pageCountEnd):
                finalURL = rawURL.replace("PAGENUM", str(i))
                self.logger.info("Requesting %s", finalURL)
                # Carry the category with the request: Scrapy schedules
                # requests asynchronously, so a module-level "current
                # directory" variable would not be reliable in the callback.
                yield scrapy.Request(
                    url=finalURL,
                    callback=self.parse,
                    meta=dict(self.meta, category=category),
                )

    def parse(self, response):
        category = response.meta["category"]
        # Book cover thumbnails carry the "s-access-image cfMarker" CSS classes.
        listOfBooks = response.css("img.s-access-image::attr(src)").extract()
        for src in listOfBooks:
            # Download each cover image into the category's directory,
            # keeping the last path segment of the URL as the file name.
            urllib.request.urlretrieve(src, os.path.join(category, src.split("/")[-1]))
            self.logger.info("Saved %s", src)
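
To try the spider, it can be dropped into a Scrapy project and started with `scrapy crawl datasetAmazon`, or run stand-alone as sketched below. This runner is illustrative and not part of the original gist: the module name `amazon_dataset_spider` and the settings values are assumptions.

# run_spider.py -- minimal stand-alone runner (illustrative, not from the gist)
from scrapy.crawler import CrawlerProcess

from amazon_dataset_spider import QuotesSpider  # hypothetical module name

if __name__ == "__main__":
    process = CrawlerProcess(settings={
        # Assumed settings: identify the client and throttle requests.
        "USER_AGENT": "Mozilla/5.0 (compatible; dataset-spider)",
        "DOWNLOAD_DELAY": 1.0,
    })
    process.crawl(QuotesSpider)
    process.start()  # blocks until the crawl finishes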