Skip to content

Instantly share code, notes, and snippets.

/asos_spider.py Secret

Created January 8, 2018 20:17
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save anonymous/ce89965c05b309f698ec72d00666cbf7 to your computer and use it in GitHub Desktop.
Asos spider
# -*- coding: utf-8 -*-
import scrapy
from scrapy.shell import inspect_response
from parsel import Selector
import json
import re
import pandas as pd
class AsosSpider(scrapy.Spider):
    """Crawl www.asos.com women's categories and yield one item per product.

    Crawl flow:
      1. ``parse``          -- collect category links from the primary nav.
      2. ``parse_category`` -- read the product count, then page through the
                               category via the JSON search API (72 per page).
      3. ``parse_cat_json`` -- one request per product to the catalogue API.
      4. ``parse_item``     -- flatten the catalogue JSON into an item dict.

    Requires a ``categories.csv`` file (columns ``cat_id``, ``cat_name``) in
    the working directory to map ASOS category ids to readable names.
    """

    name = 'asos_spider'
    start_urls = ['http://www.asos.com/women']
    custom_settings = {
        'DOWNLOAD_DELAY': 3,
        'HTTPCACHE_ENABLED': True,
        'ITEM_PIPELINES': {
            'asos_project.pipelines.DuplicatesPipeline': 300,
        },
    }

    def __init__(self, *args, **kwargs):
        super(AsosSpider, self).__init__(*args, **kwargs)
        # cat_id -> cat_name lookup used in parse_item to label webCategories.
        df = pd.read_csv('categories.csv')
        self.categories = df.set_index('cat_id')['cat_name'].to_dict()

    def parse(self, response):
        """Extract category landing-page links from the women's nav.

        Only links containing a ``cid=`` query parameter (real category
        pages) are kept; "View all" links are skipped.  Currently limited
        to the first 3 links for testing.
        """
        links = response.xpath(
            '//nav[@data-testid="primarynav-large-women"]'
            '//div[@data-testid="secondarynav-container"]'
            '[position() < last()-1]//section[1]'
            '//a[not(.="View all") and contains(@href, "cid=")]/@href'
        ).extract()
        self.logger.info('We get %d links!', len(links))
        for link in links[:3]:  # NOTE: capped at 3 categories — remove slice for a full crawl
            yield scrapy.Request(
                link,
                callback=self.parse_category,
                meta={'category_link': link},
            )

    def parse_category(self, response):
        """Schedule one JSON search-API request per 72-product page."""
        # The <progress max="..."> element carries the category's total
        # product count; without it we cannot paginate, so skip the page
        # instead of crashing on int(None).
        total = response.xpath('//progress/@max').extract_first()
        if total is None:
            self.logger.warning('No product count found on %s', response.url)
            return
        all_prod = int(total)
        cat_json_link = (
            'https://api.asos.com/product/search/v1/categories/{}'
            '?channel=desktop-web&country=GB&currency=GBP'
            '&keyStoreDataversion=2&lang=en&limit=72&offset={}'
            '&rowlength=3&store=1'
        )
        cat_id = int(response.url.split('cid=')[-1].split('&')[0])
        # Breadcrumb text, e.g. "Home, Women, Dresses".
        cat = ', '.join(
            response.xpath('//div[@id="chrome-breadcrumb"]//li//text()').extract())
        category_link = response.meta['category_link']
        for offset in range(0, all_prod + 1, 72):
            yield scrapy.Request(
                cat_json_link.format(cat_id, offset),
                callback=self.parse_cat_json,
                meta={'cat': cat, 'category_link': category_link},
            )

    def parse_cat_json(self, response):
        """Schedule one catalogue-API request per product on a search page."""
        data = json.loads(response.body)
        link = ('http://api.asos.com/product/catalogue/v2/products/{}'
                '?store=COM&lang=en-GB&sizeSchema=EU&cy=GBP')
        for row in data['products']:
            # Seed the item with what the search API already knows; the
            # rest is filled in by parse_item from the catalogue API.
            item = {
                'category_link': response.meta['category_link'],
                'url': 'http://www.asos.com/' + row['url'],
                'colour': row['colour'],
            }
            yield scrapy.Request(
                link.format(row['id']),
                callback=self.parse_item,
                meta={'item': item},
            )

    def parse_item(self, response):
        """Flatten the product catalogue JSON into the final item dict.

        Any failure is logged with its traceback and the response is
        dropped into an interactive shell for debugging; KeyboardInterrupt
        and other BaseExceptions are deliberately NOT caught.
        """
        try:
            data = json.loads(response.body)
            item = response.meta['item']
            item['json_url'] = response.url
            # Map category ids to names; fall back to the raw id when the
            # id is missing from categories.csv.
            item['cat'] = [self.categories.get(c['id'], c['id'])
                           for c in data['webCategories']]
            item['sku'] = 'asos_' + str(data['id'])
            item['valid'] = 1
            item['cPrice'] = data['price']['current']['value']
            oprice = data['price']['previous']['value']
            # A previous price of 0.0 means "not on sale" — use current.
            item['oPrice'] = item['cPrice'] if oprice == 0.0 else oprice
            item['id'] = data['id']
            item['code'] = data['productCode']
            item['name'] = data['name']
            # Description is an HTML fragment; keep only plain <li> text
            # (links are navigation, not description).  Always set the key
            # so every item has a consistent schema.
            item['desc'] = []
            if data['description']:
                sel = Selector(text=data['description'])
                item['desc'] = sel.xpath('//li[not(a)]/text()').extract()
            item['fabric'] = data['info']['aboutMe']
            item['sizeFi'] = data['info'].get('sizeAndFit')
            item['care_info'] = data['info']['careInfo']
            item['stock'] = 1 if data['isInStock'] else 0
            item['guide'] = 'http://' + data['sizeGuide']
            # Product photos; "Swatch" images are colour chips, not photos.
            item['photos'] = ['http://' + img['url']
                              for img in data['media']['images']
                              if img['type'] != 'Swatch']
            catwalk = data['media']['catwalk']
            item['video'] = 'https://' + catwalk[0]['url'] if catwalk else None
            # One entry per size/colour variant, numbered from 1.
            item['vars'] = []
            for idx, var in enumerate(data['variants'], 1):
                item['vars'].append({
                    'sku': 'asos_{}_{}'.format(data['id'], idx),
                    'oPrice': var['price']['previous']['value'],
                    'cPrice': var['price']['current']['value'],
                    'colour': var['colour'],
                    'brandSize': var['brandSize'],
                    'sizeDesc': var['sizeDescription'],
                    'stock': 1 if var['isInStock'] else 0,
                    'sizeId': var['sizeId'],
                })
            yield item
        except Exception:  # was a bare except: — don't swallow BaseException
            self.logger.exception('Unexpected exception')
            inspect_response(response, self)  # drop into a debug shell
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment