Asos spider
# -*- coding: utf-8 -*-
import scrapy
from scrapy.shell import inspect_response
from parsel import Selector
import json
import re
import pandas as pd
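# Crawl flow: parse() collects category links from the women's landing
# page; parse_category() pages through ASOS's product search API; each
# product is then expanded through the catalogue API in parse_item().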
class AsosSpider(scrapy.Spider):
    name = 'asos_spider'
    start_urls = ['http://www.asos.com/women']
    custom_settings = {
        'DOWNLOAD_DELAY': 3,
        'HTTPCACHE_ENABLED': True,
        'ITEM_PIPELINES': {
            'asos_project.pipelines.DuplicatesPipeline': 300,
        },
    }
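    # categories.csv is expected in the working directory with at least
    # the columns cat_id and cat_name (values below are illustrative):
    #   cat_id,cat_name
    #   4169,"Tops"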
    def __init__(self, *args, **kwargs):
        super(AsosSpider, self).__init__(*args, **kwargs)
        df = pd.read_csv('categories.csv')
        self.categories = df.set_index('cat_id')['cat_name'].to_dict()
    def parse(self, response):
        # inspect_response(response, self)
        links = response.xpath(
            '//nav[@data-testid="primarynav-large-women"]//div[@data-testid="secondarynav-container"]'
            '[position() < last()-1]//section[1]//a[not(.="View all") and contains(@href, "cid=")]/@href').extract()
        self.logger.info('Got {} category links'.format(len(links)))
        for link in links[:3]:  # limited to the first 3 categories, presumably for testing
            yield scrapy.Request(link, callback=self.parse_category, meta={'category_link': link})
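    # The <progress> bar on a category page exposes the total product
    # count in its @max attribute, which bounds the API pagination below.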
    def parse_category(self, response):
        # inspect_response(response, self)
        all_prod = int(response.xpath('//progress/@max').extract_first())
        cat_json_link = ('https://api.asos.com/product/search/v1/categories/{}'
                         '?channel=desktop-web&country=GB&currency=GBP&keyStoreDataversion=2'
                         '&lang=en&limit=72&offset={}&rowlength=3&store=1')
        cat_id = int(response.url.split('cid=')[-1].split('&')[0])
        cat = ', '.join(response.xpath('//div[@id="chrome-breadcrumb"]//li//text()').extract())
        category_link = response.meta['category_link']
        # Page through the search API 72 products at a time.
        for i in range(0, all_prod + 1, 72):
            link = cat_json_link.format(cat_id, i)
            yield scrapy.Request(link, callback=self.parse_cat_json, meta={'cat': cat, 'category_link': category_link})
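    # Each search-API response lists up to 72 products; only the fields
    # needed later (category link, url, colour) travel via request meta.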
    def parse_cat_json(self, response):
        # inspect_response(response, self)
        data = json.loads(response.body)
        link = 'http://api.asos.com/product/catalogue/v2/products/{}?store=COM&lang=en-GB&sizeSchema=EU&cy=GBP'
        for row in data['products']:
            item = {}
            item['category_link'] = response.meta['category_link']
            # item['cat'] = response.meta['cat']
            item['url'] = 'http://www.asos.com/' + row['url']
            item['colour'] = row['colour']
            yield scrapy.Request(link.format(row['id']), callback=self.parse_item, meta={'item': item})
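    # The catalogue API response holds the full product record: prices,
    # description HTML, media, and one variant per size/colour combination.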
    def parse_item(self, response):
        # inspect_response(response, self)
        try:
            data = json.loads(response.body)
            item = response.meta['item']
            item['json_url'] = response.url
            item['cat'] = []
            for i in data['webCategories']:
                # Fall back to the raw id when it is missing from categories.csv.
                item['cat'].append(self.categories.get(i['id'], i['id']))
            item['sku'] = 'asos_' + str(data['id'])
            item['valid'] = 1
            item['cPrice'] = data['price']['current']['value']
            oprice = data['price']['previous']['value']
            # Treat a previous price of 0.0 as "no discount".
            item['oPrice'] = item['cPrice'] if oprice == 0.0 else oprice
            item['id'] = data['id']
            item['code'] = data['productCode']
            item['name'] = data['name']
            # item['desc'] = '<ul>' + re.search(r"<ul>(.*)", data['description']).group(1)
            if data['description']:
                sel = Selector(text=data['description'])
                item['desc'] = sel.xpath('//li[not(a)]/text()').extract()
            item['fabric'] = data['info']['aboutMe']
            item['sizeFi'] = data['info'].get('sizeAndFit')
            item['care_info'] = data['info']['careInfo']
            item['stock'] = 1 if data['isInStock'] else 0
            # Guard against a null sizeGuide rather than concatenating None.
            item['guide'] = 'http://' + data['sizeGuide'] if data['sizeGuide'] else None
            item['photos'] = []
            for i in data['media']['images']:
                if i['type'] != 'Swatch':
                    item['photos'].append('http://' + i['url'])
            if not data['media']['catwalk']:
                item['video'] = None
            else:
                item['video'] = 'https://' + data['media']['catwalk'][0]['url']
            item['vars'] = []
            for count, x in enumerate(data['variants'], 1):
                d = {}
                d['sku'] = 'asos_{}_{}'.format(data['id'], count)
                d['oPrice'] = x['price']['previous']['value']
                d['cPrice'] = x['price']['current']['value']
                d['colour'] = x['colour']
                d['brandSize'] = x['brandSize']
                d['sizeDesc'] = x['sizeDescription']
                # d['szUK'] = x['size'].split(' - ')[0]
                d['stock'] = 1 if x['isInStock'] else 0
                d['sizeId'] = x['sizeId']
                item['vars'].append(d)
            yield item
        except Exception:
            self.logger.info("Unexpected exception", exc_info=True)
            inspect_response(response, self)
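Usage note: the spider is meant to run inside a Scrapy project named asos_project, with categories.csv in the working directory. The DuplicatesPipeline referenced in custom_settings is not part of this gist; below is a minimal sketch, assuming it deduplicates on the sku field set in parse_item.

# asos_project/pipelines.py (sketch, not from the gist)
from scrapy.exceptions import DropItem

class DuplicatesPipeline(object):
    def __init__(self):
        self.skus_seen = set()

    def process_item(self, item, spider):
        # Drop any item whose sku has already passed through the pipeline.
        if item['sku'] in self.skus_seen:
            raise DropItem('Duplicate item found: {}'.format(item['sku']))
        self.skus_seen.add(item['sku'])
        return item

With the project in place, the crawl can be started with: scrapy crawl asos_spider -o items.json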