Skip to content

Instantly share code, notes, and snippets.

@surjikal
Last active August 12, 2019 14:41
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save surjikal/11362303 to your computer and use it in GitHub Desktop.
Save surjikal/11362303 to your computer and use it in GitHub Desktop.
Crawler for a subset of barneys.com
_ = require 'lodash'
Crawler = require('crawler').Crawler
assert = require 'assert'
Promise = require 'bluebird'
request = require 'request'
fs = require 'fs'
# Identifiers of the top-level site categories to crawl. Each one is used both
# as the category name and as the `cgid` query value of its search URL.
TOP_LEVEL_CATEGORIES = ['women', 'men', 'womens-beauty', 'home', 'kids']
# Deep-clone a value by serializing it to JSON text and parsing that text back.
#
# @param {*} x  Value to clone; it must survive a JSON round trip.
# @return {*} A structurally equal value that shares no references with `x`.
copy = (x) ->
  serialized = JSON.stringify x
  return JSON.parse serialized
# Parse a display price such as `$1,234.50` into its numeric text (`1234.50`).
#
# @param {String} x  Price text as scraped from the page.
# @return {String} The digits (and optional decimal part) with the leading `$`
#   and all thousands separators removed.
# @throws {AssertionError} If `x` is empty, or is not of the form
#   `$<digits>[.<digits>]` once whitespace and commas are stripped.
parsePrice = (x) ->
  assert x, 'Oh dang, this string is empty.'
  # A string pattern passed to `replace` only removes the first occurrence, so
  # use a global regex to drop every thousands separator. Scraped text also
  # tends to carry surrounding whitespace, hence the trim.
  price = x.trim().replace(/,/g, '')
  matched = /^\$(\d+(?:\.\d+)?)$/.exec(price)
  assert matched, "Unhandled price format for price `#{x}`."
  return matched[1]
# Build a crawler facade that walks the barneys.com category tree.
#
# @param {Object} crawler  Object exposing `queue(options)`. Each queued request
#   is expected to invoke `options.callback(err, res, $)` once.
# @param {String} baseUrl  Prefix prepended to every endpoint that is queued.
# @return {Object} An object with a single `crawl()` method. It returns a promise
#   for the list of top-level categories, each carrying nested `categories`
#   (subcategories, then subsubcategories) and, at the deepest level, `products`.
BarneysCrawler = (crawler, baseUrl = '') ->

  # Request headers sent with every queued page.
  headers =
    "Pragma": "no-cache"
    "Accept-Encoding": ""
    "Accept-Language": "en-US,en;q=0.8"
    "User-Agent": "Dudebro Crawler"
    "Accept": "application/json, text/javascript, */*; q=0.01"
    "X-Requested-With": "XMLHttpRequest"
    "Connection": "keep-alive"
    "Cache-Control": "no-cache"

  # Queue `endpoint` on the underlying crawler.
  #
  # @return {Promise} Resolves with `{res, $}` from the crawler callback, or
  #   rejects with the error the callback reports.
  crawl = (endpoint) ->
    # Print a stack trace to help locate a caller that passed no endpoint;
    # the request is still queued afterwards.
    console.trace() if not endpoint
    uri = "#{baseUrl}#{endpoint}"
    console.error "Queuing #{uri}"
    # Use the promise constructor rather than the deprecated `Promise.defer()`.
    new Promise (resolve, reject) ->
      crawler.queue
        uri: uri
        jquery: true
        headers: headers
        callback: (err, res, $) ->
          return reject err if err
          resolve {res, $}
      return

  # Collect `{name, url}` for every anchor found under `selector`.
  extractLinks = ($, selector) ->
    links = []
    $(selector).find('a').each ->
      $link = $(this)
      links.push
        name: $link.text().trim().toLowerCase()
        url: $link.attr('href')
    return links

  # Resolve to the subcategory links listed on a category page.
  getSubcategories = (category) ->
    crawl(category.url).then ({$}) ->
      subcategories = extractLinks $, '#category-level-2'
      console.error "--> Category `#{category.name}` has `#{subcategories.length}` subcategories."
      return subcategories

  # Resolve to the subsubcategory links listed on a subcategory page.
  getSubsubcategories = (subcategory) ->
    crawl(subcategory.url).then ({$}) ->
      subsubcategories = extractLinks $, '#category-level-3'
      console.error "--> Subcategory `#{subcategory.name}` has `#{subsubcategories.length}` subsubcategories."
      return subsubcategories

  # Turn one `.product` element into a plain product record. Any failure is
  # logged together with the subsubcategory being parsed, then rethrown.
  parseProduct = ($product, subsubcategory) ->
    $info = $product.find('.p-info')
    $name = $product.find('.name')
    try
      id: $info.find('.id').text().trim()
      url: $info.find('.url').text().trim()
      image: $product.find('.productimage').attr('data-product-quickview-image').trim()
      designer: $name.find('.designername').text().trim().toLowerCase()
      name: $name.find('.displayname').text().trim().toLowerCase()
      price: parsePrice do ->
        # Trim before choosing, so that a whitespace-only sale price does not
        # shadow the standard price.
        salePrice = $product.find('.salesprice').text().trim()
        standardPrice = $product.find('.standardprice').text().trim()
        return (salePrice or standardPrice)
    catch error
      console.error """
        ERROR: #{error.toString()}
        Could not parse subsubcategory:
        #{JSON.stringify(subsubcategory, null, 2)}
      """
      throw error

  # Fetch a subsubcategory listing and resolve to a copy of `subsubcategory`
  # extended with the parsed `products`.
  crawlSubsubcategory = (subsubcategory) ->
    # NOTE(review): `sz=1` looks like a page size of one product per listing;
    # confirm this limit is intended.
    endpoint = subsubcategory.url + '&format=ajax&start=0&sz=1'
    subsubcategory = copy(subsubcategory)
    crawl(endpoint).then ({$}) ->
      subsubcategory.products = []
      $('.product').each ->
        subsubcategory.products.push parseProduct($(this), subsubcategory)
      return subsubcategory

  # Resolve to a copy of `subcategory` whose `categories` holds every crawled
  # subsubcategory. The argument itself is left untouched, matching
  # `crawlSubsubcategory`.
  crawlSubcategory = (subcategory) ->
    getSubsubcategories(subcategory).then (subsubcategories) ->
      Promise.all(subsubcategories.map(crawlSubsubcategory)).then (result) ->
        crawled = copy(subcategory)
        crawled.categories = result
        return crawled

  # Resolve to a copy of `category` whose `categories` holds every crawled
  # subcategory. The argument itself is left untouched.
  crawlCategory = (category) ->
    getSubcategories(category).then (subcategories) ->
      Promise.all(subcategories.map(crawlSubcategory)).then (result) ->
        crawled = copy(category)
        crawled.categories = result
        return crawled

  # Public entry point: crawl every top-level category.
  crawl: ->
    categories = TOP_LEVEL_CATEGORIES.map (x) ->
      name: x
      url: "http://www.barneys.com/on/demandware.store/Sites-BNY-Site/default/Search-Show?cgid=#{x}"
    Promise.all categories.map(crawlCategory)
# Entry point: crawl every top-level category and print the resulting tree as
# pretty-printed JSON on stdout. Progress and errors go to stderr.
crawler = new Crawler
  maxConnections: 100
  autoWindowClose: true
  discoverResources: false

crawler = new BarneysCrawler crawler

printResult = (result) ->
  console.log JSON.stringify(result, null, 2)

# Without a rejection handler a failed crawl would surface only as an unhandled
# rejection; report it explicitly and signal failure through the exit code.
reportFailure = (error) ->
  console.error "Crawl failed: #{error?.stack or error}"
  process.exitCode = 1

crawler.crawl().then(printResult).catch(reportFailure)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment