Created
November 28, 2012 03:18
-
-
Save dearfrankg/4158831 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
scraper = require 'scraper' | |
fs = require 'fs' | |
######################################################## | |
## Shorthand for console.log so call sites can simply write `log value`.
log = (msg) ->
  console.log msg
#@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ | |
#@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ | |
## Grabber loads a page through `scraper` and hands the resulting jQuery
## object to a callback that runs with the Grabber instance as `this`.
class Grabber

  ## url  - address of the page to load
  ## path - selector for the repeating elements that scrapeArray walks
  ## cb   - invoked as cb.call(grabber, jQuery) once the page is available
  ##
  ## A load error is rethrown from inside the scraper callback.
  constructor: (@url, @path, @cb) ->
    # Fat arrow keeps `this` bound to the instance inside the callback.
    scraper @url, (err, jQuery) =>
      throw err if err
      @cb.call this, jQuery

  ## Build one record per element matching @path, then drop filtered records.
  ##
  ## g      - gather specs: { property, path, attr? }. For each spec the
  ##          record gets record[property] = the `attr` attribute of the
  ##          element found at `path`, or its inner HTML when `attr` is omitted.
  ## f      - filter specs: { property, text }. A record is dropped when
  ##          record[property] matches `text` (used as a case-insensitive
  ##          regular expression) for any spec.
  ## jQuery - the jQuery function for the loaded page.
  ##
  ## Returns the array of records that survived every filter.
  scrapeArray: (g, f, jQuery) ->
    ## GATHER
    ##
    data = []
    jQuery(@path).each ->
      node = jQuery(this)
      record = {}
      for doc in g
        target = node.find(doc.path)
        record[doc.property] =
          if doc.attr is undefined then target.html() else target.attr(doc.attr)
      data.push record
      # Explicit return: a `false` result would stop jQuery's iteration.
      return

    ## FILTER
    ##
    # Compile each pattern once instead of once per record.
    matchers = for filter in f
      { property: filter.property, regex: new RegExp(filter.text, 'i') }

    filterOut = (item) ->
      for matcher in matchers
        value = item[matcher.property]
        # A selector that matched nothing leaves the property null/undefined;
        # such a value cannot match a filter and must not raise.
        continue unless value?
        return true if matcher.regex.test value
      false

    return (item for item in data when not filterOut item)

  ## Write `str` to `filename` asynchronously and log the outcome.
  writeFile: (filename, str) ->
    fs.writeFile filename, str, (err) ->
      if err
        console.log err
      else
        console.log "The file was saved!"
#@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ | |
#@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ | |
## Driver: scrape the product listing, then load every product's detail page
## and save the collected image URL / price records to fancy.json.
new Grabber 'http://www.thefancy.com', 'li .figure-product', (jQuery) ->
  log 'Pulling data from http://thefancy.com'

  ##
  ## Get the captions and detailUrl
  ##
  data1 = @scrapeArray(
    [{ property: 'caption', path: 'figcaption' },
     { property: 'detailUrl', path: 'a:eq(1)', attr: 'href' }],
    [{ property: 'caption', text: 'fancy' }],
    jQuery
  )

  ## Report the collected records and write them to disk.
  finish = (results) =>
    log 'DATA2'
    log 'SIZE: ' + results.length
    log results
    @writeFile 'fancy.json', JSON.stringify results

  # With no listing entries no detail request would ever complete, so the
  # (empty) result is written right away instead of never being written.
  return finish [] if data1.length is 0

  ##
  ## Get the bigImageUrl
  ##
  # Each result is stored at the index of its listing entry so the output
  # order follows the listing, not the order in which requests complete.
  data2 = new Array(data1.length)
  pending = data1.length
  data1.forEach (doc, index) =>
    new Grabber @url + doc.detailUrl, '#container-wrapper', (jQuery) ->
      item = @scrapeArray(
        [{ property: 'bigImageUrl', path: '.first .fig-image img', attr: 'src' },
         { property: 'price', path: '#sidebar .price' }],
        [],
        jQuery
      )
      data2[index] = item[0]
      pending--
      finish data2 if pending is 0
      return
    return
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment