Skip to content

Instantly share code, notes, and snippets.

@Dakuan
Created May 28, 2012 16:09
Show Gist options
  • Save Dakuan/2819896 to your computer and use it in GitHub Desktop.
Save Dakuan/2819896 to your computer and use it in GitHub Desktop.
Scraper
// the scraper module
// you need to give it its gubbins (an object with manipulate and get methods) via setGubbins
var Scraper = function(){
// public
setGubbins = function(gubb){
gubbins = gubb;
},
getContent = function(url, callback){
console.log('loading page...')
var subPage = buildPage();
subPage.open(url, function(status){
if (status !== 'success') {
var fs = require('fs');
fs.write(root + 'errors/' + createId() + '.err', url, 'w');
console.log(status + ': Unable to load ' + url);
phantom.exit();
return;
}
// pump in jquery
subPage.injectJs('jquery', function() {});
gubbins.manipulate(subPage);
// delay the fetch to ensure the AJAX calls are all done after DOM manipulation
setTimeout(function(){
gubbins.get(subPage, onComplete);
}, 200);
});
},
onComplete = function(dress){
//var json = dress.title;//toJson(dress)
var json = toJson(dress);
console.log(json);
var fs = require('fs');
fs.write(root + '/content/' + dress.id + '.json', json, 'w');
quit();
},
// private
root = 'asos/dresses/',
count = 0,
quit = function(){
console.log('exiting phantomjs');
phantom.exit();
},
buildPage = function(){
var page = require('webpage').create();
page.onConsoleMessage = this.onConsoleMessage;
page.onError = this.onJsError;
return page;
},
onJsError = function(msg, line, source){
console.log('error> ' + msg + ' on line ' + line);
},
gubbins = {},
toJson = function serialize(obj){
var returnVal;
if(obj != undefined){
switch(obj.constructor)
{
case Array:
var vArr="[";
for(var i=0;i<obj.length;i++)
{
if(i>0) vArr += ",";
vArr += serialize(obj[i]);
}
vArr += "]"
return vArr;
case String:
returnVal = escape("'" + obj + "'");
return returnVal;
case Number:
returnVal = isFinite(obj) ? obj.toString() : null;
return returnVal;
case Date:
returnVal = "#" + obj + "#";
return returnVal;
default:
if(typeof obj == "object"){
var vobj=[];
for(attr in obj)
{
if(typeof obj[attr] != "function")
{
vobj.push('"' + attr + '":' + serialize(obj[attr]));
}
}
if(vobj.length >0)
return "{" + vobj.join(",") + "}";
else
return "{}";
}
else
{
return obj.toString();
}
}
}
return null;
},
createId = function (){
var id = 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx'.replace(/[xy]/g, function(c) {
var r = Math.random()*16|0, v = c == 'x' ? r : (r&0x3|0x8);
return v.toString(16);
});
return id;
},
onConsoleMessage = function (msg, line, source) {
console.log('console> ' + msg);
};
return{
quit: quit,
setGubbins: setGubbins,
getContent: getContent
};
}();
// Gubbins
//////////////////////
/**
 * Gubbins for ASOS dress pages: knows how to poke the page into its final
 * state (manipulate) and how to read a dress record out of it (get).
 * Both methods rely on jQuery (`$`) being available inside the page.
 */
var asosDress = {
    /**
     * Triggers the page's AJAX behaviour: expands the "more" link and, when
     * the colour drop-down is enabled and has a real option after its first
     * entry, selects that option and fires its change event.
     *
     * @param {Object} subPage PhantomJS web page to operate on
     */
    manipulate: function (subPage) {
        console.log('manipulating page...');
        // NOTE: this function is executed inside the page, so it must be self-contained
        subPage.evaluate(function () {
            $('#ssMoreLink').click();
            var colourDrop = $('#ctl00_ContentMainPage_ctlSeparateProduct_drpdwnColour');
            var options = colourDrop.children();
            // index 0 is skipped, matching get(); without a second option there is nothing to select
            if (colourDrop.attr('disabled') !== 'disabled' && options.length > 1) {
                colourDrop.val(options[1].text);
                colourDrop.change();
            }
        });
    },

    /**
     * Collects the dress data from the page and hands it to onComplete.
     *
     * @param {Object} subPage      PhantomJS web page to read from
     * @param {Function} onComplete optional; called with the dress record
     *                              (page data plus retailerId)
     * @throws {Error} when the page yields no data, e.g. because the
     *                 in-page script failed
     */
    get: function (subPage, onComplete) {
        console.log('scraping page...');
        // NOTE: this function is executed inside the page, so it must be self-contained
        var dress = subPage.evaluate(function () {
            // texts of all children of `selector` except the first one
            function optionTexts(selector, logCount) {
                var options = $(selector).children();
                if (logCount) {
                    console.log(options.length);
                }
                var texts = [];
                for (var i = 1; i < options.length; i++) {
                    texts.push(options[i].text);
                }
                return texts;
            }

            var colourText = optionTexts('#ctl00_ContentMainPage_ctlSeparateProduct_drpdwnColour', true);
            var sizeText = optionTexts('#ctl00_ContentMainPage_ctlSeparateProduct_drpdwnSize', false);

            var images = $('img', '.productImagesItems');
            var imageUrls = [];
            for (var k = 0; k < images.length; k++) {
                imageUrls.push($(images[k]).attr('src'));
            }

            return {
                id: $('#ctl00_ContentMainPage_ctlSeparateProduct_hdnSku').val().trim(),
                title: $('#ctl00_ContentMainPage_ctlSeparateProduct_lblProductTitle').text(),
                price: $('#ctl00_ContentMainPage_ctlSeparateProduct_lblProductPrice').text(),
                description: $('.single-entry').text(),
                sizes: sizeText,
                colours: colourText,
                images: imageUrls,
                url: window.location.href
            };
        });

        // evaluate() yields nothing when the in-page function throws
        if (!dress) {
            throw new Error('no data scraped from page');
        }

        dress.retailerId = this.retailerId;
        if (onComplete) {
            onComplete(dress);
        }
    },

    // identifier stamped onto every scraped record
    retailerId: 12345
};
// Execution
//////////////////////
// Entry point: scrape the url given as the first command line argument.
try {
    var system = require('system');
    var targetUrl = system.args[1];
    if (!targetUrl) {
        // without a url there is nothing to load; say so instead of opening `undefined`
        console.log('usage: phantomjs ' + system.args[0] + ' <url>');
        Scraper.quit();
    } else {
        Scraper.setGubbins(asosDress);
        Scraper.getContent(targetUrl);
    }
}
catch (ex) {
    console.log('phantomjs ex: ' + ex);
    Scraper.quit();
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment