Created
November 3, 2014 18:49
-
-
Save Eibwen/562a8ec3f4fd2c95f236 to your computer and use it in GitHub Desktop.
Apple laptops research
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
var _ = require("underscore"); | |
/**
 * Wraps one scraped product record.
 *
 * Every property of `baseData` is copied onto the new instance via
 * underscore's `_.extend`.
 *
 * @param {Object} baseData - raw fields to copy onto the instance.
 */
function AppleStoreProduct(baseData) {
    var product = this;
    _.extend(product, baseData);
}
/**
 * Lookup table that turns one line of spec text into structured fields.
 *
 * Each entry has:
 *   match  - RegExp tested against a single spec line.
 *   result - function (obj, match) that writes the parsed values onto `obj`,
 *            where `match` is the array returned by `RegExp.prototype.exec`.
 *
 * AppleStoreProduct.prototype.ProcessSpecLines runs every entry whose pattern
 * matches a line (it does not stop at the first hit), so patterns should be
 * specific enough not to overlap unintentionally.
 */
var specsLookup = [
    // Product title, e.g.
    // "Refurbished 13.3-inch MacBook Pro 2.6GHz Dual-core Intel i5 with Retina Display"
    {
        match: /^(Refurbished |)(?:([0-9\.]+-inch) )?(MacBook (?:Air|Pro)) ([0-9\.]+ ?GHz) ([dD]ual-core|Quad-core) (Intel (?:Core )?(i5|i7))( with Retina Display|)$/,
        result: function (obj, match) {
            // NOTE: the property name "Referb" (sic) is kept as-is so existing
            // readers of the parsed specs keep working.
            if (match[1] === "Refurbished ")
                obj.Referb = true;
            // The screen-size prefix is optional. Lines are processed
            // last-to-first by ProcessSpecLines, so assigning unconditionally
            // could overwrite a Screen value parsed from a later display line
            // with undefined.
            if (match[2])
                obj.Screen = match[2];
            obj.Model = match[3];
            obj.CpuSpeed = match[4].replace(" ", "");
            obj.CpuGHz = match[4].replace(" ", "").replace("GHz", "");
            obj.Cores = match[5];
            obj.CPU = match[6];
            obj.CpuModel = match[7];
            if (match[8] === " with Retina Display")
                obj.Resolution = "Retina";
        }
    },

    // Solid-state storage, in any of the three wordings seen so far.
    {
        match: /^(\d+[GT]B) (?:Flash Storage|Solid State Drive|PCIe-based flash storage)$/i,
        result: function (obj, match) { obj.SSD = match[1]; }
    },

    { match: /^Originally released (.+)$/i, result: function (obj, match) { obj.Released = match[1]; } },

    // Any line mentioning a graphics chip is stored verbatim.
    { match: /^.*(Graphics|GeForce).*$/i, result: function (obj, match) { obj.Graphics = match[0]; } },

    { match: /^720p FaceTime HD Camera$/i, result: function (obj) { obj.FaceTime720pCam = true; } },
    { match: /^8x double-layer SuperDrive \(DVD±R DL\/DVD±RW\/CD-RW\)$/i, result: function (obj) { obj.DVDDrive = true; } },

    // Short memory form, e.g. "4GB memory" (deliberately not anchored at the end).
    { match: /^(\d+GB) memory/i, result: function (obj, match) { obj.RAM = match[1]; } },

    // Long memory form, e.g. "8GB of 1600MHz LPDDR3 onboard memory"
    {
        match: /^(\d+GB(?: \(.+\)|)) of (\d+MHz) ((?:LP)?DDR3L?) (?:SDRAM|onboard memory)$/i,
        result: function (obj, match) {
            obj.RAM = match[1];
            obj.RamSpeed = match[2];
            obj.RamType = match[3];
        }
    },

    // Spinning disk, e.g. "500GB Serial ATA @ 5400 rpm" -> "500GB @ 5400"
    { match: /^(\d+[GT]B) Serial ATA( @ 5400) rpm$/i, result: function (obj, match) { obj.HDD = match[1] + match[2]; } },

    // Retina display line; the pixel resolution itself is not captured.
    {
        match: /^([0-9\.]+-inch) \(diagonal\) Retina display; \d+-by-\d+ resolution at \d+ pixels per inch$/i,
        result: function (obj, match) {
            obj.Screen = match[1];
            obj.Resolution = "Retina";
        }
    },

    // Non-Retina display line with an explicit resolution.
    {
        match: /^([0-9\.]+-inch) \(diagonal\) LED-backlit (glossy|Hi-Res antiglare) widescreen display, (\d+-by-\d+) resolution$/i,
        result: function (obj, match) {
            obj.Screen = match[1];
            obj.ScreenTexture = match[2];
            obj.Resolution = match[3];
        }
    },

    // Non-Retina display line that gives no resolution.
    {
        match: /^([0-9\.]+-inch) \(diagonal\) LED-backlit (glossy) widescreen display with support for millions of colors$/i,
        result: function (obj, match) {
            obj.Screen = match[1];
            obj.ScreenTexture = match[2];
            obj.Resolution = "other";
        }
    }

    // Template for new entries:
    // { match: /^...$/i, result: function (obj, match) { obj.Field = match[1]; } },
];
/**
 * Parses `this.specsRaw` (newline-separated spec text) into `this.specs`.
 *
 * Lines are visited last-to-first and, for each line, the `specsLookup`
 * entries are visited last-to-first; every entry whose pattern matches gets
 * to write onto `this.specs`. Lines that match no entry are reported on the
 * console. The result is cached: once `this.specs` exists it is returned
 * without re-parsing.
 *
 * @returns {Object} the parsed specs object (same object as `this.specs`).
 */
AppleStoreProduct.prototype.ProcessSpecLines = function () {
    if (this.specs) return this.specs;

    this.specs = {};

    // A product scraped without a specs element has no text to parse.
    var specLines = (this.specsRaw || "").split('\n');

    for (var i = specLines.length - 1; i >= 0; i--) {
        var line = specLines[i];

        // An empty line carries no information; do not report it as a miss.
        if (!line) continue;

        var lineMatched = false;
        for (var j = specsLookup.length - 1; j >= 0; j--) {
            var result = specsLookup[j].match.exec(line);
            if (result) {
                lineMatched = true;
                specsLookup[j].result(this.specs, result);
            }
        }

        if (!lineMatched) {
            console.log("FAILED TO MATCH: " + line);
        }
    }

    // Return the same value on the first call as on cached calls.
    return this.specs;
};
module.exports = AppleStoreProduct |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
var request = require('request'); | |
var cheerio = require('cheerio'); | |
//Temp?? code is in cheerio's repo... | |
var entities = require('entities'); | |
var nStore = require('nstore'); | |
var AppleStoreProduct = require('./AppleStoreProduct'); | |
////////////// Global stuff | |
var productsRepo = nStore.new('products.db'); | |
var pages = [ | |
"http://store.apple.com/us/browse/home/specialdeals/mac/macbook_pro/13", | |
"http://store.apple.com/us/browse/home/specialdeals/mac/macbook_pro/15", | |
"http://store.apple.com/us/browse/home/specialdeals/mac/macbook_air/13", | |
"http://store.apple.com/us/browse/home/specialdeals/mac/macbook_air/11" | |
]; | |
////////////// Helper functions | |
// var trimEachLine = function (text){ | |
// var lines = text.split("\n"); | |
// for (var i = lines.length - 1; i >= 0; i--) { | |
// lines[i] = lines[i].trim(); | |
// }; | |
// return lines.join("\n"); | |
// } | |
/**
 * Reduces an element's inner HTML to plain text with one logical line per
 * text run.
 *
 * Steps, in order:
 *   1. drop <sup>...</sup> elements together with their content;
 *   2. normalise Unicode line-break characters to "\n";
 *   3. replace every remaining tag (and the whitespace around it) by "\n";
 *   4. collapse runs of "\n" and trim the ends;
 *   5. decode HTML entities.
 *
 * @param {Object} element - object exposing `.html()` (a cheerio selection
 *        at the call sites in this file).
 * @returns {string} the cleaned text, or "" when `.html()` returns nothing.
 */
var trimHtml = function (element) {
    var asString = element.html();
    if (!asString) return "";

    asString = asString
        // Kill <sup>, including tags with attributes and content that spans lines.
        .replace(/<sup\b[^>]*>[\s\S]*?<\/sup>/gi, "")
        // Kill unicode new lines. Written as escapes: a raw line-separator
        // character inside a regex literal is invisible in most editors and
        // is not valid in a regex literal when it is U+2028/U+2029.
        .replace(/[\u0085\u2028\u2029]/g, "\n")
        .replace(/\s*(<[^>]+>)\s*/g, "\n")
        .replace(/\n+/g, "\n")
        .trim();

    // Decode last so that an encoded "<" in the text is not mistaken for a tag.
    return entities.decodeHTML(asString);
};
/**
 * Fetches every URL in `pages` and records each `.product` element found.
 *
 * For each product this builds a record with:
 *   dateScraped - ISO timestamp of when the record was built
 *   specsRaw    - cleaned text of the `.specs` element (via trimHtml)
 *   link        - href of the first anchor inside `.specs`
 *   price       - trimmed text of the `.price` element
 *   savings     - cleaned text of the `.savings` element, only when non-empty
 * The record is saved to `productsRepo` and run through
 * AppleStoreProduct#ProcessSpecLines, which reports unparsed spec lines.
 *
 * Fetch and save failures are logged instead of thrown: both happen inside
 * asynchronous callbacks, where a throw would terminate the whole process and
 * with it the periodic re-scrape.
 */
function scrapeApple() {
    pages.forEach(function (url) {
        console.log(url);

        request(url, function (err, resp, body) {
            if (err) {
                console.error("Failed to fetch " + url + ": " + err);
                return;
            }

            // Local to this response so concurrent responses do not share state.
            var $ = cheerio.load(body);

            $(".product").each(function () {
                var product = $(this);
                var specs = product.find(".specs");

                var prodObj = {
                    dateScraped: new Date().toISOString(),
                    specsRaw: trimHtml(specs),
                    link: specs.find("a").attr("href"),
                    price: product.find(".price").text().trim()
                };

                var savingsText = trimHtml(product.find(".savings"));
                if (savingsText)
                    prodObj.savings = savingsText;

                // A null key is passed, as in the original code, so the store
                // is left to pick the key.
                productsRepo.save(null, prodObj, function (saveErr) {
                    if (saveErr) {
                        console.error("Failed to save product from " + url + ": " + saveErr);
                    }
                });

                new AppleStoreProduct(prodObj).ProcessSpecLines();
            });
        });
    });
}
// Handle of the pending re-scrape timer, as returned by setTimeout.
var scrapeWait;

/**
 * Runs one scrape immediately, then schedules itself to run again two hours
 * later, storing the timer handle in `scrapeWait`.
 */
function runScrape() {
    var twoHoursInMs = 2 * 60 * 60 * 1000;

    scrapeApple();
    scrapeWait = setTimeout(runScrape, twoHoursInMs);
}
runScrape(); |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"name": "AppleStoreScrape", | |
"version": "0.1.0", | |
"dependencies": { | |
"cheerio": "^0.17.0", | |
"co": "^3.1.0", | |
"entities": "^1.1.1", | |
"nstore": "^0.5.2", | |
"underscore": "^1.6.0" | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment