Skip to content

Instantly share code, notes, and snippets.

@Eibwen
Created November 3, 2014 18:49
Show Gist options
  • Save Eibwen/562a8ec3f4fd2c95f236 to your computer and use it in GitHub Desktop.
Save Eibwen/562a8ec3f4fd2c95f236 to your computer and use it in GitHub Desktop.
Apple laptops research
var _ = require("underscore");
/**
 * Wraps one scraped product listing.
 *
 * Every property of `baseData` is copied onto the new instance via
 * underscore's `_.extend`, so the instance exposes the scraped fields
 * directly (for example `specsRaw`, which `ProcessSpecLines` reads).
 *
 * @constructor
 * @param {Object} baseData - fields to copy onto the instance.
 */
function AppleStoreProduct(baseData) {
    var instance = this;
    _.extend(instance, baseData);
}
// Rule table used by AppleStoreProduct.prototype.ProcessSpecLines.
// Each entry has:
//   match  - a RegExp that is exec()'d against a single spec line
//   result - function (obj, match) that writes parsed fields onto `obj`
//            (the product's `specs` object); `match` is the exec() result.
// ProcessSpecLines walks this array from the last entry to the first and runs
// EVERY rule that matches a line (it does not stop at the first hit), so when
// two rules write the same field, the rule nearer the top of the array wins.
var specsLookup = [
// Product title line, e.g. "<Refurbished >[<size>-inch ]MacBook Air|Pro <n>GHz
// Dual-core|Quad-core Intel [Core ]i5|i7[ with Retina Display]".
// Groups: 1 = "Refurbished " or "", 2 = screen size (undefined when absent),
// 3 = model, 4 = clock speed, 5 = core count, 6 = full CPU name,
// 7 = "i5"/"i7", 8 = " with Retina Display" or "".
{ match: /^(Refurbished |)(?:([0-9\.]+-inch) )?(MacBook (?:Air|Pro)) ([0-9\.]+ ?GHz) ([dD]ual-core|Quad-core) (Intel (?:Core )?(i5|i7))( with Retina Display|)$/,
result: function (obj, match) {
if (match[1] == "Refurbished ")
// NOTE: the property name is spelled "Referb" (sic).
obj.Referb = true;
obj.Screen = match[2];
obj.Model = match[3];
// The pattern allows at most one space before "GHz", so a single
// string replace is enough to normalise "2.6 GHz" to "2.6GHz".
obj.CpuSpeed = match[4].replace(" ", "");
// Same value without the unit; still a string, not a number.
obj.CpuGHz = match[4].replace(" ", "").replace("GHz", "");
obj.Cores = match[5];
obj.CPU = match[6];
obj.CpuModel = match[7];
if (match[8] == " with Retina Display")
obj.Resolution = "Retina";
}
},
// Three different phrasings of flash/SSD capacity; all populate `SSD`.
{ match: /^(\d+[GT]B) Flash Storage$/i, result: function (obj, match) { obj.SSD = match[1]; } },
{ match: /^(\d+[GT]B) Solid State Drive$/i, result: function (obj, match) { obj.SSD = match[1]; } },
{ match: /^(\d+[GT]B) PCIe-based flash storage$/i, result: function (obj, match) { obj.SSD = match[1]; } },
// Everything after "Originally released " is stored verbatim.
{ match: /^Originally released (.+)$/i, result: function (obj, match) { obj.Released = match[1]; } },
// Any line mentioning "Graphics" or "GeForce" is stored whole as `Graphics`.
{ match: /^.*(Graphics|GeForce).*$/i, result: function (obj, match) { obj.Graphics = match[0]; } },
// Fixed-text feature lines are recorded as boolean flags.
{ match: /^720p FaceTime HD Camera$/i, result: function (obj, match) { obj.FaceTime720pCam = true; } },
{ match: /^8x double-layer SuperDrive \(DVD±R DL\/DVD±RW\/CD-RW\)$/i, result: function (obj, match) { obj.DVDDrive = true; } },
// Short RAM form; not anchored at the end, so trailing text is accepted.
{ match: /^(\d+GB) memory/i, result: function (obj, match) { obj.RAM = match[1]; } },
// Long RAM form, e.g. "8GB of 1600MHz LPDDR3 onboard memory".
// Group 1 keeps an optional parenthesised suffix after the capacity.
{ match: /^(\d+GB(?: \(.+\)|)) of (\d+MHz) ((?:LP)?DDR3L?) (?:SDRAM|onboard memory)$/i,
result: function (obj, match) {
obj.RAM = match[1];
obj.RamSpeed = match[2];
obj.RamType = match[3];
}
},
// Spinning disk: only the "@ 5400 rpm" wording is recognised; `HDD` becomes
// the capacity followed by " @ 5400" (e.g. "500GB @ 5400").
{ match: /^(\d+[GT]B) Serial ATA( @ 5400) rpm$/i, result: function (obj, match) { obj.HDD = match[1] + match[2]; } },
// Retina display line: only the diagonal size is captured; the pixel
// dimensions in the text are matched but discarded.
{ match: /^([0-9\.]+-inch) \(diagonal\) Retina display; \d+-by-\d+ resolution at \d+ pixels per inch$/i,
result: function (obj, match) {
obj.Screen = match[1];
//obj.ScreenTexture = match[2];
obj.Resolution = "Retina";
}
},
// LED-backlit display with an explicit "<w>-by-<h>" resolution.
{ match: /^([0-9\.]+-inch) \(diagonal\) LED-backlit (glossy|Hi-Res antiglare) widescreen display, (\d+-by-\d+) resolution$/i,
result: function (obj, match) {
obj.Screen = match[1];
obj.ScreenTexture = match[2];
obj.Resolution = match[3];
}
},
// LED-backlit display line that states no resolution; recorded as "other".
{ match: /^([0-9\.]+-inch) \(diagonal\) LED-backlit (glossy) widescreen display with support for millions of colors$/i,
result: function (obj, match) {
obj.Screen = match[1];
obj.ScreenTexture = match[2];
obj.Resolution = "other";
}
},
// Templates kept around for adding new rules:
//{ match: /^$/i, result: function (obj, match) { obj. = true; } },
// { match: /a/i, result: function (obj, match) { obj.SSD = match[1]; } },
// { match: /a/i, result: function (obj, match) { obj.SSD = match[1]; } },
];
/**
 * Parses `this.specsRaw` (newline-separated spec lines) into `this.specs`.
 *
 * Every line is tested against every rule in `specsLookup`; each matching
 * rule's `result` callback writes its fields onto `this.specs`. Lines that
 * match no rule are reported on the console as "FAILED TO MATCH: <line>".
 * The parsed object is cached on `this.specs`, so the work is done once.
 *
 * @returns {Object} the parsed specs object, on the first call as well as on
 *     every later (cached) call.
 */
AppleStoreProduct.prototype.ProcessSpecLines = function () {
    if (this.specs) return this.specs;
    this.specs = {};

    // A product scraped without any spec text has nothing to parse; treat a
    // missing or empty `specsRaw` as zero lines instead of throwing or
    // reporting an empty line as unmatched.
    var raw = typeof this.specsRaw === "string" ? this.specsRaw : "";
    var specLines = raw ? raw.split('\n') : [];

    // Both loops run back-to-front on purpose: rules may write the same
    // field, so the visiting order decides which value is kept.
    for (var i = specLines.length - 1; i >= 0; i--) {
        var line = specLines[i];
        var lineMatched = false;
        for (var j = specsLookup.length - 1; j >= 0; j--) {
            var rule = specsLookup[j];
            var result = rule.match.exec(line);
            if (result) {
                lineMatched = true;
                // No break: every rule that matches gets to contribute.
                rule.result(this.specs, result);
            }
        }
        if (!lineMatched) {
            console.log("FAILED TO MATCH: " + line);
        }
    }

    return this.specs;
};
module.exports = AppleStoreProduct
var request = require('request');
var cheerio = require('cheerio');
//Temp?? code is in cheerio's repo...
var entities = require('entities');
var nStore = require('nstore');
var AppleStoreProduct = require('./AppleStoreProduct');
////////////// Global stuff
// Store created with nStore for 'products.db' (presumably a file path relative
// to the working directory — confirm against the nStore docs). scrapeApple()
// saves one record per scraped product into it.
var productsRepo = nStore.new('products.db');
// Listing pages fetched by scrapeApple() on every run. Going by the URL paths,
// these are the Apple Store "specialdeals" pages for MacBook Pro 13/15 and
// MacBook Air 13/11.
var pages = [
"http://store.apple.com/us/browse/home/specialdeals/mac/macbook_pro/13",
"http://store.apple.com/us/browse/home/specialdeals/mac/macbook_pro/15",
"http://store.apple.com/us/browse/home/specialdeals/mac/macbook_air/13",
"http://store.apple.com/us/browse/home/specialdeals/mac/macbook_air/11"
];
////////////// Helper functions
// var trimEachLine = function (text){
// var lines = text.split("\n");
// for (var i = lines.length - 1; i >= 0; i--) {
// lines[i] = lines[i].trim();
// };
// return lines.join("\n");
// }
/**
 * Flattens an element's inner HTML into plain text with one item per line.
 *
 * Steps, in order: drop `<sup>…</sup>` fragments, turn the `&#x2028;` entity
 * into a newline, replace every tag (plus the whitespace around it) with a
 * newline, collapse runs of newlines, trim, and finally decode the remaining
 * HTML entities.
 *
 * @param {Object} element - object exposing `html()` (a cheerio selection in
 *     this script).
 * @returns {string} the flattened text, or "" when `html()` yields nothing.
 */
var trimHtml = function (element) {
    var markup = element.html();
    if (!markup) {
        return "";
    }

    var flattened = markup
        // Footnote markers are noise for spec parsing.
        .replace(/<sup>.+?<\/sup>/g, "")
        // The Unicode line-separator entity becomes a real line break.
        .replace(/&#x2028;/g, "\n")
        // Each tag, together with surrounding whitespace, becomes a line break.
        .replace(/\s*(<[^>]+>)\s*/g, "\n")
        .replace(/\n+/g, "\n")
        .trim();

    return entities.decodeHTML(flattened);
};
/**
 * Fetches every URL in `pages` and records each `.product` entry found.
 *
 * For each product the function builds a record with `dateScraped` (ISO
 * timestamp), `specsRaw` (flattened text of `.specs`), `link` (href of the
 * link inside `.specs`), `price` (trimmed text of `.price`) and, when
 * non-empty, `savings` (flattened text of `.savings`). The record is saved to
 * `productsRepo`, then run through `AppleStoreProduct#ProcessSpecLines`, whose
 * result is not stored; the call only surfaces unparsed spec lines on the
 * console.
 *
 * Requests are issued without waiting for each other; the function returns
 * before any response arrives. Fetch and save failures are logged rather than
 * thrown: an exception raised inside these async callbacks could not be caught
 * by any caller and would take the whole process down.
 */
function scrapeApple() {
    // forEach over the array (not for...in) and local variables throughout:
    // the loop variable and `$` must not leak out as implicit globals.
    pages.forEach(function (url) {
        console.log(url);
        request(url, function (err, resp, body) {
            if (err) {
                console.error("Failed to fetch " + url + ": " + err);
                return;
            }

            var $ = cheerio.load(body);
            $(".product").each(function () {
                var product = $(this);
                var prodObj = {
                    dateScraped: new Date().toISOString()
                };

                var specs = product.find(".specs");
                prodObj.specsRaw = trimHtml(specs);
                prodObj.link = specs.find("a").attr("href");

                prodObj.price = product.find(".price").text().trim();

                // `savings` is only present on the record when there is text.
                var savingsText = trimHtml(product.find(".savings"));
                if (savingsText) {
                    prodObj.savings = savingsText;
                }

                // Passing null as the key makes the store generate one.
                productsRepo.save(null, prodObj, function (saveErr) {
                    if (saveErr) {
                        console.error("Failed to save product from " + url + ": " + saveErr);
                    }
                });

                new AppleStoreProduct(prodObj).ProcessSpecLines();
            });
        });
    });
}
// Handle of the pending timer that will trigger the next scrape.
var scrapeWait;

/**
 * Runs one scrape immediately, then schedules itself to run again in two
 * hours. The timer handle is kept in `scrapeWait`.
 */
function runScrape() {
    var twoHoursInMs = 2 * 60 * 60 * 1000;
    scrapeApple();
    scrapeWait = setTimeout(runScrape, twoHoursInMs);
}

// Kick off the first scrape as soon as the script loads.
runScrape();
{
"name": "AppleStoreScrape",
"version": "0.1.0",
"dependencies": {
"cheerio": "^0.17.0",
"co": "^3.1.0",
"entities": "^1.1.1",
"nstore": "^0.5.2",
"underscore": "^1.6.0"
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment