Created
April 27, 2016 14:36
-
-
Save ThomasPe/a6df2e4135ba1815b30c49a9495755f0 to your computer and use it in GitHub Desktop.
A simple Node.js script that crawls the Red Stripe Deals from the Windows Store and saves the app details to a JSON file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
var request = require('request'); | |
var cheerio = require('cheerio'); | |
var fs = require('fs'); | |
var async = require('async'); | |
var zlib = require('zlib'); | |
// Listing page of the Red Stripe Deals collection (PC, en-us).
var dealsUri = "https://www.microsoft.com/en-us/store/collections/redstripedeals/pc";
// Origin prepended to the hrefs scraped from the listing page. It uses the
// same scheme and host as dealsUri so that detail pages are requested from
// the origin the links were found on.
var baseUri = "https://www.microsoft.com";
// Collects one record per parsed app; written out when the queue drains.
var output = [];
// File the collected records are written to.
var outputFilename = "redstripedeals.json";
// Alternative output location, kept for reference.
//var outputFilename = "d:\\home\\site\\wwwroot\\redstripedeals2.json";
/**
 * Extracts the app detail links from the deals listing markup and pushes one
 * absolute URL per link onto the qDetails queue.
 *
 * @param {string|Buffer} body - HTML of the deals listing page.
 * @param {string} [lang] - Unused; kept so the existing signature stays valid.
 */
var redstripedealsParser = function (body, lang)
{
    // Keep the cheerio handle local rather than leaking an implicit global `$`.
    var $ = cheerio.load(body);
    $('figure h4 a').each(function ()
    {
        var href = $(this).attr("href");
        // An anchor without an href would otherwise be queued as "...undefined".
        if (!href)
        {
            return;
        }
        qDetails.push(baseUri + href);
    });
};
/**
 * Downloads the deals listing page and hands its HTML to
 * redstripedealsParser. Request or decompression failures are logged and the
 * page is then skipped; nothing is thrown.
 */
var loadDeals = function () {
    var options = {
        url: dealsUri,
        port: 443,
        headers: {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Trident/7.0; rv:11.0) like Gecko',
            'Accept-Language': 'en-us',
            'Content-Language': 'en-us',
            'Accept-Encoding': 'gzip'
        },
        timeout: 0,
        // null makes request deliver the body as a Buffer, which gunzip needs.
        encoding: null
    };
    request(options, function (err, resp, body) {
        if (err) {
            console.log("error loading page");
            return;
        }
        if (resp.headers['content-encoding'] === 'gzip') {
            zlib.gunzip(body, function (gunzipErr, dezipped) {
                // On failure `dezipped` is undefined, so bail out before using it.
                if (gunzipErr) {
                    console.log("error decompressing page");
                    console.log(gunzipErr);
                    return;
                }
                redstripedealsParser(dezipped.toString());
            });
        } else {
            redstripedealsParser(body);
        }
    });
};
/**
 * Queue that downloads app detail pages one at a time (concurrency 1).
 * Each task is the absolute URL of a detail page; the downloaded (and, if
 * necessary, gunzipped) HTML is passed to GetDetails together with the
 * queue's completion callback.
 *
 * Failures are logged and reported to the queue through `callback(err)` so
 * that the remaining tasks still run and the collected results are written.
 */
var qDetails = async.queue(function (task, callback) {
    console.log("get details: " + task);
    var options = {
        url: task,
        headers: {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Trident/7.0; rv:11.0) like Gecko',
            'Accept-Language': 'en-us',
            'Content-Language': 'en-us',
            'Accept-Encoding': 'gzip'
        },
        timeout: 0,
        // null makes request deliver the body as a Buffer, which gunzip needs.
        encoding: null
    };
    request(options, function (err, resp, body) {
        if (err)
        {
            console.log("error: " + task);
            // Throwing from this asynchronous callback could not be caught by
            // any caller and would abort the whole crawl; report it instead.
            callback(err);
            return;
        }
        if (resp.headers['content-encoding'] === 'gzip') {
            zlib.gunzip(body, function (gunzipErr, dezipped) {
                // On failure `dezipped` is undefined, so finish the task here.
                if (gunzipErr) {
                    console.log("error: " + task);
                    callback(gunzipErr);
                    return;
                }
                GetDetails(dezipped.toString(), task, callback);
            });
        } else {
            GetDetails(body, task, callback);
        }
    });
}, 1);
/**
 * Parses one app detail page, appends the extracted record to `output`, and
 * signals the queue that the task is finished.
 *
 * Fields whose element is not found end up as empty strings.
 *
 * @param {string|Buffer} body - HTML of the app detail page.
 * @param {string} task - URL of the page; used only for logging.
 * @param {Function} callback - Queue completion callback, called without arguments.
 */
var GetDetails = function (body, task, callback) {
    console.log("parse details: " + task);
    // Keep the cheerio handle local rather than leaking an implicit global `$`.
    var $ = cheerio.load(body);
    var metadata = $('.metadata-list-content > div');
    // attr() yields undefined when no logo image matches; guard before using it.
    var logoSrc = $('.ph-logo > img').first().attr('src');
    var image = "";
    if (logoSrc) {
        // Only protocol-relative sources ("//host/...") need a scheme prepended.
        image = logoSrc.indexOf("//") === 0 ? "http:" + logoSrc : logoSrc;
    }

    var app = {};
    app["title"] = $('#page-title').text();
    app["image"] = image;
    app["description"] = $('.showmore > p').first().text();
    app["price"] = $('.srv_price > span').first().text();
    app["ratingValue"] = $('.srv_ratingsScore.win-rating-average').first().text();
    // Strip everything except digits from the rating count.
    app["ratingCount"] = $('.win-rating-total').first().text().replace(/\D/g, '');
    // A string pattern replaces only the first "\r\n"; trim() then removes
    // surrounding whitespace.
    app["packageSize"] = metadata.eq(2).text().replace("\r\n", "").trim();
    app["publisher"] = metadata.eq(0).text().replace("\r\n", "").trim();
    output.push(app);
    callback();
};
// When the queue has no tasks left, serialize the collected records and write
// them to outputFilename, logging either the write error or a confirmation.
// NOTE(review): assigning `drain` as a property follows the older async queue
// API — confirm against the installed async version.
qDetails.drain = function ()
{
    var json = JSON.stringify(output, null);
    fs.writeFile(outputFilename, json, function (err)
    {
        if (err)
        {
            console.log(err);
            return;
        }
        console.log("JSON saved to " + outputFilename);
    });
};

// Start the crawl by loading the deals listing page.
loadDeals();
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment