Create a gist now

Instantly share code, notes, and snippets.

What would you like to do?
Recreate The Atlantic's Netflix category data using Node.js. Requires a Netflix login.
{
"name": "netflix-category-scraper",
"version": "0.0.0",
"description": "Collect all categories from Netflix",
"dependencies": {
"cheerio": "*",
"sqlite3": "*",
"request": "*"
},
"author": {
"name" : "Chris Wilson",
"email" : "wilson@mechanicalscribe.com",
"url" : "http://mechanicalscribe.com"
}
}
/*
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
Note for users: Using this script to scrape Netflix is probably a violation of their terms of use.
Note for Netflix: Please reopen your API!
*/
// dependencies
var fs = require("fs"),
cheerio = require("cheerio"),
request = require('request').defaults({
maxRedirects:20,
jar: true
}),
sqlite = require("sqlite3");
// you need to provide your login credentials when you run the script, like so:
// node scrape.js my.email@example.com fluffy1
var LOGIN = process.argv[2],
    PASSWORD = process.argv[3];
// Check the credentials before opening the database, so that a run without
// arguments stops here, reports the problem on stderr and exits non-zero
// instead of relying on a top-level `return`.
if (!LOGIN || !PASSWORD) {
  console.error("Please include your email and password");
  process.exit(1);
}
// create the database and table for storing categories
var db = new sqlite.Database("netflix.sqlite");
db.run("CREATE TABLE IF NOT EXISTS categories (urlID INTEGER PRIMARY KEY, title TEXT, movies INTEGER)");
// URL template for a genre page; "$id" is replaced with the numeric genre id
var BASE = "http://movies.netflix.com/WiAltGenre?agid=$id";
/**
 * Log in to Netflix and, once the session cookies are in place, request
 * every candidate genre page (ids 0 through 99999) via getGenre.
 *
 * Failures are thrown from inside the request callbacks, so they surface as
 * uncaught exceptions rather than being catchable by the caller.
 *
 * @param {string} email - account email sent in the login form
 * @param {string} password - account password sent in the login form
 * @param {Function} [callback] - accepted for signature compatibility; this
 *   function does not invoke it
 */
function login(email, password, callback) {
  // POST data we're going to send to Netflix to log in
  var data = {
    email: email,
    password: password
  };
  console.log("Logging in to Netflix with your info...");
  // We need an authorization code from the source of the login page before proceeding
  request("https://signup.netflix.com/Login", function(pageErr, pageResp, pageBody) {
    // without this check a network failure would hand an undefined body to cheerio
    if (pageErr) { throw pageErr; }
    // cheerio parses the raw HTML of the response into a jQuery-like object for easy parsing
    var $ = cheerio.load(pageBody);
    // we're specifically looking for an ID on the page we need to log in, which looks like this:
    // <input type="hidden" name="authURL" value="1388775312720.+vRSN6us+IhZ1qOSlo8CyAS/ZJ4=">
    var authURL = $("input[name='authURL']").attr("value");
    if (!authURL) {
      // posting the form without the token cannot succeed, so fail loudly here
      throw new Error("Could not find the authURL token on the Netflix login page");
    }
    data.authURL = authURL;
    request.post("https://signup.netflix.com/Login", { form: data }, function(postErr, postResp, postBody) {
      if (postErr) { throw postErr; }
      console.log("Successfully logged in.");
      // when you first log in, NF prompts you to use the default profile or create a new one
      // this function gets us through that interstitial page
      getProfileCookie(function() {
        for (var c = 0; c < 100000; c += 1) {
          getGenre(c);
        }
      });
    });
  });
}
// merely asking for the same URL twice seems to set this cookie
/**
 * Request the genre page for id 0 twice in a row, then invoke `callback`
 * with no arguments. An error from either request is thrown.
 *
 * @param {Function} callback - called after the second request completes
 */
function getProfileCookie(callback) {
  var profileUrl = BASE.replace("$id", 0);

  // rethrow a request error, if there is one
  function failOnError(requestErr) {
    if (requestErr) {
      throw requestErr;
    }
  }

  request(profileUrl, function(firstErr) {
    failOnError(firstErr);
    request(profileUrl, function(secondErr) {
      failOnError(secondErr);
      callback();
    });
  });
}
/**
 * Fetch the genre page for `id`, read its title and the number of movie
 * elements on the page, store them in the categories table and log them.
 * A request error is thrown from inside the request callback.
 *
 * @param {number} id - genre id substituted into the BASE url
 */
function getGenre(id) {
  request(BASE.replace("$id", id), function(err, response, body) {
    if (err) { throw err; }
    // declared locally: the bare assignment used before created an implicit
    // global shared by every in-flight callback
    var $ = cheerio.load(body);
    //uncomment if you want to cache the HTML responses. Make sure you make and set the cache directory
    //fs.writeFileSync("cache/" + id + ".html", body);
    var title = $(".crumb a").text(),
        // may fail on pagination, not certain
        movies = $(".agMovie").length;
    // INSERT OR IGNORE leaves an existing row for this urlID untouched
    db.run("INSERT OR IGNORE INTO categories (urlID, title, movies) VALUES (?, ?, ?)", [id, title, movies]);
    console.log(id, title, movies);
  });
}
// Entry point: start the login flow with the credentials read from the command line.
login(LOGIN, PASSWORD);

it doesn't work anymore :/

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment