Created
August 10, 2017 06:11
-
-
Save lucasjellema/061993cfaee25ffbf8e311c88dbc84b0 to your computer and use it in GitHub Desktop.
Scraping for Oracle OpenWorld 2017 Session Catalog (well, actually just calling the REST API)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
var request = require("request"); | |
var fs = require("fs"); | |
// Request options for the RainFocus search API. They are modelled on the network calls
// that the web application at https://events.rainfocus.com/catalog/oracle/oow17/catalogoow17
// makes to its backend API.
// `form.search` is only a placeholder here; getSessionData supplies the actual search
// term for every call.
const options = {
  method: 'POST',
  url: 'https://events.rainfocus.com/api/search',
  headers: {
    'cache-control': 'no-cache',
    'content-type': 'application/x-www-form-urlencoded',
  },
  form: {
    showMyInterest: 'false',
    size: '500',
    search: '',
    showEnrolled: 'false',
    type: 'session',
    rfWidgetId: 'NVQmYf90S4Dn2gZBDCIVYksWMc6ORlQ9',
    rfApiProfileId: 'lwEkJf6GCYTu72vvGPhIOtMGDYl3xTeT',
  },
};
// Delay in milliseconds between two consecutive requests.
const requestdelay = 500;
// Name of the file the collected session catalog is written to.
const oow2017Filename = 'oow2017-sessions-catalog.json';
// Global array that holds all session data returned to us.
// Declared with `let` (not `const`) so that code which reassigns it keeps working.
let sessions = [];
/**
 * Calls the search API described by the module-level `options` with the given search
 * term and appends the returned items to the global `sessions` array.
 *
 * The call is fire-and-forget: nothing is returned, and `sessions` is only updated
 * later, when the response callback runs.
 *
 * @param {string} search - Search term placed in the `search` form field.
 * @throws {Error} From inside the response callback when the request fails or the
 *   response body is not valid JSON.
 */
function getSessionData(search) {
  // Build a per-call copy so the shared `options` template (and its nested `form`)
  // is never mutated by an individual request.
  const callOptions = Object.assign({}, options, {
    form: Object.assign({}, options.form, { search: search }),
  });
  request(callOptions, function (error, response, body) {
    if (error) {
      // Rethrow the original error so its stack and properties are preserved.
      throw error instanceof Error ? error : new Error(error);
    }
    let results;
    try {
      results = JSON.parse(body);
    } catch (parseError) {
      throw new Error('Invalid JSON in response for search "' + search + '": ' + parseError.message);
    }
    // A search without hits may come back without a section or without items;
    // in that case there is nothing to add.
    const section = results && results.sectionList && results.sectionList[0];
    if (section && Array.isArray(section.items)) {
      sessions = sessions.concat(section.items);
    }
  });
}
// Convenience function to delay execution in a Promise style way
// (see https://medium.com/oracledevs/sequential-asynchronous-calls-in-node-js-using-callbacks-async-and-es6-promises-e92cc849de46).
/**
 * Returns a Promise that is resolved (without a value) by a timer after `t` milliseconds.
 * The Promise never rejects.
 *
 * @param {number} t - Time to wait, in milliseconds, as passed to `setTimeout`.
 * @returns {Promise<undefined>} Promise resolved once the timer fires.
 */
function delay(t) {
  return new Promise(function (resolve) {
    setTimeout(resolve, t);
  });
}
// Fetches the data for one call context. The search term is the session type directly
// followed by the first digit of the session id. This function does not wait itself:
// it starts the request as soon as it is invoked, so any delay has to be arranged by
// the caller. `ctr` is accepted but not used.
var delayedGetSessionData = function (ctr, type, firstDigit) {
  var searchTerm = type + firstDigit;
  getSessionData(searchTerm);
};
// Query the catalog once for every combination of session type
// (SUN, CON, TUT, GEN, BOF, HOL, SIG) and first digit of the session id (1..9).
// NOTE(review): presumably the catalog is fetched in slices because a single search
// returns at most `size` (500) items — confirm against the API.
const sessionTypes = ['SUN', 'CON', 'TUT', 'GEN', 'BOF', 'HOL', 'SIG'];
let ctr = 0;
for (const sessionType of sessionTypes) {
  for (let i = 1; i <= 9; i++) {
    // Delay each request by requestdelay milliseconds compared to its predecessor,
    // in order not to overload the backend server.
    const requestNumber = ctr++;
    // `then` must receive a function: passing the result of a call would start the
    // request immediately and defeat the throttling.
    delay(requestdelay * requestNumber).then(() => delayedGetSessionData(requestNumber, sessionType, i));
  }
}
// When all requests have been made and all responses have been received, the sessions
// variable is loaded with all details for all sessions and we can serialize it to file.
// Allow an arbitrary 2.5 seconds for the final request to complete.
// NOTE(review): this is a fixed wait, not a completion signal — responses that arrive
// later than that are not included in the file.
delay(2500 + requestdelay * ctr++).then(function () {
  // fs.writeFile is asynchronous and requires a callback; report the outcome only
  // once the write has actually finished.
  fs.writeFile(oow2017Filename, JSON.stringify(sessions, null, '\t'), function (err) {
    if (err) {
      console.error("Failed to write file " + oow2017Filename, err);
      return;
    }
    console.log("Written file " + oow2017Filename);
  });
});
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Initial attempt to web scrape server side with Node.js and cheerio; this failed because the HTML is not static but is instead largely generated in the browser by JavaScript.
const rp = require('request-promise'); | |
const cheerio = require('cheerio'); | |
// Address of the catalog page that is downloaded.
const baseURL = "https://events.rainfocus.com/catalog/oracle/oow17/catalogoow17?showEnrolled=false";
// Options for request-promise: fetch the page and hand the response body to cheerio,
// so that the promise resolves with the object returned by cheerio.load.
const options = {
  uri: baseURL,
  transform: (body) => cheerio.load(body),
};
// Download the catalog page and print the inner HTML of the list element.
// `$` is whatever the `transform` option produced from the response body.
rp(options)
  .then(($) => {
    // Inner HTML of the element whose class attribute is exactly "rf-list".
    const listHtml = $('ul[class=rf-list]').html();
    console.log("text" + listHtml);
  })
  .catch((err) => {
    console.log(err);
  });
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment