Skip to content

Instantly share code, notes, and snippets.

@JasonGross
Last active November 3, 2020 17:50
Show Gist options
  • Save JasonGross/dfd89b18f934137ffcd8 to your computer and use it in GitHub Desktop.
Save JasonGross/dfd89b18f934137ffcd8 to your computer and use it in GitHub Desktop.
Scrapers for Splash classes for 2015, in a combination of python and javascript
<html>
<!--// run `python -m SimpleHTTPServer 8000`, and visit http://localhost:8000/html-scrape-classes.html-->
<!--// run `python3 -m http.server 8000`, and visit http://localhost:8000/html-scrape-classes.html-->
<head>
<script type="text/javascript">
// Follow the instructions at https://developers.google.com/google-apps/calendar/quickstart/js, using this file in place of the quickstart file.
// Run the commented-out JavaScript below in the Chrome console on https://esp.mit.edu/teach/Splash/2015/teacherreg, then paste its output into CLASSES further down.
/*
var forPython = false;
var splashYear;// = 2016;
var splashMonth;// = 3;
var splashFirstDay;// = 12;
// http://stackoverflow.com/a/7244288/377022
function ISODateString(d) {
function pad(n) {
return n < 10 ? '0' + n : n
}
return d.getUTCFullYear() + '-' + pad(d.getUTCMonth() + 1) + '-' + pad(d.getUTCDate()) + 'T' + pad(d.getUTCHours()) + ':' + pad(d.getUTCMinutes()) + ':' + pad(d.getUTCSeconds()) + 'Z'
}
// http://stackoverflow.com/a/13566675/377022
function getMonthFromString(mon){
return new Date(Date.parse(mon +" 1, 2012")).getMonth()+1
}
function getHTMLAndTextOfURL(url, cb) {
var ifr = document.createElement("iframe");
ifr.src = url;
ifr.style = "display:none;";
ifr.onload = function () {
var html = ifr.contentWindow.document.body.innerHTML;
var text = ifr.contentWindow.document.body.innerText;
var doc = ifr.contentWindow.document;
document.body.removeChild(ifr);
cb(html, text, doc);
}
document.body.appendChild(ifr);
}
function makeEventObject(className, room, time, description) {
var timeSplitReg = /([A-Za-z]*)\s*([0-9]*):([0-9]*)([ap]m)[-\s]*([0-9]*):([0-9]*)([ap]m)/;
if (time.match(timeSplitReg) !== null) {
var timeParts = time.match(timeSplitReg);
var day = splashFirstDay;
if (timeParts[1] == 'Sun') {
day += 1;
} else if (timeParts[1] != 'Sat') {
console.log('Warning: Unrecognized Day: ' + timeParts[1] + ' for ' + className);
}
var startHours = parseInt(timeParts[2]);
var startMinutes = parseInt(timeParts[3]);
var endHours = parseInt(timeParts[5]);
var endMinutes = parseInt(timeParts[6]);
if (timeParts[4] == 'pm') {
if (startHours < 12) startHours += 12;
} else if (timeParts[4] != 'am') {
console.log('Warning: Unrecognized Time Modifier: ' + timeParts[4] + ' for ' + className);
}
if (timeParts[7] == 'pm') {
if (endHours < 12) endHours += 12;
} else if (timeParts[7] != 'am') {
console.log('Warning: Unrecognized Time Modifier: ' + timeParts[7] + ' for ' + className);
}
var start = new Date(splashYear, splashMonth - 1, day, startHours, startMinutes, 0, 0);
var end = new Date(splashYear, splashMonth - 1, day, endHours, endMinutes, 0, 0);
return {
'kind': 'calendar#event',
'location': room,
'summary': className,
'description': description,
'end': {
'dateTime': ISODateString(end)
},
'start': {
'dateTime': ISODateString(start)
}
};
} else {
console.log('Warning: Unrecognized Time: ' + time + ' for ' + className);
return [className, room, time];
}
}
function makeLink(event) {
return 'https://www.google.com/calendar/event?action=TEMPLATE&dates=' +
encodeURIComponent(event['start']['dateTime'].replace(/[:-]/g, '') + '/' + event['end']['dateTime'].replace(/[:-]/g, '')) +
'&text=' + encodeURIComponent(event['summary']) +
'&location=' + encodeURIComponent(event['location']) +
'&details=' + encodeURIComponent(event['description']);
}
function makeButton(event) {
var url = makeLink(event);
var btn = document.createElement("a");
btn.className = "abutton";
btn.href = url;
btn.innerText = "Add to Google Calendar";
return btn;
}
function addCalendarLink(event, row) {
var btn = makeButton(event);
var cell = row.getElementsByClassName('clsright')[0];
var existing = cell.getElementsByTagName('a');
for (var i = 0; i < existing.length; i++) {
if (existing[i].innerText == btn.innerText) {
cell.removeChild(existing[i]);
}
}
cell.appendChild(btn);
}
function getInnerText(elem, defaultv) {
if (elem !== undefined) {
return elem.innerText;
}
return defaultv;
}
function doMain() {
var roomReg = /Room:\s*((?:Zoom Room ?)[0-9-]+)/;
var timeReg = /Time:\s*([A-Za-z\s]*[0-9:\s]*[ap]m[\s-]*[0-9:\s]*[ap]m)/;
var rows = document.getElementsByTagName('tr');
var classes = [];
var className = null;
var longClassName = null;
var classDescription = null;
function doNext(i) {
if (i < rows.length) {
var names = rows[i].getElementsByClassName('classname');
if (names.length == 1) {
className = names[0].innerText.replace('»', '');
var fragment = document.createElement('div');
fragment.innerHTML = names[0].innerHTML;
console.log(className);
getHTMLAndTextOfURL(fragment.getElementsByTagName('a')[0].href, function (html, text, doc) {
classDescription = getInnerText(doc.getElementsByClassName('class_content')[0], "").replace(/^\s+/g,'').replace(/\s+$/g,'').replace(/\n +/g, '');
longClassName = getInnerText(doc.getElementsByClassName('class_title')[0], className).replace(/^\s+/g,'').replace(/\s+$/g,'');
console.log(longClassName);
doNext(i + 1);
});
} else if (rows[i].innerText.match(roomReg) !== null && className !== null) {
var theRoom = rows[i].innerText.match(roomReg)[1];
var theTime = rows[i].innerText.match(timeReg)[1];
// console.log(theRoom);
// console.log(theTime);
var event = makeEventObject(longClassName, theRoom, theTime, classDescription);
classes[classes.length] = event;
addCalendarLink(event, rows[i]);
doNext(i + 1);
} else {
doNext(i + 1);
}
} else {
var result = JSON.stringify(classes);
if (forPython) {
result = result.replace(/":"/g, '":u"');
}
console.log('CLASSES = ' + result);
}
};
doNext(0);
}
function main0() {
var datesReg = /Dates: [A-Za-z ]*([0-9]+) will be on Saturday, ([A-Za-z]+) ([0-9]+)/;
var datesReg2 = /[A-Za-z]+ ([0-9]+) will run (Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?|Jul(?:y)?|Aug(?:ust)?|Sep(?:tember)?|Oct(?:ober)?|(?:Nov|Dec)(?:ember)?) ([0-9]+)/;
getHTMLAndTextOfURL(window.location.href.replace(/[0-9]+\/teacherreg/, "moreinfo.html"), function handleInfo(html, text) {
var datesMatch = text.match(datesReg);
if (datesMatch === null) datesMatch = text.match(datesReg2);
console.log(datesMatch);
splashYear = parseInt(datesMatch[1]);
splashMonth = getMonthFromString(datesMatch[2]);
splashFirstDay = parseInt(datesMatch[3]);
doMain();
});
}
main0();
*/
// paste output here:
// CLASSES is the array of calendar event resources (kind, location, summary,
// description, start/end dateTime) that addClasses()/addClass() walk through.
var CLASSES = [{"kind":"calendar#event","location":"Zoom Room 14","summary":"14173: Proving Löb's Theorem","description":"","end":{"dateTime":"2020-11-15T16:55:00Z"},"start":{"dateTime":"2020-11-15T16:05:00Z"}},{"kind":"calendar#event","location":"Zoom Room 18","summary":"14172: Linear Logic","description":"","end":{"dateTime":"2020-11-15T18:55:00Z"},"start":{"dateTime":"2020-11-15T18:05:00Z"}},{"kind":"calendar#event","location":"Zoom Room 18","summary":"14170: Circling - Intersubjective Mindfulness Meditation","description":"","end":{"dateTime":"2020-11-14T20:55:00Z"},"start":{"dateTime":"2020-11-14T19:05:00Z"}},{"kind":"calendar#event","location":"Zoom Room 18","summary":"14169: Making deep friendships - Circling","description":"","end":{"dateTime":"2020-11-14T22:55:00Z"},"start":{"dateTime":"2020-11-14T21:05:00Z"}}]
// Your Client ID can be retrieved from your project in the Google
// Developer Console, https://console.developers.google.com
var CLIENT_ID = '683712212268-1il3ta3ubsnutnet6ig8m6ngj0j9mb1f.apps.googleusercontent.com';
// OAuth scopes passed to gapi.auth.authorize by checkAuth() and handleAuthClick().
var SCOPES = ["https://www.googleapis.com/auth/calendar"];
/**
 * Ask the authorization server, without prompting (immediate mode), whether
 * the current user has already granted this application access. The outcome
 * is delivered to handleAuthResult.
 */
function checkAuth() {
  var request = {};
  request['client_id'] = CLIENT_ID;
  request['scope'] = SCOPES.join(' ');
  request['immediate'] = true;
  gapi.auth.authorize(request, handleAuthResult);
}
/**
 * React to the authorization server's answer: on success hide the
 * authorize prompt and load the Calendar client; otherwise reveal the
 * prompt so the user can start authorization by clicking the button.
 *
 * @param {Object} authResult Authorization result.
 */
function handleAuthResult(authResult) {
  var prompt = document.getElementById('authorize-div');
  var authorized = Boolean(authResult) && !authResult.error;
  if (!authorized) {
    // Not (yet) authorized: expose the authorize button.
    prompt.style.display = 'inline';
    return;
  }
  // Authorized: the prompt is no longer needed.
  prompt.style.display = 'none';
  loadCalendarApi();
}
/**
 * Start the interactive (non-immediate) authorization flow when the user
 * clicks the authorize button. The outcome is delivered to handleAuthResult.
 *
 * @param {Event} event Button click event.
 * @return {boolean} Always false.
 */
function handleAuthClick(event) {
  var request = { client_id: CLIENT_ID, scope: SCOPES, immediate: false };
  gapi.auth.authorize(request, handleAuthResult);
  return false;
}
/**
 * Load version 3 of the Google Calendar client library and hand
 * addClasses to the loader as its completion callback.
 */
function loadCalendarApi() {
  var client = gapi.client;
  client.load('calendar', 'v3', addClasses);
}
/**
 * Append one line of text to the element with id "output": the message
 * followed by a newline, added as a text node.
 *
 * @param {string} message Text to append.
 */
function appendPre(message) {
  var target = document.getElementById('output');
  var line = document.createTextNode(message + '\n');
  target.appendChild(line);
}
/**
 * Add CLASSES[i] to the user's primary calendar unless it already seems to
 * be there.
 *
 * Existing events overlapping the class's start/end window are listed first.
 * If one of them has a summary whose first six characters equal the first six
 * characters of the class summary, the class is considered already present
 * and nothing is inserted. Progress and failures are reported via appendPre.
 *
 * @param {number} i Index into CLASSES.
 */
function addClass(i) {
  var classEvent = CLASSES[i];
  // The scraper emits a plain [className, room, time] array when it cannot
  // parse a time; such entries have no start/end and cannot be inserted.
  if (!classEvent || !classEvent['start'] || !classEvent['end']) {
    appendPre('Skipping entry ' + i + ': not a calendar event with start and end times');
    return;
  }
  var classSummary = String(classEvent['summary']);
  var classPrefix = classSummary.substring(0, 6);
  var eventRequest = gapi.client.calendar.events.list({
    'calendarId': 'primary',
    'timeMin': classEvent['start']['dateTime'],
    'timeMax': classEvent['end']['dateTime'],
    'maxResults': 10,
    'singleEvents': true,
    'orderBy': 'startTime'
  });
  eventRequest.execute(function (resp) {
    if (!resp || resp.error) {
      // Without the list we cannot tell whether the event exists, so do not
      // risk inserting a duplicate.
      var reason = resp && resp.error && resp.error.message ? ': ' + resp.error.message : '';
      appendPre('Could not list existing events for ' + classSummary + reason + '; not adding event');
      return;
    }
    var events = resp.items || [];
    for (var j = 0; j < events.length; j++) {
      var existingEvent = events[j];
      // Untitled calendar events carry no summary; they can never match.
      if (typeof existingEvent.summary !== 'string') {
        continue;
      }
      if (existingEvent.summary.substring(0, 6) == classPrefix) {
        appendPre('Found existing event ' + existingEvent.summary + ' which seems to match event ' + classEvent['summary'] + '; not adding event');
        return;
      }
    }
    gapi.client.calendar.events.insert({
      'calendarId': 'primary',
      'resource': classEvent
    }).execute(function (event) {
      if (!event || event.error) {
        var why = event && event.error && event.error.message ? ': ' + event.error.message : '';
        appendPre('Failed to create event for ' + classSummary + why);
        return;
      }
      appendPre('Event created for ' + event.summary + ': ' + event.htmlLink);
    });
  });
}
/**
 * Call addClass once for every index of CLASSES, in ascending order.
 */
function addClasses() {
  var index = 0;
  while (index < CLASSES.length) {
    addClass(index);
    index += 1;
  }
}
</script>
<script src="https://apis.google.com/js/client.js?onload=checkAuth">
</script>
</head>
<body>
<div id="authorize-div" style="display: none">
<span>Authorize access to Google Calendar API</span>
<!--Button for the user to click to initiate auth sequence -->
<button id="authorize-button" onclick="handleAuthClick(event)">
Authorize
</button>
</div>
<pre id="output"></pre>
</body>
</html>
// When true, doMain() rewrites every `":"` in its JSON output to `":u"`
// before logging it.
var forPython = false;
// Event year, 1-based month and day-of-month of the first day. All three are
// assigned by main0() and read by makeEventObject().
var splashYear;// = 2016;
var splashMonth;// = 3;
var splashFirstDay;// = 12;
// http://stackoverflow.com/a/7244288/377022
/**
 * Format a Date as a UTC timestamp of the form YYYY-MM-DDTHH:MM:SSZ
 * (no milliseconds). Every field except the year is zero-padded to two digits.
 *
 * @param {Date} d The instant to format.
 * @return {string} The UTC timestamp.
 */
function ISODateString(d) {
  var twoDigits = function (value) {
    if (value < 10) {
      return '0' + value;
    }
    return value;
  };
  var datePart = [
    d.getUTCFullYear(),
    twoDigits(d.getUTCMonth() + 1),
    twoDigits(d.getUTCDate())
  ].join('-');
  var timePart = [
    twoDigits(d.getUTCHours()),
    twoDigits(d.getUTCMinutes()),
    twoDigits(d.getUTCSeconds())
  ].join(':');
  return datePart + 'T' + timePart + 'Z';
}
// http://stackoverflow.com/a/13566675/377022
/**
 * Turn a month name into its 1-based month number by letting the Date
 * parser read "<mon> 1, 2012" and taking the month of the result.
 *
 * @param {string} mon Month name, e.g. "November" or "Nov".
 * @return {number} Month number, 1 for January through 12 for December.
 */
function getMonthFromString(mon){
  var firstOfMonth = new Date(Date.parse(mon + " 1, 2012"));
  return firstOfMonth.getMonth() + 1;
}
/**
 * Load a URL in a hidden iframe appended to this page. Once it has loaded,
 * remove the iframe and pass the loaded page's body HTML, body text and
 * document object to the callback.
 *
 * @param {string} url Address to load.
 * @param {function(string, string, Document)} cb Called with (html, text, doc).
 */
function getHTMLAndTextOfURL(url, cb) {
  var frame = document.createElement("iframe");
  frame.src = url;
  frame.style = "display:none;";
  frame.onload = function onFrameLoaded() {
    var loadedDoc = frame.contentWindow.document;
    var bodyHtml = loadedDoc.body.innerHTML;
    var bodyText = loadedDoc.body.innerText;
    // Detach the frame before handing control to the callback.
    document.body.removeChild(frame);
    cb(bodyHtml, bodyText, loadedDoc);
  };
  document.body.appendChild(frame);
}
/**
 * Build a calendar event resource for one class section.
 *
 * The time string is expected to look like "Sat 1:05pm--2:55pm": a day name,
 * then start and end times on a 12-hour clock. The date comes from the
 * globals splashYear, splashMonth (1-based) and splashFirstDay; "Sun" means
 * the day after splashFirstDay, any other day name is treated as the first
 * day (with a warning unless it is "Sat"). Times are interpreted in the
 * browser's local time zone and emitted through ISODateString.
 *
 * @param {string} className Event summary.
 * @param {string} room Event location.
 * @param {string} time Day and time range text.
 * @param {string} description Event description.
 * @return {Object|Array} The event resource, or [className, room, time] when
 *     the time text cannot be parsed (a warning is logged in that case).
 */
function makeEventObject(className, room, time, description) {
  var timeSplitReg = /([A-Za-z]*)\s*([0-9]*):([0-9]*)([ap]m)[-\s]*([0-9]*):([0-9]*)([ap]m)/;

  // 12-hour clock to 0-23: 12am is midnight (0) and 12pm stays 12. The
  // pattern above only admits "am" or "pm" as the modifier.
  var to24Hour = function (hours, modifier) {
    if (modifier == 'pm') {
      return hours < 12 ? hours + 12 : hours;
    }
    return hours == 12 ? 0 : hours;
  };

  var timeParts = time.match(timeSplitReg);
  var startHours = NaN;
  var startMinutes = NaN;
  var endHours = NaN;
  var endMinutes = NaN;
  if (timeParts !== null) {
    startHours = parseInt(timeParts[2], 10);
    startMinutes = parseInt(timeParts[3], 10);
    endHours = parseInt(timeParts[5], 10);
    endMinutes = parseInt(timeParts[6], 10);
  }
  // The digit groups in the pattern may be empty, which parses to NaN; treat
  // that the same as no match instead of emitting an invalid date.
  if (isNaN(startHours) || isNaN(startMinutes) || isNaN(endHours) || isNaN(endMinutes)) {
    console.log('Warning: Unrecognized Time: ' + time + ' for ' + className);
    return [className, room, time];
  }

  var day = splashFirstDay;
  if (timeParts[1] == 'Sun') {
    day += 1;
  } else if (timeParts[1] != 'Sat') {
    console.log('Warning: Unrecognized Day: ' + timeParts[1] + ' for ' + className);
  }

  startHours = to24Hour(startHours, timeParts[4]);
  endHours = to24Hour(endHours, timeParts[7]);

  // Date months are 0-based while splashMonth is 1-based.
  var start = new Date(splashYear, splashMonth - 1, day, startHours, startMinutes, 0, 0);
  var end = new Date(splashYear, splashMonth - 1, day, endHours, endMinutes, 0, 0);
  return {
    'kind': 'calendar#event',
    'location': room,
    'summary': className,
    'description': description,
    'end': {
      'dateTime': ISODateString(end)
    },
    'start': {
      'dateTime': ISODateString(start)
    }
  };
}
/**
 * Build an "add event" template URL for Google Calendar from an event
 * resource. Colons and dashes are stripped from the start/end dateTime
 * strings, and every value is URI-encoded.
 *
 * @param {Object} event Event with start.dateTime, end.dateTime, summary,
 *     location and description.
 * @return {string} The template URL.
 */
function makeLink(event) {
  var compactStart = event['start']['dateTime'].replace(/[:-]/g, '');
  var compactEnd = event['end']['dateTime'].replace(/[:-]/g, '');
  var query = [
    'action=TEMPLATE',
    'dates=' + encodeURIComponent(compactStart + '/' + compactEnd),
    'text=' + encodeURIComponent(event['summary']),
    'location=' + encodeURIComponent(event['location']),
    'details=' + encodeURIComponent(event['description'])
  ];
  return 'https://www.google.com/calendar/event?' + query.join('&');
}
/**
 * Create an anchor element, styled with class "abutton" and labelled
 * "Add to Google Calendar", that points at the event's makeLink URL.
 *
 * @param {Object} event Event resource accepted by makeLink.
 * @return {Element} The new, unattached anchor.
 */
function makeButton(event) {
  var target = makeLink(event);
  var anchor = document.createElement("a");
  anchor.className = "abutton";
  anchor.href = target;
  anchor.innerText = "Add to Google Calendar";
  return anchor;
}
/**
 * Put an "Add to Google Calendar" link for the event into the row's first
 * element of class "clsright", replacing any such links left there before.
 *
 * @param {Object} event Event resource accepted by makeButton.
 * @param {Element} row Table row to decorate.
 */
function addCalendarLink(event, row) {
  var btn = makeButton(event);
  var cell = row.getElementsByClassName('clsright')[0];
  if (cell === undefined) {
    console.log('Warning: No clsright cell to hold the calendar link for ' + event['summary']);
    return;
  }
  var existing = cell.getElementsByTagName('a');
  // The collection is live: it shrinks as anchors are removed. Walking it
  // backwards keeps the remaining indices valid so no stale link is skipped.
  for (var i = existing.length - 1; i >= 0; i--) {
    var link = existing[i];
    if (link.innerText == btn.innerText && link.parentNode) {
      // The anchor may be nested inside the cell, so detach it from its own parent.
      link.parentNode.removeChild(link);
    }
  }
  cell.appendChild(btn);
}
/**
 * Return an element's innerText, or a fallback when there is no element.
 *
 * @param {Element|undefined|null} elem Element to read; undefined (e.g. a
 *     missing index of a collection) or null means "no element".
 * @param {*} defaultv Value returned when elem is undefined or null.
 * @return {*} elem.innerText, or defaultv.
 */
function getInnerText(elem, defaultv) {
  if (elem === undefined || elem === null) {
    return defaultv;
  }
  return elem.innerText;
}
/**
 * Walk every table row of the current page in order and collect one calendar
 * event per class section.
 *
 * A row containing exactly one element of class "classname" starts a new
 * class: the page linked from that element is fetched to obtain the long
 * title (class "class_title") and description (class "class_content").
 * A later row whose text has both "Room:" and "Time:" entries becomes an
 * event for the current class and gets an "Add to Google Calendar" link.
 * Rows are processed strictly one after another because the fetch is
 * asynchronous. When all rows are done, "CLASSES = <json>" is logged.
 */
function doMain() {
  var roomReg = /Room:\s*((?:Zoom Room ?)[0-9-]+)/;
  var timeReg = /Time:\s*([A-Za-z\s]*[0-9:\s]*[ap]m[\s-]*[0-9:\s]*[ap]m)/;
  var rows = document.getElementsByTagName('tr');
  var classes = [];
  var className = null;
  var longClassName = null;
  var classDescription = null;

  // Log the collected events as a paste-able assignment.
  function reportResult() {
    var result = JSON.stringify(classes);
    if (forPython) {
      result = result.replace(/":"/g, '":u"');
    }
    console.log('CLASSES = ' + result);
  }

  function doNext(i) {
    if (i >= rows.length) {
      reportResult();
      return;
    }
    var row = rows[i];
    var names = row.getElementsByClassName('classname');
    if (names.length == 1) {
      className = names[0].innerText.replace('»', '');
      var fragment = document.createElement('div');
      fragment.innerHTML = names[0].innerHTML;
      console.log(className);
      var link = fragment.getElementsByTagName('a')[0];
      if (link === undefined) {
        // No class page to consult: fall back to what this row offers.
        console.log('Warning: No class page link for ' + className);
        longClassName = className;
        classDescription = '';
        doNext(i + 1);
        return;
      }
      getHTMLAndTextOfURL(link.href, function (html, text, doc) {
        classDescription = getInnerText(doc.getElementsByClassName('class_content')[0], "").replace(/^\s+/g,'').replace(/\s+$/g,'').replace(/\n +/g, '');
        longClassName = getInnerText(doc.getElementsByClassName('class_title')[0], className).replace(/^\s+/g,'').replace(/\s+$/g,'');
        console.log(longClassName);
        doNext(i + 1);
      });
      return;
    }
    var rowText = row.innerText;
    var roomMatch = rowText.match(roomReg);
    if (roomMatch !== null && className !== null) {
      var timeMatch = rowText.match(timeReg);
      if (timeMatch === null) {
        // A room without a parseable time must not abort the whole scrape.
        console.log('Warning: No Time found next to room ' + roomMatch[1] + ' for ' + className);
      } else {
        var event = makeEventObject(longClassName, roomMatch[1], timeMatch[1], classDescription);
        classes[classes.length] = event;
        // makeEventObject returns a plain array when it cannot parse the
        // time; there is nothing to link to in that case.
        if (!Array.isArray(event)) {
          addCalendarLink(event, row);
        }
      }
    }
    doNext(i + 1);
  }
  doNext(0);
}
/**
 * Entry point of the scraper. Derives the "moreinfo.html" address from the
 * current page's URL, fetches it, reads the event's year, month and first
 * day from its text into splashYear / splashMonth / splashFirstDay, and then
 * runs doMain. If neither date pattern matches, a warning is logged and the
 * scrape is not started.
 */
function main0() {
  var datesReg = /Dates: [A-Za-z ]*([0-9]+) will be on Saturday, ([A-Za-z]+) ([0-9]+)/;
  var datesReg2 = /[A-Za-z]+ ([0-9]+) will run (Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?|Jul(?:y)?|Aug(?:ust)?|Sep(?:tember)?|Oct(?:ober)?|(?:Nov|Dec)(?:ember)?) ([0-9]+)/;
  var infoUrl = window.location.href.replace(/[0-9]+\/teacherreg/, "moreinfo.html");
  getHTMLAndTextOfURL(infoUrl, function handleInfo(html, text) {
    var datesMatch = text.match(datesReg);
    if (datesMatch === null) datesMatch = text.match(datesReg2);
    console.log(datesMatch);
    if (datesMatch === null) {
      // Without the dates no event can be placed on the calendar.
      console.log('Warning: Could not find the event dates on ' + infoUrl + '; not scraping classes');
      return;
    }
    // Both patterns capture (year, month name, day) in that order.
    splashYear = parseInt(datesMatch[1], 10);
    splashMonth = getMonthFromString(datesMatch[2]);
    splashFirstDay = parseInt(datesMatch[3], 10);
    doMain();
  });
}
// Start the scrape as soon as this script is evaluated.
main0();
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment