Skip to content

Instantly share code, notes, and snippets.

@verespej
Last active December 29, 2015 05:09
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save verespej/7619547 to your computer and use it in GitHub Desktop.
Save verespej/7619547 to your computer and use it in GitHub Desktop.
Scrape course descriptions from the UW CSE course catalog page and store them as JSON. Implemented using Node.js.
var cheerio = require('cheerio');
var http = require('http');
var fs = require('fs');
var url = require('url');
// Catalog page to scrape; it is also passed to the parser as the base URL
// against which relative links are resolved.
var pageUrl = 'http://www.washington.edu/students/crscat/cse.html';

// Download the catalog page, buffer the whole body, then hand it to the parser.
http.get(pageUrl, function(res) {
  // A non-2xx response (redirect, error page, ...) is not the catalog itself,
  // so parsing it would only produce an empty or bogus result file.
  if (res.statusCode < 200 || res.statusCode >= 300) {
    console.error('Request for ' + pageUrl + ' failed with HTTP status ' + res.statusCode);
    res.resume(); // Discard the body so the connection can be released
    process.exitCode = 1;
    return;
  }

  res.setEncoding('utf8');
  var data = '';
  res.on('data', function(chunk) {
    data += chunk;
  });
  res.on('end', function() {
    parseCoursePage(cheerio.load(data), pageUrl);
  });
}).on('error', function(err) {
  // Without this listener a network failure (DNS, refused connection, ...)
  // is thrown as an unhandled 'error' event and crashes the process.
  console.error('Failed to fetch ' + pageUrl + ': ' + err.message);
  process.exitCode = 1;
});
/**
 * Split a course heading (the text that follows the course id) into its parts.
 *
 * The heading is expected to look like "Computer Programming I (4) NW, QSR":
 * a name, the credits in parentheses, then a comma-separated list of
 * areas-of-knowledge codes. When no "(...)" group is found, the whole heading
 * is treated as the name.
 *
 * @param {string} heading - Heading text with the course id already removed.
 * @returns {{name: string, credits: string, aok: string[]}} Trimmed parts;
 *     credits is 'unspecified' when the heading has no parenthesized group.
 */
function parseCourseHeading(heading) {
  var creditsStart = heading.indexOf('(');
  // Look for ')' only after '(' so a stray ')' earlier in the name cannot
  // prevent the credits from being recognized.
  var creditsEnd = creditsStart >= 0 ? heading.indexOf(')', creditsStart) : -1;
  if (creditsEnd < 0) {
    return { name: heading.trim(), credits: 'unspecified', aok: [] };
  }

  var aok = heading.substring(creditsEnd + 1).split(',')
    .map(function(code) {
      return code.trim();
    })
    .filter(function(code) {
      return code.length > 0;
    });

  return {
    // Cut exactly at '(' and trim, so the last character of the name survives
    // even when no space precedes the parenthesis.
    name: heading.substring(0, creditsStart).trim(),
    credits: heading.substring(creditsStart + 1, creditsEnd).trim(),
    aok: aok
  };
}

/**
 * Pull the prerequisite sentence out of a course description.
 *
 * @param {string} description - Full description text of one course.
 * @returns {string} Text between "Prerequisite:" and the first period that is
 *     not followed by a digit (so values like "2.0" are not cut), or 'None'.
 */
function extractPrerequisites(description) {
  var match = /Prerequisite:(.*?)\.(?!\d)/.exec(description);
  return match === null ? 'None' : match[1].trim();
}

/**
 * Parse every course on the catalog page and save the result to ./courses.json.
 *
 * Each `a[name]` element is treated as the anchor of one course: the anchor
 * text is the course id, its parent holds id + heading, and its grandparent
 * holds the whole course entry (heading, description and links).
 *
 * @param {Function} $ - Document loaded with cheerio.load().
 * @param {string} rootUrl - URL of the page, used to resolve relative links.
 */
function parseCoursePage($, rootUrl) {
  var entries = [];

  $('a[name]').each(function(index, elm) {
    var anchor = $(elm);
    var headingNode = anchor.parent();
    var courseNode = headingNode.parent();

    var courseId = anchor.text();
    var headingText = headingNode.text().substring(courseId.length);
    // The description is whatever follows the id and the full heading.
    var courseDescription = courseNode.text().substring(
      courseId.length + headingText.length
    );
    var heading = parseCourseHeading(headingText);

    // Collect the links that are direct children of the course entry,
    // resolved to absolute URLs.
    var courseLinks = [];
    courseNode.children('a').each(function(linkIndex, linkObj) {
      var href = $(linkObj).attr('href');
      if (href != null && href.length > 0) {
        courseLinks.push(url.resolve(rootUrl, href.trim()));
      }
    });

    var prereqs = extractPrerequisites(courseDescription);
    // Echo the parsed prerequisites of each course to stdout.
    console.log(prereqs);

    entries.push({
      id: courseId.trim(),
      name: heading.name,
      credits: heading.credits,
      aok: heading.aok,
      description: courseDescription.trim(),
      links: courseLinks,
      prereqs: prereqs
    });
  });

  saveResults(entries, './courses.json');
}
/**
 * Serialize the results as pretty-printed JSON (2-space indent) and write
 * them to a file as UTF-8.
 *
 * @param {*} results - Value to serialize; the scraper passes an array of
 *     course entries.
 * @param {string} dest - Path of the file to write.
 */
function saveResults(results, dest) {
  console.log('Saving results to ' + dest);
  var stream = fs.createWriteStream(dest, { encoding: 'utf8' });
  stream.on('error', function(err) {
    // Without a listener, an unwritable path surfaces as an unhandled
    // 'error' event and crashes the process.
    console.error('Failed to write ' + dest + ': ' + err.message);
    process.exitCode = 1;
  });
  // end() writes the final chunk and then closes the stream, so the file
  // descriptor is released instead of staying open until the process exits.
  stream.end(JSON.stringify(results, null, 2));
}
{
"name": "uw-cse-course-scraper",
"description": "Scrape UW CSE courses and put them in a structured format",
"author": "Hakon Verespej <hakon@madrona.com>",
"dependencies": {
"cheerio": "0.12.4"
},
"devDependencies": {
}
}
@verespej
Copy link
Author

UW CSE Course Scraper

This gist contains Node.js JavaScript code that scrapes and parses the UW CSE course content provided at http://www.washington.edu/students/crscat/cse.html. The result is a JSON document containing the parsed content in a structured format.

To use, run:

npm install
node app.js

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment