Skip to content

Instantly share code, notes, and snippets.

@soulcyon
Created November 22, 2012 22:48
Show Gist options
  • Save soulcyon/4133209 to your computer and use it in GitHub Desktop.
Save soulcyon/4133209 to your computer and use it in GitHub Desktop.
NJIT Course Schedule Scraper
<?php
/**
* NJIT Course Schedule Scraper (11/21/2012)
*
* Using http://courseschedules.njit.edu/, this PHP scraper uses SimpleHTMLDOM and cURL to build a
* large associative array of all courses. The user may restrict the scrape to particular subjects
* and/or courses through the onlySubject and onlyCourse methods. The resulting array holds each
* subject's name, each course's name and description, and the data for every section of the
* course.
*
* Example:
* $scraper = new NJITScrape("2013s", true);
* $scraper->onlySubject("CS");
* $scraper->onlyCourse("490");
* $courseData = $scraper->start();
*
* JSON Representation of $courseData will look like this:
* {
* "CS": {
* "name": "Computer Science",
* "490": {
* "name": "DESIGN IN SOFTWARE ENGR",
* "description": "Prerequisite: senior standing or departmental approval. This course
* focuses on the methodology for developing software systems. Methods and techniques
* for functional requirements analysis and specifications, design, coding, testing and
* proving integration and maintenance are discussed.",
* "102": {
* "call": 21239,
* "comments": "",
* "days": "F:600PM - 905PM",
* "rooms": "CKB207",
* "status": "Open",
* "maxStudents": 36,
* "currentStudents": 28,
* "instructor": "Nicholson Theod",
* "instructorId": "THEO",
* "credits": 3
* }
* }
* }
* }
*
* @author Sashank Tadepalli <dijjit@gmail.com>
* @license Creative Commons Attribution 3.0 Unported License.
* @version 1.0
* @link http://dijjit.com/php/njit-course-schedule-scraper/
*/
// SimpleHTMLDOM - Web Scraping Utility
// Please download from http://simplehtmldom.sourceforge.net/
require_once "simplehtmldom.php";
/**
 * Scrapes the NJIT course schedule site into a nested associative array.
 *
 * The result returned by start() is laid out as:
 *   $data[subjectId]["name"]                      subject name
 *   $data[subjectId][courseId]["name"]            course name
 *   $data[subjectId][courseId]["description"]     course description
 *   $data[subjectId][courseId][sectionId]         array of section fields
 *
 * onlySubject() / onlyCourse() restrict the scrape. Both filters are substring
 * matches (strpos), so onlyCourse("490") also matches an ID such as "490H".
 *
 * Requires the SimpleHTMLDOM functions file_get_html() and str_get_html() and
 * the cURL extension.
 */
class NJITScrape {
    /** Seconds allowed for establishing a connection. */
    const CONNECT_TIMEOUT = 15;
    /** Seconds allowed for one whole HTTP request. */
    const REQUEST_TIMEOUT = 60;

    private $page = "http://courseschedules.njit.edu/index.aspx";
    private $semester;
    private $subjectId;
    private $onlySubjectId = "";
    private $subjectName;
    private $courseId;
    private $onlyCourseId = "";
    private $courseName;
    private $data = array();
    private $FLUSH_DEBUG = true;

    /**
     * @param string $startSemester Semester code sent as the "semester" query parameter (e.g. "2013s").
     * @param bool   $flush         When true, progress is echoed as HTML and flushed while scraping.
     */
    public function __construct($startSemester, $flush){
        // A full scrape issues one request per course, so lift the execution time limit.
        set_time_limit(0);
        $this->FLUSH_DEBUG = $flush;
        $this->semester = $startSemester;
    }

    /**
     * Runs the scrape for the configured semester.
     *
     * @return array The collected data (see the class comment for its layout).
     */
    public function start(){
        // The semester is caller-supplied, so escape it before writing it into HTML.
        $this->debug("<h1>" . htmlspecialchars((string) $this->semester, ENT_QUOTES) . "</h1>");
        $this->scrapeSemester();
        return $this->data;
    }

    /**
     * Restricts the scrape to subjects whose ID contains $subjectId.
     *
     * @param string $subjectId Substring to look for; "" disables the filter.
     */
    public function onlySubject($subjectId){
        // Cast so that a non-string argument is never handed to strpos() as a needle.
        $this->onlySubjectId = (string) $subjectId;
    }

    /**
     * Restricts the scrape to courses whose ID contains $courseId.
     *
     * @param string $courseId Substring to look for; "" disables the filter.
     */
    public function onlyCourse($courseId){
        $this->onlyCourseId = (string) $courseId;
    }

    /**
     * Loads the semester page and scrapes every subject linked from it.
     */
    private function scrapeSemester(){
        $html = file_get_html($this->buildUrl(array()));
        if( !$html ){
            // file_get_html() returns false when the page cannot be loaded.
            return;
        }
        foreach($html->find("span a") as $ele){
            list($this->subjectId, $name) = $this->parseSubject($ele->plaintext);
            if( !$this->matchesFilter($this->subjectId, $this->onlySubjectId) ){
                continue;
            }
            $this->subjectName = $name;
            $this->data[$this->subjectId] = array("name" => $this->subjectName);
            $this->debug($this->subjectName . "<br />");
            $this->scrapeSubject();
        }
        // Release the parsed document; SimpleHTMLDOM trees are not freed reliably otherwise.
        $html->clear();
    }

    /**
     * Collects all courses of the current subject across every result page,
     * then scrapes the sections of each course that passes the course filter.
     */
    private function scrapeSubject(){
        $url = $this->buildUrl(array("subjectID" => $this->subjectId));
        $html = file_get_html($url);
        if( !$html ){
            return;
        }
        $courses = $this->extractCourses($html);

        // Build pagination query and the ASP.NET postback parameters.
        // The page count is derived from half the number of pager links plus one
        // (presumably the pager is rendered twice on the page -- TODO confirm).
        $pagecount = count($html->find(".pagination_wrapper a")) / 2 + 1;
        $vs = $this->fieldValue($html, "#__VIEWSTATE");
        $evt = $this->fieldValue($html, "#__EVENTVALIDATION");
        $html->clear();

        // Page 1 is already parsed; request pages 2..$pagecount through a faked postback.
        for ($i = 1; $i < $pagecount; $i++) {
            $pageHTML = $this->postRequest($url, array(
                "__EVENTARGUMENT" => "Page\$" . ($i + 1),
                "__EVENTTARGET" => "ctl10\$GridView1",
                "__VIEWSTATE" => $vs,
                "__VIEWSTATEENCRYPTED" => "",
                "__EVENTVALIDATION" => $evt,
                "__LASTFOCUS" => "",
                "ctl10\$ddlSemester" => $this->semester
            ));
            if( !$pageHTML ){
                continue;
            }
            $pageDom = str_get_html($pageHTML);
            if( !$pageDom ){
                continue;
            }
            $courses = array_merge($courses, $this->extractCourses($pageDom));
            $pageDom->clear();
        }

        foreach ($courses as $course) {
            $this->courseId = $course["id"];
            if( !$this->matchesFilter($this->courseId, $this->onlyCourseId) ){
                continue;
            }
            $this->courseName = $course["name"];
            $this->data[$this->subjectId][$this->courseId] = array("name" => $this->courseName);
            $courseUrl = $this->buildUrl(array(
                "subjectID" => $this->subjectId,
                "course" => $this->courseId
            ));
            $sectionHTML = $this->postRequest($courseUrl, array());
            if( !$sectionHTML ){
                continue;
            }
            $this->debug($this->courseName . "<br />");
            $this->scrapeSections($sectionHTML);
        }
    }

    /**
     * Stores the description and all section rows of the current course.
     *
     * @param string $sectionHTML Raw HTML of the course's section page.
     */
    private function scrapeSections($sectionHTML){
        $html = str_get_html($sectionHTML);
        if( !$html ){
            // Unparseable page: keep the record shape, but there is nothing to read.
            $this->data[$this->subjectId][$this->courseId]["description"] = "";
            return;
        }
        $descNode = $html->find("#ctl10_lblCourseDesc", 0);
        $this->data[$this->subjectId][$this->courseId]["description"] =
            $descNode ? trim($descNode->plaintext) : "";

        foreach ($html->find("#ctl10_gv_sectionTable .sectionRow") as $row) {
            $cells = $row->find("td");
            if( count($cells) < 8 ){
                // Cells 0-7 are dereferenced below; a shorter row cannot be a section row.
                continue;
            }
            // The days cell may carry a trailing "Section Comments:" part.
            $days = explode("Section Comments:", $cells[2]->plaintext);
            $instructor = trim($cells[7]->plaintext);
            $profileLink = $cells[7]->find("a", 0);

            $section = array();
            $section["call"] = intval(
                trim(str_replace("View Book Info", "", $cells[1]->plaintext))
            );
            $section["comments"] = (count($days) > 1) ? trim($days[1]) : "";
            $section["days"] = trim($days[0]);
            $section["rooms"] = trim($cells[3]->plaintext);
            $section["status"] = trim($cells[4]->plaintext);
            $section["maxStudents"] = intval(trim($cells[5]->plaintext));
            $section["currentStudents"] = intval(trim($cells[6]->plaintext));
            $section["instructor"] = $instructor;
            // The ID is what remains of the directory link; without a link fall back to the name.
            $section["instructorId"] = $profileLink
                ? trim(str_replace(
                    "https://directory.njit.edu/PersDetails.aspx?persid=", "", $profileLink->href
                ))
                : $instructor;
            $section["credits"] = isset($cells[8]) ? intval(trim($cells[8]->plaintext)) : 0;

            $this->data[$this->subjectId][$this->courseId][trim($cells[0]->plaintext)] = $section;
        }
        $html->clear();
    }

    /**
     * Performs an HTTP request with cURL: a POST when $post is non-empty, otherwise a GET.
     *
     * @param string $url  Target URL.
     * @param array  $post Form fields to send; an empty array means a plain GET.
     * @return string|false Response body, or false on a transport error or non-200 status.
     */
    private function postRequest($url, $post){
        $ch = curl_init();
        if( $ch === false ){
            return false;
        }
        curl_setopt($ch, CURLOPT_URL, $url);
        if (!empty($post)) {
            curl_setopt($ch, CURLOPT_POST, true);
            // Explicit "&" so the body does not depend on the arg_separator.output ini setting.
            curl_setopt($ch, CURLOPT_POSTFIELDS, http_build_query($post, "", "&"));
        }
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
        // The script runs without a time limit, so a stalled server must not hang it forever.
        curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, self::CONNECT_TIMEOUT);
        curl_setopt($ch, CURLOPT_TIMEOUT, self::REQUEST_TIMEOUT);
        $result = curl_exec($ch);
        $status = (int) curl_getinfo($ch, CURLINFO_HTTP_CODE);
        curl_close($ch);
        if ($result === false || $status !== 200) {
            return false;
        }
        return $result;
    }

    /**
     * Echoes a progress fragment and pushes it to the client when debug output is on.
     *
     * @param string $html Markup to print.
     */
    private function debug($html){
        if( !$this->FLUSH_DEBUG ){
            return;
        }
        echo $html;
        // ob_flush() complains when no buffer is active; test for one instead of silencing it.
        if( ob_get_level() > 0 ){
            ob_flush();
        }
        flush();
    }

    /**
     * Builds a schedule URL for the current semester plus extra query parameters.
     *
     * @param array $params Additional query parameters (name => value).
     * @return string Fully encoded URL.
     */
    private function buildUrl($params){
        $query = array_merge(array("semester" => (string) $this->semester), $params);
        return $this->page . "?" . http_build_query($query, "", "&");
    }

    /**
     * Tells whether $value passes a substring filter.
     *
     * @param string $value  Scraped ID.
     * @param string $filter Filter text; "" accepts everything.
     * @return bool
     */
    private function matchesFilter($value, $filter){
        return $filter === "" || strpos($value, $filter) !== false;
    }

    /**
     * Splits subject link text of the form "ID - Name" at the first dash.
     *
     * @param string $text Link text.
     * @return array Two elements: trimmed ID and trimmed name ("" when there is no dash).
     */
    private function parseSubject($text){
        // Limit 2 keeps hyphens that belong to the name.
        $parts = explode("-", $text, 2);
        return array(trim($parts[0]), isset($parts[1]) ? trim($parts[1]) : "");
    }

    /**
     * Extracts the course name from text of the form "<id> - <name> (<suffix>)".
     *
     * @param string $text Plain text of a .courseName element.
     * @return string Text between the first dash and the first "(", trimmed; "" when there is no dash.
     */
    private function parseCourseName($text){
        $parts = explode("-", $text, 2);
        if( count($parts) < 2 ){
            return "";
        }
        $beforeParen = explode("(", $parts[1]);
        return trim($beforeParen[0]);
    }

    /**
     * Reads the id/name pair of every course listed in a parsed subject page.
     *
     * Id and name are taken from the same .courseName element so they cannot
     * drift out of step with each other.
     *
     * @param object $html Parsed SimpleHTMLDOM document.
     * @return array List of array("id" => ..., "name" => ...).
     */
    private function extractCourses($html){
        $courses = array();
        foreach ($html->find(".courseName") as $ele) {
            $idNode = $ele->find("strong", 0);
            if( !$idNode ){
                continue;
            }
            $courses[] = array(
                // The <strong> text is the subject ID followed by the course ID.
                "id" => trim(str_replace($this->subjectId, "", $idNode->plaintext)),
                "name" => $this->parseCourseName($ele->plaintext)
            );
        }
        return $courses;
    }

    /**
     * Returns the trimmed value attribute of the first element matching $selector.
     *
     * @param object $html     Parsed SimpleHTMLDOM document.
     * @param string $selector CSS selector of a form field.
     * @return string The value, or "" when the element is absent.
     */
    private function fieldValue($html, $selector){
        $node = $html->find($selector, 0);
        return $node ? trim($node->value) : "";
    }
}
?>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment