Skip to content

Instantly share code, notes, and snippets.

@nmanumr
Last active March 24, 2019 10:47
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save nmanumr/5f82e80dfa6a01a4707a5381c5e67d6e to your computer and use it in GitHub Desktop.
Save nmanumr/5f82e80dfa6a01a4707a5381c5e67d6e to your computer and use it in GitHub Desktop.
COMSATS CourseWare Course Inbrowser Scrapper
/**
* Comsats CourseWare Inbrowser Scrapper
*
* @license
* Copyright (c) 2018 Nauman Umer
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
* @author Nauman Umer (https://github.com/nmanumr)
*/
/**
 * In-browser scraper for COMSATS CourseWare (CCW).
 *
 * Downloads a course page through a CORS proxy, parses it with `DOMParser`,
 * and collects every lecture together with the resource links found on each
 * lecture page. Relies on the browser globals `XMLHttpRequest` and
 * `DOMParser`.
 */
class CcwScrapper {
  /**
   * @param {Object} [options]
   * @param {string|function(string): string} [options.proxyServer] Proxy used
   *   to avoid cross-origin issues. Either a string that is prefixed to the
   *   target URL, or a function that takes the target URL and returns the
   *   proxied URL. Default 'https://cors-anywhere.herokuapp.com/'.
   * @param {string} [options.courseId] Course ID used when `getLessons()` is
   *   called without an argument. Default 'Mg' (Islamic Studies).
   * @param {string} [options.baseUrl] Base URL of CCW.
   *   Default 'http://ccw.vcomsats.edu.pk'.
   * @param {string} [options.courseUrl] URL of the course page, without the
   *   `=<id>` part. Default baseUrl + '/Course.aspx?CID'.
   * @param {string} [options.lessonUrl] URL of the lesson page, without the
   *   `=<id>` part. Default baseUrl + '/Lesson.aspx?LID'.
   * @param {{log: function}} [options.logger] Logger; only `log` is called.
   *   Default `console`.
   */
  constructor(options) {
    // Shallow copy: defaults are filled in on our own object so the object
    // passed by the caller is never modified.
    const resolved = Object.assign({}, options);
    resolved.proxyServer =
      resolved.proxyServer || 'https://cors-anywhere.herokuapp.com/';
    resolved.baseUrl = resolved.baseUrl || 'http://ccw.vcomsats.edu.pk';
    resolved.courseUrl =
      resolved.courseUrl || `${resolved.baseUrl}/Course.aspx?CID`;
    resolved.lessonUrl =
      resolved.lessonUrl || `${resolved.baseUrl}/Lesson.aspx?LID`;
    resolved.courseId = resolved.courseId || 'Mg';
    resolved.logger = resolved.logger || console;
    this.options = resolved;
  }

  /**
   * Remove trailing base64 `=` padding from an ID.
   * @param {string} id
   * @returns {string} the ID without trailing `=` characters
   */
  static stripBase64Padding(id) {
    return String(id).replace(/=+$/, '');
  }

  /**
   * Pad an ID with `=` so that its length is a multiple of four, as base64
   * requires. Existing padding is normalised first, so both 'Mg' and 'Mg=='
   * yield 'Mg=='.
   * @param {string} id base64 ID, with or without padding
   * @returns {string} the padded ID
   */
  static padBase64Id(id) {
    const bare = CcwScrapper.stripBase64Padding(id);
    const missing = (4 - (bare.length % 4)) % 4;
    return bare + '='.repeat(missing);
  }

  /**
   * Extract the unpadded lesson ID from a lesson link.
   * @param {string} href value of the link's `href` attribute
   * @returns {string} the value of the `LID` query parameter without padding
   */
  static extractLessonId(href) {
    const match = /[?&]LID=([^&#]*)/i.exec(href);
    if (match) {
      return CcwScrapper.stripBase64Padding(match[1]);
    }
    // No LID parameter found: fall back to the fixed-offset extraction
    // ('Lesson.aspx?LID=' is 16 characters, followed by '<id>==').
    return href.slice(16, -2);
  }

  /**
   * Fetch a document through the configured proxy.
   * @param {string} url target URL (before proxying)
   * @returns {Promise<string>} resolves with the response text
   * @throws {Error} rejects on a network failure, an aborted or timed-out
   *   request, or a response whose HTTP status is outside 200-299
   */
  async fetchUrl(url) {
    // proxyServer can be a function that takes the URL and returns the
    // proxied URL; otherwise it is a prefix string.
    const target =
      typeof this.options.proxyServer === 'function'
        ? this.options.proxyServer(url)
        : this.options.proxyServer + url;

    return new Promise((resolve, reject) => {
      const request = new XMLHttpRequest();

      request.addEventListener('load', () => {
        if (request.status >= 200 && request.status < 300) {
          resolve(request.responseText);
          return;
        }
        reject(
          new Error(
            `Request to ${target} failed with HTTP status ${request.status}`
          )
        );
      });

      // Without these listeners a failed request would leave the promise
      // pending forever.
      request.addEventListener('error', () => {
        reject(new Error(`Network error while requesting ${target}`));
      });
      request.addEventListener('abort', () => {
        reject(new Error(`Request to ${target} was aborted`));
      });
      request.addEventListener('timeout', () => {
        reject(new Error(`Request to ${target} timed out`));
      });

      request.open('GET', target);
      request.setRequestHeader('x-requested-with', this.options['baseUrl']);
      request.send();
    });
  }

  /**
   * Scrape a course with all of its lecture resources.
   *
   * Lecture pages are fetched one after another, in page order.
   *
   * @param {string} [courseId] base64 encoded course ID, with or without
   *   `=` padding. Defaults to `options.courseId`.
   * @returns {Promise<Array<{id: string, title: string,
   *   resources: Array<{title: string, url: string}>}>>}
   */
  async getLessons(courseId) {
    const id = courseId == null ? this.options.courseId : courseId;

    // Fetch and parse the course page.
    const html = await this.fetchUrl(
      `${this.options.courseUrl}=${CcwScrapper.padBase64Id(id)}`
    );
    const dom = new DOMParser().parseFromString(html, 'text/html');
    const links = dom.querySelectorAll('.course-lession-list>li>a');
    this.options.logger.log(
      `Found ${links.length} Lectures. Fetch Resources ...`
    );

    const lectures = [];
    for (const link of links) {
      const href = link.getAttribute('href');
      if (!href) {
        this.options.logger.log('Skipping a lecture link without href.');
        continue;
      }

      // Fall back to the link itself when the expected title element is
      // missing. textContent is used because the parsed document is never
      // rendered.
      const titleNode = link.querySelector('div>div:first-child') || link;
      const lecture = {
        id: CcwScrapper.extractLessonId(href),
        title: titleNode.textContent.trim(),
      };

      this.options.logger.log(`Fetching "${lecture.title}" Resources.`);
      lecture.resources = await this.getLessonResources(lecture.id);
      this.options.logger.log(`Found ${lecture.resources.length} Resources.`);
      lectures.push(lecture);
    }
    return lectures;
  }

  /**
   * Fetch the resources of a given lesson.
   * @param {string} id base64 encoded lesson ID, with or without `=` padding
   * @returns {Promise<Array<{title: string, url: string}>>} link text and
   *   `href` of every resource link found on the lesson page
   */
  async getLessonResources(id) {
    const html = await this.fetchUrl(
      `${this.options.lessonUrl}=${CcwScrapper.padBase64Id(id)}`
    );
    const dom = new DOMParser().parseFromString(html, 'text/html');
    const links = dom.querySelectorAll(
      '.course-lession-item tr>td:nth-child(2)>a'
    );
    return Array.from(links, (link) => ({
      title: link.textContent,
      url: link.getAttribute('href'),
    }));
  }
}
// Example Usage
//
// var scrapper = new CcwScrapper();
// scrapper.getLessons('Mg').then(data=>{
// console.log(data);
// });
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment