Skip to content

Instantly share code, notes, and snippets.

@tmpvar
Forked from itissid/gist:709246
Created June 19, 2011 03:03
Show Gist options
  • Save tmpvar/1033715 to your computer and use it in GitHub Desktop.
Save tmpvar/1033715 to your computer and use it in GitHub Desktop.
jsdom port
// To use: save this file and run `node <filename.js>`.
// There are two URLs at the bottom. The test case uses the URL pointing to Sun's Javadoc, a page that makes heavy use of iframes.
// Expected: the page's HTML text is processed and dumped to the terminal.
// Actual: an error is printed to the terminal, with the following backtrace:
/*
/home/sid/opt/lib/node/.npm/jsdom/0.1.20/package/lib/jsdom/level2/html.js:1400
this._contentDocument = new HTMLDocument();
^
ReferenceError: HTMLDocument is not defined
at Object.contentDocument (/home/sid/opt/lib/node/.npm/jsdom/0.1.20/package/lib/jsdom/level2/html.js:1400:9)
at Object.<anonymous> (/media/dev/workspace/nodejs/scrobblepage.js:27:38)
at Function.each (evalmachine.<anonymous>:30:151)
at Object.each (evalmachine.<anonymous>:24:147)
at DocumentSaver.processRecursive (/media/dev/workspace/nodejs/scrobblepage.js:21:22)
at Object.<anonymous> (/media/dev/workspace/nodejs/scrobblepage.js:33:12)
at Function.each (evalmachine.<anonymous>:30:151)
at Object.each (evalmachine.<anonymous>:24:147)
at DocumentSaver.processRecursive (/media/dev/workspace/nodejs/scrobblepage.js:21:22)
at Object.<anonymous> (/media/dev/workspace/nodejs/scrobblepage.js:33:12)
*/
/**
 * Collects the text of an HTML document by walking its element tree with a
 * caller-supplied jQuery function.
 *
 * Usage: new DocumentSaver().processDocument(document, callback, jQuery)
 */
function DocumentSaver() {
  // Text gathered so far; reset at the start of every processDocument() run.
  this.textContent = '';

  // Upper-case tag names whose whole subtree is skipped during the walk.
  this.ignoredTypes = [
    'SCRIPT', 'TITLE', 'META', 'STYLE', 'LINK', 'CANVAS', 'VIDEO', 'AUDIO',
    'AREA', 'IMG', 'MAP', 'EMBED', 'OBJECT', 'PARAM', 'SOURCE', 'DEVICE',
    'NOSCRIPT', 'HEAD'
  ];

  // jQuery function used for traversal; assigned by processDocument().
  this.$ = null;

  /**
   * Array.prototype.indexOf with a manual fallback for hosts that lack it.
   *
   * @param {Array} arr - array to search
   * @param {*} item - value to look for (compared with ===)
   * @param {number} [from] - start index; negative counts from the end
   * @returns {number} index of the first match, or -1
   */
  this.indexOf = function(arr, item, from) {
    if (arr.indexOf) return arr.indexOf(item, from);
    var len = arr.length;
    for (var i = (from < 0) ? Math.max(0, len + from) : from || 0; i < len; i++) {
      if (arr[i] === item) return i;
    }
    return -1;
  };

  /**
   * Depth-first walk over the element children of rootNode. For every
   * non-ignored element, its descendants are processed first and then the
   * element's own direct text nodes are appended to this.textContent.
   * The text directly inside rootNode itself is not collected.
   *
   * @param {Object} rootNode - element whose children are walked
   */
  this.processRecursive = function(rootNode) {
    var self = this;
    this.$(rootNode).children().each(function() {
      if (!this.tagName || self.indexOf(self.ignoredTypes, this.tagName.toUpperCase()) != -1) {
        return;
      }
      self.processRecursive(this);
      // Only direct text-node children (nodeType 3): text of nested elements
      // was already gathered by the recursive call above.
      var ownText = self.$(this)
        .contents()
        .filter(function() {
          return this.nodeType == 3;
        })
        .text();
      var trimmed = self.$.trim(ownText);
      if (trimmed != '') {
        self.textContent = ' ' + self.textContent + ' ' + trimmed;
      }
    });
  };

  /**
   * Extracts the text of doc and hands it to callback.
   *
   * @param {Object} doc - document to process; null/undefined yields ''
   * @param {function(string)} callback - receives the collected text
   * @param {Function} jQuery - jQuery function bound to doc's window
   */
  this.processDocument = function(doc, callback, jQuery) {
    if (doc == null) {
      callback('');
      // Stop here: without this return the code below dereferenced the
      // missing document and threw after the callback had already fired.
      return;
    }
    // TODO check content type
    this.doc = doc;
    this.$ = jQuery;
    this.callback = callback;
    var rootNode = doc.getElementsByTagName('html')[0];
    if (!rootNode) {
      console.error("No html node in document");
      return;
    }
    this.textContent = "";
    this.processRecursive(rootNode);
    this.callback(this.textContent);
  };
}
var request = require('request'),
jsdom = require('jsdom'),
sys = require('sys');
// Candidate pages to scrape. Only testURL2 (the Javadoc frameset page) is
// fetched below; testURL is kept as an alternative test page.
var testURL = 'http://winnipeg.ctv.ca/servlet/an/local/CTVNews/20101121/taliban-afghanistan-101121/20101121/?hub=WinnipegHome';
var testURL2 = 'http://download.oracle.com/javase/1.5.0/docs/api/index.html';

// Fetch the page, build a jsdom window from the body, inject jQuery, and
// print the text gathered by DocumentSaver.
request({uri: testURL2}, function (error, response, body) {
  // Report failures instead of exiting silently, so a run with no output
  // can be told apart from a page with no text.
  if (error) {
    console.error('Request for ' + testURL2 + ' failed: ' + error);
    return;
  }
  if (response.statusCode !== 200) {
    console.error('Request for ' + testURL2 + ' returned HTTP status ' + response.statusCode);
    return;
  }
  var window = jsdom.jsdom(body).createWindow();
  jsdom.jQueryify(window, 'jquery.min.js', function (window, jQuery) {
    var saver = new DocumentSaver();
    saver.processDocument(window.document, function (text) {
      console.log('Got some text...');
      console.log(text);
    }, jQuery);
  });
});
/* Code for crawling a page and gathering its text, unit tested using the Chrome debugger.
* Open the URL 'http://download.oracle.com/javase/1.5.0/docs/api/index.html',
* then run the script:
d = new DocumentSaver();
d.processDocument(document, function(text_t){
console.log('********HTML TEXT******** ');
console.log(text_t)
})
RESULT: you will see the HTML text gathered from the page...
*/
/**
 * Collects the text of an HTML document by walking its DOM tree directly
 * (no jQuery). FRAME elements are followed into their contentDocument.
 *
 * Usage: new DocumentSaver().processDocument(document, callback)
 */
function DocumentSaver() {
  // Text gathered so far; reset at the start of every processDocument() run.
  this.textContent = '';

  // Upper-case tag names whose whole subtree is skipped during the walk.
  this.ignoredTypes = [
    'SCRIPT', 'TITLE', 'META', 'STYLE', 'LINK', 'CANVAS', 'VIDEO', 'AUDIO',
    'AREA', 'IMG', 'MAP', 'EMBED', 'OBJECT', 'PARAM', 'SOURCE', 'DEVICE',
    'NOSCRIPT', 'HEAD'
  ];

  /**
   * Array.prototype.indexOf with a manual fallback for hosts that lack it.
   *
   * @param {Array} arr - array to search
   * @param {*} item - value to look for (compared with ===)
   * @param {number} [from] - start index; negative counts from the end
   * @returns {number} index of the first match, or -1
   */
  this.indexOf = function(arr, item, from) {
    if (arr.indexOf) return arr.indexOf(item, from);
    var len = arr.length;
    for (var i = (from < 0) ? Math.max(0, len + from) : from || 0; i < len; i++) {
      if (arr[i] === item) return i;
    }
    return -1;
  };

  /**
   * Depth-first walk over the child nodes of rootNode. Non-blank text nodes
   * are appended (untrimmed) to this.textContent; FRAME elements contribute
   * the text of their framed document; ignored tags are skipped.
   *
   * @param {Object} rootNode - node whose children are walked
   */
  this.processRecursive = function(rootNode) {
    var TEXT_NODE = 3; // DOM nodeType value for text nodes
    for (var child = rootNode.firstChild; child != null; child = child.nextSibling) {
      var tagName = child.tagName ? child.tagName.toUpperCase() : null;

      if (tagName === 'FRAME') {
        var frameDoc = child.contentDocument;
        // contentDocument can be null (no document loaded, or access is
        // denied); skip such frames rather than crash.
        if (frameDoc != null) {
          var self = this;
          var frameSaver = new DocumentSaver();
          frameSaver.processDocument(frameDoc, function(frameText) {
            self.textContent = self.textContent + " " + frameText;
          });
        }
      }

      if (tagName !== null && this.indexOf(this.ignoredTypes, tagName) != -1) {
        continue;
      }

      this.processRecursive(child);

      if (child.nodeType === TEXT_NODE && child.textContent.trim() != '') {
        this.textContent = " " + this.textContent + " " + child.textContent;
      }
      // Other node types are not handled yet. TODO
    }
  };

  /**
   * Extracts the text of doc and hands it to callback.
   *
   * @param {Object} doc - document to process; null/undefined yields ''
   * @param {function(string)} callback - receives the collected text
   */
  this.processDocument = function(doc, callback) {
    if (doc == null) {
      // Nothing to walk: report empty text instead of throwing on
      // doc.getElementsByTagName below.
      callback('');
      return;
    }
    // TODO check content type
    this.doc = doc;
    this.callback = callback;
    var rootNode = doc.getElementsByTagName('html')[0];
    if (!rootNode) {
      console.error("No html node in document");
      return;
    }
    this.textContent = "";
    // TODO process html root too
    this.processRecursive(rootNode);
    this.callback(this.textContent);
  };
}
/* Here is the function that is supposed to work with jsdom to handle iframes. Simply replacing the processRecursive function in the first file with this one does not work...
*/
/**
 * Variant of processRecursive intended to live inside the jQuery-based
 * DocumentSaver (it relies on this.$, this.indexOf, this.ignoredTypes and
 * this.textContent). Walks the element children of rootNode:
 *  - ignored tags are skipped;
 *  - FRAME elements are processed with a fresh DocumentSaver on their
 *    contentDocument, and the resulting text is appended;
 *  - any other element is recursed into, then its own direct text nodes
 *    are appended.
 *
 * @param {Object} rootNode - element whose children are walked
 */
this.processRecursive = function(rootNode) {
  var self = this;
  var children_t = this.$(rootNode).children();
  this.$(children_t).each(function() {
    if (!this.tagName) {
      return;
    }
    var tagName = this.tagName.toUpperCase();
    if (self.indexOf(self.ignoredTypes, tagName) != -1) {
      return;
    }

    if (tagName == 'FRAME') {
      console.log('*****In IFRAME PROCESSOR****')
      var frameDoc = this.contentDocument;
      // A frame may have no document; skip it rather than hand null to
      // processDocument.
      if (frameDoc == null) {
        return;
      }
      var iframeSaver = new DocumentSaver();
      iframeSaver.processDocument(frameDoc, function(Iframetext) {
        console.log('********PROCESSED IFRAME TEXT*****')
        console.log(Iframetext)
        console.log('*************')
        self.textContent = " " + self.textContent + " " + self.$.trim(Iframetext);
      }, self.$);
      return;
    }

    self.processRecursive(this);
    // Only direct text-node children (nodeType 3): nested elements were
    // handled by the recursive call above.
    var textContent_t = self.$(this)
      .contents()
      .filter(function() {
        return this.nodeType == 3;
      })
      .text();
    var trimmed = self.$.trim(textContent_t);
    if (trimmed != '') {
      self.textContent = ' ' + self.textContent + ' ' + trimmed;
    }
  });
};
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment