-
-
Save tmpvar/1033715 to your computer and use it in GitHub Desktop.
jsdom port
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// To use: save this file and run `node <filename.js>`.
// There are two URLs at the bottom. The test case uses the URL pointing to Sun's Javadoc,
// a page that makes heavy use of iframes.
// Expected: the HTML text is processed and dumped on the terminal.
// Actual: an error is printed on the terminal with a backtrace:
/* | |
/home/sid/opt/lib/node/.npm/jsdom/0.1.20/package/lib/jsdom/level2/html.js:1400 | |
this._contentDocument = new HTMLDocument(); | |
^ | |
ReferenceError: HTMLDocument is not defined | |
at Object.contentDocument (/home/sid/opt/lib/node/.npm/jsdom/0.1.20/package/lib/jsdom/level2/html.js:1400:9) | |
at Object.<anonymous> (/media/dev/workspace/nodejs/scrobblepage.js:27:38) | |
at Function.each (evalmachine.<anonymous>:30:151) | |
at Object.each (evalmachine.<anonymous>:24:147) | |
at DocumentSaver.processRecursive (/media/dev/workspace/nodejs/scrobblepage.js:21:22) | |
at Object.<anonymous> (/media/dev/workspace/nodejs/scrobblepage.js:33:12) | |
at Function.each (evalmachine.<anonymous>:30:151) | |
at Object.each (evalmachine.<anonymous>:24:147) | |
at DocumentSaver.processRecursive (/media/dev/workspace/nodejs/scrobblepage.js:21:22) | |
at Object.<anonymous> (/media/dev/workspace/nodejs/scrobblepage.js:33:12) | |
*/ | |
/**
 * Gathers the text of an HTML document by walking its element tree with a
 * jQuery function supplied by the caller.
 *
 * Usage: `new DocumentSaver().processDocument(document, callback, jQuery)`.
 * The callback receives the gathered text as a single string.
 */
function DocumentSaver() {
  // Text accumulated by the current (or most recent) processDocument() run.
  this.textContent = '';
  // Upper-case tag names whose whole subtree is skipped while gathering text.
  this.ignoredTypes = [ 'SCRIPT', 'TITLE', 'META', 'STYLE', 'LINK', 'CANVAS', 'VIDEO', 'AUDIO', 'AREA', 'IMG', 'MAP', 'EMBED', 'OBJECT'
    , 'PARAM', 'SOURCE', 'DEVICE', 'NOSCRIPT', 'HEAD'];
  // jQuery function used for traversal; set by processDocument().
  this.$ = null;

  /**
   * Returns the index of `item` in `arr` (strict equality), or -1 when absent.
   * Uses the native `indexOf` when the array-like provides one.
   *
   * @param {Array} arr - array (or array-like) to search
   * @param {*} item - value to look for
   * @param {number} [from] - start index; a negative value counts from the end
   * @returns {number} index of the first match, or -1
   */
  this.indexOf = function (arr, item, from) {
    if (arr.indexOf) return arr.indexOf(item, from);
    var len = arr.length;
    for (var i = (from < 0) ? Math.max(0, len + from) : from || 0; i < len; i++) {
      if (arr[i] === item) return i;
    }
    return -1;
  };

  /**
   * Appends to `this.textContent` the text of every non-ignored element below
   * `rootNode`. For each child element its descendants are visited first and
   * the element's own direct text nodes are appended afterwards.
   *
   * @param {Object} rootNode - DOM element whose child elements are visited
   */
  this.processRecursive = function (rootNode) {
    var self = this;
    this.$(rootNode).children().each(function () {
      // Inside each(), `this` is the current child element.
      if (!this.tagName || self.indexOf(self.ignoredTypes, this.tagName.toUpperCase()) !== -1) {
        return;
      }
      self.processRecursive(this);
      // Only the element's direct text nodes (nodeType 3); nested elements
      // were already handled by the recursive call above.
      var ownText = self.$(this)
        .contents()
        .filter(function () {
          return this.nodeType === 3;
        })
        .text();
      var trimmed = self.$.trim(ownText);
      if (trimmed !== '') {
        // Note: a space is also prepended on every append, so leading
        // whitespace grows with the number of text fragments.
        self.textContent = ' ' + self.textContent + ' ' + trimmed;
      }
    });
  };

  /**
   * Gathers the text of `doc` and hands it to `callback`.
   *
   * A null/undefined `doc` yields `callback('')`. When the document has no
   * `<html>` element an error is logged and the callback is NOT invoked.
   *
   * @param {Object} doc - DOM document to process (may be null)
   * @param {function(string)} callback - receives the gathered text
   * @param {Function} jQuery - jQuery function bound to the document's window
   */
  this.processDocument = function (doc, callback, jQuery) {
    if (doc == null) {
      callback('');
      // Stop here: without this return the code below dereferenced the
      // missing document and threw after the callback had already fired.
      return;
    }
    // TODO check content type
    this.doc = doc;
    this.$ = jQuery;
    this.callback = callback;
    var rootNode = doc.getElementsByTagName('html')[0];
    if (!rootNode) {
      console.error("No html node in document");
      return;
    }
    this.textContent = "";
    this.processRecursive(rootNode);
    this.callback(this.textContent);
  };
}
var request = require('request'),
    jsdom = require('jsdom'),
    sys = require('sys');

// Two candidate test pages; only testURL2 is fetched below.
var testURL = 'http://winnipeg.ctv.ca/servlet/an/local/CTVNews/20101121/taliban-afghanistan-101121/20101121/?hub=WinnipegHome';
var testURL2 = 'http://download.oracle.com/javase/1.5.0/docs/api/index.html';

// Fetch the page, build a jsdom window from the body, inject jQuery and dump
// the text gathered by DocumentSaver to the terminal.
request({uri: testURL2}, function (error, response, body) {
  // Report failures instead of exiting silently with no output.
  if (error) {
    console.error('Request for ' + testURL2 + ' failed: ' + error);
    return;
  }
  if (response.statusCode !== 200) {
    console.error('Request for ' + testURL2 + ' returned HTTP status ' + response.statusCode);
    return;
  }
  var window = jsdom.jsdom(body).createWindow();
  jsdom.jQueryify(window, 'jquery.min.js', function (window, jQuery) {
    var saver = new DocumentSaver();
    saver.processDocument(window.document, function (text) {
      console.log('Got some text...');
      console.log(text);
    }, jQuery);
  });
});
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* Unit-tested code for crawling a page and gathering its text using the Chrome debugger.
 * Open the URL: 'http://download.oracle.com/javase/1.5.0/docs/api/index.html'
 * and run the following script in the console:
 d = new DocumentSaver();
 d.processDocument(document, function(text_t){
 console.log('********HTML TEXT******** ');
 console.log(text_t)
 })
 Result: you will see the HTML text gathered from the page.
 */
/**
 * Gathers the text of an HTML document by walking its DOM tree directly
 * (firstChild / nextSibling), descending into the documents of FRAME elements.
 *
 * Usage: `new DocumentSaver().processDocument(document, callback)`.
 * The callback receives the gathered text as a single string.
 */
function DocumentSaver() {
  // Text accumulated by the current (or most recent) processDocument() run.
  this.textContent = '';
  // Upper-case tag names whose whole subtree is skipped while gathering text.
  this.ignoredTypes = [ 'SCRIPT', 'TITLE', 'META', 'STYLE', 'LINK', 'CANVAS', 'VIDEO', 'AUDIO', 'AREA', 'IMG', 'MAP', 'EMBED', 'OBJECT'
    , 'PARAM', 'SOURCE', 'DEVICE', 'NOSCRIPT', 'HEAD'];

  /**
   * Returns the index of `item` in `arr` (strict equality), or -1 when absent.
   * Uses the native `indexOf` when the array-like provides one.
   *
   * @param {Array} arr - array (or array-like) to search
   * @param {*} item - value to look for
   * @param {number} [from] - start index; a negative value counts from the end
   * @returns {number} index of the first match, or -1
   */
  this.indexOf = function (arr, item, from) {
    if (arr.indexOf) return arr.indexOf(item, from);
    var len = arr.length;
    for (var i = (from < 0) ? Math.max(0, len + from) : from || 0; i < len; i++) {
      if (arr[i] === item) return i;
    }
    return -1;
  };

  /**
   * Appends to `this.textContent` the text found below `rootNode`, in
   * document order, skipping subtrees of ignored element types.
   *
   * @param {Object} rootNode - DOM node whose children are visited
   */
  this.processRecursive = function (rootNode) {
    var self = this;
    for (var child = rootNode.firstChild; child != null; child = child.nextSibling) {
      var tag = child.tagName ? child.tagName.toUpperCase() : null;

      if (tag === 'FRAME') {
        // A frame's text lives in its own document; gather it with a nested
        // saver. contentDocument may be null, which processDocument tolerates.
        // NOTE(review): only FRAME is handled here, not IFRAME — confirm intent.
        var frameSaver = new DocumentSaver();
        frameSaver.processDocument(child.contentDocument, function (frameText) {
          self.textContent = self.textContent + " " + frameText;
        });
      }

      if (tag !== null && this.indexOf(this.ignoredTypes, tag) !== -1) {
        continue;
      }

      // Descend first; text nodes have no children, so this is a no-op for them.
      this.processRecursive(child);

      if (child.nodeType === child.TEXT_NODE && child.textContent.trim() !== '') {
        // Note: a space is also prepended on every append, so leading
        // whitespace grows with the number of text fragments.
        this.textContent = " " + this.textContent + " " + child.textContent;
      }
      // Other node types carry no text of their own and are left unhandled. TODO
    }
  };

  /**
   * Gathers the text of `doc` and hands it to `callback`.
   *
   * A null/undefined `doc` (e.g. an inaccessible frame document) yields
   * `callback('')`. When the document has no `<html>` element an error is
   * logged and the callback is NOT invoked.
   *
   * @param {Object} doc - DOM document to process (may be null)
   * @param {function(string)} callback - receives the gathered text
   */
  this.processDocument = function (doc, callback) {
    if (doc == null) {
      // Previously a missing document threw a TypeError below, aborting the
      // whole crawl when a single frame had no accessible document.
      callback('');
      return;
    }
    // TODO check content type
    this.doc = doc;
    this.callback = callback;
    var rootNode = doc.getElementsByTagName('html')[0];
    if (!rootNode) {
      console.error("No html node in document");
      return;
    }
    this.textContent = "";
    // TODO process html root too
    this.processRecursive(rootNode);
    this.callback(this.textContent);
  };
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* Here is the function that is supposed to work with jsdom and handle iframes.
 * Simply replacing the processRecursive function in the first file with it does not work.
 */
/**
 * Frame-aware replacement for DocumentSaver#processRecursive (jQuery variant).
 *
 * Appends to `this.textContent` the text of every non-ignored element below
 * `rootNode`. FRAME elements are handled by running a nested DocumentSaver
 * over the frame's contentDocument; for every other element its descendants
 * are visited first and its own direct text nodes are appended afterwards.
 *
 * Relies on `this.$`, `this.indexOf` and `this.ignoredTypes` being set on the
 * enclosing object.
 *
 * @param {Object} rootNode - DOM element whose child elements are visited
 */
this.processRecursive = function (rootNode) {
  var self = this;
  this.$(rootNode).children().each(function () {
    // Inside each(), `this` is the current child element. Returning undefined
    // moves on to the next child.
    if (!this.tagName) {
      return;
    }
    var tag = this.tagName.toUpperCase();
    if (self.indexOf(self.ignoredTypes, tag) !== -1) {
      return;
    }

    if (tag === 'FRAME') {
      var iframeSaver = new DocumentSaver();
      console.log('*****In IFRAME PROCESSOR****')
      var frameDoc = this.contentDocument;
      if (frameDoc == null) {
        // Nothing to gather; handing a missing document on would make the
        // nested processDocument dereference it and abort the whole crawl.
        console.error('Frame has no contentDocument; skipping');
        return;
      }
      iframeSaver.processDocument(frameDoc, function (Iframetext) {
        console.log('********PROCESSED IFRAME TEXT*****')
        console.log(Iframetext)
        console.log('*************')
        self.textContent = " " + self.textContent + " " + self.$.trim(Iframetext);
      }, self.$);
      return;
    }

    self.processRecursive(this);
    // Only the element's direct text nodes (nodeType 3); nested elements were
    // already handled by the recursive call above.
    var ownText = self.$(this)
      .contents()
      .filter(function () {
        return this.nodeType === 3;
      })
      .text();
    var trimmed = self.$.trim(ownText);
    if (trimmed !== '') {
      self.textContent = ' ' + self.textContent + ' ' + trimmed;
    }
  });
};
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment