tmpvar/gist:1033715

## gistfile1.js
// To use: save this file and run `node <filename.js>`.
// There are two URLs at the bottom. The test case uses the URL pointing to Sun's Javadoc, a page that makes heavy use of iframes.
// Expected: the page's HTML text is processed and dumped to the terminal.
// Actual: an error is printed to the terminal with the backtrace below.
/*
/home/sid/opt/lib/node/.npm/jsdom/0.1.20/package/lib/jsdom/level2/html.js:1400
        this._contentDocument = new HTMLDocument();
        ^
ReferenceError: HTMLDocument is not defined
    at Object.contentDocument (/home/sid/opt/lib/node/.npm/jsdom/0.1.20/package/lib/jsdom/level2/html.js:1400:9)
    at Object.<anonymous> (/media/dev/workspace/nodejs/scrobblepage.js:27:38)
    at Function.each (evalmachine.<anonymous>:30:151)
    at Object.each (evalmachine.<anonymous>:24:147)
    at DocumentSaver.processRecursive (/media/dev/workspace/nodejs/scrobblepage.js:21:22)
    at Object.<anonymous> (/media/dev/workspace/nodejs/scrobblepage.js:33:12)
    at Function.each (evalmachine.<anonymous>:30:151)
    at Object.each (evalmachine.<anonymous>:24:147)
    at DocumentSaver.processRecursive (/media/dev/workspace/nodejs/scrobblepage.js:21:22)
    at Object.<anonymous> (/media/dev/workspace/nodejs/scrobblepage.js:33:12)
*/

/**
 * Gathers the text of an HTML document by walking its element tree with jQuery.
 *
 * Usage: new DocumentSaver().processDocument(document, callback, jQuery)
 */
function DocumentSaver() {
	// Text gathered so far; reset by processDocument() right before it walks a document.
	this.textContent = '';
	// Upper-case tag names whose whole subtree is skipped.
	this.ignoredTypes = [
		'SCRIPT', 'TITLE', 'META', 'STYLE', 'LINK', 'CANVAS', 'VIDEO', 'AUDIO', 'AREA',
		'IMG', 'MAP', 'EMBED', 'OBJECT', 'PARAM', 'SOURCE', 'DEVICE', 'NOSCRIPT', 'HEAD'
	];
	// jQuery function used for traversal; supplied through processDocument().
	this.$ = null;

	/**
	 * Returns the index of `item` in `arr` (strict equality), or -1.
	 * Uses arr.indexOf when available, otherwise a manual scan that also
	 * accepts a negative `from` counted from the end.
	 */
	this.indexOf = function(arr, item, from) {
		if (arr.indexOf) return arr.indexOf(item, from);
		var len = arr.length;
		for (var i = (from < 0) ? Math.max(0, len + from) : from || 0; i < len; i++) {
			if (arr[i] === item) return i;
		}
		return -1;
	};

	/**
	 * Visits every element child of `rootNode` that is not in ignoredTypes.
	 * Descendants are processed first, then the child's own direct text
	 * nodes are appended to this.textContent. The direct text nodes of
	 * `rootNode` itself are not collected here.
	 */
	this.processRecursive = function(rootNode) {
		var self = this;

		self.$(rootNode).children().each(function() {
			if (!this.tagName || self.indexOf(self.ignoredTypes, this.tagName.toUpperCase()) != -1) {
				return;
			}

			self.processRecursive(this);

			// Only direct text nodes (nodeType 3); nested elements were handled above.
			var ownText = self.$(this)
				.contents()
				.filter(function() {
					return this.nodeType == 3;
				})
				.text();
			var trimmed = self.$.trim(ownText);
			if (trimmed != '') {
				self.textContent = ' ' + self.textContent + ' ' + trimmed;
			}
		});
	};

	/**
	 * Extracts the text of `doc` and hands it to `callback`.
	 *
	 * @param doc      DOM document to read; null/undefined yields callback('').
	 * @param callback function(text) invoked once with the gathered text.
	 * @param jQuery   jQuery function able to wrap nodes of `doc`.
	 *
	 * When the document has no <html> element an error is logged and the
	 * callback is not invoked.
	 */
	this.processDocument = function(doc, callback, jQuery) {
		if (doc == null) {
			// Nothing to walk; report empty text and stop instead of
			// dereferencing the missing document below.
			callback('');
			return;
		}
		// TODO check content type
		this.doc = doc;
		this.$ = jQuery;
		this.callback = callback;

		var rootNode = doc.getElementsByTagName('html')[0];
		if (!rootNode) {
			console.error("No html node in document");
			return;
		}

		this.textContent = "";
		this.processRecursive(rootNode);
		this.callback(this.textContent);
	};
}

var request = require('request'),
    jsdom = require('jsdom'),
    sys = require('sys');

// Candidate pages; testURL2 (the Javadoc index page) is the one fetched below.
var testURL = 'http://winnipeg.ctv.ca/servlet/an/local/CTVNews/20101121/taliban-afghanistan-101121/20101121/?hub=WinnipegHome';
var testURL2 = 'http://download.oracle.com/javase/1.5.0/docs/api/index.html'

// Fetch the page, build a jsdom window from the body, inject jQuery and
// print the text gathered by DocumentSaver.
request({uri:testURL2}, function (error, response, body) {
  // Report failures instead of exiting silently with no output.
  if (error) {
    console.error('Request for ' + testURL2 + ' failed: ' + error);
    return;
  }
  if (response.statusCode != 200) {
    console.error('Unexpected HTTP status ' + response.statusCode + ' for ' + testURL2);
    return;
  }

  var window = jsdom.jsdom(body).createWindow();
  jsdom.jQueryify(window, 'jquery.min.js', function (window, jQuery) {
    var saver = new DocumentSaver();
    saver.processDocument(window.document, function(text){
      console.log('Got some text...')
      console.log(text)
    }, jQuery);
  });
});

## gistfile2.js
/*UNIT TESTED CODE FOR CRAWLING A PAGE AND GATHERING THE TEXT USING CHROME DEBUGGER...
* OPEN THE URL:'http://download.oracle.com/javase/1.5.0/docs/api/index.html'
* fire the script
d = new DocumentSaver();
d.processDocument(document, function(text_t){
		console.log('********HTML TEXT******** ');
		console.log(text_t)
})
RESULT: you will see the HTML text gathered from the page...
*/

/**
 * Gathers the text of an HTML document using plain DOM traversal
 * (firstChild / nextSibling), following FRAME elements into their
 * contentDocument.
 *
 * Usage: new DocumentSaver().processDocument(document, callback)
 */
function DocumentSaver() {
	// Text gathered so far; reset by processDocument() right before it walks a document.
	this.textContent = '';
	// Upper-case tag names whose whole subtree is skipped.
	this.ignoredTypes = [
		'SCRIPT', 'TITLE', 'META', 'STYLE', 'LINK', 'CANVAS', 'VIDEO', 'AUDIO', 'AREA',
		'IMG', 'MAP', 'EMBED', 'OBJECT', 'PARAM', 'SOURCE', 'DEVICE', 'NOSCRIPT', 'HEAD'
	];

	/**
	 * Returns the index of `item` in `arr` (strict equality), or -1.
	 * Uses arr.indexOf when available, otherwise a manual scan that also
	 * accepts a negative `from` counted from the end.
	 */
	this.indexOf = function(arr, item, from) {
		if (arr.indexOf) return arr.indexOf(item, from);
		var len = arr.length;
		for (var i = (from < 0) ? Math.max(0, len + from) : from || 0; i < len; i++) {
			if (arr[i] === item) return i;
		}
		return -1;
	};

	/**
	 * Walks all child nodes of `rootNode` depth-first. Non-blank text nodes
	 * are appended (untrimmed) to this.textContent; FRAME elements have
	 * their contentDocument processed by a separate DocumentSaver whose
	 * text is appended as well.
	 */
	this.processRecursive = function(rootNode) {
		var self = this;

		for (var child = rootNode.firstChild; child != null; child = child.nextSibling) {
			var tag = child.tagName ? child.tagName.toUpperCase() : null;

			if (tag == 'FRAME') {
				var iframeSaver = new DocumentSaver();
				// contentDocument may be null; processDocument() reports '' then.
				iframeSaver.processDocument(child.contentDocument, function(Iframetext) {
					self.textContent = self.textContent + " " + Iframetext;
				});
			}
			if (tag && this.indexOf(this.ignoredTypes, tag) != -1) {
				continue;
			}

			this.processRecursive(child);

			if (child.nodeType === child.TEXT_NODE && child.textContent.trim() != '') {
				this.textContent = " " + this.textContent + " " + child.textContent;
			}
		}
	};

	/**
	 * Extracts the text of `doc` and hands it to `callback`.
	 *
	 * @param doc      DOM document to read; null/undefined yields callback('').
	 * @param callback function(text) invoked once with the gathered text.
	 *
	 * When the document has no <html> element an error is logged and the
	 * callback is not invoked.
	 */
	this.processDocument = function(doc, callback) {
		if (doc == null) {
			// Nothing to walk (e.g. a frame without an accessible document).
			callback('');
			return;
		}
		// TODO check content type
		this.doc = doc;
		this.callback = callback;

		var rootNode = doc.getElementsByTagName('html')[0];
		if (!rootNode) {
			console.error("No html node in document");
			return;
		}
		this.textContent = "";
		// TODO process html root too
		this.processRecursive(rootNode);
		this.callback(this.textContent);
	};
}

## gistfile3.js
/* Here is the function that is supposed to make iframe handling work with jsdom. Simply replacing the processRecursive function in the first file with it does not work...
*/
// jQuery-based tree walk with FRAME support, written to be installed as the
// processRecursive method of a DocumentSaver: it relies on this.$,
// this.indexOf, this.ignoredTypes and this.textContent.
this.processRecursive = function(rootNode) {
	var self = this;

	// jQuery filter predicate: keep text nodes only (nodeType 3).
	function isTextNode() {
		return this.nodeType == 3;
	}

	// Prepends a space to the accumulated text and appends the trimmed piece.
	function append(piece) {
		self.textContent = ' ' + self.textContent + ' ' + self.$.trim(piece);
	}

	// Handles one element child; `this` is the DOM element supplied by each().
	function visit() {
		var tag = this.tagName;
		if (!tag) {
			return;
		}
		tag = tag.toUpperCase();
		if (self.indexOf(self.ignoredTypes, tag) != -1) {
			return;
		}

		if (tag != 'FRAME') {
			// Descendants first, then this element's own direct text nodes.
			self.processRecursive(this);
			var ownText = self.$(this).contents().filter(isTextNode).text();
			if (self.$.trim(ownText) != '') {
				append(ownText);
			}
			return;
		}

		// FRAME: gather the framed document's text with a separate saver.
		var iframeSaver = new DocumentSaver();
		console.log('*****In IFRAME PROCESSOR****')
		iframeSaver.processDocument(this.contentDocument, function(Iframetext) {
			console.log('********PROCESSED IFRAME TEXT*****')
			console.log(Iframetext)
			console.log('*************')
			append(Iframetext);
		}, self.$);
	}

	var children_t = self.$(rootNode).children();
	self.$(children_t).each(visit);
};
	// To use: save this file and run `node <filename.js>`.
	// There are two URLs at the bottom. The test case uses the URL pointing to Sun's Javadoc, a page that makes heavy use of iframes.
	// Expected: the page's HTML text is processed and dumped to the terminal.
	// Actual: an error is printed to the terminal with the backtrace below.
	/*
	/home/sid/opt/lib/node/.npm/jsdom/0.1.20/package/lib/jsdom/level2/html.js:1400
	this._contentDocument = new HTMLDocument();
	^
	ReferenceError: HTMLDocument is not defined
	at Object.contentDocument (/home/sid/opt/lib/node/.npm/jsdom/0.1.20/package/lib/jsdom/level2/html.js:1400:9)
	at Object.<anonymous> (/media/dev/workspace/nodejs/scrobblepage.js:27:38)
	at Function.each (evalmachine.<anonymous>:30:151)
	at Object.each (evalmachine.<anonymous>:24:147)
	at DocumentSaver.processRecursive (/media/dev/workspace/nodejs/scrobblepage.js:21:22)
	at Object.<anonymous> (/media/dev/workspace/nodejs/scrobblepage.js:33:12)
	at Function.each (evalmachine.<anonymous>:30:151)
	at Object.each (evalmachine.<anonymous>:24:147)
	at DocumentSaver.processRecursive (/media/dev/workspace/nodejs/scrobblepage.js:21:22)
	at Object.<anonymous> (/media/dev/workspace/nodejs/scrobblepage.js:33:12)
	*/

	/**
	 * Gathers the text of an HTML document by walking its element tree with jQuery.
	 *
	 * Usage: new DocumentSaver().processDocument(document, callback, jQuery)
	 */
	function DocumentSaver() {
		// Text gathered so far; reset by processDocument() right before it walks a document.
		this.textContent = '';
		// Upper-case tag names whose whole subtree is skipped.
		this.ignoredTypes = [
			'SCRIPT', 'TITLE', 'META', 'STYLE', 'LINK', 'CANVAS', 'VIDEO', 'AUDIO', 'AREA',
			'IMG', 'MAP', 'EMBED', 'OBJECT', 'PARAM', 'SOURCE', 'DEVICE', 'NOSCRIPT', 'HEAD'
		];
		// jQuery function used for traversal; supplied through processDocument().
		this.$ = null;

		/**
		 * Returns the index of `item` in `arr` (strict equality), or -1.
		 * Uses arr.indexOf when available, otherwise a manual scan that also
		 * accepts a negative `from` counted from the end.
		 */
		this.indexOf = function(arr, item, from) {
			if (arr.indexOf) return arr.indexOf(item, from);
			var len = arr.length;
			// `||` was previously markdown-escaped as `\|\|`, which is a syntax error.
			for (var i = (from < 0) ? Math.max(0, len + from) : from || 0; i < len; i++) {
				if (arr[i] === item) return i;
			}
			return -1;
		};

		/**
		 * Visits every element child of `rootNode` that is not in ignoredTypes.
		 * Descendants are processed first, then the child's own direct text
		 * nodes are appended to this.textContent. The direct text nodes of
		 * `rootNode` itself are not collected here.
		 */
		this.processRecursive = function(rootNode) {
			var self = this;

			self.$(rootNode).children().each(function() {
				if (!this.tagName || self.indexOf(self.ignoredTypes, this.tagName.toUpperCase()) != -1) {
					return;
				}

				self.processRecursive(this);

				// Only direct text nodes (nodeType 3); nested elements were handled above.
				var ownText = self.$(this)
					.contents()
					.filter(function() {
						return this.nodeType == 3;
					})
					.text();
				var trimmed = self.$.trim(ownText);
				if (trimmed != '') {
					self.textContent = ' ' + self.textContent + ' ' + trimmed;
				}
			});
		};

		/**
		 * Extracts the text of `doc` and hands it to `callback`.
		 *
		 * @param doc      DOM document to read; null/undefined yields callback('').
		 * @param callback function(text) invoked once with the gathered text.
		 * @param jQuery   jQuery function able to wrap nodes of `doc`.
		 *
		 * When the document has no <html> element an error is logged and the
		 * callback is not invoked.
		 */
		this.processDocument = function(doc, callback, jQuery) {
			if (doc == null) {
				// Nothing to walk; report empty text and stop instead of
				// dereferencing the missing document below.
				callback('');
				return;
			}
			// TODO check content type
			this.doc = doc;
			this.$ = jQuery;
			this.callback = callback;

			var rootNode = doc.getElementsByTagName('html')[0];
			if (!rootNode) {
				console.error("No html node in document");
				return;
			}

			this.textContent = "";
			this.processRecursive(rootNode);
			this.callback(this.textContent);
		};
	}

	var request = require('request'),
	jsdom = require('jsdom'),
	sys = require('sys');

	// Candidate pages; testURL2 (the Javadoc index page) is the one fetched below.
	var testURL = 'http://winnipeg.ctv.ca/servlet/an/local/CTVNews/20101121/taliban-afghanistan-101121/20101121/?hub=WinnipegHome';
	var testURL2 = 'http://download.oracle.com/javase/1.5.0/docs/api/index.html'

	// Fetch the page, build a jsdom window from the body, inject jQuery and
	// print the text gathered by DocumentSaver.
	request({uri:testURL2}, function (error, response, body) {
		// Report failures instead of exiting silently with no output.
		if (error) {
			console.error('Request for ' + testURL2 + ' failed: ' + error);
			return;
		}
		if (response.statusCode != 200) {
			console.error('Unexpected HTTP status ' + response.statusCode + ' for ' + testURL2);
			return;
		}

		var window = jsdom.jsdom(body).createWindow();
		jsdom.jQueryify(window, 'jquery.min.js', function (window, jQuery) {
			var saver = new DocumentSaver();
			saver.processDocument(window.document, function(text){
				console.log('Got some text...')
				console.log(text)
			}, jQuery);
		});
	});
	/*UNIT TESTED CODE FOR CRAWLING A PAGE AND GATHERING THE TEXT USING CHROME DEBUGGER...
	* OPEN THE URL:'http://download.oracle.com/javase/1.5.0/docs/api/index.html'
	* fire the script
	d = new DocumentSaver();
	d.processDocument(document, function(text_t){
	console.log('******HTML TEXT****** ');
	console.log(text_t)
	})
	RESULT: you will see the HTML text gathered from the page...
	*/

	/**
	 * Gathers the text of an HTML document using plain DOM traversal
	 * (firstChild / nextSibling), following FRAME elements into their
	 * contentDocument.
	 *
	 * Usage: new DocumentSaver().processDocument(document, callback)
	 */
	function DocumentSaver() {
		// Text gathered so far; reset by processDocument() right before it walks a document.
		this.textContent = '';
		// Upper-case tag names whose whole subtree is skipped.
		this.ignoredTypes = [
			'SCRIPT', 'TITLE', 'META', 'STYLE', 'LINK', 'CANVAS', 'VIDEO', 'AUDIO', 'AREA',
			'IMG', 'MAP', 'EMBED', 'OBJECT', 'PARAM', 'SOURCE', 'DEVICE', 'NOSCRIPT', 'HEAD'
		];

		/**
		 * Returns the index of `item` in `arr` (strict equality), or -1.
		 * Uses arr.indexOf when available, otherwise a manual scan that also
		 * accepts a negative `from` counted from the end.
		 */
		this.indexOf = function(arr, item, from) {
			if (arr.indexOf) return arr.indexOf(item, from);
			var len = arr.length;
			// `||` was previously markdown-escaped as `\|\|`, which is a syntax error.
			for (var i = (from < 0) ? Math.max(0, len + from) : from || 0; i < len; i++) {
				if (arr[i] === item) return i;
			}
			return -1;
		};

		/**
		 * Walks all child nodes of `rootNode` depth-first. Non-blank text nodes
		 * are appended (untrimmed) to this.textContent; FRAME elements have
		 * their contentDocument processed by a separate DocumentSaver whose
		 * text is appended as well.
		 */
		this.processRecursive = function(rootNode) {
			var self = this;

			for (var child = rootNode.firstChild; child != null; child = child.nextSibling) {
				var tag = child.tagName ? child.tagName.toUpperCase() : null;

				if (tag == 'FRAME') {
					var iframeSaver = new DocumentSaver();
					// contentDocument may be null; processDocument() reports '' then.
					iframeSaver.processDocument(child.contentDocument, function(Iframetext) {
						self.textContent = self.textContent + " " + Iframetext;
					});
				}
				if (tag && this.indexOf(this.ignoredTypes, tag) != -1) {
					continue;
				}

				this.processRecursive(child);

				if (child.nodeType === child.TEXT_NODE && child.textContent.trim() != '') {
					this.textContent = " " + this.textContent + " " + child.textContent;
				}
			}
		};

		/**
		 * Extracts the text of `doc` and hands it to `callback`.
		 *
		 * @param doc      DOM document to read; null/undefined yields callback('').
		 * @param callback function(text) invoked once with the gathered text.
		 *
		 * When the document has no <html> element an error is logged and the
		 * callback is not invoked.
		 */
		this.processDocument = function(doc, callback) {
			if (doc == null) {
				// Nothing to walk (e.g. a frame without an accessible document).
				callback('');
				return;
			}
			// TODO check content type
			this.doc = doc;
			this.callback = callback;

			var rootNode = doc.getElementsByTagName('html')[0];
			if (!rootNode) {
				console.error("No html node in document");
				return;
			}
			this.textContent = "";
			// TODO process html root too
			this.processRecursive(rootNode);
			this.callback(this.textContent);
		};
	}
	/* Here is the function that is supposed to make iframe handling work with jsdom. Simply replacing the processRecursive function in the first file with it does not work...
	*/
	// jQuery-based tree walk with FRAME support, written to be installed as the
	// processRecursive method of a DocumentSaver: it relies on this.$,
	// this.indexOf, this.ignoredTypes and this.textContent.
	this.processRecursive = function(rootNode) {
		var self = this;

		// jQuery filter predicate: keep text nodes only (nodeType 3).
		function isTextNode() {
			return this.nodeType == 3;
		}

		// Prepends a space to the accumulated text and appends the trimmed piece.
		function append(piece) {
			self.textContent = ' ' + self.textContent + ' ' + self.$.trim(piece);
		}

		// Handles one element child; `this` is the DOM element supplied by each().
		function visit() {
			var tag = this.tagName;
			if (!tag) {
				return;
			}
			tag = tag.toUpperCase();
			if (self.indexOf(self.ignoredTypes, tag) != -1) {
				return;
			}

			if (tag != 'FRAME') {
				// Descendants first, then this element's own direct text nodes.
				self.processRecursive(this);
				var ownText = self.$(this).contents().filter(isTextNode).text();
				if (self.$.trim(ownText) != '') {
					append(ownText);
				}
				return;
			}

			// FRAME: gather the framed document's text with a separate saver.
			var iframeSaver = new DocumentSaver();
			console.log('***In IFRAME PROCESSOR**')
			iframeSaver.processDocument(this.contentDocument, function(Iframetext) {
				console.log('******PROCESSED IFRAME TEXT***')
				console.log(Iframetext)
				console.log('*************')
				append(Iframetext);
			}, self.$);
		}

		var children_t = self.$(rootNode).children();
		self.$(children_t).each(visit);
	};