Created
August 6, 2015 19:20
-
-
Save tchittick/6780f15ca520d3610432 to your computer and use it in GitHub Desktop.
Recursion through a Cheerio.js object and writing to .CSV
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* An object created to parse through a large number of HTML
   blocks quickly. Used with cheerio.js. Begin via:
       var parse = new Parse();
       parse.run($('some-div')[0]);
*/
var fs = require('fs'), | |
cheerio = require('cheerio'); | |
/**
 * Walks a DOM subtree (nodes exposing `type`, `data`, `children` and
 * `parent`, as produced by cheerio), collects the cleaned text of every
 * text node, and appends one CSV row to `test.csv` once the walk is done.
 *
 * @constructor
 * @param {Object} [block] Accepted for backward compatibility but not used;
 *     pass the node to walk to `run()` instead.
 */
var Parse = function(block) {
    this.block = {}; // top level node, replaced by run()
    this.depth = 0;  // number of first-generation children still to visit
    this.stats = []; // text fragments collected during the walk
    // Value written as the first CSV column. Nothing in this file assigns
    // it, so it is presumably set by the caller before run() -- verify
    // against the calling code. Left empty in the row when unset.
    this.num = null;
};

Parse.prototype = {
    /**
     * Resets the parser state and walks `block`.
     *
     * @param {Object} block Top-level node whose descendants are parsed.
     * @throws {TypeError} If `block` is missing.
     */
    run: function(block) {
        if (!block) {
            throw new TypeError('Parse.run requires a DOM node');
        }
        this.block = block;
        // A node without a children list counts as having no children.
        this.depth = block.children ? block.children.length : 0;
        this.stats = [];

        this.parseRecursion(this.block);
    },

    /**
     * Visits the children of `node`, then emits the row if every
     * first-generation child of the top-level node has been completed.
     *
     * @param {Object} node Node whose children are visited.
     */
    parseRecursion: function(node) {
        this.parseLoopAllChildren(node.children);

        // depth only reaches 0 after the last first-generation child has
        // been fully processed, so this fires once, at the end of the walk.
        if (this.depth === 0) {
            this.outputParse();
        }
    },

    /**
     * Processes each node in `nodes`: records text, recurses into children,
     * and counts down `depth` for direct children of the top-level node.
     *
     * @param {Array<Object>} [nodes] Sibling nodes; ignored when falsy.
     */
    parseLoopAllChildren: function(nodes) {
        if (!nodes) {
            return;
        }

        // Indexed loop rather than for...in, which would also enumerate
        // any extra enumerable properties on the array or its prototype.
        for (var i = 0; i < nodes.length; i++) {
            var node = nodes[i];
            if (!node) {
                continue;
            }

            // If || Switch blocks here depending on what you
            // are parsing for.
            if (node.type === 'text') {
                this.cleanLine(node.data);
            }

            // Continue recursion on node if more children nodes
            if (node.children) {
                this.parseRecursion(node);
            }

            // If the node's parent is top level, node complete
            if (node.parent === this.block) {
                this.depth--;
            }
        }
    },

    // Output and cleaning functions

    /**
     * Builds the CSV row for the collected `stats` without writing it.
     * The output layout will vary depending on parsing goals.
     *
     * @returns {string} The row, terminated by a newline.
     */
    buildRow: function() {
        var result = [];

        for (var i = 0; i < this.stats.length; i++) {
            switch (this.stats[i]) {
                // NOTE(review): unlike the other labels this one has no
                // trailing colon and its value is read two entries later;
                // presumably this mirrors the source markup -- confirm.
                case 'First Name':
                    result[0] = this.cleanComma(this.stats[i + 2]);
                    break;
                case 'Last Name:':
                    result[1] = this.cleanComma(this.stats[i + 1]);
                    break;
                case 'Email:':
                    result[6] = this.cleanComma(this.stats[i + 1]);
                    break;
                case 'Website:':
                    result[7] = this.cleanComma(this.stats[i + 1]);
                    break;
            }
        }

        // An unset num becomes an empty column instead of "undefined".
        var rowId = (this.num === undefined || this.num === null) ? '' : this.num;

        // Unassigned slots of `result` are joined as empty columns.
        return rowId + ', ' + result.join(', ') + '\n';
    },

    /**
     * Appends the row built from `stats` to `test.csv`.
     * A write error is rethrown from the asynchronous callback.
     */
    outputParse: function() {
        fs.appendFile('test.csv', this.buildRow(), function(err) {
            if (err) throw err;
        });
    },

    /**
     * Replaces every comma in `str` with a space so the value cannot
     * split a CSV column.
     *
     * @param {string} [str] Value to clean.
     * @returns {string|undefined} Cleaned string; falsy input is returned as is.
     */
    cleanComma: function(str) {
        // Global regex: a plain string pattern replaces only the first match.
        return (str) ? str.replace(/,/g, ' ') : str;
    },

    /**
     * Records a text fragment in `stats`. A fragment that starts with a
     * carriage return or a space has all CR, LF, tab, space and U+00C2
     * characters removed first and is dropped if nothing remains; any other
     * fragment is recorded unchanged.
     *
     * @param {string} line Raw text-node data; non-strings are ignored.
     */
    cleanLine: function(line) {
        if (typeof line !== 'string') {
            return;
        }

        if (line[0] !== '\r' && line[0] !== ' ') {
            this.stats.push(line);
            return;
        }

        var newLine = line.replace(/[\r\n\t \u00C2]/g, '');
        if (newLine !== '') {
            this.stats.push(newLine);
        }
    }
};
module.exports = Parse; |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment