Skip to content

Instantly share code, notes, and snippets.

@designeng
Forked from tchittick/parse_gist.js
Created September 20, 2021 21:33
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save designeng/f34970ef16f6d2ebcb58e0da90e1c3b9 to your computer and use it in GitHub Desktop.
Save designeng/f34970ef16f6d2ebcb58e0da90e1c3b9 to your computer and use it in GitHub Desktop.
Recursion through a Cheerio.js object and writing to .CSV
/* A parser for walking through a large number of HTML blocks
quickly. Used with cheerio.js. Create an instance and begin via:
var parse = new Parse();
parse.run($('some-div')[0]);
*/
var fs = require('fs'),
cheerio = require('cheerio');
/**
 * Collects the text nodes of a DOM subtree (nodes exposing `type`, `data`,
 * `children` and `parent`, as used with cheerio.js) and appends one CSV row
 * per parsed block to `test.csv`.
 *
 * @constructor
 * @param {Object} [block] Accepted for backward compatibility but ignored;
 *     the node to parse is passed to `run()`.
 */
var Parse = function(block) {
  this.block = {}; // top-level node of the current run
  this.depth = 0; // top-level children that are not yet fully processed
  this.stats = []; // cleaned text fragments, in traversal order
};

Parse.prototype = {
  /**
   * Parses one block: resets the per-run state, walks the subtree and, once
   * every top-level child has been handled, writes the CSV row.
   *
   * @param {Object} block Top-level node; must have a `children` array.
   */
  run: function(block) {
    this.block = block;
    this.depth = block.children.length;
    this.stats = [];

    this.parseRecursion(this.block);
  },

  /**
   * Processes the children of `node`, then emits the row if the whole
   * top-level block is finished (`depth` has dropped to 0).
   *
   * @param {Object} node Node whose `children` are visited.
   */
  parseRecursion: function(node) {
    this.parseLoopAllChildren(node.children);

    if (this.depth === 0) {
      this.outputParse();
    }
  },

  /**
   * Visits every node in `nodes`: records text, recurses into nodes that
   * have children, and counts down `depth` for direct children of the
   * top-level block.
   *
   * @param {Array<Object>} [nodes] Sibling nodes; a falsy value is a no-op.
   */
  parseLoopAllChildren: function(nodes) {
    if (!nodes) {
      return;
    }

    // Indexed loop rather than for...in, so only array elements are visited
    // (for...in would also walk any extra enumerable keys on the array).
    for (var i = 0; i < nodes.length; i++) {
      var node = nodes[i];

      // If || Switch blocks here depending on what you are parsing for.
      if (node.type === 'text') {
        this.cleanLine(node.data);
      }

      // Continue recursion on node if it has a children collection.
      if (node.children) {
        this.parseRecursion(node);
      }

      // A direct child of the top-level block is complete at this point.
      if (node.parent === this.block) {
        this.depth--;
      }
    }
  },

  // Output and cleaning functions

  /**
   * Builds a CSV row from the collected fragments and appends it to
   * `test.csv`. A fragment equal to one of the known labels selects the
   * column; the value is taken from a following fragment. Columns without a
   * value are left empty (the array is sparse and `join` renders holes as
   * empty strings).
   *
   * The first column is `this.num`, which this file never assigns —
   * presumably a row identifier set by the caller (TODO confirm). When it is
   * unset the column is written empty instead of the literal "undefined".
   */
  outputParse: function() {
    // Output function will vary depending on parsing goals.
    var result = [];

    for (var i = 0; i < this.stats.length; i++) {
      switch (this.stats[i]) {
        // NOTE(review): unlike the other labels this one has no trailing
        // colon and reads the value two fragments ahead — presumably the
        // colon arrives as its own fragment; verify against the source HTML.
        case 'First Name':
          result[0] = this.cleanComma(this.stats[i + 2]);
          break;
        case 'Last Name:':
          result[1] = this.cleanComma(this.stats[i + 1]);
          break;
        case 'Email:':
          result[6] = this.cleanComma(this.stats[i + 1]);
          break;
        case 'Website:':
          result[7] = this.cleanComma(this.stats[i + 1]);
          break;
      }
    }

    var rowId = (this.num === undefined || this.num === null) ? '' : this.num;
    var row = rowId + ', ' + result.join(', ') + '\n';

    fs.appendFile('test.csv', row, function(err) {
      if (err) throw err;
    });
  },

  /**
   * Replaces every comma in `str` with a space so the value cannot split a
   * CSV column.
   *
   * @param {string} [str] Value to sanitize.
   * @returns {string|*} The sanitized string; falsy input is returned as is.
   */
  cleanComma: function(str) {
    // Global regex: a plain string pattern would replace only the first comma.
    return (str) ? str.replace(/,/g, ' ') : str;
  },

  /**
   * Stores a text fragment in `stats`.
   *
   * A line that does not start with '\r' or a space is stored unchanged.
   * Otherwise all '\r', '\n', '\t', space and U+00C2 characters are removed
   * (including ones inside the text) and the remainder is stored only if it
   * is non-empty.
   *
   * @param {string} line Raw text of a text node.
   */
  cleanLine: function(line) {
    if (line[0] !== '\r' && line[0] !== ' ') {
      this.stats.push(line);
      return;
    }

    // U+00C2 is presumably residue of a mis-decoded non-breaking space
    // (UTF-8 bytes C2 A0) — TODO confirm.
    var newLine = line.replace(/[\r\n\t \u00C2]/g, '');

    if (newLine !== '') {
      this.stats.push(newLine);
    }
  }
};
module.exports = Parse;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment