// Simple HTML Parser
// GitHub Gist by @shaosh (gist shaosh/697b796889c679bfda37), created January 21, 2015 17:05.
// Note: the GitHub page text captured with this code ("Skip to content", "Instantly share
// code, notes, and snippets.", "Show Gist options", "Save shaosh/697b796889c679bfda37 to
// your computer and use it in GitHub Desktop.") is page navigation, not part of the program.
/**
 * Simple HTML parser. Instances hold no state of their own; all behaviour
 * lives on HtmlParser.prototype.
 */
function HtmlParser(){}

/**
 * Minimal LIFO stack backed by the array `stac`.
 */
function Stack(){
    this.stac = [];
}
/** Removes and returns the top item (undefined when the stack is empty). */
Stack.prototype.pop = function(){
    return this.stac.pop();
};
/** Pushes `item` onto the top of the stack. */
Stack.prototype.push = function(item){
    this.stac.push(item);
};
/** Returns the top item without removing it, or null when the stack is empty. */
Stack.prototype.peek = function(){
    return this.empty() ? null : this.stac[this.stac.length - 1];
};
/** Returns true when the stack holds no items. */
Stack.prototype.empty = function(){
    return this.stac.length === 0;
};

// Module-level list that parse() appends every parsed element to.
// Callers reset it (result = []) before parsing a new document.
var result = [];

//Reference:
//http://pickerel.iteye.com/blog/264252
//http://erik.eae.net/simplehtmlparser/simplehtmlparser.js
HtmlParser.prototype = {
    // Start tag at the very beginning of the input.
    // Group 1: tag name. Group 2: raw attribute text ('' when there is none).
    // No `m` flag: with it, `^` could match at a later line and silently
    // skip the text in front of the match.
    startTagRe: /^<([^>\s\/]+)((\s+[^=>\s]+(\s*=\s*((\"[^\"]*\")|(\'[^\']*\')|[^>\s]+))?)*)\s*\/?\s*>/,
    // End tag at the very beginning of the input. Group 1: tag name.
    endTagRe: /^<\/([^>\s]+)[^>]*>/,
    // One attribute: name, optionally followed by = and a quoted/unquoted value.
    attrRe: /([^=\s]+)(\s*=\s*((\"([^\"]*)\")|(\'([^\']*)\')|[^>\s]+))?/g,
    // No longer used by the parser itself; kept so existing references keep working.
    typeRe: /^<([^>\s\/]+)[>\s]/,

    /**
     * Parses an HTML string into a flat list of element objects in document
     * order. Each object has a `type` (tag name), one property per attribute
     * and, when the element directly contains text, a `content` property
     * holding that text (all direct text chunks concatenated).
     *
     * Every element is appended to the module-level `result` array (the
     * original contract) and the elements created by this call are also
     * returned.
     *
     * Comments are skipped. A "<" that does not start a well-formed comment
     * or tag is treated as literal text, so malformed input always
     * terminates. Text outside any open element is discarded. Self-closing
     * tags (".../>") are recorded but never become the owner of text.
     *
     * NOTE: attributes share the object with `type`/`content`, so an
     * attribute named "type" or "content" overwrites/gets overwritten.
     *
     * @param {string} s HTML source; null/undefined are treated as ''.
     * @returns {Array<Object>} elements parsed by this call.
     */
    parse: function(s){
        var parsed = [],
            open = new Stack(),   // open elements, innermost on top
            pending = '',         // text seen since the last tag
            match,
            node,
            top,
            next;

        // Attaches pending text to the innermost open element, if any.
        function flushText(){
            var owner = open.peek();
            if(pending !== '' && owner !== null){
                owner.text += pending;
                owner.node.content = owner.text;
            }
            pending = '';
        }

        if(s === null || s === undefined){
            s = '';
        }
        s = String(s);

        while(s.length > 0){
            //Comment
            if(s.substring(0, 4) === '<!--'){
                next = s.indexOf('-->');
                if(next >= 0){
                    s = s.substring(next + 3);
                    continue;
                }
            }
            //End tag
            else if(s.substring(0, 2) === '</'){
                match = this.endTagRe.exec(s);
                if(match !== null){
                    s = s.substring(match[0].length);
                    // Text before the end tag belongs to the element being closed.
                    flushText();
                    top = open.peek();
                    if(top !== null && top.tag === this.parseEndTag(match[0])){
                        open.pop();
                    }
                    continue;
                }
            }
            //Start tag
            else if(s.charAt(0) === '<'){
                match = this.startTagRe.exec(s);
                if(match !== null){
                    s = s.substring(match[0].length);
                    // Text before a child tag belongs to the parent.
                    flushText();
                    node = this.parseStartTag(match[0]);
                    result.push(node);
                    parsed.push(node);
                    if(!/\/\s*>$/.test(match[0])){
                        // Keep the tag name separately: node.type may be
                        // overwritten by an attribute called "type".
                        open.push({tag: match[1], node: node, text: ''});
                    }
                    continue;
                }
            }
            // Plain text, or a "<" that did not start a valid construct.
            // Searching from index 1 guarantees progress on every pass.
            next = s.indexOf('<', 1);
            if(next < 0){
                pending += s;
                s = '';
            }
            else{
                pending += s.substring(0, next);
                s = s.substring(next);
            }
        }
        // Trailing text of an element that was never closed.
        flushText();
        return parsed;
    },

    /**
     * Builds the element object for a start tag.
     * @param {string} sTag full start tag text, e.g. "<ul id='myList'>".
     * @returns {Object|null} object with `type` plus one property per
     *     attribute, or null when sTag is not a start tag.
     */
    parseStartTag: function(sTag){
        var match = this.startTagRe.exec(sTag),
            attrs,
            obj;
        if(match === null){
            return null;
        }
        obj = {};
        obj.type = match[1];
        attrs = match[2].trim();
        if(attrs !== ''){
            this.parseAttrs(attrs, obj);
        }
        return obj;
    },

    /**
     * Extracts the tag name from an end tag.
     * @param {string} eTag full end tag text, e.g. "</li>".
     * @returns {string|null} the tag name, or null when eTag is not an end tag.
     */
    parseEndTag: function(eTag){
        var match = this.endTagRe.exec(eTag);
        return match === null ? null : match[1];
    },

    /**
     * Parses every attribute in `attrs` and stores it on `obj`.
     * @param {string} attrs attribute text, e.g. "id='a' class=\"b\"".
     * @param {Object} obj object that receives one property per attribute.
     */
    parseAttrs: function(attrs, obj){
        var re,
            match;
        if(!attrs){
            return;
        }
        // Private global copy: the shared attrRe carries lastIndex between
        // calls, which made the old test()-and-slice loop skip attributes.
        re = new RegExp(this.attrRe.source, 'g');
        while((match = re.exec(attrs)) !== null){
            this.parseAttr(match[0], obj);
        }
    },

    /**
     * Parses a single attribute and stores it on `obj`.
     * Quoted values lose their surrounding quotes, unquoted values are kept
     * as written, and attributes without a value are stored as ''.
     * @param {string} attr attribute text, e.g. "href='x'" or "disabled".
     * @param {Object} obj object that receives the attribute.
     */
    parseAttr: function(attr, obj){
        var equalIndex = attr.indexOf('='),
            name,
            value,
            quote;
        if(equalIndex < 0){
            // A trailing "/" is the self-closing marker, not part of the name.
            name = attr.trim().replace(/\/$/, '');
            value = '';
        }
        else{
            name = attr.substring(0, equalIndex).trim();
            value = attr.substring(equalIndex + 1).trim();
            quote = value.charAt(0);
            if(value.length >= 2 && (quote === '"' || quote === '\'') && value.charAt(value.length - 1) === quote){
                value = value.substring(1, value.length - 1);
            }
        }
        if(name !== ''){
            obj[name] = value;
        }
    }
};
//Testing
// Runs the parser over two sample documents and prints, for each one, the
// input followed by the parsed elements wrapped in "[" ... "]".
var parser = new HtmlParser();
var sample = "<div><ul id='myList'><li class='hello'>Hello</li><li customTag='Earth'>World</li></ul></div><a href='http://famo.us'>Click</a>";
var sample2 = "<div>content of div<ul id='myList'><li class='hello'>Hello</li>content of ul<li customTag='Earth'>World</li></ul></div><a href='http://famo.us'>Click</a>";
var samples = [sample, sample2];
samples.forEach(function(input){
    console.log('Example Input:\n', input);
    // parse() reports through the module-level `result`, so start from an empty list.
    result = [];
    parser.parse(input);
    // Nothing parsed: print no brackets at all.
    if(result.length === 0){
        return;
    }
    console.log('result:\n[');
    result.forEach(function(node){
        console.log('', node);
    });
    console.log(']');
});
// (GitHub page footer, not part of the program: "Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment")