Created
October 2, 2013 03:37
-
-
Save Kreijstal/6788840 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/**
 * Splits an HTML string into a flat, lossless list of lexical tokens.
 *
 * Each token is `{Type, Content}`. Concatenating every `Content` in order
 * reproduces the input exactly. Token types:
 *   - "text"           characters outside any tag
 *   - "comment"        a whole `<!-- ... -->` (runs to end of input if unterminated)
 *   - "startTag"       one of "<", "</" or "<!"
 *   - "tagName"        the name following a startTag
 *   - "WhiteSpace"     a run of separators inside a tag ("/" counts as a separator)
 *   - "attributeName"  an attribute name
 *   - "setter"         the "=" between an attribute name and its value
 *   - "attributeValue" a quoted value (quotes included) or an unquoted value;
 *                      exactly one follows every setter and it may be empty
 *   - "endTag"         the closing ">"
 *   - "verbatimText"   raw content following an opening <script>/<style> tag
 *
 * "<" and "</" only open a tag when followed by an ASCII letter; otherwise
 * they are kept as text.
 *
 * @param {string} string - HTML source; null/undefined yields an empty list.
 * @returns {Array<{Type: string, Content: string}>} tokens in source order.
 */
function TokenizeHTML(string) {
    var input = string == null ? '' : String(string);
    var length = input.length;
    var tokens = [];
    var pendingText = ''; // text seen since the last emitted token
    var pos = 0;          // index of the next unread character
    var lt, next, close, end;

    function emit(type, content) {
        tokens.push({Type: type, Content: content});
    }

    // Emits the accumulated text run, if any, before a markup token.
    function flushText() {
        if (pendingText) {
            emit('text', pendingText);
            pendingText = '';
        }
    }

    // charCodeAt on '' is NaN, so every predicate below is false at end of input.
    function isLetter(ch) {
        var code = ch.charCodeAt(0);
        return (code >= 0x41 && code <= 0x5a) || (code >= 0x61 && code <= 0x7a);
    }

    // NUL, space, NBSP and TAB..CR.
    function isSpace(ch) {
        var code = ch.charCodeAt(0);
        return code === 0 || code === 0x20 || code === 0xa0 || (code >= 0x9 && code <= 0xd);
    }

    // Inside a tag "/" separates like whitespace, so "<br/>" yields tagName "br".
    function isSeparator(ch) {
        return ch === '/' || isSpace(ch);
    }

    function isTagNameChar(ch) {
        return ch !== '>' && !isSeparator(ch);
    }

    function isAttributeNameChar(ch) {
        return ch !== '=' && isTagNameChar(ch);
    }

    // Consumes the longest run of characters accepted by `accepts` and returns it.
    function takeWhile(accepts) {
        var start = pos;
        while (pos < length && accepts(input.charAt(pos))) {
            pos++;
        }
        return input.substring(start, pos);
    }

    // Reads what follows a setter: optional whitespace, then exactly one value.
    function scanAttributeValue() {
        var first, closing, valueEnd, start, ch;
        if (pos < length && isSpace(input.charAt(pos))) {
            emit('WhiteSpace', takeWhile(isSpace));
        }
        first = input.charAt(pos);
        if (first === '"' || first === "'") {
            closing = input.indexOf(first, pos + 1);
            // An unterminated quote is emitted alone so the rest of the tag
            // is still tokenized instead of being swallowed.
            valueEnd = closing === -1 ? pos + 1 : closing + 1;
            emit('attributeValue', input.substring(pos, valueEnd));
            pos = valueEnd;
            return;
        }
        start = pos;
        while (pos < length) {
            ch = input.charAt(pos);
            // "/" belongs to the value (URLs) unless it is the "/" of a
            // trailing "/>".
            if (ch === '>' || isSpace(ch) || (ch === '/' && input.charAt(pos + 1) === '>')) {
                break;
            }
            pos++;
        }
        emit('attributeValue', input.substring(start, pos));
    }

    // Reads attributes up to and including ">". Returns true when the tag was
    // closed, false when the input ended first.
    function scanAttributes() {
        var ch;
        while (pos < length) {
            ch = input.charAt(pos);
            if (ch === '>') {
                pos++;
                emit('endTag', '>');
                return true;
            }
            if (isSeparator(ch)) {
                emit('WhiteSpace', takeWhile(isSeparator));
            } else if (ch === '=') {
                pos++;
                emit('setter', '=');
                scanAttributeValue();
            } else {
                emit('attributeName', takeWhile(isAttributeNameChar));
            }
        }
        return false;
    }

    // Emits everything up to the matching "</script" / "</style" (any case),
    // or to the end of input when there is no closing tag.
    function scanVerbatim(lowerName) {
        var offset = input.substring(pos).search(new RegExp('</' + lowerName, 'i'));
        var stop = offset === -1 ? length : pos + offset;
        emit('verbatimText', input.substring(pos, stop));
        pos = stop;
    }

    // Reads a tag body starting right after its startTag token.
    function scanTag(isOpening) {
        var name = takeWhile(isTagNameChar);
        var lowerName;
        if (name) { // only "<!" can be followed by an empty name, e.g. "<!>"
            emit('tagName', name);
        }
        if (scanAttributes() && isOpening) {
            lowerName = name.toLowerCase();
            if (lowerName === 'script' || lowerName === 'style') {
                scanVerbatim(lowerName);
            }
        }
    }

    while (pos < length) {
        lt = input.indexOf('<', pos);
        if (lt === -1) {
            pendingText += input.substring(pos);
            break;
        }
        pendingText += input.substring(pos, lt);
        pos = lt;
        next = input.charAt(pos + 1);

        if (next === '!') {
            flushText();
            if (input.substring(pos + 2, pos + 4) === '--') {
                close = input.indexOf('-->', pos + 2);
                end = close === -1 ? length : close + 3;
                emit('comment', input.substring(pos, end));
                pos = end;
            } else {
                // Declarations such as <!DOCTYPE html> are tokenized like tags.
                emit('startTag', '<!');
                pos += 2;
                scanTag(false);
            }
        } else if (next === '/' && isLetter(input.charAt(pos + 2))) {
            flushText();
            emit('startTag', '</');
            pos += 2;
            scanTag(false);
        } else if (next === '/') {
            pendingText += '</'; // not an end tag: keep both characters as text
            pos += 2;
        } else if (isLetter(next)) {
            flushText();
            emit('startTag', '<');
            pos += 1;
            scanTag(true);
        } else {
            pendingText += '<'; // a lone "<" is plain text
            pos += 1;
        }
    }

    flushText();
    return tokens;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment