Skip to content

Instantly share code, notes, and snippets.

@Kreijstal
Created October 2, 2013 03:37
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save Kreijstal/6788840 to your computer and use it in GitHub Desktop.
Save Kreijstal/6788840 to your computer and use it in GitHub Desktop.
/**
 * Splits an HTML string into a flat list of lexical tokens.
 *
 * Every token is an object `{Type, Content}`. The tokenizer is lossless:
 * concatenating the `Content` of all tokens reproduces the input exactly.
 *
 * Token types:
 *   - "text"           character data outside of tags
 *   - "comment"        `<!-- ... -->` including delimiters (runs to the end
 *                      of the input when the terminator is missing)
 *   - "startTag"       the opener of a tag: "<", "</" or "<!"
 *   - "tagName"        the name following the opener
 *   - "WhiteSpace"     a run of whitespace inside a tag (see isWhitespace)
 *   - "attributeName"  an attribute name
 *   - "setter"         the "=" between an attribute name and its value
 *   - "attributeValue" a quoted (quotes included) or unquoted value; an empty
 *                      value is emitted when ">" directly ends an assignment
 *   - "endTag"         the ">" closing a tag
 *   - "verbatimText"   raw content of <script>/<style>, up to the matching
 *                      closing tag or the end of the input
 *
 * An opener only starts a tag when it is followed by an ASCII letter;
 * otherwise the "<" is ordinary text.
 *
 * @param {string} string HTML source to tokenize.
 * @returns {Array<{Type: string, Content: string}>} Tokens in source order.
 */
function TokenizeHTML(string){
  var tokens = [];
  var length = string.length;
  var pendingText = '';
  var pos = 0;

  function emit(type, content){
    tokens.push({Type: type, Content: content});
  }

  // Emits the text collected so far (if any) as a single "text" token.
  function flushText(){
    if(pendingText){
      emit("text", pendingText);
      pendingText = '';
    }
  }

  // charAt() past the end yields '', whose char code is NaN, so every
  // predicate below is simply false at the end of the input.
  function isLetter(ch){
    var code = ch.charCodeAt(0);
    return (code >= 0x41 && code <= 0x5a) || (code >= 0x61 && code <= 0x7a);
  }

  // NUL, space, NBSP, TAB..CR and, deliberately, "/" so that the slash of
  // `<br/>` is reported as whitespace rather than glued to a name.
  function isWhitespace(ch){
    var code = ch.charCodeAt(0);
    return code === 0 || code === 0x20 || code === 0xA0 || code === 0x2f ||
      (code >= 0x9 && code <= 0xd);
  }

  function isQuote(ch){
    return ch === '"' || ch === "'";
  }

  function isTagNameChar(ch){
    return !isWhitespace(ch) && ch !== '>';
  }

  function isAttributeNameChar(ch){
    return !isWhitespace(ch) && ch !== '>' && ch !== '=';
  }

  function isUnquotedValueChar(ch){
    return !isWhitespace(ch) && ch !== '>' && !isQuote(ch);
  }

  // Returns the index of the first character at or after `from` that fails
  // `accepts` (or `length` when every remaining character passes).
  function scanWhile(from, accepts){
    var at = from;
    while(at < length && accepts(string.charAt(at))){
      at++;
    }
    return at;
  }

  // Tokenizes the inside of a tag starting right after its opener and returns
  // the index where scanning of ordinary content resumes.
  function readTag(start, opener){
    var at = scanWhile(start, isTagNameChar);
    var tagName = string.substring(start, at);
    var awaitingValue = false; // true after "=" until a value has been read
    var ch, stop, lowered, closer, match;

    emit("tagName", tagName);

    while(at < length){
      ch = string.charAt(at);

      if(isWhitespace(ch)){
        // Whitespace never ends the value state, so `name = value` works.
        stop = scanWhile(at, isWhitespace);
        emit("WhiteSpace", string.substring(at, stop));
        at = stop;
        continue;
      }

      if(ch === '>'){
        if(awaitingValue){
          emit("attributeValue", ''); // `name=>`: keep setter/value paired
        }
        emit("endTag", '>');
        at++;
        lowered = tagName.toLowerCase();
        // Only an opening <script>/<style> switches to raw text.
        if(opener === '<' && (lowered === 'script' || lowered === 'style')){
          // `lowered` is one of two fixed literals, so it is regex-safe.
          closer = new RegExp('</' + lowered, 'gi');
          closer.lastIndex = at;
          match = closer.exec(string);
          stop = match ? match.index : length; // unterminated: take the rest
          emit("verbatimText", string.substring(at, stop));
          at = stop;
        }
        return at;
      }

      if(awaitingValue){
        if(isQuote(ch)){
          stop = string.indexOf(ch, at + 1);
          // An unterminated quote is emitted on its own so that the rest of
          // the tag is still tokenized instead of being swallowed.
          stop = stop === -1 ? at + 1 : stop + 1;
          emit("attributeValue", string.substring(at, stop));
          at = stop;
          awaitingValue = false;
          continue;
        }
        stop = scanWhile(at, isUnquotedValueChar);
        emit("attributeValue", string.substring(at, stop));
        at = stop;
        // A quote right after unquoted characters still belongs to the value.
        awaitingValue = isQuote(string.charAt(at));
        continue;
      }

      if(ch === '='){
        emit("setter", '=');
        at++;
        awaitingValue = true;
        continue;
      }

      stop = scanWhile(at, isAttributeNameChar);
      emit("attributeName", string.substring(at, stop));
      at = stop;
    }
    return at; // input ended inside the tag
  }

  while(pos < length){
    var lt = string.indexOf('<', pos);
    if(lt === -1){
      pendingText += string.substring(pos);
      break;
    }
    pendingText += string.substring(pos, lt);
    pos = lt;

    if(string.substr(pos, 4) === '<!--'){
      // Searching from pos+2 lets the degenerate `<!-->` count as a comment.
      var commentClose = string.indexOf('-->', pos + 2);
      var commentEnd = commentClose === -1 ? length : commentClose + 3;
      flushText();
      emit("comment", string.substring(pos, commentEnd));
      pos = commentEnd;
      continue;
    }

    var opener = '<';
    var next = string.charAt(pos + 1);
    if(next === '!'){
      opener = '<!';
    }else if(next === '/'){
      opener = '</';
    }

    if(!isLetter(string.charAt(pos + opener.length))){
      // Not a tag after all: keep the "<" as text and rescan what follows.
      pendingText += '<';
      pos++;
      continue;
    }

    flushText();
    emit("startTag", opener);
    pos = readTag(pos + opener.length, opener);
  }

  flushText();
  return tokens;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment