Skip to content

Instantly share code, notes, and snippets.

@Mellen
Last active August 30, 2019 10:05
Show Gist options
  • Save Mellen/ce5d247587dcb7b49e5145a6f49328be to your computer and use it in GitHub Desktop.
Save Mellen/ce5d247587dcb7b49e5145a6f49328be to your computer and use it in GitHub Desktop.
Replaces numeric HTML entities, written in either hexadecimal or decimal, with the characters they represent.
// Parser states for replaceNumericEntities, listed in the order a reference
// is recognised. The numeric values are arbitrary but must stay distinct.

// Copying ordinary text; no candidate reference is open.
const NORMAL = 0;
// Just consumed '&'.
const FOUNDAMP = 1;
// Consumed '&#'; a decimal digit or the hex marker must follow.
const FOUNDHASH = 2;
// Decimal branch: at least one decimal digit has been consumed.
const FOUNDDEC = 6;
// Hex branch: consumed the hex marker, still waiting for the first hex digit.
const FOUNDX = 3;
// Hex branch: at least one hex digit has been consumed.
const FOUNDHEX = 4;
// The terminating ';' has been consumed; the reference is ready to decode.
const GOTENTITY = 5;
/**
 * Replaces numeric HTML character references in `input` with the characters
 * they denote.
 *
 * Recognised forms are decimal (`&#65;`) and hexadecimal (`&#x41;` or
 * `&#X41;`). A reference is decoded only when it is terminated by `;` and its
 * value does not exceed U+10FFFF. Everything else (named entities such as
 * `&amp;`, unterminated or malformed sequences, out-of-range values) is
 * copied to the result unchanged.
 *
 * @param {string} input - Text to scan; other values are converted with String().
 * @returns {string} The text with every valid numeric reference decoded.
 */
function replaceNumericEntities(input)
{
    const hexdigits = '0123456789abcdefABCDEF';
    const digits = '0123456789';
    let output = '';
    // Raw text of the candidate reference seen so far, starting at its '&'.
    let entity = '';
    let state = NORMAL;

    for(const c of String(input))
    {
        switch(state)
        {
            case NORMAL:
                if(c === '&')
                {
                    state = FOUNDAMP;
                    entity = c;
                }
                else
                {
                    output += c;
                }
                break;

            case FOUNDAMP:
                if(c === '#')
                {
                    state = FOUNDHASH;
                    entity += c;
                }
                else
                {
                    ({output, entity, state} = reset(c, entity, output));
                }
                break;

            case FOUNDHASH:
                // HTML accepts both 'x' and 'X' as the hexadecimal marker.
                if(c === 'x' || c === 'X')
                {
                    state = FOUNDX;
                    entity += c;
                }
                else if(digits.includes(c))
                {
                    state = FOUNDDEC;
                    entity += c;
                }
                else
                {
                    ({output, entity, state} = reset(c, entity, output));
                }
                break;

            case FOUNDX:
                if(hexdigits.includes(c))
                {
                    state = FOUNDHEX;
                    entity += c;
                }
                else
                {
                    ({output, entity, state} = reset(c, entity, output));
                }
                break;

            case FOUNDHEX:
            case FOUNDDEC:
            {
                const isHex = state === FOUNDHEX;
                if(c === ';')
                {
                    const inRange = isHex ? withinHexRange(entity) : withinDecRange(entity);
                    // An out-of-range reference is passed through verbatim,
                    // including the ';' that terminated it.
                    output += inRange ? getCharacter(entity, isHex) : entity + c;
                    entity = '';
                    state = NORMAL;
                }
                else if((isHex ? hexdigits : digits).includes(c))
                {
                    entity += c;
                }
                else
                {
                    ({output, entity, state} = reset(c, entity, output));
                }
                break;
            }
        }
    }

    // A reference cut short by the end of the input is emitted as plain text.
    return output + entity;
}
/**
 * Abandons the candidate reference currently being parsed.
 *
 * The buffered `entity` text is appended to `output` verbatim. The character
 * that broke the match is then either kept as the start of a new candidate
 * (when it is '&') or appended to the output as ordinary text.
 *
 * @param {string} c - The character that did not fit the current state.
 * @param {string} entity - Text buffered for the abandoned candidate.
 * @param {string} output - Output produced so far.
 * @returns {{output: string, entity: string, state: number}} The updated
 *     output, the new entity buffer, and the state to continue in.
 */
function reset(c, entity, output)
{
    const flushed = output + entity;

    if(c === '&')
    {
        // '&' may open another reference, so hold it back in the buffer.
        return {output: flushed, entity: c, state: FOUNDAMP};
    }

    return {output: flushed + c, entity: '', state: NORMAL};
}
/**
 * Tells whether a hexadecimal reference names a value no greater than
 * U+10FFFF.
 *
 * @param {string} entity - Reference text without the trailing ';', e.g. '&#x41'.
 * @returns {boolean} True when the parsed value is at most 0x10ffff; false
 *     otherwise, including when no hex digits can be parsed.
 */
function withinHexRange(entity)
{
    const MAX_CODE_POINT = 0x10ffff;
    // Skip the three-character '&#x' prefix to reach the digits.
    const hexDigits = entity.slice(3);
    const codePoint = Number.parseInt(hexDigits, 16);
    return codePoint <= MAX_CODE_POINT;
}
/**
 * Tells whether a decimal reference names a value no greater than U+10FFFF.
 *
 * @param {string} entity - Reference text without the trailing ';', e.g. '&#65'.
 * @returns {boolean} True when the parsed value is at most 0x10ffff; false
 *     otherwise, including when no decimal digits can be parsed.
 */
function withinDecRange(entity)
{
    const MAX_CODE_POINT = 0x10ffff;
    // Skip the two-character '&#' prefix to reach the digits.
    const decimalDigits = entity.slice(2);
    const codePoint = Number.parseInt(decimalDigits, 10);
    return codePoint <= MAX_CODE_POINT;
}
/**
 * Converts a numeric character reference into the character it denotes.
 *
 * @param {string} entity - Reference text without the trailing ';', e.g.
 *     '&#65' or '&#x41'.
 * @param {boolean} isHex - True when the digits are hexadecimal (after a
 *     three-character prefix), false when decimal (two-character prefix).
 * @returns {string} The string produced by String.fromCodePoint for the
 *     parsed value.
 */
function getCharacter(entity, isHex)
{
    // The hex form carries one more prefix character than the decimal form.
    const prefixLength = isHex ? 3 : 2;
    const radix = isHex ? 16 : 10;
    const codePoint = Number.parseInt(entity.slice(prefixLength), radix);
    return String.fromCodePoint(codePoint);
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment