Skip to content

Instantly share code, notes, and snippets.

@mdciotti
Last active August 1, 2022 03:26
Show Gist options
  • Save mdciotti/121c3e16540b1b273b4bf96156b19baf to your computer and use it in GitHub Desktop.
Save mdciotti/121c3e16540b1b273b4bf96156b19baf to your computer and use it in GitHub Desktop.
Completely encode all characters of a string into a URL escape sequence.
// NOTE: This is overkill. I wrote this and then quickly realized there is a better way to do it. See below.
/**
 * Reports whether a UTF-16 code unit lies in the leading (high) surrogate
 * range, 0xD800 through 0xDBFF inclusive.
 * https://tc39.es/ecma262/#leading-surrogate
 * @param {number} codeUnit the 16-bit code unit to classify
 * @returns {boolean} true when the unit is a leading surrogate
 */
function isLeadingSurrogate(codeUnit) {
  const RANGE_START = 0xD800;
  const RANGE_END = 0xDBFF;
  return codeUnit >= RANGE_START && codeUnit <= RANGE_END;
}
/**
 * Reports whether a UTF-16 code unit lies in the trailing (low) surrogate
 * range, 0xDC00 through 0xDFFF inclusive.
 * https://tc39.es/ecma262/#trailing-surrogate
 * @param {number} codeUnit the 16-bit code unit to classify
 * @returns {boolean} true when the unit is a trailing surrogate
 */
function isTrailingSurrogate(codeUnit) {
  const RANGE_START = 0xDC00;
  const RANGE_END = 0xDFFF;
  return codeUnit >= RANGE_START && codeUnit <= RANGE_END;
}
/**
 * Combines the two halves of a UTF-16 surrogate pair into the code point
 * they encode.
 * https://tc39.es/ecma262/#sec-utf16decode
 * @param {number} lead the leading (high) surrogate code unit
 * @param {number} trail the trailing (low) surrogate code unit
 * @returns {number} the decoded code point
 * @throws {Error} when the arguments are not a leading surrogate followed by
 *   a trailing surrogate
 */
function UTF16Decode(lead, trail) {
  const formsPair = isLeadingSurrogate(lead) && isTrailingSurrogate(trail);
  if (!formsPair) {
    throw new Error('unpaired surrogate');
  }
  // Each surrogate carries 10 payload bits; the lead holds the upper ten.
  const upperBits = (lead - 0xD800) * 0x400;
  const lowerBits = trail - 0xDC00;
  return upperBits + lowerBits + 0x10000;
}
/**
 * Produces the UTF-8 octets for the character that begins at the given
 * code-unit index of a string. A surrogate pair starting at that index is
 * encoded as a single four-octet sequence.
 * @param {string} string the text to read from
 * @param {number} position index of the first code unit of the character
 * @returns {number[]} one to four octet values
 * @throws {Error} when the code unit at `position` is a surrogate that is not
 *   the first half of a well-formed pair
 */
function UTF8OctetsAt(string, position) {
  const unitCount = string.length;
  const unit = string.charCodeAt(position);

  // U+0000..U+007F -> 0xxxxxxx
  if (unit <= 0x7F) {
    return [unit];
  }

  // U+0080..U+07FF -> 110xxxxx 10xxxxxx
  if (unit <= 0x7FF) {
    return [
      0xC0 | ((unit & 0x07C0) >> 6),
      0x80 | (unit & 0x003F),
    ];
  }

  // Rest of the BMP outside the surrogate range -> 1110xxxx 10xxxxxx 10xxxxxx
  if (unit <= 0xD7FF || unit >= 0xE000) {
    return [
      0xE0 | ((unit & 0xF000) >> 12),
      0x80 | ((unit & 0x0FC0) >> 6),
      0x80 | (unit & 0x003F),
    ];
  }

  // From here on a surrogate pair is required: a trailing surrogate cannot
  // start one, and a lead at the very end of the string has no partner.
  if (isTrailingSurrogate(unit) || position + 1 === unitCount) {
    throw new Error('unpaired surrogate');
  }
  const partner = string.charCodeAt(position + 1);
  if (!isTrailingSurrogate(partner)) {
    throw new Error('unpaired surrogate');
  }

  // Bit layout of the pair (one letter per bit):
  //   lead:  110110vv vvwwwwxx
  //   trail: 110111yy yyzzzzzz
  // Output:  11110uuu 10uuwwww 10xxyyyy 10zzzzzz, where uuuuu = vvvv + 1.
  const planeBits = ((unit & 0x03C0) >> 6) + 1; // uuuuu
  const leadMiddle = (unit & 0x003C) >> 2; // wwww
  const leadLow = unit & 0x0003; // xx
  const trailHigh = (partner & 0x03C0) >> 6; // yyyy
  const trailLow = partner & 0x003F; // zzzzzz

  return [
    0xF0 | (planeBits >> 2),
    0x80 | ((planeBits & 0x3) << 4) | leadMiddle,
    0x80 | (leadLow << 4) | trailHigh,
    0x80 | trailLow,
  ];
}
/**
 * Interprets a string as a sequence of UTF-16 encoded code points and reads
 * the single code point that starts with the code unit at index `position`.
 * https://tc39.es/ecma262/#sec-codepointat
 * @param {string} string the text to read from
 * @param {number} position index of the code unit to start at
 * @returns {{CodePoint: number, CodeUnitCount: number, IsUnpairedSurrogate: boolean}}
 *   the code point (or the lone surrogate's own value), how many code units
 *   it occupies, and whether it is a surrogate without a valid partner
 * @throws {RangeError} when `position` is not within `[0, string.length)`
 */
function CodePointAt(string, position) {
  const size = string.length;
  if (!(position >= 0 && position < size)) {
    throw new RangeError(`invalid position ${position}`);
  }

  // The code unit under inspection is the one at `position`.
  const first = string.charCodeAt(position);

  if (!isLeadingSurrogate(first) && !isTrailingSurrogate(first)) {
    return { CodePoint: first, CodeUnitCount: 1, IsUnpairedSurrogate: false };
  }

  // A trailing surrogate cannot begin a pair, and a leading surrogate at the
  // end of the string has nothing to pair with.
  if (isTrailingSurrogate(first) || position + 1 === size) {
    return { CodePoint: first, CodeUnitCount: 1, IsUnpairedSurrogate: true };
  }

  const second = string.charCodeAt(position + 1);
  if (!isTrailingSurrogate(second)) {
    return { CodePoint: first, CodeUnitCount: 1, IsUnpairedSurrogate: true };
  }

  return {
    CodePoint: UTF16Decode(first, second),
    CodeUnitCount: 2,
    IsUnpairedSurrogate: false,
  };
}
/**
 * Percent-encodes every character of a string, producing one `%XX` escape
 * per UTF-8 octet.
 * https://tc39.es/ecma262/#sec-encode
 * @param {string} value the string to escape
 * @returns {string} the fully escaped string
 * @throws {URIError} when `value` contains an unpaired surrogate
 */
function encodeURIComplete(value) {
  const totalUnits = value.length;
  let escaped = '';
  let index = 0;

  while (index !== totalUnits) {
    const info = CodePointAt(value, index);
    if (info.IsUnpairedSurrogate) {
      throw new URIError('unpaired surrogate');
    }

    const octets = UTF8OctetsAt(value, index);
    // Skip past every code unit the character occupied (two for a pair).
    index += info.CodeUnitCount;

    for (const octet of octets) {
      const hex = octet.toString(16).toUpperCase().padStart(2, '0');
      escaped = escaped + '%' + hex;
    }
  }

  return escaped;
}
/**
 * Percent-encodes every character of a string.
 * Characters that `encodeURIComponent` already escapes are delegated to it;
 * the characters it leaves untouched are escaped here from their char code.
 * @param {string} value the string to escape
 * @returns {string} the fully escaped string
 */
function encodeURIComplete(value) {
  let escaped = '';
  for (const symbol of Array.from(value)) {
    const viaBuiltin = encodeURIComponent(symbol);
    if (viaBuiltin !== symbol) {
      escaped += viaBuiltin;
      continue;
    }
    // The builtin left this character as-is, so escape it by hand.
    const hex = symbol.charCodeAt(0).toString(16).toUpperCase();
    escaped += '%' + hex.padStart(2, '0');
  }
  return escaped;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment