mdciotti/encodeURIComplete.js

## encodeURIComplete.js
// NOTE: This is overkill. I wrote this and then quickly realized there is a better way to do it. See below.

/**
 * Reports whether a UTF-16 code unit falls in the leading-surrogate range
 * 0xD800 through 0xDBFF, both ends inclusive.
 * https://tc39.es/ecma262/#leading-surrogate
 * @param {number} codeUnit the code unit to classify
 * @returns {boolean} true when codeUnit is inside the leading-surrogate range
 */
function isLeadingSurrogate(codeUnit) {
  const rangeStart = 0xD800;
  const rangeEnd = 0xDBFF;
  return codeUnit >= rangeStart && codeUnit <= rangeEnd;
}

/**
 * Reports whether a UTF-16 code unit falls in the trailing-surrogate range
 * 0xDC00 through 0xDFFF, both ends inclusive.
 * https://tc39.es/ecma262/#trailing-surrogate
 * @param {number} codeUnit the code unit to classify
 * @returns {boolean} true when codeUnit is inside the trailing-surrogate range
 */
function isTrailingSurrogate(codeUnit) {
  const rangeStart = 0xDC00;
  const rangeEnd = 0xDFFF;
  return codeUnit >= rangeStart && codeUnit <= rangeEnd;
}

/**
 * Combines the two code units of a UTF-16 surrogate pair into the single
 * code point they represent.
 * https://tc39.es/ecma262/#sec-utf16decode
 * @param {number} lead the leading-surrogate code unit
 * @param {number} trail the trailing-surrogate code unit
 * @returns {number} the decoded code point
 * @throws {Error} when lead is not a leading surrogate or trail is not a trailing surrogate
 */
function UTF16Decode(lead, trail) {
  if (!isLeadingSurrogate(lead) || !isTrailingSurrogate(trail)) {
    throw new Error('unpaired surrogate');
  }
  // Each surrogate carries a 10-bit payload; the pair is offset by 0x10000.
  const highPart = (lead - 0xD800) * 0x400;
  const lowPart = trail - 0xDC00;
  return highPart + lowPart + 0x10000;
}

/**
 * Produces the UTF-8 octets for the character that starts at `position`.
 * A code unit outside the surrogate range yields one, two or three octets;
 * a leading surrogate followed by a trailing surrogate yields four.
 * @param {string} string the source string
 * @param {number} position index of the first code unit of the character
 * @returns {number[]} the UTF-8 octets, most significant first
 * @throws {Error} when the unit at position is a trailing surrogate, or a
 *   leading surrogate that is not followed by a trailing surrogate
 */
function UTF8OctetsAt(string, position) {
  const unit = string.charCodeAt(position);

  // One octet: 0xxxxxxx
  if (unit <= 0x7F) {
    return [unit];
  }

  // Two octets: 110xxxxx 10xxxxxx
  if (unit <= 0x7FF) {
    return [
      0xC0 | ((unit >> 6) & 0x1F),
      0x80 | (unit & 0x3F),
    ];
  }

  // Three octets: 1110xxxx 10xxxxxx 10xxxxxx (anything that is not a surrogate)
  if (unit < 0xD800 || unit > 0xDFFF) {
    return [
      0xE0 | ((unit >> 12) & 0x0F),
      0x80 | ((unit >> 6) & 0x3F),
      0x80 | (unit & 0x3F),
    ];
  }

  // From here on `unit` is a surrogate; it must lead a valid pair.
  if (isTrailingSurrogate(unit) || position + 1 === string.length) {
    throw new Error('unpaired surrogate');
  }
  const next = string.charCodeAt(position + 1);
  if (!isTrailingSurrogate(next)) {
    throw new Error('unpaired surrogate');
  }

  // Rebuild the 21-bit scalar from the two 10-bit payloads, then slice it
  // into four octets: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
  const scalar = (((unit & 0x3FF) << 10) | (next & 0x3FF)) + 0x10000;
  return [
    0xF0 | (scalar >> 18),
    0x80 | ((scalar >> 12) & 0x3F),
    0x80 | ((scalar >> 6) & 0x3F),
    0x80 | (scalar & 0x3F),
  ];
}

/**
 * Interprets `string` as a sequence of UTF-16 encoded code points and reads
 * the single code point that starts with the code unit at index `position`.
 * https://tc39.es/ecma262/#sec-codepointat
 * @param {string} string the string to read from
 * @param {number} position index of the first code unit to read
 * @returns {{CodePoint: number, CodeUnitCount: number, IsUnpairedSurrogate: boolean}}
 *   the code point, how many code units it occupies (1 or 2), and whether
 *   the unit at `position` is a surrogate without a matching partner
 * @throws {RangeError} when position is not within [0, string.length)
 */
function CodePointAt(string, position) {
  const size = string.length;
  if (!(position >= 0 && position < size)) {
    throw new RangeError(`invalid position ${position}`);
  }
  // Read the unit at `position`. Reading index 0 here would classify the
  // first character of the string on every call.
  const first = string.charCodeAt(position);
  if (!isLeadingSurrogate(first) && !isTrailingSurrogate(first)) {
    return { CodePoint: first, CodeUnitCount: 1, IsUnpairedSurrogate: false };
  }
  // A trailing surrogate on its own, or a leading surrogate at the very end
  // of the string, has no partner.
  if (isTrailingSurrogate(first) || position + 1 === size) {
    return { CodePoint: first, CodeUnitCount: 1, IsUnpairedSurrogate: true };
  }
  const second = string.charCodeAt(position + 1);
  if (!isTrailingSurrogate(second)) {
    return { CodePoint: first, CodeUnitCount: 1, IsUnpairedSurrogate: true };
  }
  return {
    CodePoint: UTF16Decode(first, second),
    CodeUnitCount: 2,
    IsUnpairedSurrogate: false,
  };
}

/**
 * Percent-encodes every character of a string, leaving nothing unescaped.
 * Each code point is converted to its UTF-8 octets and every octet is
 * written as `%XX` with two uppercase hexadecimal digits.
 * https://tc39.es/ecma262/#sec-encode
 * @param {string} value the string to escape
 * @returns {string} the fully percent-encoded string
 * @throws {URIError} when value contains an unpaired surrogate
 */
function encodeURIComplete(value) {
  const totalUnits = value.length;
  let encoded = '';
  for (let index = 0; index !== totalUnits;) {
    const info = CodePointAt(value, index);
    if (info.IsUnpairedSurrogate) {
      throw new URIError('unpaired surrogate');
    }
    const octets = UTF8OctetsAt(value, index);
    // Step over one or two code units, depending on the code point just read.
    index += info.CodeUnitCount;
    for (const octet of octets) {
      const hex = octet.toString(16).toUpperCase().padStart(2, '0');
      encoded += `%${hex}`;
    }
  }
  return encoded;
}

## encodeURIComplete2.js
/**
 * Percent-encodes every character of a string, leaving nothing unescaped.
 * `encodeURIComponent` handles each character it is willing to escape; the
 * characters it returns unchanged are escaped here from their char code.
 * @param {string} value the string to escape
 * @returns {string} the fully percent-encoded string
 */
function encodeURIComplete(value) {
  const pieces = [];
  for (const symbol of Array.from(value)) {
    const escaped = encodeURIComponent(symbol);
    if (escaped === symbol) {
      // encodeURIComponent left this character alone, so escape it by hand.
      const hex = symbol.charCodeAt(0).toString(16).toUpperCase().padStart(2, '0');
      pieces.push(`%${hex}`);
    } else {
      pieces.push(escaped);
    }
  }
  return pieces.join('');
}
	// NOTE: This is overkill. I wrote this and then quickly realized there is a better way to do it. See below.

	/**
	 * Reports whether a UTF-16 code unit falls in the leading-surrogate range
	 * 0xD800 through 0xDBFF, both ends inclusive.
	 * https://tc39.es/ecma262/#leading-surrogate
	 * @param {number} codeUnit the code unit to classify
	 * @returns {boolean} true when codeUnit is inside the leading-surrogate range
	 */
	function isLeadingSurrogate(codeUnit) {
		const rangeStart = 0xD800;
		const rangeEnd = 0xDBFF;
		return codeUnit >= rangeStart && codeUnit <= rangeEnd;
	}

	/**
	 * Reports whether a UTF-16 code unit falls in the trailing-surrogate range
	 * 0xDC00 through 0xDFFF, both ends inclusive.
	 * https://tc39.es/ecma262/#trailing-surrogate
	 * @param {number} codeUnit the code unit to classify
	 * @returns {boolean} true when codeUnit is inside the trailing-surrogate range
	 */
	function isTrailingSurrogate(codeUnit) {
		const rangeStart = 0xDC00;
		const rangeEnd = 0xDFFF;
		return codeUnit >= rangeStart && codeUnit <= rangeEnd;
	}

	/**
	 * Combines the two code units of a UTF-16 surrogate pair into the single
	 * code point they represent.
	 * https://tc39.es/ecma262/#sec-utf16decode
	 * @param {number} lead the leading-surrogate code unit
	 * @param {number} trail the trailing-surrogate code unit
	 * @returns {number} the decoded code point
	 * @throws {Error} when lead is not a leading surrogate or trail is not a trailing surrogate
	 */
	function UTF16Decode(lead, trail) {
		if (!isLeadingSurrogate(lead) || !isTrailingSurrogate(trail)) {
			throw new Error('unpaired surrogate');
		}
		// Each surrogate carries a 10-bit payload; the pair is offset by 0x10000.
		const highPart = (lead - 0xD800) * 0x400;
		const lowPart = trail - 0xDC00;
		return highPart + lowPart + 0x10000;
	}

	/**
	 * Produces the UTF-8 octets for the character that starts at `position`.
	 * A code unit outside the surrogate range yields one, two or three octets;
	 * a leading surrogate followed by a trailing surrogate yields four.
	 * @param {string} string the source string
	 * @param {number} position index of the first code unit of the character
	 * @returns {number[]} the UTF-8 octets, most significant first
	 * @throws {Error} when the unit at position is a trailing surrogate, or a
	 *   leading surrogate that is not followed by a trailing surrogate
	 */
	function UTF8OctetsAt(string, position) {
		const size = string.length;
		const lead = string.charCodeAt(position);
		if (lead <= 0x007F) {
			return [lead];
		}
		if (lead <= 0x07FF) {
			const first = 0b11000000 | ((0b0000011111000000 & lead) >> 6);
			const second = 0b10000000 | (0b0000000000111111 & lead);
			return [first, second];
		}
		if (lead <= 0xD7FF || lead >= 0xE000) {
			const first = 0b11100000 | ((lead & 0b1111000000000000) >> 12);
			const second = 0b10000000 | ((lead & 0b0000111111000000) >> 6);
			const third = 0b10000000 | (lead & 0b0000000000111111);
			return [first, second, third];
		}
		// From here on `lead` is a surrogate; it must lead a valid pair.
		if (isTrailingSurrogate(lead) || position + 1 === size) {
			throw new Error('unpaired surrogate');
		}
		const trail = string.charCodeAt(position + 1);
		if (!isTrailingSurrogate(trail)) {
			throw new Error('unpaired surrogate');
		}

		// Each letter represents one bit from the lead or trail.
		// lead:  110110vv vvwwwwxx
		// trail: 110111yy yyzzzzzz
		const vvvv = (0b0000001111000000 & lead) >> 6;
		// The +1 accounts for the 0x10000 offset of supplementary code points.
		const uuuuu = vvvv + 1;
		const wwww = (0b0000000000111100 & lead) >> 2;
		const xx = (0b0000000000000011 & lead);
		const yyyy = (0b0000001111000000 & trail) >> 6;
		const zzzzzz = (0b0000000000111111 & trail);
		// return 11110uuu 10uuwwww 10xxyyyy 10zzzzzz
		const first = 0b11110000 | (uuuuu >> 2);
		const second = 0b10000000 | ((uuuuu & 0b11) << 4) | wwww;
		const third = 0b10000000 | (xx << 4) | yyyy;
		const fourth = 0b10000000 | zzzzzz;
		return [first, second, third, fourth];
	}

	/**
	 * Interprets `string` as a sequence of UTF-16 encoded code points and reads
	 * the single code point that starts with the code unit at index `position`.
	 * https://tc39.es/ecma262/#sec-codepointat
	 * @param {string} string the string to read from
	 * @param {number} position index of the first code unit to read
	 * @returns {{CodePoint: number, CodeUnitCount: number, IsUnpairedSurrogate: boolean}}
	 *   the code point, how many code units it occupies (1 or 2), and whether
	 *   the unit at `position` is a surrogate without a matching partner
	 * @throws {RangeError} when position is not within [0, string.length)
	 */
	function CodePointAt(string, position) {
		const size = string.length;
		if (!(position >= 0 && position < size)) {
			throw new RangeError(`invalid position ${position}`);
		}
		// Read the unit at `position`. Reading index 0 here would classify the
		// first character of the string on every call.
		const first = string.charCodeAt(position);
		if (!isLeadingSurrogate(first) && !isTrailingSurrogate(first)) {
			return { CodePoint: first, CodeUnitCount: 1, IsUnpairedSurrogate: false };
		}
		// A trailing surrogate on its own, or a leading surrogate at the very
		// end of the string, has no partner.
		if (isTrailingSurrogate(first) || position + 1 === size) {
			return { CodePoint: first, CodeUnitCount: 1, IsUnpairedSurrogate: true };
		}
		const second = string.charCodeAt(position + 1);
		if (!isTrailingSurrogate(second)) {
			return { CodePoint: first, CodeUnitCount: 1, IsUnpairedSurrogate: true };
		}
		return {
			CodePoint: UTF16Decode(first, second),
			CodeUnitCount: 2,
			IsUnpairedSurrogate: false,
		};
	}

	/**
	 * Percent-encodes every character of a string, leaving nothing unescaped.
	 * Each code point is converted to its UTF-8 octets and every octet is
	 * written as `%XX` with two uppercase hexadecimal digits.
	 * https://tc39.es/ecma262/#sec-encode
	 * @param {string} value the string to escape
	 * @returns {string} the fully percent-encoded string
	 * @throws {URIError} when value contains an unpaired surrogate
	 */
	function encodeURIComplete(value) {
		const totalUnits = value.length;
		let encoded = '';
		for (let index = 0; index !== totalUnits;) {
			const info = CodePointAt(value, index);
			if (info.IsUnpairedSurrogate) {
				throw new URIError('unpaired surrogate');
			}
			const octets = UTF8OctetsAt(value, index);
			// Step over one or two code units, depending on the code point just read.
			index += info.CodeUnitCount;
			for (const octet of octets) {
				const hex = octet.toString(16).toUpperCase().padStart(2, '0');
				encoded += `%${hex}`;
			}
		}
		return encoded;
	}
	/**
	 * Percent-encodes every character of a string, leaving nothing unescaped.
	 * `encodeURIComponent` handles each character it is willing to escape; the
	 * characters it returns unchanged are escaped here from their char code.
	 * @param {string} value the string to escape
	 * @returns {string} the fully percent-encoded string
	 */
	function encodeURIComplete(value) {
		const pieces = [];
		for (const symbol of Array.from(value)) {
			const escaped = encodeURIComponent(symbol);
			if (escaped === symbol) {
				// encodeURIComponent left this character alone, so escape it by hand.
				const hex = symbol.charCodeAt(0).toString(16).toUpperCase().padStart(2, '0');
				pieces.push(`%${hex}`);
			} else {
				pieces.push(escaped);
			}
		}
		return pieces.join('');
	}