Skip to content

Instantly share code, notes, and snippets.

@Munawwar
Last active April 13, 2021 06:25
Show Gist options
  • Save Munawwar/af9432d15118935e1e7cca163f0e4648 to your computer and use it in GitHub Desktop.
Save Munawwar/af9432d15118935e1e7cca163f0e4648 to your computer and use it in GitHub Desktop.
Testing the speed of regex replace approach
const whitespace = '[\\x20\\t\\r\\n\\f]';
const unescapeRegExp = new RegExp('\\\\([\\da-f]{1,6}' + whitespace + '?|(' + whitespace + ')|.)', 'ig');
function unescOrig(str) {
return str.replace(unescapeRegExp, (_, escaped, escapedWhitespace) => {
const high = '0x' + escaped - 0x10000;
// NaN means non-codepoint
// Workaround erroneous numeric interpretation of +"0x"
// eslint-disable-next-line no-self-compare
return high !== high || escapedWhitespace
? escaped
: high < 0
? // BMP codepoint
String.fromCharCode(high + 0x10000)
: // Supplemental Plane codepoint (surrogate pair)
String.fromCharCode((high >> 10) | 0xd800, (high & 0x3ff) | 0xdc00);
});
}
function fasterRegexReplaceAll(str, regexWithGFlag, func) {
const stashedIndex = regexWithGFlag.lastIndex;
regexWithGFlag.lastIndex = -1;
let output = '';
let lastProcessedIndex = 0;
let match;
while ((match = regexWithGFlag.exec(str))) {
output += str.slice(lastProcessedIndex, match.index) + func(...match, match.index, str);
// prepare for next iteration
lastProcessedIndex = regexWithGFlag.lastIndex;
}
output += str.slice(lastProcessedIndex);
regexWithGFlag.lastIndex = stashedIndex;
return output;
}
/**
* @param {string} str
*/
function unescNew(str) {
return fasterRegexReplaceAll(str, unescapeRegExp, (_, escaped, escapedWhitespace) => {
const high = '0x' + escaped - 0x10000;
// NaN means non-codepoint
// Workaround erroneous numeric interpretation of +"0x"
// eslint-disable-next-line no-self-compare
return high !== high || escapedWhitespace
? escaped
: high < 0
? // BMP codepoint
String.fromCharCode(high + 0x10000)
: // Supplemental Plane codepoint (surrogate pair)
String.fromCharCode((high >> 10) | 0xd800, (high & 0x3ff) | 0xdc00);
});
}
/**
*
* @param {string} str
* @returns {[string, number]|undefined}
*/
function gobbleHex(str) {
const lower = str.toLowerCase();
let hex = '';
let spaceTerminated = false;
for (let i = 0; i < 6 && lower[i] !== undefined; i++) {
const code = lower.charCodeAt(i);
// check to see if we are dealing with a valid hex char [a-f|0-9]
const valid = (code >= 97 && code <= 102) || (code >= 48 && code <= 57);
// https://drafts.csswg.org/css-syntax/#consume-escaped-code-point
spaceTerminated = code === 32;
if (!valid) {
break;
}
hex += lower[i];
}
if (hex.length === 0) {
return undefined;
}
const codePoint = parseInt(hex, 16);
const isSurrogate = codePoint >= 0xD800 && codePoint <= 0xDFFF;
// Add special case for
// "If this number is zero, or is for a surrogate, or is greater than the maximum allowed code point"
// https://drafts.csswg.org/css-syntax/#maximum-allowed-code-point
if (isSurrogate || codePoint === 0x0000 || codePoint > 0x10FFFF) {
return ['\uFFFD', hex.length + (spaceTerminated ? 1 : 0)];
}
return [
String.fromCodePoint(codePoint),
hex.length + (spaceTerminated ? 1 : 0),
];
}
const CONTAINS_ESCAPE = /\\/;
function unescFast(str) {
let needToProcess = CONTAINS_ESCAPE.test(str);
if (!needToProcess) {
return str;
}
let ret = "";
for (let i = 0; i < str.length; i++) {
if ((str[i] === "\\")) {
const gobbled = gobbleHex(str.slice(i+1, i+7));
if (gobbled !== undefined) {
ret += gobbled[0];
i += gobbled[1];
continue;
}
// Retain a pair of \\ if double escaped `\\\\`
// https://github.com/postcss/postcss-selector-parser/commit/268c9a7656fb53f543dc620aa5b73a30ec3ff20e
if (str[i +1] === "\\") {
ret += "\\";
i++;
continue;
}
// if \\ is at the end of the string retain it
// https://github.com/postcss/postcss-selector-parser/commit/01a6b346e3612ce1ab20219acc26abdc259ccefb
if (str.length === i + 1) {
ret += str[i];
}
continue;
}
ret += str[i];
}
return ret;
}
// run test first
const inputStrings = [
'#foo',
'#w\\+',
'#foo\\',
'#wow\\\\k',
'.\\31 23',
'.\\🐐',
'.\\E9motion',
'.\\E9 dition',
'.\\0000E9dition',
'.\\1D306',
'.\\1D306k',
];
const assertEqual = (a, b) => {
if (a !== b) throw new Error(`${a} !== ${b}`);
}
const test = () => inputStrings.forEach(str => assertEqual(
unescOrig(str),
unescNew(str),
));
test();
function time(taskName, task, runs = 10 ** 6) {
const start = Date.now();
for (let i = 0; i < runs; i += 1) {
task();
}
const total = Date.now() - start;
console.log(taskName, 'Time taken =', total, 'ms');
return total;
}
const origTime = time('unescOrig()', () => inputStrings.forEach(str => unescOrig(str)));
const newTime = time('unescNew()', () => inputStrings.forEach(str => unescNew(str)));
const fastTime = time('unescFast()', () => inputStrings.forEach(str => unescFast(str)));
console.log('unescNew() / unescOrig() ratio =', newTime / origTime);
console.log('unescFast() / unescOrig() ratio =', fastTime / origTime);
// output on node 14.16.1 (osx 10.14.6, macbook pro mid-2015)
// unescOrig() Time taken = 8285 ms
// unescNew() Time taken = 4314 ms
// unescFast() Time taken = 2349 ms
// unescNew() / unescOrig() ratio = 0.5207000603500301
// unescFast() / unescOrig() ratio = 0.28352444176222086
// tested on node v14 and v12 on osx and ubuntu, and results are similar.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment