Skip to content

Instantly share code, notes, and snippets.

@kevinswiber
Last active October 24, 2023 21:04
Show Gist options
  • Save kevinswiber/41af627ddd7e9cf37bd1ef53eeef2e38 to your computer and use it in GitHub Desktop.
Save kevinswiber/41af627ddd7e9cf37bd1ef53eeef2e38 to your computer and use it in GitHub Desktop.
Implementation of URL Pattern spec
import assert from "node:assert";
/**
 * Escapes every character that is special in a regular expression
 * (plus "/") so the result can be embedded verbatim in a regexp source
 * string. All other characters pass through unchanged.
 */
function escapeRegexString(str) {
  const specials = ".+*?^${}()[]|/\\";
  let escaped = "";
  for (const ch of str) {
    escaped += specials.includes(ch) ? `\\${ch}` : ch;
  }
  return escaped;
}
/**
 * Returns true when `codePoint` may appear in a pattern group name
 * (":name"). Names match [A-Za-z_][A-Za-z0-9_]*: the first code point may
 * be an ASCII letter or "_"; subsequent code points may also be ASCII digits.
 */
function isValidNameCodePoint(codePoint, isFirstCodePoint) {
  if (isFirstCodePoint) {
    return (codePoint >= 65 && codePoint <= 90) ||   // A-Z
      (codePoint >= 97 && codePoint <= 122) ||       // a-z
      codePoint === 95;                              // _
  }
  return (codePoint >= 65 && codePoint <= 90) ||
    (codePoint >= 97 && codePoint <= 122) ||
    (codePoint >= 48 && codePoint <= 57) ||          // 0-9
    codePoint === 95;
}

/** Returns true when `codePoint` is in the 7-bit ASCII range (U+0000–U+007F). */
function isASCII(codePoint) {
  return codePoint <= 0x7f;
}

/**
 * Splits a URL Pattern string into a token list, following the URL Pattern
 * spec's "tokenize" algorithm.
 *
 * Fixes relative to the original:
 *  - the lenient-mode invalid-char tokens read `tokenizer.nameStart`, an
 *    undefined property, so their values were computed from `substr(undefined, …)`;
 *    they now carry the offending text per the spec ("process a tokenizing error");
 *  - after an escape sequence the index only advanced one unit, so the escaped
 *    character was tokenized a second time;
 *  - the loop bound was the code-point count while indexing was UTF-16, which
 *    split surrogate pairs and truncated input containing astral characters.
 * Token `index`/`value` fields are UTF-16 offsets/substrings; `parse` below
 * consumes only `type` and `value`.
 *
 * @param {string} input - the pattern string to tokenize.
 * @param {string} [policy="strict"] - "strict" throws TypeError on malformed
 *   input; "lenient" emits "invalid-char" tokens instead.
 * @returns {Array<{type: string, index: number, value: string}>} tokens,
 *   always terminated by an "end" token.
 * @throws {TypeError} on malformed input when policy is "strict".
 */
function tokenize(input, policy = "strict") {
  const tokenizer = {
    input,
    policy,
    index: 0,        // start of the token currently being scanned (UTF-16 units)
    nextIndex: 0,    // one past the most recently read code point
    codePoint: null, // the most recently read code point
    inputLength: input.length,
    tokens: []
  };
  // Reads the code point at `position` into tokenizer.codePoint and advances
  // tokenizer.nextIndex past it (2 UTF-16 units for astral code points).
  const readCodePointAt = (position) => {
    tokenizer.codePoint = tokenizer.input.codePointAt(position);
    tokenizer.nextIndex = position + (tokenizer.codePoint > 0xffff ? 2 : 1);
  };
  while (tokenizer.index < tokenizer.inputLength) {
    readCodePointAt(tokenizer.index);
    switch (tokenizer.codePoint) {
      case 0x2a /* "*" */:
        tokenizer.tokens.push({
          type: "asterisk",
          index: tokenizer.index,
          value: tokenizer.input.slice(tokenizer.index, tokenizer.nextIndex)
        });
        tokenizer.index = tokenizer.nextIndex;
        continue;
      case 0x2b /* "+" */:
      case 0x3f /* "?" */:
        tokenizer.tokens.push({
          type: "other-modifier",
          index: tokenizer.index,
          value: tokenizer.input.slice(tokenizer.index, tokenizer.nextIndex)
        });
        tokenizer.index = tokenizer.nextIndex;
        continue;
      case 0x5c /* "\" */: {
        // A trailing backslash has nothing to escape.
        if (tokenizer.index === tokenizer.inputLength - 1) {
          if (tokenizer.policy === "strict") {
            throw new TypeError("Invalid escape sequence.");
          }
          assert(tokenizer.policy === "lenient");
          tokenizer.tokens.push({
            type: "invalid-char",
            index: tokenizer.index,
            value: tokenizer.input.slice(tokenizer.index, tokenizer.nextIndex)
          });
          tokenizer.index = tokenizer.nextIndex;
          continue;
        }
        // Consume the escaped code point together with its backslash.
        const escapedIndex = tokenizer.index;
        readCodePointAt(tokenizer.nextIndex);
        tokenizer.tokens.push({
          type: "escaped-char",
          index: escapedIndex,
          value: tokenizer.input.slice(escapedIndex, tokenizer.nextIndex)
        });
        // Resume after the escaped character (the original advanced only one
        // unit, re-tokenizing the escaped character as a plain char).
        tokenizer.index = tokenizer.nextIndex;
        continue;
      }
      case 0x7b /* "{" */:
        tokenizer.tokens.push({
          type: "open",
          index: tokenizer.index,
          value: tokenizer.input.slice(tokenizer.index, tokenizer.nextIndex)
        });
        tokenizer.index = tokenizer.nextIndex;
        continue;
      case 0x7d /* "}" */:
        tokenizer.tokens.push({
          type: "close",
          index: tokenizer.index,
          value: tokenizer.input.slice(tokenizer.index, tokenizer.nextIndex)
        });
        tokenizer.index = tokenizer.nextIndex;
        continue;
      case 0x3a /* ":" */: {
        // Scan a parameter name: [A-Za-z_][A-Za-z0-9_]*
        let namePosition = tokenizer.nextIndex;
        const nameStart = namePosition;
        while (namePosition < tokenizer.inputLength) {
          readCodePointAt(namePosition);
          const isFirstCodePoint = namePosition === nameStart;
          if (!isValidNameCodePoint(tokenizer.codePoint, isFirstCodePoint)) {
            break;
          }
          namePosition = tokenizer.nextIndex;
        }
        if (namePosition <= nameStart) {
          // ":" not followed by a valid name.
          if (tokenizer.policy === "strict") {
            throw new TypeError("Invalid name.");
          }
          assert(tokenizer.policy === "lenient");
          tokenizer.tokens.push({
            type: "invalid-char",
            index: tokenizer.index,
            // The ":" itself (spec: value spans from tokenizer's index to
            // name start; the original read an undefined property here).
            value: tokenizer.input.slice(tokenizer.index, nameStart)
          });
          tokenizer.index = nameStart;
          continue;
        }
        tokenizer.tokens.push({
          type: "name",
          index: nameStart,
          value: tokenizer.input.slice(nameStart, namePosition)
        });
        tokenizer.index = namePosition;
        continue;
      }
      case 0x28 /* "(" */: {
        // Scan a "(...)" regexp group, honoring escapes, nested
        // non-capturing "(?" groups, and balanced parentheses.
        let depth = 1;
        let regexpPosition = tokenizer.nextIndex;
        let error = false;
        const regexpStart = regexpPosition;
        // Lenient-mode recovery: emit an invalid-char token for the "(" and
        // rewind scanning to just after it (spec: process a tokenizing error
        // given regexp start and tokenizer's index).
        const reportRegexpError = () => {
          error = true;
          assert(tokenizer.policy === "lenient");
          tokenizer.tokens.push({
            type: "invalid-char",
            index: tokenizer.index,
            value: tokenizer.input.slice(tokenizer.index, regexpStart)
          });
          tokenizer.index = regexpStart;
        };
        while (regexpPosition < tokenizer.inputLength) {
          readCodePointAt(regexpPosition);
          if (!isASCII(tokenizer.codePoint)) {
            if (tokenizer.policy === "strict") {
              throw new TypeError("Invalid character.");
            }
            reportRegexpError();
            break;
          }
          if (regexpPosition === regexpStart && tokenizer.codePoint === 0x3f /* "?" */) {
            // A regexp group may not begin with "?".
            if (tokenizer.policy === "strict") {
              throw new TypeError("Invalid character.");
            }
            reportRegexpError();
            break;
          }
          if (tokenizer.codePoint === 0x5c /* "\" */) {
            if (regexpPosition === tokenizer.inputLength - 1) {
              // Trailing backslash inside the group.
              if (tokenizer.policy === "strict") {
                throw new TypeError("Invalid escape sequence.");
              }
              reportRegexpError();
              break;
            }
            readCodePointAt(tokenizer.nextIndex);
            if (!isASCII(tokenizer.codePoint)) {
              if (tokenizer.policy === "strict") {
                throw new TypeError("Invalid character.");
              }
              reportRegexpError();
              break;
            }
            regexpPosition = tokenizer.nextIndex;
            continue;
          }
          if (tokenizer.codePoint === 0x29 /* ")" */) {
            depth--;
            if (depth === 0) {
              regexpPosition = tokenizer.nextIndex;
              break;
            }
          } else if (tokenizer.codePoint === 0x28 /* "(" */) {
            depth++;
            if (regexpPosition === tokenizer.inputLength - 1) {
              // "(" as the last character can never be closed.
              if (tokenizer.policy === "strict") {
                throw new TypeError("Invalid char.");
              }
              reportRegexpError();
              break;
            }
            // Peek one code point: only non-capturing "(?" groups are allowed.
            const tempPosition = tokenizer.nextIndex;
            readCodePointAt(tokenizer.nextIndex);
            if (tokenizer.codePoint !== 0x3f /* "?" */) {
              if (tokenizer.policy === "strict") {
                throw new TypeError("Invalid char.");
              }
              reportRegexpError();
              break;
            }
            tokenizer.nextIndex = tempPosition;
          }
          regexpPosition = tokenizer.nextIndex;
        }
        if (error) {
          continue;
        }
        if (depth !== 0) {
          // Unbalanced parentheses.
          if (tokenizer.policy === "strict") {
            throw new TypeError("Invalid char.");
          }
          reportRegexpError();
          continue;
        }
        const regexpLength = regexpPosition - regexpStart - 1;
        if (regexpLength === 0) {
          // Empty "()" group.
          if (tokenizer.policy === "strict") {
            throw new TypeError("Invalid char.");
          }
          reportRegexpError();
          continue;
        }
        tokenizer.tokens.push({
          type: "regexp",
          index: tokenizer.index,
          // The group body, excluding the surrounding parentheses.
          value: tokenizer.input.slice(regexpStart, regexpStart + regexpLength)
        });
        tokenizer.index = regexpPosition;
        continue;
      }
      default:
        tokenizer.tokens.push({
          type: "char",
          index: tokenizer.index,
          value: tokenizer.input.slice(tokenizer.index, tokenizer.nextIndex)
        });
        tokenizer.index = tokenizer.nextIndex;
        continue;
    }
  }
  tokenizer.tokens.push({
    type: "end",
    index: tokenizer.index,
    value: ""
  });
  return tokenizer.tokens;
}
/**
 * Parses a URL Pattern string into a part list, following the URL Pattern
 * spec's "parse a pattern string" algorithm.
 *
 * Bug fix: the prefix check compared against the literal string "prefix"
 * instead of the `prefix` variable, so a group's leading prefix code point
 * (e.g. the "/" in "/:id") was always folded into the preceding fixed text
 * instead of being attached to the part as its `prefix`.
 *
 * @param {string} input - the pattern string, e.g. "/books/:id".
 * @param {object} [options]
 * @param {string} [options.delimiterCodePoint="/"] - segment delimiter used
 *   to build the segment-wildcard regexp.
 * @param {string} [options.prefixCodePoint="/"] - the one char that binds to
 *   a following group as its prefix.
 * @param {(str: string) => string} [encodingCallback] - encodes fixed text,
 *   prefixes, and suffixes; defaults to the identity function.
 * @returns {Array<object>} parts: "fixed-text" parts ({type, value, modifier})
 *   and group parts ({type, value, modifier, name, prefix, suffix}).
 * @throws {TypeError} on malformed patterns or duplicate group names.
 */
function parse(input,
  options = {
    delimiterCodePoint: "/",
    prefixCodePoint: "/",
  }, encodingCallback = (str) => str) {
  // Default each field individually so a partial options object still works.
  const { delimiterCodePoint = "/", prefixCodePoint = "/" } = options;
  // Regexp used for a named group with no explicit regexp: one or more
  // code points that are not the delimiter, matched non-greedily.
  const segmentWildcardRegexp =
    "[^" + escapeRegexString(delimiterCodePoint) + "]+?";
  const parser = {
    tokens: tokenize(input),
    segmentWildcardRegexp,
    parts: [],
    pendingFixedValue: "",  // fixed text accumulated until the next group/flush
    index: 0,               // current position in the token list
    nextNumericName: 0,     // sequential name assigned to anonymous groups
  };
  // Consumes and returns the current token when it has the given type;
  // otherwise leaves the position unchanged and returns null.
  function tryToConsume(tokenType) {
    const current = parser.tokens[parser.index];
    if (current.type === tokenType) {
      parser.index++;
      return current;
    }
    return null;
  }
  // A "*" may stand in for a regexp group, but only when no name preceded it.
  function tryToConsumeRegexpOrWildcard(nameToken) {
    const token = tryToConsume("regexp");
    return (nameToken === null && token === null)
      ? tryToConsume("asterisk") : token;
  }
  // "?", "+" (other-modifier) and "*" may all act as modifiers.
  function tryToConsumeModifierToken() {
    const token = tryToConsume("other-modifier");
    return token === null ? tryToConsume("asterisk") : token;
  }
  // Flushes accumulated fixed text into a "fixed-text" part, if any.
  function maybeAddAPartFromPendingFixedValue() {
    if (parser.pendingFixedValue === "") {
      return;
    }
    const encodedValue = encodingCallback(parser.pendingFixedValue);
    parser.pendingFixedValue = "";
    parser.parts.push({
      type: "fixed-text",
      value: encodedValue,
      modifier: "none"
    });
  }
  // Builds a part from the consumed tokens and appends it to parser.parts.
  // Throws TypeError when two groups share a name.
  function addAPart(prefix, nameToken, regexpOrWildcardToken, suffix, modifierToken) {
    let modifier = "none";
    if (modifierToken !== null) {
      switch (modifierToken.value) {
        case "?":
          modifier = "optional";
          break;
        case "*":
          modifier = "zero-or-more";
          break;
        case "+":
          modifier = "one-or-more";
          break;
      }
    }
    // Bare text with no modifier just accumulates as pending fixed text.
    if (nameToken === null && regexpOrWildcardToken === null && modifier === "none") {
      parser.pendingFixedValue += prefix;
      return;
    }
    maybeAddAPartFromPendingFixedValue();
    // Modified fixed text, e.g. "{abc}?".
    if (nameToken === null && regexpOrWildcardToken === null) {
      assert(suffix === "");
      if (prefix === "") {
        return;
      }
      parser.parts.push({
        type: "fixed-text",
        value: encodingCallback(prefix),
        modifier
      });
      return;
    }
    // Determine the regexp source for the group.
    let regexpValue = "";
    if (regexpOrWildcardToken === null) {
      regexpValue = parser.segmentWildcardRegexp;
    } else if (regexpOrWildcardToken.type === "asterisk") {
      regexpValue = ".*";
    } else {
      regexpValue = regexpOrWildcardToken.value;
    }
    // Canonicalize the two well-known regexps into dedicated part types.
    let type = "regexp";
    if (regexpValue === parser.segmentWildcardRegexp) {
      type = "segment-wildcard";
      regexpValue = "";
    } else if (regexpValue === ".*") {
      type = "full-wildcard";
      regexpValue = "";
    }
    // Anonymous groups get sequential numeric names: "0", "1", ...
    let name = "";
    if (nameToken !== null) {
      name = nameToken.value;
    } else if (regexpOrWildcardToken !== null) {
      name = parser.nextNumericName.toString();
      parser.nextNumericName++;
    }
    if (parser.parts.find((part) => part.name === name)) {
      throw new TypeError("duplicate name");
    }
    parser.parts.push({
      type,
      value: regexpValue,
      modifier,
      name,
      prefix: encodingCallback(prefix),
      suffix: encodingCallback(suffix)
    });
  }
  // Concatenates the values of consecutive char/escaped-char tokens.
  function consumeText() {
    let result = "";
    while (parser.index < parser.tokens.length) {
      let token = tryToConsume("char");
      if (token === null) {
        token = tryToConsume("escaped-char");
      }
      if (token === null) {
        break;
      }
      result += token.value;
    }
    return result;
  }
  while (parser.index < parser.tokens.length) {
    const charToken = tryToConsume("char");
    const nameToken = tryToConsume("name");
    const regexpOrWildcardToken = tryToConsumeRegexpOrWildcard(nameToken);
    if (nameToken !== null || regexpOrWildcardToken !== null) {
      // An unbraced group, e.g. "/:foo". A preceding prefix code point binds
      // to the group as its prefix; any other char is ordinary fixed text.
      // (Fixed: the original tested includes("prefix"), the literal string.)
      let prefix = charToken ? charToken.value : "";
      if (!["", prefixCodePoint].includes(prefix)) {
        parser.pendingFixedValue += prefix;
        prefix = "";
      }
      maybeAddAPartFromPendingFixedValue();
      const modifierToken = tryToConsumeModifierToken();
      addAPart(prefix, nameToken, regexpOrWildcardToken, "", modifierToken);
      continue;
    }
    // Plain text accumulates until the next group boundary.
    let fixedToken = charToken;
    if (fixedToken === null) {
      fixedToken = tryToConsume("escaped-char");
    }
    if (fixedToken !== null) {
      parser.pendingFixedValue += fixedToken.value;
      continue;
    }
    // A braced group, e.g. "{prefix:name(regexp)suffix}?".
    const openToken = tryToConsume("open");
    if (openToken !== null) {
      const prefix = consumeText();
      const innerNameToken = tryToConsume("name");
      const innerRegexpOrWildcardToken = tryToConsumeRegexpOrWildcard(innerNameToken);
      const suffix = consumeText();
      const closeToken = tryToConsume("close");
      if (closeToken === null) {
        throw new TypeError("missing close token");
      }
      const modifierToken = tryToConsumeModifierToken();
      addAPart(prefix, innerNameToken, innerRegexpOrWildcardToken, suffix, modifierToken);
      continue;
    }
    maybeAddAPartFromPendingFixedValue();
    const endToken = tryToConsume("end");
    if (endToken === null) {
      throw new TypeError("expected end token");
    }
  }
  return parser.parts;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment