Skip to content

Instantly share code, notes, and snippets.

@clshortfuse
Created March 6, 2023 21:05
Show Gist options
  • Save clshortfuse/697925e85a5c3d86e7691c11cd6ac823 to your computer and use it in GitHub Desktop.
Save clshortfuse/697925e85a5c3d86e7691c11cd6ac823 to your computer and use it in GitHub Desktop.
JS XML Parser and Builder
/**
* @template T
* @typedef {import('./index.js').TupleTree<T>} TupleTree<T>
*/
/**
* @template T
* @typedef {import('./index.js').TupleTreeEntry<T>} TupleTreeEntry<T>
*/
import {
ATTRIBUTE_NODE_KEY, CDATA_NODE_KEY, CHARCODE_QUESTION, COMMENT_NODE_KEY, CONTENT_NODE_KEY,
} from './constants.js';
/**
 * Escapes a value for use as XML character data (element text).
 *
 * `&`, `<` and carriage returns are replaced by references, and the `>` of
 * any `]]>` sequence is escaped so the text cannot be mistaken for the end
 * of a CDATA section.
 * @param {string|boolean|number} value
 * @return {string}
 */
function escapeContentValue(value) {
  const substitutions = {
    '&': '&amp;',
    '<': '&lt;',
    '\r': '&#xD;',
    ']]>': ']]&gt;',
  };
  // None of the replacement strings contains another pattern, so a single
  // pass yields the same text as replacing each pattern in turn.
  return value.toString().replace(/&|<|\r|\]\]>/g, (match) => substitutions[match]);
}
/**
 * Escapes a value for use inside a quoted XML attribute value.
 *
 * The builders in this file always delimit attribute values with double
 * quotes, so `"` must be escaped or the value would terminate the attribute
 * early. `'` is still escaped as well, which keeps the output safe under
 * either delimiter.
 * @param {string|boolean|number} value
 * @return {string}
 */
function escapeAttributeValue(value) {
  return value.toString()
    // `&` must be handled first so the references added below stay intact.
    .replace(/&/g, '&amp;')
    .replace(/</g, '&lt;')
    .replace(/"/g, '&quot;')
    .replace(/\r/g, '&#xD;')
    .replace(/'/g, '&apos;');
}
/**
 * Makes a value safe to place between `<!--` and `-->`.
 *
 * XML comments may not contain `--` and may not end with `-` (which would
 * produce `--->`). The second hyphen of every `--` pair, and a trailing
 * hyphen, are swapped for a look-alike hyphen character.
 * @param {string|boolean|number} value
 * @return {string}
 */
function escapeCommentValue(value) {
  return value.toString()
    .replace(/--/g, '-‐')
    // A single hyphen left at the end would fuse with the closing `-->`.
    .replace(/-$/, '‐');
}
/**
 * Serializes a JavaScript value as XML markup under the given element name.
 *
 * - `symbol` and `function` values produce no output.
 * - `null` / `undefined` produce an element with empty text.
 * - Arrays emit one element named `key` per item.
 * - `Date` values are written with `toISOString()`.
 * - For other objects, the `CONTENT_NODE_KEY` property is the element text,
 *   the `ATTRIBUTE_NODE_KEY` property supplies attributes (a `Map`, an array
 *   of `[name, value]` pairs, or a plain object) and every other property
 *   becomes a child element.
 * - `COMMENT_NODE_KEY` and `CDATA_NODE_KEY` as `key` emit a comment or a
 *   CDATA section built from `value`.
 * - A falsy `key` denotes the document root: only the children are emitted.
 * @param {string} key Element name, or a falsy value for the root.
 * @param {any} value
 * @return {string}
 */
export function buildXMLFromObject(key, value) {
  /** @type {[string, any][]} */
  const attributes = [];
  /** @type {[string, any][]} */
  const childNodes = [];
  let textValue = null;
  switch (typeof value) {
    case 'symbol':
    case 'function':
      return '';
    case 'undefined':
      textValue = '';
      break;
    case 'object':
      if (value === null) {
        textValue = '';
        break;
      }
      if (Array.isArray(value)) {
        return value.map((arrayValue) => buildXMLFromObject(key, arrayValue)).join('');
      }
      if (value instanceof Date) {
        textValue = value.toISOString();
        break;
      }
      for (const [entryKey, entryValue] of Object.entries(value)) {
        switch (entryKey) {
          case CONTENT_NODE_KEY:
            textValue = entryValue;
            break;
          case ATTRIBUTE_NODE_KEY:
            if (entryValue instanceof Map) {
              attributes.push(...entryValue.entries());
            } else if (Array.isArray(entryValue)) {
              attributes.push(...entryValue);
            } else if (entryValue != null) {
              // A missing attribute bag simply contributes no attributes.
              attributes.push(...Object.entries(entryValue));
            }
            break;
          default:
            childNodes.push([entryKey, entryValue]);
        }
      }
      break;
    case 'string':
      textValue = value;
      break;
    case 'boolean':
    case 'number':
    case 'bigint':
    default:
      textValue = value.toString();
      break;
  }

  const renderedChildren = childNodes
    .map(([childKey, childValue]) => buildXMLFromObject(childKey, childValue))
    .join('');

  if (!key) {
    // Root: there is no wrapping element, only the children.
    return renderedChildren;
  }
  if (key === COMMENT_NODE_KEY) {
    return `<!--${escapeCommentValue(value ?? '')}-->`;
  }
  if (key === CDATA_NODE_KEY) {
    // `]]>` cannot appear inside a CDATA section; split it over two sections.
    const cdata = String(value ?? '').replace(/]]>/g, ']]]]><![CDATA[>');
    return `<![CDATA[${cdata}]]>`;
  }

  const output = [
    '<', key,
    attributes.length ? ' ' : '',
    attributes.map(([attrName, attrValue]) => `${attrName}="${escapeAttributeValue(attrValue)}"`).join(' '),
  ];
  if (!childNodes.length && textValue == null) {
    // No content at all: close with `?>` for `?`-prefixed names, `/>` otherwise.
    // eslint-disable-next-line unicorn/prefer-code-point
    output.push(key.charCodeAt(0) === CHARCODE_QUESTION ? '?' : '/', '>');
  } else {
    output.push(
      '>',
      renderedChildren,
      // Compare against null rather than truthiness so `0` and `false` are kept.
      textValue == null ? '' : escapeContentValue(textValue),
      '</',
      key,
      '>',
    );
  }
  return output.join('');
}
/**
 * Serializes one `[key, value]` tuple-tree entry as XML markup.
 *
 * Comment, CDATA and content entries must carry a string value and are
 * emitted directly. Any other key is an element whose value is an array of
 * child entries; an `ATTRIBUTE_NODE_KEY` child supplies the element's
 * attributes and must come before any other child. An element whose last
 * child is the attribute entry is written in self-closed form (`?>` for
 * names starting with `?`, `/>` otherwise).
 * @param {TupleTreeEntry<string>} entry
 * @return {string}
 * @throws {Error} When the entry shape is invalid.
 */
function buildXMLFromEntry([key, value]) {
  switch (key) {
    case ATTRIBUTE_NODE_KEY:
      // Attributes are only meaningful as the child of an element.
      throw new Error('Invalid entry');
    case COMMENT_NODE_KEY:
      if (typeof value !== 'string') throw new Error('Content nodes must be strings.');
      return `<!--${escapeCommentValue(value)}-->`;
    case CDATA_NODE_KEY:
      if (typeof value !== 'string') throw new Error('Content nodes must be strings.');
      // `]]>` cannot appear inside a CDATA section; split it over two sections.
      return `<![CDATA[${value.replace(/]]>/g, ']]]]><![CDATA[>')}]]>`;
    case CONTENT_NODE_KEY:
      if (typeof value !== 'string') throw new Error('Content nodes must be strings.');
      return escapeContentValue(value);
    default:
  }
  if (typeof value === 'string') throw new Error('Child nodes must be tuples.');

  /** @type {string[]} */
  const output = ['<', key];
  let closed = false;
  let selfClosed = false;

  /**
   * Finishes the start tag once; later calls are no-ops.
   * @param {boolean} selfClose
   * @return {void}
   */
  const checkClose = (selfClose = false) => {
    if (closed) return;
    if (selfClose) {
      selfClosed = true;
      // eslint-disable-next-line unicorn/prefer-code-point
      output.push(key.charCodeAt(0) === CHARCODE_QUESTION ? '?' : '/');
    }
    output.push('>');
    closed = true;
  };

  for (const [index, [childKey, childValue]] of value.entries()) {
    if (childKey === ATTRIBUTE_NODE_KEY) {
      if (typeof childValue === 'string') throw new Error('Attributes must be tuples.');
      if (closed) {
        // Writing attributes now would inject them into the element content.
        throw new Error('Attributes must precede child nodes.');
      }
      for (const [attrName, attrValue] of childValue) {
        output.push(' ', attrName, '="', escapeAttributeValue(/** @type {string} */ (attrValue)), '"');
      }
      // Nothing follows the attributes: emit the self-closed form.
      checkClose(index === value.length - 1);
    } else {
      checkClose();
      output.push(buildXMLFromEntry([childKey, childValue]));
    }
  }
  checkClose();
  if (!selfClosed) {
    output.push('</', key, '>');
  }
  return output.join('');
}
/**
 * Serializes every entry of a tuple tree and concatenates the markup in
 * order.
 * @param {TupleTree<string>} entries
 * @return {string}
 */
export function buildXMLFromEntries(entries) {
  return entries.reduce((xml, entry) => xml + buildXMLFromEntry(entry), '');
}
/**
 * Builds an XML string from either a tuple tree (array input) or a plain
 * object (anything else).
 *
 * With `options.header` set, a default XML declaration is prepended unless
 * the output already begins with one. Only a real declaration counts:
 * processing instructions whose target merely starts with `xml`, such as
 * `<?xml-stylesheet ...?>`, do not suppress the header.
 * @param {Object|TupleTree<string>} input
 * @param {Object} [options]
 * @param {boolean} [options.header]
 * @return {string}
 */
export function buildXML(input, options = {}) {
  const result = Array.isArray(input)
    ? buildXMLFromEntries(input)
    : buildXMLFromObject(null, input);
  // `<?xml` must be followed by whitespace (or the closing `?`) to be the
  // declaration rather than the prefix of a longer target name.
  const hasDeclaration = /^<\?xml[\s?]/.test(result);
  if (options?.header && !hasDeclaration) {
    return `<?xml version="1.0" encoding="utf-8"?>${result}`;
  }
  return result;
}
/* eslint-disable unicorn/prefer-code-point */
export const CONTENT_NODE_KEY = '$';
export const ATTRIBUTE_NODE_KEY = '$$';
export const AS_ARRAY_KEY = '$A';
export const AS_OBJECT_KEY = '$O';
export const AS_STRING_KEY = '$S';
export const COMMENT_NODE_KEY = '!--';
export const CDATA_NODE_KEY = '![CDATA[';
export const CHARCODE_QUESTION = '?'.charCodeAt(0);
export const NODE_TYPE_NONE = Symbol('NONE');
export const NODE_TYPE_XML_DECL = Symbol('XML_DECL');
export const NODE_TYPE_NOTATION_DECL = Symbol('NOTATION_DECL');
export const NODE_TYPE_ROOT = Symbol('ROOT');
export const NODE_TYPE_CHILD = Symbol('CHILD');
/* eslint-disable no-use-before-define */
/**
 * One `[key, value]` pair of a tuple tree. The value is either a leaf of
 * type `T` or an array of nested entries.
 */
export type TupleTreeEntry<T> = [T, T|TupleTreeEntry<T>[]];
/**
 * An ordered list of `[key, value]` pairs in which each value is either a
 * leaf of type `T` or a nested tuple tree.
 */
export type TupleTree<T> = [T, T|TupleTree<T>][];
/**
 * Accessor members shared by every XMLObject, derived from the string keys
 * of the expected shape `T`.
 * NOTE(review): how these members are populated at runtime is not visible
 * here; the comments below describe the declared types only.
 */
export type XMLObjectBase<T> = {
// `$A`: every key typed as an array. A key that is already an array in `T`
// keeps its element type; any other key is wrapped in an array.
$A: {
[P in Extract<keyof T, string>]?: (
T[P] extends T[P][0][] ? XMLObject<T[P][0]>[] :
(XMLObjectBase<T[P]> & XMLObject<T[P]>)[]
)
},
// `$O`: every key typed as an object node with optional `$$` attributes.
// string, number and Date leaves are exposed through a `$` text member.
$O: {
[P in Extract<keyof T, string>]?: {
$$?: Record<string, string>
} & (
T[P] extends T[P][0][] ? (XMLObjectBase<T[P][0]> & XMLObject<T[P][0]>) :
T[P] extends string ? { $:T[P] } :
T[P] extends number ? { $:string } :
T[P] extends Date ? { $:string } :
XMLObjectBase<T[P]> & XMLObject<T[P]>)
},
// `$S`: every key typed as a string; string-typed keys keep their own
// (possibly literal) type.
$S: {
[P in Extract<keyof T, string>]?: (
T[P] extends string ? T[P] : string
)
},
// Attributes of this node, if any.
$$?: Record<string, string>,
};
/**
 * Object view of an XML node for the expected shape `T`: the XMLObjectBase
 * accessors plus one optional property per string key of `T`.
 * - array keys are typed as a single element object (not as an array);
 * - string keys are the string itself or an object with a `$` text member;
 * - number and Date keys are typed as `string`, or an object with a `$`
 *   text member;
 * - anything else recurses.
 */
export type XMLObject<T> = XMLObjectBase<T> & {
[P in Extract<keyof T, string>]?: (
T[P] extends T[P][0][] ? (XMLObjectBase<T[P][0]> & XMLObject<T[P][0]>) :
T[P] extends string ? T[P] | XMLObject<{ $:T[P] }> :
T[P] extends number ? string | XMLObject<{ $:string }> :
T[P] extends Date ? string | XMLObject<{ $:string }> :
XMLObjectBase<T[P]> & XMLObject<T[P]>
)
};
/**
 * Counterpart of XMLObjectBase for the "flat" object view: the same `$A`,
 * `$O` and `$S` accessors, but with no `$$` attribute members anywhere.
 */
export type XMLObjectFlatBase<T> = {
// `$A`: every key typed as an array.
$A: {
[P in Extract<keyof T, string>]?: (
T[P] extends T[P][0][] ? XMLObjectFlat<T[P][0]>[] :
(XMLObjectFlatBase<T[P]> & XMLObjectFlat<T[P]>)[]
)
},
// `$O`: every key typed as an object; string, number and Date leaves are
// exposed through a `$` text member.
$O: {
[P in Extract<keyof T, string>]?: (
T[P] extends T[P][0][] ? (XMLObjectFlatBase<T[P][0]> & XMLObjectFlat<T[P][0]>) :
T[P] extends string ? { $:T[P] } :
T[P] extends number ? { $:string } :
T[P] extends Date ? { $:string } :
XMLObjectFlatBase<T[P]> & XMLObjectFlat<T[P]>)
},
// `$S`: every key typed as a string.
$S: {
[P in Extract<keyof T, string>]?: (
T[P] extends string ? T[P] : string
)
},
};
/**
 * Flat object view of an XML node for the expected shape `T`. Unlike
 * XMLObject, leaves are typed as plain strings only (string keys keep their
 * own type; number and Date keys become `string`), with no `$`-wrapped
 * alternative. Array keys are typed as a single element object.
 */
export type XMLObjectFlat<T> = XMLObjectFlatBase<T> & {
[P in Extract<keyof T, string>]?: (
T[P] extends T[P][0][] ? (XMLObjectFlatBase<T[P][0]> & XMLObjectFlat<T[P][0]>) :
T[P] extends string ? T[P] :
T[P] extends number ? string :
T[P] extends Date ? string :
XMLObjectFlatBase<T[P]> & XMLObjectFlat<T[P]>
)
};
/* eslint-disable unicorn/prefer-code-point */
/** @see https://www.w3.org/TR/xml/ */
import {
AS_ARRAY_KEY,
AS_OBJECT_KEY,
AS_STRING_KEY,
ATTRIBUTE_NODE_KEY,
CDATA_NODE_KEY,
CHARCODE_QUESTION,
COMMENT_NODE_KEY,
CONTENT_NODE_KEY,
} from './constants.js';
/**
* @template T
* @typedef {import('./index.js').TupleTree<T>} TupleTree<T>
*/
/**
* @template T
* @typedef {import('./index.js').TupleTreeEntry<T>} TupleTreeEntry<T>
*/
/**
* @template T
* @typedef {import('./index.js').XMLObject<T>} XMLObject<T>
*/
// Hidden, non-enumerable property key that parseXMLNode attaches to the tuple
// it returns; it holds the input index at which that node's parsing stopped,
// so the caller can resume from there.
const END_POSITION_SYMBOL = Symbol('EndPositionSymbol');

// States of the parseXMLNode state machine, one symbol per state.

// Document level: prolog, whitespace between top-level nodes, declarations.
const STATE_BEGIN = Symbol('BEGIN');
const STATE_PROLOG_OR_ROOT_OPEN = Symbol('PROLOG_OR_ROOT_OPEN');
const STATE_MISC_WHITESPACE = Symbol('MISC_WHITESPACE');
const STATE_DOCTYPE_OR_MISC_OR_ROOT_OPEN = Symbol('DOCTYPE_OR_MISC_OR_ROOT_OPEN');
const STATE_XML_DECL_OPEN = Symbol('XML_DECL_OPEN');
const STATE_XML_DECL_CLOSER = Symbol('XML_DECL_CLOSER');
const STATE_NOTATION_OPEN = Symbol('NOTATION_OPEN');
// Comments (`<!-- ... -->`).
const STATE_COMMENT_OPEN = Symbol('COMMENT_OPEN');
const STATE_COMMENT = Symbol('COMMENT');
const STATE_COMMENT_CARRIAGE_RETURN = Symbol('COMMENT_CARRIAGE_RETURN');
const STATE_COMMENT_HYPHEN = Symbol('COMMENT_HYPHEN');
const STATE_COMMENT_CLOSE = Symbol('COMMENT_CLOSE');
// Start tag and its attributes.
const STATE_START_TAG_OPEN = Symbol('START_TAG_OPEN');
const STATE_START_TAG_NAME = Symbol('START_TAG_NAME');
const STATE_START_TAG_WHITESPACE = Symbol('START_TAG_WHITESPACE');
const STATE_ATTRIBUTE_NAME = Symbol('ATTRIBUTE_NAME');
const STATE_ATTRIBUTE_EQUAL = Symbol('ATTRIBUTE_EQUAL');
const STATE_ATTRIBUTE_VALUE = Symbol('ATTRIBUTE_VALUE');
const STATE_ATTRIBUTE_VALUE_CLOSE = Symbol('ATTRIBUTE_VALUE_CLOSE');
// Character and entity references inside an attribute value.
const STATE_ATTRIBUTE_REFERENCE = Symbol('ATTRIBUTE_REFERENCE');
const STATE_ATTRIBUTE_CHAR_REFERENCE = Symbol('ATTRIBUTE_CHAR_REFERENCE');
const STATE_ATTRIBUTE_CHAR_REFERENCE_HEX = Symbol('ATTRIBUTE_CHAR_REFERENCE_HEX');
const STATE_ATTRIBUTE_CHAR_REFERENCE_DEC = Symbol('ATTRIBUTE_CHAR_REFERENCE_DEC');
const STATE_ATTRIBUTE_ENTITY_REFERENCE = Symbol('ATTRIBUTE_ENTITY_REFERENCE');
const STATE_SELF_CLOSING_TAG_CLOSER = Symbol('SELF_CLOSING_TAG_CLOSER');
// Element content, including detection of a stray `]]>` and references.
const STATE_CONTENT = Symbol('CONTENT');
const STATE_CONTENT_CARRIAGE_RETURN = Symbol('CONTENT_CARRIAGE_RETURN');
const STATE_CONTENT_CDATA_SELECTION_CLOSE_1 = Symbol('CONTENT_CDATA_SELECTION_CLOSE_1');
const STATE_CONTENT_CDATA_SELECTION_CLOSE_2 = Symbol('CONTENT_CDATA_SELECTION_CLOSE_2');
const STATE_CONTENT_REFERENCE = Symbol('CONTENT_REFERENCE');
const STATE_CONTENT_CHAR_REFERENCE = Symbol('CONTENT_CHAR_REFERENCE');
const STATE_CONTENT_CHAR_REFERENCE_HEX = Symbol('CONTENT_CHAR_REFERENCE_HEX');
const STATE_CONTENT_CHAR_REFERENCE_DEC = Symbol('CONTENT_CHAR_REFERENCE_DEC');
const STATE_CONTENT_ENTITY_REFERENCE = Symbol('CONTENT_ENTITY_REFERENCE');
// CDATA sections: one state per letter of the `[CDATA[` opener, then the data.
const STATE_CDATA_OPEN = Symbol('STATE_CDATA_OPEN');
const STATE_CDATA_C = Symbol('STATE_CDATA_C');
const STATE_CDATA_CD = Symbol('STATE_CDATA_CD');
const STATE_CDATA_CDA = Symbol('STATE_CDATA_CDA');
const STATE_CDATA_CDAT = Symbol('STATE_CDATA_CDAT');
const STATE_CDATA_CARRIAGE_RETURN = Symbol('STATE_CDATA_CARRIAGE_RETURN');
const STATE_CDATA_DATA_START = Symbol('STATE_CDATA_DATA_START');
const STATE_CDATA_DATA = Symbol('STATE_CDATA_DATA');
const STATE_CDATA_DATA_END = Symbol('STATE_CDATA_DATA_END');
const STATE_CDATA_CLOSE = Symbol('STATE_CDATA_CLOSE');
// Nested nodes and end tags.
const STATE_UNKNOWN_TAG_OPEN = Symbol('UNKNOWN_TAG_OPEN');
const STATE_CHILD_NODE = Symbol('CHILD_NODE');
const STATE_END_TAG_OPEN = Symbol('END_TAG_OPEN');
const STATE_END_TAG_NAME = Symbol('END_TAG_NAME');
const STATE_END_TAG_WHITESPACE = Symbol('END_TAG_WHITESPACE');
const STATE_END_TAG_CLOSE = Symbol('END_TAG_CLOSE');

// Node kinds passed to parseXMLNode as `options.nodeType`; they select the
// initial state and gate which closing forms are accepted.
const NODE_TYPE_NONE = Symbol('NONE');
const NODE_TYPE_XML_DECL = Symbol('XML_DECL');
const NODE_TYPE_NOTATION_DECL = Symbol('NOTATION_DECL');
const NODE_TYPE_ROOT = Symbol('ROOT');
const NODE_TYPE_CHILD = Symbol('CHILD');
// UTF-16 code units of the punctuation and letters the parser compares
// against while scanning.

// XML whitespace.
const CHARCODE_SPACE = ' '.charCodeAt(0);
const CHARCODE_HTAB = '\t'.charCodeAt(0);
const CHARCODE_CR = '\r'.charCodeAt(0);
const CHARCODE_LF = '\n'.charCodeAt(0);
// Name characters that are matched by explicit `case` labels instead of
// through the range tables.
const CHARCODE_COLON = ':'.charCodeAt(0);
const CHARCODE_UNDERSCORE = '_'.charCodeAt(0);
const CHARCODE_HYPHEN = '-'.charCodeAt(0);
const CHARCODE_PERIOD = '.'.charCodeAt(0);
const CHARCODE_MIDDLE_DOT = '·'.charCodeAt(0);
// Markup delimiters.
const CHARCODE_LESS_THAN = '<'.charCodeAt(0);
const CHARCODE_SLASH = '/'.charCodeAt(0);
const CHARCODE_BANG = '!'.charCodeAt(0);
const CHARCODE_GREATER_THAN = '>'.charCodeAt(0);
const CHARCODE_EQUALS = '='.charCodeAt(0);
const CHARCODE_DOUBLE_QUOTE = '"'.charCodeAt(0);
const CHARCODE_SINGLE_QUOTE = "'".charCodeAt(0);
// Reference syntax: `&name;`, `&#123;`, `&#x7B;`.
const CHARCODE_AMP = '&'.charCodeAt(0);
const CHARCODE_HASH = '#'.charCodeAt(0);
const CHARCODE_SEMICOLON = ';'.charCodeAt(0);
const CHARCODE_LOWERCASE_X = 'x'.charCodeAt(0);
// Pieces of the `<![CDATA[` opener and the `]]>` closer. `A` occurs twice in
// "CDATA", so CHARCODE_A is checked in two different states.
const CHARCODE_OPEN_BRACKET = '['.charCodeAt(0);
const CHARCODE_C = 'C'.charCodeAt(0);
const CHARCODE_D = 'D'.charCodeAt(0);
const CHARCODE_A = 'A'.charCodeAt(0);
const CHARCODE_T = 'T'.charCodeAt(0);
// const CHARCODE_A = 'T'.charCodeAt(0);
const CHARCODE_CLOSE_BRACKET = ']'.charCodeAt(0);
// Inclusive [min, max] ranges checked with assertCharCodeRange.
// NOTE(review): the parser reads characters with `charCodeAt`, which yields
// UTF-16 code units (at most 0xFFFF). Ranges starting at 0x10000 therefore
// look unreachable, and the surrogate halves of astral characters fall
// outside CHARACTER_RANGES — confirm whether that rejection is intended.

// Characters allowed to start a name (XML `NameStartChar`). `:` and `_` are
// not listed because the parser accepts them through explicit `case` labels.
const NAME_START_RANGES = [
['A'.charCodeAt(0), 'Z'.charCodeAt(0)],
['a'.charCodeAt(0), 'z'.charCodeAt(0)],
[0xC0, 0xD6],
// 0xD7 × (multiplication sign) is excluded
[0xD8, 0xF6],
// 0xF7 ÷ (division sign) is excluded
[0xF8, 0x02_FF],
[0x03_70, 0x03_7D],
// 0x37E ; (Greek question mark) is excluded
[0x03_7F, 0x1F_FF],
[0x20_0C, 0x20_0D],
[0x20_70, 0x21_8F],
[0x2C_00, 0x2F_EF],
[0x30_01, 0xD7_FF],
[0xF9_00, 0xFD_CF],
[0xFD_F0, 0xFF_FD],
[0x01_00_00, 0x0E_FF_FF],
];
// Characters allowed after the first character of a name (XML `NameChar`).
// `-`, `.` and the middle dot are handled through explicit `case` labels.
const NAME_RANGES = [
...NAME_START_RANGES,
['0'.charCodeAt(0), '9'.charCodeAt(0)],
[0x03_00, 0x03_6F],
[0x20_3F, 0x20_40],
];
// Legal document characters (XML `Char`). Tab, LF and CR are handled through
// explicit `case` labels before this table is consulted.
const CHARACTER_RANGES = [
[0x20, 0xD7_FF],
[0xE0_00, 0xFF_FD],
[0x01_00_00, 0x10_FF_FF],
];
// Digits allowed in a decimal character reference (`&#...;`).
const CHAR_REFERENCE_DEC_RANGES = [
['0'.charCodeAt(0), '9'.charCodeAt(0)],
];
// Digits allowed in a hexadecimal character reference (`&#x...;`).
const CHAR_REFERENCE_HEX_RANGES = [
['0'.charCodeAt(0), '9'.charCodeAt(0)],
['a'.charCodeAt(0), 'f'.charCodeAt(0)],
['A'.charCodeAt(0), 'F'.charCodeAt(0)],
];
// The five entity names that resolve without a declaration, mapped to their
// replacement text.
const PREDEFINED_ENTITIES = new Map(Object.entries({
  amp: '&',
  lt: '<',
  gt: '>',
  apos: "'",
  quot: '"',
}));
/**
 * Converts the numeric value of a character reference (`&#N;` / `&#xN;`)
 * into the character it denotes.
 *
 * Only code points permitted by the XML `Char` production are accepted:
 * tab, LF, CR, 0x20-0xD7FF, 0xE000-0xFFFD and 0x10000-0x10FFFF. Everything
 * else is rejected, including NaN, negatives, NUL and other control
 * characters, lone surrogates, 0xFFFE/0xFFFF and values above 0x10FFFF.
 * @param {number} reference Code point parsed from the reference.
 * @return {string}
 * @throws {Error} When the value is not a legal XML character.
 */
function parseCharReference(reference) {
  const isLegalCharacter = Number.isInteger(reference) && (
    reference === 0x09
    || reference === 0x0A
    || reference === 0x0D
    || (reference >= 0x20 && reference <= 0xD7_FF)
    || (reference >= 0xE0_00 && reference <= 0xFF_FD)
    || (reference >= 0x01_00_00 && reference <= 0x10_FF_FF)
  );
  if (!isLegalCharacter) {
    throw new Error(`Invalid CharRef (${reference})`);
  }
  // fromCodePoint emits the surrogate pair for astral code points itself.
  return String.fromCodePoint(reference);
}
/**
 * Resolves a named entity reference to its replacement text. Entities in
 * `declaredEntities` take precedence over the predefined ones.
 * @param {string} [entity] Entity name without the surrounding `&` and `;`.
 * @param {Map<string,string>} [declaredEntities]
 * @return {string}
 * @throws {Error} When the name is neither declared nor predefined.
 */
function parseEntityReference(entity, declaredEntities) {
  const table = declaredEntities?.has(entity) ? declaredEntities : PREDEFINED_ENTITIES;
  if (!table.has(entity)) {
    throw new Error(`Unknown entity: ${entity}`);
  }
  return table.get(entity);
}
/**
* @param {string} input
* @param {Object} options
* @param {number} [options.index=0]
* @param {number} [options.charCode]
* @param {Symbol} [options.nodeType]
* @param {boolean} [options.enforceUniqueAttributes=true]
* @param {boolean} [options.enforceEntityDeclared=true]
* @return {TupleTreeEntry<string>}
*/
function parseXMLNode(input, options = {}) {
/** @type {Symbol} */
let state;
switch (options.nodeType) {
default:
case NODE_TYPE_NONE:
state = STATE_BEGIN;
break;
case NODE_TYPE_XML_DECL:
state = STATE_XML_DECL_OPEN;
break;
case NODE_TYPE_NOTATION_DECL:
state = STATE_NOTATION_OPEN;
break;
case NODE_TYPE_ROOT:
case NODE_TYPE_CHILD:
state = STATE_START_TAG_OPEN;
}
/** @type {TupleTree<string>} */
const children = [];
/** @type {string} */
let stringReturnValue;
/** @type {string} */
let tagName;
/** @type {string} */
let attrName;
/** @type {string} */
let attrValue;
/** @type {number} */
let attrValueDelimiter;
/** @type {number} */
let reference;
/** @type {string} */
let entity;
/** @type {string} */
let content;
/** @type {string} */
let comment;
/** @type {string} */
let cdata;
/** @type {TupleTreeEntry<string>} */
let childNode;
// TODO: Add declared entities support
const declaredEntities = new Map();
let index = options.index ?? 0;
let charCode = options.charCode ?? input.charCodeAt(index);
let stringStartIndex = index;
/** @type {string} */
let xmlSpace;
/** @type {Set<string>} */
const attributeNames = new Set();
/** @type {[string,string][]} */
const attributes = [];
let hasContent = false;
let selfClosing = false;
// let previousState = state;
/**
* @param {string} key
* @param {string|TupleTree<string>} value
* @return {void}
*/
function addChild(key, value) {
children.push([key, value]);
}
const resetContent = () => {
hasContent = false;
content = '';
};
const onContentEnd = () => {
content += input.slice(stringStartIndex, index);
if (hasContent || xmlSpace === 'preserve') {
addChild(CONTENT_NODE_KEY, content);
}
};
/**
* @param {number[][]} ranges
* @return {void}
*/
const assertCharCodeRange = (ranges) => {
if (!ranges.some(([min, max]) => charCode >= min && charCode <= max)) {
throw new Error(`Invalid character ${(input[index])} at ${index}.`);
}
};
const onCommentCharCode = () => {
switch (charCode) {
case CHARCODE_HYPHEN:
state = STATE_COMMENT_HYPHEN;
break;
case CHARCODE_CR:
content += input.slice(stringStartIndex, index);
state = STATE_COMMENT_CARRIAGE_RETURN;
break;
default:
assertCharCodeRange(CHARACTER_RANGES);
// Fallthrough
case CHARCODE_HTAB:
case CHARCODE_LF:
state = STATE_COMMENT;
}
};
const onCDataCharCode = () => {
switch (charCode) {
case CHARCODE_CLOSE_BRACKET:
state = STATE_CDATA_CLOSE;
break;
case CHARCODE_CR:
cdata += input.slice(stringStartIndex, index);
state = STATE_CDATA_CARRIAGE_RETURN;
break;
default:
assertCharCodeRange(CHARACTER_RANGES);
// Fallthrough
case CHARCODE_HTAB:
case CHARCODE_LF:
state = STATE_CDATA_DATA;
}
};
const onContentCharCode = () => {
switch (charCode) {
case CHARCODE_LESS_THAN:
onContentEnd();
state = STATE_UNKNOWN_TAG_OPEN;
break;
case CHARCODE_CR:
content += input.slice(stringStartIndex, index);
state = STATE_CONTENT_CARRIAGE_RETURN;
break;
case CHARCODE_SPACE: case CHARCODE_HTAB: case CHARCODE_LF:
break;
case CHARCODE_CLOSE_BRACKET:
state = STATE_CONTENT_CDATA_SELECTION_CLOSE_1;
hasContent = true;
break;
case CHARCODE_AMP:
content += input.slice(stringStartIndex, index);
state = STATE_CONTENT_REFERENCE;
// Fallthrough
default:
hasContent = true;
}
};
/**
* @param {Symbol} nodeType
* @return {void}
*/
const onTagOpen = (nodeType) => {
// console.log('onTagOpen', nodeType.description);
switch (charCode) {
case CHARCODE_SLASH:
state = STATE_END_TAG_OPEN;
break;
default:
state = STATE_CHILD_NODE;
childNode = parseXMLNode(input, { index, charCode, nodeType });
if (nodeType === NODE_TYPE_XML_DECL && childNode[0] !== '?xml') {
throw new Error('Unknown declaration type');
}
// @ts-ignore Hidden Symbol
index = childNode[END_POSITION_SYMBOL];
stringStartIndex = index + 1;
resetContent();
// @ts-ignore Hidden Symbol
delete childNode[END_POSITION_SYMBOL];
children.push(childNode);
state = options.nodeType === NODE_TYPE_CHILD ? STATE_CONTENT : STATE_MISC_WHITESPACE;
}
};
// const logState = () => {
// console.log(
// index,
// String.fromCharCode(charCode),
// previousState.description,
// '=>',
// state.description,
// String.fromCharCode(charCode),
// );
// };
const getUnexpectedCharacterError = () => new Error(`Invalid character ${(input[index])} at ${index}.`);
const buildReturnValue = () => {
/** @type {TupleTreeEntry<string>} */
let tuple;
if (stringReturnValue != null) {
tuple = [tagName, stringReturnValue];
} else {
/** @type {TupleTree<string>} */
const entries = [];
if (attributes.length) {
entries.push([ATTRIBUTE_NODE_KEY, attributes]);
}
if (children.length) {
entries.push(...children);
} else if (!selfClosing) {
entries.push([CONTENT_NODE_KEY, '']);
}
tuple = [tagName, entries];
}
Object.defineProperty(tuple, END_POSITION_SYMBOL, {
enumerable: false, configurable: true, value: index, writable: false,
});
return tuple;
};
while (Number.isNaN(charCode) === false) {
switch (state) {
case STATE_BEGIN:
switch (charCode) {
case CHARCODE_LESS_THAN:
state = STATE_PROLOG_OR_ROOT_OPEN;
break;
case CHARCODE_SPACE: case CHARCODE_CR: case CHARCODE_HTAB: case CHARCODE_LF:
state = STATE_MISC_WHITESPACE;
break;
default:
throw getUnexpectedCharacterError();
}
break;
case STATE_PROLOG_OR_ROOT_OPEN:
switch (charCode) {
case CHARCODE_QUESTION:
onTagOpen(NODE_TYPE_XML_DECL);
break;
case CHARCODE_BANG:
onTagOpen(NODE_TYPE_NOTATION_DECL);
break;
default:
onTagOpen(NODE_TYPE_CHILD);
}
break;
case STATE_MISC_WHITESPACE:
switch (charCode) {
case CHARCODE_SPACE: case CHARCODE_CR: case CHARCODE_HTAB: case CHARCODE_LF:
state = STATE_MISC_WHITESPACE;
break;
case CHARCODE_LESS_THAN:
state = STATE_DOCTYPE_OR_MISC_OR_ROOT_OPEN;
break;
default:
throw getUnexpectedCharacterError();
}
break;
case STATE_XML_DECL_OPEN:
case STATE_NOTATION_OPEN:
state = STATE_START_TAG_OPEN;
break;
case STATE_XML_DECL_CLOSER:
if (charCode !== CHARCODE_GREATER_THAN) {
throw getUnexpectedCharacterError();
}
selfClosing = true;
state = STATE_END_TAG_CLOSE;
break;
case STATE_COMMENT_OPEN:
if (charCode !== CHARCODE_HYPHEN) {
throw getUnexpectedCharacterError();
}
comment = '';
stringStartIndex = index + 1;
state = STATE_COMMENT;
break;
case STATE_COMMENT:
onCommentCharCode();
break;
case STATE_COMMENT_CARRIAGE_RETURN:
switch (charCode) {
default:
content += '\n';
// Fallthrough
case CHARCODE_LF:
stringStartIndex = index;
onCommentCharCode();
break;
}
break;
case STATE_COMMENT_HYPHEN:
if (charCode === CHARCODE_HYPHEN) {
comment += input.slice(stringStartIndex, index - 1);
state = STATE_COMMENT_CLOSE;
break;
}
onCommentCharCode();
break;
case STATE_COMMENT_CLOSE:
if (charCode !== CHARCODE_GREATER_THAN) {
throw getUnexpectedCharacterError();
}
tagName = COMMENT_NODE_KEY;
stringReturnValue = comment;
state = STATE_END_TAG_CLOSE;
break;
case STATE_START_TAG_OPEN:
switch (charCode) {
case CHARCODE_OPEN_BRACKET:
if (options.nodeType !== NODE_TYPE_NOTATION_DECL) {
throw new Error(`Invalid character ${(input[index])} at ${index}.`);
}
state = STATE_CDATA_OPEN;
break;
case CHARCODE_HYPHEN:
if (options.nodeType !== NODE_TYPE_NOTATION_DECL) {
throw new Error(`Invalid character ${(input[index])} at ${index}.`);
}
state = STATE_COMMENT_OPEN;
break;
default:
assertCharCodeRange(NAME_START_RANGES);
// Fallthrough
case CHARCODE_COLON: case CHARCODE_UNDERSCORE:
state = STATE_START_TAG_NAME;
}
break;
case STATE_START_TAG_NAME:
switch (charCode) {
default:
assertCharCodeRange(NAME_RANGES);
// Fallthrough
case CHARCODE_COLON: case CHARCODE_UNDERSCORE:
case CHARCODE_HYPHEN: case CHARCODE_PERIOD: case CHARCODE_MIDDLE_DOT:
break;
case CHARCODE_SPACE: case CHARCODE_CR: case CHARCODE_HTAB: case CHARCODE_LF:
tagName = input.slice(stringStartIndex, index);
state = STATE_START_TAG_WHITESPACE;
break;
case CHARCODE_SLASH:
if (options.nodeType === NODE_TYPE_CHILD) {
tagName = input.slice(stringStartIndex, index);
state = STATE_SELF_CLOSING_TAG_CLOSER;
} else {
throw getUnexpectedCharacterError();
}
break;
case CHARCODE_GREATER_THAN:
tagName = input.slice(stringStartIndex, index);
switch (options.nodeType) {
case NODE_TYPE_CHILD:
stringStartIndex = index + 1;
resetContent();
state = STATE_CONTENT;
break;
case NODE_TYPE_NOTATION_DECL:
state = STATE_END_TAG_CLOSE;
break;
default:
throw getUnexpectedCharacterError();
}
break;
}
break;
case STATE_ATTRIBUTE_VALUE_CLOSE:
switch (charCode) {
case CHARCODE_QUESTION:
if (options.nodeType === NODE_TYPE_XML_DECL) {
state = STATE_XML_DECL_CLOSER;
break;
}
// Fallthrough
default:
throw getUnexpectedCharacterError();
case CHARCODE_SPACE: case CHARCODE_CR: case CHARCODE_HTAB: case CHARCODE_LF:
state = STATE_START_TAG_WHITESPACE;
break;
case CHARCODE_SLASH:
if (options.nodeType === NODE_TYPE_CHILD) {
state = STATE_SELF_CLOSING_TAG_CLOSER;
} else {
throw getUnexpectedCharacterError();
}
break;
case CHARCODE_GREATER_THAN:
switch (options.nodeType) {
case NODE_TYPE_CHILD:
stringStartIndex = index + 1;
resetContent();
state = STATE_CONTENT;
break;
case NODE_TYPE_NOTATION_DECL:
state = STATE_END_TAG_CLOSE;
break;
default:
throw getUnexpectedCharacterError();
}
break;
}
break;
case STATE_START_TAG_WHITESPACE:
switch (charCode) {
case CHARCODE_SPACE: case CHARCODE_CR: case CHARCODE_HTAB: case CHARCODE_LF:
state = STATE_START_TAG_WHITESPACE;
break;
case CHARCODE_QUESTION:
if (options.nodeType === NODE_TYPE_XML_DECL) {
state = STATE_XML_DECL_CLOSER;
} else {
throw getUnexpectedCharacterError();
}
break;
case CHARCODE_SLASH:
if (options.nodeType === NODE_TYPE_CHILD) {
state = STATE_SELF_CLOSING_TAG_CLOSER;
} else {
throw getUnexpectedCharacterError();
}
break;
case CHARCODE_GREATER_THAN:
switch (options.nodeType) {
case NODE_TYPE_CHILD:
stringStartIndex = index + 1;
resetContent();
state = STATE_CONTENT;
break;
case NODE_TYPE_NOTATION_DECL:
state = STATE_END_TAG_CLOSE;
break;
default:
throw getUnexpectedCharacterError();
}
break;
default:
assertCharCodeRange(NAME_START_RANGES);
// Fallthrough
case CHARCODE_COLON: case CHARCODE_UNDERSCORE:
stringStartIndex = index;
state = STATE_ATTRIBUTE_NAME;
}
break;
case STATE_ATTRIBUTE_NAME:
switch (charCode) {
default:
assertCharCodeRange(NAME_RANGES);
// Fallthrough
case CHARCODE_COLON: case CHARCODE_UNDERSCORE:
case CHARCODE_HYPHEN: case CHARCODE_PERIOD: case CHARCODE_MIDDLE_DOT:
break;
case CHARCODE_EQUALS:
attrName = input.slice(stringStartIndex, index);
if (options.enforceUniqueAttributes !== false && attributeNames.has(attrName)) {
throw new Error(`Attribute name (${attrName}) must be unique at ${index}.`);
}
attributeNames.add(attrName);
state = STATE_ATTRIBUTE_EQUAL;
break;
}
break;
case STATE_ATTRIBUTE_EQUAL:
switch (charCode) {
case CHARCODE_SINGLE_QUOTE:
case CHARCODE_DOUBLE_QUOTE:
attrValueDelimiter = charCode;
state = STATE_ATTRIBUTE_VALUE;
stringStartIndex = index + 1;
attrValue = '';
break;
default:
throw getUnexpectedCharacterError();
}
break;
case STATE_ATTRIBUTE_VALUE:
switch (charCode) {
case attrValueDelimiter:
attrValue += input.slice(stringStartIndex, index);
attributes.push([attrName, attrValue]);
if (attrName === 'xml:space') xmlSpace = attrValue;
state = STATE_ATTRIBUTE_VALUE_CLOSE;
break;
case CHARCODE_LESS_THAN:
throw getUnexpectedCharacterError();
case CHARCODE_AMP:
attrValue += input.slice(stringStartIndex, index);
state = STATE_ATTRIBUTE_REFERENCE;
break;
default:
}
break;
case STATE_ATTRIBUTE_REFERENCE:
switch (charCode) {
case CHARCODE_HASH:
state = STATE_ATTRIBUTE_CHAR_REFERENCE;
break;
default:
assertCharCodeRange(NAME_START_RANGES);
// Fallthrough
case CHARCODE_COLON: case CHARCODE_UNDERSCORE:
stringStartIndex = index;
state = STATE_ATTRIBUTE_ENTITY_REFERENCE;
}
break;
case STATE_ATTRIBUTE_ENTITY_REFERENCE:
switch (charCode) {
case CHARCODE_SEMICOLON:
entity = parseEntityReference(input.slice(stringStartIndex, index), declaredEntities);
attrValue += entity;
state = STATE_ATTRIBUTE_VALUE;
stringStartIndex = index + 1;
break;
default:
assertCharCodeRange(NAME_RANGES);
// Fallthrough
case CHARCODE_COLON: case CHARCODE_UNDERSCORE:
case CHARCODE_HYPHEN: case CHARCODE_PERIOD: case CHARCODE_MIDDLE_DOT:
}
break;
case STATE_ATTRIBUTE_CHAR_REFERENCE:
switch (charCode) {
case CHARCODE_LOWERCASE_X:
stringStartIndex = index + 1;
state = STATE_ATTRIBUTE_CHAR_REFERENCE_HEX;
break;
default:
assertCharCodeRange(CHAR_REFERENCE_DEC_RANGES);
stringStartIndex = index;
state = STATE_ATTRIBUTE_CHAR_REFERENCE_DEC;
}
break;
case STATE_ATTRIBUTE_CHAR_REFERENCE_DEC:
switch (charCode) {
case CHARCODE_SEMICOLON:
reference = Number.parseInt(input.slice(stringStartIndex, index), 10);
attrValue += parseCharReference(reference);
stringStartIndex = index + 1;
state = STATE_ATTRIBUTE_VALUE;
break;
default:
assertCharCodeRange(CHAR_REFERENCE_DEC_RANGES);
}
break;
case STATE_ATTRIBUTE_CHAR_REFERENCE_HEX:
switch (charCode) {
case CHARCODE_SEMICOLON:
reference = Number.parseInt(input.slice(stringStartIndex, index), 16);
attrValue += parseCharReference(reference);
stringStartIndex = index + 1;
state = STATE_ATTRIBUTE_VALUE;
break;
default:
assertCharCodeRange(CHAR_REFERENCE_HEX_RANGES);
}
break;
case STATE_SELF_CLOSING_TAG_CLOSER:
if (charCode !== CHARCODE_GREATER_THAN) {
throw getUnexpectedCharacterError();
}
selfClosing = true;
state = STATE_END_TAG_CLOSE;
break;
case STATE_CONTENT_CARRIAGE_RETURN:
switch (charCode) {
default:
content += '\n';
// Fallthrough
case CHARCODE_LF:
stringStartIndex = index;
onContentCharCode();
break;
}
break;
case STATE_CONTENT_CDATA_SELECTION_CLOSE_2:
if (charCode === CHARCODE_GREATER_THAN) throw getUnexpectedCharacterError();
// Fallthrough
case STATE_CONTENT_CDATA_SELECTION_CLOSE_1:
if (charCode === CHARCODE_CLOSE_BRACKET) {
state = STATE_CONTENT_CDATA_SELECTION_CLOSE_2;
hasContent = true;
break;
}
// Fallthrough
case STATE_CONTENT:
onContentCharCode();
break;
case STATE_CONTENT_REFERENCE:
switch (charCode) {
case CHARCODE_HASH:
state = STATE_CONTENT_CHAR_REFERENCE;
break;
default:
assertCharCodeRange(NAME_START_RANGES);
// Fallthrough
case CHARCODE_COLON: case CHARCODE_UNDERSCORE:
stringStartIndex = index;
state = STATE_CONTENT_ENTITY_REFERENCE;
}
break;
case STATE_CONTENT_ENTITY_REFERENCE:
switch (charCode) {
case CHARCODE_SEMICOLON:
entity = parseEntityReference(input.slice(stringStartIndex, index), declaredEntities);
content += entity;
hasContent = true;
stringStartIndex = index + 1;
state = STATE_CONTENT;
break;
default:
assertCharCodeRange(NAME_RANGES);
// Fallthrough
case CHARCODE_COLON: case CHARCODE_UNDERSCORE:
case CHARCODE_HYPHEN: case CHARCODE_PERIOD: case CHARCODE_MIDDLE_DOT:
}
break;
case STATE_CONTENT_CHAR_REFERENCE:
switch (charCode) {
case CHARCODE_LOWERCASE_X:
stringStartIndex = index + 1;
state = STATE_CONTENT_CHAR_REFERENCE_HEX;
break;
default:
assertCharCodeRange(CHAR_REFERENCE_DEC_RANGES);
stringStartIndex = index;
state = STATE_CONTENT_CHAR_REFERENCE_DEC;
}
break;
case STATE_CONTENT_CHAR_REFERENCE_DEC:
switch (charCode) {
case CHARCODE_SEMICOLON:
reference = Number.parseInt(input.slice(stringStartIndex, index), 10);
content += parseCharReference(reference);
hasContent = true;
stringStartIndex = index + 1;
state = STATE_CONTENT;
break;
default:
assertCharCodeRange(CHAR_REFERENCE_DEC_RANGES);
}
break;
case STATE_CONTENT_CHAR_REFERENCE_HEX:
switch (charCode) {
case CHARCODE_SEMICOLON:
reference = Number.parseInt(input.slice(stringStartIndex, index), 16);
content += parseCharReference(reference);
hasContent = true;
stringStartIndex = index + 1;
state = STATE_CONTENT;
break;
default:
assertCharCodeRange(CHAR_REFERENCE_HEX_RANGES);
}
break;
case STATE_DOCTYPE_OR_MISC_OR_ROOT_OPEN:
case STATE_UNKNOWN_TAG_OPEN:
switch (charCode) {
case CHARCODE_SLASH:
state = STATE_END_TAG_OPEN;
break;
case CHARCODE_BANG:
onTagOpen(NODE_TYPE_NOTATION_DECL);
break;
default:
onTagOpen(NODE_TYPE_CHILD);
}
break;
case STATE_END_TAG_OPEN:
switch (charCode) {
default:
assertCharCodeRange(NAME_START_RANGES);
// Fallthrough
case CHARCODE_COLON: case CHARCODE_UNDERSCORE:
stringStartIndex = index;
state = STATE_END_TAG_NAME;
}
break;
case STATE_END_TAG_NAME:
switch (charCode) {
default:
assertCharCodeRange(NAME_RANGES);
// Fallthrough
case CHARCODE_COLON: case CHARCODE_UNDERSCORE:
case CHARCODE_HYPHEN: case CHARCODE_PERIOD: case CHARCODE_MIDDLE_DOT:
break;
case CHARCODE_SPACE: case CHARCODE_CR: case CHARCODE_HTAB: case CHARCODE_LF:
if (input.slice(stringStartIndex, index) !== tagName) {
throw new Error(`Element was not closed (${tagName}) at ${index}.`);
}
state = STATE_END_TAG_WHITESPACE;
break;
case CHARCODE_GREATER_THAN:
if (input.slice(stringStartIndex, index) !== tagName) {
throw new Error(`Element was not closed (${tagName}) at ${index}.`);
}
state = STATE_END_TAG_CLOSE;
}
break;
case STATE_END_TAG_WHITESPACE:
switch (charCode) {
case CHARCODE_SPACE: case CHARCODE_CR: case CHARCODE_HTAB: case CHARCODE_LF:
break;
case CHARCODE_GREATER_THAN:
state = STATE_END_TAG_CLOSE;
break;
default:
throw getUnexpectedCharacterError();
}
break;
case STATE_CDATA_OPEN:
if (charCode !== CHARCODE_C) throw getUnexpectedCharacterError();
state = STATE_CDATA_C;
break;
case STATE_CDATA_C:
if (charCode !== CHARCODE_D) throw getUnexpectedCharacterError();
state = STATE_CDATA_CD;
break;
case STATE_CDATA_CD:
if (charCode !== CHARCODE_A) throw getUnexpectedCharacterError();
state = STATE_CDATA_CDA;
break;
case STATE_CDATA_CDA:
if (charCode !== CHARCODE_T) throw getUnexpectedCharacterError();
state = STATE_CDATA_CDAT;
break;
case STATE_CDATA_CDAT:
if (charCode !== CHARCODE_A) throw getUnexpectedCharacterError();
state = STATE_CDATA_DATA_START;
break;
case STATE_CDATA_DATA_START:
if (charCode !== CHARCODE_OPEN_BRACKET) throw getUnexpectedCharacterError();
cdata = '';
stringStartIndex = index + 1;
state = STATE_CDATA_DATA;
break;
case STATE_CDATA_DATA:
onCDataCharCode();
break;
case STATE_CDATA_CARRIAGE_RETURN:
switch (charCode) {
default:
cdata += '\n';
// Fallthrough
case CHARCODE_LF:
stringStartIndex = index;
onCDataCharCode();
break;
}
break;
case STATE_CDATA_DATA_END:
if (charCode === CHARCODE_CLOSE_BRACKET) {
state = STATE_CDATA_CLOSE;
} else {
onCDataCharCode();
}
break;
case STATE_CDATA_CLOSE:
if (charCode === CHARCODE_GREATER_THAN) {
tagName = CDATA_NODE_KEY;
cdata += input.slice(stringStartIndex, index - 2);
stringReturnValue = cdata;
state = STATE_END_TAG_CLOSE;
} else {
onCDataCharCode();
}
break;
default:
}
// logState();
// previousState = state;
if (state === STATE_END_TAG_CLOSE) {
// console.log('close', input.slice(options.index, index).replace(/\n/g, '\\n').slice(0, 60));
return buildReturnValue();
}
index += 1;
charCode = input.charCodeAt(index);
}
switch (state) {
case STATE_CONTENT:
if (options.nodeType === NODE_TYPE_CHILD) break;
// Fallthrough
case STATE_END_TAG_CLOSE:
case STATE_MISC_WHITESPACE:
return buildReturnValue();
default:
}
throw new Error('EOF');
}
/**
* @typedef {Object} ParseXMLFlattenOptions
* @prop {boolean} [flattenContent=true]
* @prop {boolean} [flattenArrays=true]
* @prop {boolean} [mergeContentNodes=true]
* @prop {boolean} [skipAttributes=false]
* @prop {boolean} [removeNamespaces=false]
*/
/**
 * Collapses a parsed tuple tree into a plain object keyed by node name.
 *
 * - The attribute entry (`ATTRIBUTE_NODE_KEY`) is converted from `[name, value]`
 *   pairs into an object, unless `options.skipAttributes` is set.
 * - CDATA entries are stored under `CONTENT_NODE_KEY`, together with text content.
 * - With `options.removeNamespaces`, everything up to and including the first
 *   `:` is stripped from element and attribute names.
 * - Repeated names are collected into an array. With `flattenArrays` disabled,
 *   every value is wrapped in an array, even single ones.
 * - A child that flattens to nothing is stored as `null` (or `[]` when
 *   `flattenArrays` is disabled), but only while no non-empty sibling of the
 *   same name exists; otherwise it is omitted.
 * - Multiple content values are joined into one string unless
 *   `mergeContentNodes` is disabled.
 * - When content is the only enumerable key, the content itself is returned
 *   instead of an object, unless `flattenContent` is disabled.
 *
 * Three non-enumerable lookup maps are attached to the result under
 * `AS_ARRAY_KEY` (every non-empty value per name, always as an array),
 * `AS_OBJECT_KEY` (first non-empty value per name, strings wrapped as `{ $ }`)
 * and `AS_STRING_KEY` (first non-empty value per name, when it is a string).
 *
 * All boolean options that default to `true` are only disabled by an explicit
 * `false`.
 * @param {TupleTree<string>} parsedXML
 * @param {ParseXMLFlattenOptions} [options]
 * @return {XMLObject<unknown>} `undefined` when `parsedXML` has no entries.
 */
export function flattenParsedXML(parsedXML, options = {}) {
  if (!parsedXML.length) return undefined;

  // Same "default true" test for every use; a plain truthiness check would
  // treat an omitted option as disabled.
  const flattenArrays = options.flattenArrays !== false;

  /**
   * Own-property test. The `in` operator also matches inherited members, so
   * element names such as `constructor` or `toString` would look already seen.
   * @param {Object} target
   * @param {PropertyKey} property
   * @return {boolean}
   */
  const hasOwn = (target, property) => Object.prototype.hasOwnProperty.call(target, property);

  /**
   * @param {string} name
   * @return {string}
   */
  const stripNamespace = (name) => name.replace(/^[^:]*:/, '');

  /** @type {any} */
  const result = {};
  Object.defineProperty(result, AS_ARRAY_KEY, {
    enumerable: false, configurable: true, value: {}, writable: false,
  });
  Object.defineProperty(result, AS_OBJECT_KEY, {
    enumerable: false, configurable: true, value: {}, writable: false,
  });
  Object.defineProperty(result, AS_STRING_KEY, {
    enumerable: false, configurable: true, value: {}, writable: false,
  });
  const valuesAsArray = result[AS_ARRAY_KEY];
  const valuesAsObject = result[AS_OBJECT_KEY];
  const valuesAsString = result[AS_STRING_KEY];

  for (const [key, value] of parsedXML) {
    if (key === ATTRIBUTE_NODE_KEY) {
      if (!options.skipAttributes) {
        const attributeEntries = /** @type {[string,string][]} */ (value);
        result[key] = Object.fromEntries(options.removeNamespaces
          ? attributeEntries.map(([attrKey, attrValue]) => [stripNamespace(attrKey), attrValue])
          : attributeEntries);
      }
      continue;
    }

    let outKey;
    if (key === CDATA_NODE_KEY) {
      outKey = CONTENT_NODE_KEY;
    } else if (options.removeNamespaces) {
      outKey = stripNamespace(key);
    } else {
      outKey = key;
    }

    let flattenedValue = value;
    if (typeof value !== 'string') {
      flattenedValue = flattenParsedXML(value, options);
      if (flattenedValue === undefined) {
        // Empty child: leave a placeholder only if nothing is stored yet.
        if (!hasOwn(result, outKey)) {
          result[outKey] = flattenArrays ? null : [];
        }
        continue;
      }
    }

    if (!hasOwn(valuesAsArray, outKey)) {
      // First non-empty value for this name. Anything already in
      // `result[outKey]` is an empty-child placeholder and is replaced.
      valuesAsArray[outKey] = [flattenedValue];
      if (typeof flattenedValue === 'string') {
        valuesAsObject[outKey] = { $: flattenedValue };
        valuesAsString[outKey] = flattenedValue;
      } else {
        valuesAsObject[outKey] = flattenedValue;
      }
      result[outKey] = flattenArrays ? flattenedValue : [flattenedValue];
      continue;
    }

    valuesAsArray[outKey].push(flattenedValue);
    if (flattenArrays && !Array.isArray(result[outKey])) {
      // Second occurrence: promote the single value to an array.
      result[outKey] = [result[outKey], flattenedValue];
    } else {
      result[outKey].push(flattenedValue);
    }
  }

  if (options.mergeContentNodes !== false && Array.isArray(result[CONTENT_NODE_KEY])) {
    result[CONTENT_NODE_KEY] = result[CONTENT_NODE_KEY].join('');
  }
  if (options.flattenContent !== false
    && hasOwn(result, CONTENT_NODE_KEY)
    && Object.keys(result).length === 1) {
    return result[CONTENT_NODE_KEY];
  }
  return result;
}
/**
 * Parses an XML string and returns only the value half of the tuple produced
 * by `parseXMLNode`, discarding the first element.
 * @param {string} input
 * @return {TupleTree<string>}
 */
export function parseXMLAsEntries(input) {
  const [, entries] = parseXMLNode(input);
  const tree = /** @type {TupleTree<string>} */ (entries);
  return tree;
}
/**
 * Produces the flattened object form of an XML document. A string input is
 * parsed with `parseXMLAsEntries` first; any other input is treated as an
 * already-parsed tuple tree and passed to `flattenParsedXML` unchanged.
 * @param {string|TupleTree<string>} input
 * @param {ParseXMLFlattenOptions} [flattenOptions]
 * @return {XMLObject<unknown>}
 */
export function parseXMLAsObject(input, flattenOptions) {
  if (typeof input === 'string') {
    return flattenParsedXML(parseXMLAsEntries(input), flattenOptions);
  }
  return flattenParsedXML(input, flattenOptions);
}
/**
 * Parses an XML string. When a truthy `flattenOptions` is supplied the parsed
 * entries are passed through `flattenParsedXML`; otherwise the raw tuple tree
 * is returned.
 * @param {string} input
 * @param {ParseXMLFlattenOptions} [flattenOptions]
 * @return {TupleTree<string>|XMLObject<unknown>}
 */
export function parseXML(input, flattenOptions) {
  const tree = parseXMLAsEntries(input);
  return flattenOptions ? flattenParsedXML(tree, flattenOptions) : tree;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment