Skip to content

Instantly share code, notes, and snippets.

@ChecksumFailed
Last active May 3, 2024 16:38
Show Gist options
  • Save ChecksumFailed/876e47d653b3850bc99811b180f6ed06 to your computer and use it in GitHub Desktop.
Save ChecksumFailed/876e47d653b3850bc99811b180f6ed06 to your computer and use it in GitHub Desktop.

HTMLTableParser:

The cf_HTMLTableParser ServiceNow Script Include is a JavaScript utility that parses HTML tables into arrays of objects, so that table data can be manipulated and processed programmatically. A typical use case is parsing HTML tables from inbound emails.

parseHTMLTables

Parses HTML tables into an array of objects.

Parameters:

Name Type Required Description
strHtml String Yes The HTML string containing the tables to be parsed.
strIdentifier String No An optional string (regular-expression syntax is supported) used to select which tables to parse; it is matched against each table's header row.
logLevel String No The log level for the logger (error, warn, info, debug).

Returns:

Type Description
Array An array of objects representing the parsed tables.
graph TD
    TablesArray[Tables Array] --> Table1Array[Table 1 Array]
    TablesArray --> Table2Array[Table 2 Array]
    Table1Array --> Object1[Object 1]
    Table1Array --> Object2[Object 2]
    Table1Array --> Object3[Object 3]
    Table2Array --> Object4[Object 1]
    Table2Array --> Object5[Object 2]
Loading

Example:

// Parse all tables in the HTML string
var html = '<table>...</table><table>...</table>';
var parsedTables = cf_HTMLTableParser.parseHTMLTables(html);

// Parse tables containing the string "Example" and set the log level to debug
var parsedTables = cf_HTMLTableParser.parseHTMLTables(html, 'Example',"debug");

setLogLevel

Sets the log level for the logger.

Parameters:

Name Type Required Description
strLevel String Yes The log level (error, warn, info, debug).

Returns:

Type Description
cf_HTMLTableParser The cf_HTMLTableParser object for chaining.

Example:

// Set the log level to "debug"
var html = '<table>...</table><table>...</table>';
var parsedTables = cf_HTMLTableParser.setLogLevel('debug').parseHTMLTables(html);

_processTable (Private)

Processes a single table and returns an array of objects representing the table rows.

Parameters:

Name Type Required Description
table String Yes The HTML string representing the table.
identRegExp RegExp Yes A regular expression to identify specific tables.

Returns:

Type Description
Array An array of objects representing the table rows.

Example:

// Process a table and get the parsed rows
var table = '<table><tr><th>Header 1</th><th>Header 2</th></tr><tr><td>Value 1</td><td>Value 2</td></tr></table>';
var identRegExp = /.*/;
var parsedRows = _processTable(table, identRegExp);

_getHeaders (Private)

Extracts the headers from the header row of a table.

Parameters:

Name Type Required Description
headerRow String Yes The HTML string representing the header row.

Returns:

Type Description
Array An array of header strings.

Example:

// Extract headers from a header row
var headerRow = '<tr><th>Header 1</th><th>Header 2</th></tr>';
var headers = _getHeaders(headerRow);

_processRow (Private)

Processes a single row of a table and returns an object representing the row data.

Parameters:

Name Type Required Description
arrRowData Array Yes An array of HTML strings representing the table cells.
arrHeaders Array Yes An array of header strings.

Returns:

Type Description
Object An object representing the row data, with keys corresponding to the headers.

Example:

// Process a row and get the row data
var arrRowData = ['<td>Value 1</td>', '<td>Value 2</td>'];
var arrHeaders = ['Header 1', 'Header 2'];
var rowData = _processRow(arrRowData, arrHeaders);

_htmlToStr (Private)

Converts an HTML string to plain text by removing HTML tags and decoding HTML entities.

Parameters:

Name Type Required Description
html String Yes The HTML string to convert.

Returns:

Type Description
String The plain text string.

Example:

// Convert an HTML string to plain text
var htmlString = '<p>This is <b>bold</b> text.</p>';
var plainText = _htmlToStr(htmlString);
/**
* @namespace cf_HTMLTableParser
* @description A module for parsing HTML tables into an array of objects.
*/
var cf_HTMLTableParser = (function () {
    var logger = new GSLog('', "cf_HTMLTableParser");

    // Log levels accepted by setLogLevel / parseHTMLTables, keyed by their lower-case name.
    var VALID_LOG_LEVELS = {
        "error": "error",
        "warn": "warn",
        "info": "info",
        "debug": "debug"
    };

    // Named HTML entities that _htmlToStr translates back to literal characters.
    var ENTITY_MAP = {
        'nbsp': String.fromCharCode(160),
        'amp': '&',
        'quot': '"',
        'lt': '<',
        'gt': '>'
    };

    /**
     * Applies a log level to the module logger when the level is recognised.
     * @param {string} strLevel - The log level (error, warn, info, debug), case-insensitive.
     * @returns {boolean} True when the level was valid and applied, false otherwise.
     * @private
     */
    function _applyLogLevel(strLevel) {
        if (strLevel === null || strLevel === undefined) {
            return false;
        }
        // String() also copes with non-primitive string-like values handed over by the platform.
        var level = String(strLevel).toLowerCase();
        if (!Object.prototype.hasOwnProperty.call(VALID_LOG_LEVELS, level)) {
            return false;
        }
        logger.setLevel(VALID_LOG_LEVELS[level]);
        return true;
    }

    /**
     * Parses HTML tables into an array of objects.
     *
     * Example structure for an HTML document with two tables:
     *   Tables Array
     *   |__Table1 Array
     *      |__Object1
     *      |__Object2
     *      |__Object3
     *   |__Table2 Array
     *      |__Object1
     *      |__Object2
     *
     * Tables are located with regular expressions, so nested tables are not supported.
     *
     * @param {string} strHtml - The HTML string containing the tables.
     * @param {string} [strIdentifier] - An optional string to identify specific tables to parse.
     *   Regex is also supported. It is matched against the header row of each table.
     * @param {string} [logLevel] - The log level for the logger (error, warn, info, debug).
     *   When valid it is applied to the shared module logger and stays in effect afterwards.
     * @returns {Array} An array with one entry (an array of row objects) per parsed table.
     *   Empty when strHtml is null/undefined or contains no tables.
     * @throws {SyntaxError} When strIdentifier is not a valid regular expression.
     * @example
     * // Parse all tables in the HTML string
     * var html = '<table>...</table><table>...</table>';
     * var parsedTables = cf_HTMLTableParser.parseHTMLTables(html);
     *
     * // Parse tables containing the string "Example" and debug level logging
     * var parsedTables = cf_HTMLTableParser.parseHTMLTables(html, 'Example', "debug");
     */
    function parseHTMLTables(strHtml, strIdentifier, logLevel) {
        var results = [];
        if (logLevel) {
            _applyLogLevel(logLevel);
        }
        if (strHtml === null || strHtml === undefined) {
            return results;
        }
        // No "g" flag: a global regex keeps lastIndex between test() calls and would
        // skip matching tables when reused for every table in the document.
        var identRegExp = strIdentifier ? new RegExp(strIdentifier) : null;
        var tableMatches = String(strHtml).match(/<table\b[\s\S]*?<\/table>/gi); // Match all tables
        if (!tableMatches) {
            logger.logDebug("No Tables Found");
            return results;
        }
        tableMatches.forEach(function (table) {
            var tableObj = _processTable(table, identRegExp);
            if (tableObj) {
                results.push(tableObj);
            }
        });
        return results;
    }

    /**
     * Sets the log level for the logger. Unknown levels are ignored.
     * @param {string} strLevel - The log level (error, warn, info, debug).
     * @returns {this} This object for chaining (also returned when the level is invalid).
     * @example
     * // Set the log level to "debug" and parseHTML
     * var html = '<table>...</table><table>...</table>';
     * var parsedTables = cf_HTMLTableParser.setLogLevel('debug').parseHTMLTables(html);
     */
    function setLogLevel(strLevel) {
        _applyLogLevel(strLevel);
        return this;
    }

    /**
     * Processes a single table and returns an array of objects representing the table rows.
     * The first row is assumed to be the header row.
     * @param {string} table - The HTML string representing the table.
     * @param {RegExp} [identRegExp] - A regular expression the header row must match;
     *   when omitted every table is processed.
     * @returns {Array|undefined} An array of objects representing the table rows, or undefined
     *   when the table has no rows, no headers, or does not match identRegExp.
     * @private
     * @example
     * // Process a table and get the parsed rows
     * var table = '<table><tr><th>Header 1</th><th>Header 2</th></tr><tr><td>Value 1</td><td>Value 2</td></tr></table>';
     * var identRegExp = new RegExp('Header');
     * var parsedRows = _processTable(table, identRegExp);
     */
    function _processTable(table, identRegExp) {
        logger.logDebug("Table: " + table);
        var arrRows = table.match(/<tr\b[\s\S]*?<\/tr>/gi); // Match all rows within table (tr elements)
        if (!arrRows) { // Fail early if no rows found
            logger.logDebug("No Rows Found");
            return;
        }
        var arrHeaders = _getHeaders(arrRows[0]); // Build array of headers. Assume row 0 is header row.
        if (!arrHeaders || arrHeaders.length === 0) { // Return if no headers found
            gs.error("No Headers Found");
            return;
        }
        logger.logDebug("Headers:" + arrHeaders);
        if (identRegExp) {
            identRegExp.lastIndex = 0; // Defensive: a caller-supplied global/sticky regex is stateful
            if (!identRegExp.test(arrRows[0])) { // Do not process table if identifier not found
                logger.logDebug("Ident not found"); // Expected outcome of filtering, not an error
                return;
            }
        }
        // Process all rows and add to table array
        var arrTable = [];
        for (var i = 1; i < arrRows.length; i++) { // Start from 1 to skip header row
            var arrRowData = arrRows[i].match(/<td\b[\s\S]*?<\/td>/gi); // Match all td elements
            if (!arrRowData) { // e.g. an empty row or a row made only of th cells
                logger.logDebug("No Cells Found in Row " + i);
                continue;
            }
            var objRow = _processRow(arrRowData, arrHeaders);
            if (objRow) {
                arrTable.push(objRow);
            }
        }
        return arrTable;
    }

    /**
     * Extracts the headers from the header row of a table.
     * Both th and td cells are accepted. Only tags are stripped from the cell
     * content; HTML entities are left untouched.
     * @param {string} headerRow - The HTML string representing the header row.
     * @returns {Array} An array of header strings (empty when the row has no cells).
     * @private
     * @example
     * // Extract headers from a header row
     * var headerRow = '<tr><th>Header 1</th><th>Header 2</th></tr>';
     * var headers = _getHeaders(headerRow);
     */
    function _getHeaders(headerRow) {
        var arrCells = headerRow.match(/<t[hd]\b[\s\S]*?<\/t[hd]>/gi);
        if (!arrCells) {
            return [];
        }
        return arrCells.map(function (header) {
            return header.replace(/<\/?[^>]+>/g, '').trim(); // Strip HTML to get clean header text
        });
    }

    /**
     * Processes a single row of a table and returns an object representing the row data.
     * Cells beyond the number of headers are ignored.
     * @param {Array} arrRowData - An array of HTML strings representing the table cells.
     * @param {Array} arrHeaders - An array of header strings.
     * @returns {Object|null} An object representing the row data, with keys corresponding to
     *   the headers, or null when no cell could be mapped to a header.
     * @private
     * @example
     * // Process a row and get the row data
     * var arrRowData = ['<td>Value 1</td>', '<td>Value 2</td>'];
     * var arrHeaders = ['Header 1', 'Header 2'];
     * var rowData = _processRow(arrRowData, arrHeaders);
     */
    function _processRow(arrRowData, arrHeaders) {
        if (!arrRowData || !arrHeaders) {
            return null;
        }
        var objRow = {};
        var hasData = false;
        arrRowData.forEach(function (strRowItem, idx) {
            var strText = _htmlToStr(strRowItem); // Strip HTML and get text
            logger.logDebug("Row Item: " + strText);
            if (idx < arrHeaders.length) {
                objRow[arrHeaders[idx]] = strText; // Map text to corresponding header as key
                hasData = true;
            }
        });
        logger.logDebug(JSON.stringify(objRow));
        return hasData ? objRow : null;
    }

    /**
     * Runs a URI decoder on text, returning the text unchanged when it is not decodable.
     * Cell text is free-form, so a literal "%" (as in "100%") must not abort parsing.
     * @param {string} text - The text to decode.
     * @param {Function} decoder - decodeURI or decodeURIComponent.
     * @returns {string} The decoded text, or the input when decoding fails.
     * @private
     */
    function _safeDecode(text, decoder) {
        try {
            return decoder(text);
        } catch (e) {
            return text; // Best effort: malformed escape sequences are kept verbatim
        }
    }

    /**
     * Converts an HTML string to plain text by removing HTML tags, decoding
     * percent-encoded sequences and translating a small set of named HTML entities
     * (nbsp, amp, quot, lt, gt).
     * @param {string} html - The HTML string to convert.
     * @returns {string} The plain text string.
     * @private
     * @example
     * // Convert an HTML string to plain text
     * var htmlString = '<p>This is <b>bold</b> text.</p>';
     * var plainText = _htmlToStr(htmlString);
     */
    function _htmlToStr(html) {
        var noHTML = html
            .replace(/<br(?:\s*)?\/?>/gi, "\n") // Line breaks become newlines
            .replace(/(<\/(?:tr|div|p)>)\s*(?=[^\n])/gi, "$1\n") // Newline after every closing block tag
            .replace(/(<\/(?:td)>)\s*\n/gi, "$1")
            .replace(/(<([^>]+)>)/ig, '') // Strip all remaining tags
            .replace(/(?:%[0-9a-f]{2})+/gi, function (match) {
                // Decode whole runs of escapes so multi-byte UTF-8 sequences stay together
                return _safeDecode(match, decodeURIComponent);
            })
            .replace(/&(nbsp|amp|quot|lt|gt);?/g, function (match, name) {
                // The optional ";" is consumed so "&amp;" becomes "&" rather than "&;"
                return ENTITY_MAP[name];
            });
        // Second decoding pass retained from the original implementation, now failure-tolerant.
        return _safeDecode(noHTML, decodeURI);
    }

    return {
        parseHTMLTables: parseHTMLTables,
        setLogLevel: setLogLevel
    };
})();
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment