ChecksumFailed/HTMLTableParser.js

## readme.md

      
    Raw
  

              readme.md
            
          
    HTMLTableParser:

The cf_HTMLTableParser ServiceNow Script Include is a JavaScript utility that parses HTML tables into arrays of objects, so that table data can be processed programmatically. A possible use case is parsing HTML tables from inbound emails.
parseHTMLTables

Parses HTML tables into an array of objects.
Parameters:


Name
Type
Required
Description


strHtml
String
Yes
The HTML string containing the tables to be parsed.


strIdentifier
String
No
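An optional string to identify specific tables to parse. It is compiled as a regular expression and tested against each table's header row; tables whose header row does not match are skipped.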


logLevel
String
No
The log level for the logger (error, warn, info, debug).


Returns:


Type
Description


Array
An array of objects representing the parsed tables.


      graph TD
    TablesArray[Tables Array] --> Table1Array[Table 1 Array]
    TablesArray --> Table2Array[Table 2 Array]
    Table1Array --> Object1[Object 1]
    Table1Array --> Object2[Object 2]
    Table1Array --> Object3[Object 3]
    Table2Array --> Object4[Object 1]
    Table2Array --> Object5[Object 2]

    
      Loading

  
Example:

// Parse all tables in the HTML string
var html = '<table>...</table><table>...</table>';
var parsedTables = cf_HTMLTableParser.parseHTMLTables(html);

// Parse tables containing the string "Example" and set the log level to debug
var parsedTables = cf_HTMLTableParser.parseHTMLTables(html, 'Example',"debug");
setLogLevel

Sets the log level for the logger.
Parameters:


Name
Type
Required
Description


strLevel
String
Yes
The log level (error, warn, info, debug).


Returns:


Type
Description


cf_HTMLTableParser
The cf_HTMLTableParser object for chaining.


Example:

// Set the log level to "debug"
var html = '<table>...</table><table>...</table>';
var parsedTables = cf_HTMLTableParser.setLogLevel('debug').parseHTMLTables(html);
_processTable (Private)

Processes a single table and returns an array of objects representing the table rows.
Parameters:


Name
Type
Required
Description


table
String
Yes
The HTML string representing the table.


identRegExp
RegExp
Yes
A regular expression to identify specific tables.


Returns:


Type
Description


Array
An array of objects representing the table rows.


Example:

// Process a table and get the parsed rows
var table = '<table><tr><th>Header 1</th><th>Header 2</th></tr><tr><td>Value 1</td><td>Value 2</td></tr></table>';
var identRegExp = /.*/;
var parsedRows = _processTable(table, identRegExp);
_getHeaders (Private)

Extracts the headers from the header row of a table.
Parameters:


Name
Type
Required
Description


headerRow
String
Yes
The HTML string representing the header row.


Returns:


Type
Description


Array
An array of header strings.


Example:

// Extract headers from a header row
var headerRow = '<tr><th>Header 1</th><th>Header 2</th></tr>';
var headers = _getHeaders(headerRow);
_processRow (Private)

Processes a single row of a table and returns an object representing the row data.
Parameters:


Name
Type
Required
Description


arrRowData
Array
Yes
An array of HTML strings representing the table cells.


arrHeaders
Array
Yes
An array of header strings.


Returns:


Type
Description


Object
An object representing the row data, with keys corresponding to the headers.


Example:

// Process a row and get the row data
var arrRowData = ['<td>Value 1</td>', '<td>Value 2</td>'];
var arrHeaders = ['Header 1', 'Header 2'];
var rowData = _processRow(arrRowData, arrHeaders);
_htmlToStr (Private)

Converts an HTML string to plain text by removing HTML tags and decoding HTML entities.
Parameters:


Name
Type
Required
Description


html
String
Yes
The HTML string to convert.


Returns:


Type
Description


String
The plain text string.


Example:

// Convert an HTML string to plain text
var htmlString = '<p>This is <b>bold</b> text.</p>';
var plainText = _htmlToStr(htmlString);

  
## HTMLTableParser.js
/**
 * @namespace cf_HTMLTableParser
 * @description A module for parsing HTML tables into an array of objects.
 */
var cf_HTMLTableParser = (function () {
    var logger = new GSLog('', "cf_HTMLTableParser");

    /**
     * Parses HTML tables into an array of objects.
       Example Structure of an HTML document with two tables
            Tables Array
            |__Table1 Array
                |__Object1
                |__Object2
                |__Object3
            |__Table2 Array
                |__Object1
                |__Object2
     * @param {string} strHtml - The HTML string containing the tables.
     * @param {string} [strIdentifier] - An optional string to identify specific tables to parse. Regex is also supported. Searches the header row
     * @param {string} [logLevel] - The log level for the logger (error, warn, info, debug).
     * @returns {Array} An array of objects representing the parsed tables.
     * @example
     * // Parse all tables in the HTML string
     * var html = '<table>...</table><table>...</table>';
     * var parsedTables = cf_HTMLTableParser.parseHTMLTables(html);
     *
     * // Parse tables containing the string "Example" and debug level logging
     * var parsedTables = cf_HTMLTableParser.parseHTMLTables(html, 'Example',"debug");

     */
    function parseHTMLTables(strHtml, strIdentifier, logLevel) {
        var results = [];
        identRegExp = strIdentifier ? new RegExp(strIdentifier, 'g') : /.*/;

        var tableMatches = strHtml.match(/<table[\s\S]*?<\/table>/g); // Match all tables
        tableMatches.forEach(function (table) {
            var tableObj = _processTable(table, identRegExp);
            if (tableObj) {
                results.push(tableObj);
            }
        });

        return results;
    }

    /**
     * Sets the log level for the logger.
     *
     * The level name is matched case-insensitively against error, warn, info and debug.
     * Anything else (including non-string values) is ignored and the current level is kept.
     *
     * @param {string} strLevel - The log level (error, warn, info, debug).
     * @returns {this} This object for chaining, whether or not the level was accepted.
     * @example
     * // Set the log level to "debug" and parse the HTML
     * var html = '<table>...</table><table>...</table>';
     * var parsedTables = cf_HTMLTableParser.setLogLevel('debug').parseHTMLTables(html);
     */
    function setLogLevel(strLevel) {
        var validLevels = {
            "error": "error",
            "warn": "warn",
            "info": "info",
            "debug": "debug"
        };

        // Check the type before lower-casing: calling toLowerCase() on a truthy
        // non-string (e.g. a number) used to throw.
        var strKey = (typeof strLevel === "string") ? strLevel.toLowerCase() : "";

        // Own-property lookup so inherited names such as "toString" are not treated as levels.
        if (Object.prototype.hasOwnProperty.call(validLevels, strKey)) {
            logger.setLevel(validLevels[strKey]);
        }

        // Always return the receiver; returning undefined for an unknown level made
        // setLogLevel('typo').parseHTMLTables(...) throw.
        return this;
    }

    /**
     * Processes a single table and returns an array of objects representing the table rows.
     *
     * The first row is treated as the header row. The table is skipped (undefined is
     * returned) when it has no rows, when no headers can be read from the first row, or
     * when identRegExp does not match the first row.
     *
     * @param {string} table - The HTML string representing the table.
     * @param {RegExp} identRegExp - A regular expression to identify specific tables.
     *        Defaults to a pattern that matches everything.
     * @returns {Array|undefined} An array of objects representing the table rows, or
     *          undefined when the table is skipped.
     * @private
     * @example
     * // Process a table and get the parsed rows
     * var table = '<table><tr><th>Header 1</th><th>Header 2</th></tr><tr><td>Value 1</td><td>Value 2</td></tr></table>';
     * var identRegExp = new RegExp('.*');
     * var parsedRows = _processTable(table, identRegExp);
     */
    function _processTable(table, identRegExp) {
        identRegExp = identRegExp || /.*/;
        logger.logDebug("Table: " + table);

        var arrRows = table.match(/<tr[\s\S]*?<\/tr>/gi); // Match all rows within the table (tr elements)
        if (!arrRows) { // Fail early if no rows found
            logger.logDebug("No Rows Found");
            return;
        }

        var arrHeaders = _getHeaders(arrRows[0]); // Build array of headers. Assume row 0 is the header row.
        if (!arrHeaders || arrHeaders.length === 0) { // Return if no headers found
            gs.error("No Headers Found");
            return;
        }
        logger.logDebug("Headers:" + arrHeaders);

        // A global or sticky RegExp resumes from lastIndex on every test(); rewind it so the
        // result depends only on this header row and not on a previous table.
        identRegExp.lastIndex = 0;
        if (!identRegExp.test(arrRows[0])) { // Do not process table if identifier not found
            gs.error("Ident not found");
            return;
        }

        // Process all rows and add them to the table array
        var arrTable = [];
        for (var i = 1; i < arrRows.length; i++) { // Start from 1 to skip the header row
            var arrRowData = arrRows[i].match(/<td[\s\S]*?<\/td>/gi); // Match all td elements
            if (!arrRowData) {
                // A row without td cells (e.g. a th-only sub-header) has nothing to map.
                logger.logDebug("No Cells Found");
                continue;
            }
            var objRow = _processRow(arrRowData, arrHeaders);
            if (objRow) {
                arrTable.push(objRow);
            }
        }

        return arrTable;
    }

    /**
     * Extracts the headers from the header row of a table.
     *
     * Every th or td cell of the row yields one header: its markup is stripped and the
     * remaining text is trimmed. HTML entities in the header text are left as written.
     *
     * @param {string} headerRow - The HTML string representing the header row.
     * @returns {Array} An array of header strings; empty when the row contains no cells.
     * @private
     * @example
     * // Extract headers from a header row
     * var headerRow = '<tr><th>Header 1</th><th>Header 2</th></tr>';
     * var headers = _getHeaders(headerRow); // ['Header 1', 'Header 2']
     */
    function _getHeaders(headerRow) {
        // The character class used to be [hr], which matched the enclosing <tr> instead of
        // <td>: a header row built from td cells collapsed into one merged header.
        // match() returns null when there are no cells, hence the fallback.
        var arrCells = headerRow.match(/<t[hd]\b[\s\S]*?<\/t[hd]>/gi) || [];
        return arrCells.map(function (header) {
            return header.replace(/<\/?[^>]+>/g, '').trim(); // Strip HTML to get clean header text
        });
    }

    /**
     * Processes a single row of a table and returns an object representing the row data.
     *
     * Cells are paired with headers by position. Cells beyond the last header are dropped.
     *
     * @param {Array} arrRowData - An array of HTML strings representing the table cells.
     *        null/undefined is treated as an empty row.
     * @param {Array} arrHeaders - An array of header strings.
     * @returns {Object|null} An object representing the row data, with keys corresponding to
     *          the headers, or null when no cell could be mapped to a header.
     * @private
     * @example
     * // Process a row and get the row data
     * var arrRowData = ['<td>Value 1</td>', '<td>Value 2</td>'];
     * var arrHeaders = ['Header 1', 'Header 2'];
     * var rowData = _processRow(arrRowData, arrHeaders);
     */
    function _processRow(arrRowData, arrHeaders) {
        var objRow = {};
        var hasData = false;
        var headers = arrHeaders || [];

        // String.prototype.match returns null for a row without td cells; treat that as "no cells".
        (arrRowData || []).forEach(function (strRowItem, idx) {
            var strText = _htmlToStr(strRowItem); // Strip HTML and get text
            logger.logDebug("Row Item: " + strText);
            if (idx < headers.length) {
                objRow[headers[idx]] = strText; // Map text to corresponding header as key
                hasData = true;
            }
        });

        logger.logDebug(JSON.stringify(objRow));
        return hasData ? objRow : null;
    }

    /**
     * Converts an HTML string to plain text by removing HTML tags and decoding HTML entities.
     *
     * Steps, in order:
     *  1. br tags become newlines.
     *  2. A newline is inserted after each closing tr, div or p tag that is followed by more content.
     *  3. Whitespace up to a newline after a closing td tag is removed.
     *  4. All remaining tags are stripped.
     *  5. Runs of percent escapes (%XX) are decoded; runs that are not valid UTF-8 are kept as written.
     *  6. The entities nbsp, amp, quot, lt and gt are decoded, with or without the trailing semicolon.
     *
     * @param {string} html - The HTML string to convert. null/undefined yields an empty string.
     * @returns {string} The plain text string.
     * @private
     * @example
     * // Convert an HTML string to plain text
     * var htmlString = '<p>This is <b>bold</b> text.</p>';
     * var plainText = _htmlToStr(htmlString); // 'This is bold text.'
     */
    function _htmlToStr(html) {
        var charMap = {
            'nbsp': String.fromCharCode(160),
            'amp': '&',
            'quot': '"',
            'lt': '<',
            'gt': '>'
        };

        // decodeURIComponent throws URIError on escapes that are not valid UTF-8 (for example
        // the "%ab" in "100%ab"); plain cell text must never abort the whole parse.
        function decodePercentRun(match) {
            try {
                return decodeURIComponent(match);
            } catch (e) {
                return match;
            }
        }

        function translateChar(match, group1) {
            return charMap[group1];
        }

        if (html === null || html === undefined) {
            return "";
        }

        return String(html)
            .replace(/<br\s*\/?>/gi, "\n")
            // Global flag added: only the first closing block tag used to get its newline.
            .replace(/(<\/(?:tr|div|p)>)\s*([^\n])/gi, "$1\n$2")
            .replace(/(<\/td>)\s*\n/gi, "$1")
            .replace(/<[^>]+>/g, '')
            // Whole runs are decoded together so multi-byte sequences such as %C3%A9 survive.
            // The former trailing decodeURI() pass is gone: it threw on any bare "%".
            .replace(/(?:%[0-9a-f]{2})+/gi, decodePercentRun)
            // Consume the optional ';' so "&amp;" becomes "&" rather than "&;".
            .replace(/&(nbsp|amp|quot|lt|gt);?/g, translateChar);
    }

    return {
        parseHTMLTables: parseHTMLTables,
        setLogLevel: setLogLevel
    };
})();
Name	Type	Required	Description
strHtml	String	Yes	The HTML string containing the tables to be parsed.
strIdentifier	String	No	An optional string to identify specific tables to parse.
logLevel	String	No	The log level for the logger (error, warn, info, debug).
Name	Type	Required	Description
table	String	Yes	The HTML string representing the table.
identRegExp	RegExp	Yes	A regular expression to identify specific tables.
Name	Type	Required	Description
arrRowData	Array	Yes	An array of HTML strings representing the table cells.
arrHeaders	Array	Yes	An array of header strings.
	/**
	* @namespace cf_HTMLTableParser
	* @description A module for parsing HTML tables into an array of objects.
	*/
	var cf_HTMLTableParser = (function () {
	var logger = new GSLog('', "cf_HTMLTableParser");

	/**
	* Parses HTML tables into an array of objects.
	Example Structure of an HTML document with two tables
	Tables Array
	\|__Table1 Array
	\|__Object1
	\|__Object2
	\|__Object3
	\|__Table2 Array
	\|__Object1
	\|__Object2
	* @param {string} strHtml - The HTML string containing the tables.
	* @param {string} [strIdentifier] - An optional string to identify specific tables to parse. Regex is also supported. Searches the header row
	* @param {string} [logLevel] - The log level for the logger (error, warn, info, debug).
	* @returns {Array} An array of objects representing the parsed tables.
	* @example
	* // Parse all tables in the HTML string
	* var html = '<table>...</table><table>...</table>';
	* var parsedTables = cf_HTMLTableParser.parseHTMLTables(html);
	*
	* // Parse tables containing the string "Example" and debug level logging
	* var parsedTables = cf_HTMLTableParser.parseHTMLTables(html, 'Example',"debug");

	*/
	function parseHTMLTables(strHtml, strIdentifier, logLevel) {
	var results = [];
	identRegExp = strIdentifier ? new RegExp(strIdentifier, 'g') : /.*/;

	var tableMatches = strHtml.match(/<table[\s\S]*?<\/table>/g); // Match all tables
	tableMatches.forEach(function (table) {
	var tableObj = _processTable(table, identRegExp);
	if (tableObj) {
	results.push(tableObj);
	}
	});

	return results;
	}

	/**
	 * Sets the log level for the logger.
	 *
	 * The level name is matched case-insensitively against error, warn, info and debug.
	 * Anything else (including non-string values) is ignored and the current level is kept.
	 *
	 * @param {string} strLevel - The log level (error, warn, info, debug).
	 * @returns {this} This object for chaining, whether or not the level was accepted.
	 * @example
	 * // Set the log level to "debug" and parse the HTML
	 * var html = '<table>...</table><table>...</table>';
	 * var parsedTables = cf_HTMLTableParser.setLogLevel('debug').parseHTMLTables(html);
	 */
	function setLogLevel(strLevel) {
		var validLevels = {
			"error": "error",
			"warn": "warn",
			"info": "info",
			"debug": "debug"
		};

		// Check the type before lower-casing: calling toLowerCase() on a truthy
		// non-string (e.g. a number) used to throw. This also replaces the
		// backslash-escaped "or" operators, which were not valid JavaScript.
		var strKey = (typeof strLevel === "string") ? strLevel.toLowerCase() : "";

		// Own-property lookup so inherited names such as "toString" are not treated as levels.
		if (Object.prototype.hasOwnProperty.call(validLevels, strKey)) {
			logger.setLevel(validLevels[strKey]);
		}

		// Always return the receiver; returning undefined for an unknown level made
		// setLogLevel('typo').parseHTMLTables(...) throw.
		return this;
	}

	/**
	 * Processes a single table and returns an array of objects representing the table rows.
	 *
	 * The first row is treated as the header row. The table is skipped (undefined is
	 * returned) when it has no rows, when no headers can be read from the first row, or
	 * when identRegExp does not match the first row.
	 *
	 * @param {string} table - The HTML string representing the table.
	 * @param {RegExp} identRegExp - A regular expression to identify specific tables.
	 *        Defaults to a pattern that matches everything.
	 * @returns {Array|undefined} An array of objects representing the table rows, or
	 *          undefined when the table is skipped.
	 * @private
	 * @example
	 * // Process a table and get the parsed rows
	 * var table = '<table><tr><th>Header 1</th><th>Header 2</th></tr><tr><td>Value 1</td><td>Value 2</td></tr></table>';
	 * var identRegExp = new RegExp('.*');
	 * var parsedRows = _processTable(table, identRegExp);
	 */
	function _processTable(table, identRegExp) {
		// The "or" operators here were backslash-escaped, which is a syntax error.
		identRegExp = identRegExp || /.*/;
		logger.logDebug("Table: " + table);

		var arrRows = table.match(/<tr[\s\S]*?<\/tr>/gi); // Match all rows within the table (tr elements)
		if (!arrRows) { // Fail early if no rows found
			logger.logDebug("No Rows Found");
			return;
		}

		var arrHeaders = _getHeaders(arrRows[0]); // Build array of headers. Assume row 0 is the header row.
		if (!arrHeaders || arrHeaders.length === 0) { // Return if no headers found
			gs.error("No Headers Found");
			return;
		}
		logger.logDebug("Headers:" + arrHeaders);

		// A global or sticky RegExp resumes from lastIndex on every test(); rewind it so the
		// result depends only on this header row and not on a previous table.
		identRegExp.lastIndex = 0;
		if (!identRegExp.test(arrRows[0])) { // Do not process table if identifier not found
			gs.error("Ident not found");
			return;
		}

		// Process all rows and add them to the table array
		var arrTable = [];
		for (var i = 1; i < arrRows.length; i++) { // Start from 1 to skip the header row
			var arrRowData = arrRows[i].match(/<td[\s\S]*?<\/td>/gi); // Match all td elements
			if (!arrRowData) {
				// A row without td cells (e.g. a th-only sub-header) has nothing to map.
				logger.logDebug("No Cells Found");
				continue;
			}
			var objRow = _processRow(arrRowData, arrHeaders);
			if (objRow) {
				arrTable.push(objRow);
			}
		}

		return arrTable;
	}

	/**
	 * Extracts the headers from the header row of a table.
	 *
	 * Every th or td cell of the row yields one header: its markup is stripped and the
	 * remaining text is trimmed. HTML entities in the header text are left as written.
	 *
	 * @param {string} headerRow - The HTML string representing the header row.
	 * @returns {Array} An array of header strings; empty when the row contains no cells.
	 * @private
	 * @example
	 * // Extract headers from a header row
	 * var headerRow = '<tr><th>Header 1</th><th>Header 2</th></tr>';
	 * var headers = _getHeaders(headerRow); // ['Header 1', 'Header 2']
	 */
	function _getHeaders(headerRow) {
		// The character class used to be [hr], which matched the enclosing <tr> instead of
		// <td>: a header row built from td cells collapsed into one merged header.
		// match() returns null when there are no cells, hence the fallback.
		var arrCells = headerRow.match(/<t[hd]\b[\s\S]*?<\/t[hd]>/gi) || [];
		return arrCells.map(function (header) {
			return header.replace(/<\/?[^>]+>/g, '').trim(); // Strip HTML to get clean header text
		});
	}

	/**
	 * Processes a single row of a table and returns an object representing the row data.
	 *
	 * Cells are paired with headers by position. Cells beyond the last header are dropped.
	 *
	 * @param {Array} arrRowData - An array of HTML strings representing the table cells.
	 *        null/undefined is treated as an empty row.
	 * @param {Array} arrHeaders - An array of header strings.
	 * @returns {Object|null} An object representing the row data, with keys corresponding to
	 *          the headers, or null when no cell could be mapped to a header.
	 * @private
	 * @example
	 * // Process a row and get the row data
	 * var arrRowData = ['<td>Value 1</td>', '<td>Value 2</td>'];
	 * var arrHeaders = ['Header 1', 'Header 2'];
	 * var rowData = _processRow(arrRowData, arrHeaders);
	 */
	function _processRow(arrRowData, arrHeaders) {
		var objRow = {};
		var hasData = false;
		var headers = arrHeaders || [];

		// String.prototype.match returns null for a row without td cells; treat that as "no cells".
		(arrRowData || []).forEach(function (strRowItem, idx) {
			var strText = _htmlToStr(strRowItem); // Strip HTML and get text
			logger.logDebug("Row Item: " + strText);
			if (idx < headers.length) {
				objRow[headers[idx]] = strText; // Map text to corresponding header as key
				hasData = true;
			}
		});

		logger.logDebug(JSON.stringify(objRow));
		return hasData ? objRow : null;
	}

	/**
	 * Converts an HTML string to plain text by removing HTML tags and decoding HTML entities.
	 *
	 * Steps, in order:
	 *  1. br tags become newlines.
	 *  2. A newline is inserted after each closing tr, div or p tag that is followed by more content.
	 *  3. Whitespace up to a newline after a closing td tag is removed.
	 *  4. All remaining tags are stripped.
	 *  5. Runs of percent escapes (%XX) are decoded; runs that are not valid UTF-8 are kept as written.
	 *  6. The entities nbsp, amp, quot, lt and gt are decoded, with or without the trailing semicolon.
	 *
	 * @param {string} html - The HTML string to convert. null/undefined yields an empty string.
	 * @returns {string} The plain text string.
	 * @private
	 * @example
	 * // Convert an HTML string to plain text
	 * var htmlString = '<p>This is <b>bold</b> text.</p>';
	 * var plainText = _htmlToStr(htmlString); // 'This is bold text.'
	 */
	function _htmlToStr(html) {
		var charMap = {
			'nbsp': String.fromCharCode(160),
			'amp': '&',
			'quot': '"',
			'lt': '<',
			'gt': '>'
		};

		// decodeURIComponent throws URIError on escapes that are not valid UTF-8 (for example
		// the "%ab" in "100%ab"); plain cell text must never abort the whole parse.
		function decodePercentRun(match) {
			try {
				return decodeURIComponent(match);
			} catch (e) {
				return match;
			}
		}

		function translateChar(match, group1) {
			return charMap[group1];
		}

		if (html === null || html === undefined) {
			return "";
		}

		// The alternations below use a real "|": the backslash-escaped pipes that were here
		// matched a literal pipe character, so no entity or closing tag was ever recognised.
		return String(html)
			.replace(/<br\s*\/?>/gi, "\n")
			// Global flag added: only the first closing block tag used to get its newline.
			.replace(/(<\/(?:tr|div|p)>)\s*([^\n])/gi, "$1\n$2")
			.replace(/(<\/td>)\s*\n/gi, "$1")
			.replace(/<[^>]+>/g, '')
			// Whole runs are decoded together so multi-byte sequences such as %C3%A9 survive.
			// The former trailing decodeURI() pass is gone: it threw on any bare "%".
			.replace(/(?:%[0-9a-f]{2})+/gi, decodePercentRun)
			// Consume the optional ';' so "&amp;" becomes "&" rather than "&;".
			.replace(/&(nbsp|amp|quot|lt|gt);?/g, translateChar);
	}

	return {
	parseHTMLTables: parseHTMLTables,
	setLogLevel: setLogLevel
	};
	})();