lancejpollard/a.readme.md

## a.readme.md

      
    Raw
  

              a.readme.md
            
          
    Wiktionary HTML Table Parser

This should work against the Wiktionary declension and conjugation tables listed below. To test it, open each Wiktionary page, scroll down to the conjugation or declension table, and click to expand it before running the parser on it.


success
language
form
operation
title


✅
Danish
noun
declension
Danish Noun Declension


✅
French
verb
conjugation
French Verb Conjugation


✅
Latin
verb
conjugation
Latin Verb Conjugation


✅
Ancient Greek
verb
conjugation
Ancient Greek Verb Conjugation


✅
Arabic
verb
conjugation
Arabic Verb Conjugation


✅
Arabic
noun
declension
Arabic Noun Declension


✅
Russian
noun
declension
Russian Noun Declension


❌
Russian
verb
conjugation
Russian Verb Conjugation


❌
Ukrainian
verb
conjugation
Ukrainian Verb Conjugation


✅
Tamil
noun
declension
Tamil Noun Declension


✅
Tamil
verb
conjugation
Tamil Verb Conjugation


Others that don't fit the same pattern:


success
language
form
operation
title


❌
Irish
noun
declension
Irish Noun Declension


## a.usage.js
import { toMatrix, toObject } from './parser'

// Locate an inflection table on the current page, expand its row/column spans
// into a matrix, flatten that matrix into records, and print the records.
const inflectionTable = document.querySelector(".inflection-table");
const inflectionMatrix = toMatrix(inflectionTable);
console.log(toObject(inflectionMatrix));

## parser.js
/**
 * Return the normalised text of an element.
 *
 * - Unless `latinOnly` is set, the trimmed text of the first `<i>` descendant
 *   is wrapped in underscores wherever it occurs in the element's text.
 * - When `latinOnly` is set, every character outside U+0010..U+00FF is
 *   removed (this also drops control characters below U+0010, e.g. tab and
 *   line feed).
 * - Every whitespace run, optionally followed by a single digit plus more
 *   whitespace (assumed to be a footnote marker), becomes one space; each
 *   em dash becomes a space; the result is trimmed.
 *
 * @param {Element} elem - Element providing `textContent` and `querySelector`.
 * @param {boolean} [latinOnly] - Strip non-Latin characters instead of marking italics.
 * @returns {string} The cleaned text.
 */
export function clean(elem, latinOnly) {
    const emphasised = elem.querySelector("i")?.textContent.trim();
    let text = elem.textContent;
    if (latinOnly) {
        // Qualifier mode: keep only the Latin part of the text.
        text = text.replace(/[^\x10-\xFF]+/gu, "");
    } else if (emphasised) {
        // Data mode: flag the italic fragment (assumed to be at most one word).
        text = text.replaceAll(emphasised, `_${emphasised}_`);
    }
    // Collapse whitespace, drop isolated single-digit footnotes and em dashes.
    const collapsed = text.replace(/(\s)\s*(\d\s+)?|—/gu, " ");
    return collapsed.trim();
}

/**
 * Convert an HTML table into a matrix (array of rows), repeating a cell's
 * content at every grid position covered by its colspan/rowspan.
 *
 * Each matrix entry is one of:
 *   - a string: the Latin-only text of a qualifier cell (any non-TD cell);
 *   - an object: the data of a TD cell, keyed by the `lang` attribute value of
 *     its descendants (a string, or an array when a language occurs more than
 *     once), optionally with an `ipa` key; or `{ alias }` for any cell that
 *     contains an `i > span` element;
 *   - null: a TD cell that yields no text;
 *   - undefined: a grid position for which the row supplied no cell.
 *
 * A leading row whose first cell has class `vsToggleElement` and a trailing
 * row whose first cell's class name contains `-notes-` are skipped. Tables
 * without rows, or with rows lacking cells, are handled without throwing.
 *
 * @param {HTMLTableElement} table - Table exposing `rows` and `querySelectorAll`.
 * @returns {Array<Array<string|object|null|undefined>>} One array per kept row.
 */
export function toMatrix(table) {
    // Get a 2-letter lang attribute from anywhere in the whole table. To be used when
    //   a data cell does not have an element with a lang attribute.
    const defaultLang = Array.from(table.querySelectorAll("[lang]"), elem => elem.getAttribute("lang"))
                             .find(lang => lang.length === 2) ?? "text";

    // Extract the content from a table cell, taking into account
    //    tag (td/th), IPA class, lang, row/colspan attributes, and italics.
    // Returns an array of `{ content, span }` objects, one per spanned column;
    //    `span` is the number of rows the cell still has to cover.
    function get(cell) {
        const length = +(cell.getAttribute("colspan") ?? 1);
        const span = +(cell.getAttribute("rowspan") ?? 1);
        // Assume that the presence of italics means it is an alias:
        const hasAlias = !!cell.querySelector("i > span");
        // Assume that data is distinguished from qualifiers by use
        // of TD tag instead of TH tag
        const hasData = cell.tagName === "TD";
        let content = {}; // Data will be represented in an object
        if (hasAlias) {
            content.alias = clean(cell);
        } else if (hasData) {
            const ipa = cell.querySelector(".IPA");
            if (ipa) content.ipa = clean(ipa);
            for (const version of cell.querySelectorAll("[lang]")) {
                const lang = version.getAttribute("lang");
                let cleanVersion = clean(version);
                // Check if this version is between parentheses, indicating it is an optional suffix
                //    This occurs in the Ancient Greek table, like with "ἀμφεκᾰ́λυψε(ν)"
                if (version.previousSibling?.textContent === "(") {
                    const previous = Array.isArray(content[lang]) ? content[lang].at(-1) : content[lang];
                    // Only prepend when there is an earlier version to extend; otherwise
                    // the result would start with the literal text "undefined".
                    if (previous !== undefined) cleanVersion = previous + cleanVersion;
                }
                // If there are multiple data for the same language: make an array
                content[lang] = lang in content
                    ? [].concat(content[lang], cleanVersion)
                    : cleanVersion;
            }
            // If there is no IPA class, nor lang attribute,
            // then assume that the whole text of this cell
            // is text in the target language
            if (!Object.keys(content).length) {
                const version = clean(cell);
                content = version ? { [defaultLang]: version } : null;
            }
        } else {
            // It's a qualifier: assign a string (not object).
            // Only keep Latin character content.
            content = clean(cell, true);
        }
        // Return with the content also the rowspan
        return Array.from({length}, () => ({ content, span }));
    }

    let cells;
    const rows = [...table.rows];
    // Ignore first row if it is the clickable row to expand/collapse the rest of the table.
    // Optional chaining guards against a table without rows or a row without cells.
    if (rows[0]?.cells[0]?.classList.contains("vsToggleElement")) rows.shift();
    // Ignore last row if it has notes
    if (rows.at(-1)?.cells[0]?.className.includes("-notes-")) rows.pop();
    return Array.from(rows, row => {
        // Get the current row, with colspans resolved into cells
        const buff = Array.from(row.cells, get).flat();
        // Integrate this row by taking into account the rowspans of cells in
        // previous row(s): a cell whose remaining span is not exhausted stays in
        // place, otherwise the next cell of this row (or an empty placeholder)
        // takes the position. Left-over cells of this row are appended.
        cells = cells?.map(cell => --cell.span ? cell : buff.shift() ?? ({span:1})).concat(buff)
              ?? buff;
        // Now that rowspans have served their purpose, just retain
        // the content:
        return cells.map(cell => cell.content);
    });
}

/**
 * Flatten a matrix of qualifiers and data into a list of records.
 *
 * Matrix entries of type "object" (including null) mark data positions;
 * strings are qualifiers. A row whose first such entry sits at column 1 or
 * later is a data row; every other row is a heading row.
 *
 * - Heading row: a truthy first entry becomes the table-level qualifier, and
 *   each entry is appended to the qualifier list of its column. The column
 *   lists are reset first when the previous row was a data row.
 * - Data row: the entries before the first data column are row qualifiers.
 *   For every column that has a qualifier list and a truthy value, one record
 *   is emitted: `true` under each table/row/column qualifier, merged with the
 *   properties of the value.
 *
 * The input matrix is not modified.
 *
 * @param {Array<Array<*>>} matrix - Rows of strings, objects, null or undefined.
 * @returns {Array<object>} The flattened records, in row then column order.
 */
export function toObject(matrix) {
    const records = [];
    const tableScope = []; // Holds at most one entry: the current table-level qualifier
    let columnHeaders = [];
    let previousRowHadData = true;

    for (const sourceRow of matrix) {
        const row = [...sourceRow];
        const firstData = row.findIndex((entry) => typeof entry === "object");
        // An object in the very first column does not make this a data row
        // (some heading rows start with a TD element).
        const isDataRow = firstData >= 1;

        if (isDataRow) {
            // Sections without column headers get one dummy (empty) header list
            if (columnHeaders.length === 0) columnHeaders[firstData] = [];
            const rowScope = [...tableScope, ...row.slice(0, firstData)];
            const values = row.slice(firstData);
            columnHeaders.slice(firstData).forEach((headers, offset) => {
                const value = values[offset];
                if (!value) return;
                // Every applicable qualifier becomes a key with value true
                const flags = Object.fromEntries(
                    [...rowScope, ...headers].map((name) => [name, true]),
                );
                records.push({ ...flags, ...value });
            });
        } else {
            const [lead] = row;
            if (lead) tableScope[0] = lead;
            // A heading that follows data opens a new section
            if (previousRowHadData) columnHeaders = [];
            row.forEach((entry, column) => {
                (columnHeaders[column] ??= []).push(entry);
            });
        }
        previousRowHadData = isDataRow;
    }
    return records;
}
success	language	form	operation	title
✅	Danish	noun	declension	Danish Noun Declension
✅	French	verb	conjugation	French Verb Conjugation
✅	Latin	verb	conjugation	Latin Verb Conjugation
✅	Ancient Greek	verb	conjugation	Ancient Greek Verb Conjugation
✅	Arabic	verb	conjugation	Arabic Verb Conjugation
✅	Arabic	noun	declension	Arabic Noun Declension
✅	Russian	noun	declension	Russian Noun Declension
❌	Russian	verb	conjugation	Russian Verb Conjugation
❌	Ukrainian	verb	conjugation	Ukrainian Verb Conjugation
✅	Tamil	noun	declension	Tamil Noun Declension
✅	Tamil	verb	conjugation	Tamil Verb Conjugation
	import { toMatrix, toObject } from './parser'

	// Locate an inflection table on the current page, expand its row/column spans
	// into a matrix, flatten that matrix into records, and print the records.
	const inflectionTable = document.querySelector(".inflection-table");
	const inflectionMatrix = toMatrix(inflectionTable);
	console.log(toObject(inflectionMatrix));
	/**
	 * Return the normalised text of an element.
	 *
	 * - Unless `latinOnly` is set, the trimmed text of the first `<i>` descendant
	 *   is wrapped in underscores wherever it occurs in the element's text.
	 * - When `latinOnly` is set, every character outside U+0010..U+00FF is
	 *   removed (this also drops control characters below U+0010, e.g. tab and
	 *   line feed).
	 * - Every whitespace run, optionally followed by a single digit plus more
	 *   whitespace (assumed to be a footnote marker), becomes one space; each
	 *   em dash becomes a space; the result is trimmed.
	 *
	 * @param {Element} elem - Element providing `textContent` and `querySelector`.
	 * @param {boolean} [latinOnly] - Strip non-Latin characters instead of marking italics.
	 * @returns {string} The cleaned text.
	 */
	export function clean(elem, latinOnly) {
		let txt = elem.textContent;
		const italic = elem.querySelector("i")?.textContent.trim();
		// Mark italics (assuming at most one word). latinOnly flag must be off:
		if (italic && !latinOnly) txt = txt.replaceAll(italic, "_" + italic + "_");
		// The parameter determines whether to remove non-Latin characters
		if (latinOnly) txt = txt.replace(/[^\x10-\xFF]+/gu, ""); // Remove non-Latin characters
		// Assume that a footnote appears as isolated, single digit.
		// The "|" must be an unescaped alternation: whitespace runs (with an optional
		// footnote digit) OR an em dash. An escaped "\|" would demand a literal pipe
		// and the pattern would then never match ordinary text.
		return txt.replace(/(\s)\s*(\d\s+)?|—/gu, " ").trim(); // Remove footnotes, multiple whitespace
	}

	/**
	 * Convert an HTML table into a matrix (array of rows), repeating a cell's
	 * content at every grid position covered by its colspan/rowspan.
	 *
	 * Each matrix entry is one of:
	 *   - a string: the Latin-only text of a qualifier cell (any non-TD cell);
	 *   - an object: the data of a TD cell, keyed by the `lang` attribute value of
	 *     its descendants (a string, or an array when a language occurs more than
	 *     once), optionally with an `ipa` key; or `{ alias }` for any cell that
	 *     contains an `i > span` element;
	 *   - null: a TD cell that yields no text;
	 *   - undefined: a grid position for which the row supplied no cell.
	 *
	 * A leading row whose first cell has class `vsToggleElement` and a trailing
	 * row whose first cell's class name contains `-notes-` are skipped. Tables
	 * without rows, or with rows lacking cells, are handled without throwing.
	 *
	 * @param {HTMLTableElement} table - Table exposing `rows` and `querySelectorAll`.
	 * @returns {Array<Array<string|object|null|undefined>>} One array per kept row.
	 */
	export function toMatrix(table) {
		// Get a 2-letter lang attribute from anywhere in the whole table. To be used when
		// a data cell does not have an element with a lang attribute.
		const defaultLang = Array.from(table.querySelectorAll("[lang]"), elem => elem.getAttribute("lang"))
			.find(lang => lang.length === 2) ?? "text";

		// Extract the content from a table cell, taking into account
		// tag (td/th), IPA class, lang, row/colspan attributes, and italics.
		// Returns an array of `{ content, span }` objects, one per spanned column;
		// `span` is the number of rows the cell still has to cover.
		function get(cell) {
			const length = +(cell.getAttribute("colspan") ?? 1);
			const span = +(cell.getAttribute("rowspan") ?? 1);
			// Assume that the presence of italics means it is an alias:
			const hasAlias = !!cell.querySelector("i > span");
			// Assume that data is distinguished from qualifiers by use
			// of TD tag instead of TH tag
			const hasData = cell.tagName === "TD";
			let content = {}; // Data will be represented in an object
			if (hasAlias) {
				content.alias = clean(cell);
			} else if (hasData) {
				const ipa = cell.querySelector(".IPA");
				if (ipa) content.ipa = clean(ipa);
				for (const version of cell.querySelectorAll("[lang]")) {
					const lang = version.getAttribute("lang");
					let cleanVersion = clean(version);
					// Check if this version is between parentheses, indicating it is an optional suffix
					// This occurs in the Ancient Greek table, like with "ἀμφεκᾰ́λυψε(ν)"
					if (version.previousSibling?.textContent === "(") {
						const previous = Array.isArray(content[lang]) ? content[lang].at(-1) : content[lang];
						// Only prepend when there is an earlier version to extend; otherwise
						// the result would start with the literal text "undefined".
						if (previous !== undefined) cleanVersion = previous + cleanVersion;
					}
					// If there are multiple data for the same language: make an array
					content[lang] = lang in content
						? [].concat(content[lang], cleanVersion)
						: cleanVersion;
				}
				// If there is no IPA class, nor lang attribute,
				// then assume that the whole text of this cell
				// is text in the target language
				if (!Object.keys(content).length) {
					const version = clean(cell);
					content = version ? { [defaultLang]: version } : null;
				}
			} else {
				// It's a qualifier: assign a string (not object).
				// Only keep Latin character content.
				content = clean(cell, true);
			}
			// Return with the content also the rowspan
			return Array.from({length}, () => ({ content, span }));
		}

		let cells;
		const rows = [...table.rows];
		// Ignore first row if it is the clickable row to expand/collapse the rest of the table.
		// Optional chaining guards against a table without rows or a row without cells.
		if (rows[0]?.cells[0]?.classList.contains("vsToggleElement")) rows.shift();
		// Ignore last row if it has notes
		if (rows.at(-1)?.cells[0]?.className.includes("-notes-")) rows.pop();
		return Array.from(rows, row => {
			// Get the current row, with colspans resolved into cells
			const buff = Array.from(row.cells, get).flat();
			// Integrate this row by taking into account the rowspans of cells in
			// previous row(s): a cell whose remaining span is not exhausted stays in
			// place, otherwise the next cell of this row (or an empty placeholder)
			// takes the position. Left-over cells of this row are appended.
			cells = cells?.map(cell => --cell.span ? cell : buff.shift() ?? ({span:1})).concat(buff)
				?? buff;
			// Now that rowspans have served their purpose, just retain
			// the content:
			return cells.map(cell => cell.content);
		});
	}

	/**
	 * Flatten a matrix of qualifiers and data into a list of records.
	 *
	 * Matrix entries of type "object" (including null) mark data positions;
	 * strings are qualifiers. A row whose first such entry sits at column 1 or
	 * later is a data row; every other row is a heading row.
	 *
	 * - Heading row: a truthy first entry becomes the table-level qualifier, and
	 *   each entry is appended to the qualifier list of its column. The column
	 *   lists are reset first when the previous row was a data row.
	 * - Data row: the entries before the first data column are row qualifiers.
	 *   For every column that has a qualifier list and a truthy value, one record
	 *   is emitted: `true` under each table/row/column qualifier, merged with the
	 *   properties of the value.
	 *
	 * The input matrix is not modified.
	 *
	 * @param {Array<Array<*>>} matrix - Rows of strings, objects, null or undefined.
	 * @returns {Array<object>} The flattened records, in row then column order.
	 */
	export function toObject(matrix) {
		const records = [];
		const tableScope = []; // Holds at most one entry: the current table-level qualifier
		let columnHeaders = [];
		let previousRowHadData = true;

		for (const sourceRow of matrix) {
			const row = [...sourceRow];
			const firstData = row.findIndex((entry) => typeof entry === "object");
			// An object in the very first column does not make this a data row
			// (some heading rows start with a TD element).
			const isDataRow = firstData >= 1;

			if (isDataRow) {
				// Sections without column headers get one dummy (empty) header list
				if (columnHeaders.length === 0) columnHeaders[firstData] = [];
				const rowScope = [...tableScope, ...row.slice(0, firstData)];
				const values = row.slice(firstData);
				columnHeaders.slice(firstData).forEach((headers, offset) => {
					const value = values[offset];
					if (!value) return;
					// Every applicable qualifier becomes a key with value true
					const flags = Object.fromEntries(
						[...rowScope, ...headers].map((name) => [name, true]),
					);
					records.push({ ...flags, ...value });
				});
			} else {
				const [lead] = row;
				if (lead) tableScope[0] = lead;
				// A heading that follows data opens a new section
				if (previousRowHadData) columnHeaders = [];
				row.forEach((entry, column) => {
					(columnHeaders[column] ??= []).push(entry);
				});
			}
			previousRowHadData = isDataRow;
		}
		return records;
	}