/**
 * Get an element's text, marking italics with underscores, removing footnotes
 * and extra white-space, and optionally removing non-Latin text.
 *
 * @param {Element} elem - Element to read; only `textContent` and
 *   `querySelector` are used.
 * @param {boolean} [latinOnly] - When truthy, italics are not marked and every
 *   character outside U+0010..U+00FF is dropped (this also drops letters
 *   such as "ō" that lie above U+00FF).
 * @returns {string} The cleaned, trimmed text.
 */
export function clean(elem, latinOnly) {
  let txt = elem.textContent;

  if (latinOnly) {
    // NOTE(review): the lower bound \x10 also deletes tabs and newlines
    // (U+0009, U+000A, U+000D) instead of leaving them to be collapsed into a
    // space below, so "a\nb" becomes "ab". Confirm whether \x00 was intended.
    txt = txt.replace(/[^\x10-\xFF]+/gu, "");
  } else {
    // Mark italics, assuming the first <i> holds at most one word.
    const italic = elem.querySelector("i")?.textContent.trim();
    if (italic) {
      // A function replacer inserts the text literally; a replacement string
      // would interpret sequences such as "$$" or "$&" found in the text.
      txt = txt.replaceAll(italic, () => `_${italic}_`);
    }
  }

  // Collapse each white-space run to one space. A single digit that sits
  // between white-space is assumed to be a footnote marker and is swallowed
  // with the run. Em dashes are turned into a space as well.
  return txt.replace(/(\s)\s*(\d\s+)?|—/gu, " ").trim();
}
/**
 * Convert an HTML table to a matrix (one array per row), duplicating cell
 * content wherever a colspan or rowspan makes a cell cover several positions.
 *
 * Each matrix entry is one of:
 *  - an object for a cell containing `i > span` (`{ alias }`) or for a TD cell
 *    (keys `ipa` and/or one key per `lang` attribute value, or the table's
 *    default language when the cell has neither);
 *  - `null` for a TD cell without any text;
 *  - a string for any other cell (treated as a qualifier, cleaned with the
 *    Latin-only option);
 *  - `undefined` where a row has fewer cells than the previous row left open.
 *
 * The first row is skipped when its first cell has class `vsToggleElement`,
 * and the last row is skipped when its first cell's class name contains
 * `-notes-`.
 *
 * @param {HTMLTableElement} table - Table to read (`rows` and
 *   `querySelectorAll` are used).
 * @returns {Array<Array<object|string|null|undefined>>} The matrix.
 */
export function toMatrix(table) {
  // A 2-letter lang attribute taken from anywhere in the table. It keys the
  // text of data cells that have no element with a lang attribute.
  const defaultLang =
    Array.from(table.querySelectorAll("[lang]"), (elem) => elem.getAttribute("lang"))
      .find((lang) => lang.length === 2) ?? "text";

  // Number of columns a cell covers. A missing, zero or unparsable colspan
  // counts as 1, so that such a cell is not silently dropped from its row.
  function columnSpan(cell) {
    const parsed = Number.parseInt(cell.getAttribute("colspan") ?? "1", 10);
    return parsed > 0 ? parsed : 1;
  }

  // Extract the content of one table cell, taking into account the tag
  // (TD/TH), IPA class, lang attributes, row/colspan and italics.
  // Returns an array with one `{ content, span }` object per covered column;
  // `span` is the rowspan and is consumed while rows are merged below.
  function get(cell) {
    const length = columnSpan(cell);
    const span = +(cell.getAttribute("rowspan") ?? 1);
    let content;

    if (cell.querySelector("i > span")) {
      // Assume that the presence of italics means it is an alias.
      content = { alias: clean(cell) };
    } else if (cell.tagName === "TD") {
      // Assume that data is distinguished from qualifiers by the TD tag.
      content = {};
      const ipa = cell.querySelector(".IPA");
      if (ipa) content.ipa = clean(ipa);

      for (const version of cell.querySelectorAll("[lang]")) {
        const lang = version.getAttribute("lang");
        const earlier = Object.hasOwn(content, lang) ? content[lang] : undefined;
        let cleanVersion = clean(version);

        // A version right after "(" is an optional suffix of the previous
        // version, as in the Ancient Greek "ἀμφεκᾰ́λυψε(ν)": store the
        // previous version with the suffix appended. When there is no
        // previous version, the suffix is stored on its own.
        if (version.previousSibling?.textContent === "(") {
          const stem = Array.isArray(earlier) ? earlier.at(-1) : earlier;
          cleanVersion = `${stem ?? ""}${cleanVersion}`;
        }

        // Multiple data for the same language are collected in an array.
        content[lang] = earlier === undefined
          ? cleanVersion
          : [].concat(earlier, cleanVersion);
      }

      // Neither IPA class nor lang attribute: assume the whole text of the
      // cell is text in the target language; an empty cell becomes null.
      if (!Object.keys(content).length) {
        const version = clean(cell);
        content = version ? { [defaultLang]: version } : null;
      }
    } else {
      // It's a qualifier: a string (not an object), Latin characters only.
      content = clean(cell, true);
    }

    return Array.from({ length }, () => ({ content, span }));
  }

  const rows = [...table.rows];
  // Ignore the first row if it is the clickable row that expands/collapses
  // the rest of the table.
  if (rows[0]?.cells[0]?.classList.contains("vsToggleElement")) rows.shift();
  // Ignore the last row if it holds notes.
  if (rows.at(-1)?.cells[0]?.className.includes("-notes-")) rows.pop();

  let cells; // Grid row carried over from the previous iteration
  return rows.map((row) => {
    // The current row, with colspans resolved into separate cells.
    const buff = Array.from(row.cells, (cell) => get(cell)).flat();

    if (cells === undefined) {
      cells = buff;
    } else {
      // Positions whose rowspan is not yet used up keep the cell from above;
      // the others take the next cell of this row (or an empty placeholder
      // when the row has run out of cells).
      const merged = [];
      for (const cell of cells) {
        cell.span -= 1;
        merged.push(cell.span ? cell : (buff.shift() ?? { span: 1 }));
      }
      cells = [...merged, ...buff];
    }

    // The rowspans have served their purpose: only return the content.
    return cells.map((cell) => cell.content);
  });
}
/**
 * Convert a matrix (rows of qualifier strings and data objects) to a flat
 * array of objects: one object per non-empty data cell, holding the cell's
 * own properties plus a `true`-valued key for each qualifier that applies.
 *
 * A row is a data row when its first object-typed entry is at column 1 or
 * later; the entries before it are that row's qualifiers. Any other row is a
 * heading row: its first entry (when truthy) becomes the table-level
 * qualifier and its entries are stacked as per-column qualifiers. Consecutive
 * heading rows accumulate; the first heading row after a data row starts a
 * new section.
 *
 * Only non-empty strings are used as qualifier keys.
 *
 * @param {Array<Array<object|string|null|undefined>>} matrix - Rows to
 *   convert; the rows are not modified.
 * @returns {Array<object>} The flat list of data objects.
 */
export function toObject(matrix) {
  const isQualifier = (value) => typeof value === "string" && value !== "";
  const result = [];
  const tabQualifier = [];
  const colQualifiers = [];
  let previousRowHadData = true;

  for (const source of matrix) {
    const row = [...source]; // Copy: the row is spliced below

    // Column index of the first data entry: data is represented by an object,
    // a qualifier by a string. `typeof null` is "object" too, so an empty
    // data cell also marks where the data starts.
    const dataAt = row.findIndex((content) => typeof content === "object");
    // Some heading rows have a TD element in the first column (an
    // inconsistency in the table for Arabic), so it is only considered data
    // when found in a non-first column.
    const hasData = dataAt >= 1;

    if (!hasData) {
      // Heading row. A qualifier in the first column is a table qualifier
      // (the highest-level qualifier).
      if (row[0]) tabQualifier[0] = row[0];
      // If the previous row had data, a new section starts: reset the
      // column qualifiers.
      if (previousRowHadData) colQualifiers.length = 0;
      row.forEach((col, i) => {
        (colQualifiers[i] ??= []).push(col);
      });
    } else {
      // For sections without column headers, create one dummy column
      // qualifier (so only the first data column of such a row is used).
      if (!colQualifiers.length) colQualifiers[dataAt] = [];
      const rowQualifiers = [...tabQualifier, ...row.splice(0, dataAt)];

      // After the splice, row[i] lines up with colQualifiers[dataAt + i].
      colQualifiers.slice(dataAt).forEach((colQualifier, i) => {
        const entry = row[i];
        if (!entry) return; // Empty data cell
        // Keys with `true` as value for each relevant qualifier; empty or
        // missing header cells and non-string entries are left out.
        const flags = [...rowQualifiers, ...colQualifier]
          .filter(isQualifier)
          .map((qualifier) => [qualifier, true]);
        result.push({ ...Object.fromEntries(flags), ...entry });
      });
    }

    previousRowHadData = hasData;
  }

  return result;
}