Skip to content

Instantly share code, notes, and snippets.

@lancejpollard
Last active February 21, 2024 11:43
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save lancejpollard/ce8e1bd04de1eec150d974447d46cc43 to your computer and use it in GitHub Desktop.
Save lancejpollard/ce8e1bd04de1eec150d974447d46cc43 to your computer and use it in GitHub Desktop.
Wiktionary Table Parser

Wiktionary HTML Table Parser

This should work against these Wiktionary declension and conjugation tables. To test it, open each linked Wiktionary page, scroll down to the conjugation or declension table, and click to expand it.

success language form operation title
Danish noun declension Danish Noun Declension
French verb conjugation French Verb Conjugation
Latin verb conjugation Latin Verb Conjugation
Ancient Greek verb conjugation Ancient Greek Verb Conjugation
Arabic verb conjugation Arabic Verb Conjugation
Arabic noun declension Arabic Noun Declension
Russian noun declension Russian Noun Declension
Russian verb conjugation Russian Verb Conjugation
Ukrainian verb conjugation Ukrainian Verb Conjugation
Tamil noun declension Tamil Noun Declension
Tamil verb conjugation Tamil Verb Conjugation

Others that don't fit the same pattern:

success language form operation title
Irish noun declension Irish Noun Declension
import { toMatrix, toObject } from './parser'
// Usage example: convert the first element matching ".inflection-table" on the
// current page to a matrix, flatten it, and log the result.
console.log(toObject(toMatrix(document.querySelector(".inflection-table"))))
/**
 * Gets an element's text, marking italics with underscores, removing footnotes
 * and extra white-space, and optionally removing non-Latin text.
 *
 * @param {Element} elem - Element whose `textContent` is read.
 * @param {boolean} [latinOnly] - When truthy, every character outside
 *   U+0020–U+00FF is dropped (white-space is kept so that words stay
 *   separated) and italics are not marked.
 * @returns {string} The cleaned, trimmed text.
 */
export function clean(elem, latinOnly) {
  let txt = elem.textContent;
  if (latinOnly) {
    // White-space must survive this filter: it is collapsed to single spaces
    // below, and deleting a newline or tab here would glue two words together.
    txt = txt.replace(/[^\s\x20-\xFF]+/gu, "");
  } else {
    // Mark italics (assuming at most one italic word): every occurrence of the
    // text of the first <i> element gets wrapped in underscores.
    const italic = elem.querySelector("i")?.textContent.trim();
    if (italic) txt = txt.replaceAll(italic, `_${italic}_`);
  }
  // Collapse each white-space run to one space. A footnote is assumed to be an
  // isolated single digit, so a digit enclosed by white-space is dropped with
  // the run. An em dash is replaced by a space as well.
  return txt.replace(/\s+(?:\d\s+)?|—/gu, " ").trim();
}
/**
 * Converts an HTML table to a matrix (array of rows), duplicating content
 * wherever a colspan or rowspan occurs so that a spanning cell is repeated in
 * every column/row it covers.
 *
 * Each matrix entry is one of:
 *  - a string: a qualifier (non-TD cell), reduced to its Latin text by `clean`;
 *  - an object: `{ alias }` for a cell containing an `i > span` element, or a
 *    TD cell keyed by `ipa` and/or `lang` attribute values (a key holds an
 *    array when several elements share the same lang);
 *  - null: a TD cell without any text;
 *  - undefined: a gap where a rowspan ended and the row had no cell left to fill it.
 *
 * @param {HTMLTableElement} table - Table to read; only `rows`, `cells` and
 *   attribute/selector lookups are used.
 * @returns {Array<Array<string|object|null|undefined>>} One array per retained row.
 */
export function toMatrix(table) {
  // Get a 2-letter lang attribute from anywhere in the whole table. To be used
  // when a data cell does not have an element with a lang attribute.
  const defaultLang = Array.from(table.querySelectorAll("[lang]"), elem => elem.getAttribute("lang"))
    .find(lang => lang.length === 2) ?? "text";

  // Extracts the content from a table cell, taking into account the tag
  // (td/th), IPA class, lang, row/colspan attributes, and italics.
  // Returns an array of duplicated entries (to reflect colspan).
  function get(cell) {
    const length = +(cell.getAttribute("colspan") ?? 1);
    const span = +(cell.getAttribute("rowspan") ?? 1);
    // Assume that the presence of italics means it is an alias:
    const hasAlias = !!cell.querySelector("i > span");
    // Assume that data is distinguished from qualifiers by use of TD instead of TH:
    const hasData = cell.tagName === "TD";
    let content = {}; // Data will be represented in an object
    if (hasAlias) {
      content.alias = clean(cell);
    } else if (hasData) {
      const ipa = cell.querySelector(".IPA");
      if (ipa) content.ipa = clean(ipa);
      for (const version of cell.querySelectorAll("[lang]")) {
        const lang = version.getAttribute("lang");
        let cleanVersion = clean(version);
        // A version right after "(" is an optional suffix of the previous
        // version for the same language, as in the Ancient Greek table with
        // "ἀμφεκᾰ́λυψε(ν)". Without a previous version there is nothing to
        // extend, so the text is kept as it is instead of being prefixed
        // with the string "undefined".
        const previous = Array.isArray(content[lang]) ? content[lang].at(-1) : content[lang];
        if (version.previousSibling?.textContent === "(" && previous !== undefined) {
          cleanVersion = previous + cleanVersion;
        }
        // If there are multiple data for the same language: make an array
        content[lang] = lang in content
          ? [].concat(content[lang], cleanVersion)
          : cleanVersion;
      }
      // If there is no IPA class, nor lang attribute, then assume that the
      // whole text of this cell is text in the target language
      if (!Object.keys(content).length) {
        const version = clean(cell);
        content = version ? { [defaultLang]: version } : null;
      }
    } else {
      // It's a qualifier: assign a string (not object).
      // Only keep Latin character content.
      content = clean(cell, true);
    }
    // Return with the content also the rowspan
    return Array.from({ length }, () => ({ content, span }));
  }

  const rows = [...table.rows];
  // Ignore first row if it is the clickable row to expand/collapse the rest of
  // the table. Optional chaining keeps an empty table (or empty row) from throwing.
  if (rows[0]?.cells[0]?.classList.contains("vsToggleElement")) rows.shift();
  // Ignore last row if it has notes
  if (rows.at(-1)?.cells[0]?.className.includes("-notes-")) rows.pop();

  // Entries of the previously processed row, each with its remaining rowspan.
  let carried;
  return rows.map(row => {
    // Get the current row, with colspans resolved into cells
    const fresh = Array.from(row.cells, get).flat();
    if (carried) {
      // Integrate this row by taking into account the rowspans of cells in
      // previous row(s): an entry whose span has not run out stays in its
      // column, any other column takes the next cell of this row (or a
      // content-less placeholder when the row has no cell left). Cells still
      // remaining afterwards are appended.
      carried = carried
        .map(cell => (--cell.span ? cell : fresh.shift() ?? { span: 1 }))
        .concat(fresh);
    } else {
      carried = fresh;
    }
    // Now that rowspans have served their purpose, just retain the content:
    return carried.map(cell => cell.content);
  });
}
/**
 * Converts a matrix (rows of cell contents, where a string is a qualifier and
 * an object is a data cell) to a flat list of objects: one object per
 * non-empty data cell, holding the cell's own properties plus a `true`-valued
 * key for every qualifier that applies to it.
 *
 * The qualifiers of a data cell are: the table qualifier (first column of the
 * most recent heading row that had one), the cells to the left of the first
 * data cell in its row, and the heading cells collected for its column.
 * Only non-empty strings become keys.
 *
 * @param {Array<Array<string|object|null|undefined>>} matrix - Rows to read;
 *   the rows are copied, the input is not modified.
 * @returns {Array<object>} The flattened data cells.
 */
export function toObject(matrix) {
  const result = [];
  const tabQualifier = [];
  const colQualifiers = [];
  // Empty cells, placeholders and non-string contents must not become keys
  // such as "", "undefined" or "[object Object]".
  const isQualifier = qual => typeof qual === "string" && qual !== "";
  let data = true; // Whether the previous row had data
  for (const [...row] of matrix) {
    // Find the column index where first data is found -- knowing that a data
    // element is represented by an object while a qualifier is a string
    const dataAt = row.findIndex(content => typeof content === "object");
    // Some heading rows have a TD element in first column (an inconsistency in
    // the table for Arabic), so only consider it data when in a non-first column
    const hasData = dataAt >= 1;
    if (!hasData) { // This is a heading with only qualifier(s)
      // If the qualifier is present in the first column, it's a table
      // qualifier (highest-level qualifier)
      if (row[0]) tabQualifier[0] = row[0];
      // If the previous row had data, we enter a new section... so reset the
      // column-qualifiers
      if (data) colQualifiers.length = 0;
      for (const [i, col] of row.entries()) (colQualifiers[i] ??= []).push(col);
    } else { // Row with data
      // For sections that have no column headers, create one dummy column qualifier
      if (!colQualifiers.length) colQualifiers[dataAt] = [];
      const qualifier = [...tabQualifier, ...row.splice(0, dataAt)];
      // For each column with data, join it with the qualifiers.
      // forEach is used on purpose: it skips holes left by the dummy entry.
      colQualifiers.slice(dataAt).forEach((colQualifier, i) => {
        const cell = row[i];
        // Skip empty cells, and also strings: spreading a string would create
        // one key per character.
        if (!cell || typeof cell !== "object") return;
        result.push({
          // Create keys with true as value for each of the relevant qualifiers
          ...Object.fromEntries(
            [...qualifier, ...colQualifier].filter(isQualifier).map(qual => [qual, true]),
          ),
          ...cell,
        });
      });
    }
    data = hasData;
  }
  return result;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment