Skip to content

Instantly share code, notes, and snippets.

@signalwerk
Last active July 12, 2024 14:52
Show Gist options
  • Save signalwerk/8262a30a3ac36efabd5d6f4d075790ab to your computer and use it in GitHub Desktop.
Save signalwerk/8262a30a3ac36efabd5d6f4d075790ab to your computer and use it in GitHub Desktop.
A tool to extract content from HTML, convert it to YAML, and update HTML content from YAML files.
// HTML to YAML Extractor and Updater
/**
* A tool that extracts content blocks from HTML files, stores them as Markdown inside YAML files, and writes the (possibly edited) YAML content back into the HTML.
*/
import cheerio from "cheerio";
import { promises as fs } from "fs";
import path from "path";
import yaml from "js-yaml";
import TurndownService from "turndown";
import { remark } from "remark";
import remarkHtml from "remark-html";
import prettier from "prettier";
// Shared converter used to turn extracted HTML fragments into Markdown.
const turndownService = new TurndownService();
// Directory the per-block YAML files are written to and read from.
const yamlDir = "./yamls";
// Selector for the content elements that get extracted; never reassigned, hence `const`.
const textDivsSelect = ".colMain > .wrap > .contentItem";
/**
 * Strips legacy Internet Explorer conditional comments from an HTML string.
 *
 * @param {string} html - Raw HTML markup.
 * @returns {string} The markup with the conditional-comment markers removed.
 */
function cleanHTML(html) {
  // Opening marker: "<!--[if ..." up to (and including) the first "-->" after it.
  const openingMarker = /<!--\[if[\s\S]*?-->/g;
  // Leftover closing marker such as "<!--<![endif]-->".
  const closingMarker = /<!--([^>]+?)endif\]-->/g;

  return html.replace(openingMarker, "").replace(closingMarker, "");
}
/**
 * Recursively walks a directory tree and invokes a callback for every ".html" file.
 *
 * Entries are handled one at a time in the order `fs.readdir` returns them;
 * each callback (and each nested walk) is awaited before the next entry.
 *
 * @param {string} dir - Directory to walk.
 * @param {Function} callback - Called as `callback(filePath, dir, root)` per HTML file.
 * @param {string} [root=dir] - Directory the walk started from; passed through unchanged.
 * @returns {Promise<void>} Resolves when the whole subtree has been processed.
 */
async function traverseDir(dir, callback, root = dir) {
  const entryNames = await fs.readdir(dir);

  for (const entryName of entryNames) {
    const entryPath = path.join(dir, entryName);
    // lstat does not follow symlinks, so a link to a directory is not descended into.
    const info = await fs.lstat(entryPath);

    if (info.isDirectory()) {
      await traverseDir(entryPath, callback, root);
      continue;
    }

    if (path.extname(entryName) !== ".html") {
      continue;
    }

    await callback(entryPath, dir, root);
  }
}
/**
 * Extracts the content blocks of one HTML file into YAML files.
 *
 * Every element matched by `textDivsSelect` (except those with a class starting
 * with "wrapsfcustom-") is converted to Markdown, written to
 * `<yamlDir>/<identifier>.yaml` under a `content` key, and tagged with a
 * `data-yaml="<identifier>"` attribute. The annotated HTML is then formatted
 * and written back to `filePath`.
 *
 * @param {string} filePath - HTML file to process; it is rewritten in place.
 * @param {string} doc - Directory containing the file. Unused; kept because
 *   `traverseDir` passes it as the second callback argument.
 * @param {string} root - Root of the traversal; identifiers are built from the
 *   file path relative to this directory.
 * @returns {Promise<void>} Resolves after all YAML files and the HTML file are written.
 */
async function extractYamlFromHtml(filePath, doc, root) {
  const NBSP_PLACEHOLDER = "zzzzz--nbsp--zzzzz";

  const rawHtml = await fs.readFile(filePath, "utf-8");
  const $ = cheerio.load(cleanHTML(rawHtml));

  // Keep only content items that carry no class starting with "wrapsfcustom-".
  const textDivs = $(textDivsSelect)
    .filter((index, element) => {
      const classes = $(element).attr("class") ?? "";
      return !classes
        .split(" ")
        .some((className) => className.startsWith("wrapsfcustom-"));
    })
    .toArray();

  // Identifier prefix: path relative to root, extension removed, separators -> "-".
  const relativePath = path.relative(root, filePath);
  const extension = path.extname(relativePath);
  const stem = extension
    ? relativePath.slice(0, -extension.length)
    : relativePath;
  const baseIdentifier = stem.replaceAll(path.sep, "-");

  // Make sure the output directory exists so the first write cannot fail with ENOENT.
  await fs.mkdir(yamlDir, { recursive: true });

  // A plain loop (instead of an async callback in `.each`) so every write is
  // awaited and any failure rejects this function instead of floating.
  for (const [i, el] of textDivs.entries()) {
    // Swap literal "&nbsp;" for a placeholder around the Markdown conversion
    // and restore it afterwards (presumably so the entity survives Turndown).
    const textContent = $(el).html().replaceAll("&nbsp;", NBSP_PLACEHOLDER);
    const markdown = turndownService
      .turndown(textContent)
      .replaceAll(NBSP_PLACEHOLDER, "&nbsp;");

    // `await` keeps this correct whether prettier.format returns a string
    // (Prettier 2) or a Promise (Prettier 3).
    const prettyMarkdown = await prettier.format(markdown, {
      parser: "markdown",
    });
    const yamlContent = await prettier.format(
      yaml.dump({ content: prettyMarkdown }),
      { parser: "yaml" },
    );

    const fullIdentifier = `${baseIdentifier}-${i}`;
    const yamlPath = path.join(yamlDir, `${fullIdentifier}.yaml`);

    // Link the element to its YAML file so the update step can find it.
    $(el).attr("data-yaml", fullIdentifier);
    await fs.writeFile(yamlPath, yamlContent, "utf-8");
    console.log(`Wrote "${yamlPath}"`);
  }

  const formattedHtml = await prettier.format($.html(), { parser: "html" });
  await fs.writeFile(filePath, formattedHtml, "utf-8");
}
/**
 * Rebuilds the content of one HTML file from its YAML files.
 *
 * For every element carrying a `data-yaml` attribute, the matching
 * `<yamlDir>/<identifier>.yaml` file is read, its `content` value is rendered
 * from Markdown to HTML, and the result replaces the element's inner HTML.
 * The document is then formatted and written back to `filePath`.
 *
 * @param {string} filePath - HTML file to update; it is rewritten in place.
 * @param {string} doc - Directory containing the file. Unused; kept because
 *   `traverseDir` passes it as the second callback argument.
 * @param {string} root - Root of the traversal. Unused here.
 * @returns {Promise<void>} Resolves after the HTML file is written.
 * @throws {Error} If a referenced YAML file is empty; read/parse errors propagate.
 */
async function updateHtmlFromYaml(filePath, doc, root) {
  const html = await fs.readFile(filePath, "utf-8");
  const $ = cheerio.load(html);
  const targets = $("[data-yaml]").toArray();

  // The YAML files are independent of each other, so load and render them in parallel.
  await Promise.all(
    targets.map(async (el) => {
      const identifier = $(el).attr("data-yaml").trim();
      const yamlPath = path.join(yamlDir, `${identifier}.yaml`);
      const yamlContent = await fs.readFile(yamlPath, "utf-8");

      // NOTE(review): files are expected to come from the extract step; if they
      // may be untrusted, confirm the installed js-yaml's `load` uses a safe schema.
      const data = yaml.load(yamlContent);
      if (data == null) {
        throw new Error(`YAML file "${yamlPath}" is empty`);
      }

      const rendered = await remark().use(remarkHtml).process(data.content);
      const convertedHtml = rendered.toString();
      console.log(`set from "${yamlPath}"`);
      $(el).html(convertedHtml);
    }),
  );

  // `await` keeps this correct whether prettier.format returns a string
  // (Prettier 2) or a Promise (Prettier 3).
  const formattedHtml = await prettier.format($.html(), { parser: "html" });
  await fs.writeFile(filePath, formattedHtml, "utf-8");
}
// Command-line entry point: `--extract <directory>` or `--update <directory>`.
// When both flags are present, `--extract` wins (it is listed first).
const cliCommands = [
  ["--extract", extractYamlFromHtml],
  ["--update", updateHtmlFromYaml],
];
const [cliFlag, cliHandler] =
  cliCommands.find(([flag]) => process.argv.includes(flag)) ?? [];
// The directory is the argument immediately following the flag.
const cliDir = cliFlag
  ? process.argv[process.argv.indexOf(cliFlag) + 1]
  : undefined;

if (!cliHandler || cliDir === undefined) {
  console.error(
    "Please specify a command: --extract <directory> or --update <directory>",
  );
  process.exit(1);
}

// Report failures and exit non-zero instead of leaving the promise floating.
traverseDir(cliDir, cliHandler).catch((error) => {
  console.error(error);
  process.exitCode = 1;
});
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment