Skip to content

Instantly share code, notes, and snippets.

@jasenmichael
Last active January 11, 2022 02:59
Show Gist options
  • Save jasenmichael/ebfa5f1c411a9f3fd735d2bb342fb056 to your computer and use it in GitHub Desktop.
Save jasenmichael/ebfa5f1c411a9f3fd735d2bb342fb056 to your computer and use it in GitHub Desktop.
recursively convert html to markdown using nodejs
const fs = require("fs");
const path = require("path");
const glob = require("glob");
const TurndownService = require("turndown");
// Conversion settings handed to Turndown: ATX headings, "-" as the bullet
// marker, and links written in the "referenced" style with "full" references.
const turndownOptions = {
  headingStyle: "atx",
  bulletListMarker: "-",
  linkStyle: "referenced",
  linkReferenceStyle: "full",
};

// Single shared converter instance used for every HTML file.
const turndownService = new TurndownService(turndownOptions);
/** Base URL inserted after every "]: " so reference-link targets become absolute. */
const SITE_BASE_URL = "https://www.goodwillcentraltexas.org/";

/**
 * Marker text. When a line contains it, only the text following its first
 * occurrence (up to any second occurrence) is kept.
 */
const TRACKING_SNIPPET = 'turn\\_client\\_track\\_id = "";';

/** Lines that are dropped when their trimmed text equals one of these. */
const NAV_LINES = new Set(["- Search", "Search", "- Previous", "- |", "- Next"]);

/** Lines are dropped when the untrimmed line starts with one of these. */
const JUNK_PREFIXES = [
  "window.dataLayer",
  "!function",
  "![](",
  "function ",
  "<img ",
  "### _!",
  "// ",
];

/**
 * Matches a reference-style link with empty text, e.g. "[][3]".
 * Deliberately has no `g` flag: a global regex used with `.test()` keeps
 * state in `lastIndex`, which would give wrong answers now that it is shared.
 */
const EMPTY_REF_LINK = /\[\]\[(.*?)\]/;

/**
 * Normalises a single converted line.
 *
 * @param {string} line - One line of Turndown output (no trailing "\n").
 * @returns {string} The line without a trailing ".html" and without the
 *   tracking-snippet prefix.
 */
function cleanLine(line) {
  // Strip the suffix itself; String#replace would remove the FIRST ".html"
  // in the line, which is wrong when the line contains more than one.
  const withoutExt = line.endsWith(".html")
    ? line.slice(0, -".html".length)
    : line;
  return withoutExt.includes(TRACKING_SNIPPET)
    ? withoutExt.split(TRACKING_SNIPPET)[1]
    : withoutExt;
}

/**
 * Decides whether a cleaned line should be kept in the output.
 *
 * @param {string} line - A line returned by `cleanLine`.
 * @returns {boolean} `false` for navigation entries, empty-text reference
 *   links, and lines starting with one of the junk prefixes.
 */
function isContentLine(line) {
  const trimmed = line.trim();
  // The former `line !== "\r\n"` check is gone: lines come from split("\n")
  // and therefore can never contain "\n".
  return (
    line !== " " &&
    !NAV_LINES.has(trimmed) &&
    !EMPTY_REF_LINK.test(line) &&
    !trimmed.startsWith("jQuery(function()") &&
    !JUNK_PREFIXES.some((prefix) => line.startsWith(prefix))
  );
}

/**
 * Builds the final file content for one page.
 *
 * @param {string} file - Path of the source HTML file as returned by glob.
 * @param {string} markdown - Turndown output for that file.
 * @returns {string} A header line (the path with "site/" replaced by
 *   "http://") followed by the cleaned body, joined with CRLF.
 */
function toMarkdownDocument(file, markdown) {
  const body = markdown
    .split("\n")
    .map((line) => cleanLine(line))
    .filter((line) => isContentLine(line))
    .join("\r\n")
    .replace(/ \r\n /g, " ")
    // Make every reference-link target absolute...
    .replace(/]: /g, `]: ${SITE_BASE_URL}`)
    .replace(/- - /g, "- ")
    // NOTE(review): as written the next two replacements are no-ops (pattern
    // equals replacement). Possibly runs of spaces were lost when this file
    // was copied; confirm the intended patterns.
    .replace(/ - /g, " - ")
    .replace(/- /g, "- ")
    // ...then undo the prefix where the target already was an absolute URL.
    .replace(/https:\/\/www\.goodwillcentraltexas\.org\/http/g, "http")
    // NOTE(review): a string pattern only replaces the FIRST such run.
    .replace(
      "\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n",
      "\r\n"
    )
    .replace(/\r\n\r\n\r\n/g, "\r\n\r\n")
    // NOTE(review): no-op as written; kept verbatim pending confirmation.
    .replace(/\r\n\r\n/g, "\r\n\r\n");

  // The replacements above apply to the body only, never to the header line.
  return `${file.replace("site/", "http://")}\r\n${body}`;
}

// Entry point: convert every HTML file under site/ into out/site/**/<name>.md.
glob("site/**/*.html", (err, files) => {
  if (err) {
    // Stop here: `files` is not usable on failure, and carrying on used to
    // crash with a TypeError when reading `files.length`.
    console.error(err);
    process.exitCode = 1;
    return;
  }

  const matched = files || [];
  matched.forEach((file) => {
    // `recursive: true` already tolerates an existing directory, so no
    // separate existence check is needed.
    fs.mkdirSync(path.join("out", path.dirname(file)), { recursive: true });
    console.log(file);

    const html = fs.readFileSync(file).toString();
    const text = toMarkdownDocument(file, turndownService.turndown(html));

    // Swap only the trailing extension; a plain string replace would rewrite
    // the first ".html" in the path, which may belong to a directory name.
    fs.writeFileSync(path.join("out", file.replace(/\.html$/, ".md")), text);
  });

  console.log(`complete! processed: ${matched.length} files`);
});
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment