Skip to content

Instantly share code, notes, and snippets.

@a-churchill
Created April 8, 2024 22:09
Show Gist options
  • Save a-churchill/a562e1857364649f88aff03102982fba to your computer and use it in GitHub Desktop.
Save a-churchill/a562e1857364649f88aff03102982fba to your computer and use it in GitHub Desktop.
Importing Markdown to Sanity

Importing Markdown to Sanity

This Gist contains code we used at Causal to import our old Markdown documentation content into Sanity. See more on our blog!

// This script is used to convert Markdown files (e.g. as generated by GitBook) to a format that we
// can import into Sanity.
import { htmlToBlocks, randomKey } from "@sanity/block-tools";
import { createClient } from "@sanity/client";
import { readFile, writeFile } from "fs/promises";
import { JSDOM } from "jsdom";
import { toString } from "mdast-util-to-string";
import { basename, extname } from "path";
import { resolve } from "path";
import rehypeRaw from "rehype-raw";
import rehypeStringify from "rehype-stringify";
import remarkExtractFrontmatter from "remark-extract-frontmatter";
import remarkFrontmatter from "remark-frontmatter";
import remarkGfm from "remark-gfm";
import remarkParse from "remark-parse";
import remarkRehype from "remark-rehype";
import { unified } from "unified";
import { EXIT, visitParents } from "unist-util-visit-parents";
import { parse } from "yaml";
import { schema } from "./schema.mjs";
// Sanity client used to look up image assets that already exist in the dataset.
// The API token is read from the SANITY_TOKEN environment variable.
const client = createClient({
  projectId: "xxxxxxxx",
  dataset: "production",
  apiVersion: "2021-08-31",
  token: process.env.SANITY_TOKEN,
});

/**
 * Promise resolving to a Map from an image's original filename to its asset ID in Sanity.
 * The query is started once, when the module loads; consumers `await` this value.
 */
const imagesByOriginalFilename = client
  .fetch('*[_type == "sanity.imageAsset"]{_id, originalFilename}')
  .then(assets => {
    const idsByFilename = new Map();
    for (const asset of assets) {
      idsByFilename.set(asset.originalFilename, asset._id);
    }
    return idsByFilename;
  });

// The compiled schema type of the "content" field on "docPage", i.e. the block array type
// that is handed to htmlToBlocks.
const { type: blockContentType } = schema
  .get("docPage")
  .fields.find(({ name }) => name === "content");
/**
 * Upper-cases the first character of a string and leaves the rest untouched.
 *
 * @param {string} string - Text to capitalize.
 * @returns {string} The capitalized text; the empty string is returned unchanged.
 */
function capitalize(string) {
  return string === "" ? string : `${string.charAt(0).toUpperCase()}${string.slice(1)}`;
}
/**
 * Converts a Markdown string to an HTML string using a unified/remark/rehype pipeline.
 *
 * @param {string} markdown - Markdown source text.
 * @returns {Promise<{ html: string, data: object }>} `html` is the processed file's value and
 *   `data` is the processed file's data object (it carries `title`, set by `extractTitle`,
 *   plus whatever the frontmatter plugins put there).
 */
async function convertMarkdownToHtml(markdown) {
  /**
   * unified plugin: stores the text of the first depth-1 heading on `file.data.title`
   * (empty string when there is none) and removes that heading from the tree.
   */
  const extractTitle = () => (tree, file) => {
    let title = "";
    visitParents(tree, "heading", (heading, ancestors) => {
      if (heading.depth !== 1) return undefined;
      title = toString(heading);
      // The direct parent is the last entry of the ancestor chain.
      const parent = ancestors[ancestors.length - 1];
      parent.children = parent.children.filter(child => child !== heading);
      // Only the first top-level heading is treated as the title.
      return EXIT;
    });
    file.data.title = title;
  };

  const processor = unified()
    .use(extractTitle)
    .use(remarkParse)
    .use(remarkGfm) // we need this to support tables
    .use(remarkFrontmatter, { type: "yaml", marker: "-" })
    .use(remarkExtractFrontmatter, { yaml: parse })
    .use(remarkRehype, { allowDangerousHtml: true })
    .use(rehypeRaw)
    .use(rehypeStringify);

  const { value, data } = await processor.process(markdown);
  return { html: value, data };
}
/**
 * Reads the Markdown file at `filePath`, rewrites the GitBook template tags that the Markdown
 * parser would otherwise mishandle, and converts the result with `convertMarkdownToHtml`.
 *
 * @param {string} filePath - Path of the Markdown file to read (as UTF-8).
 * @returns {Promise<{ html: string, data: object }>} Whatever `convertMarkdownToHtml` returns.
 */
async function convertMarkdownFileToHtml(filePath) {
  console.error("Parsing file: ", filePath);
  const rawText = await readFile(filePath, "utf-8");

  // [search, replacement] pairs, applied in order to every occurrence.
  const substitutions = [
    // Hints with markdown inside them won't get parsed correctly unless we first wrap them
    // in an element.
    ["{% hint style=", "<aside>{% hint style="],
    // Without this, markdown right after the hint won't be parsed.
    ['" %}', '" %}\n'],
    ["{% endhint %}", "{% endhint %}</aside>"],
    // Remove the endembed tag, which we ignore anyway.
    ["{% endembed %}", ""],
  ];

  const processedText = substitutions.reduce(
    (text, [search, replacement]) => text.replaceAll(search, replacement),
    rawText,
  );
  return convertMarkdownToHtml(processedText);
}
/**
 * Derives the section, optional parent page and slug of a page from its file path.
 *
 * Only the part of the path after "causal-docs/" is considered. Supported layouts:
 *   - `<section>/<page>.md`           -> section, slug = page
 *   - `<section>/<page>/README.md`    -> section, slug = page (the directory name)
 *   - `<section>/<parent>/<page>.md`  -> section, parentPage = parent, slug = page
 *
 * @param {string} filePath - Path of the markdown file.
 * @returns {{ section: string, parentPage: (string|undefined), slug: string }}
 * @throws {Error} When the path is not under "causal-docs/", has no section, is a README
 *   directly inside a section, or is nested more than three levels deep.
 */
function parseDocPath(filePath) {
  const path = filePath.split("causal-docs/")[1];
  if (path == null) {
    // Fail with a descriptive error instead of a TypeError from `undefined.split`.
    console.error("Path is not inside causal-docs/: ", filePath);
    throw new Error("Path is not inside causal-docs/");
  }
  const pathComponents = path.split("/");
  if (pathComponents.length < 2) {
    console.error("Path doesn't have section: ", path);
    throw new Error("Path doesn't have section");
  }
  if (pathComponents.length === 2 && pathComponents[1] === "README.md") {
    console.error("Section cannot have README");
    throw new Error("Section cannot have README");
  }
  if (pathComponents.length > 3) {
    console.error("Path is too deep: ", path);
    throw new Error("Path is too deep");
  }
  let [section, slug] = pathComponents;
  let parentPage;
  if (pathComponents.length === 3 && pathComponents[2] !== "README.md") {
    [section, parentPage, slug] = pathComponents;
  }
  return { section, parentPage, slug: basename(slug, extname(slug)) };
}

/**
 * Converts the HTML of one documentation page into a Sanity document object.
 *
 * @param {object} args
 * @param {string} args.filePath - Path of the source markdown file; determines section,
 *   parent page and slug (see `parseDocPath`).
 * @param {string} args.html - HTML produced from the markdown file.
 * @param {object} args.data - Data extracted while parsing; `title` and `description` are read.
 * @param {Set<string>} args.sectionSlugs - Collector; this page's section is added to it.
 * @param {string} args.imagesPath - Directory used to build `file://` references for images
 *   that are not yet known to Sanity.
 * @param {string[]} args.files - All file paths being imported; used to decide whether a
 *   relative link points at another imported page.
 * @returns {Promise<object>} A "resource" document when the section is "resources", otherwise
 *   a "docPage" document with `parentSection` and (optionally) `parentPage` references.
 * @throws {Error} When the file path is not a supported layout, or a hint block cannot be parsed.
 */
async function convertHtmlToSanity({ filePath, html, data, sectionSlugs, imagesPath, files }) {
  // Validate the path first so that bad input fails before waiting on the image lookup.
  const { section, parentPage, slug } = parseDocPath(filePath);
  sectionSlugs.add(section);
  console.error("Converting to Sanity document: ", JSON.stringify({ section, parentPage, slug }));

  const images = await imagesByOriginalFilename;
  const { title, description } = data;

  // Shared by every htmlToBlocks call below.
  const parseHtml = markup => new JSDOM(markup).window.document;
  // Built once so that each link is resolved with a Set lookup rather than a scan of `files`.
  // NOTE(review): a README.md file contributes the slug "README" here, while its document ID
  // is its directory name — links to README pages may need separate handling; confirm.
  const knownSlugs = new Set(files.map(file => basename(file, extname(file))));

  const htmlToBlocksRules = [
    {
      // Code blocks
      deserialize: (el, next, block) => {
        if (el.tagName?.toLowerCase() !== "pre") return undefined;
        const codeNode = el.children[0];
        // Prefer the text inside a leading <code> child; otherwise use the <pre> contents.
        const childNodes =
          codeNode && codeNode.tagName.toLowerCase() === "code"
            ? codeNode.childNodes
            : el.childNodes;
        const code = Array.from(childNodes, node => node.textContent).join("");
        return block({ _type: "code", code });
      },
    },
    {
      // Loom/YouTube embed blocks
      deserialize: (el, next, block) => {
        if (el.tagName?.toLowerCase() !== "p") return undefined;
        if (!el.textContent?.startsWith('{% embed url="')) return undefined;
        const url = el.textContent.replace('{% embed url="', "").replace('" %}', "");
        if (url.startsWith("https://www.youtube.com")) {
          return block({ _type: "youtube", url });
        }
        if (url.startsWith("https://www.loom.com")) {
          return block({ _type: "loom", url });
        }
        console.warn(`Unknown embed URL: ${url}`);
        return undefined;
      },
    },
    {
      // Images and figures
      deserialize: (el, next, block) => {
        const tag = el.tagName?.toLowerCase();
        if (tag !== "img" && tag !== "figure") return undefined;
        const img = tag === "img" ? el : el.querySelector("img");
        const src = img?.getAttribute("src");
        if (src == null) {
          // Same policy as <details> without <summary>: warn and leave the element to the
          // remaining rules instead of crashing on a missing <img> or src attribute.
          console.warn("Image or figure without an image source");
          return undefined;
        }
        const caption = tag === "img" ? "" : (el.querySelector("figcaption")?.textContent ?? "");
        const filename = basename(decodeURIComponent(src));
        if (src.startsWith("http")) {
          // download the image to the images path
          console.warn(`❗ Download image before import: ${src}`);
        }
        const imageId = images.get(filename);
        if (imageId == null) {
          // Not in Sanity yet: reference the local file so it is uploaded during import.
          const localPath = resolve(imagesPath, filename);
          return block({ _type: "image", caption, _sanityAsset: `image@file://${localPath}` });
        }
        return block({ _type: "image", caption, asset: { _type: "reference", _ref: imageId } });
      },
    },
    {
      // Hint blocks
      deserialize: (el, next, block) => {
        if (el.tagName?.toLowerCase() !== "aside") return undefined;
        if (!el.textContent?.startsWith('{% hint style="')) return undefined;
        const regex = /\{% hint style="(\w+)" %} (.*) {% endhint %\}(.*)/;
        try {
          // A failed match yields null, so the destructuring throws and lands in the catch.
          const [, style, ...text] = el.innerHTML.match(regex);
          const content = htmlToBlocks(`<div>${text.join("")}</div>`, blockContentType, {
            parseHtml,
            rules: htmlToBlocksRules,
          });
          return block({ _type: "callout", style, content });
        } catch (e) {
          console.warn(`Failed to parse hint: ${el.innerHTML}`);
          throw e;
        }
      },
    },
    {
      // Summary/Detail blocks
      deserialize: (el, next, block) => {
        if (el.tagName?.toLowerCase() !== "details") return undefined;
        const summaryElement = el.querySelector("summary");
        if (summaryElement == null) {
          console.warn("Details element without summary");
          return undefined;
        }
        const title = summaryElement.textContent;
        // Drop the summary so that only the body is converted into the collapsible content.
        el.removeChild(summaryElement);
        const content = htmlToBlocks(el.innerHTML, blockContentType, {
          rules: htmlToBlocksRules,
          parseHtml,
        });
        return block({ _type: "collapsible", title, content });
      },
    },
    {
      // "{% content-ref" blocks: replaced by the result of converting an empty document.
      deserialize: (el, next, block) => {
        if (el.tagName?.toLowerCase() !== "p") return undefined;
        if (!el.textContent?.startsWith("{% content-ref")) return undefined;
        return htmlToBlocks("", blockContentType, { parseHtml });
      },
    },
    {
      // Links to other documentation pages
      deserialize: (el, next) => {
        if (el.tagName?.toLowerCase() !== "a") return undefined;
        const rawHref = el.getAttribute("href");
        if (rawHref == null) return undefined;
        if (rawHref.startsWith("http")) return undefined; // default handling is fine
        // GitBook uses broken-reference to indicate a link to a non-existent page
        if (rawHref === "broken-reference") return next(el.childNodes);
        // Ignore the fragment and a trailing slash when deriving the target slug.
        let href = rawHref.split("#")[0];
        if (href.endsWith("/")) {
          href = href.slice(0, -1);
        }
        const targetSlug = basename(href, extname(href));
        if (!knownSlugs.has(targetSlug)) return undefined;
        // The "__annotation" shape was found by reading the block-tools source code.
        return {
          _type: "__annotation",
          markDef: {
            _key: randomKey(12),
            _type: "internalLink",
            reference: { _type: "reference", _ref: targetSlug },
          },
          children: next(el.childNodes),
        };
      },
    },
    {
      // Tables
      deserialize: (el, next, block) => {
        if (el.tagName?.toLowerCase() !== "table") return undefined;
        const headers = el.querySelectorAll("thead th");
        const columns = [...headers].map(th => th.textContent);
        const rows = el.querySelectorAll("tbody tr");
        const bodyRows = [...rows].map(row => {
          const cells = row.querySelectorAll("td");
          // Cells beyond the number of header columns are dropped.
          return [...cells].slice(0, columns.length).map(cell => cell.textContent);
        });
        return block({
          _type: "table",
          rows: [columns, ...bodyRows].map(cells => ({ _type: "tableRow", cells })),
        });
      },
    },
    // Manual:
    // - Tabs
  ];

  // NOTE(review): the fourth argument is kept from the original call; htmlToBlocks presumably
  // only reads the first three — confirm before removing it.
  const portableText = htmlToBlocks(
    html,
    blockContentType,
    { rules: htmlToBlocksRules, parseHtml },
    { allowedDecorators: true },
  );

  const isResourcePage = section === "resources";
  return {
    _type: isResourcePage ? "resource" : "docPage",
    _id: slug,
    title,
    description,
    slug: { _type: "slug", current: slug },
    content: portableText,
    ...(isResourcePage
      ? {}
      : {
          parentSection: { _type: "reference", _ref: section },
          parentPage: parentPage != null ? { _type: "reference", _ref: parentPage } : undefined,
        }),
  };
}
/**
 * Command-line entry point.
 *
 * Usage: importMarkdown.mjs <path-to-images> <path> [...<path>]
 *
 * For every markdown path given: converts it to HTML (written next to the source as
 * `<path>.html`), converts that to a Sanity document (written as `<path>.json`), and finally
 * writes all section documents followed by all page documents, one JSON object per line,
 * to `sanity-import.ndjson` in the current working directory.
 * Exits with status 1 when fewer than two arguments are supplied.
 */
async function main() {
  if (process.argv.length < 4) {
    console.error("Usage: importMarkdown.mjs <path-to-images> <path> [...<path>]");
    process.exit(1);
  }
  const [imagesPath, ...files] = process.argv.slice(2);

  // Filled in by convertHtmlToSanity with the section of every converted page.
  const sectionSlugs = new Set();
  const docPages = await Promise.all(
    files.map(async filePath => {
      const { html, data } = await convertMarkdownFileToHtml(filePath);
      await writeFile(`${filePath}.html`, html);
      const doc = await convertHtmlToSanity({
        filePath,
        html,
        data,
        sectionSlugs,
        imagesPath,
        files,
      });
      await writeFile(`${filePath}.json`, JSON.stringify(doc, null, 2));
      return doc;
    }),
  );

  // One "docSection" document per distinct section; the title is the slug with dashes turned
  // into spaces and every word except "and" capitalized.
  const sections = [...sectionSlugs].map(slug => ({
    _type: "docSection",
    _id: slug,
    title: slug
      .split("-")
      .map(word => (word === "and" ? word : capitalize(word)))
      .join(" "),
    slug: { _type: "slug", current: slug },
  }));
  console.error(`Converted ${docPages.length} documents`);
  console.error(`Found ${sections.length} sections`);

  // NDJSON: sections first, then pages, each followed by a newline.
  const result = [...sections, ...docPages].map(doc => `${JSON.stringify(doc)}\n`).join("");
  await writeFile("sanity-import.ndjson", result);
  console.error("Wrote sanity-import.ndjson");
}

// Handle failures explicitly instead of leaving the promise floating.
main().catch(error => {
  console.error(error);
  process.exitCode = 1;
});
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment