Created April 8, 2024 22:09
Importing Markdown to Sanity

This Gist contains code we used at Causal to import our old Markdown documentation content into Sanity. See more on our blog!

// This script is used to convert Markdown files (e.g. as generated by GitBook) to a format that we
// can import into Sanity.
import { htmlToBlocks, randomKey } from "@sanity/block-tools";
import { createClient } from "@sanity/client";
import { readFile, writeFile } from "fs/promises";
import { JSDOM } from "jsdom";
import { toString } from "mdast-util-to-string";
import { basename, extname } from "path";
import { resolve } from "path";
import rehypeRaw from "rehype-raw";
import rehypeStringify from "rehype-stringify";
import remarkExtractFrontmatter from "remark-extract-frontmatter";
import remarkFrontmatter from "remark-frontmatter";
import remarkGfm from "remark-gfm";
import remarkParse from "remark-parse";
import remarkRehype from "remark-rehype";
import { unified } from "unified";
import { EXIT, visitParents } from "unist-util-visit-parents";
import { parse } from "yaml";
import { schema } from "./schema.mjs";
/**
 * Sanity client for the target project/dataset. The API token is read from the
 * SANITY_TOKEN environment variable.
 */
const client = createClient({
  apiVersion: "2021-08-31",
  projectId: "xxxxxxxx",
  dataset: "production",
  token: process.env.SANITY_TOKEN,
});
/**
 * Promise resolving to a Map from an image's original filename to its asset ID in Sanity.
 * The query is started once, when this module is loaded; consumers `await` the promise.
 */
const imagesByOriginalFilename = client
  .fetch('*[_type == "sanity.imageAsset"]{_id, originalFilename}')
  .then(images => new Map(images.map(image => [image.originalFilename, image._id])));
// The compiled schema type for the content type that holds the block array:
// the type of the "content" field on the "docPage" document type.
const blockContentType = schema
  .get("docPage")
  .fields.find(field => field.name === "content").type;
/**
 * Upper-cases the first character of `string` and leaves the rest untouched.
 *
 * @param {string} string - Text to capitalize.
 * @returns {string} The capitalized text; the empty string is returned as-is.
 */
function capitalize(string) {
  if (string === "") return string;
  return string.charAt(0).toUpperCase() + string.slice(1);
}
/**
 * Converts a Markdown string to HTML.
 *
 * @param {string} markdown - Markdown source, optionally starting with YAML frontmatter.
 * @returns {Promise<{ html: string, data: object }>} The rendered HTML plus the data collected
 *   while processing (frontmatter fields and the extracted `title`).
 */
async function convertMarkdownToHtml(markdown) {
  /** Puts the title of the document on the data property, and removes it from the document. */
  function extractTitle() {
    return (tree, file) => {
      let title = "";
      visitParents(tree, "heading", (heading, ancestors) => {
        if (heading.depth === 1) {
          title = toString(heading);
          // Drop the heading from its direct parent so it is not rendered into the body.
          const parent = ancestors[ancestors.length - 1];
          parent.children = parent.children.filter(n => n !== heading);
          // Only the first level-1 heading is treated as the title.
          return EXIT;
        }
        return undefined;
      });
      // NOTE(review): reconstructed assignment target; `title` is later read from the
      // returned `data`, so it is stored on the file's data object.
      file.data.title = title;
    };
  }

  // NOTE(review): the parse / extractTitle / rehypeRaw / rehypeStringify / process steps were
  // restored from the file's otherwise unused imports — confirm the plugin order.
  const result = await unified()
    .use(remarkParse)
    .use(remarkGfm) // we need this to support tables
    .use(remarkFrontmatter, { type: "yaml", marker: "-" })
    .use(remarkExtractFrontmatter, { yaml: parse })
    .use(extractTitle)
    .use(remarkRehype, { allowDangerousHtml: true })
    .use(rehypeRaw) // re-parses the raw HTML that allowDangerousHtml lets through
    .use(rehypeStringify)
    .process(markdown);

  return { html: result.value, data: result.data };
}
/**
 * Reads a file at the given path and returns an object `{ html, data }`.
 *
 * Before conversion, the hint/embed template tags in the text are rewritten so that the
 * Markdown around them is parsed correctly.
 *
 * @param {string} filePath - Path of the Markdown file to read (as UTF-8).
 * @returns {Promise<{ html: string, data: object }>} Result of convertMarkdownToHtml.
 */
async function convertMarkdownFileToHtml(filePath) {
  console.error("Parsing file: ", filePath);
  const rawText = await readFile(filePath, "utf-8");
  const processedText = rawText
    // Hints with markdown inside them won't get parsed correctly unless we first wrap them in
    // an <aside> element.
    .replaceAll("{% hint style=", "<aside>{% hint style=")
    .replaceAll('" %}', '" %}\n') // without this, markdown right after the hint won't be parsed
    .replaceAll("{% endhint %}", "{% endhint %}</aside>")
    .replaceAll("{% endembed %}", ""); // remove the endembed tag which we ignore anyway
  return convertMarkdownToHtml(processedText);
}
/**
 * Converts one page's HTML into a Sanity document ("docPage" or "resource").
 *
 * @param {object} args
 * @param {string} args.filePath - Path of the source file; the part after "causal-docs/" must
 *   look like `<section>/<page>` or `<section>/<parentPage>/<page>`.
 * @param {string} args.html - HTML of the page.
 * @param {object} args.data - Page metadata; `title` and `description` are read from it.
 * @param {Set<string>} args.sectionSlugs - Set that collects the section slugs seen so far.
 * @param {string} args.imagesPath - Directory that not-yet-uploaded images are resolved against.
 * @param {string[]} args.files - All file paths being imported; used to recognise internal links.
 * @returns {Promise<object>} The Sanity document for this page.
 * @throws {Error} If the path has no section, is a section-level README, or is nested too deep.
 */
async function convertHtmlToSanity({ filePath, html, data, sectionSlugs, imagesPath, files }) {
  const images = await imagesByOriginalFilename;
  const { title, description } = data;

  const path = filePath.split("causal-docs/")[1];
  const pathComponents = path.split("/");
  // NOTE(review): the "README.md" literals below are reconstructed (the source had empty
  // strings in their place) — confirm against the original script.
  if (pathComponents.length < 2) {
    console.error("Path doesn't have section: ", path);
    throw new Error("Path doesn't have section");
  } else if (pathComponents.length === 2 && pathComponents[1] === "README.md") {
    console.error("Section cannot have README");
    throw new Error("Section cannot have README");
  } else if (pathComponents.length > 3) {
    console.error("Path is too deep: ", path);
    throw new Error("Path is too deep");
  }

  let [section, slug] = pathComponents;
  let parentPage = undefined;
  if (pathComponents.length === 3 && pathComponents[2] !== "README.md") {
    [section, parentPage, slug] = pathComponents;
  }
  slug = basename(slug, extname(slug));
  // NOTE(review): reconstructed — nothing else in the visible code populates this set, and
  // main() builds the section documents from it.
  sectionSlugs.add(section);
  console.error("Converting to Sanity document: ", JSON.stringify({ section, parentPage, slug }));

  const parseHtml = markup => new JSDOM(markup).window.document;

  const htmlToBlocksRules = [
    // Code blocks
    {
      deserialize: (el, next, block) => {
        if (el.tagName?.toLowerCase() !== "pre") return undefined;
        // Use the text inside <pre><code> when present, otherwise the <pre> contents.
        const codeNode = el.children[0];
        const childNodes =
          codeNode && codeNode.tagName.toLowerCase() === "code"
            ? codeNode.childNodes
            : el.childNodes;
        let code = "";
        childNodes.forEach(node => {
          code += node.textContent;
        });
        return block({ _type: "code", code });
      },
    },
    // Loom/YouTube embed blocks
    {
      deserialize: (el, next, block) => {
        if (el.tagName?.toLowerCase() !== "p") return undefined;
        if (!el.textContent?.startsWith('{% embed url="')) return undefined;
        const url = el.textContent.replace('{% embed url="', "").replace('" %}', "");
        // NOTE(review): both URL prefixes are reconstructed (the source had empty strings,
        // which would make the first branch always match) — confirm the exact prefixes.
        if (url.startsWith("https://www.youtube.com")) {
          return block({ _type: "youtube", url });
        } else if (url.startsWith("https://www.loom.com")) {
          return block({ _type: "loom", url });
        } else {
          console.warn(`Unknown embed URL: ${url}`);
          return undefined;
        }
      },
    },
    // Images and figures
    {
      deserialize: (el, next, block) => {
        const tag = el.tagName?.toLowerCase();
        if (tag !== "img" && tag !== "figure") return undefined;
        let src, caption;
        if (tag === "img") {
          src = el.getAttribute("src");
          caption = "";
        } else {
          const img = el.querySelector("img");
          src = img?.getAttribute("src");
          caption = el.querySelector("figcaption")?.textContent ?? "";
        }
        // A figure without an <img>, or an image without a src, cannot be imported.
        if (src == null) return undefined;
        const filename = basename(decodeURIComponent(src));
        if (src.startsWith("http")) {
          // download the image to the images path
          console.warn(`❗ Download image before import: ${src}`);
        }
        const imageId = images.get(filename);
        if (imageId == null) {
          // Not uploaded yet: point at the local file instead of an existing asset.
          const localPath = resolve(imagesPath, filename);
          return block({ _type: "image", caption, _sanityAsset: `image@file://${localPath}` });
        }
        return block({ _type: "image", caption, asset: { _type: "reference", _ref: imageId } });
      },
    },
    // Hint blocks
    {
      deserialize: (el, next, block) => {
        if (el.tagName?.toLowerCase() !== "aside") return undefined;
        if (!el.textContent?.startsWith('{% hint style="')) return undefined;
        const regex = /\{% hint style="(\w+)" %} (.*) {% endhint %\}(.*)/;
        try {
          // A non-matching hint makes the destructuring throw; it is logged and rethrown below.
          const [, style, ...text] = el.innerHTML.match(regex);
          const content = htmlToBlocks(`<div>${text.join("")}</div>`, blockContentType, {
            parseHtml,
            rules: htmlToBlocksRules,
          });
          return block({ _type: "callout", style, content });
        } catch (e) {
          console.warn(`Failed to parse hint: ${el.innerHTML}`);
          throw e;
        }
      },
    },
    // Summary/Detail blocks
    {
      deserialize: (el, next, block) => {
        if (el.tagName?.toLowerCase() !== "details") return undefined;
        const summaryElement = el.querySelector("summary");
        if (summaryElement == null) {
          console.warn("Details element without summary");
          return undefined;
        }
        const title = summaryElement.textContent;
        const content = htmlToBlocks(el.innerHTML, blockContentType, {
          rules: htmlToBlocksRules,
          parseHtml,
        });
        return block({ _type: "collapsible", title, content });
      },
    },
    // "{% content-ref" blocks
    {
      deserialize: (el, next, block) => {
        if (el.tagName?.toLowerCase() !== "p") return undefined;
        if (!el.textContent?.startsWith("{% content-ref")) return undefined;
        // Converting an empty string replaces the tag paragraph with no content.
        return htmlToBlocks("", blockContentType, {
          parseHtml,
        });
      },
    },
    // Links to other documentation pages
    {
      deserialize: (el, next) => {
        if (el.tagName?.toLowerCase() !== "a") return undefined;
        let href = el.getAttribute("href");
        if (href == null) return undefined;
        if (href.startsWith("http")) return undefined; // default handling is fine
        // GitBook uses broken-reference to indicate a link to a non-existent page
        if (href === "broken-reference") return next(el.childNodes);
        href = href.split("#")[0];
        if (href.endsWith("/")) {
          href = href.slice(0, -1);
        }
        const slug = basename(href, extname(href));
        // Only link to pages that are part of this import.
        if (files.every(file => basename(file, extname(file)) !== slug)) return undefined;
        // had to look at the source code to figure this one out -___-
        return {
          _type: "__annotation",
          markDef: {
            _key: randomKey(12),
            _type: "internalLink",
            reference: { _type: "reference", _ref: slug },
          },
          children: next(el.childNodes),
        };
      },
    },
    // Tables
    {
      deserialize: (el, next, block) => {
        if (el.tagName?.toLowerCase() !== "table") return undefined;
        const headers = el.querySelectorAll("thead th");
        const columns = [...headers].map(th => th.textContent);
        const rows = el.querySelectorAll("tbody tr");
        const data = [...rows].map(row => {
          const cells = row.querySelectorAll("td");
          // Ignore cells beyond the number of header columns.
          return [...cells].slice(0, columns.length).map(cell => cell.textContent);
        });
        return block({
          _type: "table",
          // NOTE(review): `...data` is reconstructed (the source read `[columns,]`) — the
          // header row is followed by the body rows.
          rows: [columns, ...data].map(cells => ({ _type: "tableRow", cells })),
        });
      },
    },
    // Manual:
    // - Tabs
  ];

  // NOTE(review): the first two arguments are reconstructed to match the other htmlToBlocks
  // calls in this function. The trailing `{ allowedDecorators: true }` argument is kept from
  // the source; it is a fourth positional argument, which the other calls do not pass —
  // confirm whether it has any effect.
  const portableText = htmlToBlocks(
    html,
    blockContentType,
    { rules: htmlToBlocksRules, parseHtml },
    { allowedDecorators: true },
  );

  const isResourcePage = section === "resources";
  // NOTE(review): `title` and `description` are reconstructed fields — they are read from
  // `data` above and not used anywhere else in the visible code.
  return {
    _type: isResourcePage ? "resource" : "docPage",
    _id: slug,
    title,
    description,
    slug: { _type: "slug", current: slug },
    content: portableText,
    // Only doc pages carry references to their section and (optional) parent page.
    ...(isResourcePage
      ? {}
      : {
          parentSection: { _type: "reference", _ref: section },
          parentPage: parentPage != null ? { _type: "reference", _ref: parentPage } : undefined,
        }),
  };
}
/**
 * Entry point: converts every Markdown file given on the command line and writes
 * `sanity-import.ndjson` (section documents first, then page documents, one JSON per line).
 *
 * Arguments are read from `process.argv`: the images directory, then one or more file paths.
 * For each input file, `<file>.html` and `<file>.json` are also written next to it.
 */
async function main() {
  if (process.argv.length < 4) {
    console.error("Usage: importMarkdown.mjs <path-to-images> <path> [...<path>]");
    // NOTE(review): reconstructed — the source shows nothing after the usage message, but
    // continuing without arguments would only produce an empty import file.
    process.exit(1);
  }
  const imagesPath = process.argv[2];
  const files = process.argv.slice(3);
  const sectionSlugs = new Set();

  const docPages = await Promise.all(
    files.map(async filePath => {
      const { html, data } = await convertMarkdownFileToHtml(filePath);
      await writeFile(filePath + ".html", html);
      const doc = await convertHtmlToSanity({
        filePath,
        html,
        data,
        sectionSlugs,
        imagesPath,
        files,
      });
      await writeFile(filePath + ".json", JSON.stringify(doc, null, 2));
      return doc;
    }),
  );

  const sections = [...sectionSlugs].map(slug => ({
    _type: "docSection",
    _id: slug,
    // NOTE(review): the split separator is reconstructed (the source called `.map` directly
    // on the slug string) — "-" assumes hyphenated slugs; confirm.
    title: slug
      .split("-")
      .map(s => (s === "and" ? s : capitalize(s)))
      .join(" "),
    slug: { _type: "slug", current: slug },
  }));

  console.error(`Converted ${docPages.length} documents`);
  console.error(`Found ${sections.length} sections`);

  // NDJSON: every document on its own newline-terminated line, sections before pages.
  const result = [...sections, ...docPages].map(doc => `${JSON.stringify(doc)}\n`).join("");
  await writeFile("sanity-import.ndjson", result);
  console.error("Wrote sanity-import.ndjson");
}
