a-churchill/README.md

## README.md

      
    Raw
  

              README.md
            
          
# Importing Markdown to Sanity

This Gist contains code we used at Causal to import our old Markdown documentation content into Sanity. See more on our blog!

  
## importMarkdown.mjs
// This script is used to convert Markdown files (e.g. as generated by GitBook) to a format that we
// can import into Sanity.

import { htmlToBlocks, randomKey } from "@sanity/block-tools";
import { createClient } from "@sanity/client";
import { readFile, writeFile } from "fs/promises";
import { JSDOM } from "jsdom";
import { toString } from "mdast-util-to-string";
import { basename, extname } from "path";
import { resolve } from "path";
import rehypeRaw from "rehype-raw";
import rehypeStringify from "rehype-stringify";
import remarkExtractFrontmatter from "remark-extract-frontmatter";
import remarkFrontmatter from "remark-frontmatter";
import remarkGfm from "remark-gfm";
import remarkParse from "remark-parse";
import remarkRehype from "remark-rehype";
import { unified } from "unified";
import { EXIT, visitParents } from "unist-util-visit-parents";
import { parse } from "yaml";

import { schema } from "./schema.mjs";

// Sanity API client. The access token is taken from the SANITY_TOKEN environment variable.
const client = createClient({
  apiVersion: "2021-08-31",
  projectId: "xxxxxxxx",
  dataset: "production",
  token: process.env.SANITY_TOKEN,
});

/**
 * Promise for a Map from each image asset's original filename to its `_id` in Sanity.
 * The query is started once, when this module is evaluated; consumers `await` the promise.
 */
const imagesByOriginalFilename = client
  .fetch('*[_type == "sanity.imageAsset"]{_id, originalFilename}')
  .then(assets => {
    const idsByFilename = new Map();
    for (const { originalFilename, _id } of assets) {
      idsByFilename.set(originalFilename, _id);
    }
    return idsByFilename;
  });

// Compiled schema type of the `content` field on `docPage`, i.e. the field holding the block array.
const blockContentType = schema.get("docPage").fields.find(({ name }) => name === "content").type;

/**
 * Upper-cases the first character of `string` and leaves the rest untouched.
 *
 * @param {string} string Text to capitalize.
 * @returns {string} The capitalized text; the empty string is returned unchanged.
 */
function capitalize(string) {
  if (string === "") {
    return string;
  }

  const head = string[0];
  const tail = string.substring(1);
  return `${head.toUpperCase()}${tail}`;
}

/**
 * Runs a Markdown string through the unified pipeline and returns the resulting HTML together with
 * the metadata collected on the way.
 *
 * @param {string} markdown Markdown source text.
 * @returns {Promise<{ html: *, data: object }>} `html` is the processed file's `value`; `data` is
 *   the processed file's `data` object, on which `title` is set by the title-extraction step.
 */
async function convertMarkdownToHtml(markdown) {
  /**
   * Unified plugin. Stores the text of the first depth-1 heading in `file.data.title` (the empty
   * string when there is no such heading) and removes that heading from the tree.
   */
  function extractTitle() {
    return (tree, file) => {
      let foundTitle = "";

      visitParents(tree, "heading", (heading, ancestors) => {
        if (heading.depth !== 1) return undefined;

        foundTitle = toString(heading);
        // The last ancestor is the heading's direct parent; drop the heading from its children.
        const directParent = ancestors[ancestors.length - 1];
        directParent.children = directParent.children.filter(child => child !== heading);
        return EXIT; // only the first depth-1 heading is treated as the title
      });

      file.data.title = foundTitle;
    };
  }

  const processor = unified()
    .use(extractTitle)
    .use(remarkParse)
    .use(remarkGfm) // needed for table support
    .use(remarkFrontmatter, { type: "yaml", marker: "-" })
    .use(remarkExtractFrontmatter, { yaml: parse })
    .use(remarkRehype, { allowDangerousHtml: true })
    .use(rehypeRaw)
    .use(rehypeStringify);

  const { value: html, data } = await processor.process(markdown);
  return { html, data };
}

/** Reads a file at the given path and returns an object `{ html, data }`. */
/**
 * Reads the Markdown file at `filePath`, rewrites the GitBook template tags the Markdown parser
 * cannot cope with, and converts the result to HTML.
 *
 * @param {string} filePath Path of the Markdown file to read (as UTF-8).
 * @returns {Promise<{ html: *, data: object }>} Whatever `convertMarkdownToHtml` returns.
 */
async function convertMarkdownFileToHtml(filePath) {
  console.error("Parsing file:                   ", filePath);

  // Literal [search, replacement] pairs, applied in this order to the whole file.
  const tagRewrites = [
    // Hints with markdown inside them won't get parsed correctly unless they are wrapped in an
    // element first, so open an <aside> in front of every hint tag...
    ["{% hint style=", "<aside>{% hint style="],
    // ...start a new line after every closing `" %}` (otherwise markdown right after the hint
    // won't be parsed)...
    ['" %}', '" %}\n'],
    // ...and close the <aside> after the hint ends.
    ["{% endhint %}", "{% endhint %}</aside>"],
    // The endembed tag is ignored later anyway, so remove it here.
    ["{% endembed %}", ""],
  ];

  let text = await readFile(filePath, "utf-8");
  for (const [search, replacement] of tagRewrites) {
    text = text.replaceAll(search, replacement);
  }

  return convertMarkdownToHtml(text);
}

/**
 * Converts one page's HTML (plus its extracted metadata) into a Sanity document object.
 *
 * The part of `filePath` after "causal-docs/" decides where the page lives:
 *   - `<section>/<page>.md`            -> page `<page>` in `<section>`
 *   - `<section>/<page>/README.md`     -> page `<page>` in `<section>`
 *   - `<section>/<parent>/<page>.md`   -> page `<page>` in `<section>`, child of `<parent>`
 *
 * @param {object} args
 * @param {string} args.filePath Path of the source Markdown file; must contain "causal-docs/".
 * @param {string} args.html HTML generated from the Markdown file.
 * @param {object} args.data Page metadata; `title` and `description` are read from it.
 * @param {Set<string>} args.sectionSlugs Collector; the page's section slug is added to it.
 * @param {string} args.imagesPath Directory against which filenames of images that are not yet in
 *   Sanity are resolved to build `image@file://...` asset references.
 * @param {string[]} args.files All file paths being imported; used to recognise links that point
 *   at other imported pages.
 * @returns {Promise<object>} A `resource` document when the section is "resources", otherwise a
 *   `docPage` document with `parentSection` / `parentPage` references. `_id` is the page slug.
 * @throws {Error} When the path is not below "causal-docs/", has no section, is a section-level
 *   README, or is nested more than three components deep. Errors raised while parsing a hint block
 *   are logged and rethrown.
 */
async function convertHtmlToSanity({ filePath, html, data, sectionSlugs, imagesPath, files }) {
  const images = await imagesByOriginalFilename;
  const { title, description } = data;

  // Everything after the docs-root marker is "<section>/...". Without the marker there is nothing
  // to derive the section from, so fail with a clear message instead of a TypeError further down.
  const path = filePath.split("causal-docs/")[1];
  if (path == null) {
    console.error("Path is not inside causal-docs: ", filePath);
    throw new Error("Path is not inside causal-docs");
  }

  const pathComponents = path.split("/");
  if (pathComponents.length < 2) {
    console.error("Path doesn't have section:      ", path);
    throw new Error("Path doesn't have section");
  } else if (pathComponents.length === 2 && pathComponents[1] === "README.md") {
    console.error("Section cannot have README");
    throw new Error("Section cannot have README");
  } else if (pathComponents.length > 3) {
    console.error("Path is too deep:               ", path);
    throw new Error("Path is too deep");
  }

  // Default layout is <section>/<page>; this also covers <section>/<page>/README.md, where the
  // directory name is the page slug.
  let [section, slug] = pathComponents;

  let parentPage;
  if (pathComponents.length === 3 && pathComponents[2] !== "README.md") {
    [section, parentPage, slug] = pathComponents;
  }

  sectionSlugs.add(section);
  slug = basename(slug, extname(slug));
  console.error("Converting to Sanity document:  ", JSON.stringify({ section, parentPage, slug }));

  /** Parses an HTML string into a DOM document for `htmlToBlocks`. */
  const parseHtml = markup => new JSDOM(markup).window.document;

  // Custom deserialization rules. A rule returns `undefined` to decline an element.
  const htmlToBlocksRules = [
    {
      // Code blocks
      deserialize: (el, next, block) => {
        if (el.tagName?.toLowerCase() !== "pre") return undefined;

        // Prefer the text of a leading <code> child; otherwise use the <pre>'s own child nodes.
        const codeNode = el.children[0];
        const childNodes =
          codeNode && codeNode.tagName.toLowerCase() === "code"
            ? codeNode.childNodes
            : el.childNodes;
        let code = "";
        childNodes.forEach(node => {
          code += node.textContent;
        });

        return block({ _type: "code", code });
      },
    },

    {
      // Loom/YouTube embed blocks
      deserialize: (el, next, block) => {
        if (el.tagName?.toLowerCase() !== "p") return undefined;
        if (!el.textContent?.startsWith('{% embed url="')) return undefined;

        const url = el.textContent.replace('{% embed url="', "").replace('" %}', "");
        if (url.startsWith("https://www.youtube.com")) {
          return block({ _type: "youtube", url });
        } else if (url.startsWith("https://www.loom.com")) {
          return block({ _type: "loom", url });
        } else {
          console.warn(`Unknown embed URL: ${url}`);
          return undefined;
        }
      },
    },

    {
      // Images and figures
      deserialize: (el, next, block) => {
        const tag = el.tagName?.toLowerCase();
        if (tag !== "img" && tag !== "figure") return undefined;

        // A <figure> may lack an <img>, and an <img> may lack `src`; decline those elements
        // instead of crashing on a null dereference.
        const img = tag === "img" ? el : el.querySelector("img");
        const src = img?.getAttribute("src");
        if (src == null) {
          console.warn(`Skipping <${tag}> without an image source`);
          return undefined;
        }
        const caption = tag === "figure" ? (el.querySelector("figcaption")?.textContent ?? "") : "";

        const filename = basename(decodeURIComponent(src));

        if (src.startsWith("http")) {
          // download the image to the images path
          console.warn(`❗ Download image before import: ${src}`);
        }

        // Images not yet known to Sanity are referenced by local file path instead of asset id.
        const imageId = images.get(filename);
        if (imageId == null) {
          const localPath = resolve(imagesPath, filename);
          return block({ _type: "image", caption, _sanityAsset: `image@file://${localPath}` });
        }

        return block({ _type: "image", caption, asset: { _type: "reference", _ref: imageId } });
      },
    },

    {
      // Hint blocks
      deserialize: (el, next, block) => {
        if (el.tagName?.toLowerCase() !== "aside") return undefined;
        if (!el.textContent?.startsWith('{% hint style="')) return undefined;

        // Group 1: hint style. Group 2: hint body. Group 3: anything after the endhint tag.
        const regex = /\{% hint style="(\w+)" %} (.*) {% endhint %\}(.*)/;
        try {
          // A non-matching hint makes `match` return null, so the destructuring throws and the
          // catch below reports the offending markup.
          const [, style, ...text] = el.innerHTML.match(regex);

          const content = htmlToBlocks(`<div>${text.join("")}</div>`, blockContentType, {
            parseHtml,
            rules: htmlToBlocksRules,
          });

          return block({ _type: "callout", style, content });
        } catch (e) {
          console.warn(`Failed to parse hint: ${el.innerHTML}`);
          throw e;
        }
      },
    },

    {
      // Summary/Detail blocks
      deserialize: (el, next, block) => {
        if (el.tagName?.toLowerCase() !== "details") return undefined;

        const summaryElement = el.querySelector("summary");
        if (summaryElement == null) {
          console.warn("Details element without summary");
          return undefined;
        }

        // The summary becomes the title; what remains of the element becomes the content.
        const title = summaryElement.textContent;
        el.removeChild(summaryElement);
        const content = htmlToBlocks(el.innerHTML, blockContentType, {
          rules: htmlToBlocksRules,
          parseHtml,
        });

        return block({ _type: "collapsible", title, content });
      },
    },

    {
      // "{% content-ref" blocks: replaced by the conversion of an empty string.
      deserialize: (el, next, block) => {
        if (el.tagName?.toLowerCase() !== "p") return undefined;
        if (!el.textContent?.startsWith("{% content-ref")) return undefined;

        return htmlToBlocks("", blockContentType, { parseHtml });
      },
    },

    {
      // Links to other documentation pages
      deserialize: (el, next) => {
        if (el.tagName?.toLowerCase() !== "a") return undefined;
        let href = el.getAttribute("href");
        if (href == null) return undefined;
        if (href.startsWith("http")) return undefined; // default handling is fine

        // GitBook uses broken-reference to indicate a link to a non-existent page
        if (href === "broken-reference") return next(el.childNodes);

        // Strip the fragment and a trailing slash so the last path component is the page slug.
        href = href.split("#")[0];
        if (href.endsWith("/")) {
          href = href.slice(0, -1);
        }

        // Only links whose slug matches one of the imported files become internal links.
        const slug = basename(href, extname(href));
        if (files.every(file => basename(file, extname(file)) !== slug)) return undefined;

        // had to look at the source code to figure this one out -___-
        return {
          _type: "__annotation",
          markDef: {
            _key: randomKey(12),
            _type: "internalLink",
            reference: { _type: "reference", _ref: slug },
          },
          children: next(el.childNodes),
        };
      },
    },

    {
      // Tables
      deserialize: (el, next, block) => {
        if (el.tagName?.toLowerCase() !== "table") return undefined;

        const headers = el.querySelectorAll("thead th");
        const columns = [...headers].map(th => th.textContent);
        const rows = el.querySelectorAll("tbody tr");
        const data = [...rows].map(row => {
          const cells = row.querySelectorAll("td");
          // Cells beyond the number of header columns are dropped.
          return [...cells].slice(0, columns.length).map(cell => cell.textContent);
        });

        // The header row is emitted as the first table row.
        return block({
          _type: "table",
          rows: [columns, ...data].map(cells => ({ _type: "tableRow", cells })),
        });
      },
    },

    // Manual:
    //  - Tabs
  ];

  // NOTE(review): the fourth argument is kept as-is; confirm against the block-tools API that
  // `htmlToBlocks` actually reads it.
  const portableText = htmlToBlocks(
    html,
    blockContentType,
    { rules: htmlToBlocksRules, parseHtml },
    { allowedDecorators: true },
  );

  const isResourcePage = section === "resources";

  return {
    _type: isResourcePage ? "resource" : "docPage",
    _id: slug,
    title,
    description,
    slug: { _type: "slug", current: slug },
    content: portableText,
    ...(isResourcePage
      ? {}
      : {
          parentSection: { _type: "reference", _ref: section },
          parentPage: parentPage != null ? { _type: "reference", _ref: parentPage } : undefined,
        }),
  };
}

/**
 * Command-line entry point.
 *
 * Usage: `importMarkdown.mjs <path-to-images> <path> [...<path>]`
 *
 * For every Markdown path it writes `<path>.html` and `<path>.json` next to the source file, then
 * writes `sanity-import.ndjson` containing one line per section followed by one line per page.
 * Exits with status 1 when fewer than two arguments are given.
 */
async function main() {
  const [, , imagesPath, ...files] = process.argv;
  if (process.argv.length < 4) {
    console.error("Usage: importMarkdown.mjs <path-to-images> <path> [...<path>]");
    process.exit(1);
  }

  // Filled in by convertHtmlToSanity while the pages are converted.
  const sectionSlugs = new Set();

  /** Converts a single file and writes its intermediate HTML and JSON next to it. */
  const convertFile = async filePath => {
    const { html, data } = await convertMarkdownFileToHtml(filePath);
    await writeFile(`${filePath}.html`, html);

    const doc = await convertHtmlToSanity({ filePath, html, data, sectionSlugs, imagesPath, files });
    await writeFile(`${filePath}.json`, JSON.stringify(doc, null, 2));
    return doc;
  };

  const docPages = await Promise.all(files.map(filePath => convertFile(filePath)));

  // One docSection document per section; the title is derived from the slug by capitalizing every
  // dash-separated word except "and".
  const sections = [];
  for (const slug of sectionSlugs) {
    const title = slug
      .split("-")
      .map(word => (word === "and" ? word : capitalize(word)))
      .join(" ");
    sections.push({
      _type: "docSection",
      _id: slug,
      title,
      slug: { _type: "slug", current: slug },
    });
  }

  console.error(`Converted ${docPages.length} documents`);
  console.error(`Found ${sections.length} sections`);

  // NDJSON: sections first, then pages, each on its own newline-terminated line.
  const result = [...sections, ...docPages].map(doc => `${JSON.stringify(doc)}\n`).join("");
  await writeFile("sanity-import.ndjson", result);
  console.error("Wrote sanity-import.ndjson");
}

// Run the import. A failure is reported and turned into a non-zero exit status instead of being
// left as an unhandled promise rejection.
main().catch(error => {
  console.error(error);
  process.exitCode = 1;
});
	// This script is used to convert Markdown files (e.g. as generated by GitBook) to a format that we
	// can import into Sanity.

	import { htmlToBlocks, randomKey } from "@sanity/block-tools";
	import { createClient } from "@sanity/client";
	import { readFile, writeFile } from "fs/promises";
	import { JSDOM } from "jsdom";
	import { toString } from "mdast-util-to-string";
	import { basename, extname } from "path";
	import { resolve } from "path";
	import rehypeRaw from "rehype-raw";
	import rehypeStringify from "rehype-stringify";
	import remarkExtractFrontmatter from "remark-extract-frontmatter";
	import remarkFrontmatter from "remark-frontmatter";
	import remarkGfm from "remark-gfm";
	import remarkParse from "remark-parse";
	import remarkRehype from "remark-rehype";
	import { unified } from "unified";
	import { EXIT, visitParents } from "unist-util-visit-parents";
	import { parse } from "yaml";

	import { schema } from "./schema.mjs";

	// Sanity API client. The access token is taken from the SANITY_TOKEN environment variable.
	const client = createClient({
	  apiVersion: "2021-08-31",
	  projectId: "xxxxxxxx",
	  dataset: "production",
	  token: process.env.SANITY_TOKEN,
	});

	/**
	 * Promise for a Map from each image asset's original filename to its `_id` in Sanity.
	 * The query is started once, when this module is evaluated; consumers `await` the promise.
	 */
	const imagesByOriginalFilename = client
	  .fetch('*[_type == "sanity.imageAsset"]{_id, originalFilename}')
	  .then(assets => {
	    const idsByFilename = new Map();
	    for (const { originalFilename, _id } of assets) {
	      idsByFilename.set(originalFilename, _id);
	    }
	    return idsByFilename;
	  });

	// Compiled schema type of the `content` field on `docPage`, i.e. the field holding the block array.
	const blockContentType = schema.get("docPage").fields.find(({ name }) => name === "content").type;

	/**
	 * Upper-cases the first character of `string` and leaves the rest untouched.
	 *
	 * @param {string} string Text to capitalize.
	 * @returns {string} The capitalized text; the empty string is returned unchanged.
	 */
	function capitalize(string) {
	  if (string === "") {
	    return string;
	  }

	  const head = string[0];
	  const tail = string.substring(1);
	  return `${head.toUpperCase()}${tail}`;
	}

	/**
	 * Runs a Markdown string through the unified pipeline and returns the resulting HTML together with
	 * the metadata collected on the way.
	 *
	 * @param {string} markdown Markdown source text.
	 * @returns {Promise<{ html: *, data: object }>} `html` is the processed file's `value`; `data` is
	 *   the processed file's `data` object, on which `title` is set by the title-extraction step.
	 */
	async function convertMarkdownToHtml(markdown) {
	  /**
	   * Unified plugin. Stores the text of the first depth-1 heading in `file.data.title` (the empty
	   * string when there is no such heading) and removes that heading from the tree.
	   */
	  function extractTitle() {
	    return (tree, file) => {
	      let foundTitle = "";

	      visitParents(tree, "heading", (heading, ancestors) => {
	        if (heading.depth !== 1) return undefined;

	        foundTitle = toString(heading);
	        // The last ancestor is the heading's direct parent; drop the heading from its children.
	        const directParent = ancestors[ancestors.length - 1];
	        directParent.children = directParent.children.filter(child => child !== heading);
	        return EXIT; // only the first depth-1 heading is treated as the title
	      });

	      file.data.title = foundTitle;
	    };
	  }

	  const processor = unified()
	    .use(extractTitle)
	    .use(remarkParse)
	    .use(remarkGfm) // needed for table support
	    .use(remarkFrontmatter, { type: "yaml", marker: "-" })
	    .use(remarkExtractFrontmatter, { yaml: parse })
	    .use(remarkRehype, { allowDangerousHtml: true })
	    .use(rehypeRaw)
	    .use(rehypeStringify);

	  const { value: html, data } = await processor.process(markdown);
	  return { html, data };
	}

	/**
	 * Reads the Markdown file at `filePath`, rewrites the GitBook template tags the Markdown parser
	 * cannot cope with, and converts the result to HTML.
	 *
	 * @param {string} filePath Path of the Markdown file to read (as UTF-8).
	 * @returns {Promise<{ html: *, data: object }>} Whatever `convertMarkdownToHtml` returns.
	 */
	async function convertMarkdownFileToHtml(filePath) {
	  console.error("Parsing file: ", filePath);

	  // Literal [search, replacement] pairs, applied in this order to the whole file.
	  const tagRewrites = [
	    // Hints with markdown inside them won't get parsed correctly unless they are wrapped in an
	    // element first, so open an <aside> in front of every hint tag...
	    ["{% hint style=", "<aside>{% hint style="],
	    // ...start a new line after every closing `" %}` (otherwise markdown right after the hint
	    // won't be parsed)...
	    ['" %}', '" %}\n'],
	    // ...and close the <aside> after the hint ends.
	    ["{% endhint %}", "{% endhint %}</aside>"],
	    // The endembed tag is ignored later anyway, so remove it here.
	    ["{% endembed %}", ""],
	  ];

	  let text = await readFile(filePath, "utf-8");
	  for (const [search, replacement] of tagRewrites) {
	    text = text.replaceAll(search, replacement);
	  }

	  return convertMarkdownToHtml(text);
	}

	/**
	 * Converts one page's HTML (plus its extracted metadata) into a Sanity document object.
	 *
	 * The part of `filePath` after "causal-docs/" decides where the page lives:
	 *   - `<section>/<page>.md`            -> page `<page>` in `<section>`
	 *   - `<section>/<page>/README.md`     -> page `<page>` in `<section>`
	 *   - `<section>/<parent>/<page>.md`   -> page `<page>` in `<section>`, child of `<parent>`
	 *
	 * @param {object} args
	 * @param {string} args.filePath Path of the source Markdown file; must contain "causal-docs/".
	 * @param {string} args.html HTML generated from the Markdown file.
	 * @param {object} args.data Page metadata; `title` and `description` are read from it.
	 * @param {Set<string>} args.sectionSlugs Collector; the page's section slug is added to it.
	 * @param {string} args.imagesPath Directory against which filenames of images that are not yet in
	 *   Sanity are resolved to build `image@file://...` asset references.
	 * @param {string[]} args.files All file paths being imported; used to recognise links that point
	 *   at other imported pages.
	 * @returns {Promise<object>} A `resource` document when the section is "resources", otherwise a
	 *   `docPage` document with `parentSection` / `parentPage` references. `_id` is the page slug.
	 * @throws {Error} When the path is not below "causal-docs/", has no section, is a section-level
	 *   README, or is nested more than three components deep. Errors raised while parsing a hint block
	 *   are logged and rethrown.
	 */
	async function convertHtmlToSanity({ filePath, html, data, sectionSlugs, imagesPath, files }) {
	  const images = await imagesByOriginalFilename;
	  const { title, description } = data;

	  // Everything after the docs-root marker is "<section>/...". Without the marker there is nothing
	  // to derive the section from, so fail with a clear message instead of a TypeError further down.
	  const path = filePath.split("causal-docs/")[1];
	  if (path == null) {
	    console.error("Path is not inside causal-docs: ", filePath);
	    throw new Error("Path is not inside causal-docs");
	  }

	  const pathComponents = path.split("/");
	  if (pathComponents.length < 2) {
	    console.error("Path doesn't have section: ", path);
	    throw new Error("Path doesn't have section");
	  } else if (pathComponents.length === 2 && pathComponents[1] === "README.md") {
	    console.error("Section cannot have README");
	    throw new Error("Section cannot have README");
	  } else if (pathComponents.length > 3) {
	    console.error("Path is too deep: ", path);
	    throw new Error("Path is too deep");
	  }

	  // Default layout is <section>/<page>; this also covers <section>/<page>/README.md, where the
	  // directory name is the page slug.
	  let [section, slug] = pathComponents;

	  let parentPage;
	  if (pathComponents.length === 3 && pathComponents[2] !== "README.md") {
	    [section, parentPage, slug] = pathComponents;
	  }

	  sectionSlugs.add(section);
	  slug = basename(slug, extname(slug));
	  console.error("Converting to Sanity document: ", JSON.stringify({ section, parentPage, slug }));

	  /** Parses an HTML string into a DOM document for `htmlToBlocks`. */
	  const parseHtml = markup => new JSDOM(markup).window.document;

	  // Custom deserialization rules. A rule returns `undefined` to decline an element.
	  const htmlToBlocksRules = [
	    {
	      // Code blocks
	      deserialize: (el, next, block) => {
	        if (el.tagName?.toLowerCase() !== "pre") return undefined;

	        // Prefer the text of a leading <code> child; otherwise use the <pre>'s own child nodes.
	        const codeNode = el.children[0];
	        const childNodes =
	          codeNode && codeNode.tagName.toLowerCase() === "code"
	            ? codeNode.childNodes
	            : el.childNodes;
	        let code = "";
	        childNodes.forEach(node => {
	          code += node.textContent;
	        });

	        return block({ _type: "code", code });
	      },
	    },

	    {
	      // Loom/YouTube embed blocks
	      deserialize: (el, next, block) => {
	        if (el.tagName?.toLowerCase() !== "p") return undefined;
	        if (!el.textContent?.startsWith('{% embed url="')) return undefined;

	        const url = el.textContent.replace('{% embed url="', "").replace('" %}', "");
	        if (url.startsWith("https://www.youtube.com")) {
	          return block({ _type: "youtube", url });
	        } else if (url.startsWith("https://www.loom.com")) {
	          return block({ _type: "loom", url });
	        } else {
	          console.warn(`Unknown embed URL: ${url}`);
	          return undefined;
	        }
	      },
	    },

	    {
	      // Images and figures
	      deserialize: (el, next, block) => {
	        const tag = el.tagName?.toLowerCase();
	        if (tag !== "img" && tag !== "figure") return undefined;

	        // A <figure> may lack an <img>, and an <img> may lack `src`; decline those elements
	        // instead of crashing on a null dereference.
	        const img = tag === "img" ? el : el.querySelector("img");
	        const src = img?.getAttribute("src");
	        if (src == null) {
	          console.warn(`Skipping <${tag}> without an image source`);
	          return undefined;
	        }
	        const caption = tag === "figure" ? (el.querySelector("figcaption")?.textContent ?? "") : "";

	        const filename = basename(decodeURIComponent(src));

	        if (src.startsWith("http")) {
	          // download the image to the images path
	          console.warn(`❗ Download image before import: ${src}`);
	        }

	        // Images not yet known to Sanity are referenced by local file path instead of asset id.
	        const imageId = images.get(filename);
	        if (imageId == null) {
	          const localPath = resolve(imagesPath, filename);
	          return block({ _type: "image", caption, _sanityAsset: `image@file://${localPath}` });
	        }

	        return block({ _type: "image", caption, asset: { _type: "reference", _ref: imageId } });
	      },
	    },

	    {
	      // Hint blocks
	      deserialize: (el, next, block) => {
	        if (el.tagName?.toLowerCase() !== "aside") return undefined;
	        if (!el.textContent?.startsWith('{% hint style="')) return undefined;

	        // Group 1: hint style. Group 2: hint body. Group 3: anything after the endhint tag.
	        // Both trailing groups must be `(.*)`: a bare `(.)` would only accept one-character hints.
	        const regex = /\{% hint style="(\w+)" %} (.*) {% endhint %\}(.*)/;
	        try {
	          // A non-matching hint makes `match` return null, so the destructuring throws and the
	          // catch below reports the offending markup.
	          const [, style, ...text] = el.innerHTML.match(regex);

	          const content = htmlToBlocks(`<div>${text.join("")}</div>`, blockContentType, {
	            parseHtml,
	            rules: htmlToBlocksRules,
	          });

	          return block({ _type: "callout", style, content });
	        } catch (e) {
	          console.warn(`Failed to parse hint: ${el.innerHTML}`);
	          throw e;
	        }
	      },
	    },

	    {
	      // Summary/Detail blocks
	      deserialize: (el, next, block) => {
	        if (el.tagName?.toLowerCase() !== "details") return undefined;

	        const summaryElement = el.querySelector("summary");
	        if (summaryElement == null) {
	          console.warn("Details element without summary");
	          return undefined;
	        }

	        // The summary becomes the title; what remains of the element becomes the content.
	        const title = summaryElement.textContent;
	        el.removeChild(summaryElement);
	        const content = htmlToBlocks(el.innerHTML, blockContentType, {
	          rules: htmlToBlocksRules,
	          parseHtml,
	        });

	        return block({ _type: "collapsible", title, content });
	      },
	    },

	    {
	      // "{% content-ref" blocks: replaced by the conversion of an empty string.
	      deserialize: (el, next, block) => {
	        if (el.tagName?.toLowerCase() !== "p") return undefined;
	        if (!el.textContent?.startsWith("{% content-ref")) return undefined;

	        return htmlToBlocks("", blockContentType, { parseHtml });
	      },
	    },

	    {
	      // Links to other documentation pages
	      deserialize: (el, next) => {
	        if (el.tagName?.toLowerCase() !== "a") return undefined;
	        let href = el.getAttribute("href");
	        if (href == null) return undefined;
	        if (href.startsWith("http")) return undefined; // default handling is fine

	        // GitBook uses broken-reference to indicate a link to a non-existent page
	        if (href === "broken-reference") return next(el.childNodes);

	        // Strip the fragment and a trailing slash so the last path component is the page slug.
	        href = href.split("#")[0];
	        if (href.endsWith("/")) {
	          href = href.slice(0, -1);
	        }

	        // Only links whose slug matches one of the imported files become internal links.
	        const slug = basename(href, extname(href));
	        if (files.every(file => basename(file, extname(file)) !== slug)) return undefined;

	        // had to look at the source code to figure this one out -___-
	        return {
	          _type: "__annotation",
	          markDef: {
	            _key: randomKey(12),
	            _type: "internalLink",
	            reference: { _type: "reference", _ref: slug },
	          },
	          children: next(el.childNodes),
	        };
	      },
	    },

	    {
	      // Tables
	      deserialize: (el, next, block) => {
	        if (el.tagName?.toLowerCase() !== "table") return undefined;

	        const headers = el.querySelectorAll("thead th");
	        const columns = [...headers].map(th => th.textContent);
	        const rows = el.querySelectorAll("tbody tr");
	        const data = [...rows].map(row => {
	          const cells = row.querySelectorAll("td");
	          // Cells beyond the number of header columns are dropped.
	          return [...cells].slice(0, columns.length).map(cell => cell.textContent);
	        });

	        // The header row is emitted as the first table row.
	        return block({
	          _type: "table",
	          rows: [columns, ...data].map(cells => ({ _type: "tableRow", cells })),
	        });
	      },
	    },

	    // Manual:
	    // - Tabs
	  ];

	  // NOTE(review): the fourth argument is kept as-is; confirm against the block-tools API that
	  // `htmlToBlocks` actually reads it.
	  const portableText = htmlToBlocks(
	    html,
	    blockContentType,
	    { rules: htmlToBlocksRules, parseHtml },
	    { allowedDecorators: true },
	  );

	  const isResourcePage = section === "resources";

	  return {
	    _type: isResourcePage ? "resource" : "docPage",
	    _id: slug,
	    title,
	    description,
	    slug: { _type: "slug", current: slug },
	    content: portableText,
	    ...(isResourcePage
	      ? {}
	      : {
	          parentSection: { _type: "reference", _ref: section },
	          parentPage: parentPage != null ? { _type: "reference", _ref: parentPage } : undefined,
	        }),
	  };
	}

	/**
	 * Command-line entry point.
	 *
	 * Usage: `importMarkdown.mjs <path-to-images> <path> [...<path>]`
	 *
	 * For every Markdown path it writes `<path>.html` and `<path>.json` next to the source file, then
	 * writes `sanity-import.ndjson` containing one line per section followed by one line per page.
	 * Exits with status 1 when fewer than two arguments are given.
	 */
	async function main() {
	  const [, , imagesPath, ...files] = process.argv;
	  if (process.argv.length < 4) {
	    console.error("Usage: importMarkdown.mjs <path-to-images> <path> [...<path>]");
	    process.exit(1);
	  }

	  // Filled in by convertHtmlToSanity while the pages are converted.
	  const sectionSlugs = new Set();

	  /** Converts a single file and writes its intermediate HTML and JSON next to it. */
	  const convertFile = async filePath => {
	    const { html, data } = await convertMarkdownFileToHtml(filePath);
	    await writeFile(`${filePath}.html`, html);

	    const doc = await convertHtmlToSanity({ filePath, html, data, sectionSlugs, imagesPath, files });
	    await writeFile(`${filePath}.json`, JSON.stringify(doc, null, 2));
	    return doc;
	  };

	  const docPages = await Promise.all(files.map(filePath => convertFile(filePath)));

	  // One docSection document per section; the title is derived from the slug by capitalizing every
	  // dash-separated word except "and".
	  const sections = [];
	  for (const slug of sectionSlugs) {
	    const title = slug
	      .split("-")
	      .map(word => (word === "and" ? word : capitalize(word)))
	      .join(" ");
	    sections.push({
	      _type: "docSection",
	      _id: slug,
	      title,
	      slug: { _type: "slug", current: slug },
	    });
	  }

	  console.error(`Converted ${docPages.length} documents`);
	  console.error(`Found ${sections.length} sections`);

	  // NDJSON: sections first, then pages, each on its own newline-terminated line.
	  const result = [...sections, ...docPages].map(doc => `${JSON.stringify(doc)}\n`).join("");
	  await writeFile("sanity-import.ndjson", result);
	  console.error("Wrote sanity-import.ndjson");
	}

	// Run the import. A failure is reported and turned into a non-zero exit status instead of being
	// left as an unhandled promise rejection.
	main().catch(error => {
	  console.error(error);
	  process.exitCode = 1;
	});