Last active
September 21, 2022 21:18
-
-
Save colinfwren/3e35388ec13d03e6811f3cb793ee31c0 to your computer and use it in GitHub Desktop.
A script that converts a Medium export into Markdown usable by Gatsby
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import fetch from 'node-fetch' | |
import {read} from 'to-vfile' | |
import {unified} from 'unified' | |
import rehypeParse from 'rehype-parse' | |
import rehypeRemark from 'rehype-remark' | |
import remarkStringify from 'remark-stringify' | |
import { writeFile, promises as fs } from 'fs' | |
import { selectAll } from "hast-util-select"; | |
import {toHtml} from "hast-util-to-html"; | |
import slugify from "slugify"; | |
import frontmatter from 'remark-frontmatter' | |
import {visit} from "unist-util-visit"; | |
import path from 'path' | |
/**
 * Download a remote image and store it on disk.
 *
 * The write uses the promise-based fs API so the returned promise only
 * settles once the file has actually been written.
 *
 * @param {string} url - Address of the image to fetch.
 * @param {string} pathName - Destination path for the downloaded bytes.
 * @returns {Promise<string>} `pathName` when the image was fetched and written,
 *   otherwise the original `url` so callers can keep pointing at the remote copy.
 */
async function downloadImage(url, pathName) {
  let imageData
  try {
    const imageResp = await fetch(url)
    // fetch only rejects on network errors; an error page must not be saved as an image
    if (!imageResp.ok) {
      throw new Error(`Unexpected response ${imageResp.status} ${imageResp.statusText}`)
    }
    imageData = await imageResp.arrayBuffer()
  } catch (error) {
    console.error('Failed to fetch', url, error)
    return url
  }

  try {
    await fs.writeFile(pathName, Buffer.from(imageData))
    console.log(`Wrote ${pathName}`)
    return pathName
  } catch (error) {
    console.error(`Failed to write ${pathName}`, error)
    return url
  }
}
/**
 * Fetch the raw text behind a gist embed script URL.
 *
 * Everything from the first `.js` onwards is cut off the URL and `/raw` is
 * appended to build the address that is requested.
 *
 * @param {string} url - The `src` of the gist embed script.
 * @returns {Promise<string|false>} The response body as text, or `false` when
 *   anything went wrong while building or requesting the raw URL.
 */
async function downloadGistCode(url) {
  try {
    const [gistAddress] = url.split('.js')
    const response = await fetch(`${gistAddress}/raw`)
    const body = await response.text()
    return body
  } catch (error) {
    console.error('Failed to fetch', url, error)
    return false
  }
}
/**
 * Derive a local file name from an image URL.
 *
 * The query string and fragment are removed first so they neither end up in
 * the file name nor fool the extension check (e.g. `image?v=1.2`).
 *
 * @param {string} src - The image URL.
 * @returns {string} The last path segment, with `.jpg` appended when it
 *   contains no dot.
 */
function getFilename(src) {
  const [withoutQuery] = src.split(/[?#]/)
  const filename = withoutQuery.split('/').pop()
  return filename.includes('.') ? filename : `${filename}.jpg`
}
/**
 * Rehype plugin that downloads every `<img>` into the post's output folder
 * (read from `file.outputFolder`) and points the image at the local copy.
 *
 * An image keeps its remote `src` when it has no usable `src` or when
 * `downloadImage` does not report the local path back, so a failed download
 * no longer leaves the post referencing a file that does not exist.
 *
 * @returns {(tree: object, file: object) => Promise<object>} Transformer that
 *   resolves with the same tree once all downloads have settled.
 */
function rehypeDownloadImages() {
  return async (tree, file) => {
    const images = selectAll('img', tree)
    await Promise.all(images.map(async (node) => {
      const { src } = node.properties
      if (typeof src !== 'string' || src === '') {
        return node
      }
      const filename = getFilename(src)
      const outputPath = path.join(file.outputFolder, filename)
      const savedTo = await downloadImage(src, outputPath)
      // downloadImage returns the URL instead of the path when it failed
      if (savedTo === outputPath) {
        node.properties.src = filename
      }
      return node
    }))
    return tree
  }
}
/**
 * Rehype plugin that replaces gist embed `<script>` elements with the gist's
 * source wrapped in a Markdown code fence.
 *
 * Scripts without a `src`, scripts whose `src` does not contain `gist`, and
 * gists that could not be downloaded are left untouched. Previously an inline
 * script crashed the run and a failed download inserted the text `false`.
 *
 * @returns {(tree: object) => Promise<object>} Transformer that resolves with
 *   the same tree once every gist has been processed.
 */
function rehypeInlineGistScript() {
  const fence = '```'
  return async (tree) => {
    const scripts = selectAll('script', tree)
    await Promise.all(scripts.map(async (node) => {
      const { src } = node.properties
      if (typeof src !== 'string' || !src.includes('gist')) {
        return node
      }
      const code = await downloadGistCode(src)
      // downloadGistCode signals failure with `false`
      if (code === false) {
        return node
      }
      // Turn the element into a text node in place so it survives the Markdown conversion
      node.properties = {}
      node.type = 'text'
      node.value = `\n${fence}\n${code}\n${fence}\n`
      return node
    }))
    return tree
  }
}
/**
 * Plugin that reads the post's metadata from the parsed HTML and stores it on
 * `file.frontmatter` as `{ slug, date, title, excerpt }`.
 *
 * - title: text of the root-level `<title>` element
 * - excerpt: text of the article child whose `dataField` is `subtitle`, or ''
 * - date: the part before `T` of the `dateTime` of a `<time>` inside a link in
 *   the article footer (the last one found wins), or '' when there is none
 * - slug: lower-cased slugified title
 *
 * @returns {(tree: object, file: object) => Promise<object>} Transformer that
 *   resolves with the unchanged tree.
 */
function gatherFrontMatterData() {
  const findByTag = (nodes, tagName) => nodes.find((node) => node.tagName === tagName)

  return async (tree, file) => {
    const title = findByTag(tree.children, 'title').children[0].value
    const articleContent = findByTag(tree.children, 'article').children
    const subtitle = articleContent.find((node) => node.properties && node.properties.dataField === 'subtitle')
    const footerContent = findByTag(articleContent, 'footer').children

    let date = ''
    for (const footerNode of footerContent) {
      if (!footerNode.children) {
        continue
      }
      for (const child of footerNode.children) {
        if (child.tagName !== 'a') {
          continue
        }
        const time = findByTag(child.children, 'time')
        if (time) {
          date = time.properties.dateTime.split('T')[0]
        }
      }
    }

    const slug = slugify(title, { lower: true })
    const excerpt = subtitle ? subtitle.children[0].value : ''
    file.frontmatter = { slug, date, title, excerpt }
    return tree
  }
}
/**
 * Remark plugin that prepends a YAML front matter node built from
 * `file.frontmatter` (`slug`, `date`, `title`, `excerpt`).
 *
 * Each value is serialised with JSON.stringify, which yields a valid YAML
 * double-quoted scalar, so quotes or backslashes in a title can no longer
 * break the front matter. The excerpt is written under the key `excerpt`
 * (it was previously misspelled `except`).
 *
 * @returns {(tree: object, file: object) => void} Transformer that mutates
 *   `tree.children` in place.
 */
function setFrontMatter() {
  const quote = (value) => JSON.stringify(String(value))
  return (tree, file) => {
    const { slug, date, title, excerpt } = file.frontmatter
    tree.children.unshift({
      type: 'yaml',
      // The empty first and last entries keep the leading and trailing newline of the original template
      value: [
        '',
        `slug: ${quote(slug)}`,
        `date: ${quote(date)}`,
        `title: ${quote(title)}`,
        `excerpt: ${quote(excerpt.trim())}`,
        '',
      ].join('\n'),
    })
  }
}
/**
 * Rehype plugin that strips Medium's export chrome from the tree:
 * - keeps only the article children whose `dataField` is `body`
 * - removes `<hr>` elements with the class `section-divider`
 * - removes `<h3>` elements with the class `graf--title`
 *
 * @returns {(tree: object) => void} Transformer that mutates the tree in place.
 */
function removeMediumExtras() {
  // Elements without a class list simply do not match
  const hasClass = (node, name) => {
    const className = node.properties ? node.properties.className : undefined
    return className ? className.includes(name) : false
  }

  const removeElements = (tree, tagName, className) => {
    visit(tree, { tagName }, (node, index, parent) => {
      if (!parent || typeof index !== 'number' || !hasClass(node, className)) {
        return undefined
      }
      parent.children.splice(index, 1)
      // Resume at the same index so the sibling that moved into this slot is not skipped
      return index
    })
  }

  return (tree) => {
    const article = tree.children.find(x => x.tagName === 'article')
    article.children = article.children.filter((node) => node.properties && node.properties.dataField === 'body')
    removeElements(tree, 'hr', 'section-divider')
    removeElements(tree, 'h3', 'graf--title')
  }
}
/**
 * Plugin that creates the folder `<outputFolder>/<date>-<slug>` for the post
 * and records its path on `file.outputFolder`.
 *
 * The folder is created recursively, so an existing folder (re-run) or a
 * missing parent folder is not an error. The promise-based `fs.mkdir` takes no
 * callback, so the previous EEXIST check never ran.
 *
 * @param {{ outputFolder: string }} options - Base folder for all posts.
 * @returns {(tree: object, file: object) => Promise<void>} Transformer; a
 *   failure to create the folder is logged, not thrown.
 */
function createDirectoryForPost(options) {
  return async (tree, file) => {
    const { date, slug } = file.frontmatter
    const fullPath = path.join(options.outputFolder, `${date}-${slug}`)
    file.outputFolder = fullPath
    try {
      await fs.mkdir(fullPath, { recursive: true })
    } catch (error) {
      console.error(`Failed to create output folder at ${fullPath}`, error)
    }
  }
}
// Newline text node placed around the children of a figure before it is serialised to HTML
const paddingNode = { type: 'text', value: '\n' }
/**
 * Extract the plain-text caption from a `<figcaption>` node.
 *
 * Text is collected recursively, so captions made of a single link or of
 * nested inline elements are no longer lost or returned as `undefined`. Runs
 * of whitespace are collapsed and the result is trimmed.
 *
 * @param {object} node - The figcaption node.
 * @returns {string} The caption text, or '' when the node has no text.
 */
function getCaption(node) {
  const collectText = (current) => {
    if (current.type === 'text') {
      return current.value || ''
    }
    return (current.children || []).map(collectText).join('')
  }
  return collectText(node).replace(/\s+/g, ' ').trim()
}
/**
 * Convert one exported Medium HTML file into `<outputFolder>/<date>-<slug>/index.mdx`.
 *
 * The HTML is parsed, front matter data is gathered, the post folder is
 * created, images and gists are pulled in, Medium's extras are removed and the
 * result is converted to Markdown with a YAML front matter block.
 *
 * @param {string} filePath - Path of the exported HTML file.
 * @param {string} outputFolder - Folder that receives one sub-folder per post.
 * @returns {Promise<void>} Settles after the write attempt has finished; a
 *   failed write is logged, not thrown.
 */
async function convertHtmlToMarkdown(filePath, outputFolder) {
  const file = await unified()
    .use(rehypeParse, { fragment: true })
    .use(gatherFrontMatterData)
    .use(createDirectoryForPost, { outputFolder })
    .use(rehypeDownloadImages)
    .use(rehypeInlineGistScript)
    .use(removeMediumExtras)
    .use(rehypeRemark, {
      handlers: { // defines how to handle specific HTML tags
        // Figures are kept as raw HTML so the image and its caption stay together
        figure(h, node) {
          const captionNode = node.children.find(child => child.tagName === 'figcaption')
          const caption = captionNode ? getCaption(captionNode) : ''
          const cleansedChildren = node.children.map((child) => {
            // Add the figcaption text to the img so when converted to Markdown it will use that
            if (child.tagName === 'img') {
              return {
                ...child,
                properties: {
                  src: child.properties.src,
                  alt: child.properties.alt || caption
                }
              }
            }
            if (child.tagName === 'figcaption') {
              return { ...child, properties: {} }
            }
            return child
          })
          const cleansedNode = {
            ...node,
            properties: {},
            // Surround every child with a newline text node
            children: cleansedChildren.reduce((acc, child) => {
              acc.push(child, paddingNode)
              return acc
            }, [paddingNode])
          }
          return h(cleansedNode, 'html', toHtml(cleansedNode, { closeSelfClosing: true }))
        },
      }
    })
    .use(frontmatter)
    .use(setFrontMatter)
    .use(remarkStringify)
    .process(await read(filePath))

  const fullPath = path.join(outputFolder, `${file.frontmatter.date}-${file.frontmatter.slug}/index.mdx`)
  // NOTE(review): the pattern in the reviewed copy was garbled (`/ | |/g`, whose
  // empty alternative would insert a space between all characters). Restored as
  // the `&#x20;` character reference, presumably emitted by remark-stringify for
  // spaces it considers unsafe — confirm against the original script.
  const cleanedFileContent = String(file).replace(/&#x20;/g, ' ')
  try {
    // Promise-based write so this function only resolves once the file is on disk
    await fs.writeFile(fullPath, cleanedFileContent)
  } catch (err) {
    console.error(`Failed to write file at ${fullPath}`, err)
  }
}
/**
 * Convert every `.html` file in `directory` (relative to the current working
 * directory) into a post under `./gatsby-posts`.
 *
 * The output folder is created recursively so that re-running the script with
 * an existing `gatsby-posts` folder no longer aborts the whole batch.
 *
 * @param {string} directory - Folder containing the exported HTML files.
 * @returns {Promise<void>} Always resolves; any failure is logged.
 */
async function processFilesInDirectory(directory) {
  try {
    const cwd = path.resolve()
    const fullPath = path.join(cwd, directory)
    const outputPath = path.join(cwd, 'gatsby-posts')
    const files = await fs.readdir(fullPath)
    await fs.mkdir(outputPath, { recursive: true })
    const htmlFiles = files.filter(file => path.extname(file) === '.html')
    await Promise.all(htmlFiles.map((file) => convertHtmlToMarkdown(path.join(fullPath, file), outputPath)))
  } catch (error) {
    console.error('Failed to process files in directory', error)
  }
}
// Entry point: convert the export found in ./posts, relative to the current
// working directory. The promise is deliberately not awaited because
// processFilesInDirectory catches and logs its own errors.
void processFilesInDirectory('posts')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment