Skip to content

Instantly share code, notes, and snippets.

@colinfwren
Last active September 21, 2022 21:18
Show Gist options
  • Save colinfwren/3e35388ec13d03e6811f3cb793ee31c0 to your computer and use it in GitHub Desktop.
Save colinfwren/3e35388ec13d03e6811f3cb793ee31c0 to your computer and use it in GitHub Desktop.
A script to convert a Medium export into Markdown usable by Gatsby
import fetch from 'node-fetch'
import {read} from 'to-vfile'
import {unified} from 'unified'
import rehypeParse from 'rehype-parse'
import rehypeRemark from 'rehype-remark'
import remarkStringify from 'remark-stringify'
import { writeFile, promises as fs } from 'fs'
import { selectAll } from "hast-util-select";
import {toHtml} from "hast-util-to-html";
import slugify from "slugify";
import frontmatter from 'remark-frontmatter'
import {visit} from "unist-util-visit";
import path from 'path'
/**
 * Downloads the resource at `url` and stores it at `pathName`.
 *
 * The returned promise settles only after the write has finished (or failed),
 * so callers that await it can rely on the file being present afterwards.
 *
 * @param {string} url - Address of the image to download.
 * @param {string} pathName - Destination path on disk.
 * @returns {Promise<string>} `pathName` when the image was fetched, or `url`
 *   when the download itself failed.
 */
async function downloadImage(url, pathName) {
  let imageData
  try {
    const imageResp = await fetch(url)
    // fetch only rejects on network errors; an HTTP error page must not be
    // saved to disk as if it were the image.
    if (!imageResp.ok) {
      throw new Error(`Unexpected response status ${imageResp.status}`)
    }
    imageData = await imageResp.arrayBuffer()
  } catch (error) {
    console.error('Failed to fetch', url, error)
    return url
  }
  try {
    // `fs` is the promise-based API, so this await really waits for the write
    // (the callback-style writeFile returns undefined immediately).
    await fs.writeFile(pathName, Buffer.from(imageData))
    console.log(`Wrote ${pathName}`)
  } catch (error) {
    console.error(`Failed to write ${pathName}`, error)
  }
  return pathName
}
/**
 * Fetches the raw source of a gist given the URL of its embed script.
 *
 * Everything from the first `.js` onwards is dropped from `url` and `/raw`
 * is appended to build the address that is requested.
 *
 * @param {string} url - `src` of the gist embed `<script>`.
 * @returns {Promise<string|false>} The response text, or `false` when the
 *   request failed or answered with a non-success status.
 */
async function downloadGistCode(url) {
  try {
    const rawUrl = `${url.split('.js')[0]}/raw`
    const codeResp = await fetch(rawUrl)
    // Without this check an error page body would be returned as if it were
    // the gist's code.
    if (!codeResp.ok) {
      throw new Error(`Unexpected response status ${codeResp.status}`)
    }
    return await codeResp.text()
  } catch (error) {
    console.error('Failed to fetch', url, error)
    return false
  }
}
/**
 * Derives a local file name from an image URL.
 *
 * @param {string} src - Image source; the text after the last `/` is used.
 * @returns {string} That last segment, with `.jpg` appended when it contains
 *   no dot at all.
 */
function getFilename(src) {
  const lastSegment = src.slice(src.lastIndexOf('/') + 1)
  return lastSegment.includes('.') ? lastSegment : `${lastSegment}.jpg`
}
/**
 * Plugin that downloads every `img` in the tree into `file.outputFolder`
 * and rewrites each `src` to the bare local file name.
 *
 * @returns {(tree: object, file: object) => Promise<object>} Transformer that
 *   resolves with the same tree once all downloads have been attempted.
 */
function rehypeDownloadImages() {
  return async function transformer(tree, file) {
    // Handles a single image: work out its local name, download it, then
    // point the node at the local copy.
    const localiseImage = async (imgNode) => {
      const remoteSrc = imgNode.properties.src
      const localName = getFilename(remoteSrc)
      const destination = path.join(file.outputFolder, localName)
      await downloadImage(remoteSrc, destination)
      imgNode.properties.src = localName
      return imgNode
    }

    const images = selectAll('img', tree)
    await Promise.all(images.map((imgNode) => localiseImage(imgNode)))
    return tree
  }
}
/**
 * Plugin that replaces gist embed `<script>` elements with a text node
 * holding the gist's code wrapped in a fenced code block.
 *
 * Scripts without a `src`, scripts whose `src` does not contain `gist`, and
 * gists whose download failed are left untouched.
 *
 * @returns {(tree: object) => Promise<object>} Transformer resolving with the
 *   same (mutated) tree.
 */
function rehypeInlineGistScript() {
  return async (tree) => {
    const scripts = selectAll('script', tree)
    await Promise.all(scripts.map(async (node) => {
      const src = node.properties && node.properties.src
      // Inline scripts carry no src; calling string methods on it would throw.
      if (typeof src !== 'string' || !src.includes('gist')) {
        return node
      }
      const code = await downloadGistCode(src)
      // downloadGistCode signals failure with `false`; do not inline the
      // literal text "false" as if it were the gist's content.
      if (code === false) {
        return node
      }
      // Turn the element into a plain text node in place so its position in
      // the parent is preserved.
      node.properties = {}
      node.type = 'text'
      node.value = '\n```\n' + code + '\n```\n'
      return node
    }))
    return tree
  }
}
/**
 * Plugin that reads the post's metadata out of the parsed HTML and stores it
 * on `file.frontmatter` as `{ slug, date, title, excerpt }`.
 *
 * - title: text of the first child of the top-level `title` element
 * - excerpt: text of the first child of the article child whose
 *   `dataField` is `subtitle` (empty string when there is none)
 * - date: the part before `T` of the `dateTime` of the last `time` element
 *   found directly inside a link of a footer child (empty string when none)
 *
 * @returns {(tree: object, file: object) => Promise<object>} Transformer
 *   resolving with the unchanged tree.
 */
function gatherFrontMatterData() {
  const withTag = (tagName) => (node) => node.tagName === tagName

  return async (tree, file) => {
    const titleElement = tree.children.find(withTag('title'))
    const title = titleElement.children[0].value

    const articleContent = tree.children.find(withTag('article')).children
    const subtitle = articleContent.find((node) => {
      return node.properties && node.properties.dataField === 'subtitle'
    })
    const footerContent = articleContent.find(withTag('footer')).children

    // Later matches overwrite earlier ones, so the last <time> wins.
    let date = ''
    for (const footerChild of footerContent) {
      if (!footerChild.children) {
        continue
      }
      for (const link of footerChild.children.filter(withTag('a'))) {
        const time = link.children.find(withTag('time'))
        if (time) {
          date = time.properties.dateTime.split('T')[0]
        }
      }
    }

    const excerpt = subtitle ? subtitle.children[0].value : ''
    file.frontmatter = {
      slug: slugify(title, { lower: true }),
      date,
      title,
      excerpt,
    }
    return tree
  }
}
/**
 * Plugin that prepends a `yaml` node built from `file.frontmatter` to the
 * Markdown tree so it is serialised as the document's front matter.
 *
 * Emits the keys `slug`, `date`, `title` and `excerpt` (the excerpt is
 * trimmed). Values are written as double-quoted scalars with quotes and
 * backslashes escaped, so a title containing `"` still yields valid YAML.
 *
 * @returns {(tree: object, file: object) => void} Transformer mutating `tree`.
 */
function setFrontMatter() {
  // JSON string syntax is also valid for YAML double-quoted scalars and takes
  // care of escaping embedded quotes, backslashes and control characters.
  const quote = (value) => JSON.stringify(String(value))

  return (tree, file) => {
    const { slug, date, title, excerpt } = file.frontmatter
    const lines = [
      '',
      `slug: ${quote(slug)}`,
      `date: ${quote(date)}`,
      `title: ${quote(title)}`,
      // Key was previously misspelled as "except".
      `excerpt: ${quote(excerpt.trim())}`,
      '',
    ]
    tree.children.unshift({
      type: 'yaml',
      value: lines.join('\n'),
    })
  }
}
/**
 * Plugin that strips Medium-specific chrome from the parsed export:
 *
 * - keeps only the article children whose `dataField` is `body`
 * - removes `hr` elements carrying the `section-divider` class
 * - removes `h3` elements carrying the `graf--title` class
 *
 * @returns {(tree: object) => void} Transformer mutating `tree` in place.
 */
function removeMediumExtras() {
  // Elements without a class attribute have no className at all, so guard
  // before calling includes on it.
  const hasClass = (node, className) => {
    const classes = node.properties && node.properties.className
    const isListOrString = Array.isArray(classes) || typeof classes === 'string'
    return isListOrString && classes.includes(className)
  }

  // Removes every `tagName` element that carries `className`.
  const removeByClass = (tree, tagName, className) => {
    visit(tree, { tagName }, (node, index, parent) => {
      if (!parent || typeof index !== 'number' || !hasClass(node, className)) {
        return undefined
      }
      parent.children.splice(index, 1)
      // After the splice the next sibling sits at `index`; returning it makes
      // the traversal continue there instead of skipping that sibling.
      return index
    })
  }

  return (tree) => {
    const article = tree.children.find((node) => node.tagName === 'article')
    if (article) {
      article.children = article.children.filter((node) => {
        return node.properties && node.properties.dataField === 'body'
      })
    }
    removeByClass(tree, 'hr', 'section-divider')
    removeByClass(tree, 'h3', 'graf--title')
  }
}
/**
 * Plugin that creates the output folder for the current post and records its
 * path on `file.outputFolder`.
 *
 * The folder is `<options.outputFolder>/<date>-<slug>`, using the values on
 * `file.frontmatter`. An already existing folder is not an error.
 *
 * @param {{ outputFolder: string }} options - Base folder for all posts.
 * @returns {(tree: object, file: object) => Promise<void>} Transformer.
 */
function createDirectoryForPost(options) {
  return async (tree, file) => {
    const { date, slug } = file.frontmatter
    const fullPath = path.join(options.outputFolder, `${date}-${slug}`)
    file.outputFolder = fullPath
    try {
      // `fs` is the promise API: it takes no callback, and `recursive: true`
      // makes the call succeed when the folder already exists.
      await fs.mkdir(fullPath, { recursive: true })
    } catch (error) {
      console.error(`Failed to create output folder at ${fullPath}`, error)
    }
  }
}
// Text node holding a single newline; placed around a figure's children
// before the figure is serialised to HTML.
const paddingNode = { type: 'text', value: '\n' }
/**
 * Extracts the plain-text caption from a node (used for `figcaption`).
 *
 * The text of all descendant text nodes is concatenated in document order,
 * runs of whitespace are collapsed to a single space and the result is
 * trimmed. Text inside any nested inline element (links, emphasis, ...) is
 * included.
 *
 * @param {object} node - Node whose descendants hold the caption text.
 * @returns {string} The caption, or an empty string when there is no text.
 */
function getCaption(node) {
  const collectText = (current) => {
    if (current.type === 'text') {
      return typeof current.value === 'string' ? current.value : ''
    }
    return (current.children || []).map(collectText).join('')
  }
  return collectText(node).replace(/\s+/g, ' ').trim()
}
/**
 * rehype-remark handler for `figure` elements: keeps the figure as raw HTML
 * in the Markdown output instead of converting it.
 *
 * The figcaption text becomes the image's `alt` when the image has none, all
 * other attributes on the figure, image and figcaption are dropped, and a
 * newline text node is placed around every child before serialising.
 *
 * @param {Function} h - Node builder supplied by rehype-remark.
 * @param {object} node - The `figure` element.
 * @returns {object} An `html` node containing the serialised figure.
 */
function figureToHtml(h, node) {
  const captionNode = node.children.find((child) => child.tagName === 'figcaption')
  const caption = captionNode ? getCaption(captionNode) : ''
  const cleansedChildren = node.children.map((child) => {
    if (child.tagName === 'img') {
      return {
        ...child,
        properties: {
          src: child.properties.src,
          alt: child.properties.alt || caption,
        },
      }
    }
    if (child.tagName === 'figcaption') {
      return { ...child, properties: {} }
    }
    return child
  })
  const cleansedNode = {
    ...node,
    properties: {},
    children: cleansedChildren.reduce((acc, child) => {
      acc.push(child)
      acc.push(paddingNode)
      return acc
    }, [paddingNode]),
  }
  return h(cleansedNode, 'html', toHtml(cleansedNode, { closeSelfClosing: true }))
}

/**
 * Converts one exported Medium HTML file into
 * `<outputFolder>/<date>-<slug>/index.mdx`.
 *
 * The returned promise settles only after the Markdown file has been written
 * (or the write has failed and been logged).
 *
 * @param {string} filePath - Path of the HTML file to convert.
 * @param {string} outputFolder - Base folder that receives one folder per post.
 * @returns {Promise<void>}
 */
async function convertHtmlToMarkdown(filePath, outputFolder) {
  // `process` resolves with the virtual file; the plugins above have stored
  // the front matter data on it.
  const file = await unified()
    .use(rehypeParse, { fragment: true })
    .use(gatherFrontMatterData)
    .use(createDirectoryForPost, { outputFolder })
    .use(rehypeDownloadImages)
    .use(rehypeInlineGistScript)
    .use(removeMediumExtras)
    .use(rehypeRemark, {
      handlers: { // defines how to handle specific HTML tags
        figure: figureToHtml,
      },
    })
    .use(frontmatter)
    .use(setFrontMatter)
    .use(remarkStringify)
    .process(await read(filePath))

  const { date, slug } = file.frontmatter
  const fullPath = path.join(outputFolder, `${date}-${slug}`, 'index.mdx')

  // Replace non-breaking and other special Unicode spaces with a regular
  // space. Written as explicit escapes because literal invisible characters
  // in a pattern are easily lost, and an empty alternative would match at
  // every position.
  // NOTE(review): assumes NBSP and thin/hair spaces are what the export
  // contains; confirm against a real Medium export.
  const cleanedFileContent = String(file).replace(/[\u00a0\u2000-\u200a\u202f\u205f\u3000]/g, ' ')

  try {
    // Promise-based write so that awaiting this function really waits for
    // the file to be on disk.
    await fs.writeFile(fullPath, cleanedFileContent)
  } catch (err) {
    console.error(`Failed to write file at ${fullPath}`, err)
  }
}
/**
 * Converts every `.html` file directly inside `directory` and writes the
 * results below `gatsby-posts`; both folders are resolved against the
 * current working directory.
 *
 * Errors are logged rather than propagated.
 *
 * @param {string} directory - Folder holding the exported HTML files.
 * @returns {Promise<void>}
 */
async function processFilesInDirectory(directory) {
  try {
    const cwd = path.resolve()
    const fullPath = path.join(cwd, directory)
    const outputPath = path.join(cwd, 'gatsby-posts')
    const files = await fs.readdir(fullPath)
    // `recursive: true` keeps a re-run from failing with EEXIST when the
    // output folder is already there.
    await fs.mkdir(outputPath, { recursive: true })
    const htmlFiles = files.filter((file) => path.extname(file) === '.html')
    await Promise.all(htmlFiles.map((file) => {
      return convertHtmlToMarkdown(path.join(fullPath, file), outputPath)
    }))
  } catch (error) {
    console.error('Failed to process files in directory', error)
  }
}
// Script entry point: convert the files found in the `posts` folder.
processFilesInDirectory('posts')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment