Skip to content

Instantly share code, notes, and snippets.

@clipperhouse
Last active April 29, 2019 18:23
Show Gist options
  • Save clipperhouse/010d4666892807afee16ba7711b41401 to your computer and use it in GitHub Desktop.
Save clipperhouse/010d4666892807afee16ba7711b41401 to your computer and use it in GitHub Desktop.
A Go script to convert Medium export (HTML) to Markdown, for use with Hugo. It’s a one-use type of thing for me, so it ain’t beautiful.
// A quick script for converting Medium HTML export files to Markdown, suitable for use with a static site generator such as Hugo or Jekyll.
package main
import (
"fmt"
"io/ioutil"
"log"
"os"
"path/filepath"
"regexp"
"strings"
"text/template"
"github.com/PuerkitoBio/goquery"
"github.com/lunny/html2md"
)
// Input and output locations used by main. Both are hard-coded; edit them
// before running the script.
var (
	// src is the location of the exported, unzipped Medium HTML files.
	src = "/Users/mwsherman/medium-export"

	// dest is the destination for the Markdown files, perhaps the content
	// folder for Hugo or Jekyll.
	dest = "/Users/mwsherman/tmp"
)
// main converts every ".html" file found directly inside src into a Markdown
// file inside dest. The output file name is derived from the post title via
// slug. Files whose name starts with "draft_" are marked as drafts.
//
// Any failure to list src or to read a post aborts the run with a message
// naming the offending path. Posts for which no title or no body could be
// extracted are skipped, and the skip is logged so it does not go unnoticed.
func main() {
	entries, err := ioutil.ReadDir(src)
	if err != nil {
		log.Fatalf("reading export directory %q: %v", src, err)
	}

	for _, entry := range entries {
		name := entry.Name()
		// Only regular HTML exports are posts; a directory that happens to be
		// named "*.html" cannot be parsed and is ignored.
		if entry.IsDir() || !strings.HasSuffix(name, ".html") {
			continue
		}

		inpath := filepath.Join(src, name)
		doc, err := read(inpath)
		if err != nil {
			log.Fatalf("reading %q: %v", inpath, err)
		}

		p := process(doc)
		p.Draft = strings.HasPrefix(name, "draft_")

		if len(p.Title) == 0 || len(p.Body) == 0 {
			log.Printf("skipping %q: no title or body found", inpath)
			continue
		}

		outpath := filepath.Join(dest, slug(p.Title)+".md")
		write(p, outpath)
	}
}
// post is one converted Medium article: the values that end up in the front
// matter of the generated Markdown file, plus the Markdown body itself.
type post struct {
	Title  string // text of the HTML <title> element
	Author string // text of the ".p-author.h-card" element
	Date   string // "datetime" attribute of the <time> element, copied verbatim
	Body   string // article content converted to Markdown
	Draft  bool   // true when the export file name starts with "draft_"
}
// nbsp is a rune mapping, suitable for strings.Map, that turns a no-break
// space (U+00A0) into an ordinary space and passes every other rune through
// unchanged.
func nbsp(r rune) rune {
	const noBreakSpace = '\u00A0'

	switch r {
	case noBreakSpace:
		return ' '
	default:
		return r
	}
}
// process pulls the pieces of a post out of a parsed Medium export.
//
// The title is the text of the <title> element, the date is the "datetime"
// attribute of the <time> element, and the author is the text of the
// ".p-author.h-card" element. The body is built by converting every
// "div.section-inner" element to Markdown and concatenating the results in
// document order. Draft is left at its zero value for the caller to set.
func process(doc *goquery.Document) post {
	var markdown string
	sections := doc.Find("div.section-inner")
	sections.Each(func(_ int, section *goquery.Selection) {
		// An error from Html is discarded; the section then contributes
		// whatever Convert makes of the returned string.
		inner, _ := section.Html()
		markdown += html2md.Convert(inner)
	})

	heading := doc.Find("title").Text()

	// Replace no-break spaces with ordinary spaces.
	markdown = strings.Map(nbsp, markdown)
	// post body shouldn't repeat the title
	markdown = strings.TrimPrefix(markdown, fmt.Sprintf("### %s", heading))
	markdown = strings.TrimSpace(markdown)

	published, _ := doc.Find("time").Attr("datetime")
	byline := doc.Find(".p-author.h-card").Text()

	return post{
		Title:  heading,
		Author: byline,
		Date:   published,
		Body:   markdown,
	}
}
// read opens the HTML file at path and parses it into a goquery document.
//
// Both failure modes, being unable to open the file and being unable to
// parse it, are reported through the returned error so the caller decides how
// to react. The file is closed before read returns.
func read(path string) (*goquery.Document, error) {
	f, err := os.Open(path)
	if err != nil {
		// The *os.PathError from Open already names the operation and path.
		return nil, err
	}
	defer f.Close()

	// Load the HTML document
	return goquery.NewDocumentFromReader(f)
}
// write renders post through tmpl into a newly created file at path,
// truncating any file that already exists there.
//
// It panics if the file cannot be created, if the template fails to execute,
// or if closing the file reports an error. The Close error is checked
// explicitly because a failed flush of a freshly written file would otherwise
// go unnoticed.
func write(post post, path string) {
	f, err := os.Create(path)
	if err != nil {
		panic(err)
	}

	if err := tmpl.Execute(f, post); err != nil {
		// Best-effort cleanup; the Execute error is the one worth reporting.
		f.Close()
		panic(err)
	}

	if err := f.Close(); err != nil {
		panic(err)
	}
}
var (
	// spaces matches a run of whitespace; slug turns each run into one hyphen.
	spaces = regexp.MustCompile(`[\s]+`)

	// notallowed matches every character that is not a letter, a number, a
	// dot or whitespace; slug deletes these.
	notallowed = regexp.MustCompile(`[^\p{L}\p{N}.\s]`)

	// athe matches a leading "a-" or "the-" in a lower-cased, hyphenated slug.
	athe = regexp.MustCompile(`^(a\-|the\-)`)

	// spelledOut replaces symbols that carry meaning in a title with words,
	// so that they survive the removal of punctuation.
	spelledOut = strings.NewReplacer("%", " percent", "#", " sharp")
)

// slug turns a post title into a lower-case, hyphen-separated file name stem.
//
// "%" and "#" are spelled out as "percent" and "sharp". Every other character
// that is not a letter, number, dot or whitespace is dropped. Whitespace runs
// become single hyphens, and a leading "a-" or "the-" is removed.
//
// Leading and trailing whitespace is trimmed before hyphenation, including
// whitespace exposed by dropping punctuation such as the "?" in "Why ?". This
// keeps the slug from starting or ending with a hyphen, and lets the leading
// article be recognised.
func slug(s string) string {
	result := spelledOut.Replace(s)
	result = notallowed.ReplaceAllString(result, "")
	result = strings.TrimSpace(result)
	result = spaces.ReplaceAllString(result, "-")
	result = strings.ToLower(result)
	return athe.ReplaceAllString(result, "")
}
// tmpl renders a post as a Markdown document: a front matter block delimited
// by "---" lines, followed by the body.
//
// Title and Author are emitted with printf "%q" rather than being pasted
// between literal quotes, so that a double quote or backslash inside either
// value is escaped instead of terminating the quoted string early. The
// "draft" line is only filled in when Draft is true; otherwise that line is
// left empty.
var tmpl = template.Must(template.New("").Parse(`---
title: {{ printf "%q" .Title }}
date: {{ .Date }}
author: {{ printf "%q" .Author }}
{{ if eq .Draft true }}draft: {{ .Draft }}{{end}}
---
{{ .Body }}
`))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment