Skip to content

Instantly share code, notes, and snippets.

@glaforge
Created March 7, 2023 17:40
Show Gist options
  • Save glaforge/d8535e0f6b1e8d475fb0103184f9432d to your computer and use it in GitHub Desktop.
Save glaforge/d8535e0f6b1e8d475fb0103184f9432d to your computer and use it in GitHub Desktop.
Transform my old blog posts into Hugo-friendly Markdown article documents
@Grab('org.jsoup:jsoup:1.15.4')
import org.jsoup.Jsoup
import org.jsoup.safety.Safelist
@Grab('io.github.furstenheim:copy_down:1.1')
import io.github.furstenheim.*
import java.nio.file.Paths
import java.nio.file.Files
import java.text.SimpleDateFormat
// Parses the human-readable date text scraped from each post (pattern "dd MMM, yyyy", e.g. "07 Mar, 2023").
final sdfFrom = new SimpleDateFormat('dd MMM, yyyy', Locale.ENGLISH)
// Formats a post date as the nested year/month/day folder path used under the output folder.
final sdfTo = new SimpleDateFormat('yyyy/MM/dd', Locale.ENGLISH)
// Formats a post date as the timestamp written to the front matter's "date" field.
// The locale is pinned like the two formatters above: without it, SimpleDateFormat
// picks the calendar of the JVM's default locale, which can yield non-Gregorian years.
final sdfIso = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss.SSSXXX", Locale.ENGLISH)

// Root of the blog being scraped; article links found on archive pages are appended to it.
final baseUrl = 'https://glaforge.appspot.com'
// Archive page URL prefix; the page number is appended directly after the trailing "p".
final baseArchivePageUrl = baseUrl + '/archives/p'
// Archive page numbers to crawl (inclusive range).
final pageRange = 1..49

// Destination folder for the generated Markdown files.
final outputFolder = "/tmp/blog-output"
// Start from a clean slate: remove any output left over from a previous run.
Paths.get(outputFolder).deleteDir()
// Crawl every archive page and flatten the article links found on each one
// into a single list of href values (one entry per article).
def articleUrls = pageRange.collectMany { int pageNumber ->
    def archiveDoc = Jsoup.connect(baseArchivePageUrl + pageNumber).get()
    // each article title on an archive page is an <a> nested in .archive-post-title > h3
    def titleLinks = archiveDoc.select('.archive-post-title > h3 > a')
    titleLinks*.attr('href')
}
// For every collected article link: download the page, extract its metadata and body,
// convert the body to Markdown, and write <outputFolder>/<yyyy/MM/dd>/<slug>.md
// with a YAML front matter header.
articleUrls.each { relArticleUrl ->
    def fullArticleUrl = baseUrl + relArticleUrl
    // the slug is whatever follows the last '/' of the article URL
    def slug = fullArticleUrl.substring(fullArticleUrl.lastIndexOf('/') + 1)
    def articlePageDoc = Jsoup.connect(fullArticleUrl).get()

    // get date: keep the text between offset 10 and the first '(' of the .post-date block
    // NOTE(review): presumably the first 10 characters are a fixed label preceding the
    // date and the parenthesis starts trailing info -- confirm against the page markup.
    def dateBlockText = articlePageDoc.select('.post-date').text()
    dateBlockText = dateBlockText.substring(10, dateBlockText.indexOf('(')).trim()
    def date = sdfFrom.parse(dateBlockText)
    def formattedDate = sdfTo.format(date)
    def isoDate = sdfIso.format(date)

    // get title
    def title = articlePageDoc.select('.post-title').text()
    // The title goes inside a double-quoted YAML scalar, so backslashes and double
    // quotes must be escaped or the generated front matter becomes invalid.
    def yamlTitle = title.replace('\\', '\\\\').replace('"', '\\"')

    // get categories: lower-cased, spaces turned into hyphens, first "-platform" occurrence dropped
    def tags = articlePageDoc.select('.post-meta a').collect { it.text().toLowerCase().replaceAll(' ', '-') - '-platform' }

    // get article content, sanitized down to jsoup's "basic with images" safelist
    def articleBody = articlePageDoc.select('.post-body').first().outerHtml()
    def safelist = Safelist.basicWithImages()//.removeTags('span')
    def sanitizedHtml = Jsoup.clean(articleBody, fullArticleUrl, safelist)

    // turn into markdown ("#"-style headings, fenced code blocks)
    def options = OptionsBuilder.anOptions()
            .withHeadingStyle(HeadingStyle.ATX)
            .withCodeBlockStyle(CodeBlockStyle.FENCED)
            .build()
    def toMd = new CopyDown(options)
    def md = toMd.convert(sanitizedHtml)

    // output with front matter; createFile fails if the same date/slug was already written
    def mdFilePath = Files.createFile(Files.createDirectories(Paths.get(outputFolder, formattedDate)).resolve(slug + '.md'))
    println mdFilePath

    // Built line by line so the result does not depend on how this script is indented.
    def frontMatter = [
            '---',
            "title: \"${yamlTitle}\"",
            "date: \"${isoDate}\"",
            "tags: [${tags.join(', ')}]",
            '---',
            ''
    ].join('\n')

    // Write explicitly as UTF-8 instead of relying on the platform default charset,
    // so non-ASCII characters in titles and articles survive on any JVM.
    Files.write(mdFilePath, (frontMatter + md).getBytes('UTF-8'))
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment