glaforge/fetchPostsToMD.groovy

## fetchPostsToMD.groovy
@Grab('org.jsoup:jsoup:1.15.4')
import org.jsoup.Jsoup
import org.jsoup.safety.Safelist

@Grab('io.github.furstenheim:copy_down:1.1')
import io.github.furstenheim.*

import java.nio.file.Paths
import java.nio.file.Files
import java.text.SimpleDateFormat

// --- Where to crawl ---------------------------------------------------------
// Root of the blog; archive index pages are addressed as <baseArchivePageUrl><n>.
final String baseUrl = 'https://glaforge.appspot.com'
final String baseArchivePageUrl = baseUrl + '/archives/p'
// Archive index pages to visit (inclusive on both ends).
final IntRange pageRange = 1..49

// --- Date handling ----------------------------------------------------------
// Parses the date text scraped from a post, e.g. "05 Mar, 2021".
final SimpleDateFormat sdfFrom = new SimpleDateFormat('dd MMM, yyyy', Locale.ENGLISH)
// Produces the yyyy/MM/dd fragment used as the per-post output sub-directory.
final SimpleDateFormat sdfTo = new SimpleDateFormat('yyyy/MM/dd', Locale.ENGLISH)
// Produces the ISO-8601 timestamp written into the front matter.
final SimpleDateFormat sdfIso = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss.SSSXXX")

// --- Where to write ---------------------------------------------------------
final String outputFolder = '/tmp/blog-output'
// Start from a clean slate: recursively remove any output of a previous run.
Paths.get(outputFolder).deleteDir()

// Visit every archive index page and gather, in page order, the href of each
// post title link found on it. The hrefs are kept exactly as they appear in the
// page; the site root is prepended later when each article is fetched.
def articleUrls = pageRange.collectMany { int pageId ->
    Jsoup.connect(baseArchivePageUrl + pageId)
        .get()
        .select('.archive-post-title > h3 > a')*.attr('href')
}

// Download every collected article, convert its body to Markdown and write it,
// preceded by YAML front matter, to <outputFolder>/<yyyy/MM/dd>/<slug>.md.
articleUrls.each { relArticleUrl ->
    def fullArticleUrl = baseUrl + relArticleUrl
    // The slug is whatever follows the last '/' of the article URL.
    def slug = fullArticleUrl.substring(fullArticleUrl.lastIndexOf('/') + 1)
    def articlePageDoc = Jsoup.connect(fullArticleUrl).get()

    // get date
    // The date starts at offset 10 of the '.post-date' text (presumably right
    // after a fixed-length label -- confirm against the site's markup) and ends
    // just before the first '('. When no usable '(' follows, take the rest of
    // the text instead of failing inside substring() with a negative end index.
    def dateBlockText = articlePageDoc.select('.post-date').text()
    int parenIndex = dateBlockText.indexOf('(')
    int dateEnd = parenIndex >= 10 ? parenIndex : dateBlockText.length()
    def date = sdfFrom.parse(dateBlockText.substring(10, dateEnd).trim())
    def formattedDate = sdfTo.format(date)
    def isoDate = sdfIso.format(date)

    // get title
    def title = articlePageDoc.select('.post-title').text()
    // The title is emitted as a double-quoted YAML scalar: escape backslashes
    // first, then double quotes, so the front matter stays parseable.
    def yamlTitle = title.replace('\\', '\\\\').replace('"', '\\"')

    // get categories
    // Lower-case with a fixed locale (default-locale lower-casing is not stable
    // across machines), turn spaces into dashes and drop the first '-platform'.
    def tags = articlePageDoc.select('.post-meta a').collect {
        it.text().toLowerCase(Locale.ENGLISH).replace(' ', '-') - '-platform'
    }

    // get article content
    def postBody = articlePageDoc.select('.post-body').first()
    if (postBody == null) {
        // Fail with the offending URL rather than a bare NullPointerException.
        throw new IllegalStateException("No .post-body element found in ${fullArticleUrl}")
    }
    def safelist = Safelist.basicWithImages()//.removeTags('span')
    // The article URL is passed as base URI for the sanitizer.
    def sanitizedHtml = Jsoup.clean(postBody.outerHtml(), fullArticleUrl, safelist)

    // turn into markdown
    def options = OptionsBuilder.anOptions()
        .withHeadingStyle(HeadingStyle.ATX)
        .withCodeBlockStyle(CodeBlockStyle.FENCED)
        .build()
    def md = new CopyDown(options).convert(sanitizedHtml)

    // output with front matter
    // createFile() still fails if the same date/slug pair is produced twice.
    def targetDir = Files.createDirectories(Paths.get(outputFolder, formattedDate))
    def mdFilePath = Files.createFile(targetDir.resolve(slug + '.md'))
    println mdFilePath

    def frontMatter = """\
        ---
        title: "${yamlTitle}"
        date: "${isoDate}"
        tags: [${tags.join(', ')}]
        ---

    """.stripIndent()

    // Write front matter and body in one go, always as UTF-8, instead of
    // appending with the platform default charset.
    Files.write(mdFilePath, (frontMatter + md).getBytes('UTF-8'))
}
	@Grab('org.jsoup:jsoup:1.15.4')
	import org.jsoup.Jsoup
	import org.jsoup.safety.Safelist

	@Grab('io.github.furstenheim:copy_down:1.1')
	import io.github.furstenheim.*

	import java.nio.file.Paths
	import java.nio.file.Files
	import java.text.SimpleDateFormat

	// --- Where to crawl ---
	// Root of the blog; archive index pages are addressed as <baseArchivePageUrl><n>.
	final String baseUrl = 'https://glaforge.appspot.com'
	final String baseArchivePageUrl = baseUrl + '/archives/p'
	// Archive index pages to visit (inclusive on both ends).
	final IntRange pageRange = 1..49

	// --- Date handling ---
	// Parses the date text scraped from a post, e.g. "05 Mar, 2021".
	final SimpleDateFormat sdfFrom = new SimpleDateFormat('dd MMM, yyyy', Locale.ENGLISH)
	// Produces the yyyy/MM/dd fragment used as the per-post output sub-directory.
	final SimpleDateFormat sdfTo = new SimpleDateFormat('yyyy/MM/dd', Locale.ENGLISH)
	// Produces the ISO-8601 timestamp written into the front matter.
	final SimpleDateFormat sdfIso = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss.SSSXXX")

	// --- Where to write ---
	final String outputFolder = '/tmp/blog-output'
	// Start from a clean slate: recursively remove any output of a previous run.
	Paths.get(outputFolder).deleteDir()

	// Collect, in page order, the href of every post title link listed on the
	// archive index pages. The hrefs are stored exactly as found in the page.
	def articleUrls = []
	for (int pageId in pageRange) {
		def archivePageDoc = Jsoup.connect(baseArchivePageUrl + pageId).get()
		def titleLinks = archivePageDoc.select('.archive-post-title > h3 > a')
		articleUrls.addAll(titleLinks.collect { link -> link.attr('href') })
	}

	// Download every collected article, convert its body to Markdown and write it,
	// preceded by YAML front matter, to <outputFolder>/<yyyy/MM/dd>/<slug>.md.
	articleUrls.each { relArticleUrl ->
		def fullArticleUrl = baseUrl + relArticleUrl
		// The slug is whatever follows the last '/' of the article URL.
		def slug = fullArticleUrl.substring(fullArticleUrl.lastIndexOf('/') + 1)
		def articlePageDoc = Jsoup.connect(fullArticleUrl).get()

		// get date
		// The date starts at offset 10 of the '.post-date' text (presumably right
		// after a fixed-length label -- confirm against the site's markup) and ends
		// just before the first '('. When no usable '(' follows, take the rest of
		// the text instead of failing inside substring() with a negative end index.
		def dateBlockText = articlePageDoc.select('.post-date').text()
		int parenIndex = dateBlockText.indexOf('(')
		int dateEnd = parenIndex >= 10 ? parenIndex : dateBlockText.length()
		def date = sdfFrom.parse(dateBlockText.substring(10, dateEnd).trim())
		def formattedDate = sdfTo.format(date)
		def isoDate = sdfIso.format(date)

		// get title
		def title = articlePageDoc.select('.post-title').text()
		// The title is emitted as a double-quoted YAML scalar: escape backslashes
		// first, then double quotes, so the front matter stays parseable.
		def yamlTitle = title.replace('\\', '\\\\').replace('"', '\\"')

		// get categories
		// Lower-case with a fixed locale (default-locale lower-casing is not stable
		// across machines), turn spaces into dashes and drop the first '-platform'.
		def tags = articlePageDoc.select('.post-meta a').collect {
			it.text().toLowerCase(Locale.ENGLISH).replace(' ', '-') - '-platform'
		}

		// get article content
		def postBody = articlePageDoc.select('.post-body').first()
		if (postBody == null) {
			// Fail with the offending URL rather than a bare NullPointerException.
			throw new IllegalStateException("No .post-body element found in ${fullArticleUrl}")
		}
		def safelist = Safelist.basicWithImages()//.removeTags('span')
		// The article URL is passed as base URI for the sanitizer.
		def sanitizedHtml = Jsoup.clean(postBody.outerHtml(), fullArticleUrl, safelist)

		// turn into markdown
		def options = OptionsBuilder.anOptions()
			.withHeadingStyle(HeadingStyle.ATX)
			.withCodeBlockStyle(CodeBlockStyle.FENCED)
			.build()
		def md = new CopyDown(options).convert(sanitizedHtml)

		// output with front matter
		// createFile() still fails if the same date/slug pair is produced twice.
		def targetDir = Files.createDirectories(Paths.get(outputFolder, formattedDate))
		def mdFilePath = Files.createFile(targetDir.resolve(slug + '.md'))
		println mdFilePath

		def frontMatter = """\
		---
		title: "${yamlTitle}"
		date: "${isoDate}"
		tags: [${tags.join(', ')}]
		---

		""".stripIndent()

		// Write front matter and body in one go, always as UTF-8, instead of
		// appending with the platform default charset.
		Files.write(mdFilePath, (frontMatter + md).getBytes('UTF-8'))
	}