Skip to content

Instantly share code, notes, and snippets.

@thesephist

thesephist/analysis.ink

Last active Jun 8, 2020
Embed
What would you like to do?
Blog writing analysis script, referenced in thesephist.com/posts/blog-analysis/ ✍️
` Count sentence length, word size distribution over past posts `
std := load('std')
str := load('str')
quicksort := load('quicksort')
log := std.log
f := std.format
append := std.append
cat := std.cat
slice := std.slice
flatten := std.flatten
reduce := std.reduce
map := std.map
each := std.each
filter := std.filter
readFile := std.readFile
writeFile := std.writeFile
hasPrefix? := str.hasPrefix?
split := str.split
trim := str.trim
trimS := s => trim(s, ' ')
sortBy := quicksort.sortBy
sort := quicksort.sort
` Constants `
` directory containing the Hugo blog's markdown posts, relative to cwd `
PostsDir := './content/posts'
` newline character, for splitting file contents into lines `
Newline := char(10)
` report whether a string is empty or contains only spaces `
blank? := s => trimS(s) = ''
` find all blog posts on the site and callback with all file names `
` cb receives the list of post file names in PostsDir; on a read
	error this logs a message and never invokes cb `
withAllPosts := cb => (
postFiles := dir(PostsDir, evt => evt.type :: {
'error' -> log('error: could not read posts directory!')
'data' -> cb(filter(
map(evt.data, entry => entry.name)
` filter out hidden files and the _index.md file `
fName => ~(hasPrefix?(fName, '.') | hasPrefix?(fName, '_'))
))
})
)
` given a potentially double-quoted string, strip the quotes.
	only drops the trailing character when it is actually a closing
	quote, so strings like "abc (unterminated) keep their last char `
stripQuotes := s => s.0 :: {
	'"' -> s.(len(s) - 1) :: {
		` both quotes present: strip both ends `
		'"' -> slice(s, 1, len(s) - 1)
		` no closing quote: strip only the leading one `
		_ -> slice(s, 1, len(s))
	}
	_ -> s
}
` given a file name to a blog post, parse it completely
	and return a PostRecord structure with parsed metadata and body.
	cb receives a record {parseState, title, date, body} where title
	and date come from the ----delimited front matter and body is the
	list of sanitized, non-blank, non-HTML content lines `
withPostRecord := (fileName, cb) => (
	readFile(PostsDir + '/' + fileName, bytes => (
		` drop blank lines and raw HTML lines (those starting with <) `
		lines := filter(
			split(bytes, Newline)
			line => ~(blank?(line) | hasPrefix?(line, '<'))
		)
		` sanitize lines: overwrite markdown punctuation with spaces,
			walking each line from its last index down to index 0.
			base case is ~1 (not 0) so index 0 is also sanitized --
			the previous version stopped one character short and left
			leading _ * [ ] ( ) characters in place `
		lines := map(lines, line => (sub := i => i :: {
			~1 -> line
			_ -> (
				line.(i) :: {
					'_' -> line.(i) := ' '
					'*' -> line.(i) := ' '
					'[' -> line.(i) := ' '
					']' -> line.(i) := ' '
					'(' -> line.(i) := ' '
					')' -> line.(i) := ' '
				}
				sub(i - 1)
			)
		})(len(line) - 1))
		record := {
			` parse state:
				0 -> start
				1 -> inside front matter
				2 -> after front matter
				3 -> error, stop parsing `
			parseState: 0
			title: ()
			date: ()
			body: []
		}
		` single-pass state machine over the post's lines `
		each(lines, line => record.parseState :: {
			0 -> line :: {
				'---' -> record.parseState := 1
				_ -> (
					log(f('error: unexpected line in post file, {{0}}', [line]))
					record.parseState := 3
				)
			}
			1 -> line :: {
				'---' -> record.parseState := 2
				` front matter: pick out title and date values `
				_ -> split(line, ':').0 :: {
					'title' -> record.title := stripQuotes(trimS(split(line, 'title:').1))
					'date' -> record.date := trimS(split(line, 'date:').1)
				}
			}
			` append content lines to the body `
			2 -> record.body.len(record.body) := line
			3 -> ()
		})
		cb(record)
	))
)
` mean of an array `
` arithmetic mean of a list of numbers; ~1 sentinel for an empty list `
mean := xs => (
	count := len(xs)
	count :: {
		0 -> ~1
		_ -> reduce(xs, (acc, x) => acc + x, 0) / count
	}
)
` median of an array `
` middle value of the sorted list: averages the two middle values
	for even-length lists, ~1 sentinel for an empty list `
median := xs => len(xs) :: {
	0 -> ~1
	_ -> (
		ordered := sort(xs)
		half := floor(len(ordered) / 2)
		len(ordered) % 2 :: {
			0 -> (ordered.(half - 1) + ordered.(half)) / 2
			_ -> ordered.(half)
		}
	)
}
` split up a blog post body into a flat list of words
	includes doing some sanitization `
getWords := record => (
	allWords := flatten(map(record.body, line => split(line, ' ')))
	` drop empty words and anything that looks like a link `
	filter(allWords, word => ~(blank?(word) | hasPrefix?(word, 'http') | hasPrefix?(word, '/')))
)
` split up a blog post body into a flat list of sentences
	includes doing some sanitization `
getSentences := record => (
	` each body line splits on '. ' into its sentences `
	sentenceLists := map(record.body, line => split(line, '. '))
	flatten(sentenceLists)
)
` main analysis function that works per-PostRecord, computing
statistics over the post body and publishing a CSV `
` records: list of parsed PostRecords; writes ./analysis.csv with
	one row per statistic and one column per post, in date order `
analyze := records => (
` order posts chronologically so CSV columns line up by date `
sorted := sortBy(records, r => r.date)
log('Serializing word list...')
` per-post list of word lengths (characters per word) `
wordLengths := map(sorted, r => map(getWords(r), len))
log('Serializing sentence list...')
` per-post list of sentence lengths (non-blank words per sentence) `
sentenceLengths := map(sorted, r => map(
getSentences(r)
sent => len(filter(split(sent, ' '), w => ~blank?(w)))
))
log('Computing mean word lengths')
meanWordLengths := map(wordLengths, mean)
log('Computing median word lengths')
medianWordLengths := map(wordLengths, median)
log('Computing median sentence lengths')
medianSentenceLengths := map(sentenceLengths, median)
log('Computing median paragraph lengths')
` each body line is one paragraph; measure non-blank words per paragraph `
paragraphLengths := map(sorted, record => map(
record.body
para => len(filter(split(para, ' '), w => ~blank?(w)))
))
medianParagraphLengths := map(paragraphLengths, median)
` one entry per statistic; each becomes a CSV row in renderCSV `
results := {
dates: map(sorted, r => r.date)
meanWordLengths: meanWordLengths
medianWordLengths: medianWordLengths
medianSentenceLengths: medianSentenceLengths
medianParagraphLengths: medianParagraphLengths
}
csv := renderCSV(results)
log(csv)
writeFile('./analysis.csv', csv, done => done :: {
true -> log('File saved to ./analysis.csv successfully!')
() -> log('error: failed to save analysis results csv!')
})
)
` render results into a CSV for importing into Google Sheets `
` each key of results becomes one row: the key name followed by its
	values, comma-joined; rows are joined with newlines `
renderCSV := results => (
	rows := map(keys(results), key => (
		cells := append([key], map(results.(key), string))
		cat(cells, ',')
	))
	cat(rows, Newline)
)
` main analysis routine `
` accumulate parsed records as the async file reads complete; when
	the record count reaches the number of post files, run analyze `
postRecords := []
withAllPosts(fileNames => each(
fileNames
fName => withPostRecord(fName, record => (
log(f('read: [{{ date }}] {{ title }}', record))
postRecords.len(postRecords) := record
` this match fires analyze exactly once, on the final read `
len(postRecords) :: {
len(fileNames) -> analyze(postRecords)
}
))
))
` minimal quicksort implementation
using hoare partition `
std := load('std')
map := std.map
clone := std.clone
` in-place quicksort of list v, ordered by pred(item) for each item;
	mutates and returns v `
sortBy := (v, pred) => (
` precompute sort keys so pred runs exactly once per element `
vPred := map(v, pred)
` hoare partition of v in [lo, hi] around the pivot key vPred.(lo);
	returns the boundary index `
partition := (v, lo, hi) => (
pivot := vPred.(lo)
` scan rightward from i for the first key not less than the pivot `
lsub := i => (vPred.(i) < pivot) :: {
true -> lsub(i + 1)
false -> i
}
` scan leftward from j for the first key not greater than the pivot `
rsub := j => (vPred.(j) > pivot) :: {
true -> rsub(j - 1)
false -> j
}
(sub := (i, j) => (
i := lsub(i)
j := rsub(j)
(i < j) :: {
false -> j
true -> (
` inlined swap! `
` swap v and vPred together to keep values and keys in lockstep `
tmp := v.(i)
tmpPred := vPred.(i)
v.(i) := v.(j)
v.(j) := tmp
vPred.(i) := vPred.(j)
vPred.(j) := tmpPred
sub(i + 1, j - 1)
)
}
))(lo, hi)
)
` recursive driver over index range [lo, hi]; with a hoare
	partition the left recursion includes the boundary index p `
(quicksort := (v, lo, hi) => len(v) :: {
0 -> v
_ -> (lo < hi) :: {
false -> v
true -> (
p := partition(v, lo, hi)
quicksort(v, lo, p)
quicksort(v, p + 1, hi)
)
}
})(v, 0, len(v) - 1)
)
` sort v in place by natural order; mutates and returns v `
sort! := v => sortBy(v, x => x)
` sort a copy of v by natural order, leaving the original untouched `
sort := v => sort!(clone(v))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.