Skip to content

Instantly share code, notes, and snippets.

@thesephist

thesephist/analysis.ink

Last active Jun 8, 2020
Embed
What would you like to do?
Blog writing analysis script, referenced in thesephist.com/posts/blog-analysis/ ✍️
` Count sentence length, word size distribution over past posts `
std := load('std')
str := load('str')
quicksort := load('quicksort')
log := std.log
f := std.format
append := std.append
cat := std.cat
slice := std.slice
flatten := std.flatten
reduce := std.reduce
map := std.map
each := std.each
filter := std.filter
readFile := std.readFile
writeFile := std.writeFile
hasPrefix? := str.hasPrefix?
split := str.split
trim := str.trim
trimS := s => trim(s, ' ')
sortBy := quicksort.sortBy
sort := quicksort.sort
` Constants `
` directory containing the Hugo blog's markdown posts, relative to cwd `
PostsDir := './content/posts'
` newline character, for splitting file contents into lines `
Newline := char(10)
` report whether a string is empty or contains only spaces `
blank? := s => trimS(s) = ''
` find all blog posts on the site and callback with all file names `
` cb receives the list of post file names in PostsDir; on a read
	error this logs a message and never invokes cb `
withAllPosts := cb => (
postFiles := dir(PostsDir, evt => evt.type :: {
'error' -> log('error: could not read posts directory!')
'data' -> cb(filter(
map(evt.data, entry => entry.name)
` filter out hidden files and the _index.md file `
fName => ~(hasPrefix?(fName, '.') | hasPrefix?(fName, '_'))
))
})
)
` given a potentially double-quoted string, strip the quotes.
	only drops the trailing character when it is actually a closing
	quote, so strings like "abc (unterminated) keep their last char `
stripQuotes := s => s.0 :: {
	'"' -> s.(len(s) - 1) :: {
		` both quotes present: strip both ends `
		'"' -> slice(s, 1, len(s) - 1)
		` no closing quote: strip only the leading one `
		_ -> slice(s, 1, len(s))
	}
	_ -> s
}
` given a file name to a blog post, parse it completely
	and return a PostRecord structure with parsed metadata and body.
	cb receives a record {parseState, title, date, body} where title
	and date come from the ----delimited front matter and body is the
	list of sanitized, non-blank, non-HTML content lines `
withPostRecord := (fileName, cb) => (
	readFile(PostsDir + '/' + fileName, bytes => (
		` drop blank lines and raw HTML lines (those starting with <) `
		lines := filter(
			split(bytes, Newline)
			line => ~(blank?(line) | hasPrefix?(line, '<'))
		)
		` sanitize lines: overwrite markdown punctuation with spaces,
			walking each line from its last index down to index 0.
			base case is ~1 (not 0) so index 0 is also sanitized --
			the previous version stopped one character short and left
			leading _ * [ ] ( ) characters in place `
		lines := map(lines, line => (sub := i => i :: {
			~1 -> line
			_ -> (
				line.(i) :: {
					'_' -> line.(i) := ' '
					'*' -> line.(i) := ' '
					'[' -> line.(i) := ' '
					']' -> line.(i) := ' '
					'(' -> line.(i) := ' '
					')' -> line.(i) := ' '
				}
				sub(i - 1)
			)
		})(len(line) - 1))
		record := {
			` parse state:
				0 -> start
				1 -> inside front matter
				2 -> after front matter
				3 -> error, stop parsing `
			parseState: 0
			title: ()
			date: ()
			body: []
		}
		` single-pass state machine over the post's lines `
		each(lines, line => record.parseState :: {
			0 -> line :: {
				'---' -> record.parseState := 1
				_ -> (
					log(f('error: unexpected line in post file, {{0}}', [line]))
					record.parseState := 3
				)
			}
			1 -> line :: {
				'---' -> record.parseState := 2
				` front matter: pick out title and date values `
				_ -> split(line, ':').0 :: {
					'title' -> record.title := stripQuotes(trimS(split(line, 'title:').1))
					'date' -> record.date := trimS(split(line, 'date:').1)
				}
			}
			` append content lines to the body `
			2 -> record.body.len(record.body) := line
			3 -> ()
		})
		cb(record)
	))
)
` mean of an array `
` arithmetic mean of a list of numbers; ~1 sentinel for an empty list `
mean := xs => (
	count := len(xs)
	count :: {
		0 -> ~1
		_ -> reduce(xs, (acc, x) => acc + x, 0) / count
	}
)
` median of an array `
` middle value of the sorted list: averages the two middle values
	for even-length lists, ~1 sentinel for an empty list `
median := xs => len(xs) :: {
	0 -> ~1
	_ -> (
		ordered := sort(xs)
		half := floor(len(ordered) / 2)
		len(ordered) % 2 :: {
			0 -> (ordered.(half - 1) + ordered.(half)) / 2
			_ -> ordered.(half)
		}
	)
}
` split up a blog post body into a flat list of words
	includes doing some sanitization `
getWords := record => (
	allWords := flatten(map(record.body, line => split(line, ' ')))
	` drop empty words and anything that looks like a link `
	filter(allWords, word => ~(blank?(word) | hasPrefix?(word, 'http') | hasPrefix?(word, '/')))
)
` split up a blog post body into a flat list of sentences
	includes doing some sanitization `
getSentences := record => (
	` each body line splits on '. ' into its sentences `
	sentenceLists := map(record.body, line => split(line, '. '))
	flatten(sentenceLists)
)
` main analysis function that works per-PostRecord, computing
statistics over the post body and publishing a CSV `
` records: list of parsed PostRecords; writes ./analysis.csv with
	one row per statistic and one column per post, in date order `
analyze := records => (
` order posts chronologically so CSV columns line up by date `
sorted := sortBy(records, r => r.date)
log('Serializing word list...')
` per-post list of word lengths (characters per word) `
wordLengths := map(sorted, r => map(getWords(r), len))
log('Serializing sentence list...')
` per-post list of sentence lengths (non-blank words per sentence) `
sentenceLengths := map(sorted, r => map(
getSentences(r)
sent => len(filter(split(sent, ' '), w => ~blank?(w)))
))
log('Computing mean word lengths')
meanWordLengths := map(wordLengths, mean)
log('Computing median word lengths')
medianWordLengths := map(wordLengths, median)
log('Computing median sentence lengths')
medianSentenceLengths := map(sentenceLengths, median)
log('Computing median paragraph lengths')
` each body line is one paragraph; measure non-blank words per paragraph `
paragraphLengths := map(sorted, record => map(
record.body
para => len(filter(split(para, ' '), w => ~blank?(w)))
))
medianParagraphLengths := map(paragraphLengths, median)
` one entry per statistic; each becomes a CSV row in renderCSV `
results := {
dates: map(sorted, r => r.date)
meanWordLengths: meanWordLengths
medianWordLengths: medianWordLengths
medianSentenceLengths: medianSentenceLengths
medianParagraphLengths: medianParagraphLengths
}
csv := renderCSV(results)
log(csv)
writeFile('./analysis.csv', csv, done => done :: {
true -> log('File saved to ./analysis.csv successfully!')
() -> log('error: failed to save analysis results csv!')
})
)
` render results into a CSV for importing into Google Sheets `
` each key of results becomes one row: the key name followed by its
	values, comma-joined; rows are joined with newlines `
renderCSV := results => (
	rows := map(keys(results), key => (
		cells := append([key], map(results.(key), string))
		cat(cells, ',')
	))
	cat(rows, Newline)
)
` main analysis routine `
` accumulate parsed records as the async file reads complete; when
	the record count reaches the number of post files, run analyze `
postRecords := []
withAllPosts(fileNames => each(
fileNames
fName => withPostRecord(fName, record => (
log(f('read: [{{ date }}] {{ title }}', record))
postRecords.len(postRecords) := record
` this match fires analyze exactly once, on the final read `
len(postRecords) :: {
len(fileNames) -> analyze(postRecords)
}
))
))
` minimal quicksort implementation
using hoare partition `
std := load('std')
map := std.map
clone := std.clone
` in-place quicksort of list v, ordered by pred(item) for each item;
	mutates and returns v `
sortBy := (v, pred) => (
` precompute sort keys so pred runs exactly once per element `
vPred := map(v, pred)
` hoare partition of v in [lo, hi] around the pivot key vPred.(lo);
	returns the boundary index `
partition := (v, lo, hi) => (
pivot := vPred.(lo)
` scan rightward from i for the first key not less than the pivot `
lsub := i => (vPred.(i) < pivot) :: {
true -> lsub(i + 1)
false -> i
}
` scan leftward from j for the first key not greater than the pivot `
rsub := j => (vPred.(j) > pivot) :: {
true -> rsub(j - 1)
false -> j
}
(sub := (i, j) => (
i := lsub(i)
j := rsub(j)
(i < j) :: {
false -> j
true -> (
` inlined swap! `
` swap v and vPred together to keep values and keys in lockstep `
tmp := v.(i)
tmpPred := vPred.(i)
v.(i) := v.(j)
v.(j) := tmp
vPred.(i) := vPred.(j)
vPred.(j) := tmpPred
sub(i + 1, j - 1)
)
}
))(lo, hi)
)
` recursive driver over index range [lo, hi]; with a hoare
	partition the left recursion includes the boundary index p `
(quicksort := (v, lo, hi) => len(v) :: {
0 -> v
_ -> (lo < hi) :: {
false -> v
true -> (
p := partition(v, lo, hi)
quicksort(v, lo, p)
quicksort(v, p + 1, hi)
)
}
})(v, 0, len(v) - 1)
)
` sort v in place by natural order; mutates and returns v `
sort! := v => sortBy(v, x => x)
` sort a copy of v by natural order, leaving the original untouched `
sort := v => sort!(clone(v))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.