` Blog writing analysis script, referenced in thesephist.com/posts/blog-analysis/ ✍️ `
` Count sentence length, word size distribution over past posts `
std := load('std')
str := load('str')
quicksort := load('quicksort')

log := std.log
f := std.format
append := std.append
cat := std.cat
slice := std.slice
flatten := std.flatten
reduce := std.reduce
map := std.map
each := std.each
filter := std.filter
readFile := std.readFile
writeFile := std.writeFile

hasPrefix? := str.hasPrefix?
split := str.split
trim := str.trim
trimS := s => trim(s, ' ')

sortBy := quicksort.sortBy
sort := quicksort.sort

` Constants `
PostsDir := './content/posts'
Newline := char(10)

blank? := s => trimS(s) = ''
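` for illustration: blank?('   ') gives true, blank?(' a ') gives false `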

` find all blog posts on the site and callback with all file names `
withAllPosts := cb => (
	postFiles := dir(PostsDir, evt => evt.type :: {
		'error' -> log('error: could not read posts directory!')
		'data' -> cb(filter(
			map(evt.data, entry => entry.name)
			` filter out hidden files and the _index.md file `
			fName => ~(hasPrefix?(fName, '.') | hasPrefix?(fName, '_'))
		))
	})
)
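` note: the dir() builtin reports each file as an object of metadata;
	only the 'name' key of each entry is used here `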

` given a potentially double-quoted string, strip the quotes `
stripQuotes := s => s.0 :: {
	'"' -> slice(s, 1, len(s) - 1)
	_ -> s
}
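` for illustration: stripQuotes('"Hello"') gives 'Hello';
	unquoted strings pass through unchanged `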

` given a file name to a blog post, parse it completely
	and return a PostRecord structure with parsed metadata and body `
withPostRecord := (fileName, cb) => (
	readFile(PostsDir + '/' + fileName, bytes => (
		lines := filter(
			split(bytes, Newline)
			` remove blank lines and raw HTML lines `
			line => ~(blank?(line) | hasPrefix?(line, '<'))
		)
		` sanitize lines: blank out markdown syntax characters,
			walking each line from its last index down to 0 `
		lines := map(lines, line => (sub := i => i :: {
			~1 -> line
			_ -> (
				line.(i) :: {
					'_' -> line.(i) := ' '
					'*' -> line.(i) := ' '
					'[' -> line.(i) := ' '
					']' -> line.(i) := ' '
					'(' -> line.(i) := ' '
					')' -> line.(i) := ' '
				}
				sub(i - 1)
			)
		})(len(line) - 1))
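		` for illustration: 'see [the docs](link)' becomes 'see  the docs  link ' `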
		record := {
			` parse state:
				0 -> start
				1 -> inside front matter
				2 -> after front matter
				3 -> error, stop parsing `
			parseState: 0
			title: ()
			date: ()
			body: []
		}
		each(lines, line => record.parseState :: {
			0 -> line :: {
				'---' -> record.parseState := 1
				_ -> (
					log(f('error: unexpected line in post file, {{0}}', [line]))
					record.parseState := 3
				)
			}
			1 -> line :: {
				'---' -> record.parseState := 2
				_ -> split(line, ':').0 :: {
					'title' -> record.title := stripQuotes(trimS(split(line, 'title:').1))
					'date' -> record.date := trimS(split(line, 'date:').1)
				}
			}
			2 -> record.body.len(record.body) := line
			3 -> ()
		})
		cb(record)
	))
)
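` for reference, the parser above expects post files shaped like:
	---
	title: "An Example Post"
	date: 2019-08-20
	---
	(markdown body follows; the date format shown is illustrative) `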

` mean of an array `
mean := xs => len(xs) :: {
	0 -> ~1
	_ -> reduce(xs, (a, b) => a + b, 0) / len(xs)
}
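` for illustration: mean([1, 2, 6]) gives 3;
	~1 is a sentinel for an empty list `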

` median of an array `
median := xs => xs :: {
	[] -> ~1
	_ -> (
		sorted := sort(xs)
		mid := floor(len(sorted) / 2)
		(len(sorted) % 2) :: {
			0 -> (sorted.(mid) + sorted.(mid - 1)) / 2
			1 -> sorted.(mid)
		}
	)
}
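` for illustration: median([5, 1, 3]) gives 3,
	and median([4, 1, 3, 2]) gives (2 + 3) / 2 = 2.5 `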

` split up a blog post body into a flat list of words
	includes doing some sanitization `
getWords := record => filter(
	flatten(map(record.body, line => split(line, ' ')))
	` try to remove links and empty words `
	word => blank?(word) :: {
		true -> false
		_ -> ~(hasPrefix?(word, 'http') | hasPrefix?(word, '/'))
	}
)
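` for illustration: the line 'see https://example.com for more'
	yields the words ['see', 'for', 'more'] `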

` split up a blog post body into a flat list of sentences
	includes doing some sanitization `
getSentences := record => flatten(map(record.body, line => split(line, '. ')))
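` note: splitting on '. ' is a heuristic, so abbreviations
	like 'e.g. ' are counted as sentence breaks `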

` main analysis function that works per-PostRecord, computing
	statistics over the post body and publishing a CSV `
analyze := records => (
	sorted := sortBy(records, r => r.date)

	log('Serializing word list...')
	wordLengths := map(sorted, r => map(getWords(r), len))

	log('Serializing sentence list...')
	sentenceLengths := map(sorted, r => map(
		getSentences(r)
		sent => len(filter(split(sent, ' '), w => ~blank?(w)))
	))

	log('Computing mean word lengths')
	meanWordLengths := map(wordLengths, mean)

	log('Computing median word lengths')
	medianWordLengths := map(wordLengths, median)

	log('Computing median sentence lengths')
	medianSentenceLengths := map(sentenceLengths, median)

	log('Computing median paragraph lengths')
	paragraphLengths := map(sorted, record => map(
		record.body
		para => len(filter(split(para, ' '), w => ~blank?(w)))
	))
	medianParagraphLengths := map(paragraphLengths, median)

	results := {
		dates: map(sorted, r => r.date)
		meanWordLengths: meanWordLengths
		medianWordLengths: medianWordLengths
		medianSentenceLengths: medianSentenceLengths
		medianParagraphLengths: medianParagraphLengths
	}
	csv := renderCSV(results)
	log(csv)
	writeFile('./analysis.csv', csv, done => done :: {
		true -> log('File saved to ./analysis.csv successfully!')
		() -> log('error: failed to save analysis results csv!')
	})
)

` render results into a CSV for importing into Google Sheets `
renderCSV := results => (
	csvLines := []
	each(keys(results), key => (
		rowData := append([key], map(results.(key), string))
		csvLines.len(csvLines) := cat(rowData, ',')
	))
	cat(csvLines, Newline)
)
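` for illustration, the CSV has one metric per row (values made up):
	dates,2019-01-05,2019-02-11
	meanWordLengths,4.82,5.01
	note that keys() does not guarantee row order `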

` main analysis routine `
postRecords := []
withAllPosts(fileNames => each(
	fileNames
	fName => withPostRecord(fName, record => (
		log(f('read: [{{ date }}] {{ title }}', record))
		postRecords.len(postRecords) := record
		` run the analysis once every post has been read and parsed `
		len(postRecords) :: {
			len(fileNames) -> analyze(postRecords)
		}
	))
))

` minimal quicksort implementation using hoare partition
	(this is the separate 'quicksort' module loaded by the script above) `

std := load('std')

map := std.map
clone := std.clone

` sort a list in place by a computed key, pred `
sortBy := (v, pred) => (
	vPred := map(v, pred)
	partition := (v, lo, hi) => (
		pivot := vPred.(lo)
		` advance i up past items below the pivot `
		lsub := i => (vPred.(i) < pivot) :: {
			true -> lsub(i + 1)
			false -> i
		}
		` walk j down past items above the pivot `
		rsub := j => (vPred.(j) > pivot) :: {
			true -> rsub(j - 1)
			false -> j
		}
		(sub := (i, j) => (
			i := lsub(i)
			j := rsub(j)
			(i < j) :: {
				false -> j
				true -> (
					` inlined swap! `
					tmp := v.(i)
					tmpPred := vPred.(i)
					v.(i) := v.(j)
					v.(j) := tmp
					vPred.(i) := vPred.(j)
					vPred.(j) := tmpPred
					sub(i + 1, j - 1)
				)
			}
		))(lo, hi)
	)
	(quicksort := (v, lo, hi) => len(v) :: {
		0 -> v
		_ -> (lo < hi) :: {
			false -> v
			true -> (
				p := partition(v, lo, hi)
				quicksort(v, lo, p)
				quicksort(v, p + 1, hi)
			)
		}
	})(v, 0, len(v) - 1)
)

` in-place sort `
sort! := v => sortBy(v, x => x)
` copying sort that leaves the input list unmodified `
sort := v => sort!(clone(v))
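` for illustration:
	sort([3, 1, 2]) gives [1, 2, 3] and leaves the input untouched,
	sort!([3, 1, 2]) sorts the list in place,
	sortBy(list, x => x.someKey) sorts by a computed key `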