Skip to content

Instantly share code, notes, and snippets.

@hsingtism
Created June 30, 2024 04:18
Show Gist options
  • Select an option

  • Save hsingtism/8055896cb9871dbdf3b0a1fc90aa12b4 to your computer and use it in GitHub Desktop.

Select an option

Save hsingtism/8055896cb9871dbdf3b0a1fc90aa12b4 to your computer and use it in GitHub Desktop.
// Number of decimal places kept when a log10 value is rounded (passed to toFixed).
const LOG_TRUNCATE_LENGTH = 2
// Number of rows requested for each lowest/highest rank table.
const LIST_TRUNCATE_LENGTH = 25
// Maximum number of entries kept per letter-pair bucket in the truncated bucket outputs.
const BUCKET_TRUNCATE_LENGTH = 10
// Word-frequency file; each line is parsed as "<word>\t<count>".
const corpusData_PATH = 'count_1w.txt' // https://norvig.com/ngrams/ mirrored at https://gist.github.com/hsingtism/de783abe07e9393630705d1567483f11
// Dictionary word list; read as one word per line.
const dictionary_PATH = 'words_alpha.txt' // https://github.com/dwyl/english-words/
/*
INPUT FILES MAY USE EITHER WINDOWS ("\r\n") OR LINUX ("\n") LINE BREAKS.
readLines() BELOW SPLITS ON BOTH, SO NO MANUAL ADJUSTMENT IS NEEDED.
*/
/*----------------------------------------------------------------------------*/
const fs = require('fs');

/**
 * Reads a text file and returns its non-empty lines.
 *
 * Splitting on /\r?\n/ accepts both line-break styles, and dropping empty
 * lines removes the blank entry produced by a trailing newline without
 * discarding a real entry when the file has no trailing newline.
 *
 * @param {string} path - File to read.
 * @returns {string[]} The file's lines, excluding empty ones.
 */
function readLines(path) {
    return fs.readFileSync(path, 'utf8')
        .split(/\r?\n/)
        .filter(line => line !== '')
}

// Each corpus line is "<word>\t<count>"; an entry is [word, count, 1-based line position].
const wordsFrequency = readLines(corpusData_PATH).map((line, index) => {
    const [word, count] = line.split('\t')
    return [word, Number(count), index + 1]
})
// One dictionary word per line.
const words = readLines(dictionary_PATH)
// The 26 lowercase letters used as row and column keys of every letter-pair table.
const alphabet = [...'abcdefghijklmnopqrstuvwxyz']
/**
 * Builds a 26 x 26 table keyed as table[startLetter][endLetter].
 *
 * Pass null (or undefined) to get a fresh empty array in every cell, so the
 * cells do not reference the same array. Any other value is stored as-is in
 * every cell (an object passed here is therefore shared by all cells).
 *
 * @param {*} fill - Value for every cell, or null/undefined for per-cell empty arrays.
 * @returns {Object} The nested letter table.
 */
function letterFieldObject(fill) {
    // `fill ?? []` is evaluated once per cell, which is what gives each cell its own array.
    const buildRow = () => Object.fromEntries(
        alphabet.map(endLetter => [endLetter, fill ?? []])
    )
    return Object.fromEntries(
        alphabet.map(startLetter => [startLetter, buildRow()])
    )
}
/**
 * Applies `map` to every cell of a letter-pair table and returns the results
 * in a new table with the same [startLetter][endLetter] layout.
 *
 * The result is held in a local binding; the table is no longer written to
 * an undeclared (implicitly global) variable, which also makes the function
 * usable in strict mode and ES modules.
 *
 * @param {Object} src - Table indexed as src[startLetter][endLetter].
 * @param {Function} map - Called with each cell value; its return value is stored.
 * @returns {Object} New table of mapped values; `src` is not modified.
 */
function letterFieldMap(src, map) {
    const result = {}
    for (const startLetter of alphabet) {
        const row = {}
        for (const endLetter of alphabet) {
            row[endLetter] = map(src[startLetter][endLetter])
        }
        result[startLetter] = row
    }
    return result
}
/**
 * Flattens a letter-pair table into rows sorted ascending by value and pairs
 * each of the lowest rows with the correspondingly ranked highest row.
 *
 * Each returned row is [lowKey, lowValue, highKey, highValue], where a key is
 * the start letter followed by the end letter (e.g. "az").
 *
 * @param {Object} object - Table indexed as object[startLetter][endLetter] with numeric cells.
 * @param {number} truncateLength - Number of rows wanted; values larger than
 *   the number of cells return every row instead of throwing.
 * @returns {Array[]} The first `truncateLength` rows of the ascending list.
 */
function extractTopBottomValues(object, truncateLength) {
    const ranked = []
    for (const startLetter of alphabet) {
        for (const endLetter of alphabet) {
            ranked.push([startLetter + endLetter, object[startLetter][endLetter]])
        }
    }
    ranked.sort((a, b) => a[1] - b[1])

    // Clamp so that a request for more rows than exist never indexes past the end.
    const pairedRows = Math.min(truncateLength, ranked.length)
    for (let i = 0; i < pairedRows; i++) {
        // Only the first two columns of the mirrored row are read, so it does
        // not matter whether that row has already been extended itself.
        const [highKey, highValue] = ranked[ranked.length - 1 - i]
        ranked[i].push(highKey, highValue)
    }
    return ranked.slice(0, truncateLength)
}
/**
 * Returns a copy of a numeric letter-pair table extended with totals:
 *   - copy[letter]['_sum']  : sum over all end letters for that start letter
 *   - copy['_sum'][letter]  : sum over all start letters for that end letter
 *   - copy['_total']        : sum of every cell
 *
 * The copy is made with a JSON round trip and kept in a local binding; it is
 * no longer assigned to an undeclared (implicitly global) variable, so the
 * function also works in strict mode and ES modules.
 *
 * @param {Object} src - Table indexed as src[startLetter][endLetter] with numeric cells.
 * @returns {Object} Deep copy of `src` with the total fields added; `src` is not modified.
 */
function addSumColumns(src) {
    const result = JSON.parse(JSON.stringify(src))
    const startTotals = {}
    const endTotals = {}
    for (const startLetter of alphabet) {
        for (const endLetter of alphabet) {
            const entry = result[startLetter][endLetter]
            startTotals[startLetter] = (startTotals[startLetter] ?? 0) + entry
            endTotals[endLetter] = (endTotals[endLetter] ?? 0) + entry
        }
    }
    result['_sum'] = endTotals
    for (const letter of alphabet) {
        result[letter]['_sum'] = startTotals[letter]
    }
    // Explicit seed keeps the reduction well-defined regardless of entry count.
    result['_total'] = Object.values(startTotals).reduce((a, b) => a + b, 0)
    return result
}
// Letter-pair tables whose cells are arrays: dictionary words, and corpus
// entries ([word, count, position]), grouped by first and last letter.
const wordBucket = letterFieldObject(null)
const frequencyBucket = letterFieldObject(null)

words.forEach(word => {
    const firstLetter = word[0]
    const lastLetter = word[word.length - 1]
    wordBucket[firstLetter][lastLetter].push(word)
})

wordsFrequency.forEach(entry => {
    // The word is the first column of a corpus entry.
    const [word] = entry
    const firstLetter = word[0]
    const lastLetter = word[word.length - 1]
    frequencyBucket[firstLetter][lastLetter].push(entry)
})
/**
 * Adds up one column across all rows of a two-dimensional array.
 *
 * @param {Array[]} data - Rows to scan.
 * @param {number} column - Index of the column to accumulate.
 * @returns {number} Sum of data[i][column] over every row; 0 for no rows.
 */
function sumColumn(data, column) {
    let total = 0
    for (const row of data) {
        total += row[column]
    }
    return total
}
// Cell mapper: number of items in a bucket.
const lengthF = ({ length }) => length
// Cell mapper: base-10 logarithm rounded to LOG_TRUNCATE_LENGTH decimal places.
const truncatedLog = (v) => {
    const rounded = Math.log10(v).toFixed(LOG_TRUNCATE_LENGTH)
    return Number(rounded)
}
// Number of dictionary words in each start/end letter bucket.
const dictionaryCount = letterFieldMap(wordBucket, lengthF)
// Sum of the count column (index 1) of the corpus entries in each bucket.
// sumColumn starts from 0, so a bucket with no entries yields 0 instead of
// throwing the way a reduce() without an initial value does on an empty array.
const corpusTotal = letterFieldMap(frequencyBucket, entries => sumColumn(entries, 1))
// log10 of each bucket total; a total of 0 becomes -Infinity, which
// JSON.stringify serializes as null.
const corpusTotalLog = letterFieldMap(corpusTotal, truncatedLog)
// Number of corpus entries in each bucket.
const corpusCount = letterFieldMap(frequencyBucket, lengthF)
// console.table(wordBucket['z']['q'].slice(0, LIST_TRUNCATE_LENGTH))
// console.table(frequencyBucket['z']['q'].slice(0, LIST_TRUNCATE_LENGTH))
// console.table(addSumColumns(dictionaryCount))
// console.table(extractTopBottomValues(dictionaryCount, LIST_TRUNCATE_LENGTH))
// console.table(addSumColumns(corpusTotal))
// console.table(extractTopBottomValues(corpusTotal, LIST_TRUNCATE_LENGTH))
// console.table(corpusTotalLog)
// console.table(extractTopBottomValues(corpusCount, LIST_TRUNCATE_LENGTH))
// console.table(addSumColumns(corpusCount))
// Keeps at most the first BUCKET_TRUNCATE_LENGTH items of a bucket (returns a copy).
const truncateBucket = bucket => bucket.slice(0, BUCKET_TRUNCATE_LENGTH)
const wordBucketTruncated = letterFieldMap(wordBucket, truncateBucket)
const frequBucketTruncated = letterFieldMap(frequencyBucket, truncateBucket)
// console.table(wordBucketTruncated)
// console.table(frequBucketTruncated)
// Attribution text embedded in the output file.
const sourceInformation = 'Dictionary from github: dwyl/english-words. Google Web Trillion Word Corpus data from Peter Norvig: https://norvig.com/ngrams/. This script and project by Hsing Lo; https://hsing.org; github: hsingtism'

// Everything that gets exported, in the order it appears in the JSON output.
const bigGlob = {
    sourceInformation,
    dictionaryCount_: addSumColumns(dictionaryCount),
    dictionaryCountRanks: extractTopBottomValues(dictionaryCount, LIST_TRUNCATE_LENGTH),
    corpusTotal_: addSumColumns(corpusTotal),
    corpusTotalRanks: extractTopBottomValues(corpusTotal, LIST_TRUNCATE_LENGTH),
    corpusTotalLog10: corpusTotalLog,
    corpusCount_: addSumColumns(corpusCount),
    corpusCountRanks: extractTopBottomValues(corpusCount, LIST_TRUNCATE_LENGTH),
    wordBucketTruncated,
    frequBucketTruncated,
}

// console.log(JSON.stringify(bigGlob))
// Serialize once, then write the result next to the script's working directory.
const serializedGlob = JSON.stringify(bigGlob)
fs.writeFileSync('test.json', serializedGlob)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment