-
-
Save hsingtism/8055896cb9871dbdf3b0a1fc90aa12b4 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Number of decimal places kept when rounding the log10 corpus totals.
const LOG_TRUNCATE_LENGTH = 2
// Number of rows kept in each top/bottom ranking table.
const LIST_TRUNCATE_LENGTH = 25
// Number of entries kept per letter-pair bucket in the truncated bucket dumps.
const BUCKET_TRUNCATE_LENGTH = 10
const corpusData_PATH = 'count_1w.txt' // https://norvig.com/ngrams/ mirrored at https://gist.github.com/hsingtism/de783abe07e9393630705d1567483f11
const dictionary_PATH = 'words_alpha.txt' // https://github.com/dwyl/english-words/

/*
Input files are split on both Windows ("\r\n") and Linux ("\n") line breaks,
and blank entries (such as the one produced by a trailing newline) are dropped,
so the line-ending style of the downloaded files does not matter.
*/

/*----------------------------------------------------------------------------*/

const fs = require('fs');

// Reads a text file and splits it into lines, accepting either line-break style.
function readLines(path) {
    return fs.readFileSync(path).toString().split(/\r?\n/)
}

// Each corpus row becomes [word, count, lineNumber]; the line number is 1-based
// and is assigned before blank rows are removed so it matches the source file.
const wordsFrequency = readLines(corpusData_PATH)
    .map((line, i) => {
        const [word, count] = line.split('\t')
        return [word, Number(count), i + 1]
    })
    .filter(([word]) => word !== '')

// One dictionary word per non-empty line.
const words = readLines(dictionary_PATH).filter(word => word !== '')

const alphabet = [...'abcdefghijklmnopqrstuvwxyz']
/**
 * Builds a nested object keyed by start letter, then by end letter, with one
 * cell for every pair of letters in `alphabet`.
 *
 * @param {*} fill - Value stored in every cell. Pass null (or undefined) to get
 *   a fresh empty array per cell, so the cells are not referencing the same array.
 * @returns {Object} The filled start-letter -> end-letter -> value structure.
 */
function letterFieldObject(fill) {
    // `fill ?? []` is evaluated once per cell, so every cell gets its own array.
    const buildRow = () => Object.fromEntries(
        alphabet.map(endLetter => [endLetter, fill ?? []])
    )
    return Object.fromEntries(
        alphabet.map(startLetter => [startLetter, buildRow()])
    )
}
/**
 * Applies `map` to every cell of a start-letter -> end-letter structure and
 * returns a new structure of the same shape holding the results.
 *
 * @param {Object} src - Structure with a cell for every pair of letters in `alphabet`.
 * @param {Function} map - Called with each cell value; its return value is stored.
 * @returns {Object} New structure; `src` itself is not modified by this function.
 */
function letterFieldMap(src, map) {
    // Declared locally: the previous undeclared assignment leaked a global and
    // throws a ReferenceError under strict mode / ES modules.
    const result = {}
    for (const startLetter of alphabet) {
        const row = {}
        for (const endLetter of alphabet) {
            row[endLetter] = map(src[startLetter][endLetter])
        }
        result[startLetter] = row
    }
    return result
}
/**
 * Ranks every letter pair by its value and returns the lowest entries, each
 * paired with the entry at the same distance from the top of the ranking.
 *
 * @param {Object} object - Start-letter -> end-letter -> numeric value structure.
 * @param {number} truncateLength - Number of rows to return.
 * @returns {Array} Rows of [lowPair, lowValue, highPair, highValue], where row i
 *   holds the i-th smallest and the i-th largest entry.
 */
function extractTopBottomValues(object, truncateLength) {
    const list = []
    for (const startLetter of alphabet) {
        for (const endLetter of alphabet) {
            list.push([startLetter + endLetter, object[startLetter][endLetter]])
        }
    }
    // Ascending by value.
    list.sort((a, b) => a[1] - b[1])
    // Never index past the end of the list when more rows are requested than
    // there are letter pairs (previously a TypeError).
    const rowCount = Math.min(truncateLength, list.length)
    for (let i = 0; i < rowCount; i++) {
        const [highPair, highValue] = list[list.length - 1 - i]
        list[i][2] = highPair
        list[i][3] = highValue
    }
    return list.slice(0, truncateLength)
}
/**
 * Returns a copy of a numeric start-letter -> end-letter table with totals added:
 * each start-letter row gets a `_sum` cell, a `_sum` row holds the per-end-letter
 * totals, and `_total` holds the grand total.
 *
 * @param {Object} src - Table with a numeric cell for every pair of letters in `alphabet`.
 * @returns {Object} Deep copy of `src` extended with the `_sum` / `_total` entries.
 */
function addSumColumns(src) {
    // Deep copy via JSON so the caller's table is left untouched. Declared
    // locally: the previous undeclared assignment leaked a global and throws
    // under strict mode / ES modules.
    const result = JSON.parse(JSON.stringify(src))
    const startTotals = {}
    const endTotals = {}
    for (const startLetter of alphabet) {
        for (const endLetter of alphabet) {
            const entry = result[startLetter][endLetter]
            startTotals[startLetter] = (startTotals[startLetter] ?? 0) + entry
            endTotals[endLetter] = (endTotals[endLetter] ?? 0) + entry
        }
    }
    result['_sum'] = endTotals
    for (const letter of alphabet) {
        result[letter]['_sum'] = startTotals[letter]
    }
    // Initial value keeps reduce from throwing when there are no rows.
    result['_total'] = Object.values(startTotals).reduce((a, b) => a + b, 0)
    return result
}
// Group dictionary words and corpus rows by their (first letter, last letter) pair.
// Passing null gives every cell its own empty array.
const wordBucket = letterFieldObject(null)
const frequencyBucket = letterFieldObject(null)

words.forEach(word => {
    const firstLetter = word[0]
    const lastLetter = word[word.length - 1]
    wordBucket[firstLetter][lastLetter].push(word)
})

wordsFrequency.forEach(row => {
    // The word is the first element of each corpus row; the whole row is stored.
    const [word] = row
    const firstLetter = word[0]
    const lastLetter = word[word.length - 1]
    frequencyBucket[firstLetter][lastLetter].push(row)
})
/**
 * Adds up one column of a list of rows.
 *
 * @param {Array} data - Rows, each indexable by `column`.
 * @param {number|string} column - Index (or key) of the value to sum in each row.
 * @returns {number} Sum of `row[column]` over all rows; 0 for an empty list.
 */
function sumColumn(data, column) {
    let total = 0
    for (const row of data) {
        total += row[column]
    }
    return total
}
// Cell mappers used to derive the summary tables below.
const lengthF = v => v.length
// log10 rounded to LOG_TRUNCATE_LENGTH decimal places.
const truncatedLog = v => Number(Math.log10(v).toFixed(LOG_TRUNCATE_LENGTH))

// Number of dictionary words per letter pair.
const dictionaryCount = letterFieldMap(wordBucket, lengthF)
// Sum of the corpus counts (column 1 of each row) per letter pair. sumColumn
// returns 0 for an empty bucket, where a reduce without an initial value would throw.
const corpusTotal = letterFieldMap(frequencyBucket, bucket => sumColumn(bucket, 1))
// A total of 0 yields -Infinity here, which JSON.stringify writes as null.
const corpusTotalLog = letterFieldMap(corpusTotal, truncatedLog)
// Number of distinct corpus rows per letter pair.
const corpusCount = letterFieldMap(frequencyBucket, lengthF)
| // console.table(wordBucket['z']['q'].slice(0, LIST_TRUNCATE_LENGTH)) | |
| // console.table(frequencyBucket['z']['q'].slice(0, LIST_TRUNCATE_LENGTH)) | |
| // console.table(addSumColumns(dictionaryCount)) | |
| // console.table(extractTopBottomValues(dictionaryCount, LIST_TRUNCATE_LENGTH)) | |
| // console.table(addSumColumns(corpusTotal)) | |
| // console.table(extractTopBottomValues(corpusTotal, LIST_TRUNCATE_LENGTH)) | |
| // console.table(corpusTotalLog) | |
| // console.table(extractTopBottomValues(corpusCount, LIST_TRUNCATE_LENGTH)) | |
| // console.table(addSumColumns(corpusCount)) | |
// Keep only the first BUCKET_TRUNCATE_LENGTH entries of every letter-pair bucket.
const truncateBucket = bucket => bucket.slice(0, BUCKET_TRUNCATE_LENGTH)
const wordBucketTruncated = letterFieldMap(wordBucket, truncateBucket)
const frequBucketTruncated = letterFieldMap(frequencyBucket, truncateBucket)
| // console.table(wordBucketTruncated) | |
| // console.table(frequBucketTruncated) | |
// Attribution string embedded in the output file.
const sourceInformation = 'Dictionary from github: dwyl/english-words. Google Web Trillion Word Corpus data from Peter Norvig: https://norvig.com/ngrams/. This script and project by Hsing Lo; https://hsing.org; github: hsingtism'

// Everything that gets written out, collected under one object.
const bigGlob = {
    sourceInformation,
    dictionaryCount_: addSumColumns(dictionaryCount),
    dictionaryCountRanks: extractTopBottomValues(dictionaryCount, LIST_TRUNCATE_LENGTH),
    corpusTotal_: addSumColumns(corpusTotal),
    corpusTotalRanks: extractTopBottomValues(corpusTotal, LIST_TRUNCATE_LENGTH),
    corpusTotalLog10: corpusTotalLog,
    corpusCount_: addSumColumns(corpusCount),
    corpusCountRanks: extractTopBottomValues(corpusCount, LIST_TRUNCATE_LENGTH),
    wordBucketTruncated,
    frequBucketTruncated,
}

// console.log(JSON.stringify(bigGlob))
const serializedGlob = JSON.stringify(bigGlob)
fs.writeFileSync('test.json', serializedGlob)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment