Last active
April 13, 2024 05:31
-
-
Save g023/20144fb6f443b7538db69e516660f3dc to your computer and use it in GitHub Desktop.
Making dictionaries with Javascript
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<!-- /*
// author: https://github.com/g023
// license: BSD 3-Clause (https://opensource.org/license/BSD-3-clause)
An example that compresses and decompresses some text using a given dictionary.
The dictionary created by findTwoCharCombinations(text) or findThreeCharCombinations(text) is first reduced to a simpler array of combination strings.
That should make compression even better.
TODO: compress the dictionary using the dictionary.
*/ -->
<script> | |
/**
 * Counts every overlapping three-character sequence (trigram) in `text`.
 *
 * One space is added to each end of the text before counting, so the first
 * and last characters also show up in edge trigrams such as " Th" or "c. ".
 *
 * @param {string} text - Text to analyse.
 * @returns {{combination: string, count: number}[]} One entry per distinct
 *   trigram, sorted by count (descending). Entries with the same count keep
 *   the order in which the trigram first appears in the text.
 */
function findThreeCharCombinations(text) {
  const paddedText = ` ${text} `;

  // A Map iterates in first-seen order. A plain object would list
  // integer-like keys such as "123" ahead of every other key, which silently
  // reshuffles entries that tie on count.
  const counts = new Map();
  for (let i = 0; i < paddedText.length - 2; i++) {
    const combination = paddedText.substring(i, i + 3);
    counts.set(combination, (counts.get(combination) || 0) + 1);
  }

  // Array.prototype.sort is stable (ES2019+), so ties stay in first-seen order.
  return Array.from(counts, ([combination, count]) => ({ combination, count }))
    .sort((a, b) => b.count - a.count);
}
// begin compression related

/**
 * Loads a text file from the server with a blocking (synchronous) request.
 *
 * Synchronous XMLHttpRequest freezes the page until the response arrives and
 * is deprecated on the main thread; it is kept here so callers can keep using
 * the return value directly.
 *
 * @param {string} filePath - URL of the file, relative to the current page.
 * @returns {string|null} The response body, or null when the HTTP status is
 *   anything other than 200.
 */
function loadFile(filePath) {
  const xmlhttp = new XMLHttpRequest();
  // Third argument `false` = synchronous: send() returns only once the
  // response has been received.
  xmlhttp.open("GET", filePath, false);
  xmlhttp.send();
  return xmlhttp.status === 200 ? xmlhttp.responseText : null;
}

// Load the text file shake.txt from the same directory (./shake.txt) on the server.
// This has to run before `text` is used anywhere: a `const` cannot be read
// before its declaration has executed (ReferenceError otherwise).
const text = loadFile("shake.txt");

// Example usage:
// const text = "This is a sample text with some three character combinations like 'Thi', 'his', 'isa', etc.";
const threeCharCombinations = findThreeCharCombinations(text);
console.log(threeCharCombinations);
/**
 * Reduces a frequency table to just its combination strings, which is all the
 * compression and decompression routines need.
 *
 * @param {{combination: string}[]} dict - Entries in dictionary order.
 * @returns {string[]} The `combination` of each entry, in the same order.
 */
function dictToArr(dict) {
  return Array.from(dict, (entry) => entry.combination);
}
/**
 * Encodes `text` as a list of numbers that refer to entries of `the_dict_arr`.
 *
 * Output layout:
 *   [0]   number of space characters appended to make the text length a
 *         multiple of the chunk size (0 when no padding was needed);
 *   [1]   dictionary index of the first chunk;
 *   [k]   for every later chunk: its dictionary index minus the value stored
 *         at [k - 1].
 *
 * The chunk size is the length of the first dictionary entry.
 *
 * @param {string} text - Text to compress.
 * @param {string[]} the_dict_arr - Dictionary of equally sized chunks.
 * @returns {number[]} The encoded text as described above.
 * @throws {Error} If the dictionary is empty, its first entry is empty, or a
 *   chunk of the (padded) text does not occur in the dictionary. Such a chunk
 *   cannot be encoded in a way that decompresses back to the input.
 */
function compressText(text, the_dict_arr) {
  if (the_dict_arr.length === 0 || !(the_dict_arr[0].length > 0)) {
    throw new Error("compressText: the dictionary must start with a non-empty entry");
  }
  const chunk_size = the_dict_arr[0].length;
  console.log("Chunk size: ", chunk_size);

  // Spaces needed to reach the next multiple of chunk_size; the outer modulo
  // yields 0 (not chunk_size) when the text is already aligned.
  const pad_length = (chunk_size - (text.length % chunk_size)) % chunk_size;
  const paddedText = text.padEnd(text.length + pad_length, ' ');

  // Look-up table built once instead of scanning the dictionary per chunk.
  // Only the first position of a repeated entry is kept, matching a
  // left-to-right search.
  const index_of = new Map();
  the_dict_arr.forEach((entry, position) => {
    if (!index_of.has(entry)) {
      index_of.set(entry, position);
    }
  });

  // The first element records how much padding must be removed after decoding.
  const compressed_text = [pad_length];
  for (let i = 0; i < paddedText.length; i += chunk_size) {
    const chunk = paddedText.substring(i, i + chunk_size);
    const index = index_of.get(chunk);
    if (index === undefined) {
      throw new Error(
        `compressText: chunk ${JSON.stringify(chunk)} at offset ${i} is not in the dictionary`
      );
    }
    if (i === 0) {
      compressed_text.push(index);
    } else {
      // Stored relative to the previously stored value; the decoder adds that
      // value back to recover the index.
      compressed_text.push(index - compressed_text[compressed_text.length - 1]);
    }
  }
  return compressed_text;
}
/**
 * Rebuilds text from a number list whose first element is the amount of
 * trailing padding and whose remaining elements refer to `the_dict_arr`.
 *
 * The first data element is a dictionary index; every later element is added
 * to the element stored just before it to obtain the dictionary index.
 *
 * The input array is left untouched, so the same data can be decoded again.
 *
 * @param {number[]} compressed_text - Padding size followed by the encoded chunks.
 * @param {string[]} the_dict_arr - Dictionary used when the data was encoded.
 * @returns {string} The decoded text with the trailing padding removed.
 * @throws {RangeError} If an element resolves to an index that has no
 *   dictionary entry.
 */
function decompressText(compressed_text, the_dict_arr) {
  const chunk_size = the_dict_arr[0].length;
  console.log("Chunk size: ", chunk_size);

  // Read the padding size in place; shift() would remove it from the caller's array.
  const padding = compressed_text.length > 0 ? compressed_text[0] : 0;

  const chunks = [];
  for (let i = 1; i < compressed_text.length; i++) {
    const index = i === 1
      ? compressed_text[i]
      : compressed_text[i] + compressed_text[i - 1];
    const chunk = the_dict_arr[index];
    if (chunk === undefined) {
      // Without this check the literal text "undefined" would end up in the output.
      throw new RangeError(
        `decompressText: element ${i} resolves to index ${index}, which is not in the dictionary`
      );
    }
    chunks.push(chunk);
  }

  // Drop the padding that was appended before encoding.
  const decompressed_text = chunks.join("");
  return decompressed_text.substring(0, decompressed_text.length - padding);
}
// example: build a dictionary from the loaded text, compress the text with it,
// decompress the result again and report the sizes involved
let the_dict = findThreeCharCombinations(text);
let the_dict_arr = dictToArr(the_dict);
let compressed_text = compressText(text, the_dict_arr);
console.log("Compressed text: ", compressed_text);
// Hand decompressText() its own copy so that `compressed_text` is guaranteed
// to be unchanged when it is inspected in the console and sized below.
let decompressed_text = decompressText(compressed_text.slice(), the_dict_arr);
console.log("Decompressed text: ", decompressed_text);
// debug string sizes
// (text sizes are character counts; the compressed size is a count of numbers)
console.log("Original text size: ", text.length);
console.log("Compressed text size: ", compressed_text.length);
console.log("Decompressed text size: ", decompressed_text.length);
// dict array size
console.log("Dict array size: ", the_dict_arr.length);
</script> |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/*
// author: https://github.com/g023
// license: BSD 3-Clause (https://opensource.org/license/BSD-3-clause)
This function takes a string text as input, pads it with one space at each end, and returns an array of objects containing the three-character combinations and their respective counts, ordered by frequency from most used to least used.
*/
/**
 * Counts every overlapping three-character sequence (trigram) in `text`,
 * including sequences that contain whitespace.
 *
 * One space is added to each end of the text before counting, so the first
 * and last characters also show up in edge trigrams such as " Th" or "c. ".
 *
 * @param {string} text - Text to analyse.
 * @returns {{combination: string, count: number}[]} One entry per distinct
 *   trigram, sorted by count (descending). Entries with the same count keep
 *   the order in which the trigram first appears in the text.
 */
function findThreeCharCombinations(text) {
  const paddedText = ` ${text} `;

  // A Map iterates in first-seen order. A plain object would list
  // integer-like keys such as "123" ahead of every other key, which silently
  // reshuffles entries that tie on count.
  const counts = new Map();
  for (let i = 0; i < paddedText.length - 2; i++) {
    const combination = paddedText.substring(i, i + 3);
    counts.set(combination, (counts.get(combination) || 0) + 1);
  }

  // Array.prototype.sort is stable (ES2019+), so ties stay in first-seen order.
  return Array.from(counts, ([combination, count]) => ({ combination, count }))
    .sort((a, b) => b.count - a.count);
}
// Example usage: tally the trigrams of a sample sentence and print the
// resulting frequency table (most frequent first) to the console.
const text =
  "This is a sample text with some three character combinations like 'Thi', 'his', 'isa', etc.";
const threeCharCombinations = findThreeCharCombinations(text);
console.log(threeCharCombinations);
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/*
// author: https://github.com/g023
// license: BSD 3-Clause (https://opensource.org/license/BSD-3-clause)
This function takes a string text as input, pads it with one space at each end, and returns an array of objects containing the two-character combinations and their respective counts, ordered by frequency from most used to least used.
*/
/**
 * Counts every overlapping two-character sequence (bigram) in `text`,
 * including sequences that contain whitespace.
 *
 * One space is added to each end of the text before counting, so the first
 * and last characters also show up in edge bigrams such as " T" or ". ".
 *
 * @param {string} text - Text to analyse.
 * @returns {{combination: string, count: number}[]} One entry per distinct
 *   bigram, sorted by count (descending). Entries with the same count keep
 *   the order in which the bigram first appears in the text.
 */
function findTwoCharCombinations(text) {
  const paddedText = ` ${text} `;

  // A Map iterates in first-seen order. A plain object would list
  // integer-like keys such as "12" ahead of every other key, which silently
  // reshuffles entries that tie on count.
  const counts = new Map();
  for (let i = 0; i < paddedText.length - 1; i++) {
    const combination = paddedText.substring(i, i + 2);
    counts.set(combination, (counts.get(combination) || 0) + 1);
  }

  // Array.prototype.sort is stable (ES2019+), so ties stay in first-seen order.
  return Array.from(counts, ([combination, count]) => ({ combination, count }))
    .sort((a, b) => b.count - a.count);
}
// Example usage: tally the bigrams of a sample sentence and print the
// resulting frequency table (most frequent first) to the console.
const text =
  "This is a sample text with some two character combinations like 'th', 'is', 'sa', etc.";
const twoCharCombinations = findTwoCharCombinations(text);
console.log(twoCharCombinations);
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment