Created
August 3, 2023 00:21
-
-
Save jrsinclair/45eab4e9db049fd8ba5e508a2acdb120 to your computer and use it in GitHub Desktop.
A couple of rough compression/decompression functions
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
const countSubstringMatches = (str: string, substr: string, startAt: number) => { | |
const substringLength = substr.length; | |
let count = 1; | |
for ( | |
let substringPos = str.indexOf(substr, startAt + substringLength); | |
substringPos >= 0; | |
++count | |
) | |
substringPos = str.indexOf(substr, substringPos + substringLength); | |
return count; | |
}; | |
export const findRepeatedSubstrings = (haystack: string) => { | |
const helper = (size: number, soFar = new Map<string, number>()): Map<string, number> => { | |
if (size <= 2 || haystack.length < size) return soFar; | |
const emptyList = new Array(haystack.length - size).fill(undefined); | |
const substrings = emptyList.map((_, i) => haystack.substring(i, i + size)); | |
const counts = substrings.reduce((acc, str, i) => { | |
if (acc.has(str) || str.indexOf(DELIM) >= 0) return acc; | |
const matchesCount = countSubstringMatches(haystack, str, i); | |
if (matchesCount >= 2) acc.set(str, matchesCount); | |
return acc; | |
}, soFar); | |
return helper(size - 1, counts); | |
}; | |
return helper; | |
}; | |
export const cutOn = (str: string, pattern: string) => { | |
const idx = str.indexOf(pattern); | |
if (idx < 0) return [str, '']; | |
return [str.substring(0, idx), str.substring(idx + pattern.length)]; | |
}; | |
const inBadRange = (n: number) => n < 32 || (n >= 127 && n <= 160); | |
const findUnusedChar = (str: string) => { | |
let c; | |
for (let i = 32; i < 255; i++) { | |
c = String.fromCharCode(i); | |
if (!inBadRange(i) && str.indexOf(c) < 0) return c; | |
} | |
return undefined; | |
}; | |
const calcSavings = ([str, count]: [string, number]) => { | |
return (count - 1) * str.length - 2; | |
}; | |
const findMostSavingestSubstring = (counts: Map<string, number>) => { | |
const sorted = [...counts.entries()].sort((a, b) => calcSavings(b) - calcSavings(a)); | |
return sorted[0][0]; | |
}; | |
export const DELIM = String.fromCharCode(1); | |
const COMMON_CHARS = [ | |
['"', "'"], | |
[',', '~'], | |
['}', ')'], | |
['{', '('], | |
[' ', '.'], | |
]; | |
const escapeRegexChar = (c: string) => ('()[]{}.*'.includes(c) ? `\\${c}` : c); | |
const swapCommonChars = (str: string) => { | |
return COMMON_CHARS.reduce((s, [a, b]) => { | |
const regex = new RegExp(`${escapeRegexChar(a)}|${escapeRegexChar(b)}`, 'g'); | |
return s.replace(regex, ($1) => ($1 === a ? b : a)); | |
}, str); | |
}; | |
const unswapCommonChars = (str: string) => { | |
return COMMON_CHARS.reverse().reduce((s, [a, b]) => { | |
const regex = new RegExp(`${escapeRegexChar(a)}|${escapeRegexChar(b)}`, 'g'); | |
return s.replace(regex, ($1) => ($1 === a ? b : a)); | |
}, str); | |
}; | |
const _decompress = (toDecompress: string): string => { | |
if (toDecompress.indexOf(DELIM) === -1) return toDecompress; | |
const [dictPart, encodedPart] = cutOn(toDecompress, DELIM); | |
const replaceChar = dictPart[0] ?? ''; | |
const replacement = dictPart.slice(1) ?? ''; | |
const decoded = encodedPart.split(replaceChar).join(replacement); | |
return decoded.indexOf(DELIM) === -1 ? decoded : _decompress(decoded); | |
}; | |
export const decompress = (toDecompress: string) => unswapCommonChars(_decompress(toDecompress)); | |
export const _compress = (toCompress: string): string => { | |
if (toCompress.length <= 5) return toCompress; | |
const substrings = findRepeatedSubstrings(toCompress)( | |
Math.min(128, Math.floor(toCompress.length / 2)), | |
); | |
if (substrings.size === 0) return toCompress; | |
const c = findUnusedChar(toCompress); | |
if (!c) { | |
console.log('Ran out of characters'); | |
return toCompress; | |
} | |
const substr = findMostSavingestSubstring(substrings); | |
const encodedPart = toCompress.split(substr).join(c); | |
const encoded = `${c}${substr}${DELIM}${encodedPart}`; | |
return encoded.length < toCompress.length ? _compress(encoded) : toCompress; | |
}; | |
export const compress = (toCompress: string) => _compress(swapCommonChars(toCompress)); |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import * as fc from 'fast-check'; | |
import { decompress, compress, findRepeatedSubstrings, cutOn, DELIM } from './json-compress'; | |
const repeat = <A>(x: A, length: number) => new Array(length).fill(x); | |
describe.each` | |
input | pattern | expected | |
${'abc'} | ${'b'} | ${['a', 'c']} | |
${'a'} | ${'b'} | ${['a', '']} | |
${'abbbbc'} | ${'b'} | ${['a', 'bbbc']} | |
`('cutOn()', ({ input, pattern, expected }) => { | |
it(`should return ${JSON.stringify( | |
expected, | |
)} when given pattern ${pattern} and input ${input}`, () => { | |
const actual = cutOn(input, pattern); | |
expect(actual).toEqual(expected); | |
}); | |
}); | |
describe('cutOn()', () => { | |
it('should always return the same string if you cut then join', () => | |
fc.assert( | |
fc.property(fc.string(), fc.string({ minLength: 1 }), fc.string(), (pre, delim, post) => { | |
const input = `${pre}${delim}${post}`; | |
expect(cutOn(input, delim).join(delim)).toBe(input); | |
}), | |
)); | |
}); | |
describe.each` | |
input | expected | |
${'a'} | ${[]} | |
${'aa'} | ${[]} | |
${'aaaa'} | ${[]} | |
${'aaaaaa'} | ${[['aaa', 2]]} | |
`('findRepeatedSubstrings()', ({ input, expected }) => { | |
it(`should return ${JSON.stringify(expected)} when given the input ${input}`, () => { | |
const actual = findRepeatedSubstrings(input)(Math.floor(input.length / 2)); | |
expect([...actual.entries()]).toEqual(expected); | |
}); | |
}); | |
describe('compress()', () => { | |
it('should always return the same string when a compressed string is decompressed', () => { | |
fc.assert( | |
fc.property(fc.string(), (str) => { | |
const compressed = compress(str); | |
const recomposed = decompress(compressed); | |
expect(recomposed).toBe(str); | |
}), | |
); | |
}); | |
it('should ensure the compressed string is always smaller than (or equal to) the input string', () => { | |
fc.assert( | |
fc.property(fc.string(), (str) => { | |
const encoded = compress(str); | |
const actualLength = encoded.length; | |
expect(actualLength).toBeLessThanOrEqual(str.length); | |
}), | |
); | |
}); | |
it('should always provide a smaller string when given a string made by repeating characters', () => { | |
fc.assert( | |
fc.property( | |
fc.string({ minLength: 3, maxLength: 1000 }), | |
fc.nat({ max: 98 }).map((x) => x + 2), | |
(str, n) => { | |
const toCompress = repeat(str, n).join(''); | |
const compressed = compress(toCompress); | |
const maxCompressedLength = 2 + toCompress.length + str.length + n - n * str.length; | |
expect(compressed.length).toBeLessThanOrEqual(maxCompressedLength); | |
}, | |
), | |
); | |
}); | |
}); |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment