Skip to content

Instantly share code, notes, and snippets.

@jrsinclair
Created August 3, 2023 00:21
Show Gist options
  • Save jrsinclair/45eab4e9db049fd8ba5e508a2acdb120 to your computer and use it in GitHub Desktop.
Save jrsinclair/45eab4e9db049fd8ba5e508a2acdb120 to your computer and use it in GitHub Desktop.
A couple of rough compression/decompression functions
const countSubstringMatches = (str: string, substr: string, startAt: number) => {
const substringLength = substr.length;
let count = 1;
for (
let substringPos = str.indexOf(substr, startAt + substringLength);
substringPos >= 0;
++count
)
substringPos = str.indexOf(substr, substringPos + substringLength);
return count;
};
export const findRepeatedSubstrings = (haystack: string) => {
const helper = (size: number, soFar = new Map<string, number>()): Map<string, number> => {
if (size <= 2 || haystack.length < size) return soFar;
const emptyList = new Array(haystack.length - size).fill(undefined);
const substrings = emptyList.map((_, i) => haystack.substring(i, i + size));
const counts = substrings.reduce((acc, str, i) => {
if (acc.has(str) || str.indexOf(DELIM) >= 0) return acc;
const matchesCount = countSubstringMatches(haystack, str, i);
if (matchesCount >= 2) acc.set(str, matchesCount);
return acc;
}, soFar);
return helper(size - 1, counts);
};
return helper;
};
export const cutOn = (str: string, pattern: string) => {
const idx = str.indexOf(pattern);
if (idx < 0) return [str, ''];
return [str.substring(0, idx), str.substring(idx + pattern.length)];
};
const inBadRange = (n: number) => n < 32 || (n >= 127 && n <= 160);
const findUnusedChar = (str: string) => {
let c;
for (let i = 32; i < 255; i++) {
c = String.fromCharCode(i);
if (!inBadRange(i) && str.indexOf(c) < 0) return c;
}
return undefined;
};
const calcSavings = ([str, count]: [string, number]) => {
return (count - 1) * str.length - 2;
};
const findMostSavingestSubstring = (counts: Map<string, number>) => {
const sorted = [...counts.entries()].sort((a, b) => calcSavings(b) - calcSavings(a));
return sorted[0][0];
};
export const DELIM = String.fromCharCode(1);
const COMMON_CHARS = [
['"', "'"],
[',', '~'],
['}', ')'],
['{', '('],
[' ', '.'],
];
const escapeRegexChar = (c: string) => ('()[]{}.*'.includes(c) ? `\\${c}` : c);
const swapCommonChars = (str: string) => {
return COMMON_CHARS.reduce((s, [a, b]) => {
const regex = new RegExp(`${escapeRegexChar(a)}|${escapeRegexChar(b)}`, 'g');
return s.replace(regex, ($1) => ($1 === a ? b : a));
}, str);
};
const unswapCommonChars = (str: string) => {
return COMMON_CHARS.reverse().reduce((s, [a, b]) => {
const regex = new RegExp(`${escapeRegexChar(a)}|${escapeRegexChar(b)}`, 'g');
return s.replace(regex, ($1) => ($1 === a ? b : a));
}, str);
};
const _decompress = (toDecompress: string): string => {
if (toDecompress.indexOf(DELIM) === -1) return toDecompress;
const [dictPart, encodedPart] = cutOn(toDecompress, DELIM);
const replaceChar = dictPart[0] ?? '';
const replacement = dictPart.slice(1) ?? '';
const decoded = encodedPart.split(replaceChar).join(replacement);
return decoded.indexOf(DELIM) === -1 ? decoded : _decompress(decoded);
};
export const decompress = (toDecompress: string) => unswapCommonChars(_decompress(toDecompress));
export const _compress = (toCompress: string): string => {
if (toCompress.length <= 5) return toCompress;
const substrings = findRepeatedSubstrings(toCompress)(
Math.min(128, Math.floor(toCompress.length / 2)),
);
if (substrings.size === 0) return toCompress;
const c = findUnusedChar(toCompress);
if (!c) {
console.log('Ran out of characters');
return toCompress;
}
const substr = findMostSavingestSubstring(substrings);
const encodedPart = toCompress.split(substr).join(c);
const encoded = `${c}${substr}${DELIM}${encodedPart}`;
return encoded.length < toCompress.length ? _compress(encoded) : toCompress;
};
export const compress = (toCompress: string) => _compress(swapCommonChars(toCompress));
import * as fc from 'fast-check';
import { decompress, compress, findRepeatedSubstrings, cutOn, DELIM } from './json-compress';
const repeat = <A>(x: A, length: number) => new Array(length).fill(x);
describe.each`
input | pattern | expected
${'abc'} | ${'b'} | ${['a', 'c']}
${'a'} | ${'b'} | ${['a', '']}
${'abbbbc'} | ${'b'} | ${['a', 'bbbc']}
`('cutOn()', ({ input, pattern, expected }) => {
it(`should return ${JSON.stringify(
expected,
)} when given pattern ${pattern} and input ${input}`, () => {
const actual = cutOn(input, pattern);
expect(actual).toEqual(expected);
});
});
describe('cutOn()', () => {
it('should always return the same string if you cut then join', () =>
fc.assert(
fc.property(fc.string(), fc.string({ minLength: 1 }), fc.string(), (pre, delim, post) => {
const input = `${pre}${delim}${post}`;
expect(cutOn(input, delim).join(delim)).toBe(input);
}),
));
});
describe.each`
input | expected
${'a'} | ${[]}
${'aa'} | ${[]}
${'aaaa'} | ${[]}
${'aaaaaa'} | ${[['aaa', 2]]}
`('findRepeatedSubstrings()', ({ input, expected }) => {
it(`should return ${JSON.stringify(expected)} when given the input ${input}`, () => {
const actual = findRepeatedSubstrings(input)(Math.floor(input.length / 2));
expect([...actual.entries()]).toEqual(expected);
});
});
describe('compress()', () => {
it('should always return the same string when a compressed string is decompressed', () => {
fc.assert(
fc.property(fc.string(), (str) => {
const compressed = compress(str);
const recomposed = decompress(compressed);
expect(recomposed).toBe(str);
}),
);
});
it('should ensure the compressed string is always smaller than (or equal to) the input string', () => {
fc.assert(
fc.property(fc.string(), (str) => {
const encoded = compress(str);
const actualLength = encoded.length;
expect(actualLength).toBeLessThanOrEqual(str.length);
}),
);
});
it('should always provide a smaller string when given a string made by repeating characters', () => {
fc.assert(
fc.property(
fc.string({ minLength: 3, maxLength: 1000 }),
fc.nat({ max: 98 }).map((x) => x + 2),
(str, n) => {
const toCompress = repeat(str, n).join('');
const compressed = compress(toCompress);
const maxCompressedLength = 2 + toCompress.length + str.length + n - n * str.length;
expect(compressed.length).toBeLessThanOrEqual(maxCompressedLength);
},
),
);
});
});
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment