Skip to content

Instantly share code, notes, and snippets.

@premasagar
Forked from bga/_gzip.js
Created August 11, 2010 17:57
Show Gist options
  • Save premasagar/519405 to your computer and use it in GitHub Desktop.
Save premasagar/519405 to your computer and use it in GitHub Desktop.
window._gzip = (function()
{
var _matchCount = function(s, word)
{
var n = 0, wordLen = word.length, p = -wordLen;
while((p = s.indexOf(word, p + wordLen)) > -1)
++n;
return n;
};
var _matchCountInDict = function(dict, word)
{
var n = 0, i = dict.length;
while(i--)
{
if(dict[i].word.indexOf(word) > -1)
++n;
}
return n;
};
var _escapeForCString = function(s)
{
return s.replace(/\\/g, '\\\\').
replace(/\'/g, '\\\'').
//replace(/\"/g, '\\\"').
replace(/\r/g, '\\r').
replace(/\0/g, '\\x00').
replace(/\n/g, '\\n');
};
var reEscapeableRE = /[\[\]\(\)\{\}\?\!\:\|\+\.\*\'\"]/g;
var _escapeForRegExp = function(s)
{
return s.replace(reEscapeableRE, '\\$&');
};
var _escapeForRegExpReplace = function(s)
{
return s.replace(/\$/g, '\\$$');
};
var _removeEscapebleChars = function(a)
{
delete a['^'];
delete a['\"'];
delete a['\''];
delete a['\\'];
delete a['$'];
delete a['['];
delete a[']'];
delete a['('];
delete a[')'];
delete a['{'];
delete a['}'];
delete a['?'];
delete a['!'];
delete a[':'];
delete a['|'];
delete a['+'];
delete a['`'];
delete a['.'];
delete a['*'];
};
var _dumpDict = function(dict)
{
var t = '';
var d, i = -1; while((d = dict[++i]))
t += d.word + '--';
return t;
};
//var unpackCode = 'for(var s=\'__CODE__\',d=\'__DICT__\'.split("`"),i=0;v=d[i++];)s=s.replace(RegExp(v[0],"g"),v.slice(1));console.log(s)';
var unpackCode = 'for(var s=\'__CODE__\',d=\'__DICT__\'.split("`"),i=0;v=d[i++];)s=s.replace(RegExp(v[0],"g"),v.slice(1));eval(s)';
/**
@fn pack and wrap to <unpackCode> code <s> using gzip like algo (dictonaty based)
@param allowNonAcsiiChars {Boolean} {Default = true} true if you allow use 32-256 chars range else 32-127.
@param minProfit {Number} {Default = 0} threshold in bytes for each replace pair
@return packed and wrapped to <unpackCode> code
*/
var _gzip = function(s, allowNonAcsiiChars, minProfit)
{
if(allowNonAcsiiChars == null)
allowNonAcsiiChars = true;
if(minProfit == null)
minProfit = 1;
var availableCharMap = {};
var i = 31, e = (allowNonAcsiiChars) ? 256 : 127;
// gen all available char map
while(++i < e)
availableCharMap[String.fromCharCode(i)] = 0;
// and remove some chars which require escaping (in RegExp too)
_removeEscapebleChars(availableCharMap);
// reduce availableCharMap - remove chars which used in <s>
i = -1, e = s.length; while(++i < e)
{
delete availableCharMap[s.charAt(i)];
}
// convert <allowNonAcsiiChars> to <availableChars>
var availableChars = [], j = 0;
for(var i in availableCharMap)
{
if(availableCharMap.hasOwnProperty(i))
availableChars[j++] = i;
}
//console.log(availableChars.join('> <'));
//console.log(availableChars);
var sLen = s.length;
var dict = [], dictLen = 0;
// build <dict>
i = -1, e = sLen - 2; while(++i < e)
{
var c = s.charAt(i);
var j = i + 1;
do
{
while(++j < e && s.charAt(j) != c)
;
if(j < e && s.charAt(i + 1) == s.charAt(++j))
{
var k = i + 1;
do
{
var word = s.slice(i, k + 1);
//if(!reEscapeableRE.test(word))
dict[dictLen++] = {word: word, matchCount: _matchCount(s, word)};
}
while(++k < sLen && ++j < sLen && s.charAt(k) == s.charAt(j));
}
}
while(j < e);
}
// sort <dict> by word
dict.sort(function(a, b){ return 2*(a.word < b.word) - 1; });
//console.log(_dumpDict(dict));
// remove duplicating words
var d, i = -1, word, j = 0; while((d = dict[++i]))
{
if(d.word != word)
{
dict[j++] = d;
word = d.word;
}
}
dict.length = dictLen = j;
// word will be replaced using RegExp - prepare escaped vestion
var d, i = -1; while((d = dict[++i]))
d.escapedWord = _escapeForRegExp(d.word);
// calc profit of each word in <dict>
var d, i = -1; while((d = dict[++i]))
{
var escapedWord = _escapeForRegExpReplace(d.word);
d.profit = d.matchCount*(escapedWord.length - 1) - escapedWord.length - 1 - 1;
}
// remove words with profit < <minProfit>
var d, i = -1, j = -1; while((d = dict[++i]))
{
if(d.profit > minProfit)
dict[++j] = d;
}
dict.length = dictLen = j + 1;
//console.log(_dumpDict(dict));
// and sort <dict> from last to top by profit and word length
dict.sort(function(a, b){ return (b.profit - a.profit) || (b.word.length - a.word.length); });
//console.log(_dumpDict(dict));
// assign replaceChar, replace words to replaceChars and remove unused words
var t = s;
var d, i = -1, j = 0; while((d = dict[++i]))
{
if(t.indexOf(d.word) > -1)
{
d.replaceChar = availableChars[j];
t = t.replace(new RegExp(d.escapedWord, 'g'), d.replaceChar);
dict[j] = d;
j++;
}
}
dict.length = dictLen = j;
console.log('t = ', t);
// compress <dict>
// calc match one word in other words
var d, i = -1; while((d = dict[++i]))
d.matchCountInDict = _matchCountInDict(dict, d.word);
// sort <dict> by matchCountInDict from max to min
dict.sort(function(a, b){ return a.matchCountInDict - b.matchCountInDict; });
// redice <dict>
var d, i = -1; while((d = dict[++i]))
{
var word = d.word;
var j = i, d2; while((d2 = dict[++i]))
word = word.replace(new RegExp(d2.escapedWord, 'g'), d2.replaceChar);
d.word = word;
}
//console.log(_dumpDict(dict));
console.log(dict);
//return;
// final
// serialize <dict>
var dictString = '';
var d, i = -1; while((d = dict[++i]))
dictString += d.replaceChar + _escapeForRegExpReplace(d.word) + '`';
dictString = dictString.slice(0, -1);
console.log(dictString);
// and create unpack code
var code = unpackCode.
replace('__DICT__', _escapeForCString(dictString)).
replace('__CODE__', _escapeForCString(t))
;
console.log('code.length = ', code.length);
console.log('s.length = ', s.length);
console.log('profit = ', (s.length/code.length*100).toFixed(2), '%');
return code;
};
return _gzip;
})();
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment