premasagar/_gzip.js

## _gzip.js
window._gzip = (function()
{
  /**
    @fn count non-overlapping occurrences of <word> in <s>, scanning left to right
    @param s {String} text to search in
    @param word {String} substring to look for
    @return {Number} number of matches, 0 for an empty <word>
  */
  var _matchCount = function(s, word)
  {
    var wordLen = word.length;

    // an empty word matches at every position without ever advancing,
    // so the scan below would never terminate - report "no matches" instead
    if(wordLen === 0)
      return 0;

    var n = 0;

    // resume right after the previous match so that matches never overlap
    for(var p = s.indexOf(word); p > -1; p = s.indexOf(word, p + wordLen))
      ++n;

    return n;
  };

  /**
    @fn count entries of <dict> whose .word contains <word> as a substring
    @param dict {Array} entries exposing a string .word property
    @param word {String} substring to look for
    @return {Number} number of entries containing <word>
  */
  var _matchCountInDict = function(dict, word)
  {
    var hits = 0;

    for(var idx = 0, total = dict.length; idx < total; ++idx)
    {
      var entry = dict[idx];

      if(entry.word.indexOf(word) !== -1)
        hits += 1;
    }

    return hits;
  };


  /**
    @fn escape <s> for embedding between single quotes in generated source
    @param s {String} raw text
    @return {String} text where backslash, single quote, CR, NUL and LF are
            replaced by their escape sequences
  */
  var _escapeForCString = function(s)
  {
    // double quotes are intentionally left as is
    var escapeSequences = {
      '\\': '\\\\',
      '\'': '\\\'',
      '\r': '\\r',
      '\0': '\\x00',
      '\n': '\\n'
    };

    // one pass is enough: no escape sequence contains a char that needs escaping again
    return s.replace(/[\\'\r\0\n]/g, function(ch)
    {
      return escapeSequences[ch];
    });
  };
  // chars which get a leading backslash before a word is handed to `new RegExp`;
  // backslash, '^' and '$' are included, otherwise words such as "\n" or "a$"
  // would build a pattern that matches something other than the word itself
  var reEscapeableRE = /[\\\^\$\[\]\(\)\{\}\?\!\:\|\+\.\*\'\"]/g;
  /**
    @fn escape <s> so that `new RegExp(result)` matches <s> literally
    @param s {String} literal text
    @return {String} pattern source with every char of <reEscapeableRE> backslash-escaped
  */
  var _escapeForRegExp = function(s)
  {
    // '$&' in the replacement stands for the matched char itself
    return s.replace(reEscapeableRE, '\\$&');
  };
  /**
    @fn escape <s> for use as the replacement string of String.prototype.replace
    @param s {String} literal text
    @return {String} text where every '$' is doubled, so that the replace call
            inserts <s> verbatim
  */
  var _escapeForRegExpReplace = function(s)
  {
    // a backslash does not escape '$' in a replacement string, only '$$' does;
    // '$$$$' is a replacement pattern itself and emits two literal '$'
    return s.replace(/\$/g, '$$$$');
  };

  /**
    @fn delete from <a> the keys for chars that are never handed out as replace chars
    @param a {Object} map keyed by single chars, modified in place
  */
  var _removeEscapebleChars = function(a)
  {
    // one char per key to drop: ^ " ' \ $ [ ] ( ) { } ? ! : | + ` . *
    var reservedChars = '^"\'\\$[](){}?!:|+`.*';

    for(var idx = 0; idx < reservedChars.length; ++idx)
      delete a[reservedChars.charAt(idx)];
  };

  /**
    @fn debug helper - join the words of <dict> into one string
    @param dict {Array} entries exposing a .word property
    @return {String} every word followed by '--'; stops at the first falsy entry
  */
  var _dumpDict = function(dict)
  {
    var pieces = [];

    for(var idx = 0; dict[idx]; ++idx)
      pieces.push(dict[idx].word + '--');

    return pieces.join('');
  };

  // Self-extracting template. _gzip() substitutes __CODE__ with the packed text and
  // __DICT__ with the serialized dictionary. The template splits the dictionary on "`";
  // the first char of each entry is used as a RegExp and every match is replaced with
  // the rest of the entry. The resulting text is finally passed to eval().
  // NOTE(review): <v> is not declared in the template, so it is assigned as an implicit
  // global by the generated code - confirm that this is acceptable for consumers.
  // Debug variant, logs the unpacked text instead of evaluating it:
  //var unpackCode = 'for(var s=\'__CODE__\',d=\'__DICT__\'.split("`"),i=0;v=d[i++];)s=s.replace(RegExp(v[0],"g"),v.slice(1));console.log(s)';
  var unpackCode = 'for(var s=\'__CODE__\',d=\'__DICT__\'.split("`"),i=0;v=d[i++];)s=s.replace(RegExp(v[0],"g"),v.slice(1));eval(s)';

  /**
    @fn pack <s> with a gzip-like (dictionary based) algorithm and wrap the result
        into the self-extracting <unpackCode> template
    @param s {String} source text to pack
    @param allowNonAcsiiChars {Boolean} {Default = true} true to pick replace chars from
           char codes 32-255, false to stay within 32-126
    @param minProfit {Number} {Default = 1} a word is kept only when its estimated
           profit (in chars) is greater than this threshold
    @return {String} packed and wrapped to <unpackCode> code
  */
  var _gzip = function(s, allowNonAcsiiChars, minProfit)
  {
    if(allowNonAcsiiChars == null)
      allowNonAcsiiChars = true;
    if(minProfit == null)
      minProfit = 1;

    var i, j, k, d;
    var sLen = s.length;

    // gen all available char map
    var availableCharMap = {};
    var charEnd = allowNonAcsiiChars ? 256 : 127;

    for(i = 32; i < charEnd; ++i)
      availableCharMap[String.fromCharCode(i)] = 0;

    // remove chars which require escaping (in RegExp too) ...
    _removeEscapebleChars(availableCharMap);

    // ... and chars which are used in <s>: a replace char must not occur in the source
    for(i = 0; i < sLen; ++i)
      delete availableCharMap[s.charAt(i)];

    // convert <availableCharMap> to <availableChars>
    var availableChars = [];

    for(var ch in availableCharMap)
    {
      if(availableCharMap.hasOwnProperty(ch))
        availableChars.push(ch);
    }

    // build <dict>: for every position <i> look for later positions where the same
    // two chars start again and record each common prefix (2+ chars) as a word
    var dict = [];
    var seenWords = {};
    var e = sLen - 2;

    i = -1; while(++i < e)
    {
      var c = s.charAt(i);
      j = i + 1;

      do
      {
        while(++j < e && s.charAt(j) != c)
          ;

        if(j < e && s.charAt(i + 1) == s.charAt(++j))
        {
          k = i + 1;

          do
          {
            var word = s.slice(i, k + 1);
            // keys are prefixed so that no word can collide with Object.prototype members
            var wordKey = '$' + word;

            // - each word is stored (and counted) only once
            // - "`" separates dictionary entries in <unpackCode>, so a word containing
            //   it would be cut in two by the unpacker
            if(!seenWords.hasOwnProperty(wordKey) && word.indexOf('`') < 0)
            {
              seenWords[wordKey] = true;
              dict.push({word: word, matchCount: _matchCount(s, word)});
            }
          }
          while(++k < sLen && ++j < sLen && s.charAt(k) == s.charAt(j));
        }
      }
      while(j < e);
    }

    // sort <dict> by word (descending) to get a reproducible order for later ties
    dict.sort(function(a, b){ return (a.word < b.word) ? 1 : ((a.word > b.word) ? -1 : 0); });

    // prepare the RegExp-escaped version of each word, calc its profit and
    // keep only the words whose profit is greater than <minProfit>
    var profitable = [];

    for(i = 0; i < dict.length; ++i)
    {
      d = dict[i];
      d.escapedWord = _escapeForRegExp(d.word);

      var escapedWord = _escapeForRegExpReplace(d.word);

      // every match shrinks to a single char; the trailing terms are presumably the
      // cost of the dictionary entry itself (its text, the replace char, the separator)
      d.profit = d.matchCount*(escapedWord.length - 1) - escapedWord.length - 1 - 1;

      if(d.profit > minProfit)
        profitable.push(d);
    }

    dict = profitable;

    // and sort <dict> by profit, then by word length - both from max to min
    dict.sort(function(a, b){ return (b.profit - a.profit) || (b.word.length - a.word.length); });

    // assign replaceChar, replace words to replaceChars and remove unused words;
    // stop as soon as the replace chars are used up - going on would write the
    // text "undefined" into the packed code
    var t = s;
    var used = [];

    for(i = 0; i < dict.length && used.length < availableChars.length; ++i)
    {
      d = dict[i];

      // an earlier, more profitable word may have consumed every occurrence
      if(t.indexOf(d.word) > -1)
      {
        d.replaceChar = availableChars[used.length];
        t = t.replace(new RegExp(d.escapedWord, 'g'), d.replaceChar);
        used.push(d);
      }
    }

    dict = used;

    // compress <dict>
    // calc match one word in other words
    for(i = 0; i < dict.length; ++i)
      dict[i].matchCountInDict = _matchCountInDict(dict, dict[i].word);

    // sort <dict> by matchCountInDict from min to max: a word contained in other words
    // ends up after them, and the unpacker expands the entries in this very order
    dict.sort(function(a, b){ return a.matchCountInDict - b.matchCountInDict; });

    // reduce <dict>: pack each word with the replace chars of the entries that follow it
    for(i = 0; i < dict.length; ++i)
    {
      var packedWord = dict[i].word;

      for(j = i + 1; j < dict.length; ++j)
        packedWord = packedWord.replace(new RegExp(dict[j].escapedWord, 'g'), dict[j].replaceChar);

      dict[i].word = packedWord;
    }

    // final

    // serialize <dict>: "<replaceChar><word>" entries joined by "`"
    var entries = [];

    for(i = 0; i < dict.length; ++i)
      entries.push(dict[i].replaceChar + _escapeForRegExpReplace(dict[i].word));

    var dictString = entries.join('`');

    // and create unpack code; function replacers are used because a string replacement
    // would interpret '$$', '$&' etc. inside the packed text as replacement patterns
    var escapedDict = _escapeForCString(dictString);
    var escapedCode = _escapeForCString(t);

    var code = unpackCode.
      replace('__DICT__', function(){ return escapedDict; }).
      replace('__CODE__', function(){ return escapedCode; })
    ;

    console.log('code.length = ', code.length);
    console.log('s.length = ', s.length);
    console.log('profit = ', (s.length/code.length*100).toFixed(2), '%');

    return code;
  };

  return _gzip;
})();
	window._gzip = (function()
	{
	/**
	  @fn count non-overlapping occurrences of <word> in <s>, scanning left to right
	  @param s {String} text to search in
	  @param word {String} substring to look for
	  @return {Number} number of matches, 0 for an empty <word>
	*/
	var _matchCount = function(s, word)
	{
	  var wordLen = word.length;

	  // an empty word matches at every position without ever advancing,
	  // so the scan below would never terminate - report "no matches" instead
	  if(wordLen === 0)
	    return 0;

	  var n = 0;

	  // resume right after the previous match so that matches never overlap
	  for(var p = s.indexOf(word); p > -1; p = s.indexOf(word, p + wordLen))
	    ++n;

	  return n;
	};

	/**
	  @fn count entries of <dict> whose .word contains <word> as a substring
	  @param dict {Array} entries exposing a string .word property
	  @param word {String} substring to look for
	  @return {Number} number of entries containing <word>
	*/
	var _matchCountInDict = function(dict, word)
	{
	  var hits = 0;

	  for(var idx = 0, total = dict.length; idx < total; ++idx)
	  {
	    var entry = dict[idx];

	    if(entry.word.indexOf(word) !== -1)
	      hits += 1;
	  }

	  return hits;
	};


	/**
	  @fn escape <s> for embedding between single quotes in generated source
	  @param s {String} raw text
	  @return {String} text where backslash, single quote, CR, NUL and LF are
	          replaced by their escape sequences
	*/
	var _escapeForCString = function(s)
	{
	  // double quotes are intentionally left as is
	  var escapeSequences = {
	    '\\': '\\\\',
	    '\'': '\\\'',
	    '\r': '\\r',
	    '\0': '\\x00',
	    '\n': '\\n'
	  };

	  // one pass is enough: no escape sequence contains a char that needs escaping again
	  return s.replace(/[\\'\r\0\n]/g, function(ch)
	  {
	    return escapeSequences[ch];
	  });
	};
	// chars which get a leading backslash before a word is handed to `new RegExp`;
	// '^' and '$' are included next to the backslash, otherwise a word such as "a$"
	// would build a pattern that matches something other than the word itself
	var reEscapeableRE = /[\\\^\$\[\]\(\)\{\}\?\!\:\|\+\.\*\'\"]/g;
	/**
	  @fn escape <s> so that `new RegExp(result)` matches <s> literally
	  @param s {String} literal text
	  @return {String} pattern source with every char of <reEscapeableRE> backslash-escaped
	*/
	var _escapeForRegExp = function(s)
	{
	  // '$&' in the replacement stands for the matched char itself
	  return s.replace(reEscapeableRE, '\\$&');
	};
	/**
	  @fn escape <s> for use as the replacement string of String.prototype.replace
	  @param s {String} literal text
	  @return {String} text where every '$' is doubled, so that the replace call
	          inserts <s> verbatim
	*/
	var _escapeForRegExpReplace = function(s)
	{
	  // a backslash does not escape '$' in a replacement string, only '$$' does;
	  // '$$$$' is a replacement pattern itself and emits two literal '$'
	  return s.replace(/\$/g, '$$$$');
	};

	/**
	  @fn delete from <a> the keys for chars that are never handed out as replace chars
	  @param a {Object} map keyed by single chars, modified in place
	*/
	var _removeEscapebleChars = function(a)
	{
	  // one char per key to drop: ^ " ' \ $ [ ] ( ) { } ? ! : | + ` . *
	  var reservedChars = '^"\'\\$[](){}?!:|+`.*';

	  for(var idx = 0; idx < reservedChars.length; ++idx)
	    delete a[reservedChars.charAt(idx)];
	};

	/**
	  @fn debug helper - join the words of <dict> into one string
	  @param dict {Array} entries exposing a .word property
	  @return {String} every word followed by '--'; stops at the first falsy entry
	*/
	var _dumpDict = function(dict)
	{
	  var pieces = [];

	  for(var idx = 0; dict[idx]; ++idx)
	    pieces.push(dict[idx].word + '--');

	  return pieces.join('');
	};

	// Self-extracting template. _gzip() substitutes __CODE__ with the packed text and
	// __DICT__ with the serialized dictionary. The template splits the dictionary on "`";
	// the first char of each entry is used as a RegExp and every match is replaced with
	// the rest of the entry. The resulting text is finally passed to eval().
	// NOTE(review): <v> is not declared in the template, so it is assigned as an implicit
	// global by the generated code - confirm that this is acceptable for consumers.
	// Debug variant, logs the unpacked text instead of evaluating it:
	//var unpackCode = 'for(var s=\'__CODE__\',d=\'__DICT__\'.split("`"),i=0;v=d[i++];)s=s.replace(RegExp(v[0],"g"),v.slice(1));console.log(s)';
	var unpackCode = 'for(var s=\'__CODE__\',d=\'__DICT__\'.split("`"),i=0;v=d[i++];)s=s.replace(RegExp(v[0],"g"),v.slice(1));eval(s)';

	/**
	  @fn pack <s> with a gzip-like (dictionary based) algorithm and wrap the result
	      into the self-extracting <unpackCode> template
	  @param s {String} source text to pack
	  @param allowNonAcsiiChars {Boolean} {Default = true} true to pick replace chars from
	         char codes 32-255, false to stay within 32-126
	  @param minProfit {Number} {Default = 1} a word is kept only when its estimated
	         profit (in chars) is greater than this threshold
	  @return {String} packed and wrapped to <unpackCode> code
	*/
	var _gzip = function(s, allowNonAcsiiChars, minProfit)
	{
	  if(allowNonAcsiiChars == null)
	    allowNonAcsiiChars = true;
	  if(minProfit == null)
	    minProfit = 1;

	  var i, j, k, d;
	  var sLen = s.length;

	  // gen all available char map
	  var availableCharMap = {};
	  var charEnd = allowNonAcsiiChars ? 256 : 127;

	  for(i = 32; i < charEnd; ++i)
	    availableCharMap[String.fromCharCode(i)] = 0;

	  // remove chars which require escaping (in RegExp too) ...
	  _removeEscapebleChars(availableCharMap);

	  // ... and chars which are used in <s>: a replace char must not occur in the source
	  for(i = 0; i < sLen; ++i)
	    delete availableCharMap[s.charAt(i)];

	  // convert <availableCharMap> to <availableChars>
	  var availableChars = [];

	  for(var ch in availableCharMap)
	  {
	    if(availableCharMap.hasOwnProperty(ch))
	      availableChars.push(ch);
	  }

	  // build <dict>: for every position <i> look for later positions where the same
	  // two chars start again and record each common prefix (2+ chars) as a word
	  var dict = [];
	  var seenWords = {};
	  var e = sLen - 2;

	  i = -1; while(++i < e)
	  {
	    var c = s.charAt(i);
	    j = i + 1;

	    do
	    {
	      while(++j < e && s.charAt(j) != c)
	        ;

	      if(j < e && s.charAt(i + 1) == s.charAt(++j))
	      {
	        k = i + 1;

	        do
	        {
	          var word = s.slice(i, k + 1);
	          // keys are prefixed so that no word can collide with Object.prototype members
	          var wordKey = '$' + word;

	          // - each word is stored (and counted) only once
	          // - "`" separates dictionary entries in <unpackCode>, so a word containing
	          //   it would be cut in two by the unpacker
	          if(!seenWords.hasOwnProperty(wordKey) && word.indexOf('`') < 0)
	          {
	            seenWords[wordKey] = true;
	            dict.push({word: word, matchCount: _matchCount(s, word)});
	          }
	        }
	        while(++k < sLen && ++j < sLen && s.charAt(k) == s.charAt(j));
	      }
	    }
	    while(j < e);
	  }

	  // sort <dict> by word (descending) to get a reproducible order for later ties
	  dict.sort(function(a, b){ return (a.word < b.word) ? 1 : ((a.word > b.word) ? -1 : 0); });

	  // prepare the RegExp-escaped version of each word, calc its profit and
	  // keep only the words whose profit is greater than <minProfit>
	  var profitable = [];

	  for(i = 0; i < dict.length; ++i)
	  {
	    d = dict[i];
	    d.escapedWord = _escapeForRegExp(d.word);

	    var escapedWord = _escapeForRegExpReplace(d.word);

	    // every match shrinks to a single char; the trailing terms are presumably the
	    // cost of the dictionary entry itself (its text, the replace char, the separator)
	    d.profit = d.matchCount*(escapedWord.length - 1) - escapedWord.length - 1 - 1;

	    if(d.profit > minProfit)
	      profitable.push(d);
	  }

	  dict = profitable;

	  // and sort <dict> by profit, then by word length - both from max to min
	  dict.sort(function(a, b){ return (b.profit - a.profit) || (b.word.length - a.word.length); });

	  // assign replaceChar, replace words to replaceChars and remove unused words;
	  // stop as soon as the replace chars are used up - going on would write the
	  // text "undefined" into the packed code
	  var t = s;
	  var used = [];

	  for(i = 0; i < dict.length && used.length < availableChars.length; ++i)
	  {
	    d = dict[i];

	    // an earlier, more profitable word may have consumed every occurrence
	    if(t.indexOf(d.word) > -1)
	    {
	      d.replaceChar = availableChars[used.length];
	      t = t.replace(new RegExp(d.escapedWord, 'g'), d.replaceChar);
	      used.push(d);
	    }
	  }

	  dict = used;

	  // compress <dict>
	  // calc match one word in other words
	  for(i = 0; i < dict.length; ++i)
	    dict[i].matchCountInDict = _matchCountInDict(dict, dict[i].word);

	  // sort <dict> by matchCountInDict from min to max: a word contained in other words
	  // ends up after them, and the unpacker expands the entries in this very order
	  dict.sort(function(a, b){ return a.matchCountInDict - b.matchCountInDict; });

	  // reduce <dict>: pack each word with the replace chars of the entries that follow it
	  for(i = 0; i < dict.length; ++i)
	  {
	    var packedWord = dict[i].word;

	    for(j = i + 1; j < dict.length; ++j)
	      packedWord = packedWord.replace(new RegExp(dict[j].escapedWord, 'g'), dict[j].replaceChar);

	    dict[i].word = packedWord;
	  }

	  // final

	  // serialize <dict>: "<replaceChar><word>" entries joined by "`"
	  var entries = [];

	  for(i = 0; i < dict.length; ++i)
	    entries.push(dict[i].replaceChar + _escapeForRegExpReplace(dict[i].word));

	  var dictString = entries.join('`');

	  // and create unpack code; function replacers are used because a string replacement
	  // would interpret '$$', '$&' etc. inside the packed text as replacement patterns
	  var escapedDict = _escapeForCString(dictString);
	  var escapedCode = _escapeForCString(t);

	  var code = unpackCode.
	    replace('__DICT__', function(){ return escapedDict; }).
	    replace('__CODE__', function(){ return escapedCode; })
	  ;

	  console.log('code.length = ', code.length);
	  console.log('s.length = ', s.length);
	  console.log('profit = ', (s.length/code.length*100).toFixed(2), '%');

	  return code;
	};

	return _gzip;
	})();