Skip to content

Instantly share code, notes, and snippets.

@revolunet
Created February 25, 2011 14:55
Show Gist options
  • Save revolunet/843889 to your computer and use it in GitHub Desktop.
Save revolunet/843889 to your computer and use it in GitHub Desktop.
LZW javascript compress/decompress
// LZW-compress a string
function lzw_encode(s) {
var dict = {};
var data = (s + "").split("");
var out = [];
var currChar;
var phrase = data[0];
var code = 256;
for (var i=1; i<data.length; i++) {
currChar=data[i];
if (dict[phrase + currChar] != null) {
phrase += currChar;
}
else {
out.push(phrase.length > 1 ? dict[phrase] : phrase.charCodeAt(0));
dict[phrase + currChar] = code;
code++;
phrase=currChar;
}
}
out.push(phrase.length > 1 ? dict[phrase] : phrase.charCodeAt(0));
for (var i=0; i<out.length; i++) {
out[i] = String.fromCharCode(out[i]);
}
return out.join("");
}
// Decompress an LZW-encoded string
function lzw_decode(s) {
var dict = {};
var data = (s + "").split("");
var currChar = data[0];
var oldPhrase = currChar;
var out = [currChar];
var code = 256;
var phrase;
for (var i=1; i<data.length; i++) {
var currCode = data[i].charCodeAt(0);
if (currCode < 256) {
phrase = data[i];
}
else {
phrase = dict[currCode] ? dict[currCode] : (oldPhrase + currChar);
}
out.push(phrase);
currChar = phrase.charAt(0);
dict[code] = oldPhrase + currChar;
code++;
oldPhrase = phrase;
}
return out.join("");
}
@martingraham
Copy link

martingraham commented Dec 9, 2019

Just a note because I happened to run across this. If your data consists of utf32 characters, start your dictionary index at 0xe000, that is:

var code = 57344;

This skips over the endpoint values from 0xd800 to 0xdfff which will cause illegal character exceptions in your encoder/decoder. Effectively, this increases your maximum dictionary size from about 55k to about 1M.

My data was UTF-16 and I had a similar problem. Firstly, once the code entries got above #10000 the encoding routine only stored the first 16 bits. So I switched Andrews54757 code above to use codePoint functions to return proper values (surrogate pairs) for codes above #10000. I then in turn had to get the incrementing code values in both routines to jump between $d800 and $e000 to avoid code values in that region that would get incorrectly interpreted as surrogate pairs.

// from https://gist.github.com/revolunet/843889

lzw_encode: function (s) {
      if (!s) return s;
      var dict = new Map(); // Use a Map!
      var data = (s + "").split("");
      var out = [];
      var currChar;
      var phrase = data[0];
      var code = 256;
      for (var i = 1; i < data.length; i++) {
          currChar = data[i];
          if (dict.has(phrase + currChar)) {
              phrase += currChar;
          } else {
              out.push (phrase.length > 1 ? dict.get(phrase) : phrase.codePointAt(0));
              dict.set(phrase + currChar, code);
              code++;
              if (code === 0xd800) { code = 0xe000; }
              phrase = currChar;
          }
      }
      out.push (phrase.length > 1 ? dict.get(phrase) : phrase.codePointAt(0));
      for (var i = 0; i < out.length; i++) {
          out[i] = String.fromCodePoint(out[i]);
      }
      //console.log ("LZW MAP SIZE", dict.size, out.slice (-50), out.length, out.join("").length);
      return out.join("");
  },

  lzw_decode: function (s) {
      var dict = new Map(); // Use a Map!
      var data = Array.from(s + "");
      //var data = (s + "").split("");
      var currChar = data[0];
      var oldPhrase = currChar;
      var out = [currChar];
      var code = 256;
      var phrase;
      for (var i = 1; i < data.length; i++) {
          var currCode = data[i].codePointAt(0);
          if (currCode < 256) {
              phrase = data[i];
          } else {
              phrase = dict.has(currCode) ? dict.get(currCode) : (oldPhrase + currChar);
          }
          out.push(phrase);
          var cp = phrase.codePointAt(0);
          currChar = String.fromCodePoint(cp); //phrase.charAt(0);
          dict.set(code, oldPhrase + currChar);
          code++;
          if (code === 0xd800) { code = 0xe000; }
          oldPhrase = phrase;
      }
      return out.join("");
  },

@hrieke
Copy link

hrieke commented Jun 10, 2020

Awesome code, under which license have you released this under?
Thanks & keep on rock'n.

@revolunet
Copy link
Author

this code has been took from somewhere and i dont remember where or why... i'm not the author of this code

@antonylesuisse
Copy link

antonylesuisse commented Apr 6, 2021

A version designed for long string storage in UTF-8 encoded files using a base64 encoding. Works with unicode (including surrogates).

https://github.com/antonylesuisse/lzwjs/blob/master/lzw.js

function lzw64_encode(s) {
    if (!s) return s;
    var b64="ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_";
    var d=new Map();
    var s=unescape(encodeURIComponent(s)).split("");
    var word=s[0];
    var num=256;
    var key;
    var o=[];
    function out(word,num) {
        key=word.length>1 ? d.get(word) : word.charCodeAt(0);
        o.push(b64[key&0x3f]);
        o.push(b64[(key>>6)&0x3f]);
        o.push(b64[(key>>12)&0x3f]);
    }
    for (var i=1; i<s.length; i++) {
        var c=s[i];
        if (d.has(word+c)) {
            word+=c;
        } else {
            d.set(word+c,num++);
            out(word,num);
            word=c;
            if(num==(1<<18)-1) {
                d.clear();
                num=256;
            }
        }
    }
    out(word);
    return o.join("");
}

function lzw64_decode(s) {
    var b64="ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_";
    var b64d={};
    for(var i=0; i<64; i++){
        b64d[b64.charAt(i)]=i;
    }
    var d=new Map();
    var num=256;
    var word=String.fromCharCode(b64d[s[0]]+(b64d[s[1]]<<6)+(b64d[s[2]]<<12));
    var prev=word;
    var o=[word];
    for(var i=3; i<s.length; i+=3) {
        var key=b64d[s[i]]+(b64d[s[i+1]]<<6)+(b64d[s[i+2]]<<12);
        word=key<256 ? String.fromCharCode(key) : d.has(key) ? d.get(key) : word+word.charAt(0);
        o.push(word);
        d.set(num++, prev+word.charAt(0));
        prev=word;
        if(num==(1<<18)-1) {
            d.clear();
            num=256;
        }
    }
    return decodeURIComponent(escape(o.join("")));
}

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment