LZW javascript compress/decompress
// LZW-compress a string
function lzw_encode(s) {
    var dict = {};
    var data = (s + "").split("");
    var out = [];
    var currChar;
    var phrase = data[0];
    var code = 256; // first dictionary code after the single-character range
    for (var i = 1; i < data.length; i++) {
        currChar = data[i];
        if (dict[phrase + currChar] != null) {
            // the extended phrase is already in the dictionary; keep growing it
            phrase += currChar;
        }
        else {
            // emit the code for the longest known phrase, then record the new one
            out.push(phrase.length > 1 ? dict[phrase] : phrase.charCodeAt(0));
            dict[phrase + currChar] = code;
            code++;
            phrase = currChar;
        }
    }
    out.push(phrase.length > 1 ? dict[phrase] : phrase.charCodeAt(0));
    // pack the numeric codes into a string, one character per code
    for (var i = 0; i < out.length; i++) {
        out[i] = String.fromCharCode(out[i]);
    }
    return out.join("");
}
// Decompress an LZW-encoded string
function lzw_decode(s) {
    var dict = {};
    var data = (s + "").split("");
    var currChar = data[0];
    var oldPhrase = currChar;
    var out = [currChar];
    var code = 256;
    var phrase;
    for (var i = 1; i < data.length; i++) {
        var currCode = data[i].charCodeAt(0);
        if (currCode < 256) {
            // literal character
            phrase = data[i];
        }
        else {
            // dictionary code; the fallback covers the code-not-yet-defined LZW edge case
            phrase = dict[currCode] ? dict[currCode] : (oldPhrase + currChar);
        }
        out.push(phrase);
        currChar = phrase.charAt(0);
        dict[code] = oldPhrase + currChar;
        code++;
        oldPhrase = phrase;
    }
    return out.join("");
}
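
A quick round trip shows how the pair is meant to be used (the sample string is just an illustration):

    var original = "TOBEORNOTTOBEORTOBEORNOT";
    var compressed = lzw_encode(original);
    var restored = lzw_decode(compressed);
    console.log(compressed.length < original.length); // true for repetitive input
    console.log(restored === original);               // true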

Thank you so much for this LZW algo. There was another one I was using and it wasn't working. I'm using this algo for undo/redo state-save operations in my web app designer http://meldbox.net

Nice implementation. I did discover a problem on very large datasets, though, most likely related to some internal limitation of today's browsers. I was trying to compress a list of 250K English words (~1.7MB). At some point around 250K characters the encode or decode would fail to match. I spent some time debugging, looking for obvious overflow errors etc., and double-checked the input to ensure it was valid, all to no avail. I also ran the algorithm on Chrome, Safari and Firefox; they all produced the same error at the same location.

In an attempt to narrow down the problem I started breaking the input into substrings and then concatenating the results. This worked fine. It turned out that the algorithm worked correctly on substrings of length 1 << 16 but started failing when the input length was >= 1 << 19.
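
For reference, the workaround might look something like the sketch below (my reconstruction, not the commenter's code: each chunk is compressed independently and the decoded chunks are joined at the end; lzw_encode_chunked and lzw_decode_chunked are hypothetical names, and CHUNK is the 1 << 16 size reported above):

    var CHUNK = 1 << 16; // the substring length reported to work reliably

    function lzw_encode_chunked(s) {
        var parts = [];
        for (var i = 0; i < s.length; i += CHUNK) {
            // each chunk gets its own dictionary, so chunks stay independent
            parts.push(lzw_encode(s.substring(i, i + CHUNK)));
        }
        return parts;
    }

    function lzw_decode_chunked(parts) {
        var out = [];
        for (var i = 0; i < parts.length; i++) {
            out.push(lzw_decode(parts[i]));
        }
        return out.join("");
    }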

I know from practice that JavaScript can easily handle strings longer than that (join on the original word list, for example, correctly produces a string 1.7MB in size). I suspect the problem is in the internal string-folding algorithms and the total amount of space available for string management, although this is merely supposition.

It's also possible that using strings to represent the encoded data is a bad idea. JavaScript uses Unicode internally, and eventually the encoding will produce a sequence that gets misinterpreted as a double-byte character and silently breaks one of the internal string algorithms, e.g. join or toString.

After a little extra work I discovered that converting the output of encode to a simple array of numbers, and modifying decode in the same way, allows the algorithm to work on strings of unlimited size. I suspect the problem is using Unicode to represent the output: when the dictionary gets large, the code values probably exceed the limit.

The following works fine on strings of practically unlimited length, but of course it produces a compressed result that is binary and not as easy to transmit in a web page (although ArrayBuffer solves that problem on modern browsers).

function lzw_encode(s) {
    var dict = {};
    var data = (s + "").split("");
    var out = [];
    var currChar;
    var phrase = data[0];
    var code = 256;
    for (var i = 1; i < data.length; i++) {
        currChar = data[i];
        if (dict[phrase + currChar] != null) {
            phrase += currChar;
        }
        else {
            out.push(phrase.length > 1 ? dict[phrase] : phrase.charCodeAt(0));
            dict[phrase + currChar] = code;
            code++;
            phrase = currChar;
        }
    }
    out.push(phrase.length > 1 ? dict[phrase] : phrase.charCodeAt(0));
    return out;
}

function lzw_decode(data) {
    var dict = {};
    var currChar = String.fromCharCode(data[0]);
    var oldPhrase = currChar;
    var out = currChar; // accumulate into a string, not an array
    var code = 256;
    var phrase;
    for (var i = 1; i < data.length; i++) {
        var currCode = data[i];
        if (currCode < 256) {
            phrase = String.fromCharCode(data[i]);
        }
        else {
            phrase = dict[currCode] ? dict[currCode] : (oldPhrase + currChar);
        }
        out += phrase;
        currChar = phrase[0];
        dict[code] = oldPhrase + currChar;
        code++;
        oldPhrase = phrase;
    }
    return out;
}
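
To get that binary result onto the wire, one option (my sketch, not part of the original comment) is to pack the number array into a typed array; Uint32Array is an assumption, chosen because dictionary codes can exceed 65535 on large inputs:

    // Pack the encoder's number array into an ArrayBuffer for transport.
    // Uint32Array holds any dictionary code, at four bytes per code.
    function to_buffer(codes) {
        return new Uint32Array(codes).buffer;
    }

    // Unpack back into a plain array of numbers for lzw_decode.
    function from_buffer(buffer) {
        return Array.prototype.slice.call(new Uint32Array(buffer));
    }

    var text = "some input, repeated: some input";
    var buf = to_buffer(lzw_encode(text));    // send or store buf
    var text2 = lzw_decode(from_buffer(buf)); // round trip
    console.log(text2 === text);              // true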

IMPORTANT NOTE

Both of these implementations treat dict as a true hash, but JavaScript objects are NOT hashes: they inherit properties such as toString and hasOwnProperty from Object.prototype. For example, they produce invalid results if you encode a string that contains the word toString, hasOwnProperty, etc.
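
The failure is easy to see in isolation: an empty object already "contains" those keys through its prototype chain, so the encoder's != null test takes the wrong branch:

    var dict = {};
    console.log(dict["toString"] != null);       // true, inherited from Object.prototype
    console.log(dict["hasOwnProperty"] != null); // true as well
    // So the encoder believes "toString" is already in the dictionary and
    // pushes dict["toString"] (a function, not a number) into its output.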

lerouxb commented May 23, 2014

I worked around the dict/object problem by prepending '_' to each dict key:

// LZW-compress a string
function lzw_encode(s) {
    var dict = {};
    var data = (s + "").split("");
    var out = [];
    var currChar;
    var phrase = data[0];
    var code = 256;
    for (var i=1; i<data.length; i++) {
        currChar=data[i];
        if (dict['_' + phrase + currChar] != null) {
            phrase += currChar;
        }
        else {
            out.push(phrase.length > 1 ? dict['_'+phrase] : phrase.charCodeAt(0));
            dict['_' + phrase + currChar] = code;
            code++;
            phrase=currChar;
        }
    }
    out.push(phrase.length > 1 ? dict['_'+phrase] : phrase.charCodeAt(0));
    for (var i=0; i<out.length; i++) {
        out[i] = String.fromCharCode(out[i]);
    }
    return out.join("");
}

// Decompress an LZW-encoded string
function lzw_decode(s) {
    var dict = {};
    var data = (s + "").split("");
    var currChar = data[0];
    var oldPhrase = currChar;
    var out = [currChar];
    var code = 256;
    var phrase;
    for (var i=1; i<data.length; i++) {
        var currCode = data[i].charCodeAt(0);
        if (currCode < 256) {
            phrase = data[i];
        }
        else {
            phrase = dict['_'+currCode] ? dict['_'+currCode] : (oldPhrase + currChar);
        }
        out.push(phrase);
        currChar = phrase.charAt(0);
        dict['_'+code] = oldPhrase + currChar;
        code++;
        oldPhrase = phrase;
    }
    return out.join("");
}
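
A quick check that the prefix fixes the pathological inputs from the note above:

    var s = "toString hasOwnProperty valueOf";
    console.log(lzw_decode(lzw_encode(s)) === s); // true with the '_' prefix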

You might also want to encode the text as UTF-8 first. That way all your data consists of single-byte characters (http://ecmanaut.blogspot.com/2006/07/encoding-decoding-utf8-in-javascript.html):

function encode_utf8(s) {
  return unescape(encodeURIComponent(s));
}

function decode_utf8(s) {
  return decodeURIComponent(escape(s));
}
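
Combined with the LZW pair above, a full round trip for non-ASCII text would then look something like this (a sketch; note that escape/unescape are deprecated, though still widely supported, as the linked post discusses):

    var original = "héllo wörld ☃";
    // UTF-8-encode first so every character of the LZW input is a single byte.
    var compressed = lzw_encode(encode_utf8(original));
    var restored = decode_utf8(lzw_decode(compressed));
    console.log(restored === original); // true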

I am stealing this

Just a note because I happened to run across this. If your data consists of UTF-32 characters, start your dictionary index at 0xe000, that is:

var code = 57344;

This skips over the surrogate values from 0xd800 to 0xdfff, which cause illegal-character exceptions in your encoder/decoder. Effectively, this increases your maximum dictionary size from about 55K to about 1M.
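
The underlying failure is easy to reproduce: a dictionary code that lands in the surrogate range becomes a lone surrogate, which many string APIs reject (an illustration, not from the original comment):

    // A code in 0xd800-0xdfff yields an unpaired surrogate "character".
    var lone = String.fromCharCode(0xd805);
    try {
        encodeURIComponent(lone); // throws on lone surrogates
    } catch (e) {
        console.log(e.name); // "URIError"
    }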


ghost commented Jun 3, 2016

This is extremely CPU-intensive compared to native JSON parsing speeds. Bandwidth is so cheap nowadays that it's not worth the performance cost.

@Dillybob92, that depends entirely on your target users. There are plenty of bandwidth-restricted environments.
