-
-
Save revolunet/843889 to your computer and use it in GitHub Desktop.
// LZW-compress a string | |
function lzw_encode(s) { | |
var dict = {}; | |
var data = (s + "").split(""); | |
var out = []; | |
var currChar; | |
var phrase = data[0]; | |
var code = 256; | |
for (var i=1; i<data.length; i++) { | |
currChar=data[i]; | |
if (dict[phrase + currChar] != null) { | |
phrase += currChar; | |
} | |
else { | |
out.push(phrase.length > 1 ? dict[phrase] : phrase.charCodeAt(0)); | |
dict[phrase + currChar] = code; | |
code++; | |
phrase=currChar; | |
} | |
} | |
out.push(phrase.length > 1 ? dict[phrase] : phrase.charCodeAt(0)); | |
for (var i=0; i<out.length; i++) { | |
out[i] = String.fromCharCode(out[i]); | |
} | |
return out.join(""); | |
} | |
// Decompress an LZW-encoded string | |
function lzw_decode(s) { | |
var dict = {}; | |
var data = (s + "").split(""); | |
var currChar = data[0]; | |
var oldPhrase = currChar; | |
var out = [currChar]; | |
var code = 256; | |
var phrase; | |
for (var i=1; i<data.length; i++) { | |
var currCode = data[i].charCodeAt(0); | |
if (currCode < 256) { | |
phrase = data[i]; | |
} | |
else { | |
phrase = dict[currCode] ? dict[currCode] : (oldPhrase + currChar); | |
} | |
out.push(phrase); | |
currChar = phrase.charAt(0); | |
dict[code] = oldPhrase + currChar; | |
code++; | |
oldPhrase = phrase; | |
} | |
return out.join(""); | |
} |
Nice implementation. I did discover a problem on very large datasets though, most likely related to some internal limitation of todays browsers though. I was trying to compress a list of 250K English words ( ~1.7MB ). At some point around 250K characters the encode or decode would fail to match. I spent some time attempting to debug looking for obvious overflow errors etc and double checked the input to ensure it was valid all to no avail. I also ran the algorithm on Chrome/Safari and Firefox. They all produced the same error at the same location.
In an attempt to narrow down the problem I starting breaking the input into substrings and then concatenating the results. This worked fine. It turned out that the algorithm worked fine on sub strings on length of: 1 << 16 but starting failing when the input length was >= 1 << 19.
I know from praxis that javascript can easily handle strings longer than that ( join on the original list for example correclty produces a string of 1.7mb in size ). I suspect the problem is in the internal string folding algorithms and the total amount of space available for string management...although this is merely supposition.
Its also possible then using strings to represent the encoded data is a bad idea. Javascript uses unicode internally and eventually the encoding will produce a sequence that gets misinterpreted as a double byte character and silently breaks one of the internal string algorithms e.g. join to toString.
After a little extra work I discovered that converting the output of encode to a simple array of numbers and modifying decode in the same way allows the algorithm to work on strings on unlimited size. I suspect the problem is using using unicode to represent the output. When the dictionary gets large the unicode values probably exceed the limit.
The following works fine on strings of practically unlimited length. But of course, produces an compressed result that is binary and not as easy to transmit in a wen page ( although ArrayBuffer solves that problem on modern browsers ).
function lzw_encode(s) {
var dict = {};
var data = (s + "").split("");
var out = [];
var currChar;
var phrase = data[0];
var code = 256;
for (var i = 1; i < data.length; i++) {
currChar = data[i];
if (dict[phrase + currChar] != null) {
phrase += currChar;
}
else {
out.push(phrase.length > 1 ? dict[phrase] : phrase.charCodeAt(0));
dict[phrase + currChar] = code;
code++;
phrase = currChar;
}
}
out.push(phrase.length > 1 ? dict[phrase] : phrase.charCodeAt(0));
return out;
}
function lzw_decode(data) {
var dict = {};
var currChar = String.fromCharCode(data[0]);
var oldPhrase = currChar;
var out = [currChar];
var code = 256;
var phrase;
for (var i = 1; i < data.length; i++) {
var currCode = data[i];
if (currCode < 256) {
phrase = String.fromCharCode(data[i]);
}
else {
phrase = dict[currCode] ? dict[currCode] : (oldPhrase + currChar);
}
out += phrase;
currChar = phrase[0];
dict[code] = oldPhrase + currChar;
code++;
oldPhrase = phrase;
}
return out;
}
IMPORTANT NOTE
Both of these implementations treat dict
as a true object hash but JavaScript objects are NOT hashes. For example, they produce invalid results if you encode a string that contains the word toString
, hasOwnProperty
, etc.
I worked around the dict/object problem by prepending '_' to each dict key:
// LZW-compress a string
function lzw_encode(s) {
var dict = {};
var data = (s + "").split("");
var out = [];
var currChar;
var phrase = data[0];
var code = 256;
for (var i=1; i<data.length; i++) {
currChar=data[i];
if (dict['_' + phrase + currChar] != null) {
phrase += currChar;
}
else {
out.push(phrase.length > 1 ? dict['_'+phrase] : phrase.charCodeAt(0));
dict['_' + phrase + currChar] = code;
code++;
phrase=currChar;
}
}
out.push(phrase.length > 1 ? dict['_'+phrase] : phrase.charCodeAt(0));
for (var i=0; i<out.length; i++) {
out[i] = String.fromCharCode(out[i]);
}
return out.join("");
}
// Decompress an LZW-encoded string
function lzw_decode(s) {
var dict = {};
var data = (s + "").split("");
var currChar = data[0];
var oldPhrase = currChar;
var out = [currChar];
var code = 256;
var phrase;
for (var i=1; i<data.length; i++) {
var currCode = data[i].charCodeAt(0);
if (currCode < 256) {
phrase = data[i];
}
else {
phrase = dict['_'+currCode] ? dict['_'+currCode] : (oldPhrase + currChar);
}
out.push(phrase);
currChar = phrase.charAt(0);
dict['_'+code] = oldPhrase + currChar;
code++;
oldPhrase = phrase;
}
return out.join("");
}
You might also want to encode the text as utf8 first. That way all your data consists of bytes
(http://ecmanaut.blogspot.com/2006/07/encoding-decoding-utf8-in-javascript.html)
function encode_utf8(s) {
return unescape(encodeURIComponent(s));
}
function decode_utf8(s) {
return decodeURIComponent(escape(s));
}
I am stealing this
Just a note because I happened to run across this. If your data consists of utf32 characters, start your dictionary index at 0xe000, that is:
var code = 57344;
This skips over the endpoint values from 0xd800 to 0xdfff which will cause illegal character exceptions in your encoder/decoder. Effectively, this increases your maximum dictionary size from about 55k to about 1M.
This is extremely CPU intensive compared to native JSON parsing speeds. Bandwidth is so cheap nowadays, it's not worth the performance cost.
@Dillybob92, that depends entirely on your target users. there are plenty of bandwidth restricted environments
Pl take a look at this lz-string: JavaScript compression, fast!
Is this LZ-based?
I started out from an LZW implementation (no more patents on that), which is very simple. What I did was:
- localStorage can only contain JavaScript strings. Strings in JavaScript are stored internally in UTF-16, meaning every character weight 16 bits. I modified the implementation to work with a 16bit-wide token space.
- I had to remove the default dictionary initialization, totally useless on a 16bit-wide token space.
- I initialize the dictionary with three tokens:
- An entry that produces a 16-bit token.
- An entry that produces an 8-bit token, because most of what I will store is in the iso-latin-1 space, meaning tokens below 256.
- An entry that mark the end of the stream.
- The output is processed by a bit stream that stores effectively 16 bits per character in the output string.
- Each token is stored with just as many bits that are needed according to the size of the dictionary. Hence, the first token takes 2 bits, the second to 7th three bits, etc....
Also comes with this chrome plugin
I have big string (represent a text of a whole book) . how can I compress it . and save it at the server side .
so when I want to view the Book(string of arabic letters. it will be downloaded as ansii or compressed form .
then at the client side I decompress it and view the actual code
I really like this lzw javacode because it is so small and mighty. But it becomes a problem if you use non ASCII characters: For instance, try to compress/decompress multiple euro signs "€" which are not at the beginning of the input text with the above declared LZW functions. To overcome such troubles you first may want to split code values >= 128 into their correct UTF-8 multi-bytes. This can be done by the following functions (encodeUTF8() for encoding and decodeUTF8() for decoding):
String.prototype.encodeUTF8 = function() {
var str = this.replace(
/[\u0080-\u07ff]/g, // U+0080 - U+07FF => 2 bytes 110yyyyy, 10zzzzzz
function(c) {
var cc = c.charCodeAt(0);
return String.fromCharCode(0xc0 | cc>>6, 0x80 | cc&0x3f); }
);
str = str.replace(
/[\u0800-\uffff]/g, // U+0800 - U+FFFF => 3 bytes 1110xxxx, 10yyyyyy, 10zzzzzz
function(c) {
var cc = c.charCodeAt(0);
return String.fromCharCode(0xe0 | cc>>12, 0x80 | cc>>6&0x3F, 0x80 | cc&0x3f); }
);
return str;
}
String.prototype.decodeUTF8 = function() {
var str = this.replace(
/[\u00c0-\u00df][\u0080-\u00bf]/g, // 2-byte chars
function(c) { // (note parentheses for precence)
var cc = (c.charCodeAt(0)&0x1f)<<6 | c.charCodeAt(1)&0x3f;
return String.fromCharCode(cc); }
);
str = str.replace(
/[\u00e0-\u00ef][\u0080-\u00bf][\u0080-\u00bf]/g, // 3-byte chars
function(c) { // (note parentheses for precence)
var cc = ((c.charCodeAt(0)&0x0f)<<12) | ((c.charCodeAt(1)&0x3f)<<6) | ( c.charCodeAt(2)&0x3f);
return String.fromCharCode(cc); }
);
return str;
}
If the lzw_encode(s) function (resp. lzw_decode(s)) from above is used, you secondly may want to transform the LZW output code values (code values from 256 and higher) from that function into multi bytes and vice versa. This can be done by the following functions (encodeLZWCode() and decodeLZWCode()):
String.prototype.encodeLZWCode = function() {
var str = this.replace(
/[^\u0000-\u00ff]/g,
function(c) {
var cc = c.charCodeAt(0);
if (cc < 4096) {
return String.fromCharCode(0xf9, 0x80 | cc >> 6 & 0x3f, 0x80 | cc & 0x3f); } // 3 bytes: 11111001, 10yyyyyy, 10zzzzzz (12 bits to be used)
else if (cc < 262144) {
return String.fromCharCode(0xfa, 0x80 | cc >> 12, 0x80 | cc >> 6 & 0x3f, 0x80 | cc & 0x3f); } // 4 bytes: 11111010, 10xxxxxx, 10yyyyyy, 10zzzzzz (18 bits to be used)
else {
return String.fromCharCode(0xfb, 0x80 | cc >> 18, 0x80 | cc >> 12, 0x80 | cc >> 6 & 0x3f, 0x80 | cc & 0x3f); } // 5 bytes: 11111011, 10wwwwww, 10xxxxxx, 10yyyyyy, 10zzzzzz (24 bits to be used)
}
);
return str;
}
String.prototype.decodeLZWCode = function() {
var str = this.replace(
/\u00f9[\u0080-\u00bf][\u0080-\u00bf]/g, // 2 data bytes each of the type 10xxxxxx (12 bits to be used)
function(c) {
var cc = ((c.charCodeAt(1) & 0x3f) << 6) | (c.charCodeAt(2) & 0x3f); // Last 6 bits of each data byte are combined together (first byte [marker] and other bits are dismissed)
return String.fromCharCode(cc); }
);
str = str.replace(
/\u00fa[\u0080-\u00bf][\u0080-\u00bf][\u0080-\u00bf]/g, // 3 data bytes each of the type 10xxxxxx (18 bits to be used)
function(c) {
var cc = ((c.charCodeAt(1) & 0x3f) << 12) | ((c.charCodeAt(2) & 0x3f) << 6) | (c.charCodeAt(3) & 0x3f);
return String.fromCharCode(cc); }
);
str = str.replace(
/\u00fb[\u0080-\u00bf][\u0080-\u00bf][\u0080-\u00bf][\u0080-\u00bf]/g, // 4 data bytes each of the type 10xxxxxx (24 bits to be used)
function(c) {
var cc = ((c.charCodeAt(1) & 0x3f) << 18) | ((c.charCodeAt(2) & 0x3f) << 12) | ((c.charCodeAt(3) & 0x3f) << 6) | (c.charCodeAt(4) & 0x3f);
return String.fromCharCode(cc); }
);
return str;
}
So for the UTF-8 way of performing the LZW encoding function from above you may use lzw_encode(str.encodeUTF8()).encodeLZWCode();
and for the decoding you may use lzw_decode(str.decodeLZWCode()).decodeUTF8();
It's now 2019! Use a Map()
!
Most modern browsers now support the Map() object. This means, that instead of using a plain JS object for the dict, one could use a Map. This would solve the problem described by @jayphelps without using the prefixes in @lerouxb 's solution.
function lzw_encode(s) {
if (!s) return s;
var dict = new Map(); // Use a Map!
var data = (s + "").split("");
var out = [];
var currChar;
var phrase = data[0];
var code = 256;
for (var i = 1; i < data.length; i++) {
currChar = data[i];
if (dict.has(phrase + currChar)) {
phrase += currChar;
} else {
out.push(phrase.length > 1 ? dict.get(phrase) : phrase.charCodeAt(0));
dict.set(phrase + currChar, code);
code++;
phrase = currChar;
}
}
out.push(phrase.length > 1 ? dict.get(phrase) : phrase.charCodeAt(0));
for (var i = 0; i < out.length; i++) {
out[i] = String.fromCharCode(out[i]);
}
return out.join("");
}
function lzw_decode(s) {
var dict = new Map(); // Use a Map!
var data = (s + "").split("");
var currChar = data[0];
var oldPhrase = currChar;
var out = [currChar];
var code = 256;
var phrase;
for (var i = 1; i < data.length; i++) {
var currCode = data[i].charCodeAt(0);
if (currCode < 256) {
phrase = data[i];
} else {
phrase = dict.has(currCode) ? dict.get(currCode) : (oldPhrase + currChar);
}
out.push(phrase);
currChar = phrase.charAt(0);
dict.set(code, oldPhrase + currChar);
code++;
oldPhrase = phrase;
}
return out.join("");
}
In addition, encoding the string as UTF8 should not be done with the escape
and unescape
functions anymore as they are now deprecated. Instead, use this solution described in MDN docs.
function encode_utf8(s) {
return encodeURIComponent(s).replace(/%([0-9A-F]{2})/g,
function toSolidBytes(match, p1) {
return String.fromCharCode('0x' + p1);
});
}
function decode_utf8(s) {
return decodeURIComponent(s.split('').map(function(c) {
return '%' + ('00' + c.charCodeAt(0).toString(16)).slice(-2);
}).join(''));
}
Just a note because I happened to run across this. If your data consists of utf32 characters, start your dictionary index at 0xe000, that is:
var code = 57344;
This skips over the endpoint values from 0xd800 to 0xdfff which will cause illegal character exceptions in your encoder/decoder. Effectively, this increases your maximum dictionary size from about 55k to about 1M.
My data was UTF-16 and I had a similar problem. Firstly, once the code entries got above #10000 the encoding routine only stored the first 16 bits. So I switched Andrews54757 code above to use codePoint functions to return proper values (surrogate pairs) for codes above #10000. I then in turn had to get the incrementing code values in both routines to jump between $d800 and $e000 to avoid code values in that region that would get incorrectly interpreted as surrogate pairs.
// from https://gist.github.com/revolunet/843889
lzw_encode: function (s) {
if (!s) return s;
var dict = new Map(); // Use a Map!
var data = (s + "").split("");
var out = [];
var currChar;
var phrase = data[0];
var code = 256;
for (var i = 1; i < data.length; i++) {
currChar = data[i];
if (dict.has(phrase + currChar)) {
phrase += currChar;
} else {
out.push (phrase.length > 1 ? dict.get(phrase) : phrase.codePointAt(0));
dict.set(phrase + currChar, code);
code++;
if (code === 0xd800) { code = 0xe000; }
phrase = currChar;
}
}
out.push (phrase.length > 1 ? dict.get(phrase) : phrase.codePointAt(0));
for (var i = 0; i < out.length; i++) {
out[i] = String.fromCodePoint(out[i]);
}
//console.log ("LZW MAP SIZE", dict.size, out.slice (-50), out.length, out.join("").length);
return out.join("");
},
lzw_decode: function (s) {
var dict = new Map(); // Use a Map!
var data = Array.from(s + "");
//var data = (s + "").split("");
var currChar = data[0];
var oldPhrase = currChar;
var out = [currChar];
var code = 256;
var phrase;
for (var i = 1; i < data.length; i++) {
var currCode = data[i].codePointAt(0);
if (currCode < 256) {
phrase = data[i];
} else {
phrase = dict.has(currCode) ? dict.get(currCode) : (oldPhrase + currChar);
}
out.push(phrase);
var cp = phrase.codePointAt(0);
currChar = String.fromCodePoint(cp); //phrase.charAt(0);
dict.set(code, oldPhrase + currChar);
code++;
if (code === 0xd800) { code = 0xe000; }
oldPhrase = phrase;
}
return out.join("");
},
Awesome code, under which license have you released this under?
Thanks & keep on rock'n.
this code has been took from somewhere and i dont remember where or why... i'm not the author of this code
A version designed for long string storage in UTF-8 encoded files using a base64 encoding. Works with unicode (including surrogates).
https://github.com/antonylesuisse/lzwjs/blob/master/lzw.js
function lzw64_encode(s) {
if (!s) return s;
var b64="ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_";
var d=new Map();
var s=unescape(encodeURIComponent(s)).split("");
var word=s[0];
var num=256;
var key;
var o=[];
function out(word,num) {
key=word.length>1 ? d.get(word) : word.charCodeAt(0);
o.push(b64[key&0x3f]);
o.push(b64[(key>>6)&0x3f]);
o.push(b64[(key>>12)&0x3f]);
}
for (var i=1; i<s.length; i++) {
var c=s[i];
if (d.has(word+c)) {
word+=c;
} else {
d.set(word+c,num++);
out(word,num);
word=c;
if(num==(1<<18)-1) {
d.clear();
num=256;
}
}
}
out(word);
return o.join("");
}
function lzw64_decode(s) {
var b64="ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_";
var b64d={};
for(var i=0; i<64; i++){
b64d[b64.charAt(i)]=i;
}
var d=new Map();
var num=256;
var word=String.fromCharCode(b64d[s[0]]+(b64d[s[1]]<<6)+(b64d[s[2]]<<12));
var prev=word;
var o=[word];
for(var i=3; i<s.length; i+=3) {
var key=b64d[s[i]]+(b64d[s[i+1]]<<6)+(b64d[s[i+2]]<<12);
word=key<256 ? String.fromCharCode(key) : d.has(key) ? d.get(key) : word+word.charAt(0);
o.push(word);
d.set(num++, prev+word.charAt(0));
prev=word;
if(num==(1<<18)-1) {
d.clear();
num=256;
}
}
return decodeURIComponent(escape(o.join("")));
}
Thanks you so much for this LZW algo. There was another one I was using and it wasn't working. I'm using this algo for undo/redo state save operations in my web app designer http://meldbox.net