Created
August 12, 2019 07:23
-
-
Save pgodwin/7d66729444173146ad698d154f2b9b6c to your computer and use it in GitHub Desktop.
Cerner LZW Decompression
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
using System;
using System.IO;
using System.Text;
/* This code was provided by Bruce Jackson (brucejackson.info) and is
 * provided "AS IS" with no warranties, expressed or implied, as to
 * its effectiveness or use.
 *
 * User agrees to edit the code to meet their particular use and
 * understands that this code has not been fully
 * tested in a production environment.
 *
 * The author expressly waives any rights of copyright to the code
 * and so any user is welcome to use this in whatever way they want.
 * */
namespace OcfLzw
{
    /// <summary>
    /// Alternate implementation of the OCF/LZW Decompression by Bruce Jackson (brucejackson.info).
    /// This code has not been tested in production and is provided for reference only.
    /// </summary>
    /// <remarks>
    /// Codes are read most-significant-bit first and start out 9 bits wide. The width grows by
    /// one bit each time the dictionary size reaches 511, 1023, 2047 or 4095 entries.
    /// Code 256 resets the dictionary and code 257 ends the stream.
    /// An instance keeps mutable working state in fields, so a single instance must not be used
    /// from several threads at once. It may, however, be reused for consecutive calls.
    /// </remarks>
    public class DecompressBlob
    {
        /// <summary>
        /// One dictionary entry: the code of the preceding string plus the value appended to it.
        /// </summary>
        private sealed class LzwItem
        {
            public LzwItem(uint prefix, uint suffix)
            {
                Prefix = prefix;
                Suffix = suffix;
            }

            public readonly uint Prefix;
            public readonly uint Suffix;
        }

        private const int MAX_CODES = 8192;

        /// <summary>Code that tells the decoder to discard the dictionary and start over.</summary>
        private const uint ClearCode = 256;

        /// <summary>Code that marks the end of the compressed stream.</summary>
        private const uint EndOfDataCode = 257;

        /// <summary>Value of <see cref="codeCount"/> right after a dictionary reset.</summary>
        private const int InitialCodeCount = 257;

        /// <summary>Smallest code that is resolved through the dictionary instead of emitted directly.</summary>
        private const uint FirstDictionaryCode = 258;

        // Scratch stack used to unwind one dictionary chain (filled last value first).
        private uint[] tempDecompressBuffer = new uint[MAX_CODES];
        private LzwItem[] lzwLookupTable = new LzwItem[MAX_CODES];

        // Index of the top of tempDecompressBuffer, i.e. the FIRST value of the last expanded string.
        private int tempBufferIndex = 0;

        // Next write position in finalByteBuffer.
        private int currentByteBufferIndex = 0;

        // Index of the next free dictionary slot.
        private int codeCount = InitialCodeCount;
        private byte[] finalByteBuffer = null;

        /// <summary>
        /// Decompresses an LZW-compressed blob.
        /// </summary>
        /// <param name="rawbytes">The compressed bytes. Must contain an end-of-data code (257).</param>
        /// <param name="stringSize">
        /// Size of the output buffer to allocate; must be at least the decompressed length.
        /// </param>
        /// <returns>
        /// A buffer of exactly <paramref name="stringSize"/> bytes holding the decompressed data,
        /// zero-padded at the end if the stream produced fewer bytes.
        /// </returns>
        /// <exception cref="ArgumentNullException"><paramref name="rawbytes"/> is null.</exception>
        /// <exception cref="InvalidDataException">
        /// The stream references a dictionary code that cannot exist yet.
        /// </exception>
        /// <exception cref="IndexOutOfRangeException">
        /// The input ends before an end-of-data code is seen, or the decompressed data does not
        /// fit into <paramref name="stringSize"/> bytes.
        /// </exception>
        public byte[] Decompress(byte[] rawbytes, int stringSize)
        {
            if (rawbytes == null)
            {
                throw new ArgumentNullException(nameof(rawbytes));
            }

            // Start every call from a clean state so one instance can decode many blobs.
            finalByteBuffer = new byte[stringSize];
            currentByteBufferIndex = 0;
            ResetDictionary();

            // shift        = current code width minus 8.
            // currentShift = number of low-order bits of the next code that live in the
            //                top of the byte following firstCode.
            int byteArrayIndex = 0,
                shift = 1,
                currentShift = 1;
            uint prevCode = 0,
                 middleCode = 0,
                 lookupIndex = 0,
                 firstCode = rawbytes[byteArrayIndex];

            while (true)
            {
                if (currentShift >= 9)
                {
                    // A whole extra byte has been consumed; re-base the bit offset.
                    currentShift -= 8;
                    if (firstCode != 0)
                    {
                        // The code spans three bytes: leftover bits, one full byte,
                        // and the top currentShift bits of a third byte.
                        middleCode = rawbytes[++byteArrayIndex];
                        firstCode = (firstCode << currentShift + 8) | (middleCode << currentShift);
                        middleCode = rawbytes[++byteArrayIndex];
                        uint tempCode = middleCode >> (8 - currentShift);
                        lookupIndex = firstCode | tempCode;
                        goto skipit;
                    }
                    else
                    {
                        firstCode = rawbytes[++byteArrayIndex];
                        middleCode = rawbytes[++byteArrayIndex];
                    }
                }
                else
                {
                    middleCode = rawbytes[++byteArrayIndex];
                }

                lookupIndex = (firstCode << currentShift) | (middleCode >> 8 - currentShift);

                if (lookupIndex == ClearCode) // time to move to a new lookup table
                {
                    shift = 1;
                    currentShift++;
                    firstCode = rawbytes[byteArrayIndex];
                    ResetDictionary();
                    continue;
                }
                else if (lookupIndex == EndOfDataCode) // EOF marker, better than using the string size
                {
                    return finalByteBuffer;
                }

            skipit:
                if (prevCode == 0)
                {
                    tempDecompressBuffer[0] = lookupIndex;
                }

                if (lookupIndex < codeCount)
                {
                    // Known code: emit its string, then record "previous string + first
                    // value of this string" as a new entry while there is room.
                    SaveItemToLookupTable(lookupIndex);
                    if (codeCount < MAX_CODES)
                    {
                        lzwLookupTable[codeCount++] = new LzwItem(prevCode, tempDecompressBuffer[tempBufferIndex]);
                    }
                }
                else
                {
                    // The only legal unknown code is the one about to be created. Anything
                    // larger (or a full table) would dereference an empty slot below.
                    if (lookupIndex > codeCount || codeCount >= MAX_CODES)
                    {
                        throw new InvalidDataException(
                            "Corrupt LZW stream: code " + lookupIndex +
                            " is not defined (next free code is " + codeCount + ").");
                    }

                    // Code not in the table yet: it is "previous string + first value of
                    // previous string", so create the entry first and then emit it.
                    lzwLookupTable[codeCount++] = new LzwItem(prevCode, tempDecompressBuffer[tempBufferIndex]);
                    SaveItemToLookupTable(lookupIndex);
                }

                // Keep the bits of the last byte read that belong to the next code.
                firstCode = (uint)(middleCode & (0xff >> currentShift));
                currentShift += shift;

                switch (codeCount) // use the lookup table size and not the current byte count
                {
                    case 511:
                    case 1023:
                    case 2047:
                    case 4095:
                        // Dictionary crossed a power-of-two boundary: codes get one bit wider.
                        shift++;
                        currentShift++;
                        break;
                }

                prevCode = lookupIndex;
            }
        }

        /// <summary>
        /// Discards the dictionary and the scratch stack so decoding restarts with only the
        /// directly emitted codes defined.
        /// </summary>
        private void ResetDictionary()
        {
            tempDecompressBuffer = new uint[MAX_CODES];
            tempBufferIndex = 0;
            lzwLookupTable = new LzwItem[MAX_CODES];
            codeCount = InitialCodeCount;
        }

        /// <summary>
        /// Expands <paramref name="compressedCode"/> by walking its prefix chain and appends the
        /// resulting bytes to the output buffer.
        /// </summary>
        /// <remarks>
        /// On return, <see cref="tempBufferIndex"/> points at the first value of the expanded
        /// string, which the caller uses as the suffix of the next dictionary entry.
        /// </remarks>
        private void SaveItemToLookupTable(uint compressedCode)
        {
            tempBufferIndex = -1;

            // The chain yields the string back to front; push it onto the scratch stack.
            while (compressedCode >= FirstDictionaryCode)
            {
                tempDecompressBuffer[++tempBufferIndex] = lzwLookupTable[compressedCode].Suffix;
                compressedCode = lzwLookupTable[compressedCode].Prefix;
            }

            tempDecompressBuffer[++tempBufferIndex] = compressedCode;

            // Pop the stack to write the string in its natural order.
            for (int i = tempBufferIndex; i >= 0; i--)
            {
                finalByteBuffer[currentByteBufferIndex++] = (byte)tempDecompressBuffer[i];
            }
        }
    }
}
Thanks for sharing the script. It has been extremely helpful.
One issue we are having is joining split blobs, i.e. where the blob sequence number is greater than 1. Has anyone else tackled this?
@pgodwin Thank you for this! I also ported it to Java in case this is helpful to anyone else.
https://github.com/ARMoir/CernerBlobJava/blob/main/src/ocflzw/DecompressBlob.java
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Thank you! Here's a clojure port, unprofiled just yet.
@thegoatherder: working on the CE_BLOB table, BLOB_LENGTH seems to return the uncompressed length when the blob is compressed (and the concatenated length when the blob is split into a sequence?). The docs aren't helpful, unfortunately. blobgetlen() returns the actual length of BLOB_CONTENTS.