Skip to content

Instantly share code, notes, and snippets.

@pgodwin
Created August 12, 2019 07:23
Show Gist options
  • Save pgodwin/7d66729444173146ad698d154f2b9b6c to your computer and use it in GitHub Desktop.
Save pgodwin/7d66729444173146ad698d154f2b9b6c to your computer and use it in GitHub Desktop.
Cerner LZW Decompression
using System.Text;
/* This code provided by Bruce Jackson (brucejackson.info) and is
* provided "AS IS" with no warranties, expressed or implied, as to
* its effectiveness or use.
*
* User agrees to edit the code to meet their particular use and
* understands that this code has not been fully
* tested in a production environment.
*
* The author expressly waives any rights of copyright to the code
* and so any user is welcome to use this in whatever way they want.
* */
namespace OcfLzw
{
/// <summary>
/// Alternate implementation of the OCF/LZW Decompression by Bruce Jackson (brucejackson.info).
/// This code has not been tested in production and is provided for reference only.
/// </summary>
/// <remarks>
/// Codes are packed most-significant-bit first and start out 9 bits wide. Codes below 256 are
/// literal bytes, code 256 resets the lookup table and code 257 ends the stream.
/// All decoder state is reset at the start of every <see cref="Decompress"/> call, so one
/// instance can decode any number of blobs in sequence. The state is held in instance fields,
/// so a single instance must not be used from several threads at the same time.
/// </remarks>
public class DecompressBlob
{
    /// <summary>
    /// One lookup-table entry: the code of the string it extends (<see cref="Prefix"/>)
    /// followed by one more value (<see cref="Suffix"/>).
    /// </summary>
    class lzwItem
    {
        public lzwItem(uint _prefix, uint _sufix)
        {
            Prefix = _prefix;
            Suffix = _sufix;
        }

        public uint Prefix;
        public uint Suffix;
    }

    // Capacity of the lookup table and of the scratch buffer used to expand one code.
    private const int MAX_CODES = 8192;

    // Scratch buffer: the expansion of the most recent code, stored last byte first.
    private uint[] tempDecompressBuffer;

    // Lookup table indexed by code; entries are added from index 257 upwards.
    private lzwItem[] lzwLookupTable;

    // Index in tempDecompressBuffer of the FIRST byte of the most recently expanded code.
    private int tempBufferIndex;

    // Next write position in finalByteBuffer.
    private int currentByteBufferIndex;

    // Index at which the next lookup-table entry will be stored.
    private int codeCount;

    // Output buffer of the call in progress.
    private byte[] finalByteBuffer;

    /// <summary>
    /// Decompresses an OCF/LZW compressed blob.
    /// </summary>
    /// <param name="rawbytes">The compressed bytes, terminated by the end-of-stream code 257.</param>
    /// <param name="stringSize">Length of the output buffer, i.e. the uncompressed size in bytes.</param>
    /// <returns>
    /// A new array of exactly <paramref name="stringSize"/> bytes holding the decompressed data.
    /// </returns>
    /// <exception cref="System.ArgumentNullException"><paramref name="rawbytes"/> is null.</exception>
    /// <exception cref="System.IndexOutOfRangeException">
    /// The input is empty or ends before code 257 is read, or the decoded data does not fit
    /// into <paramref name="stringSize"/> bytes.
    /// </exception>
    public byte[] Decompress(byte[] rawbytes, int stringSize)
    {
        if (rawbytes == null)
        {
            throw new System.ArgumentNullException("rawbytes");
        }

        // Start every call from a clean state. Without this a second call on the same
        // instance would continue writing at the previous output position and would
        // decode against the previous blob's lookup table.
        ResetDictionary();
        currentByteBufferIndex = 0;
        finalByteBuffer = new byte[stringSize];

        int byteArrayIndex = 0,
            shift = 1,          // how far currentShift advances per code
            currentShift = 1;   // number of high bits of middleCode that belong to the current code
        uint prevCode = 0,
            middleCode = 0,     // most recently read input byte
            lookupIndex = 0,    // the code being decoded
            firstCode = rawbytes[byteArrayIndex]; // bits of the current code that were already read

        while (true)
        {
            if (currentShift >= 9)
            {
                // More than one further input byte is needed for this code.
                currentShift -= 8;
                if (firstCode != 0)
                {
                    // Leftover bits + one whole byte + the top currentShift bits of the next byte.
                    middleCode = rawbytes[++byteArrayIndex];
                    firstCode = (firstCode << currentShift + 8) | (middleCode << currentShift);
                    middleCode = rawbytes[++byteArrayIndex];
                    uint tempCode = middleCode >> (8 - currentShift);
                    lookupIndex = firstCode | tempCode;
                    // firstCode was non-zero and has been shifted left by at least 9 bits, so
                    // the code is at least 512 and cannot be control code 256 or 257.
                    goto skipit;
                }
                else
                {
                    // No leftover bits are set: the code is built from the next two bytes below.
                    firstCode = rawbytes[++byteArrayIndex];
                    middleCode = rawbytes[++byteArrayIndex];
                }
            }
            else
            {
                middleCode = rawbytes[++byteArrayIndex];
            }

            // '-' binds tighter than '>>', so this shifts middleCode right by (8 - currentShift).
            lookupIndex = (firstCode << currentShift) | (middleCode >> 8 - currentShift);
            if (lookupIndex == 256) // time to move to a new lookup table
            {
                shift = 1;
                currentShift++;
                firstCode = rawbytes[byteArrayIndex];
                ResetDictionary();
                continue;
            }
            else if (lookupIndex == 257) // EOF marker, better than using the string size
            {
                return finalByteBuffer;
            }

        skipit:
            if (prevCode == 0)
            {
                // Seeds the scratch buffer for the first code of the stream
                // (also taken whenever the previous code was the literal 0).
                tempDecompressBuffer[0] = lookupIndex;
            }

            if (lookupIndex < codeCount)
            {
                // Known code: emit it, then record "previous string + first byte of this string".
                SaveItemToLookupTable(lookupIndex);
                if (codeCount < MAX_CODES)
                {
                    lzwLookupTable[codeCount++] = new lzwItem(prevCode, tempDecompressBuffer[tempBufferIndex]);
                }
            }
            else
            {
                // Code not in the table yet: it must be "previous string + its own first byte",
                // so the entry is created first and then emitted.
                lzwLookupTable[codeCount++] = new lzwItem(prevCode, tempDecompressBuffer[tempBufferIndex]);
                SaveItemToLookupTable(lookupIndex);
            }

            // Keep the bits of middleCode that were not consumed by this code.
            firstCode = (uint)(middleCode & (0xff >> currentShift));
            currentShift += shift;
            switch (codeCount) // use the lookup table size and not the current byte count
            {
                case 511:
                case 1023:
                case 2047:
                case 4095:
                    // Table crossed a power-of-two boundary: codes are one bit wider from here on.
                    shift++;
                    currentShift++;
                    break;
            }

            prevCode = lookupIndex;
        }
    }

    /// <summary>
    /// Discards the lookup table and scratch buffer and restarts code numbering at 257.
    /// The output position is not touched.
    /// </summary>
    private void ResetDictionary()
    {
        tempDecompressBuffer = new uint[MAX_CODES];
        tempBufferIndex = 0;
        lzwLookupTable = new lzwItem[MAX_CODES];
        codeCount = 257;
    }

    /// <summary>
    /// Expands <paramref name="compressedCode"/> and appends the resulting bytes to the output buffer.
    /// </summary>
    /// <remarks>
    /// The prefix chain is followed until a code below 258 is reached, which collects the bytes
    /// last-to-first in the scratch buffer; they are then copied out in reverse. On return
    /// <c>tempBufferIndex</c> addresses the first byte of the expanded string.
    /// </remarks>
    private void SaveItemToLookupTable(uint compressedCode)
    {
        tempBufferIndex = -1;
        while (compressedCode >= 258)
        {
            tempDecompressBuffer[++tempBufferIndex] = lzwLookupTable[compressedCode].Suffix;
            compressedCode = lzwLookupTable[compressedCode].Prefix;
        }

        tempDecompressBuffer[++tempBufferIndex] = compressedCode;
        for (int i = tempBufferIndex; i >= 0; i--)
        {
            finalByteBuffer[currentByteBufferIndex++] = (byte)tempDecompressBuffer[i];
        }
    }
}
}
@plessbd
Copy link

plessbd commented Apr 26, 2022

Just wanted to say thank you, and provide a python port https://github.com/plessbd/ocflzw-decompress

@pgodwin
Copy link
Author

pgodwin commented Apr 26, 2022

Thanks @plessbd, and thanks for sharing your port.

@thegoatherder
Copy link

@pgodwin thank you for sharing this script. Have you any idea where I can get the stringSize from in Cerner db tables?

@jdf-id-au
Copy link

jdf-id-au commented Oct 29, 2023

Thank you! Here's a clojure port, unprofiled just yet.

@thegoatherder: working on the CE_BLOB table, BLOB_LENGTH seems to return the uncompressed length when the blob is compressed (and the concatenated length when the blob is split into a sequence?). The docs aren't helpful unfortunately. blobgetlen() returns the actual length of BLOB_CONTENTS.

@RG-NHS
Copy link

RG-NHS commented Feb 8, 2024

Thanks for sharing the script. It has been extremely helpful.
One issue we are having is joining split blobs, i.e. where the blob sequence number is greater than 1. Has anyone else tackled this?

@ARMoir
Copy link

ARMoir commented May 21, 2024

@pgodwin Thank you for this! I also ported it to Java in case this is helpful to anyone else.

https://github.com/ARMoir/CernerBlobJava/blob/main/src/ocflzw/DecompressBlob.java

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment