Skip to content

Instantly share code, notes, and snippets.

@GunnarFarneback
Created January 4, 2014 11:41
Show Gist options
  • Save GunnarFarneback/8254567 to your computer and use it in GitHub Desktop.
Save GunnarFarneback/8254567 to your computer and use it in GitHub Desktop.
Pure Julia implementation of decompression of zlib and gzip compressed data, as specified by RFC 1950-1952.
# Pure Julia implementation of decompression of zlib and gzip
# compressed data, as specified by RFC 1950-1952.
module Inflate
export decompress, gunzip
# Huffman codes are internally represented by Vector{Vector{Int}},
# where code[k] are a vector of the values with code words of length
# k. Codes are assigned in order from shorter to longer codes and in
# the order listed. E.g.
# [[], [2, 7], [1, 3, 5], [6, 4]]
# would be the code
# 00 - 2
# 01 - 7
# 100 - 1
# 101 - 3
# 110 - 5
# 1110 - 6
# 1111 - 4
const fixed_literal_or_length_table = (Vector{Int})[Int[],
Int[],
Int[],
Int[],
Int[],
Int[],
Int[256:279],
Int[0:143, 280:287],
Int[144:255]]
const fixed_distance_table = (Vector{Int})[Int[],
Int[],
Int[],
Int[],
Int[0:31]]
type InflateData
bytes::Vector{Uint8}
current_byte::Int
bytepos::Int
bitpos::Int
literal_or_length_code::Vector{Vector{Int}}
distance_code::Vector{Vector{Int}}
end
function InflateData(source::Vector{Uint8})
InflateData(source, 0, 1, 0, fixed_literal_or_length_table, fixed_distance_table)
end
function getbit(in::InflateData)
if in.bitpos == 0
in.current_byte = int(in.bytes[in.bytepos])
end
b = in.current_byte & 1
in.bitpos += 1
if in.bitpos == 8
in.bitpos = 0
in.bytepos += 1
else
in.current_byte >>= 1
end
b
end
function getbits(in::InflateData, n::Int)
b = 0
for i = 0:(n-1)
b |= getbit(in) << i
end
b
end
function skip_bits_to_byte_boundary(in::InflateData)
if in.bitpos > 0
in.bytepos += 1
in.bitpos = 0
end
nothing
end
# It is the responsibility of the caller to make sure that bitpos is
# at zero, e.g. by calling skip_bits_to_byte_boundary.
function get_aligned_byte(in::InflateData)
byte = in.bytes[in.bytepos]
in.bytepos += 1
byte
end
function get_value_from_code(in::InflateData, code::Vector{Vector{Int}})
v = 0
for i = 1:length(code)
v = (v << 1) | getbit(in)
if v < length(code[i])
return code[i][1 + v]
end
v -= length(code[i])
end
throw("incomplete code table")
end
get_literal_or_length(in) = get_value_from_code(in, in.literal_or_length_code)
const base_length = [11, 13, 15, 17, 19, 23, 27, 31, 35, 43, 51, 59, 67, 83, 99, 115, 131, 163, 195, 227]
const extra_length_bits = [1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5]
function getlength(in::InflateData, v::Int)
if v <= 264
return v - 254
elseif v <= 284
return base_length[v - 264] + getbits(in, extra_length_bits[v - 264])
else
return 258
end
end
function getdist(in::InflateData)
b = get_value_from_code(in, in.distance_code)
if b <= 3
return b + 1
else
extra_bits = fld(b - 2, 2)
return 1 + ((2 + b % 2) << extra_bits) + getbits(in, extra_bits)
end
end
function transform_code_lengths_to_code(code_lengths::Vector{Int})
code = Vector{Int}[]
for i = 1:length(code_lengths)
n = code_lengths[i]
if n > 0
while n > length(code)
push!(code, Int[])
end
push!(code[n], i - 1)
end
end
return code
end
const order = [16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15]
function read_code_tables(in::InflateData)
hlit = getbits(in, 5) + 257
hdist = getbits(in, 5) + 1
hclen = getbits(in, 4) + 4
code_length_code_lengths = zeros(Int, 19)
for i = 1:hclen
code_length_code_lengths[1 + order[i]] = getbits(in, 3)
end
code_length_code = transform_code_lengths_to_code(code_length_code_lengths)
code_lengths = zeros(Int, hlit + hdist)
i = 1
while i <= hlit + hdist
c = get_value_from_code(in, code_length_code)
n = 1
l = 0
if c < 16
l = c
elseif c == 16
n = 3 + getbits(in, 2)
l = code_lengths[i-1]
elseif c == 17
n = 3 + getbits(in, 3)
elseif c == 18
n = 11 + getbits(in, 7)
else
throw("invalid code length code")
end
code_lengths[i:(i+n-1)] = l
i += n
end
in.literal_or_length_code = transform_code_lengths_to_code(code_lengths[1:hlit])
in.distance_code = transform_code_lengths_to_code(code_lengths[(hlit+1):end])
end
# Decompress data given in deflate format, as specified by RFC 1951.
function inflate(in::InflateData)
dest = Uint8[]
final = false
while !final
final = getbits(in, 1) == 1
compression_mode = getbits(in, 2)
if compression_mode == 0
skip_bits_to_byte_boundary(in)
len = getbits(in, 16)
nlen = getbits(in, 16)
if len $ nlen != 0xffff
throw("corrupted data")
end
for i = 1:len
push!(dest, get_aligned_byte(in))
end
continue
elseif compression_mode == 1
in.literal_or_length_code = fixed_literal_or_length_table
in.distance_code = fixed_distance_table
elseif compression_mode == 2
read_code_tables(in)
else
throw("invalid block compression mode 3")
end
while true
v = get_literal_or_length(in)
if v < 256
push!(dest, uint8(v))
elseif v == 256
break
else
length = getlength(in, v)
distance = getdist(in)
for i = 1:length
push!(dest, dest[end - distance + 1])
end
end
end
end
return dest
end
function compute_adler_checksum(x::Vector{Uint8})
s1 = 1
s2 = 0
for b in x
s1 += b
if s1 >= 65521
s1 -= 65521
end
s2 += s1
if s2 >= 65521
s2 -= 65521
end
end
return (s2 << 16) | s1
end
const crc_table = zeros(Uint32, 256)
function make_crc_table()
for n = 1:256
c = uint32(n - 1)
for k = 1:8
if bool(c & 0x00000001)
c = 0xedb88320 $ (c >> 1)
else
c >>= 1
end
end
crc_table[n] = c
end
end
function crc(x::Vector{Uint8})
if crc_table[1] == 0
make_crc_table()
end
c = 0xffffffff
for b in x
c = crc_table[1 + ((c $ b) & 0xff)] $ (c >> 8)
end
c $ 0xffffffff
end
function read_zero_terminated_data(in)
s = Uint8[]
while true
c = get_aligned_byte(in)
push!(s, c)
if c == 0
break
end
end
end
# Decompress Zlib compressed data, as specified by RFC 1950.
function decompress(source::Vector{Uint8})
in = InflateData(source)
CMF = get_aligned_byte(in)
FLG = get_aligned_byte(in)
CM = CMF & 0x0f
CINFO = CM >> 4
FLEVEL = FLG >> 6
FDICT = (FLG >> 5) & 0x01
if CM != 8
throw("unsupported compression method")
end
if CINFO > 7
throw("invalid LZ77 window size")
end
if FDICT != 0
throw("preset dictionary not supported")
end
if mod((uint(CMF) << 8) | FLG, 31) != 0
throw("header checksum error")
end
dest = inflate(in)
skip_bits_to_byte_boundary(in)
ADLER = 0
for i = [24, 16, 8, 0]
ADLER |= int(get_aligned_byte(in)) << i
end
if compute_adler_checksum(dest) != ADLER
throw("corrupted data")
end
dest
end
# Decompress GZIP compressed data, as specified by RFC 1952.
function gunzip(source::Vector{Uint8}; check_crc::Bool = true)
in = InflateData(source)
ID1 = get_aligned_byte(in)
ID2 = get_aligned_byte(in)
if ID1 != 0x1f || ID2 != 0x8b
throw("not gzipped data")
end
CM = get_aligned_byte(in)
if CM != 8
throw("unsupported compression method")
end
FLG = get_aligned_byte(in)
MTIME = getbits(in, 32)
XFL = get_aligned_byte(in)
OS = get_aligned_byte(in)
if bool(FLG & 0x04)
# FLG.FEXTRA
xlen = getbits(in, 16)
in.bytepos += xlen
end
if bool(FLG & 0x08)
# FLG.FNAME
name = read_zero_terminated_data(in)
end
if bool(FLG & 0x10)
# FLG.FCOMMENT
comment = read_zero_terminated_data(in)
end
if bool(FLG & 0x02)
# FLG.FHCRC
crc16 = getbits(in, 16)
if check_crc && crc16 != (crc(in.bytes[1:(in.bytepos-3)]) & 0xffff)
throw("corrupted data, header crc check failed")
end
end
dest = inflate(in)
skip_bits_to_byte_boundary(in)
crc32 = getbits(in, 32)
if check_crc && crc32 != crc(dest)
throw("corrupted data, crc check failed")
end
isize = getbits(in, 32)
if isize != length(dest)
throw("corrupted data, length check failed")
end
dest
end
end # Module Inflate
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment