-
-
Save ludenus/90b07292ecbd61c67367d9ba6b465253 to your computer and use it in GitHub Desktop.
ruby framing snappy unpack
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Decoder for the Snappy framing format ("sNaPpY" streams).
#
# Entry points:
#   SnappyHelper.unpack(packed)                    # class-level convenience
#   SnappyHelper.new.framed_snappy_unpack(packed)  # same, on an instance
#
# Both take the framed stream as a String and return the decoded data as a
# binary (ASCII-8BIT) String. Malformed input raises RuntimeError.
#
# The third-party libraries (snappy_ext for decompression, digest-crc for
# CRC-32C) are required lazily, the first time they are actually needed, and
# only if their constants are not already defined.
class SnappyHelper
  # Bytes that must open every framed stream: chunk type 0xff, little-endian
  # length 6, then the ASCII text "sNaPpY".
  STREAM_IDENTIFIER = [0xff, 0x06, 0x00, 0x00, 0x73, 0x4e, 0x61, 0x50, 0x70, 0x59].freeze
  # Body of a stream identifier chunk (it may legally reappear mid-stream).
  STREAM_IDENTIFIER_BODY = 'sNaPpY'.freeze
  # One type byte plus a three-byte length.
  CHUNK_HEADER_SIZE = 4
  # Data chunks start with a four-byte masked CRC-32C.
  CHECKSUM_SIZE = 4
  UINT32_MASK = 0xffffffff

  CHUNK_COMPRESSED = 0x00
  CHUNK_UNCOMPRESSED = 0x01
  CHUNK_STREAM_IDENTIFIER = 0xff
  # 0x80-0xfd are reserved skippable chunks, 0xfe is padding; a decoder must
  # skip all of them.
  SKIPPABLE_CHUNK_TYPES = (0x80..0xfe)

  private_constant :STREAM_IDENTIFIER, :STREAM_IDENTIFIER_BODY, :CHUNK_HEADER_SIZE,
                   :CHECKSUM_SIZE, :UINT32_MASK, :CHUNK_COMPRESSED, :CHUNK_UNCOMPRESSED,
                   :CHUNK_STREAM_IDENTIFIER, :SKIPPABLE_CHUNK_TYPES

  # Decodes a framed Snappy stream.
  #
  # @param packed [String] the framed stream
  # @return [String] decoded data (binary encoding)
  # @raise [RuntimeError] when the stream is malformed
  def self.unpack(packed)
    new.framed_snappy_unpack(packed)
  end

  # https://github.com/google/snappy/blob/master/framing_format.txt
  #
  # 1. General structure
  #
  # The file consists solely of chunks, lying back-to-back with no padding
  # in between.
  #
  # ...
  #
  # The different chunk types are listed below. The first chunk must always
  # be the stream identifier chunk (see section 4.1, below). The stream
  # ends when the file ends -- there is no explicit end-of-file marker.
  #
  # ...
  #
  # a valid Snappy framed stream always starts with the bytes
  #
  # 0xff 0x06 0x00 0x00 0x73 0x4e 0x61 0x50 0x70 0x59
  #
  # @param packed [String] the framed stream
  # @return [String] concatenated data of all data chunks (binary encoding);
  #   empty when the stream holds nothing but the stream identifier
  # @raise [RuntimeError] on a bad stream identifier, a truncated chunk, an
  #   unsupported chunk type or a checksum mismatch
  def framed_snappy_unpack(packed)
    stream = packed.b
    fail 'invalid framed snappy content' if stream.bytesize < STREAM_IDENTIFIER.size

    magic = stream.byteslice(0, STREAM_IDENTIFIER.size).bytes
    fail "invalid sNaPpY header: #{magic} expected: #{STREAM_IDENTIFIER}" unless magic == STREAM_IDENTIFIER

    unpacked_data = String.new # String.new without arguments is binary
    offset = STREAM_IDENTIFIER.size
    # Walk the stream with an offset instead of re-slicing it for every chunk.
    while offset < stream.bytesize
      type, length, masked_crc, payload = parse_snappy_chunk(stream, offset)
      chunk_data = unpack_snappy_chunk(type, masked_crc, payload)
      unpacked_data << chunk_data if chunk_data # skipped chunks yield nil
      offset += CHUNK_HEADER_SIZE + length
    end
    unpacked_data
  end

  private

  # https://github.com/google/snappy/blob/master/framing_format.txt
  #
  # 1. General structure
  #
  # ... Each chunk consists first a single byte of chunk identifier,
  # then a three-byte little-endian length of the chunk in bytes (from 0 to
  # 16777215, inclusive), and then the data if any. The four bytes of chunk
  # header is not counted in the data length.
  #
  # Reads the chunk that starts at +offset+ in +chunk+.
  #
  # @param chunk [String] stream bytes
  # @param offset [Integer] byte position of the chunk header
  # @return [Array] type, length, masked CRC and payload. Only data chunks
  #   (types 0x00 and 0x01) carry a checksum; for every other type the CRC is
  #   nil and the payload is the whole chunk body.
  # @raise [RuntimeError] when the header or body is truncated, or a data
  #   chunk is too short to hold its checksum
  def parse_snappy_chunk(chunk, offset = 0)
    header = chunk.byteslice(offset, CHUNK_HEADER_SIZE).to_s
    fail "truncated chunk header at offset #{offset}" if header.bytesize < CHUNK_HEADER_SIZE

    type = header.getbyte(0)
    length = header.getbyte(1) | (header.getbyte(2) << 8) | (header.getbyte(3) << 16)
    body = chunk.byteslice(offset + CHUNK_HEADER_SIZE, length).to_s
    if body.bytesize < length
      fail "truncated chunk at offset #{offset}: expected #{length} bytes, got #{body.bytesize}"
    end

    return [type, length, nil, body] unless checksummed?(type)

    fail 'invalid length' if length < CHECKSUM_SIZE
    masked_crc = body.unpack('V').first # 32-bit little-endian
    [type, length, masked_crc, body.byteslice(CHECKSUM_SIZE, length - CHECKSUM_SIZE)]
  end

  # True for the chunk types whose body starts with a masked CRC-32C.
  def checksummed?(type)
    type == CHUNK_COMPRESSED || type == CHUNK_UNCOMPRESSED
  end

  # Turns one parsed chunk into decoded data.
  #
  # @param type [Integer] chunk type byte
  # @param crc32c [Integer, nil] masked checksum read from the chunk
  # @param payload [String] chunk body without the checksum
  # @return [String, nil] decoded data, or nil for chunks that carry none
  # @raise [RuntimeError] on an unsupported chunk type, a malformed repeated
  #   stream identifier or a checksum mismatch
  def unpack_snappy_chunk(type, crc32c, payload)
    unpacked =
      case type
      when CHUNK_COMPRESSED # 4.2. Compressed data (chunk type 0x00)
        snappy_uncompress(payload)
      when CHUNK_UNCOMPRESSED # 4.3. Uncompressed data (chunk type 0x01)
        payload
      when CHUNK_STREAM_IDENTIFIER # 4.1. Stream identifier repeated mid-stream
        fail 'invalid stream identifier chunk' unless payload == STREAM_IDENTIFIER_BODY
        return nil
      when SKIPPABLE_CHUNK_TYPES # 4.4. Padding and 4.6. reserved skippable chunks
        return nil
      else # 4.5. Reserved unskippable chunks (0x02-0x7f)
        fail "unsupported chunk type #{type}"
      end
    # The checksum always covers the uncompressed data.
    verify_checksum(unpacked, crc32c)
    unpacked
  end

  # Inflates a compressed chunk payload with the snappy native extension,
  # loading it on first use, and returns the result as a binary String.
  def snappy_uncompress(payload)
    require 'snappy_ext' unless defined?(Snappy::Ext)
    Snappy::Ext.uncompress(payload).b
  end

  # Compares the CRC-32C of +unpacked+ with the masked checksum from the chunk.
  #
  # @raise [RuntimeError] when the two differ
  def verify_checksum(unpacked, masked_crc)
    expected = unmask(masked_crc)
    actual = crc32c(unpacked)
    fail "checksum mismatch actual:#{actual} expected:#{expected}" unless actual == expected
  end

  # https://github.com/google/snappy/blob/master/framing_format.txt
  #
  # 3. Checksum format
  #
  # Some chunks have data protected by a checksum (the ones that do will say so
  # explicitly). The checksums are always masked CRC-32Cs.
  #
  # A description of CRC-32C can be found in RFC 3720, section 12.1, with
  # examples in section B.4.
  #
  # Checksums are not stored directly, but masked, as checksumming data and
  # then its own checksum can be problematic. The masking is the same as used
  # in Apache Hadoop: Rotate the checksum by 15 bits, then add the constant
  # 0xa282ead8 (using wraparound as normal for unsigned integers). This is
  # equivalent to the following C code:
  #
  # uint32_t mask_checksum(uint32_t x) {
  #   return ((x >> 15) | (x << 17)) + 0xa282ead8;
  # }
  #
  # Note that the masking is reversible.
  #
  # The checksum is always stored as a four bytes long integer, in little-endian.
  def masked_crc32c(data)
    mask(crc32c(data))
  end

  # CRC-32C of +data+, computed with Digest::CRC32c (loaded on first use).
  def crc32c(data)
    require 'digest/crc32c' unless defined?(Digest::CRC32c)
    crc = Digest::CRC32c.new
    crc << data
    crc.checksum
  end

  def mask_delta
    # magic number used in java lib and in hadoop
    # https://github.com/xerial/snappy-java/blob/master/src/main/java/org/xerial/snappy/SnappyFramed.java
    0xa282ead8
  end

  # Masks a raw CRC-32C: rotate, add the delta, wrap to 32 bits.
  def mask(crc32c)
    ensure_integer(crc32c)
    (rotate(crc32c) + mask_delta) & UINT32_MASK
  end

  # Rotates a 32-bit value right by 15 bits.
  def rotate(crc32c)
    ensure_integer(crc32c)
    value = crc32c & UINT32_MASK
    (value >> 15) | ((value << 17) & UINT32_MASK)
  end

  # Inverse of #rotate: rotates a 32-bit value left by 15 bits.
  def unrotate(rot)
    ensure_integer(rot)
    value = rot & UINT32_MASK
    (value >> 17) | ((value << 15) & UINT32_MASK)
  end

  # Inverse of #mask: recovers the raw CRC-32C from its masked form.
  def unmask(masked_crc32c)
    ensure_integer(masked_crc32c)
    unrotate((masked_crc32c - mask_delta) & UINT32_MASK)
  end

  # Rejects non-integer checksum values. Integer is checked rather than
  # Fixnum, which no longer exists on current Rubies.
  def ensure_integer(value)
    fail "Invalid input: #{value} Integer type is only expected" unless value.is_a?(Integer)
  end
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment