# Gist by @eliask (eliask/0992e8efe28ae76635a67915cbff8adf), created April 13, 2019.
# Python: get ZIP file contents with a callback API for requesting data ranges.
# Code follows from ZIP specifications: https://pkware.cachefly.net/webdocs/APPNOTE/APPNOTE-6.3.5.TXT
# Maybe I should have made this with zipfile and some caching BytesIO like interface instead :)
import requests
import struct
from collections import namedtuple
from dataclasses import dataclass
# Python 3.6: pip install dataclasses
'''
4.3.14 Zip64 end of central directory record
zip64 end of central dir
signature 4 bytes (0x06064b50)
size of zip64 end of central
directory record 8 bytes
version made by 2 bytes
version needed to extract 2 bytes
number of this disk 4 bytes
number of the disk with the
start of the central directory 4 bytes
total number of entries in the
central directory on this disk 8 bytes
total number of entries in the
central directory 8 bytes
size of the central directory 8 bytes
offset of start of central
directory with respect to
the starting disk number 8 bytes
zip64 extensible data sector (variable size)
--> The maximum size of the Zip64 End of central directory record is unknown,
because the zip64 extensible data sector is variable-sized.
Normally that last field is empty since it's basically proprietary.
--> Assume a constant size of 4+8+2+2+4+4+8+8+8+8 = 56 bytes
4.3.14.1 The value stored into the "size of zip64 end of central
directory record" SHOULD be the size of the remaining
record and SHOULD NOT include the leading 12 bytes.
Size = SizeOfFixedFields + SizeOfVariableData - 12.
NB: However, we're not interested in the variable fields here.
4.3.15 Zip64 end of central directory locator
zip64 end of central dir locator
signature 4 bytes (0x07064b50)
number of the disk with the
start of the zip64 end of
central directory 4 bytes
relative offset of the zip64
end of central directory record 8 bytes
total number of disks 4 bytes
4.3.16 End of central directory record:
end of central dir signature 4 bytes (0x06054b50)
number of this disk 2 bytes
number of the disk with the
start of the central directory 2 bytes
total number of entries in the
central directory on this disk 2 bytes
total number of entries in
the central directory 2 bytes
size of the central directory 4 bytes
offset of start of central
directory with respect to
the starting disk number 4 bytes
.ZIP file comment length 2 bytes
.ZIP file comment (variable size)
--> Maximum size for the End of central directory record = 65535+2+4+4+2+2+2+2+4 = 65557
--> +20 for Zip64 EOCD locator -> 65577
--> +56 for normal size Zip64 EOCD -> 65633
4.4.1.4 If one of the fields in the end of central directory
record is too small to hold required data, the field SHOULD be
set to -1 (0xFFFF or 0xFFFFFFFF) and the ZIP64 format record
SHOULD be created.
'''
# Fixed-size fields of the Zip64 End of Central Directory record (APPNOTE
# 4.3.14), in on-disk order. The variable-size extensible data sector that may
# follow them is not represented.
Zip64EndOfCentralDirectoryRecord = namedtuple(
    'Zip64EndOfCentralDirectoryRecord',
    [
        'signature',
        'eocd_record_size',
        'version_made_by',
        'version_needed_to_extract',
        'disk_num',
        'disk_num_w_cd_start',
        'cd_length_in_this_disk',
        'cd_length',
        'cd_size',
        'cd_start_offset',
    ],
)
# Zip64 End of Central Directory locator (APPNOTE 4.3.15), in on-disk order.
Zip64EndOfCentralDirectoryLocator = namedtuple(
    'Zip64EndOfCentralDirectoryLocator',
    [
        'signature',
        'disk_num_with_eocd_start',
        'eocd_offset',
        'number_of_disks',
    ],
)
# Fixed-size fields of the classic End of Central Directory record (APPNOTE
# 4.3.16), in on-disk order. The trailing variable-size comment is excluded.
ZipEndOfCentralDirectoryRecord = namedtuple(
    'ZipEndOfCentralDirectoryRecord',
    [
        'signature',
        'disk_num',
        'disk_num_with_cd_start',
        'cd_length_in_this_disk',
        'cd_length',
        'cd_size',
        'cd_start_offset',
        'comment_length',
    ],
)
'''
Extract Zip64ExtendedInfo from the extra field if found,
otherwise return None
4.5.3 -Zip64 Extended Information Extra Field (0x0001):
The following is the layout of the zip64 extended
information "extra" block. If one of the size or
offset fields in the Local or Central directory
record is too small to hold the required data,
a Zip64 extended information record is created.
The order of the fields in the zip64 extended
information record is fixed, but the fields MUST
only appear if the corresponding Local or Central
directory record field is set to 0xFFFF or 0xFFFFFFFF.
Note: all fields stored in Intel low-byte/high-byte order.
Value Size Description
----- ---- -----------
(ZIP64) 0x0001 2 bytes Tag for this "extra" block type
Size 2 bytes Size of this "extra" block
Original
Size 8 bytes Original uncompressed file size
Compressed
Size 8 bytes Size of compressed data
Relative Header
Offset 8 bytes Offset of local header record
Disk Start
Number 4 bytes Number of the disk on which
this file starts
'''
# Values carried by the Zip64 extended information extra field (APPNOTE 4.5.3),
# in the fixed order the specification prescribes.
Zip64ExtendedInfo = namedtuple(
    'Zip64ExtendedInfo',
    [
        'uncompressed_size',
        'compressed_size',
        'relative_offset_of_local_header',
        'disk_start_number',
    ],
)
'''
4.3.12 Central directory structure:
[central directory header 1]
.
.
.
[central directory header n]
[digital signature]
File header:
central file header signature 4 bytes (0x02014b50)
version made by 2 bytes
version needed to extract 2 bytes
general purpose bit flag 2 bytes
compression method 2 bytes
last mod file time 2 bytes
last mod file date 2 bytes
crc-32 4 bytes
compressed size 4 bytes
uncompressed size 4 bytes
file name length 2 bytes
extra field length 2 bytes
file comment length 2 bytes
disk number start 2 bytes
internal file attributes 2 bytes
external file attributes 4 bytes
relative offset of local header 4 bytes
file name (variable size)
extra field (variable size)
file comment (variable size)
4.3.13 Digital signature:
header signature 4 bytes (0x05054b50)
size of data 2 bytes
signature data (variable size)
'''
@dataclass
class CentralDirectoryRecord:
    """One file header of the ZIP central directory (APPNOTE 4.3.12).

    The first seventeen attributes are the fixed-size header fields in their
    on-disk order, so an instance can be built positionally from an unpacked
    header. ``file_name`` and ``file_comment`` default to empty bytes and are
    meant to be assigned after construction.
    """

    # Record identification and versions.
    signature: int
    version_made_by: int
    version_needed_to_extract: int

    # Storage parameters and modification timestamp.
    general_purpose_bit_flag: int
    compression_method: int
    last_mod_file_time: int
    last_mod_file_date: int

    # Checksum and sizes.
    crc32: int
    compressed_size: int
    uncompressed_size: int

    # Lengths of the variable-size fields that follow the fixed header.
    file_name_length: int
    extra_field_length: int
    file_comment_length: int

    # Location and attributes.
    disk_start_number: int
    internal_file_attributes: int
    external_file_attributes: int
    relative_offset_of_local_header: int

    # Variable-size fields.
    file_name: bytes = b''
    file_comment: bytes = b''
def parse_zip_central_directory(request_data, header, file_size):
    """Yield a ``CentralDirectoryRecord`` for each entry of the central directory.

    Args:
        request_data: Callback ``request_data(start, end)`` returning the bytes
            of the ZIP file in the half-open range ``[start, end)``. It is only
            called when the needed bytes are not already inside ``header``.
        header: Bytes already fetched from the end of the file; expected to be
            the last ``len(header)`` bytes and to contain the End of Central
            Directory record.
        file_size: Total size of the ZIP file in bytes.

    Yields:
        CentralDirectoryRecord: One record per file header, with ``file_name``
        and ``file_comment`` filled in and any Zip64 extended values applied
        over the saturated 16/32-bit header fields.

    Raises:
        AssertionError: If a record starts with an unexpected signature.
        struct.error: If the central directory data is truncated.
    """
    _entry_count, cd_size, cd_start_offset = parse_zip_eocd(request_data, header, file_size)
    if cd_size == 0:
        # Empty archive: nothing to parse and no byte range worth requesting.
        return

    if file_size - cd_start_offset > len(header):
        # The central directory starts before the bytes we already have.
        cd_bytes = request_data(cd_start_offset, cd_start_offset + cd_size)
    else:
        # Negative index: distance of the central directory from end of file.
        cd_bytes = header[cd_start_offset - file_size:]

    CD_RECORD_FORMAT = '<IHHHHHHIIIHHHHHII'
    CD_RECORD_FIXED_SIZE = 46
    cur_offset = 0
    while cur_offset < cd_size:
        # Look at the signature alone first: a trailing digital signature
        # record may be shorter than a full 46-byte file header.
        (signature,) = struct.unpack_from('<I', cd_bytes, cur_offset)
        # Digital signature occurs after the central directory entries:
        if signature == 0x05054b50:
            return
        assert signature == 0x02014b50, f'Malformed Zip Central Directory signature: {hex(signature)}'

        cd_header = CentralDirectoryRecord(*struct.unpack_from(CD_RECORD_FORMAT, cd_bytes, cur_offset))
        cur_offset += CD_RECORD_FIXED_SIZE

        cd_header.file_name = cd_bytes[cur_offset : cur_offset + cd_header.file_name_length]
        cur_offset += cd_header.file_name_length

        extra_field = cd_bytes[cur_offset : cur_offset + cd_header.extra_field_length]
        cur_offset += cd_header.extra_field_length

        zip64_extended_info = parse_zip_cd_extra_field(extra_field, cd_header)
        if zip64_extended_info:
            for k, v in zip64_extended_info._asdict().items():
                setattr(cd_header, k, v)

        cd_header.file_comment = cd_bytes[cur_offset : cur_offset + cd_header.file_comment_length]
        cur_offset += cd_header.file_comment_length
        yield cd_header


def parse_zip_cd_extra_field(extra_field, cd_header=None):
    """Extract the Zip64 extended information from an "extra" field.

    Args:
        extra_field: Raw bytes of a central directory extra field; a sequence
            of ``(header id, data size, data)`` blocks.
        cd_header: Optional record the extra field belongs to. When given, only
            the values whose header field is saturated (``0xFFFFFFFF`` for the
            sizes and offset, ``0xFFFF`` for the disk number) are read from the
            Zip64 block, as APPNOTE 4.5.3 prescribes; the remaining values are
            copied from ``cd_header``. When omitted, the block is assumed to
            hold all four values.

    Returns:
        A ``Zip64ExtendedInfo`` if a Zip64 block (id ``0x0001``) is present,
        otherwise ``None``.

    Raises:
        struct.error: If the Zip64 block is shorter than the values it must hold.
    """
    # A block needs at least its 4-byte id/size prefix; ignore shorter residue.
    while len(extra_field) >= 4:
        header_id, data_size = struct.unpack('<HH', extra_field[:4])
        payload = extra_field[4 : 4 + data_size]
        if header_id == 0x0001:
            if cd_header is None:
                return Zip64ExtendedInfo(*struct.unpack('<QQQI', payload))
            return _decode_zip64_payload(payload, cd_header)
        extra_field = extra_field[4 + data_size:]
    return None


def _decode_zip64_payload(payload, cd_header):
    """Read from ``payload`` only the Zip64 values that ``cd_header`` marks as saturated."""
    # (attribute name, struct format, saturation marker) in the fixed spec order.
    layout = (
        ('uncompressed_size', '<Q', 0xFFFF_FFFF),
        ('compressed_size', '<Q', 0xFFFF_FFFF),
        ('relative_offset_of_local_header', '<Q', 0xFFFF_FFFF),
        ('disk_start_number', '<I', 0xFFFF),
    )
    values = {}
    offset = 0
    for name, fmt, saturated in layout:
        value = getattr(cd_header, name)
        if value == saturated:
            (value,) = struct.unpack_from(fmt, payload, offset)
            offset += struct.calcsize(fmt)
        values[name] = value
    return Zip64ExtendedInfo(**values)
def parse_zip_eocd(request_data, header, file_size):
    """Locate the central directory via the (Zip64) End of Central Directory record.

    Args:
        request_data: Callback ``request_data(start, end)`` returning the bytes
            of the ZIP file in the half-open range ``[start, end)``. Only called
            when the Zip64 EOCD record lies before the bytes in ``header``.
        header: Bytes already fetched from the end of the file; expected to be
            the last ``len(header)`` bytes of the file.
        file_size: Total size of the ZIP file in bytes.

    Returns:
        A tuple ``(cd_length, cd_size, cd_start_offset)``: number of central
        directory entries, size of the central directory in bytes, and its
        offset from the start of the file. The values come from the Zip64 EOCD
        record when a Zip64 EOCD locator precedes the classic EOCD record,
        otherwise from the classic record.

    Raises:
        AssertionError: If no EOCD record is found or a signature is malformed.
        struct.error: If the Zip64 EOCD record is truncated.
    """
    EOCD_SIZE = 22
    LOCATOR_SIZE = 20

    # The EOCD record is the last structure in the file, so search backwards.
    eocd_start = header.rfind(b'PK\x05\x06')
    assert eocd_start >= 0 and len(header) - eocd_start >= EOCD_SIZE
    eocd = ZipEndOfCentralDirectoryRecord(*struct.unpack_from('<IHHHHIIH', header, eocd_start))
    assert eocd.signature == 0x06054b50

    # See: 4.4.1.4. A Zip64 archive places the Zip64 EOCD locator immediately
    # before the classic EOCD record, so its signature decides which record
    # to trust; saturated classic fields without a locator cannot be resolved.
    locator_start = eocd_start - LOCATOR_SIZE
    if locator_start < 0:
        return eocd.cd_length, eocd.cd_size, eocd.cd_start_offset
    eocd_locator = Zip64EndOfCentralDirectoryLocator(
        *struct.unpack_from('<IIQI', header, locator_start)
    )
    # There was no Zip64 EOCD locator signature:
    if eocd_locator.signature != 0x07064b50:
        return eocd.cd_length, eocd.cd_size, eocd.cd_start_offset

    if file_size - eocd_locator.eocd_offset > len(header):
        # The Zip64 EOCD record starts before the bytes we already have.
        header = request_data(eocd_locator.eocd_offset, file_size)
    # Position of the Zip64 EOCD record inside ``header`` (0 after a re-fetch).
    zip64_eocd_start = len(header) - (file_size - eocd_locator.eocd_offset)
    zip64_eocd = Zip64EndOfCentralDirectoryRecord(
        *struct.unpack_from('<IQHHIIQQQQ', header, zip64_eocd_start)
    )
    assert zip64_eocd.signature == 0x06064b50, f'Malformed Zip64 EOCD signature: {hex(zip64_eocd.signature)}'
    return zip64_eocd.cd_length, zip64_eocd.cd_size, zip64_eocd.cd_start_offset
def get_zip_listing(url, session=None):
    """Print every central directory entry of the remote ZIP file at ``url``.

    Only the tail of the file (and, if needed, the central directory itself)
    is downloaded, using HTTP range requests.

    Args:
        url: URL of the ZIP file. The server must report ``Content-Length`` and
            honour ``Range`` requests.
        session: Optional ``requests.Session`` to reuse. When omitted, a
            session is created for this call and closed before returning.

    Raises:
        Exception: If the server does not report ``Content-Length``.
        AssertionError: If a request fails or the server answers a range
            request with an unexpected ``Content-Range``.
    """
    owns_session = session is None
    if owns_session:
        session = requests.Session()
    try:
        # HEAD does not follow redirects by default; follow them so that
        # Content-Length describes the actual file.
        r = session.head(url, allow_redirects=True)
        assert r.ok
        if 'Content-Length' not in r.headers:
            raise Exception('Server does not give Content-Length')
        file_size = int(r.headers['Content-Length'])

        def request_data(start, end):
            """Fetch bytes ``[start, end)`` of the file with a range request."""
            print(f'Request data: {start} {end} {file_size}')
            assert start < end
            # HTTP byte ranges are inclusive on both ends, hence ``end-1``.
            r = session.get(url, headers={'range': f'bytes={start}-{end-1}'})
            assert r.ok
            content_range = r.headers.get('Content-Range')
            assert content_range == f"bytes {start}-{end-1}/{file_size}", f'Content-Range expected, got: {content_range}'
            return r.content

        # 65633 = largest classic EOCD record + Zip64 locator + fixed-size
        # Zip64 EOCD record; clamp so small files do not yield a negative start.
        header = request_data(max(0, file_size - 65633), file_size)
        for entry in parse_zip_central_directory(request_data, header, file_size):
            print(entry)
    finally:
        if owns_session:
            session.close()
'''Sample returned values:
CentralDirectoryRecord(
signature=33639248
, version_made_by=20
, version_needed_to_extract=20
, general_purpose_bit_flag=0
, compression_method=8
, last_mod_file_time=28533
, last_mod_file_date=19645
, crc32=181966934
, compressed_size=7327
, uncompressed_size=77824
, file_name_length=13
, extra_field_length=0
, file_comment_length=0
, disk_start_number=0
, internal_file_attributes=0
, external_file_attributes=32
, relative_offset_of_local_header=1049460706
, file_name=b'test_file.bin'
, file_comment=b''
)
'''
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment