# Gist eliask/0992e8efe28ae76635a67915cbff8adf (created April 13, 2019 14:17).
# Python: get ZIP file contents with a callback API for requesting data ranges.
# The code follows the ZIP specification: https://pkware.cachefly.net/webdocs/APPNOTE/APPNOTE-6.3.5.TXT
# Maybe this should have been built on zipfile with a caching, BytesIO-like interface instead :)
import requests | |
import struct | |
from collections import namedtuple | |
from dataclasses import dataclass | |
# Python 3.6: pip install dataclasses | |
''' | |
4.3.14 Zip64 end of central directory record | |
zip64 end of central dir | |
signature 4 bytes (0x06064b50) | |
size of zip64 end of central | |
directory record 8 bytes | |
version made by 2 bytes | |
version needed to extract 2 bytes | |
number of this disk 4 bytes | |
number of the disk with the | |
start of the central directory 4 bytes | |
total number of entries in the | |
central directory on this disk 8 bytes | |
total number of entries in the | |
central directory 8 bytes | |
size of the central directory 8 bytes | |
offset of start of central | |
directory with respect to | |
the starting disk number 8 bytes | |
zip64 extensible data sector (variable size) | |
--> Maximum size for the End of central directory record = unknown. | |
Normally the last field is blank since it's basically proprietary. | |
--> Assume a constant size of 4+8+2+2+4+4+8+8+8+8 = 56 bytes | |
4.3.14.1 The value stored into the "size of zip64 end of central | |
directory record" SHOULD be the size of the remaining | |
record and SHOULD NOT include the leading 12 bytes. | |
Size = SizeOfFixedFields + SizeOfVariableData - 12. | |
NB: However, we're not interested in the variable fields here. | |
4.3.15 Zip64 end of central directory locator | |
zip64 end of central dir locator | |
signature 4 bytes (0x07064b50) | |
number of the disk with the | |
start of the zip64 end of | |
central directory 4 bytes | |
relative offset of the zip64 | |
end of central directory record 8 bytes | |
total number of disks 4 bytes | |
4.3.16 End of central directory record: | |
end of central dir signature 4 bytes (0x06054b50) | |
number of this disk 2 bytes | |
number of the disk with the | |
start of the central directory 2 bytes | |
total number of entries in the | |
central directory on this disk 2 bytes | |
total number of entries in | |
the central directory 2 bytes | |
size of the central directory 4 bytes | |
offset of start of central | |
directory with respect to | |
the starting disk number 4 bytes | |
.ZIP file comment length 2 bytes | |
.ZIP file comment (variable size) | |
--> Maximum size for the End of central directory record = 65535+2+4+4+2+2+2+2+4 = 65557 | |
--> +20 for Zip64 EOCD locator -> 65577 | |
--> +56 for normal size Zip64 EOCD -> 65633 | |
4.4.1.4 If one of the fields in the end of central directory | |
record is too small to hold required data, the field SHOULD be | |
set to -1 (0xFFFF or 0xFFFFFFFF) and the ZIP64 format record | |
SHOULD be created. | |
''' | |
# Fixed-size (56-byte) part of the Zip64 end of central directory record,
# fields in on-disk order; unpack with struct format '<IQHHIIQQQQ'.
# The field names are given as an explicit list so that a missing separator
# can never silently merge or corrupt two names.
Zip64EndOfCentralDirectoryRecord = namedtuple(
    'Zip64EndOfCentralDirectoryRecord',
    [
        'signature',
        'eocd_record_size',
        'version_made_by',
        'version_needed_to_extract',
        'disk_num',
        'disk_num_w_cd_start',
        'cd_length_in_this_disk',
        'cd_length',
        'cd_size',
        'cd_start_offset',
    ],
)
# Zip64 end of central directory locator (20 bytes), fields in on-disk order;
# unpack with struct format '<IIQI'. ``eocd_offset`` is the absolute offset of
# the Zip64 end of central directory record.
Zip64EndOfCentralDirectoryLocator = namedtuple(
    'Zip64EndOfCentralDirectoryLocator',
    [
        'signature',
        'disk_num_with_eocd_start',
        'eocd_offset',
        'number_of_disks',
    ],
)
# Fixed-size (22-byte) part of the classic end of central directory record,
# fields in on-disk order; unpack with struct format '<IHHHHIIH'. The
# variable-length archive comment that follows it is not represented here.
ZipEndOfCentralDirectoryRecord = namedtuple(
    'ZipEndOfCentralDirectoryRecord',
    [
        'signature',
        'disk_num',
        'disk_num_with_cd_start',
        'cd_length_in_this_disk',
        'cd_length',
        'cd_size',
        'cd_start_offset',
        'comment_length',
    ],
)
''' | |
Extract Zip64ExtendedInfo from the extra field if found, | |
otherwise return None | |
4.5.3 -Zip64 Extended Information Extra Field (0x0001): | |
The following is the layout of the zip64 extended | |
information "extra" block. If one of the size or | |
offset fields in the Local or Central directory | |
record is too small to hold the required data, | |
a Zip64 extended information record is created. | |
The order of the fields in the zip64 extended | |
information record is fixed, but the fields MUST | |
only appear if the corresponding Local or Central | |
directory record field is set to 0xFFFF or 0xFFFFFFFF. | |
Note: all fields stored in Intel low-byte/high-byte order. | |
Value Size Description | |
----- ---- ----------- | |
(ZIP64) 0x0001 2 bytes Tag for this "extra" block type | |
Size 2 bytes Size of this "extra" block | |
Original | |
Size 8 bytes Original uncompressed file size | |
Compressed | |
Size 8 bytes Size of compressed data | |
Relative Header | |
Offset 8 bytes Offset of local header record | |
Disk Start | |
Number 4 bytes Number of the disk on which | |
this file starts | |
''' | |
# Values carried by the Zip64 extended information extra field (header id
# 0x0001). The field names deliberately match the attributes of
# CentralDirectoryRecord so the values can be copied over by name.
Zip64ExtendedInfo = namedtuple(
    'Zip64ExtendedInfo',
    [
        'uncompressed_size',
        'compressed_size',
        'relative_offset_of_local_header',
        'disk_start_number',
    ],
)
''' | |
4.3.12 Central directory structure: | |
[central directory header 1] | |
. | |
. | |
. | |
[central directory header n] | |
[digital signature] | |
File header: | |
central file header signature 4 bytes (0x02014b50) | |
version made by 2 bytes | |
version needed to extract 2 bytes | |
general purpose bit flag 2 bytes | |
compression method 2 bytes | |
last mod file time 2 bytes | |
last mod file date 2 bytes | |
crc-32 4 bytes | |
compressed size 4 bytes | |
uncompressed size 4 bytes | |
file name length 2 bytes | |
extra field length 2 bytes | |
file comment length 2 bytes | |
disk number start 2 bytes | |
internal file attributes 2 bytes | |
external file attributes 4 bytes | |
relative offset of local header 4 bytes | |
file name (variable size) | |
extra field (variable size) | |
file comment (variable size) | |
4.3.13 Digital signature: | |
header signature 4 bytes (0x05054b50) | |
size of data 2 bytes | |
signature data (variable size) | |
''' | |
@dataclass
class CentralDirectoryRecord:
    """One central directory file header of a ZIP archive.

    The seventeen integer fields mirror the fixed 46-byte header in on-disk
    order (struct format '<IHHHHHHIIIHHHHHII'), so an instance can be built
    directly from ``struct.unpack``. ``file_name`` and ``file_comment`` are
    assigned afterwards from the variable-length data following the header.

    The class is intentionally mutable: values from a Zip64 extended
    information extra field overwrite the size, offset and disk attributes
    after construction.
    """

    signature: int
    version_made_by: int
    version_needed_to_extract: int
    general_purpose_bit_flag: int
    compression_method: int
    last_mod_file_time: int
    last_mod_file_date: int
    crc32: int
    compressed_size: int
    uncompressed_size: int
    file_name_length: int
    extra_field_length: int
    file_comment_length: int
    disk_start_number: int
    internal_file_attributes: int
    external_file_attributes: int
    relative_offset_of_local_header: int
    # Raw (undecoded) bytes; filled in by the central directory parser.
    file_name: bytes = b''
    file_comment: bytes = b''
# On-disk bytes of the digital signature header (0x05054b50, little-endian).
_CD_DIGITAL_SIGNATURE = b'PK\x05\x05'

# Zip64 extended information layout, in the fixed order mandated by the spec:
# (attribute name, struct code, value of the classic header field that
# signals "the real value lives in the Zip64 extra field").
_ZIP64_EXTRA_LAYOUT = (
    ('uncompressed_size', 'Q', 0xFFFF_FFFF),
    ('compressed_size', 'Q', 0xFFFF_FFFF),
    ('relative_offset_of_local_header', 'Q', 0xFFFF_FFFF),
    ('disk_start_number', 'I', 0xFFFF),
)


def parse_zip_central_directory(request_data, header, file_size):
    """Yield a CentralDirectoryRecord for every entry of the archive.

    Args:
        request_data: Callback ``request_data(start, end)`` returning the
            archive bytes in the half-open range ``[start, end)``. It is only
            called when the central directory is not already inside *header*.
        header: Trailing bytes of the archive (ending at *file_size*) that
            contain the end of central directory record.
        file_size: Total size of the archive in bytes.

    Yields:
        CentralDirectoryRecord: one per file header, with ``file_name`` and
        ``file_comment`` filled in and any Zip64 values applied. Iteration
        stops early if a digital signature record is reached.

    Raises:
        AssertionError: if a record does not start with the central file
            header signature.
        struct.error: if the directory data is truncated.
    """
    _cd_length, cd_size, cd_start_offset = parse_zip_eocd(request_data, header, file_size)
    if cd_size == 0:
        # Empty archive: nothing to parse (and nothing worth requesting).
        return

    # Absolute file offset of header[0]; header is the tail of the file.
    buffer_start = file_size - len(header)
    if cd_start_offset < buffer_start:
        cd_bytes = request_data(cd_start_offset, cd_start_offset + cd_size)
    else:
        relative_start = cd_start_offset - buffer_start
        cd_bytes = header[relative_start:relative_start + cd_size]

    fixed_part = struct.Struct('<IHHHHHHIIIHHHHHII')  # 46 bytes
    pos = 0
    while pos < cd_size:
        # The digital signature record follows the last entry and is shorter
        # than a file header, so it must be detected before unpacking.
        if cd_bytes[pos:pos + 4] == _CD_DIGITAL_SIGNATURE:
            return

        cd_header = CentralDirectoryRecord(*fixed_part.unpack_from(cd_bytes, pos))
        assert cd_header.signature == 0x02014b50, f'Malformed Zip Central Directory signature: {hex(cd_header.signature)}'
        pos += fixed_part.size

        # Variable-length tail: file name, extra field, file comment.
        name_end = pos + cd_header.file_name_length
        extra_end = name_end + cd_header.extra_field_length
        comment_end = extra_end + cd_header.file_comment_length
        cd_header.file_name = cd_bytes[pos:name_end]
        extra_field = cd_bytes[name_end:extra_end]
        cd_header.file_comment = cd_bytes[extra_end:comment_end]
        pos = comment_end

        zip64_extended_info = parse_zip_cd_extra_field(extra_field, cd_header)
        if zip64_extended_info is not None:
            for name, value in zip64_extended_info._asdict().items():
                # None means "not stored in the extra field": keep the
                # value from the classic header.
                if value is not None:
                    setattr(cd_header, name, value)

        yield cd_header


def parse_zip_cd_extra_field(extra_field, cd_header=None):
    """Extract the Zip64 extended information from an extra field.

    The extra field is a sequence of ``(header id, data size, data)`` blocks;
    the first block with header id 0x0001 is decoded.

    Args:
        extra_field: Raw extra field bytes of one central directory entry.
        cd_header: Optional record the extra field belongs to. The Zip64
            block only stores the values whose classic header field is
            saturated (0xFFFFFFFF, or 0xFFFF for the disk number), so the
            record is needed to know which values are present. When omitted,
            values are assumed to be present in order for as long as the
            block has data.

    Returns:
        Zip64ExtendedInfo with ``None`` for every value that is not stored,
        or ``None`` if the extra field has no Zip64 block.
    """
    pos = 0
    # Stop at trailing bytes too short to hold a block header.
    while pos + 4 <= len(extra_field):
        header_id, data_size = struct.unpack_from('<HH', extra_field, pos)
        pos += 4
        if header_id == 0x0001:
            return _decode_zip64_extended_info(extra_field[pos:pos + data_size], cd_header)
        pos += data_size
    return None


def _decode_zip64_extended_info(data, cd_header):
    """Decode the payload of a Zip64 extra block into a Zip64ExtendedInfo."""
    values = {}
    pos = 0
    for name, code, saturated in _ZIP64_EXTRA_LAYOUT:
        fmt = '<' + code
        size = struct.calcsize(fmt)
        expected = cd_header is None or getattr(cd_header, name) == saturated
        if expected and pos + size <= len(data):
            (values[name],) = struct.unpack_from(fmt, data, pos)
            pos += size
        else:
            values[name] = None
    return Zip64ExtendedInfo(**values)
def parse_zip_eocd(request_data, header, file_size):
    """Find the end of central directory and describe the central directory.

    The classic end of central directory (EOCD) record is located by its
    signature, searching backwards from the end of *header*. If a Zip64 EOCD
    locator sits directly in front of it, the Zip64 EOCD record it points to
    is used instead, because the classic fields may be saturated.

    Args:
        request_data: Callback ``request_data(start, end)`` returning the
            archive bytes in the half-open range ``[start, end)``. Only used
            when the Zip64 EOCD record lies before the start of *header*.
        header: Trailing bytes of the archive, ending at *file_size*. It
            should be long enough to include the 20-byte Zip64 locator in
            front of the EOCD record, if the archive has one.
        file_size: Total size of the archive in bytes.

    Returns:
        Tuple ``(cd_length, cd_size, cd_start_offset)``: number of entries,
        size in bytes, and absolute start offset of the central directory.

    Raises:
        AssertionError: if no complete EOCD record is found, or the Zip64
            EOCD record has a wrong signature.
        struct.error: if the Zip64 EOCD record is truncated.
    """
    eocd_size = 22
    locator_size = 20
    zip64_eocd_size = 56

    # Signatures are stored little-endian, so 0x06054b50 is b'PK\x05\x06'.
    eocd_pos = header.rfind(b'PK\x05\x06')
    assert eocd_pos >= 0, 'End of central directory signature not found'
    assert len(header) - eocd_pos >= eocd_size, 'Truncated end of central directory record'

    eocd = ZipEndOfCentralDirectoryRecord(*struct.unpack_from('<IHHHHIIH', header, eocd_pos))
    assert eocd.signature == 0x06054b50
    classic_result = (eocd.cd_length, eocd.cd_size, eocd.cd_start_offset)

    # The locator, when present, immediately precedes the EOCD record. Its
    # presence is the reliable Zip64 indicator: saturated EOCD fields
    # (see 4.4.1.4) without a locator leave nothing better to return anyway.
    locator_pos = eocd_pos - locator_size
    if locator_pos < 0 or header[locator_pos:locator_pos + 4] != b'PK\x06\x07':
        return classic_result

    eocd_locator = Zip64EndOfCentralDirectoryLocator(
        *struct.unpack_from('<IIQI', header, locator_pos)
    )

    # Absolute file offset of header[0]; header is the tail of the file.
    buffer_start = file_size - len(header)
    zip64_eocd_offset = eocd_locator.eocd_offset
    if zip64_eocd_offset < buffer_start:
        # Not in the buffer: fetch only the fixed-size part of the record.
        zip64_eocd_bytes = request_data(zip64_eocd_offset, zip64_eocd_offset + zip64_eocd_size)
    else:
        relative_start = zip64_eocd_offset - buffer_start
        zip64_eocd_bytes = header[relative_start:relative_start + zip64_eocd_size]

    zip64_eocd = Zip64EndOfCentralDirectoryRecord(
        *struct.unpack('<IQHHIIQQQQ', zip64_eocd_bytes[:zip64_eocd_size])
    )
    assert zip64_eocd.signature == 0x06064b50, f'Malformed Zip64 EOCD signature: {hex(zip64_eocd.signature)}'
    return zip64_eocd.cd_length, zip64_eocd.cd_size, zip64_eocd.cd_start_offset
def get_zip_listing(url, session=None):
    """Print and return the central directory entries of a remote ZIP file.

    Only the tail of the file (and, if needed, the central directory itself)
    is downloaded, using HTTP range requests.

    Args:
        url: URL of the ZIP archive. The server must report Content-Length
            and honour ``Range`` requests.
        session: Optional ``requests.Session`` to reuse. When omitted, a
            temporary session is created and closed before returning.

    Returns:
        List of CentralDirectoryRecord, in central directory order. Each
        entry is also printed.

    Raises:
        Exception: if the server does not send a Content-Length header.
        AssertionError: if a request fails or the server does not answer
            with the requested byte range.
    """
    # Worst case for EOCD (with maximum comment) + Zip64 locator + Zip64 EOCD.
    max_tail_size = 65633

    owns_session = session is None
    if owns_session:
        session = requests.Session()
    try:
        # HEAD does not follow redirects by default; without this the size
        # of the redirect response would be used instead of the file's.
        r = session.head(url, allow_redirects=True)
        assert r.ok
        if 'Content-Length' not in r.headers:
            raise Exception('Server does not give Content-Length')
        file_size = int(r.headers['Content-Length'])

        def request_data(start, end):
            """Return the bytes of the remote file in ``[start, end)``."""
            print(f'Request data: {start} {end} {file_size}')
            assert start < end
            r = session.get(url, headers={'range': f'bytes={start}-{end-1}'})
            assert r.ok
            content_range = r.headers.get('Content-Range')
            assert content_range == f"bytes {start}-{end-1}/{file_size}", f'Content-Range expected, got: {content_range}'
            return r.content

        # Clamp at 0: files smaller than the worst-case tail are read whole.
        header = request_data(max(0, file_size - max_tail_size), file_size)

        entries = []
        for entry in parse_zip_central_directory(request_data, header, file_size):
            print(entry)
            entries.append(entry)
        return entries
    finally:
        if owns_session:
            session.close()
'''Sample returned values: | |
CentralDirectoryRecord( | |
signature=33639248 | |
, version_made_by=20 | |
, version_needed_to_extract=20 | |
, general_purpose_bit_flag=0 | |
, compression_method=8 | |
, last_mod_file_time=28533 | |
, last_mod_file_date=19645 | |
, crc32=181966934 | |
, compressed_size=7327 | |
, uncompressed_size=77824 | |
, file_name_length=13 | |
, extra_field_length=0 | |
, file_comment_length=0 | |
, disk_start_number=0 | |
, internal_file_attributes=0 | |
, external_file_attributes=32 | |
, relative_offset_of_local_header=1049460706 | |
, file_name=b'test_file.bin' | |
, file_comment=b'' | |
) | |
''' |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment