Skip to content

Instantly share code, notes, and snippets.

@rhardih
Last active April 25, 2021 23:21
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save rhardih/278a3cc6f5534785068819159f2c2919 to your computer and use it in GitHub Desktop.
Save rhardih/278a3cc6f5534785068819159f2c2919 to your computer and use it in GitHub Desktop.
Listing the filenames of a remote ZIP archive without downloading the entire file
require 'httparty'
require 'uri'
# Lists the names of the entries in a remote ZIP archive without downloading
# the whole file, by fetching only the trailing bytes and the central
# directory through HTTP range requests.
#
# ZIP file format: https://en.wikipedia.org/wiki/ZIP_(file_format)
#
# @param zip_url [String] URL of the ZIP archive. The server is expected to
#   honour `Range` request headers.
# @param tail_size [Integer] number of trailing bytes fetched when looking
#   for the End of central directory record (EOCD). Increase it for archives
#   that carry a long archive comment.
# @return [Array<String>] entry names in central directory order, returned
#   as raw binary-encoded byte strings.
# @raise [RuntimeError] if the content length is unknown, the EOCD is not
#   inside the fetched tail, or the archive uses ZIP64 placeholders.
def get_file_names(zip_url, tail_size: 100)
  # 1. Do an initial head request to figure out how big the file is from the
  #    content length.
  response = HTTParty.head(zip_url)
  content_length = response.headers["content-length"].to_i
  raise "Missing or zero content-length for #{zip_url}" unless content_length.positive?

  # 2. Request just enough bytes from the end of the file to get the EOCD.
  #    Byte ranges are inclusive, so the last valid index is length - 1, and
  #    the start is clamped for files shorter than the window.
  tail_start = [content_length - tail_size, 0].max
  response = HTTParty.get(zip_url, headers: {
    'Range' => "bytes=#{tail_start}-#{content_length - 1}"
  })
  tail = response.body.to_s.b

  # 3. Locate the EOCD (searching backwards, since it sits at the very end)
  #    and extract the central directory size and offset. The fixed part of
  #    the EOCD is 22 bytes; size and offset live at bytes 12..19.
  eocd_pos = tail.rindex("PK\x05\x06".b)
  if eocd_pos.nil? || eocd_pos + 22 > tail.bytesize
    raise "Not enough bytes requested for EOCD"
  end
  # ZIP integers are little-endian regardless of the host platform.
  cd_size, cd_offset, _comment_length = tail.byteslice(eocd_pos + 12, 10).unpack("V2v")
  raise "ZIP64 archives are not supported" if [cd_size, cd_offset].include?(0xFFFFFFFF)
  return [] if cd_size.zero? # empty archive: nothing to request

  # 4. Use the offset and size to request exactly the bytes that contain the
  #    central directory file headers.
  response = HTTParty.get(zip_url, headers: {
    'Range' => "bytes=#{cd_offset}-#{cd_offset + cd_size - 1}"
  })
  directory = response.body.to_s.b

  # 5. Walk the headers sequentially and collect the file names. Each header
  #    has a 46 byte fixed part followed by name, extra field and comment;
  #    stepping by the declared lengths avoids mistaking signature-like bytes
  #    inside those variable fields for a new entry.
  signature = "PK\x01\x02".b
  file_names = []
  pos = 0
  while pos + 46 <= directory.bytesize && directory.byteslice(pos, 4) == signature
    name_length, extra_length, entry_comment_length =
      directory.byteslice(pos + 28, 6).unpack("v3")
    file_names << directory.byteslice(pos + 46, name_length)
    pos += 46 + name_length + extra_length + entry_comment_length
  end
  file_names
end
# Example usage: pretty-print the entry names of a sample archive. Running
# this line issues HTTP requests through get_file_names.
pp get_file_names("https://rhardih.io/wp-content/uploads/2021/04/test.zip")
require 'httparty'
require 'uri'
# Lists the names of the entries in a remote ZIP archive without downloading
# the whole file. A suffix byte range fetches the trailing bytes directly, so
# no initial HEAD request is needed to learn the file size.
#
# ZIP file format: https://en.wikipedia.org/wiki/ZIP_(file_format)
#
# @param zip_url [String] URL of the ZIP archive. The server is expected to
#   honour `Range` request headers.
# @param tail_size [Integer] number of trailing bytes fetched when looking
#   for the End of central directory record (EOCD). Increase it for archives
#   that carry a long archive comment.
# @return [Array<String>] entry names in central directory order, returned
#   as raw binary-encoded byte strings.
# @raise [RuntimeError] if the EOCD is not inside the fetched tail or the
#   archive uses ZIP64 placeholders.
def get_file_names(zip_url, tail_size: 100)
  # 1. Request just enough bytes from the end of the file to get the EOCD.
  #    "bytes=-N" is a suffix range: the last N bytes of the resource.
  response = HTTParty.get(zip_url, headers: { 'Range' => "bytes=-#{tail_size}" })
  tail = response.body.to_s.b

  # 2. Locate the EOCD (searching backwards, since it sits at the very end)
  #    and extract the central directory size and offset. The fixed part of
  #    the EOCD is 22 bytes; size and offset live at bytes 12..19.
  eocd_pos = tail.rindex("PK\x05\x06".b)
  if eocd_pos.nil? || eocd_pos + 22 > tail.bytesize
    raise "Not enough bytes requested for EOCD"
  end
  # ZIP integers are little-endian regardless of the host platform.
  cd_size, cd_offset, _comment_length = tail.byteslice(eocd_pos + 12, 10).unpack("V2v")
  raise "ZIP64 archives are not supported" if [cd_size, cd_offset].include?(0xFFFFFFFF)
  return [] if cd_size.zero? # empty archive: nothing to request

  # 3. Use the offset and size to request exactly the bytes that contain the
  #    central directory file headers (byte ranges are inclusive).
  response = HTTParty.get(zip_url, headers: {
    'Range' => "bytes=#{cd_offset}-#{cd_offset + cd_size - 1}"
  })
  directory = response.body.to_s.b

  # 4. Walk the headers sequentially and collect the file names. Each header
  #    has a 46 byte fixed part followed by name, extra field and comment;
  #    stepping by the declared lengths avoids mistaking signature-like bytes
  #    inside those variable fields for a new entry.
  signature = "PK\x01\x02".b
  file_names = []
  pos = 0
  while pos + 46 <= directory.bytesize && directory.byteslice(pos, 4) == signature
    name_length, extra_length, entry_comment_length =
      directory.byteslice(pos + 28, 6).unpack("v3")
    file_names << directory.byteslice(pos + 46, name_length)
    pos += 46 + name_length + extra_length + entry_comment_length
  end
  file_names
end
# Example usage: pretty-print the entry names of a sample archive. Running
# this line issues HTTP requests through get_file_names.
pp get_file_names("https://rhardih.io/wp-content/uploads/2021/04/test.zip")
@rhardih
Copy link
Author

rhardih commented Apr 18, 2021

Install the required gem with `$ gem install httparty`

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment