Last active
December 21, 2022 22:00
-
-
Save JBlaschke/ec0f4ece417bdfe1428a85c78c0804f7 to your computer and use it in GitHub Desktop.
Example of in-memory file system used to read a compressed tar archive (without extracting to disk). Based on the uncompressed version from: https://gist.github.com/KristofferC/b354b654e776f4910b03985830418723
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
module memtar | |
using Tar, CodecZlib, TranscodingStreams | |
"""
    InMemoryFile(size, pos, str)

Index entry for one regular file stored inside the in-memory tarball.

The entry is mutable so that the file content can be cached in `str` the first time
it is read.
"""
mutable struct InMemoryFile
    # Size of the file data in bytes, as recorded in the tar header.
    size::Int
    # Offset into the decompressed tarball at which the file data begins.
    pos::Int
    # Cached file content; `nothing` until the file has been read once.
    str::Union{Nothing,String}
end
"""
    InMemoryFileSystem(d, tarball_io, out_io, buf)

Read-only view of a tar archive that is held entirely in memory.

`d` indexes the archive's files by path, `tarball_io` holds the decompressed archive
bytes, and `out_io` / `buf` are scratch buffers that are reused for every read.
"""
mutable struct InMemoryFileSystem
    # Archive path => location (and cached content) of that file.
    d::Dict{String,InMemoryFile}
    # The decompressed tarball.
    tarball_io::IOBuffer
    # Scratch buffer that receives the data of the file currently being read.
    out_io::IOBuffer
    # Copy buffer handed to the `Tar` routines.
    buf::Vector{UInt8}
end
"""
    readfile(ref::InMemoryFileSystem, path::AbstractString) -> String

Return the content of the archive member indexed under `path` as a `String`.

The first request for a member copies `size` bytes out of the in-memory tarball,
starting at the member's recorded offset, and caches the result on its index entry.
Later requests return the cached string without touching the tarball again.

Throws a `KeyError` if `path` is not present in `ref.d`.
"""
function readfile(ref::InMemoryFileSystem, path::AbstractString)
    file = ref.d[path]

    # Bind the field to a local so the `nothing` check narrows its type and the
    # function always returns a `String`.
    cached = file.str
    cached === nothing || return cached

    # `out_io` is shared between calls. Drop anything a previous read may have left
    # behind after failing part-way, so it cannot leak into this file's content.
    truncate(ref.out_io, 0)

    seek(ref.tarball_io, file.pos)
    Tar.read_data(ref.tarball_io, ref.out_io; size=file.size, buf=ref.buf)

    content = String(take!(ref.out_io))
    file.str = content
    return content
end
# Predicate that accepts every tar entry, regardless of its header.
true_predicate(_) = true
"""
    create_inmemory_filesystem(path::AbstractString) -> InMemoryFileSystem

Load the gzip-compressed tar archive at `path` into memory and index its files,
without extracting anything to disk.

The whole archive is decompressed into an `IOBuffer`. The tarball is then scanned
once: for every entry whose header type is `:file`, the offset of its data and its
size are recorded under the entry's path. File contents are not copied here; they are
read on demand by `readfile`.
"""
function create_inmemory_filesystem(path::AbstractString)
    # Load and decompress the archive into memory. The buffer inside the closure has
    # its own name on purpose: assigning to `tarball_io` there would rebind the
    # enclosing function's variable, forcing it to be boxed and untyped.
    tarball_io = open(path) do tarball_gz
        decompressed = IOBuffer()
        gz_stream = GzipDecompressorStream(decompressed)
        # TOKEN_END tells the stream that the compressed input is complete.
        write(gz_stream, tarball_gz, TranscodingStreams.TOKEN_END)
        flush(gz_stream)
        seekstart(decompressed)
        decompressed
    end

    # In-memory file system
    buf = Vector{UInt8}(undef, Tar.DEFAULT_BUFFER_SIZE)
    system = InMemoryFileSystem(
        Dict{String,InMemoryFile}(), tarball_io, IOBuffer(), buf
    )

    # Index the decompressed archive: remember where each file's data starts and
    # skip over the data itself.
    Tar.arg_read(tarball_io) do tar
        Tar.read_tarball(true_predicate, tar; buf=buf) do hdr, _
            if hdr.type == :file
                p = position(tar)
                Tar.skip_data(tar, hdr.size)
                system.d[hdr.path] = InMemoryFile(hdr.size, p, nothing)
            end
        end
    end

    return system
end
end | |
# Test
using Tar, CodecZlib

# Make a compressed tar archive from two small text files
td = mktempdir()
data1 = "foobarbaz"^10
data2 = "spam"^20
write(joinpath(td, "data1.txt"), data1)
write(joinpath(td, "data2.txt"), data2)

# Write the archive to the same location it is read back from below, so the script
# does not depend on the current working directory.
data_path = joinpath(@__DIR__, "archive.tar.gz")
tar_gz = open(data_path, write=true)
tar = GzipCompressorStream(tar_gz)
try
    Tar.create(td, tar)
finally
    # Closing the compressor stream finishes the gzip data and closes `tar_gz`.
    close(tar)
end

# Load compressed tar archive into memory (and decompress it)
fs = memtar.create_inmemory_filesystem(data_path)
println(fs.d)

# Extract data from in-memory file system
data = memtar.readfile(fs, "data1.txt")
println(data)
data = memtar.readfile(fs, "data2.txt")
println(data)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment