Skip to content

Instantly share code, notes, and snippets.

@JBlaschke
Last active December 21, 2022 22:00
Show Gist options
  • Save JBlaschke/ec0f4ece417bdfe1428a85c78c0804f7 to your computer and use it in GitHub Desktop.
Save JBlaschke/ec0f4ece417bdfe1428a85c78c0804f7 to your computer and use it in GitHub Desktop.
Example of an in-memory file system used to read a compressed tar archive (without extracting it to disk). Based on the uncompressed version at https://gist.github.com/KristofferC/b354b654e776f4910b03985830418723
"""
    memtar

Read-only, in-memory view of a gzip-compressed tar archive.

The archive is decompressed once into an `IOBuffer`; the regular files it
contains are indexed by path and their contents are materialized lazily (and
cached) on first access via [`readfile`](@ref). Nothing is extracted to disk.
"""
module memtar

using Tar, CodecZlib, TranscodingStreams

"""
    InMemoryFile

Index entry for one regular file stored in the decompressed tarball.

# Fields
- `size`: number of data bytes, taken from the tar header.
- `pos`: offset in the decompressed tarball at which the file's data starts.
- `str`: cached contents; `nothing` until `readfile` has read the file once.
"""
mutable struct InMemoryFile
    size::Int
    pos::Int
    str::Union{Nothing, String}
end

"""
    InMemoryFileSystem

Decompressed tarball plus an index of the regular files it contains.

# Fields
- `d`: maps each file's path (as recorded in the tar header) to its entry.
- `tarball_io`: buffer holding the whole decompressed tarball.
- `out_io`: scratch buffer that receives a file's bytes while it is being read.
- `buf`: scratch byte vector handed to the `Tar` reading routines.
"""
mutable struct InMemoryFileSystem
    d::Dict{String, InMemoryFile}
    tarball_io::IOBuffer
    out_io::IOBuffer
    buf::Vector{UInt8}
end

"""
    readfile(ref::InMemoryFileSystem, path::AbstractString) -> String

Return the contents of the file stored under `path`.

The first call for a given `path` copies the file's bytes out of the
decompressed tarball and caches the resulting `String` on the index entry;
later calls return the cached value.

# Throws
- `KeyError` if `path` is not present in `ref.d`.
"""
function readfile(ref::InMemoryFileSystem, path::AbstractString)
    file = ref.d[path]
    cached = file.str
    cached === nothing || return cached
    # Jump to the start of this file's data inside the decompressed tarball.
    seek(ref.tarball_io, file.pos)
    # `Tar.read_data` is a non-exported helper of the Tar package.
    Tar.read_data(ref.tarball_io, ref.out_io; size=file.size, buf=ref.buf)
    contents = String(take!(ref.out_io))
    file.str = contents
    return contents
end

# Predicate passed to `Tar.read_tarball` that accepts every header.
const true_predicate = _ -> true

"""
    create_inmemory_filesystem(path::AbstractString) -> InMemoryFileSystem

Decompress the gzip-compressed tar archive at `path` into memory and build an
index of the regular files it contains.

Only entries whose header type is `:file` are indexed; every other entry type
is ignored. File contents are not read here: only each file's size and data
offset are recorded, and `readfile` fetches the data on demand.
"""
function create_inmemory_filesystem(path::AbstractString)
    # Load and decompress archive into memory.
    # The buffer uses its own local name inside the closure: assigning to the
    # enclosing function's `tarball_io` from within the closure would force
    # that variable to be boxed and inferred as `Any`.
    tarball_io = open(path) do tarball_gz
        decompressed = IOBuffer()
        gz_stream = GzipDecompressorStream(decompressed)
        # Finish the stream with TOKEN_END + flush instead of `close`, since
        # closing `gz_stream` would also close the underlying `decompressed`.
        write(gz_stream, tarball_gz, TranscodingStreams.TOKEN_END)
        flush(gz_stream)
        seekstart(decompressed)
        decompressed
    end

    # In-memory file system
    buf = Vector{UInt8}(undef, Tar.DEFAULT_BUFFER_SIZE)
    system = InMemoryFileSystem(
        Dict{String, InMemoryFile}(), tarball_io, IOBuffer(), buf
    )

    # Walk the decompressed archive and record where each file's data lives.
    Tar.arg_read(tarball_io) do tar
        Tar.read_tarball(true_predicate, tar; buf=buf) do hdr, _
            if hdr.type == :file
                # The stream is positioned at the first data byte of the entry.
                p = position(tar)
                # Skip over the data here; `readfile` seeks back to `p` later.
                Tar.skip_data(tar, hdr.size)
                system.d[hdr.path] = InMemoryFile(hdr.size, p, nothing)
            end
        end
    end
    return system
end

end
# Test
# Round-trip check: build a small gzip-compressed tar archive, load it through
# `memtar`, and read both files back from memory.
using Tar, CodecZlib
# Make a compressed tar archive
td = mktempdir()
data1 = "foobarbaz"^10
data2 = "spam"^20
open(joinpath(td, "data1.txt"), "w") do io
    write(io, data1)
end
open(joinpath(td, "data2.txt"), "w") do io
    write(io, data2)
end
# Use one path for both writing and reading, so the script also works when the
# current working directory is not the directory containing this file.
data_path = joinpath(@__DIR__, "archive.tar.gz")
tar_gz = open(data_path, write=true)
tar = GzipCompressorStream(tar_gz)
try
    Tar.create(td, tar)
finally
    # Closing the compressor stream finishes the gzip data and also closes the
    # wrapped file handle, even if `Tar.create` throws.
    close(tar)
end
# Load compressed tar archive into memory (and decompress it)
fs = memtar.create_inmemory_filesystem(data_path)
println(fs.d)
# Extract data from in-memory file system
data = memtar.readfile(fs, "data1.txt")
println(data)
data == data1 || error("data1.txt: contents read from memory differ from what was written")
data = memtar.readfile(fs, "data2.txt")
println(data)
data == data2 || error("data2.txt: contents read from memory differ from what was written")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment