Skip to content

Instantly share code, notes, and snippets.

@unixpickle
Created December 22, 2019 22:31
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save unixpickle/0eda21a5e11e947e1f7bfb38e2e2c1f8 to your computer and use it in GitHub Desktop.
Save unixpickle/0eda21a5e11e947e1f7bfb38e2e2c1f8 to your computer and use it in GitHub Desktop.
Extract ImageNet tar in-place
// Command extract_tar can be used to extract a large
// ImageNet tarball on a system that doesn't have enough
// storage for both the tarball and the untarred data.
//
// As it extracts the tarball, it truncates the original
// tar file so that it takes up less and less space.
//
// Based on this earlier gist for processing ImageNet tars:
// https://gist.github.com/unixpickle/7304c78032c9f433e28a87409f4d5aca
package main
import (
"io"
"io/ioutil"
"log"
"os"
"sort"
"strconv"
"strings"
"github.com/unixpickle/essentials"
)
// main extracts every image indexed by PathOffsets from the tar file named
// on the command line, shrinking the tar file as it goes.
//
// Files are processed from the highest data offset to the lowest. After a
// file has been written to disk, the tar is truncated at that file's start
// offset, so the bytes that were just extracted (and everything after them)
// no longer occupy space. Any failure aborts the program before the next
// truncation happens.
func main() {
	if len(os.Args) != 2 {
		essentials.Die("Usage: extract_tar <file.tar>")
	}
	f, err := os.OpenFile(os.Args[1], os.O_RDWR, 0)
	essentials.Must(err)
	defer f.Close()

	offsetToPath := PathOffsets(f)
	offsets := make([]FileOffset, 0, len(offsetToPath))
	for offset := range offsetToPath {
		offsets = append(offsets, offset)
	}
	// Descending order: the entry closest to the end of the archive is
	// extracted first, so truncating never cuts off unextracted data.
	sort.Slice(offsets, func(i, j int) bool {
		return offsets[i].Start > offsets[j].Start
	})

	log.Println("Extracting", len(offsets), "files...")
	for _, offset := range offsets {
		path := offsetToPath[offset]
		log.Println(" -", path)

		// The output directory is the first path component. MkdirAll
		// succeeds when the directory already exists, so any error it
		// reports is a real one and must stop the run before truncation.
		dirname := strings.Split(path, "/")[0]
		essentials.Must(os.MkdirAll(dirname, 0755))

		// ReadAt is positional, so no seek (or seek-back) is needed; it
		// reports an error whenever fewer than len(data) bytes are read.
		data := make([]byte, offset.Length)
		_, err = f.ReadAt(data, offset.Start)
		essentials.Must(err)
		essentials.Must(ioutil.WriteFile(path, data, 0755))

		essentials.Must(f.Truncate(offset.Start))
	}
}
// FileOffset identifies the data of one extracted file inside the tar file.
// PathOffsets uses it as a map key, and main uses it to read the file's
// bytes and to decide where to truncate the tar.
type FileOffset struct {
// Start is the byte offset, from the beginning of the tar file, at which
// the file's data begins.
Start int64
// Length is the number of data bytes belonging to the file.
Length int
}
// PathOffsets walks a tar stream whose entries are themselves ".tar"
// archives (one per wnid) and returns, for every file inside those nested
// archives, where its data lives in the outer stream.
//
// The map key holds the absolute start offset and length of the file's data;
// the value is the output path "<wnid>/<image name>". Outer entries whose
// name does not end in ".tar" are skipped.
//
// A read error while measuring an image terminates the program: recording a
// short length would make the caller write an incomplete file.
//
// NOTE(review): the error results of both ReadTar calls are discarded, as in
// the original code, so a walk that stops early yields a partial index.
// Confirm whether that should be surfaced to the caller.
func PathOffsets(r io.Reader) map[FileOffset]string {
	offsets := map[FileOffset]string{}
	ReadTar(r, func(wnid string, folderTar io.Reader, offset int64) {
		if !strings.HasSuffix(wnid, ".tar") {
			return
		}
		wnid = strings.TrimSuffix(wnid, ".tar")
		log.Println("Processing wnid:", wnid)
		ReadTar(folderTar, func(imageName string, img io.Reader, subOffset int64) {
			path := wnid + "/" + imageName
			// Only the size is needed, so stream the data to a sink
			// instead of buffering the whole image in memory.
			length, err := io.Copy(ioutil.Discard, img)
			if err != nil {
				log.Fatalln("reading", path+":", err)
			}
			// subOffset is relative to the nested archive, whose own data
			// starts at offset in the outer stream.
			key := FileOffset{
				Start:  offset + subOffset,
				Length: int(length),
			}
			offsets[key] = path
		})
	})
	return offsets
}
// ReadTar iterates over the entries of the tar stream r, invoking cb once
// per entry with the entry name, a reader limited to the entry's data, and
// the offset of that data measured from where r started.
//
// Whatever cb leaves unread is drained afterwards, followed by the padding
// that rounds the data up to a multiple of 512 bytes. Iteration ends with a
// nil error when a header with an empty name is returned, and with the
// underlying error when reading a header or skipping data fails.
func ReadTar(r io.Reader, cb func(name string, data io.Reader, offset int64)) error {
	const blockSize = 512

	pos := int64(0)
	for {
		name, size, err := ReadTarHeader(r)
		switch {
		case err != nil:
			return err
		case name == "":
			return nil
		}

		// The data begins right after the 512-byte header block.
		body := io.LimitReader(r, size)
		cb(name, body, pos+blockSize)
		if _, err = io.Copy(ioutil.Discard, body); err != nil {
			return err
		}
		pos += blockSize + size

		if rem := size % blockSize; rem != 0 {
			padding := blockSize - rem
			if _, err = io.Copy(ioutil.Discard, io.LimitReader(r, padding)); err != nil {
				return err
			}
			pos += padding
		}
	}
}
// ReadTarHeader reads one 512-byte tar header block from r and returns the
// entry's name (bytes 0-99) and its data size (bytes 124-135, octal text).
//
// A block whose name field is empty is reported as ("", 0, nil) without
// looking at the size field; this is what an all-zero end-of-archive block
// looks like, and its size field is not a parseable number. The size field
// may be padded or terminated with spaces as well as NUL. A size that is not
// a non-negative octal number yields an error, as does a short read.
func ReadTarHeader(r io.Reader) (name string, size int64, err error) {
	buf := make([]byte, 512)
	if _, err = io.ReadFull(r, buf); err != nil {
		return "", 0, err
	}

	name = NullTermStr(buf[:100])
	if name == "" {
		return "", 0, nil
	}

	sizeField := strings.Trim(NullTermStr(buf[124:136]), " ")
	// ParseUint rejects a sign prefix, and a bit size of 63 guarantees the
	// result fits in an int64.
	parsed, err := strconv.ParseUint(sizeField, 8, 63)
	if err != nil {
		return "", 0, err
	}
	return name, int64(parsed), nil
}
// NullTermStr returns the bytes of data that precede the first zero byte,
// as a string. When data contains no zero byte, all of it is returned.
func NullTermStr(data []byte) string {
	end := len(data)
	for idx := 0; idx < len(data); idx++ {
		if data[idx] == 0 {
			end = idx
			break
		}
	}
	return string(data[:end])
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment