Skip to content

Instantly share code, notes, and snippets.

@ales-erjavec
Created April 6, 2021 13:18
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ales-erjavec/1753ba18f90756158e4c030f6a57fdd4 to your computer and use it in GitHub Desktop.
Save ales-erjavec/1753ba18f90756158e4c030f6a57fdd4 to your computer and use it in GitHub Desktop.
"""
Read the IDX file format as described by http://yann.lecun.com/exdb/mnist/
"""
import os
import io
import struct
import mmap
from functools import reduce
from typing import IO, Tuple, Union
import numpy as np
def parse_magic(mg: bytes) -> Tuple[np.dtype, int]:
    """Parse the 4-byte IDX magic number.

    The magic number is laid out as ``00 00 <type code> <ndim>``.

    Parameters
    ----------
    mg
        Exactly four bytes, as read from the very start of an IDX file.

    Returns
    -------
    (dtype, ndim)
        The numpy dtype of the array elements and the number of
        dimensions.  Multi-byte dtypes are explicitly big-endian because
        IDX is an MSB-first format; a native-endian dtype would decode
        such payloads incorrectly on little-endian hosts.

    Raises
    ------
    ValueError
        If `mg` is not four bytes long, if the two leading bytes are not
        zero, or if the type code is not one of the known IDX codes.
    """
    if len(mg) != 4:
        raise ValueError(
            "IDX magic number must be 4 bytes, got {}".format(len(mg))
        )
    zero1, zero2, typecode, ndim = struct.unpack("BBBB", mg)
    # Explicit check instead of `assert`, which is stripped under `-O`.
    if zero1 != 0 or zero2 != 0:
        raise ValueError(
            "invalid IDX magic number: first two bytes must be zero"
        )
    # IDX type code -> numpy dtype string ('>' marks big-endian storage;
    # single-byte types have no byte order).
    mapping = {
        0x08: "u1",   # unsigned byte
        0x09: "i1",   # signed byte
        0x0B: ">i2",  # short
        0x0C: ">i4",  # int
        0x0D: ">f4",  # float
        0x0E: ">f8",  # double
    }
    try:
        code = mapping[typecode]
    except KeyError:
        raise ValueError(
            "unknown IDX type code: 0x{:02X}".format(typecode)
        ) from None
    return np.dtype(code), ndim
def read_idx(f: IO[bytes]) -> np.ndarray:
    """Read an IDX stream from `f` and load the array into memory.

    `f` is only accessed through ``read``, so it can be any binary
    stream, including a decompressing one (e.g. ``gzip.open(path, "rb")``).

    Parameters
    ----------
    f
        Binary stream positioned at the start of the IDX data.

    Returns
    -------
    numpy.ndarray
        Array with the element type reported by `parse_magic` and the
        shape stored in the header.  It is backed by the bytes that were
        read and is therefore read-only.

    Raises
    ------
    ValueError
        If the header is truncated or the amount of payload data does not
        match the shape declared in the header.
    """
    def read_exact(count: int) -> bytes:
        # A stream may legally return short reads; keep reading until
        # `count` bytes are collected or the stream is exhausted.
        chunks = []
        remaining = count
        while remaining:
            chunk = f.read(remaining)
            if not chunk:
                raise ValueError("truncated IDX header")
            chunks.append(chunk)
            remaining -= len(chunk)
        return b"".join(chunks)

    dtype, ndim = parse_magic(read_exact(4))
    # One 4-byte big-endian size per dimension.  Parsed as unsigned so a
    # corrupt size can never become -1, which `reshape` would silently
    # treat as "infer this dimension".
    shape = struct.unpack(">{}I".format(ndim), read_exact(4 * ndim))
    data = f.read()
    expected = reduce(int.__mul__, shape, dtype.itemsize)
    if len(data) != expected:
        raise ValueError(
            "IDX payload is {} bytes, but shape {} of {} requires {}".format(
                len(data), shape, dtype, expected
            )
        )
    return np.frombuffer(data, dtype).reshape(shape)
def mmap_idx(f: Union[int, io.FileIO]) -> np.ndarray:
    """Memory map the array stored in an IDX file.

    `f` must be uncompressed and reside on the local filesystem.

    Parameters
    ----------
    f
        Either an open file descriptor or a binary file object exposing
        ``read`` and ``fileno``, positioned at the start of the IDX data.
        A descriptor passed as an `int` is not closed by this function.
        Reading the header advances the file position.

    Returns
    -------
    numpy.ndarray
        Read-only array backed by the memory mapping, with the element
        type reported by `parse_magic` and the shape from the header.

    Raises
    ------
    ValueError
        If the header is truncated, or (raised by `mmap`) if the file is
        shorter than the header says it should be.
    """
    if isinstance(f, int):
        # Wrap the descriptor without taking ownership of it.
        f = os.fdopen(f, 'rb', buffering=0, closefd=False)

    def read_exact(count: int) -> bytes:
        # Unbuffered reads may come back short; loop until complete.
        chunks = []
        remaining = count
        while remaining:
            chunk = f.read(remaining)
            if not chunk:
                raise ValueError("truncated IDX header")
            chunks.append(chunk)
            remaining -= len(chunk)
        return b"".join(chunks)

    dtype, ndim = parse_magic(read_exact(4))
    # One 4-byte big-endian size per dimension (unsigned, so a corrupt
    # value cannot turn into a negative size).
    shape = struct.unpack(">{}I".format(ndim), read_exact(4 * ndim))
    header_size = 4 + 4 * ndim
    count = reduce(int.__mul__, shape, 1)
    # mmap maps from file offset 0 regardless of the current read
    # position, and its `offset` argument must be a multiple of the
    # allocation granularity.  So map header + payload and skip the header
    # when building the array; otherwise the header bytes would be
    # interpreted as array data.
    buffer = mmap.mmap(
        f.fileno(),
        length=header_size + count * dtype.itemsize,
        access=mmap.ACCESS_READ,
    )
    array = np.frombuffer(buffer, dtype, count=count, offset=header_size)
    return array.reshape(shape)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment