Skip to content

Instantly share code, notes, and snippets.

@sedrubal
Created July 22, 2023 13:39
Show Gist options
  • Save sedrubal/ca29ff50df8282a5ec4c9337c4ecb5e5 to your computer and use it in GitHub Desktop.
Save sedrubal/ca29ff50df8282a5ec4c9337c4ecb5e5 to your computer and use it in GitHub Desktop.
cpio parser
#!/usr/bin/python3
"""
Docs: https://www.systutorials.com/docs/linux/man/5-cpio/
Implements the old binary format.
TODO: only regular files are supported
TODO: files are not extracted yet
"""
import argparse
import grp
import pwd
import struct
import sys
from dataclasses import dataclass
from datetime import datetime
from pathlib import Path
# Expected header magic, stored in the textual form produced by ``oct()``
# because the header check compares ``oct(magic)`` against this string.
HEADER_MAGIC = oct(0o70707)
# Size in bytes of the fixed header: nine 2-byte fields plus two 4-byte fields.
HEADER_LEN = 9 * 2 + 2 * 4
def parse_args():
    """Parse the command line and return the resulting namespace.

    The namespace has a single attribute ``file``: the cpio archive given as
    positional argument, already opened for binary reading by argparse.
    """
    arg_parser = argparse.ArgumentParser()
    binary_reader = argparse.FileType(mode="rb")
    arg_parser.add_argument("file", type=binary_reader, help="The cpio file to read.")
    namespace = arg_parser.parse_args()
    return namespace
def parse_4_byte_int(value: bytes) -> int:
    """Decode a four-byte integer stored as two 16-bit halves.

    The first two bytes form the most-significant half, the last two bytes the
    least-significant half; within each half the low byte comes first.

    Raises:
        ValueError: if ``value`` does not contain exactly four items.
    """
    high_lsb, high_msb, low_lsb, low_msb = value
    high_word = (high_msb << 8) | high_lsb
    low_word = (low_msb << 8) | low_lsb
    return (high_word << 16) | low_word
class ParseError(Exception):
    """Signals that the input could not be parsed (e.g. a truncated or invalid header)."""
def read_padding_byte(file):
    """Consume one padding byte from ``file`` and warn if it is unexpected.

    Nothing is raised or returned: problems are only reported on stderr so
    that the caller can keep parsing.

    Args:
        file: Binary file object positioned at the padding byte.
    """
    padding = file.read(1)
    if len(padding) != 1:
        print("File ended before reading the padding byte.", file=sys.stderr)
    # ``elif`` matters: at end of file ``padding`` is empty and indexing it
    # would raise IndexError right after the warning above.
    elif padding[0]:
        print(f"Padding byte should be zero, but was {padding[0]}", file=sys.stderr)
@dataclass
class Header:
magic: int
dev: int
ino: int
mode: int
user: int | str
group: int | str
nlink: int
rdev: int
mtime: datetime
namesize: int
filesize: int
@classmethod
def read(cls, file) -> "Header":
header_bytes = file.read(HEADER_LEN)
if len(header_bytes) < HEADER_LEN:
raise ParseError(
f"File ended before end of header. Header bytes: {header_bytes!r}"
)
(
magic,
dev,
ino,
mode,
uid,
gid,
nlink,
rdev,
mtime,
namesize,
filesize,
) = struct.unpack(
"<HHHHHHHH4sH4s",
header_bytes,
)
if oct(magic) != HEADER_MAGIC:
raise ParseError(
f"Header magic should be {HEADER_MAGIC} but was {oct(magic)} (in octal)"
)
try:
user = pwd.getpwuid(uid).pw_name
except KeyError:
user = uid
try:
group = grp.getgrgid(gid).gr_name
except KeyError:
group = gid
try:
mtime = (datetime.fromtimestamp(parse_4_byte_int(mtime)),)
except Exception as exc:
print(f"Error parsing mtime {exc}", file=sys.stderr)
mtime = datetime.fromtimestamp(0)
return cls(
magic=magic,
dev=dev,
ino=ino,
mode=mode,
user=user,
group=group,
nlink=nlink,
rdev=rdev,
mtime=mtime,
namesize=namesize,
filesize=parse_4_byte_int(filesize),
)
def read_filename(self, file) -> Path:
filename = file.read(self.namesize)
if len(filename) < self.namesize:
print(
f"File ended before reading complete filename. Filename so far is {filename}",
file=sys.stderr,
)
if filename[-1] == 0:
filename = filename[:-1]
if self.namesize % 2:
read_padding_byte(file)
try:
return Path(filename.decode("utf8"))
except UnicodeDecodeError as err:
print(f"Could not decode filename: {err}", file=sys.stderr)
return Path(filename.decode("utf8", errors="ignore"))
def read_file(self, file) -> bytes:
data = file.read(self.filesize)
if len(data) < self.filesize:
print(
"File ended before reading complete content."
f" Read {len(data)} Bytes. Expected {self.filesize} Bytes.",
file=sys.stderr,
)
if self.filesize % 2:
read_padding_byte(file)
return data
def main():
    """Parse the cpio archive named on the command line and print its content.

    Entries are read until the trailer entry is found or a header cannot be
    parsed. Afterwards every collected file is printed: as text if its data is
    valid UTF-8, otherwise as a byte count.
    """
    args = parse_args()
    files: dict[Path, bytes] = {}
    # ``read_filename`` returns a Path, and a Path never compares equal to a
    # plain str, so the trailer name has to be a Path as well.
    trailer_name = Path("TRAILER!!!")

    # Make sure the archive opened by argparse is closed again.
    with args.file as cpio_file:
        while True:
            try:
                header = Header.read(cpio_file)
            except ParseError as err:
                print(err, file=sys.stderr)
                break

            filename = header.read_filename(cpio_file)
            filedata = header.read_file(cpio_file)

            if filename == trailer_name:
                if any(filedata):
                    print("Trailer file contains data", file=sys.stderr)
                break

            if filename in files:
                print(f"File {filename} already in files. Overwriting.", file=sys.stderr)
            files[filename] = filedata

    print("Content:")
    print("========")
    for file_name, data in files.items():
        print()
        print(f"{file_name}:")
        print()
        try:
            content = data.decode("utf-8")
        except UnicodeDecodeError:
            content = f"{len(data)} Bytes binary"
        print(content)


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment