Blob Indexer/Reassembler

If you have an uncompressed, unfragmented block filesystem aligned to 1 KiB (1024-byte) segments, such as an ISO image, you can use this software to store the individual files separately while keeping the ability to reassemble the original whenever you want. The restored file will be bit-for-bit identical to the original, regardless of the input file format. Only known file segments are cut out.
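The core idea is easy to see in miniature: because files in such a filesystem occupy contiguous runs of 1 KiB-aligned blocks, a known file can be located inside the blob by matching its first block, then verifying the rest. A minimal sketch of just that matching step, with hypothetical file names:

BLOCK = 1024
# Read the first block of a file we expect to find inside the blob.
with open("some_extracted_file", "rb") as f:
    first_block = f.read(BLOCK)
candidates = []
with open("image.iso", "rb") as blob:
    i = 0
    while True:
        block = blob.read(BLOCK)
        if len(block) < BLOCK:
            break
        if block == first_block:
            candidates.append(i)  # candidate only; still needs full verification
        i += 1
print(candidates)  # block numbers where the file may start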

Usage

For instance, if you have a copy of an Ubuntu installer:

$ 7z x ubuntu-21.04-desktop-amd64.iso -o"ubuntu-21.04-desktop-amd64.iso.unpack"
$ idx_blob.py ubuntu-21.04-desktop-amd64.iso

You'll get these files:

$ du -hs *
2.7G  ubuntu-21.04-desktop-amd64.iso
2.1M  ubuntu-21.04-desktop-amd64.iso.bin
44K   ubuntu-21.04-desktop-amd64.iso.idx
2.7G  ubuntu-21.04-desktop-amd64.iso.unpack

You can now throw out ubuntu-21.04-desktop-amd64.iso and rebuild it at any time with:

$ rebuild_blob.py ubuntu-21.04-desktop-amd64.iso

You can specify another output path as a second argument if desired. The *.bin file typically has a lot of empty space in it, so you may want to compress it. The .idx file is just a JSON mapping of where each file was cut out of the original image; the *.bin file holds the leftover "extra" data.
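For example, you could compress the leftover data with any general-purpose compressor (zstd shown here as one option):

$ zstd ubuntu-21.04-desktop-amd64.iso.bin

If you want to peek at the index, it is plain JSON: a list of [block, file] pairs, where block is a 1 KiB block number in the original image. A minimal sketch:

import json
with open("ubuntu-21.04-desktop-amd64.iso.idx") as fh:
    index = json.load(fh)
for pos, file in index[:10]:
    # Each entry records where a file's data was cut out of the image.
    print(f"block {pos} (byte offset {pos * 1024}): {file}")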

idx_blob.py

#!/usr/bin/env python3
import os
import os.path
import sys
import json
import math

print("Collecting files...")
# Map the first 1 KiB block of every extracted file to its relative path.
files = {}
files_multiple = {}
path = sys.argv[1]
fpath = f"{path}.unpack"
fpath_len = len(fpath) + 1
empty_key = bytes(1024)
for root, _, subpaths in os.walk(fpath, topdown=False):
    for file in subpaths:
        full_path = os.path.join(root, file)
        fkey = full_path[fpath_len:]
        with open(full_path, 'rb') as fh:
            fdata = fh.read(1024)
        # Skip files smaller than one block, and files starting with an
        # all-zero block; both would produce meaningless matches.
        if len(fdata) < 1024:
            continue
        if fdata == empty_key:
            continue
        if fdata in files:
            if files[fdata] == '__multiple__':
                files_multiple[fdata].add(fkey)
            else:
                # Several files share the same first block; keep all of them.
                existing = files[fdata]
                files_multiple[fdata] = {existing, fkey}
                files[fdata] = '__multiple__'
        else:
            files[fdata] = fkey

i = 0
index = []
matching_index = []
prevent_messages = set()

def copy_blocks(source, dest, count):
    for _ in range(count):
        dest.write(source.read(1024))

with open(path, 'rb') as fh:
    print("Building index...")
    # Scan the blob block by block, recording candidate file start positions.
    while True:
        fdata = fh.read(1024)
        if len(fdata) < 1024:
            break
        data_match = files.get(fdata)
        if data_match:
            if data_match == '__multiple__':
                for match in files_multiple[fdata]:
                    index.append((i, match))
                    if match not in prevent_messages:
                        prevent_messages.add(match)
                        print(f"Found possible match at {i} for {match} (multi)")
            else:
                index.append((i, data_match))
                if data_match not in prevent_messages:
                    prevent_messages.add(data_match)
                    print(f"Found possible match at {i} for {data_match}")
        i += 1
    del files
    print("Verifying...")
    # Confirm each candidate by comparing the entire file against the blob.
    for pos, file in index:
        fh.seek(pos * 1024)
        with open(os.path.join(fpath, file), 'rb') as match:
            match_ok = True
            while True:
                fdata1 = fh.read(1024)
                fdata2 = match.read(1024)
                if len(fdata2) < 1024:
                    if len(fdata2) == 0:
                        break
                    # Final partial block: the remainder in the blob must be
                    # zero padding, or the cut would not be reversible.
                    leftovers = fdata1[len(fdata2):]
                    for c in leftovers:
                        if c != 0:
                            print(f"Found junk data at {pos} for {file} (ignoring)")
                            match_ok = False
                            break
                    fdata1 = fdata1[:len(fdata2)]
                if fdata1 != fdata2:
                    match_ok = False
                    break
            if match_ok:
                print(f"Verified match at {pos} for {file}")
                matching_index.append((pos, file))
    del index
    print("Verifying index...")
    # Drop matches that overlap an earlier accepted match.
    good_index = []
    index_with_sizes = []
    blocks = 0
    blocks_saved = 0
    total_blocks = math.ceil(os.path.getsize(path) / 1024)
    for pos, file in matching_index:
        if blocks > pos:
            print(f"Ignoring entry {file} due to overlap.")
            continue
        skip = math.ceil(os.path.getsize(os.path.join(fpath, file)) / 1024)
        index_with_sizes.append((pos, file, skip))
        good_index.append((pos, file))
        blocks = pos + skip
        blocks_saved += skip
    savings = math.floor(blocks_saved / total_blocks * 100)
    del matching_index
    print(f"Will save {savings} percent of filesize!")
    if len(good_index) == 0:
        print("FATAL: Index is empty!")
        sys.exit(1)
    print("Writing index...")
    with open(f"{path}.idx", 'w') as idx:
        json.dump(good_index, idx)
    print("Writing binary...")
    # Copy everything not covered by an indexed file into the .bin.
    with open(f"{path}.bin", 'wb') as bin:
        blocks = 0
        fh.seek(0)
        for pos, file, skip in index_with_sizes:
            head_copy = pos - blocks
            copy_blocks(fh, bin, head_copy)
            blocks = pos + skip
            fh.seek(blocks * 1024)
        tail_copy = total_blocks - blocks
        copy_blocks(fh, bin, tail_copy)
rebuild_blob.py

#!/usr/bin/env python3
import os
import os.path
import sys
import json
import math

path = sys.argv[1]
write_path = sys.argv[2] if len(sys.argv) == 3 else path
fpath = f"{path}.unpack"
print("Reading index...")
with open(f"{path}.idx", 'r') as fh:
    index = json.load(fh)
print("Scanning files...")
index_with_sizes = []
for pos, file in index:
    skip = math.ceil(os.path.getsize(os.path.join(fpath, file)) / 1024)
    index_with_sizes.append((pos, file, skip))

def copy_blocks(source, dest, count):
    for _ in range(count):
        dest.write(source.read(1024))

def copy_blocks_padded(source, dest, count):
    # Copy a file back in, zero-padding its final block to 1 KiB.
    if count < 0:
        return
    for _ in range(count - 1):
        dest.write(source.read(1024))
    fdata = source.read(1024)
    dest.write(fdata)
    if len(fdata) < 1024:
        dest.write(bytes(1024 - len(fdata)))

print("Rebuilding file...")
total_blocks = math.ceil(os.path.getsize(f"{path}.bin") / 1024)
with open(write_path, 'wb') as fh:
    with open(f"{path}.bin", 'rb') as bin:
        blocks = 0
        blocks_copied = 0
        # Interleave runs of "extra" data from the .bin with the original
        # files, in index order.
        for pos, file, skip in index_with_sizes:
            head_copy = pos - blocks
            copy_blocks(bin, fh, head_copy)
            with open(os.path.join(fpath, file), 'rb') as fch:
                copy_blocks_padded(fch, fh, skip)
            blocks = pos + skip
            blocks_copied += head_copy
        tail_copy = total_blocks - blocks_copied
        copy_blocks(bin, fh, tail_copy)
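To confirm that the rebuild really is bit-for-bit identical, you can hash both files. A minimal sketch, assuming the rebuilt copy was written to a hypothetical .rebuilt path via the second argument:

import hashlib

def sha256_of(p):
    h = hashlib.sha256()
    with open(p, "rb") as fh:
        for chunk in iter(lambda: fh.read(1 << 20), b""):
            h.update(chunk)
    return h.hexdigest()

orig = sha256_of("ubuntu-21.04-desktop-amd64.iso")
rebuilt = sha256_of("ubuntu-21.04-desktop-amd64.iso.rebuilt")  # hypothetical output path
print("identical" if orig == rebuilt else "MISMATCH")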