Blob Indexer/Reassembler

If you have an uncompressed, unfragmented block filesystem aligned to 1 KiB (1024-byte) segments, such as an ISO image, you can use this software to store the individual files separately while keeping the ability to reassemble the original whenever you want. The restored file will be bit-for-bit identical to the original, regardless of the input file format. Only known file segments are cut out.
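The core idea is easy to see in miniature: because files in such a filesystem occupy contiguous runs of 1 KiB-aligned blocks, a known file can be located inside the blob by matching its first block, then verifying the rest. A minimal sketch of just that matching step, with hypothetical file names:

BLOCK = 1024
# Read the first block of a file we expect to find inside the blob.
with open("some_extracted_file", "rb") as f:
    first_block = f.read(BLOCK)
candidates = []
with open("image.iso", "rb") as blob:
    i = 0
    while True:
        block = blob.read(BLOCK)
        if len(block) < BLOCK:
            break
        if block == first_block:
            candidates.append(i)  # candidate only; still needs full verification
        i += 1
print(candidates)  # block numbers where the file may start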

Usage

For instance, if you have a copy of an Ubuntu installer:

$ 7z x ubuntu-21.04-desktop-amd64.iso -o"ubuntu-21.04-desktop-amd64.iso.unpack"
$ idx_blob.py ubuntu-21.04-desktop-amd64.iso

You'll get these files:

$ du -hs *
2.7G  ubuntu-21.04-desktop-amd64.iso
2.1M  ubuntu-21.04-desktop-amd64.iso.bin
44K   ubuntu-21.04-desktop-amd64.iso.idx
2.7G  ubuntu-21.04-desktop-amd64.iso.unpack

You can now throw out ubuntu-21.04-desktop-amd64.iso and rebuild it at any time with:

$ rebuild_blob.py ubuntu-21.04-desktop-amd64.iso

You can specify another output path as a second argument if desired. The *.bin file typically has a lot of empty space in it, so you may want to compress it. The .idx file is just a JSON mapping of where each file was cut out of the original image; the *.bin file holds the leftover "extra" data.
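For example, you could compress the leftover data with any general-purpose compressor (zstd shown here as one option):

$ zstd ubuntu-21.04-desktop-amd64.iso.bin

If you want to peek at the index, it is plain JSON: a list of [block, file] pairs, where block is a 1 KiB block number in the original image. A minimal sketch:

import json
with open("ubuntu-21.04-desktop-amd64.iso.idx") as fh:
    index = json.load(fh)
for pos, file in index[:10]:
    # Each entry records where a file's data was cut out of the image.
    print(f"block {pos} (byte offset {pos * 1024}): {file}")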

idx_blob.py

#!/usr/bin/env python3
import os
import os.path
import sys
import json
import math

print("Collecting files...")
# Map the first 1 KiB block of every extracted file to its relative path.
files = {}
files_multiple = {}
path = sys.argv[1]
fpath = f"{path}.unpack"
fpath_len = len(fpath) + 1
empty_key = bytes(1024)
for root, _, subpaths in os.walk(fpath, topdown=False):
    for file in subpaths:
        full_path = os.path.join(root, file)
        fkey = full_path[fpath_len:]
        with open(full_path, 'rb') as fh:
            fdata = fh.read(1024)
        # Skip files smaller than one block, and files starting with an
        # all-zero block; both would produce meaningless matches.
        if len(fdata) < 1024:
            continue
        if fdata == empty_key:
            continue
        if fdata in files:
            if files[fdata] == '__multiple__':
                files_multiple[fdata].add(fkey)
            else:
                # Several files share the same first block; keep all of them.
                existing = files[fdata]
                files_multiple[fdata] = {existing, fkey}
                files[fdata] = '__multiple__'
        else:
            files[fdata] = fkey

i = 0
index = []
matching_index = []
prevent_messages = set()

def copy_blocks(source, dest, count):
    for _ in range(count):
        dest.write(source.read(1024))

with open(path, 'rb') as fh:
    print("Building index...")
    # Scan the blob block by block, recording candidate file start positions.
    while True:
        fdata = fh.read(1024)
        if len(fdata) < 1024:
            break
        data_match = files.get(fdata)
        if data_match:
            if data_match == '__multiple__':
                for match in files_multiple[fdata]:
                    index.append((i, match))
                    if match not in prevent_messages:
                        prevent_messages.add(match)
                        print(f"Found possible match at {i} for {match} (multi)")
            else:
                index.append((i, data_match))
                if data_match not in prevent_messages:
                    prevent_messages.add(data_match)
                    print(f"Found possible match at {i} for {data_match}")
        i += 1
    del files
    print("Verifying...")
    # Confirm each candidate by comparing the entire file against the blob.
    for pos, file in index:
        fh.seek(pos * 1024)
        with open(os.path.join(fpath, file), 'rb') as match:
            match_ok = True
            while True:
                fdata1 = fh.read(1024)
                fdata2 = match.read(1024)
                if len(fdata2) < 1024:
                    if len(fdata2) == 0:
                        break
                    # Final partial block: the remainder in the blob must be
                    # zero padding, or the cut would not be reversible.
                    leftovers = fdata1[len(fdata2):]
                    for c in leftovers:
                        if c != 0:
                            print(f"Found junk data at {pos} for {file} (ignoring)")
                            match_ok = False
                            break
                    fdata1 = fdata1[:len(fdata2)]
                if fdata1 != fdata2:
                    match_ok = False
                    break
            if match_ok:
                print(f"Verified match at {pos} for {file}")
                matching_index.append((pos, file))
    del index
    print("Verifying index...")
    # Drop matches that overlap an earlier accepted match.
    good_index = []
    index_with_sizes = []
    blocks = 0
    blocks_saved = 0
    total_blocks = math.ceil(os.path.getsize(path) / 1024)
    for pos, file in matching_index:
        if blocks > pos:
            print(f"Ignoring entry {file} due to overlap.")
            continue
        skip = math.ceil(os.path.getsize(os.path.join(fpath, file)) / 1024)
        index_with_sizes.append((pos, file, skip))
        good_index.append((pos, file))
        blocks = pos + skip
        blocks_saved += skip
    savings = math.floor(blocks_saved / total_blocks * 100)
    del matching_index
    print(f"Will save {savings} percent of filesize!")
    if len(good_index) == 0:
        print("FATAL: Index is empty!")
        sys.exit(1)
    print("Writing index...")
    with open(f"{path}.idx", 'w') as idx:
        json.dump(good_index, idx)
    print("Writing binary...")
    # Copy everything not covered by an indexed file into the .bin.
    with open(f"{path}.bin", 'wb') as bin:
        blocks = 0
        fh.seek(0)
        for pos, file, skip in index_with_sizes:
            head_copy = pos - blocks
            copy_blocks(fh, bin, head_copy)
            blocks = pos + skip
            fh.seek(blocks * 1024)
        tail_copy = total_blocks - blocks
        copy_blocks(fh, bin, tail_copy)
rebuild_blob.py

#!/usr/bin/env python3
import os
import os.path
import sys
import json
import math

path = sys.argv[1]
write_path = sys.argv[2] if len(sys.argv) == 3 else path
fpath = f"{path}.unpack"
print("Reading index...")
with open(f"{path}.idx", 'r') as fh:
    index = json.load(fh)
print("Scanning files...")
index_with_sizes = []
for pos, file in index:
    skip = math.ceil(os.path.getsize(os.path.join(fpath, file)) / 1024)
    index_with_sizes.append((pos, file, skip))

def copy_blocks(source, dest, count):
    for _ in range(count):
        dest.write(source.read(1024))

def copy_blocks_padded(source, dest, count):
    # Copy a file back in, zero-padding its final block to 1 KiB.
    if count < 0:
        return
    for _ in range(count - 1):
        dest.write(source.read(1024))
    fdata = source.read(1024)
    dest.write(fdata)
    if len(fdata) < 1024:
        dest.write(bytes(1024 - len(fdata)))

print("Rebuilding file...")
total_blocks = math.ceil(os.path.getsize(f"{path}.bin") / 1024)
with open(write_path, 'wb') as fh:
    with open(f"{path}.bin", 'rb') as bin:
        blocks = 0
        blocks_copied = 0
        # Interleave runs of "extra" data from the .bin with the original
        # files, in index order.
        for pos, file, skip in index_with_sizes:
            head_copy = pos - blocks
            copy_blocks(bin, fh, head_copy)
            with open(os.path.join(fpath, file), 'rb') as fch:
                copy_blocks_padded(fch, fh, skip)
            blocks = pos + skip
            blocks_copied += head_copy
        tail_copy = total_blocks - blocks_copied
        copy_blocks(bin, fh, tail_copy)
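To confirm that the rebuild really is bit-for-bit identical, you can hash both files. A minimal sketch, assuming the rebuilt copy was written to a hypothetical .rebuilt path via the second argument:

import hashlib

def sha256_of(p):
    h = hashlib.sha256()
    with open(p, "rb") as fh:
        for chunk in iter(lambda: fh.read(1 << 20), b""):
            h.update(chunk)
    return h.hexdigest()

orig = sha256_of("ubuntu-21.04-desktop-amd64.iso")
rebuilt = sha256_of("ubuntu-21.04-desktop-amd64.iso.rebuilt")  # hypothetical output path
print("identical" if orig == rebuilt else "MISMATCH")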