dgnsrekt/filepacker.py

## filepacker.py
from pathlib import Path
from os import walk

import bson

#      ^------ pip install bson

# This is a concept for walking through a folder with nested folders, each containing files.

# This can easily be modified to take in a list of multiple target folders.
# There can also be a version for a single file.
"""
movie
├── into.txt
├── movie.txt
├── sub1
│   └── submovie.txt
└── sub2
    ├── submovie.txt
    └── submoviex.txt
"""
# ^ --- list of random files and folders to serialize

target_folder = "movie"  # <----- folder chosen by user.

# Everything is located relative to the directory that contains this script.
ROOT_PATH = Path(__file__)
ROOT_FOLDER = ROOT_PATH.parent
TARGET_PATH = ROOT_FOLDER / target_folder  # <- folder to pack, next to this script

# Validate with a real exception: `assert` statements are stripped when Python
# runs with -O, which would silently skip this check.
if not TARGET_PATH.resolve().exists():
    raise FileNotFoundError("Target path does not exist.")

# Parent directory of the resolved target folder. Directory names stored in
# the payloads are joined onto this to build absolute file paths.
ABOSLUTE_TARGET_FOLDER = TARGET_PATH.resolve().parent

print(ABOSLUTE_TARGET_FOLDER, "\n")

print(20 * "-", "SERIALIZATION START   ", "-" * 20)

package_data_file = []  # <- This represents a package.dat file.
manifest_file = []  # <- This represents the file_list.txt or manifest.txt

# Recursively iterate through the target folder and all nested folders.
for current_dir, _, files in walk(TARGET_PATH):
    # os.walk() yields absolute directories whenever __file__ is absolute, so
    # the directory is re-expressed relative to the target's parent folder
    # (e.g. "movie/sub1"). That keeps the payload independent of where the
    # script lives; forward slashes keep it independent of the platform.
    relative_dir = Path(current_dir).resolve().relative_to(ABOSLUTE_TARGET_FOLDER)
    dir_name = relative_dir.as_posix()

    for file_name in files:
        # Absolute path to the file being serialized.
        absolute_file_path = ABOSLUTE_TARGET_FOLDER / relative_dir / file_name

        # Raise instead of assert: asserts disappear under `python -O`.
        if not absolute_file_path.exists():
            raise FileNotFoundError("File does not exist.")

        size = absolute_file_path.stat().st_size  # file size in bytes

        # Manifest entry: file info without the bytes.
        manifest_entry = {"dir": dir_name, "file": {file_name: {"size": size}}}
        manifest_file.append(str(manifest_entry))
        # ^------- would be writing to manifest.dat here instead of appending to a list.

        # Also, there is no need to check file masking logic here. You might as well always
        # generate the manifest file. The file masking logic should be decided when creating
        # the header. At that point you can decide to use the manifest file or not.

        # A fresh payload per file (rather than one dict mutated in place)
        # carries the same info as the manifest entry plus the raw content.
        file_bytes = absolute_file_path.read_bytes()
        payload = {
            "dir": dir_name,
            "file": {file_name: {"size": size, "bytes": file_bytes}},
        }

        # Convert the payload dictionary to a BSON binary object.
        serialized = bson.dumps(payload)
        print(serialized, "\n")

        package_data_file.append(serialized)
        # ^------- would be writing to package.dat here instead of appending to a list.

print(20 * "-", "SERIALIZATION END    ", "-" * 20, "\n")

print("-" * 20, "DESERIALIZATION BEGIN", "-" * 20)
for serialized_payload in package_data_file:
    # bson.loads() turns each serialized payload back into a dictionary;
    # every decoded payload is printed followed by one blank line.
    print(bson.loads(serialized_payload), end="\n\n")

print("-" * 20, "DESERIALIZATION END", "-" * 20, "\n")

# Dump the manifest entries collected during serialization, one per line.
print("MANIFEST.txt")
for manifest_line in manifest_file:
    print(manifest_line)


# V ----- OUTPUT
"""
/home/dgnsrekt/dev/python/filepackager

-------------------- SERIALIZATION START    --------------------
b'X\x00\x00\x00\x02dir\x00\x06\x00\x00\x00movie\x00\x03file\x00>\x00\x00\x00\x03into.txt\x00/\x00\x00\x00\x10size\x00\x14\x00\x00\x00\x05bytes\x00\x14\x00\x00\x00\x00data from info.txt\r\n\x00\x00\x00'

b'\\\x00\x00\x00\x02dir\x00\x06\x00\x00\x00movie\x00\x03file\x00B\x00\x00\x00\x03movie.txt\x002\x00\x00\x00\x10size\x00\x17\x00\x00\x00\x05bytes\x00\x17\x00\x00\x00\x00data from movie.txt\r\n\r\n\x00\x00\x00'

b'\x84\x00\x00\x00\x02dir\x00\x0b\x00\x00\x00movie/sub2\x00\x03file\x00e\x00\x00\x00\x03submoviex.txt\x00Q\x00\x00\x00\x10size\x006\x00\x00\x00\x05bytes\x006\x00\x00\x00\x00data in submoviex.txt file in sub 2 foler!!!!!!!!!!!\r\n\x00\x00\x00'

b's\x00\x00\x00\x02dir\x00\x0b\x00\x00\x00movie/sub2\x00\x03file\x00T\x00\x00\x00\x03submovie.txt\x00A\x00\x00\x00\x10size\x00&\x00\x00\x00\x05bytes\x00&\x00\x00\x00\x00data from submove.txt in sub2 folder\r\n\x00\x00\x00'

b'e\x00\x00\x00\x02dir\x00\x0b\x00\x00\x00movie/sub1\x00\x03file\x00F\x00\x00\x00\x03submovie.txt\x003\x00\x00\x00\x10size\x00\x18\x00\x00\x00\x05bytes\x00\x18\x00\x00\x00\x00data from submovie.txt\r\n\x00\x00\x00'

-------------------- SERIALIZATION END     --------------------

-------------------- DESERIALIZATION BEGIN --------------------
{'dir': 'movie', 'file': {'into.txt': {'size': 20, 'bytes': b'data from info.txt\r\n'}}}

{'dir': 'movie', 'file': {'movie.txt': {'size': 23, 'bytes': b'data from movie.txt\r\n\r\n'}}}

{'dir': 'movie/sub2', 'file': {'submoviex.txt': {'size': 54, 'bytes': b'data in submoviex.txt file in sub 2 foler!!!!!!!!!!!\r\n'}}}

{'dir': 'movie/sub2', 'file': {'submovie.txt': {'size': 38, 'bytes': b'data from submove.txt in sub2 folder\r\n'}}}

{'dir': 'movie/sub1', 'file': {'submovie.txt': {'size': 24, 'bytes': b'data from submovie.txt\r\n'}}}

-------------------- DESERIALIZATION END --------------------

MANIFEST.txt
{'dir': 'movie', 'file': {'into.txt': {'size': 20}}}
{'dir': 'movie', 'file': {'movie.txt': {'size': 23}}}
{'dir': 'movie/sub2', 'file': {'submoviex.txt': {'size': 54}}}
{'dir': 'movie/sub2', 'file': {'submovie.txt': {'size': 38}}}
{'dir': 'movie/sub1', 'file': {'submovie.txt': {'size': 24}}}

"""

## filepacker_working.py
from pathlib import Path
from os import walk, remove
import bson
# TODO: document. This is a working version that has serialized -> package.dat -> deserialized a file over 6 GB; the SHA-256 sum verified a perfect copy.
#------ pip install pymongo


target_folder = "movie"

# Everything is located relative to the directory that contains this script.
ROOT_PATH = Path(__file__)
ROOT_FOLDER = ROOT_PATH.parent
TARGET_PATH = ROOT_FOLDER / target_folder

# Output files, written next to the script.
PACKAGE_DAT_PATH = ROOT_FOLDER / "package.dat"
MANIFEST_PATH = ROOT_FOLDER / "manifest.txt"

CHUNK_SIZE = (1024 * 1024) * 1024  # bytes read from a source file per chunk (1 GiB)

# Validate the input before touching any output, and use a real exception:
# `assert` statements are stripped when Python runs with -O.
if not TARGET_PATH.exists():
    raise FileNotFoundError("Target path does not exist.")

# Start from empty output files. Each file is handled on its own so a missing
# package.dat no longer prevents a stale manifest.txt from being removed, and
# only "file not there" is ignored; any other OS error is reported.
for output_path in (PACKAGE_DAT_PATH, MANIFEST_PATH):
    try:
        remove(output_path)
    except FileNotFoundError:
        pass
    output_path.touch()  # raises OSError itself if the file cannot be created

# Parent directory of the resolved target folder.
ABOSLUTE_TARGET_FOLDER = TARGET_PATH.resolve().parent

print(ABOSLUTE_TARGET_FOLDER, "\n")

print(20 * "-", "SERIALIZATION START   ", "-" * 20)

# Both output files are opened once in append mode, instead of being re-opened
# for every file and every chunk.
with MANIFEST_PATH.open(mode="a") as manifest_out:
    with PACKAGE_DAT_PATH.open(mode="ab") as package_out:
        for current_dir, _, files in walk(TARGET_PATH):
            # os.walk() yields absolute directories whenever __file__ is
            # absolute. Store the directory relative to the target's parent
            # (e.g. "movie/sub1") so that "new_" + dir is a sensible output
            # location when the package is restored.
            relative_dir = Path(current_dir).resolve().relative_to(
                ABOSLUTE_TARGET_FOLDER
            )
            dir_name = relative_dir.as_posix()

            for file_name in files:
                absolute_file_path = ABOSLUTE_TARGET_FOLDER / relative_dir / file_name

                # Raise instead of assert: asserts disappear under `python -O`.
                if not absolute_file_path.exists():
                    raise FileNotFoundError("File does not exist.")

                size = absolute_file_path.stat().st_size

                # Manifest line: file info without the bytes.
                manifest_entry = {"dir": dir_name, "file": {file_name: {"size": size}}}
                manifest_out.write(str(manifest_entry))
                manifest_out.write("\n")

                chunk_position = 0
                with absolute_file_path.open(mode="rb") as source:
                    while True:
                        chunk = source.read(CHUNK_SIZE)
                        if not chunk and chunk_position:
                            break  # end of file after at least one chunk

                        # One BSON document per chunk; "chunk" records its order.
                        payload = {
                            "dir": dir_name,
                            "file": {
                                file_name: {
                                    "size": size,
                                    "bytes": chunk,
                                    "chunk": chunk_position,
                                }
                            },
                        }
                        package_out.write(bson.encode(payload))
                        chunk_position += 1
                        print(".", end="", flush=True)

                        if not chunk:
                            # Zero-length file: a single empty chunk was
                            # written so the file is recreated on restore.
                            break

print(20 * "-", "SERIALIZATION END    ", "-" * 20, "\n")

print(20 * "-", "DESERIALIZATION BEGIN", "-" * 20)

with open(PACKAGE_DAT_PATH, mode="rb") as package_in:
    # Each decoded document holds one chunk of one file.
    for data in bson.decode_file_iter(package_in):
        # Restored files go into a sibling tree prefixed with "new_".
        output_dir = Path("new_" + data["dir"])
        # parents=True: a nested folder may be the first one encountered
        # (e.g. when the top-level folder contains no files of its own).
        output_dir.mkdir(parents=True, exist_ok=True)

        for filename, entry in data["file"].items():
            print(filename)

            file_location = output_dir / filename

            # The first chunk (index 0) starts the file from scratch so that
            # re-running the restore does not append to a previous result;
            # every later chunk is appended in the order it was written.
            write_mode = "wb" if entry.get("chunk") == 0 else "ab"
            with file_location.open(mode=write_mode) as file_w:
                file_w.write(entry["bytes"])

print(20 * "-", "DESERIALIZATION END", "-" * 20, "\n")
#
	from pathlib import Path
	from os import walk

	import bson

	# ^------ pip install bson

	# This is a concept for walking through a folder with nested folders, each containing files.

	# This can easily be modified to take in a list of multiple target folders.
	# There can also be a version for a single file.
	"""
	movie
	├── into.txt
	├── movie.txt
	├── sub1
	│   └── submovie.txt
	└── sub2
	    ├── submovie.txt
	    └── submoviex.txt
	"""
	# ^ --- list of random files and folders to serialize

	target_folder = "movie"  # <----- folder chosen by user.

	# Everything is located relative to the directory that contains this script.
	ROOT_PATH = Path(__file__)
	ROOT_FOLDER = ROOT_PATH.parent
	TARGET_PATH = ROOT_FOLDER / target_folder  # <- folder to pack, next to this script

	# Validate with a real exception: `assert` statements are stripped when Python
	# runs with -O, which would silently skip this check.
	if not TARGET_PATH.resolve().exists():
	    raise FileNotFoundError("Target path does not exist.")

	# Parent directory of the resolved target folder. Directory names stored in
	# the payloads are joined onto this to build absolute file paths.
	ABOSLUTE_TARGET_FOLDER = TARGET_PATH.resolve().parent

	print(ABOSLUTE_TARGET_FOLDER, "\n")

	print(20 * "-", "SERIALIZATION START ", "-" * 20)

	package_data_file = []  # <- This represents a package.dat file.
	manifest_file = []  # <- This represents the file_list.txt or manifest.txt

	# Recursively iterate through the target folder and all nested folders.
	for current_dir, _, files in walk(TARGET_PATH):
	    # os.walk() yields absolute directories whenever __file__ is absolute, so
	    # the directory is re-expressed relative to the target's parent folder
	    # (e.g. "movie/sub1"). That keeps the payload independent of where the
	    # script lives; forward slashes keep it independent of the platform.
	    relative_dir = Path(current_dir).resolve().relative_to(ABOSLUTE_TARGET_FOLDER)
	    dir_name = relative_dir.as_posix()

	    for file_name in files:
	        # Absolute path to the file being serialized.
	        absolute_file_path = ABOSLUTE_TARGET_FOLDER / relative_dir / file_name

	        # Raise instead of assert: asserts disappear under `python -O`.
	        if not absolute_file_path.exists():
	            raise FileNotFoundError("File does not exist.")

	        size = absolute_file_path.stat().st_size  # file size in bytes

	        # Manifest entry: file info without the bytes.
	        manifest_entry = {"dir": dir_name, "file": {file_name: {"size": size}}}
	        manifest_file.append(str(manifest_entry))
	        # ^------- would be writing to manifest.dat here instead of appending to a list.

	        # Also, there is no need to check file masking logic here. You might as well always
	        # generate the manifest file. The file masking logic should be decided when creating
	        # the header. At that point you can decide to use the manifest file or not.

	        # A fresh payload per file (rather than one dict mutated in place)
	        # carries the same info as the manifest entry plus the raw content.
	        file_bytes = absolute_file_path.read_bytes()
	        payload = {
	            "dir": dir_name,
	            "file": {file_name: {"size": size, "bytes": file_bytes}},
	        }

	        # Convert the payload dictionary to a BSON binary object.
	        serialized = bson.dumps(payload)
	        print(serialized, "\n")

	        package_data_file.append(serialized)
	        # ^------- would be writing to package.dat here instead of appending to a list.

	print(20 * "-", "SERIALIZATION END ", "-" * 20, "\n")

	print(20 * "-", "DESERIALIZATION BEGIN", "-" * 20)
	for serialized_payload in package_data_file:
	    # bson.loads() turns each serialized payload back into a dictionary;
	    # every decoded payload is printed followed by one blank line.
	    data = bson.loads(serialized_payload)

	    print(data)
	    print()

	print(20 * "-", "DESERIALIZATION END", "-" * 20, "\n")

	# Dump the manifest entries collected during serialization, one per line.
	print("MANIFEST.txt")
	for manifest_line in manifest_file:
	    print(manifest_line)


	# V ----- OUTPUT
	"""
	/home/dgnsrekt/dev/python/filepackager

	-------------------- SERIALIZATION START --------------------
	b'X\x00\x00\x00\x02dir\x00\x06\x00\x00\x00movie\x00\x03file\x00>\x00\x00\x00\x03into.txt\x00/\x00\x00\x00\x10size\x00\x14\x00\x00\x00\x05bytes\x00\x14\x00\x00\x00\x00data from info.txt\r\n\x00\x00\x00'

	b'\\\x00\x00\x00\x02dir\x00\x06\x00\x00\x00movie\x00\x03file\x00B\x00\x00\x00\x03movie.txt\x002\x00\x00\x00\x10size\x00\x17\x00\x00\x00\x05bytes\x00\x17\x00\x00\x00\x00data from movie.txt\r\n\r\n\x00\x00\x00'

	b'\x84\x00\x00\x00\x02dir\x00\x0b\x00\x00\x00movie/sub2\x00\x03file\x00e\x00\x00\x00\x03submoviex.txt\x00Q\x00\x00\x00\x10size\x006\x00\x00\x00\x05bytes\x006\x00\x00\x00\x00data in submoviex.txt file in sub 2 foler!!!!!!!!!!!\r\n\x00\x00\x00'

	b's\x00\x00\x00\x02dir\x00\x0b\x00\x00\x00movie/sub2\x00\x03file\x00T\x00\x00\x00\x03submovie.txt\x00A\x00\x00\x00\x10size\x00&\x00\x00\x00\x05bytes\x00&\x00\x00\x00\x00data from submove.txt in sub2 folder\r\n\x00\x00\x00'

	b'e\x00\x00\x00\x02dir\x00\x0b\x00\x00\x00movie/sub1\x00\x03file\x00F\x00\x00\x00\x03submovie.txt\x003\x00\x00\x00\x10size\x00\x18\x00\x00\x00\x05bytes\x00\x18\x00\x00\x00\x00data from submovie.txt\r\n\x00\x00\x00'

	-------------------- SERIALIZATION END --------------------

	-------------------- DESERIALIZATION BEGIN --------------------
	{'dir': 'movie', 'file': {'into.txt': {'size': 20, 'bytes': b'data from info.txt\r\n'}}}

	{'dir': 'movie', 'file': {'movie.txt': {'size': 23, 'bytes': b'data from movie.txt\r\n\r\n'}}}

	{'dir': 'movie/sub2', 'file': {'submoviex.txt': {'size': 54, 'bytes': b'data in submoviex.txt file in sub 2 foler!!!!!!!!!!!\r\n'}}}

	{'dir': 'movie/sub2', 'file': {'submovie.txt': {'size': 38, 'bytes': b'data from submove.txt in sub2 folder\r\n'}}}

	{'dir': 'movie/sub1', 'file': {'submovie.txt': {'size': 24, 'bytes': b'data from submovie.txt\r\n'}}}

	-------------------- DESERIALIZATION END --------------------

	MANIFEST.txt
	{'dir': 'movie', 'file': {'into.txt': {'size': 20}}}
	{'dir': 'movie', 'file': {'movie.txt': {'size': 23}}}
	{'dir': 'movie/sub2', 'file': {'submoviex.txt': {'size': 54}}}
	{'dir': 'movie/sub2', 'file': {'submovie.txt': {'size': 38}}}
	{'dir': 'movie/sub1', 'file': {'submovie.txt': {'size': 24}}}

	"""
	from pathlib import Path
	from os import walk, remove
	import bson
	# TODO: document. This is a working version that has serialized -> package.dat -> deserialized a file over 6 GB; the SHA-256 sum verified a perfect copy.
	#------ pip install pymongo


	target_folder = "movie"

	# Everything is located relative to the directory that contains this script.
	ROOT_PATH = Path(__file__)
	ROOT_FOLDER = ROOT_PATH.parent
	TARGET_PATH = ROOT_FOLDER / target_folder

	# Output files, written next to the script.
	PACKAGE_DAT_PATH = ROOT_FOLDER / "package.dat"
	MANIFEST_PATH = ROOT_FOLDER / "manifest.txt"

	CHUNK_SIZE = (1024 * 1024) * 1024  # bytes read from a source file per chunk (1 GiB)

	# Validate the input before touching any output, and use a real exception:
	# `assert` statements are stripped when Python runs with -O.
	if not TARGET_PATH.exists():
	    raise FileNotFoundError("Target path does not exist.")

	# Start from empty output files. Each file is handled on its own so a missing
	# package.dat no longer prevents a stale manifest.txt from being removed, and
	# only "file not there" is ignored; any other OS error is reported.
	for output_path in (PACKAGE_DAT_PATH, MANIFEST_PATH):
	    try:
	        remove(output_path)
	    except FileNotFoundError:
	        pass
	    output_path.touch()  # raises OSError itself if the file cannot be created

	# Parent directory of the resolved target folder.
	ABOSLUTE_TARGET_FOLDER = TARGET_PATH.resolve().parent

	print(ABOSLUTE_TARGET_FOLDER, "\n")

	print(20 * "-", "SERIALIZATION START ", "-" * 20)

	# Both output files are opened once in append mode, instead of being re-opened
	# for every file and every chunk.
	with MANIFEST_PATH.open(mode="a") as manifest_out:
	    with PACKAGE_DAT_PATH.open(mode="ab") as package_out:
	        for current_dir, _, files in walk(TARGET_PATH):
	            # os.walk() yields absolute directories whenever __file__ is
	            # absolute. Store the directory relative to the target's parent
	            # (e.g. "movie/sub1") so that "new_" + dir is a sensible output
	            # location when the package is restored.
	            relative_dir = Path(current_dir).resolve().relative_to(
	                ABOSLUTE_TARGET_FOLDER
	            )
	            dir_name = relative_dir.as_posix()

	            for file_name in files:
	                absolute_file_path = ABOSLUTE_TARGET_FOLDER / relative_dir / file_name

	                # Raise instead of assert: asserts disappear under `python -O`.
	                if not absolute_file_path.exists():
	                    raise FileNotFoundError("File does not exist.")

	                size = absolute_file_path.stat().st_size

	                # Manifest line: file info without the bytes.
	                manifest_entry = {"dir": dir_name, "file": {file_name: {"size": size}}}
	                manifest_out.write(str(manifest_entry))
	                manifest_out.write("\n")

	                chunk_position = 0
	                with absolute_file_path.open(mode="rb") as source:
	                    while True:
	                        chunk = source.read(CHUNK_SIZE)
	                        if not chunk and chunk_position:
	                            break  # end of file after at least one chunk

	                        # One BSON document per chunk; "chunk" records its order.
	                        payload = {
	                            "dir": dir_name,
	                            "file": {
	                                file_name: {
	                                    "size": size,
	                                    "bytes": chunk,
	                                    "chunk": chunk_position,
	                                }
	                            },
	                        }
	                        package_out.write(bson.encode(payload))
	                        chunk_position += 1
	                        print(".", end="", flush=True)

	                        if not chunk:
	                            # Zero-length file: a single empty chunk was
	                            # written so the file is recreated on restore.
	                            break

	print(20 * "-", "SERIALIZATION END ", "-" * 20, "\n")

	print(20 * "-", "DESERIALIZATION BEGIN", "-" * 20)

	with open(PACKAGE_DAT_PATH, mode="rb") as package_in:
	    # Each decoded document holds one chunk of one file.
	    for data in bson.decode_file_iter(package_in):
	        # Restored files go into a sibling tree prefixed with "new_".
	        output_dir = Path("new_" + data["dir"])
	        # parents=True: a nested folder may be the first one encountered
	        # (e.g. when the top-level folder contains no files of its own).
	        output_dir.mkdir(parents=True, exist_ok=True)

	        for filename, entry in data["file"].items():
	            print(filename)

	            file_location = output_dir / filename

	            # The first chunk (index 0) starts the file from scratch so that
	            # re-running the restore does not append to a previous result;
	            # every later chunk is appended in the order it was written.
	            write_mode = "wb" if entry.get("chunk") == 0 else "ab"
	            with file_location.open(mode=write_mode) as file_w:
	                file_w.write(entry["bytes"])

	print(20 * "-", "DESERIALIZATION END", "-" * 20, "\n")
	#