Skip to content

Instantly share code, notes, and snippets.

@dgnsrekt
Last active November 6, 2019 09:48
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save dgnsrekt/7ec29400dec71f21245d5abc848ae0d8 to your computer and use it in GitHub Desktop.
Save dgnsrekt/7ec29400dec71f21245d5abc848ae0d8 to your computer and use it in GitHub Desktop.
from pathlib import Path
from os import walk
import bson
# ^------ pip install bson
# This is a concept for walking through a folder with nested folders, each containing files.
# This can easily be modified to take in a list of multiple target folders.
# There can also be a version for a single file.
"""
movie
├── into.txt
├── movie.txt
├── sub1
│   └── submovie.txt
└── sub2
    ├── submovie.txt
    └── submoviex.txt
"""
# ^ --- list of random files and folders to serialize
# Concept script: walk a target folder, build one payload per file, serialize
# each payload to BSON in memory, then deserialize everything again and print
# the manifest. The two lists below stand in for the package.dat and
# manifest files that a real implementation would write to disk.
target_folder = "movie"  # <- folder chosen by the user.

ROOT_PATH = Path(__file__)
ROOT_FOLDER = ROOT_PATH.parent
TARGET_PATH = ROOT_FOLDER / target_folder  # <- located next to this script

# Raise explicitly instead of using assert: asserts are stripped by `python -O`.
if not TARGET_PATH.resolve().exists():
    raise FileNotFoundError("Target path does not exist.")

# Absolute parent of the target folder; every "dir" stored in a payload is
# expressed relative to this folder.
ABOSLUTE_TARGET_FOLDER = TARGET_PATH.resolve().parent
print(ABOSLUTE_TARGET_FOLDER, "\n")

print(20 * "-", "SERIALIZATION START ", "-" * 20)

package_data_file = []  # <- This represents a package.dat file.
manifest_file = []  # <- This represents the file_list.txt or manifest.txt

# Walk the resolved target so every directory yielded is absolute. __file__ may
# be relative or absolute depending on how the script is launched; converting
# back to a relative path keeps the stored "dir" identical in both cases.
for current_dir, _, files in walk(TARGET_PATH.resolve()):
    current_dir_path = Path(current_dir)
    relative_dir = current_dir_path.relative_to(ABOSLUTE_TARGET_FOLDER).as_posix()

    # One payload dict per directory; its "file" entry is replaced per file.
    payload = {"dir": relative_dir}

    for file_name in files:
        absolute_file_path = current_dir_path / file_name
        if not absolute_file_path.exists():
            raise FileNotFoundError("File does not exist.")

        size = absolute_file_path.stat().st_size  # file size in bytes

        # Manifest entry: file info WITHOUT the file bytes.
        payload["file"] = {file_name: {"size": size}}
        manifest_file.append(str(payload))
        # ^ A real implementation would write to the manifest file here
        #   instead of appending to a list. There is no need to apply file
        #   masking logic at this point: always generate the manifest and
        #   decide whether to use it when the header is created.

        # Package entry: same info plus the raw file content.
        file_bytes = absolute_file_path.read_bytes()
        payload["file"] = {file_name: {"size": size, "bytes": file_bytes}}

        # Convert the payload dictionary to a BSON binary object.
        serialized = bson.dumps(payload)
        print(serialized, "\n")

        # A real implementation would write to package.dat here.
        package_data_file.append(serialized)

print(20 * "-", "SERIALIZATION END ", "-" * 20, "\n")

print(20 * "-", "DESERIALIZATION BEGIN", "-" * 20)

for serialized_payload in package_data_file:
    # Each decoded BSON document can be used like a regular dictionary.
    data = bson.loads(serialized_payload)
    print(data)
    print()

print(20 * "-", "DESERIALIZATION END", "-" * 20, "\n")

print("MANIFEST.txt")
for manifest_entry in manifest_file:
    print(manifest_entry)
# V ----- OUTPUT
"""
/home/dgnsrekt/dev/python/filepackager
-------------------- SERIALIZATION START --------------------
b'X\x00\x00\x00\x02dir\x00\x06\x00\x00\x00movie\x00\x03file\x00>\x00\x00\x00\x03into.txt\x00/\x00\x00\x00\x10size\x00\x14\x00\x00\x00\x05bytes\x00\x14\x00\x00\x00\x00data from info.txt\r\n\x00\x00\x00'
b'\\\x00\x00\x00\x02dir\x00\x06\x00\x00\x00movie\x00\x03file\x00B\x00\x00\x00\x03movie.txt\x002\x00\x00\x00\x10size\x00\x17\x00\x00\x00\x05bytes\x00\x17\x00\x00\x00\x00data from movie.txt\r\n\r\n\x00\x00\x00'
b'\x84\x00\x00\x00\x02dir\x00\x0b\x00\x00\x00movie/sub2\x00\x03file\x00e\x00\x00\x00\x03submoviex.txt\x00Q\x00\x00\x00\x10size\x006\x00\x00\x00\x05bytes\x006\x00\x00\x00\x00data in submoviex.txt file in sub 2 foler!!!!!!!!!!!\r\n\x00\x00\x00'
b's\x00\x00\x00\x02dir\x00\x0b\x00\x00\x00movie/sub2\x00\x03file\x00T\x00\x00\x00\x03submovie.txt\x00A\x00\x00\x00\x10size\x00&\x00\x00\x00\x05bytes\x00&\x00\x00\x00\x00data from submove.txt in sub2 folder\r\n\x00\x00\x00'
b'e\x00\x00\x00\x02dir\x00\x0b\x00\x00\x00movie/sub1\x00\x03file\x00F\x00\x00\x00\x03submovie.txt\x003\x00\x00\x00\x10size\x00\x18\x00\x00\x00\x05bytes\x00\x18\x00\x00\x00\x00data from submovie.txt\r\n\x00\x00\x00'
-------------------- SERIALIZATION END --------------------
-------------------- DESERIALIZATION BEGIN --------------------
{'dir': 'movie', 'file': {'into.txt': {'size': 20, 'bytes': b'data from info.txt\r\n'}}}
{'dir': 'movie', 'file': {'movie.txt': {'size': 23, 'bytes': b'data from movie.txt\r\n\r\n'}}}
{'dir': 'movie/sub2', 'file': {'submoviex.txt': {'size': 54, 'bytes': b'data in submoviex.txt file in sub 2 foler!!!!!!!!!!!\r\n'}}}
{'dir': 'movie/sub2', 'file': {'submovie.txt': {'size': 38, 'bytes': b'data from submove.txt in sub2 folder\r\n'}}}
{'dir': 'movie/sub1', 'file': {'submovie.txt': {'size': 24, 'bytes': b'data from submovie.txt\r\n'}}}
-------------------- DESERIALIZATION END --------------------
MANIFEST.txt
{'dir': 'movie', 'file': {'into.txt': {'size': 20}}}
{'dir': 'movie', 'file': {'movie.txt': {'size': 23}}}
{'dir': 'movie/sub2', 'file': {'submoviex.txt': {'size': 54}}}
{'dir': 'movie/sub2', 'file': {'submovie.txt': {'size': 38}}}
{'dir': 'movie/sub1', 'file': {'submovie.txt': {'size': 24}}}
"""
from pathlib import Path
from os import walk, remove
import bson
# NEEDS DOCUMENTATION. This is a working version that has SERIALIZED -> PACKAGE.DAT -> DESERIALIZED
# a file over 6 GB; the SHA256 sum verified a perfect copy.
# ------ pip install pymongo
# Chunked packager: walks the target folder, appends one BSON document per
# file chunk to package.dat (and one line per file to manifest.txt), then
# reads package.dat back and rebuilds the tree under "new_<dir>".
target_folder = "movie"

ROOT_PATH = Path(__file__)
ROOT_FOLDER = ROOT_PATH.parent
TARGET_PATH = ROOT_FOLDER / target_folder
PACKAGE_DAT_PATH = ROOT_FOLDER / "package.dat"
MANIFEST_PATH = ROOT_FOLDER / "manifest.txt"
CHUNK_SIZE = (1024 * 1024) * 1024  # bytes read from a source file per chunk

# Remove leftovers of a previous run. Each file is handled on its own so that
# a missing package.dat no longer prevents a stale manifest.txt from being
# deleted (both files are opened in append mode below). Only "file not found"
# is ignored; any other failure is surfaced.
for stale_output in (PACKAGE_DAT_PATH, MANIFEST_PATH):
    try:
        remove(stale_output)
    except FileNotFoundError:
        pass

# touch() raises if a file cannot be created, so no extra existence check is
# needed afterwards.
PACKAGE_DAT_PATH.touch()
MANIFEST_PATH.touch()

# Raise explicitly instead of using assert: asserts are stripped by `python -O`.
if not TARGET_PATH.exists():
    raise FileNotFoundError("Target path does not exist.")

# Absolute parent of the target folder; every "dir" stored in a payload is
# expressed relative to this folder.
ABOSLUTE_TARGET_FOLDER = TARGET_PATH.resolve().parent
print(ABOSLUTE_TARGET_FOLDER, "\n")

print(20 * "-", "SERIALIZATION START ", "-" * 20)

# Open both outputs once instead of reopening them for every file / chunk.
with MANIFEST_PATH.open(mode="a") as manifest, PACKAGE_DAT_PATH.open(mode="ab") as package:
    # Walk the resolved target so every directory yielded is absolute, then
    # store it relative to the target's parent. This keeps "dir" relative no
    # matter whether __file__ was given as a relative or an absolute path,
    # which the restore step depends on ("new_" + dir).
    for current_dir, _, files in walk(TARGET_PATH.resolve()):
        current_dir_path = Path(current_dir)
        relative_dir = current_dir_path.relative_to(ABOSLUTE_TARGET_FOLDER).as_posix()

        # One payload dict per directory; its "file" entry is replaced per chunk.
        payload = {"dir": relative_dir}

        for file_name in files:
            absolute_file_path = current_dir_path / file_name
            if not absolute_file_path.exists():
                raise FileNotFoundError("File does not exist.")

            size = absolute_file_path.stat().st_size

            # Manifest line: file info without the file bytes.
            payload["file"] = {file_name: {"size": size}}
            manifest.write(str(payload))
            manifest.write("\n")

            with absolute_file_path.open(mode="rb") as source:
                chunk_position = 0
                # Read before the loop so that a zero-byte file still emits
                # one (empty) record and is recreated on restore.
                chunk = source.read(CHUNK_SIZE)
                while True:
                    payload["file"] = {
                        file_name: {"size": size, "bytes": chunk, "chunk": chunk_position}
                    }
                    package.write(bson.encode(payload))
                    chunk_position += 1
                    print(".", end="", flush=True)  # progress: one dot per chunk

                    chunk = source.read(CHUNK_SIZE)
                    if not chunk:
                        break

print(20 * "-", "SERIALIZATION END ", "-" * 20, "\n")

print(20 * "-", "DESERIALIZATION BEGIN", "-" * 20)

# NOTE(review): "dir" and the file names come straight from package.dat and are
# used to build output paths; validate them before restoring untrusted packages.
with PACKAGE_DAT_PATH.open(mode="rb") as package:
    for data in bson.decode_file_iter(package):
        output_dir = Path("new_" + data["dir"])
        # parents=True: a parent folder that contained no files has no record
        # of its own, so it may not exist yet when a nested folder is restored.
        output_dir.mkdir(parents=True, exist_ok=True)

        for filename, entry in data["file"].items():
            print(filename)
            file_location = output_dir / filename
            # The first chunk truncates any file left by an earlier restore;
            # later chunks are appended in the order they were written.
            write_mode = "wb" if entry["chunk"] == 0 else "ab"
            with file_location.open(mode=write_mode) as restored:
                restored.write(entry["bytes"])

print(20 * "-", "DESERIALIZATION END", "-" * 20, "\n")
#
@dgnsrekt
Copy link
Author

dgnsrekt commented Nov 6, 2019

Not sure why the size needs to be there.
May add the first 4 and last 4 bytes of the hash to each chunk.

@dgnsrekt
Copy link
Author

dgnsrekt commented Nov 6, 2019

{'dir': 'movie/sub1', 'filename': 'submovie.txt', 'size': 24, 'bytes': b'data from submovie.txt\r\n', 'hash':'A8FB9382'} <-- hash first and last four

@dgnsrekt
Copy link
Author

dgnsrekt commented Nov 6, 2019

Maybe add compression to chunks prior to serialization.

@dgnsrekt
Copy link
Author

dgnsrekt commented Nov 6, 2019

manifest after each payload file gives the ability to show number of chunks in manifest.

@dgnsrekt
Copy link
Author

dgnsrekt commented Nov 6, 2019

size should be len of chunk

@dgnsrekt
Copy link
Author

dgnsrekt commented Nov 6, 2019

while-else into the manifest.

@dgnsrekt
Copy link
Author

dgnsrekt commented Nov 6, 2019

size of payload without file bytes to determine next read.

@dgnsrekt
Copy link
Author

dgnsrekt commented Nov 6, 2019

bytes to data

@dgnsrekt
Copy link
Author

dgnsrekt commented Nov 6, 2019

zlib

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment