Last active
November 6, 2019 09:48
-
-
Save dgnsrekt/7ec29400dec71f21245d5abc848ae0d8 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from pathlib import Path | |
from os import walk | |
import bson | |
# ^------ pip install bson | |
# This is a concept for walking through a folder with nested folders, each containing files.
# It can easily be modified to take in a list of multiple target folders.
# There could also be a version for a single file.
""" | |
movie | |
├── into.txt | |
├── movie.txt | |
├── sub1 | |
│ └── submovie.txt | |
└── sub2 | |
├── submovie.txt | |
└── submoviex.txt | |
""" | |
# ^ --- list of random files and folders to serialize | |
# Concept script: walk a target folder, build one BSON payload per file, and
# keep a byte-free "manifest" entry alongside each payload. Everything is held
# in memory here; the lists stand in for package.dat and manifest.txt.

target_folder = "movie"  # <----- folder chosen by user.
ROOT_PATH = Path(__file__)
ROOT_FOLDER = ROOT_PATH.parent
TARGET_PATH = ROOT_FOLDER / target_folder  # <- sits next to this script

# Raise instead of assert: asserts are stripped when Python runs with -O.
if not TARGET_PATH.resolve().exists():
    raise FileNotFoundError("Target path does not exist.")

ABOSLUTE_TARGET_FOLDER = TARGET_PATH.resolve().parent
print(ABOSLUTE_TARGET_FOLDER, "\n")

print(20 * "-", "SERIALIZATION START ", "-" * 20)

package_data_file = []  # <- This represents a package.dat file.
manifest_file = []  # <- This represents the file_list.txt or manifest.txt

# os.walk recursively iterates through all the nested folders.
for current_dir, _, files in walk(TARGET_PATH):
    # Store the directory relative to the script folder, with "/" separators.
    # os.walk yields paths prefixed with TARGET_PATH, which is absolute when
    # __file__ is absolute; recording that verbatim would bake machine-specific
    # paths into the package.
    relative_dir = Path(current_dir).relative_to(ROOT_FOLDER).as_posix()

    for file_name in files:
        source_path = Path(current_dir) / file_name
        # stat() raises FileNotFoundError itself if the file vanished.
        size = source_path.stat().st_size  # file size in bytes

        # Manifest entry: same shape as the payload, without the file bytes.
        # A real implementation would write this line to the manifest file
        # instead of appending to a list. There is no need for file-masking
        # logic here: always generate the manifest, and decide whether to use
        # it when the header is created.
        manifest_entry = {"dir": relative_dir, "file": {file_name: {"size": size}}}
        manifest_file.append(str(manifest_entry))

        # Full payload: a fresh dict per file so no state leaks between files.
        file_bytes = source_path.read_bytes()
        payload = {
            "dir": relative_dir,
            "file": {file_name: {"size": size, "bytes": file_bytes}},
        }
        serialized = bson.dumps(payload)  # payload dict -> BSON binary object
        print(serialized, "\n")
        # A real implementation would write to package.dat here.
        package_data_file.append(serialized)

print(20 * "-", "SERIALIZATION END ", "-" * 20, "\n")

print(20 * "-", "DESERIALIZATION BEGIN", "-" * 20)
for serialized_payload in package_data_file:
    # Each decoded BSON document can be used like a plain dictionary.
    data = bson.loads(serialized_payload)
    print(data)
    print()
print(20 * "-", "DESERIALIZATION END", "-" * 20, "\n")

print("MANIFEST.txt")
for manifest_line in manifest_file:
    print(manifest_line)
# V ----- OUTPUT | |
""" | |
/home/dgnsrekt/dev/python/filepackager | |
-------------------- SERIALIZATION START -------------------- | |
b'X\x00\x00\x00\x02dir\x00\x06\x00\x00\x00movie\x00\x03file\x00>\x00\x00\x00\x03into.txt\x00/\x00\x00\x00\x10size\x00\x14\x00\x00\x00\x05bytes\x00\x14\x00\x00\x00\x00data from info.txt\r\n\x00\x00\x00' | |
b'\\\x00\x00\x00\x02dir\x00\x06\x00\x00\x00movie\x00\x03file\x00B\x00\x00\x00\x03movie.txt\x002\x00\x00\x00\x10size\x00\x17\x00\x00\x00\x05bytes\x00\x17\x00\x00\x00\x00data from movie.txt\r\n\r\n\x00\x00\x00' | |
b'\x84\x00\x00\x00\x02dir\x00\x0b\x00\x00\x00movie/sub2\x00\x03file\x00e\x00\x00\x00\x03submoviex.txt\x00Q\x00\x00\x00\x10size\x006\x00\x00\x00\x05bytes\x006\x00\x00\x00\x00data in submoviex.txt file in sub 2 foler!!!!!!!!!!!\r\n\x00\x00\x00' | |
b's\x00\x00\x00\x02dir\x00\x0b\x00\x00\x00movie/sub2\x00\x03file\x00T\x00\x00\x00\x03submovie.txt\x00A\x00\x00\x00\x10size\x00&\x00\x00\x00\x05bytes\x00&\x00\x00\x00\x00data from submove.txt in sub2 folder\r\n\x00\x00\x00' | |
b'e\x00\x00\x00\x02dir\x00\x0b\x00\x00\x00movie/sub1\x00\x03file\x00F\x00\x00\x00\x03submovie.txt\x003\x00\x00\x00\x10size\x00\x18\x00\x00\x00\x05bytes\x00\x18\x00\x00\x00\x00data from submovie.txt\r\n\x00\x00\x00' | |
-------------------- SERIALIZATION END -------------------- | |
-------------------- DESERIALIZATION BEGIN -------------------- | |
{'dir': 'movie', 'file': {'into.txt': {'size': 20, 'bytes': b'data from info.txt\r\n'}}} | |
{'dir': 'movie', 'file': {'movie.txt': {'size': 23, 'bytes': b'data from movie.txt\r\n\r\n'}}} | |
{'dir': 'movie/sub2', 'file': {'submoviex.txt': {'size': 54, 'bytes': b'data in submoviex.txt file in sub 2 foler!!!!!!!!!!!\r\n'}}} | |
{'dir': 'movie/sub2', 'file': {'submovie.txt': {'size': 38, 'bytes': b'data from submove.txt in sub2 folder\r\n'}}} | |
{'dir': 'movie/sub1', 'file': {'submovie.txt': {'size': 24, 'bytes': b'data from submovie.txt\r\n'}}} | |
-------------------- DESERIALIZATION END -------------------- | |
MANIFEST.txt | |
{'dir': 'movie', 'file': {'into.txt': {'size': 20}}} | |
{'dir': 'movie', 'file': {'movie.txt': {'size': 23}}} | |
{'dir': 'movie/sub2', 'file': {'submoviex.txt': {'size': 54}}} | |
{'dir': 'movie/sub2', 'file': {'submovie.txt': {'size': 38}}} | |
{'dir': 'movie/sub1', 'file': {'submovie.txt': {'size': 24}}} | |
""" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from pathlib import Path | |
from os import walk, remove | |
import bson | |
# TODO: document this. It is a working version: a file over 6 GB was serialized -> package.dat -> deserialized, and the SHA-256 sum verified a perfect copy.
#------ pip install pymongo | |
# Chunked packager: every file under the target folder is split into
# CHUNK_SIZE pieces, each piece is written to package.dat as one BSON
# document, and a byte-free description of each file goes to manifest.txt.
# The package is then read back and the files are recreated under
# "new_<dir>" relative to the current working directory.

target_folder = "movie"
ROOT_PATH = Path(__file__)
ROOT_FOLDER = ROOT_PATH.parent
TARGET_PATH = ROOT_FOLDER / target_folder
PACKAGE_DAT_PATH = ROOT_FOLDER / "package.dat"
MANIFEST_PATH = ROOT_FOLDER / "manifest.txt"
CHUNK_SIZE = (1024 * 1024) * 1024  # bytes read per chunk (1 GiB)

# Start from empty output files. Each removal is attempted on its own so a
# missing package.dat can no longer leave a stale manifest.txt behind, and
# only "file not there" is ignored; other errors (e.g. permissions) surface.
for stale_path in (PACKAGE_DAT_PATH, MANIFEST_PATH):
    try:
        remove(stale_path)
    except FileNotFoundError:
        pass
PACKAGE_DAT_PATH.touch()  # touch() raises on failure, so no exists() check needed
MANIFEST_PATH.touch()

# Raise instead of assert: asserts are stripped when Python runs with -O.
if not TARGET_PATH.exists():
    raise FileNotFoundError("Target path does not exist.")

ABOSLUTE_TARGET_FOLDER = TARGET_PATH.resolve().parent
print(ABOSLUTE_TARGET_FOLDER, "\n")

print(20 * "-", "SERIALIZATION START ", "-" * 20)
# Both outputs are opened once for the whole run instead of once per chunk.
with MANIFEST_PATH.open(mode="a") as manifest_out, PACKAGE_DAT_PATH.open(
    mode="ab"
) as package_out:
    for current_dir, _, files in walk(TARGET_PATH):
        # Record the directory relative to the script folder with "/"
        # separators. os.walk yields TARGET_PATH-prefixed paths, which are
        # absolute when __file__ is absolute; those would break the
        # "new_" + dir output path built during extraction.
        relative_dir = Path(current_dir).relative_to(ROOT_FOLDER).as_posix()

        for file_name in files:
            source_path = Path(current_dir) / file_name
            # stat() raises FileNotFoundError itself if the file vanished.
            size = source_path.stat().st_size

            manifest_entry = {"dir": relative_dir, "file": {file_name: {"size": size}}}
            manifest_out.write(str(manifest_entry))
            manifest_out.write("\n")

            chunk_position = 0
            with source_path.open(mode="rb") as source:
                while True:
                    chunk = source.read(CHUNK_SIZE)
                    # An empty read marks end of file. The one exception is
                    # the very first read: an empty file still gets a single
                    # empty chunk so that it is recreated on extraction.
                    if not chunk and chunk_position > 0:
                        break
                    payload = {
                        "dir": relative_dir,
                        "file": {
                            file_name: {
                                "size": size,
                                "bytes": chunk,
                                "chunk": chunk_position,
                            }
                        },
                    }
                    package_out.write(bson.encode(payload))
                    chunk_position += 1
                    print(".", end="", flush=True)  # progress marker per chunk
print(20 * "-", "SERIALIZATION END ", "-" * 20, "\n")

print(20 * "-", "DESERIALIZATION BEGIN", "-" * 20)
# NOTE(review): directory and file names are taken from the package as-is;
# validate them before extracting a package from an untrusted source.
with open(PACKAGE_DAT_PATH, mode="rb") as package_in:
    for data in bson.decode_file_iter(package_in):
        output_dir = Path("new_" + data["dir"])
        # parents=True: a nested directory may appear in the package before
        # (or without) its parent, e.g. when the parent holds no files.
        output_dir.mkdir(parents=True, exist_ok=True)
        for filename, entry in data["file"].items():
            print(filename)
            # Chunk 0 starts the file from scratch so re-running the script
            # does not append onto a previous extraction; later chunks append.
            write_mode = "wb" if entry["chunk"] == 0 else "ab"
            with (output_dir / filename).open(mode=write_mode) as file_w:
                file_w.write(entry["bytes"])
print(20 * "-", "DESERIALIZATION END", "-" * 20, "\n")
# |
{'dir': 'movie/sub1', 'filename': 'submovie.txt', 'size': 24, 'bytes': b'data from submovie.txt\r\n', 'hash':'A8FB9382'} <-- hash first and last four
Maybe add compression to chunks prior to serialization.
manifest after each payload file gives the ability to show number of chunks in manifest.
size should be len of chunk
while-else into the manifest.
size of payload without file bytes to determine next read.
bytes to data
zlib
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Not sure why the size needs to be there.
May add the first 4 and last 4 bytes of the hash to each chunk.