Last active
April 16, 2018 21:50
-
-
Save radeusgd/6119fa1528fc1fb0b1d26b287bd33db8 to your computer and use it in GitHub Desktop.
A set of small tools to compute file hashes and to compare original file sizes against their compressed sizes.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import datetime
import hashlib
import os
import subprocess
import sys
def run_proc(*args):
    """Run the given command and return its captured standard output (bytes)."""
    command = list(args)
    return subprocess.check_output(command)
def filesize(path):
    """Return the size in bytes of the file at *path*."""
    return os.stat(path).st_size
def test_compressor(path, compressor): | |
os.system(compressor + " < \"" + path + "\" > tmp.gxz") | |
return filesize("tmp.gxz") | |
def checksum(path):
    """Return the SHA-256 hex digest (str) of the file at *path*.

    Fixed: the original shelled out to ``sha256sum`` and parsed its output,
    which raises TypeError on Python 3 because ``subprocess.check_output``
    returns bytes while ``o.find(" = ")`` expects str. Computing the digest
    with hashlib yields the same lowercase hex string, is portable, and
    avoids the external-tool dependency. The file is read in chunks so large
    files are hashed without loading them fully into memory.
    """
    digest = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(65536), b""):
            digest.update(chunk)
    return digest.hexdigest()
# First CLI argument: the directory to analyse, relative to the current dir.
basedir = os.path.join(os.curdir, sys.argv[1])

# Second CLI argument selects the statistic backend and the output file:
# 'sha' records checksums, 'gz' gzip-compressed sizes, anything else xz sizes.
mode = sys.argv[2]
if mode == 'sha':
    backend = checksum
    result = "sha.txt"
else:
    compressor = "gzip" if mode == 'gz' else "xz"
    result = ("gz" if mode == 'gz' else "xz") + ".txt"

    def backend(path, _cmd=compressor):
        """Compressed size of *path* under the selected compressor."""
        return test_compressor(path, _cmd)

print("Starting at {d}".format(d=datetime.datetime.now()))
print("Analyzing {path} into {res}".format(path=os.path.abspath(basedir), res=result))
def write_header(out):
    """Write the CSV header row to the open file object *out*."""
    columns = ("path", "size", "statistic")
    out.write(",".join(columns) + "\n")
def handle_file(path, out):
    """Append one CSV row for *path*: relative path, byte size, backend statistic.

    Relies on the module-level ``basedir`` and ``backend`` set up earlier in
    the script.
    """
    rel = os.path.relpath(path, basedir)
    row = "{p},{size},{stat}\n".format(p=rel, size=filesize(path), stat=backend(path))
    out.write(row)
# Read the list of relative paths produced by the scanner script.
paths = []
with open("files.txt", "r") as fin:
    for line in fin:
        paths.append(line.strip())

# Fixed: len(paths) is hoisted out of the loop, the manual counter is replaced
# by enumerate, the loop variable no longer shadows the earlier file-handle
# name `f`, and an empty files.txt no longer raises ZeroDivisionError.
total = len(paths)
print("Will process {fc} files".format(fc=total))
with open(result, "w") as out:
    write_header(out)
    for done, relpath in enumerate(paths):
        full_path = os.path.join(basedir, relpath)
        pcs = int(100 * done / total) if total else 100
        sys.stdout.write("{pcs}% ({done}/{whole})\r".format(pcs=pcs, done=done, whole=total))
        sys.stdout.flush()
        handle_file(full_path, out)
        out.flush()  # keep partial results on disk if the run is interrupted
print("Finished at {d}".format(d=datetime.datetime.now()))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import sys
from csv import DictReader

# Sum the original and compressed sizes from an analyzer CSV
# (columns: path,size,statistic — with 'statistic' holding a compressed size)
# and report the overall compression ratio.
normal = 0
compressed = 0
with open(sys.argv[1]) as f:
    for e in DictReader(f):
        compressed += int(e['statistic'])
        normal += int(e['size'])

# Fixed: an empty CSV (normal == 0) no longer raises ZeroDivisionError.
pc = int(100 * compressed / normal) if normal else 0
print("Original size:", normal)
print("Compressed size:", compressed)
print("Gain:", pc, "%")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import datetime
import os
import sys

# Walk the directory named by the first CLI argument and record every file's
# path (relative to that directory) into files.txt, one per line.
basedir = os.path.join(os.curdir, sys.argv[1])
print("Starting at {d}".format(d=datetime.datetime.now()))
print("Scanning {path}".format(path=os.path.abspath(basedir)))
with open("files.txt", "w") as out:
    for root, dirs, files in os.walk(basedir):
        rows = [os.path.relpath(os.path.join(root, name), basedir) + "\n"
                for name in files]
        out.writelines(rows)
        # Echo larger directories as a rough progress indicator.
        if len(dirs) + len(files) > 5:
            print(os.path.relpath(root, basedir))
        out.flush()
print("")
print("Finished at {d}".format(d=datetime.datetime.now()))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import sys
from csv import DictReader
import numpy as np
import matplotlib.pyplot as plt

# Tally duplicate files (same sha in the 'statistic' column) from an analyzer
# CSV, report how much space de-duplication would save, and plot a histogram
# of duplicate counts.
files = {}
with open(sys.argv[1]) as f:
    for e in DictReader(f):
        sha = e['statistic']
        size = int(e['size'])
        if sha not in files:
            files[sha] = {'size': size, 'count': 1}
        else:
            files[sha]['count'] += 1

highest_count = 0
total = 0
deduped = 0
counts = []
# Fixed: dict.iteritems() is Python 2 only and raises AttributeError on
# Python 3 (where this script's print() calls indicate it runs); .items()
# behaves the same on both.
for sha, info in files.items():
    if info['count'] > highest_count:
        highest_count = info['count']
    deduped += info['size']
    total += info['size'] * info['count']
    counts.append(info['count'])

# Fixed: an empty CSV (total == 0) no longer raises ZeroDivisionError.
pc = int(100 * deduped / total) if total else 0
print("Highest count:", highest_count)
print("Total size:", total)
print("Dedup size:", deduped)
print("Gain:", pc, "%")
plt.hist(counts, range(0, 1000))
plt.show()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment.