@radeusgd
Last active April 16, 2018 21:50
A set of scripts to compute file hashes and compare original vs. compressed file sizes.
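The gist is made up of four short Python scripts that form a small pipeline. A directory scanner (the third script below) walks a directory tree and writes the list of files to analyze into files.txt. The analyzer (the first script) reads files.txt, computes a per-file statistic, either a sha256 checksum or the size after gzip/xz compression, and writes a CSV with the header path,size,statistic to sha.txt, gz.txt or xz.txt. Two summarizer scripts then aggregate those CSVs: one reports the overall compression ratio, the other estimates how much space deduplicating identical files would save.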
# Script 1: for every file listed in files.txt under the given base
# directory, compute a per-file statistic (sha256 checksum, or the
# size after gzip/xz compression) and append it to a CSV report.
import os
import sys
import datetime
import subprocess

def run_proc(*args):
    return subprocess.check_output(args)

def filesize(path):
    st = os.stat(path)
    return st.st_size

def test_compressor(path, compressor):
    # Compress the file into a temporary output and report the
    # compressed size; the temporary file is reused between calls.
    os.system(compressor + " < \"" + path + "\" > tmp.gxz")
    return filesize("tmp.gxz")

def checksum(path):
    # `sha256sum --tag` prints a line of the form: SHA256 (path) = <hash>
    o = run_proc("sha256sum", "-b", "--tag", path).decode()
    pos = o.find(" = ")
    sha = o[pos + 3:].strip()
    return sha

basedir = os.path.join(os.curdir, sys.argv[1])

# Select the statistic backend and the output file from argv[2].
if sys.argv[2] == 'sha':
    backend = checksum
    result = "sha.txt"
elif sys.argv[2] == 'gz':
    def gz(path):
        return test_compressor(path, "gzip")
    backend = gz
    result = "gz.txt"
else:
    def xz(path):
        return test_compressor(path, "xz")
    backend = xz
    result = "xz.txt"

print("Starting at {d}".format(d=datetime.datetime.now()))
print("Analyzing {path} into {res}".format(path=os.path.abspath(basedir), res=result))

def write_header(out):
    out.write("path,size,statistic\n")

def handle_file(path, out):
    size = filesize(path)
    statistic = backend(path)
    out.write("{p},{size},{stat}\n".format(p=os.path.relpath(path, basedir),
                                           size=size, stat=statistic))

paths = []
with open("files.txt", "r") as fin:
    for f in fin:
        paths.append(f.strip())

print("Will process {fc} files".format(fc=len(paths)))

with open(result, "w") as f:
    write_header(f)
    pc = 0
    for path in paths:
        path = os.path.join(basedir, path)
        pcs = int(100 * pc / len(paths))
        sys.stdout.write("{pcs}% ({done}/{whole})\r".format(pcs=pcs, done=pc, whole=len(paths)))
        sys.stdout.flush()
        pc += 1
        handle_file(path, f)
        f.flush()

print("Finished at {d}".format(d=datetime.datetime.now()))
# Script 2: sum up the original and compressed sizes from one of the
# compression CSVs (gz.txt or xz.txt) and report the overall ratio.
import sys
from csv import DictReader

normal = 0
compressed = 0
with open(sys.argv[1]) as f:
    r = DictReader(f)
    for e in r:
        f_c = int(e['statistic'])
        f_s = int(e['size'])
        normal += f_s
        compressed += f_c

# pc is the compressed size as a percentage of the original,
# not the saving.
pc = int(100 * compressed / normal)
print("Original size:", normal)
print("Compressed size:", compressed)
print("Compressed to:", pc, "%")
# Script 3: walk a directory tree and write the relative path of every
# file into files.txt, which the analyzer script reads as its work list.
import datetime
import os
import sys

basedir = os.path.join(os.curdir, sys.argv[1])
print("Starting at {d}".format(d=datetime.datetime.now()))
print("Scanning {path}".format(path=os.path.abspath(basedir)))

with open("files.txt", "w") as out:
    for root, dirs, files in os.walk(basedir):
        for f in files:
            path = os.path.relpath(os.path.join(root, f), basedir)
            out.write(path + "\n")
        # Print progress only for directories with more than a few entries.
        if len(dirs) + len(files) > 5:
            print(os.path.relpath(root, basedir))
            out.flush()

print("")
print("Finished at {d}".format(d=datetime.datetime.now()))
# Script 4: deduplication analysis over the checksum CSV (sha.txt).
# Files with the same sha256 are counted as duplicates; the size of
# each unique file counts once towards the deduplicated total.
import sys
from csv import DictReader
import matplotlib.pyplot as plt

files = {}
with open(sys.argv[1]) as f:
    r = DictReader(f)
    for e in r:
        sha = e['statistic']
        size = int(e['size'])
        if sha not in files:
            files[sha] = {'size': size, 'count': 1}
        else:
            files[sha]['count'] += 1

highest_count = 0
total = 0
deduped = 0
counts = []
for sha, f in files.items():
    if f['count'] > highest_count:
        highest_count = f['count']
    deduped += f['size']
    total += f['size'] * f['count']
    counts.append(f['count'])

# pc is the deduplicated size as a percentage of the total.
pc = int(100 * deduped / total)
print("Highest count:", highest_count)
print("Total size:", total)
print("Dedup size:", deduped)
print("Deduplicated to:", pc, "%")

# Histogram of how many duplicates each unique file has.
plt.hist(counts, range(0, 1000))
plt.show()
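This analysis only makes sense on sha.txt: the script keys the files dictionary on the 'statistic' column, so on gz.txt or xz.txt it would group files by compressed size rather than by content. The histogram shows, for each unique file, how many times it occurs in the tree, with integer bin edges from 0 to 999.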