Last active
April 16, 2018 21:50
-
-
Save radeusgd/6119fa1528fc1fb0b1d26b287bd33db8 to your computer and use it in GitHub Desktop.
A set of small tools to compute file hashes and to compare original file sizes against their compressed sizes.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import datetime
import hashlib
import os
import subprocess
import sys
def run_proc(*args):
    """Run the given command and return its captured standard output (bytes)."""
    command = list(args)
    return subprocess.check_output(command)
def filesize(path):
    """Return the size in bytes of the file at *path*."""
    return os.stat(path).st_size
def test_compressor(path, compressor): | |
os.system(compressor + " < \"" + path + "\" > tmp.gxz") | |
return filesize("tmp.gxz") | |
def checksum(path):
    """Return the SHA-256 hex digest (str) of the file at *path*.

    Fixed: the original shelled out to ``sha256sum`` and parsed its output,
    which raises TypeError on Python 3 because ``subprocess.check_output``
    returns bytes while ``o.find(" = ")`` expects str. Computing the digest
    with hashlib yields the same lowercase hex string, is portable, and
    avoids the external-tool dependency. The file is read in chunks so large
    files are hashed without loading them fully into memory.
    """
    digest = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(65536), b""):
            digest.update(chunk)
    return digest.hexdigest()
# First CLI argument: the directory to analyse, relative to the current dir.
basedir = os.path.join(os.curdir, sys.argv[1])

# Second CLI argument selects the statistic backend and the output file:
# 'sha' records checksums, 'gz' gzip-compressed sizes, anything else xz sizes.
mode = sys.argv[2]
if mode == 'sha':
    backend = checksum
    result = "sha.txt"
else:
    compressor = "gzip" if mode == 'gz' else "xz"
    result = ("gz" if mode == 'gz' else "xz") + ".txt"

    def backend(path, _cmd=compressor):
        """Compressed size of *path* under the selected compressor."""
        return test_compressor(path, _cmd)

print("Starting at {d}".format(d=datetime.datetime.now()))
print("Analyzing {path} into {res}".format(path=os.path.abspath(basedir), res=result))
def write_header(out):
    """Write the CSV header row to the open file object *out*."""
    columns = ("path", "size", "statistic")
    out.write(",".join(columns) + "\n")
def handle_file(path, out):
    """Append one CSV row for *path*: relative path, byte size, backend statistic.

    Relies on the module-level ``basedir`` and ``backend`` set up earlier in
    the script.
    """
    rel = os.path.relpath(path, basedir)
    row = "{p},{size},{stat}\n".format(p=rel, size=filesize(path), stat=backend(path))
    out.write(row)
# Read the list of relative paths produced by the scanner script.
paths = []
with open("files.txt", "r") as fin:
    for line in fin:
        paths.append(line.strip())

# Fixed: len(paths) is hoisted out of the loop, the manual counter is replaced
# by enumerate, the loop variable no longer shadows the earlier file-handle
# name `f`, and an empty files.txt no longer raises ZeroDivisionError.
total = len(paths)
print("Will process {fc} files".format(fc=total))
with open(result, "w") as out:
    write_header(out)
    for done, relpath in enumerate(paths):
        full_path = os.path.join(basedir, relpath)
        pcs = int(100 * done / total) if total else 100
        sys.stdout.write("{pcs}% ({done}/{whole})\r".format(pcs=pcs, done=done, whole=total))
        sys.stdout.flush()
        handle_file(full_path, out)
        out.flush()  # keep partial results on disk if the run is interrupted
print("Finished at {d}".format(d=datetime.datetime.now()))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import sys
from csv import DictReader

# Sum the original and compressed sizes from an analyzer CSV
# (columns: path,size,statistic — with 'statistic' holding a compressed size)
# and report the overall compression ratio.
normal = 0
compressed = 0
with open(sys.argv[1]) as f:
    for e in DictReader(f):
        compressed += int(e['statistic'])
        normal += int(e['size'])

# Fixed: an empty CSV (normal == 0) no longer raises ZeroDivisionError.
pc = int(100 * compressed / normal) if normal else 0
print("Original size:", normal)
print("Compressed size:", compressed)
print("Gain:", pc, "%")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import datetime
import os
import sys

# Walk the directory named by the first CLI argument and record every file's
# path (relative to that directory) into files.txt, one per line.
basedir = os.path.join(os.curdir, sys.argv[1])
print("Starting at {d}".format(d=datetime.datetime.now()))
print("Scanning {path}".format(path=os.path.abspath(basedir)))
with open("files.txt", "w") as out:
    for root, dirs, files in os.walk(basedir):
        rows = [os.path.relpath(os.path.join(root, name), basedir) + "\n"
                for name in files]
        out.writelines(rows)
        # Echo larger directories as a rough progress indicator.
        if len(dirs) + len(files) > 5:
            print(os.path.relpath(root, basedir))
        out.flush()
print("")
print("Finished at {d}".format(d=datetime.datetime.now()))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import sys
from csv import DictReader
import numpy as np
import matplotlib.pyplot as plt

# Tally duplicate files (same sha in the 'statistic' column) from an analyzer
# CSV, report how much space de-duplication would save, and plot a histogram
# of duplicate counts.
files = {}
with open(sys.argv[1]) as f:
    for e in DictReader(f):
        sha = e['statistic']
        size = int(e['size'])
        if sha not in files:
            files[sha] = {'size': size, 'count': 1}
        else:
            files[sha]['count'] += 1

highest_count = 0
total = 0
deduped = 0
counts = []
# Fixed: dict.iteritems() is Python 2 only and raises AttributeError on
# Python 3 (where this script's print() calls indicate it runs); .items()
# behaves the same on both.
for sha, info in files.items():
    if info['count'] > highest_count:
        highest_count = info['count']
    deduped += info['size']
    total += info['size'] * info['count']
    counts.append(info['count'])

# Fixed: an empty CSV (total == 0) no longer raises ZeroDivisionError.
pc = int(100 * deduped / total) if total else 0
print("Highest count:", highest_count)
print("Total size:", total)
print("Dedup size:", deduped)
print("Gain:", pc, "%")
plt.hist(counts, range(0, 1000))
plt.show()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment.