Last active
February 17, 2019 04:50
-
-
Save zelon88/5027282d46462b919a32947ccb78f9c5 to your computer and use it in GitHub Desktop.
xPress Compressor (learning experiment)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# An experiment with reversible compression, only I have yet to figure out the reverse part. >D | |
# -------------------------------------------------- | |
# COMPRESS & EXTRACT | |
# Load required modules and set global variables. | |
import sys, getopt, datetime, os, binascii, psutil, math, pickle | |
# Timestamp taken once at start-up; the formatted string is embedded in status and error messages.
now = datetime.datetime.now()
time = now.strftime("%B %d, %Y, %H:%M")
# File names and their directory parts; empty until the command line has been parsed.
inputFile = inputPath = ''
dictFile = dictPath = ''
tempFile = tempPath = ''
outputFile = outputPath = ''
# Module-level defaults for the chunking values.
chunkSize = offset = chunkCount = 0
print ("\n")
# -------------------------------------------------- | |
# -------------------------------------------------- | |
# COMPRESS & EXTRACT | |
# Process user supplied arguments. | |
def parseArgs(argv):
    """Validate the command line and derive every file path xPress works with.

    Args:
        argv: Command-line arguments without the program name, in the order
            ``mode, inputFile, outputFile``.  The mode itself is not
            validated here; only its presence is required.

    Returns:
        The tuple ``(tempFile, tempPath, inputFile, inputPath, outputFile,
        outputPath, dictFile, dictPath)``.  Every ``*Path`` entry is the
        directory part of the matching file name (``''`` for a bare name).
        The temp and dictionary names are the output name with ``-TEMP.dat``
        and ``-DICT.dat`` appended.

    Raises:
        SystemExit: after printing the usage line when the options cannot be
            parsed (status 2), no mode is given (status 2) or ``-h`` is
            given; and after printing an error when the input file is
            missing or does not exist, the output file is missing, or the
            output directory does not exist.
    """
    usage = 'xPress.py <c>or<e> <inputFile> <outputFile>'
    try:
        opts, args = getopt.getopt(argv, "h")
    except getopt.GetoptError:
        print (usage)
        sys.exit(2)
    # "-h" is the only declared option, so any parsed option is a help request.
    if opts:
        print (usage)
        sys.exit()
    # Without a mode there is nothing to do; report it instead of raising IndexError.
    if not args:
        print (usage)
        sys.exit(2)
    # Check to see if an input file argument was supplied.
    if len(args) < 2:
        print ('ERROR!!! xPress48, No input file was specified on '+time+'!')
        sys.exit()
    inputFile = args[1]
    inputPath = os.path.dirname(inputFile)
    # Check to see that the input file exists.
    if not os.path.exists(inputFile):
        print ('ERROR!!! xPress55, The input file specified does not exist on '+time+'!')
        sys.exit()
    # Check to see if an output file argument was supplied.
    if len(args) < 3:
        print ('ERROR!!! xPress34, No output file was specified on '+time+'!')
        sys.exit()
    outputFile = args[2]
    outputPath = os.path.dirname(outputFile)
    tempFile = outputFile+'-TEMP.dat'
    tempPath = os.path.dirname(tempFile)
    dictFile = outputFile+'-DICT.dat'
    dictPath = os.path.dirname(dictFile)
    # Check to see that a directory exists to put an output file into.
    # dirname() is '' for a bare file name, which means the current directory;
    # os.path.exists('') is always False, so substitute os.curdir for the test.
    if not os.path.exists(outputPath or os.curdir):
        print ('ERROR!!! xPress41, The output file specified relies on an invalid directory on '+time+'!')
        sys.exit()
    return tempFile, tempPath, inputFile, inputPath, outputFile, outputPath, dictFile, dictPath
# -------------------------------------------------- | |
# -------------------------------------------------- | |
# COMPRESS & EXTRACT | |
# Define the chunkSize based on fileSize and available memory. | |
# We need to store 2 copies of the offset buffer and the rest of this application. | |
# By dynamically setting how much of a file to load into memory at a time, xPress should be hardware agnostic. | |
# Severely limited machines with memory levels measured in hundreds of megabytes may see less compression performance than machines with more memory. | |
def defineChunkSize(inputFile, availableMemory=None):
    """Pick how many bytes of the input file to hold in memory at once.

    Args:
        inputFile: Path of the file that is about to be processed.
        availableMemory: Memory budget in bytes.  When omitted, the amount
            psutil currently reports as available is used.

    Returns:
        An integer chunk size: a quarter of the memory budget (at least 1),
        capped at the size of the file.
    """
    # Get the filesize of the input file.
    print ('Defining chunkSize with inputFile of '+inputFile)
    fileSize = int(os.path.getsize(inputFile))
    # Get the available memory.
    if availableMemory is None:
        availableMemory = psutil.virtual_memory().available
    print('Available memory is '+str(availableMemory))
    # Our chunkSize is 1/4 of available memory. This translates to about 1/2 of available memory used once we load each chunk twice.
    # Floor division keeps the value an integer on Python 3 as well; the
    # lower bound of 1 prevents a zero-sized chunk on a tiny budget.
    chunkSize = max(int(availableMemory) // 4, 1)
    # If the chunkSize is at least as large as the file being processed the entire file becomes the only chunk.
    if chunkSize >= fileSize:
        chunkSize = fileSize
    print ('ChunkSize is '+str(chunkSize))
    return chunkSize
# -------------------------------------------------- | |
# -------------------------------------------------- | |
# COMPRESS & EXTRACT | |
# Define the file offset and the number of chunks based on fileSize and chunkSize.
# If a file is too big it is divided into small chunks. | |
# The offset is different from the chunkSize in that it is evenly divisible by the filesize. | |
# To put it differently, the chunkSize limits global memory usage and the offset allocates an exact quantity of memory for each operation. | |
def defineOffset(inputFile, chunkSize):
    """Split the input file into near-equal reads that each fit in chunkSize.

    Args:
        inputFile: Path of the file that is about to be processed.
        chunkSize: Upper bound, in bytes, for a single read.

    Returns:
        ``(offset, chunkCount)`` as integers.  ``chunkCount`` reads of
        ``offset`` bytes cover the whole file and ``offset`` never exceeds
        ``chunkSize``.  A file that fits in one chunk, or a non-positive
        ``chunkSize``, yields ``(fileSize, 1)``.
    """
    # Get the filesize of the input file.
    fileSize = int(os.path.getsize(inputFile))
    chunkSize = int(chunkSize)
    print ('Defining offset with chunkSize of '+str(chunkSize))
    if chunkSize > 0 and fileSize > chunkSize:
        # Integer ceiling division: -(-a // b) rounds up without going through
        # floats, so the trailing partial chunk is counted on Python 2 and 3.
        chunkCount = -(-fileSize // chunkSize)
        # Spread the file evenly over the chunks, rounding up so that
        # chunkCount reads always reach the end of the file.
        offset = -(-fileSize // chunkCount)
    else:
        chunkCount = 1
        offset = fileSize
    print('Offset is '+str(offset)+', chunkCount is '+str(chunkCount))
    return offset, chunkCount
# -------------------------------------------------- | |
# -------------------------------------------------- | |
# COMPRESS | |
# A function to iterate through the temp file and build a dictionary for the file. | |
def buildDictionary(outputFile, inputFile, dictFile):
    """Tokenise the input file and write the token stream and its dictionary.

    The input is read in chunks sized by defineChunkSize()/defineOffset().
    Each chunk is cut into consecutive 12-byte pieces (the last piece of a
    chunk may be shorter).  Every distinct piece is assigned a token of the
    form ``#N$`` and the chunk is written out as the concatenation of the
    tokens of its pieces, so a repeated piece costs only one dictionary
    entry.  Because the output consists of tokens only, it can be expanded
    again by looking each token up in the dictionary.

    Args:
        outputFile: Path the token stream is appended to.
        inputFile: Path of the file to read.
        dictFile: Path the textual form of the dictionary is appended to.

    Returns:
        ``(dictionary, data)`` where ``dictionary`` maps each token to the
        piece it stands for and ``data`` is the token stream of the last
        chunk processed.
    """
    print ('Building a dictionary with inputFile '+inputFile)
    pieceSize = 12
    dictionary = {}
    # Reverse lookup (piece -> token) so a repeated piece reuses its token.
    tokenForPiece = {}
    dictIndexNumber = 0
    data = b''
    tempChunkSize = defineChunkSize(inputFile)
    tempOffset, tempChunkCount = defineOffset(inputFile, tempChunkSize)
    # The input is read as bytes so arbitrary file content survives unchanged.
    # "ab" creates a missing file and appends to an existing one, which is
    # what the separate exists-check used to select.
    with open(inputFile, "rb") as openFile:
        with open(outputFile, "ab") as openFile2:
            for _ in range(int(tempChunkCount)):
                # Fill up the offset buffer.
                rawChunk = openFile.read(int(tempOffset))
                tokens = []
                # Slice the untouched buffer: replacing inside the buffer while
                # indexing it shifts every later position.
                for start in range(0, len(rawChunk), pieceSize):
                    piece = rawChunk[start:start+pieceSize]
                    token = tokenForPiece.get(piece)
                    if token is None:
                        dictIndexNumber += 1
                        token = ('#'+str(dictIndexNumber)+'$').encode('ascii')
                        tokenForPiece[piece] = token
                        dictionary[token] = piece
                    tokens.append(token)
                data = b''.join(tokens)
                # Save the compressed data to the outputFile, once per chunk.
                openFile2.write(data)
    # The textual form of the dictionary is ASCII-only (non-printable bytes
    # are escaped), so encoding it cannot fail.
    with open(dictFile, "ab") as openFile3:
        openFile3.write(repr(dictionary).encode('ascii'))
    return dictionary, data
# -------------------------------------------------- | |
# -------------------------------------------------- | |
# COMPRESS | |
# A function to iterate through the temp file and compress its actual data using the dictionary. | |
def compressFile(outputFile, compressedData, dictionary):
    """Append the pickled dictionary to the archive, framed by marker strings.

    Args:
        outputFile: Path of the archive.  It is always opened for appending,
            so data already written to it is kept.
        compressedData: Accepted for interface compatibility; not used.
        dictionary: Object to pickle between the start and end markers.

    Returns:
        1 once the dictionary section has been written.
    """
    print('Compressing '+outputFile)
    # Always append: choosing the mode from the existence of a different file
    # could select "wb" and truncate the data already stored in the archive.
    with open(outputFile, "ab") as archive:
        archive.write(b'@!@!@!DICTIONARYSTART@!@!@!')
        # NOTE: a reader must only unpickle this section from archives it
        # trusts; unpickling untrusted data can execute arbitrary code.
        pickle.dump(dictionary, archive)
        archive.write(b'@!@!@!DICTIONARYEND@!@!@!')
    return 1
# -------------------------------------------------- | |
# -------------------------------------------------- | |
# COMPRESS | |
# Code to compress a specified file. | |
# Without a mode argument there is nothing to run; show the usage line
# instead of failing with an IndexError on sys.argv[1].
if len(sys.argv) < 2:
    print ('xPress.py <c>or<e> <inputFile> <outputFile>')
    sys.exit(2)
# Code to compress a specified file.
if sys.argv[1] == 'c':
    tempFile, tempPath, inputFile, inputPath, outputFile, outputPath, dictFile, dictPath = parseArgs(sys.argv[1:])
    dictionary, compressedData = buildDictionary(outputFile, inputFile, dictFile)
    compressionResult = compressFile(outputFile, compressedData, dictionary)
# --------------------------------------------------
# --------------------------------------------------
# COMPRESS & EXTRACT
# Print the results of the operation.
# outputFile keeps its module-level default when no operation ran, in which
# case the failure message below is printed.
if os.path.isfile(outputFile):
    print ('OP-Act: The operation sucessfully generated an output file at '+outputFile+' on '+time+'!')
else:
    print ('ERROR!!! xPress158, The operation failed to generate an output file at '+outputFile+' on '+time+'!')
print ("\n")
# -------------------------------------------------- |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment