Skip to content

Instantly share code, notes, and snippets.

@zelon88
Last active February 17, 2019 04:50
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save zelon88/5027282d46462b919a32947ccb78f9c5 to your computer and use it in GitHub Desktop.
Save zelon88/5027282d46462b919a32947ccb78f9c5 to your computer and use it in GitHub Desktop.
xPress Compressor (learning experiment)
# An experiment with reversible compression, only I have yet to figure out the reverse part. >D
# --------------------------------------------------
# COMPRESS & EXTRACT
# Load required modules and set global variables.
import sys, getopt, datetime, os, binascii, psutil, math, pickle
# Timestamp captured once at start-up; the status and error messages printed
# by this script embed this same pre-formatted string.
now = datetime.datetime.now()
time = now.strftime("%B %d, %Y, %H:%M")
# File names and their parent directories start out empty.
inputFile = inputPath = ''
dictFile = dictPath = ''
tempFile = tempPath = ''
outputFile = outputPath = ''
# Chunking parameters start at zero.
chunkSize = offset = chunkCount = 0
print ("\n")
# --------------------------------------------------
# --------------------------------------------------
# COMPRESS & EXTRACT
# Process user supplied arguments.
def parseArgs(argv):
    """Validate the command-line arguments and derive the working paths.

    Args:
        argv: The arguments without the program name (``sys.argv[1:]``),
            laid out as ``[mode, inputFile, outputFile]``.

    Returns:
        The tuple ``(tempFile, tempPath, inputFile, inputPath, outputFile,
        outputPath, dictFile, dictPath)``. The temp and dictionary file
        names are the output file name with ``-TEMP.dat`` / ``-DICT.dat``
        appended. Each ``*Path`` is ``os.path.dirname`` of its file, so it
        is ``''`` for a bare file name.

    Raises:
        SystemExit: With status 2 when getopt rejects an option; with the
            default status when an argument is missing, the input file does
            not exist, or the output directory does not exist.
    """
    # Reject unknown dash-options up front; the parsed result is not needed.
    try:
        getopt.getopt(argv, "h")
    except getopt.GetoptError:
        print ('xPress.py <c>or<e> <inputFile> <outputFile>')
        sys.exit(2)
    # Check to see if an input file argument was supplied. The check must
    # look at the same position that is read below (argv[1]); probing a
    # different index would let a missing argument escape as an IndexError.
    if len(argv) < 2:
        # Display an error and stop execution if the input argument is missing.
        print ('ERROR!!! xPress48, No input file was specified on '+time+'!')
        sys.exit()
    inputFile = argv[1]
    inputPath = os.path.dirname(inputFile)
    # Check to see that the input file actually exists.
    if not os.path.exists(inputFile):
        print ('ERROR!!! xPress55, The input file specified does not exist on '+time+'!')
        sys.exit()
    # Check to see if an output file argument was supplied.
    if len(argv) < 3:
        # Display an error and stop execution if the output argument is missing.
        print ('ERROR!!! xPress34, No output file was specified on '+time+'!')
        sys.exit()
    outputFile = argv[2]
    outputPath = os.path.dirname(outputFile)
    tempFile = outputFile+'-TEMP.dat'
    tempPath = os.path.dirname(tempFile)
    dictFile = outputFile+'-DICT.dat'
    dictPath = os.path.dirname(dictFile)
    # Check to see that a directory exists to put an output file into.
    # dirname() is '' for a bare file name, which means "current directory";
    # os.path.exists('') is False, so substitute os.curdir for the check only.
    if not os.path.exists(outputPath or os.curdir):
        print ('ERROR!!! xPress41, The output file specified relies on an invalid directory on '+time+'!')
        sys.exit()
    return tempFile, tempPath, inputFile, inputPath, outputFile, outputPath, dictFile, dictPath
# --------------------------------------------------
# --------------------------------------------------
# COMPRESS & EXTRACT
# Define the chunkSize based on fileSize and available memory.
# We need to store 2 copies of the offset buffer and the rest of this application.
# By dynamically setting how much of a file to load into memory at a time, xPress should be hardware agnostic.
# Severely limited machines with memory levels measured in hundreds of megabytes may see less compression performance than machines with more memory.
def defineChunkSize(inputFile):
    """Pick how many bytes of the input file to hold in memory at once.

    Args:
        inputFile: Path of the file about to be processed.

    Returns:
        An integer byte count: one quarter of the memory psutil reports as
        available, capped at the size of the input file.
    """
    # Get the filesize of the input file.
    print ('Defining chunkSize with inputFile of '+inputFile)
    fileSize = int(os.path.getsize(inputFile))
    # Get the available memory.
    availableMemory = psutil.virtual_memory().available
    print('Available memory is '+str(availableMemory))
    # Our chunkSize is 1/4 of available memory. This translates to about 1/2
    # of available memory used once we load each chunk twice.
    # Floor division keeps the result an integer on Python 3 as well; a
    # float here would later be rejected by file.read().
    # If that quarter is at least as large as the file being processed, the
    # entire file becomes the only chunk.
    chunkSize = min(int(availableMemory) // 4, fileSize)
    print ('ChunkSize is '+str(chunkSize))
    return chunkSize
# --------------------------------------------------
# --------------------------------------------------
# COMPRESS & EXTRACT
# Define the file offset and the number of chunks based on fileSize and chunkSize.
# If a file is too big it is divided into small chunks.
# The offset is different from the chunkSize in that it is evenly divisible by the filesize.
# To put it differently, the chunkSize limits global memory usage and the offset allocates an exact quantity of memory for each operation.
def defineOffset(inputFile, chunkSize):
    """Split the input file into equally sized reads.

    Args:
        inputFile: Path of the file about to be processed.
        chunkSize: Upper bound, in bytes, for a single read. A value of
            zero or less is treated as "no limit" (one chunk).

    Returns:
        ``(offset, chunkCount)`` as integers. ``chunkCount`` reads of
        ``offset`` bytes cover the whole file, and ``offset`` never exceeds
        a positive ``chunkSize``. The final read may come up short when the
        file size is not a multiple of ``chunkCount``.
    """
    # Get the filesize of the input file.
    fileSize = int(os.path.getsize(inputFile))
    chunkSize = int(chunkSize)
    print ('Defining offset with chunkSize of '+str(chunkSize))
    if chunkSize > 0 and fileSize > chunkSize:
        # Integer ceiling division. math.ceil(a / b) does not round up when
        # "/" is integer division (Python 2), which dropped the last
        # partial chunk.
        chunkCount = (fileSize + chunkSize - 1) // chunkSize
    else:
        chunkCount = 1
    # Spread the file evenly over the chunks, rounding up so that
    # offset * chunkCount is never smaller than the file.
    offset = (fileSize + chunkCount - 1) // chunkCount
    print('Offset is '+str(offset)+', chunkCount is '+str(chunkCount))
    return offset, chunkCount
# --------------------------------------------------
# --------------------------------------------------
# COMPRESS
# A function to iterate through the input file and build a dictionary for the file.
def buildDictionary(outputFile, inputFile, dictFile):
    """Substitute 12-byte windows of the input with short ``#N$`` tokens.

    The input is read in the chunks computed by defineChunkSize and
    defineOffset. Each chunk is walked in 12-byte steps; every full window
    still present in the working copy is replaced, wherever it occurs,
    by a fresh token, and the token is recorded in the dictionary. Each
    processed chunk is appended to ``outputFile`` exactly once.

    Args:
        outputFile: Path the processed chunks are appended to.
        inputFile: Path of the file to process.
        dictFile: Path the textual form of the dictionary is appended to.

    Returns:
        ``(dictionary, data)`` where ``dictionary`` maps each token to the
        bytes it replaced and ``data`` is the processed form of the last
        chunk that was read.

    NOTE(review): tokens are not escaped, so an input that already contains
    ``#<digits>$`` text can make the output ambiguous to expand again.
    """
    print ('Building a dictionary with inputFile '+inputFile)
    windowSize = 12
    dictionary = {}
    dictIndexNumber = 0
    data = b''
    tempChunkSize = defineChunkSize(inputFile)
    tempOffset, tempChunkCount = defineOffset(inputFile, tempChunkSize)
    # Binary mode on both sides: the payload must round-trip byte for byte,
    # and text-mode newline translation would alter it on some platforms.
    # "ab" creates the file when it is missing and appends otherwise, which
    # is the same outcome as choosing between "wb" and "ab" by hand.
    with open(inputFile, "rb") as openFile, open(outputFile, "ab") as openFile2:
        for _ in range(tempChunkCount):
            # Fill up the offset buffer.
            rawChunk = openFile.read(tempOffset)
            data = rawChunk
            # Windows are cut from the untouched chunk. Slicing the working
            # copy instead makes the indices drift as it shrinks, until an
            # empty window turns replace() into "insert a token between
            # every byte".
            position = 0
            while position + windowSize <= len(rawChunk):
                chars = rawChunk[position:position + windowSize]
                position += windowSize
                # An earlier substitution may already have consumed this
                # window (for instance when it repeats an earlier one).
                if chars not in data:
                    continue
                dictIndexNumber += 1
                dictIndex = ('#'+str(dictIndexNumber)+'$').encode('ascii')
                data = data.replace(chars, dictIndex)
                dictionary[dictIndex] = chars
            # A trailing window shorter than windowSize is left as-is: a
            # fragment that short could also match inside earlier tokens.
            # Save the processed chunk to the outputFile, once per chunk.
            openFile2.write(data)
    # Append the textual form of the dictionary to the dictionary file.
    # repr() of a dict of byte strings is pure ASCII, so this encode is safe.
    with open(dictFile, "ab") as openFile3:
        openFile3.write(repr(dictionary).encode('ascii'))
    return dictionary, data
# --------------------------------------------------
# --------------------------------------------------
# COMPRESS
# A function to append the pickled dictionary, wrapped in start and end markers, to the output file.
def compressFile(outputFile, compressedData, dictionary):
    """Append the pickled dictionary, framed by markers, to the archive.

    Args:
        outputFile: Path of the archive to append to (created if missing).
        compressedData: Accepted for interface compatibility; not used here.
        dictionary: The token dictionary to pickle into the archive.

    Returns:
        1 once the dictionary section has been written.
    """
    print('Compressing '+outputFile)
    # Always append. The mode used to be picked by testing whether the
    # global dictFile existed, which could select "wb" and truncate an
    # archive that already held data. "ab" creates the file when needed.
    # The with-block also closes the handle if pickling raises.
    with open(outputFile, "ab") as archive:
        # Byte literals: the file is opened in binary mode.
        archive.write(b'@!@!@!DICTIONARYSTART@!@!@!')
        pickle.dump(dictionary, archive)
        archive.write(b'@!@!@!DICTIONARYEND@!@!@!')
    return 1
# --------------------------------------------------
# --------------------------------------------------
# COMPRESS
# Code to compress a specified file.
# Without any argument there is no mode to dispatch on; show the usage line
# instead of failing with an IndexError on sys.argv[1].
if len(sys.argv) < 2:
    print ('xPress.py <c>or<e> <inputFile> <outputFile>')
    sys.exit(2)
if sys.argv[1] == 'c':
    # Resolve the paths, substitute tokens into the output file, then append
    # the pickled dictionary to that same file.
    (tempFile, tempPath, inputFile, inputPath,
     outputFile, outputPath, dictFile, dictPath) = parseArgs(sys.argv[1:])
    dictionary, compressedData = buildDictionary(outputFile, inputFile, dictFile)
    compressionResult = compressFile(outputFile, compressedData, dictionary)
# --------------------------------------------------
# --------------------------------------------------
# COMPRESS & EXTRACT
# Print the results of the operation.
# For any mode other than 'c', outputFile still holds its initial empty
# value, so the failure message is printed.
if os.path.isfile(outputFile):
    print ('OP-Act: The operation sucessfully generated an output file at '+outputFile+' on '+time+'!')
else:
    print ('ERROR!!! xPress158, The operation failed to generate an output file at '+outputFile+' on '+time+'!')
print ("\n")
# --------------------------------------------------
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment