#!/usr/bin/python
"""
Copyright (C) 2017 Siricon Limited
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>
"""
import hashlib
import sqlite3
import os
import shutil
from PIL import Image
from PIL.ExifTags import TAGS
import string
import random
import re
import sys
import getopt
DEBUG = False
imgTable = "imgData"
mtaTable = "metaData"
extns = [".*\.JPG$", ".*\.JPEG$", ".*\.GIF$", ".*\.PNG$"]
extnRegex = "(" + ")|(".join(extns) + ")"

def createTable(cursor, tableName):
    cursor.execute('CREATE TABLE {tn} ({nf} {ft} PRIMARY KEY AUTOINCREMENT)'
                   .format(tn=tableName, nf="id", ft="INTEGER"))

def addColumn(cursor, tableName, fieldName, fieldType, defValue=False):
    try:
        if ( defValue != False ):
            cursor.execute("ALTER TABLE {tn} ADD COLUMN '{cn}' {ct} DEFAULT '{df}'" .format(tn=tableName, cn=fieldName, ct=fieldType, df=defValue))
        else:
            cursor.execute("ALTER TABLE {tn} ADD COLUMN '{cn}' {ct}" .format(tn=tableName, cn=fieldName, ct=fieldType))
    except Exception as err:
        debug(str(err))
        showError("Fields could not be set up in the database")

def makeUniqueIndex(cursor, tableName, fieldName):
    try:
        cursor.execute("CREATE UNIQUE INDEX unique_" + fieldName + " on {tn} ({cn})" .format(tn=tableName, cn=fieldName))
    except Exception as err:
        debug(str(err))
        showError("Unique index could not be set up")

def setupTable(cursor, tableName, fieldDict, uniqueField=None):
    cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='{tn}'" .format(tn=tableName))
    tblData = cursor.fetchone()
    if (tblData) and (tblData[0] == tableName):
        debug("Table Exists - " + tableName)
    else:
        createTable(cursor, tableName)
        for (fieldName, fieldType) in fieldDict.items():
            addColumn(cursor, tableName, fieldName, fieldType, False)
        if ( uniqueField != None ):
            makeUniqueIndex(cursor, tableName, uniqueField)

def setupMetaTable(cursor, conn):
    global mtaTable
    fields = {'fileCounter': 'INTEGER', 'filePrefix': 'TEXT'}
    setupTable(cursor, mtaTable, fields)
    createMetaRecord(cursor, conn)
    conn.commit()

def setupImageTable(cursor, conn):
    global imgTable
    fields = {'fileName': 'TEXT', 'fileHash': 'TEXT'}
    setupTable(cursor, imgTable, fields, 'fileHash')
    conn.commit()

def getMetaCounter(cursor):
    global mtaTable
    try:
        cursor.execute("SELECT fileCounter, filePrefix FROM {tn} WHERE id=1" .format(tn=mtaTable))
        mtaRecord = cursor.fetchone()
        if (mtaRecord):
            return mtaRecord[0]
        else:
            return False
    except:
        debug("Get Meta Counter Failed")
        return False

def getMetaPrefix(cursor):
    global mtaTable
    try:
        cursor.execute("SELECT fileCounter, filePrefix FROM {tn} WHERE id=1" .format(tn=mtaTable))
        mtaRecord = cursor.fetchone()
        if (mtaRecord):
            return mtaRecord[1]
        else:
            return False
    except:
        return False

def setMetaCounter(cursor, conn):
    global mtaTable
    try:
        counter = getMetaCounter(cursor)
        counter = counter + 1
        cursor.execute("UPDATE {tn} SET fileCounter = {fc} WHERE id=1" .format(tn=mtaTable, fc=counter))
        conn.commit()
        return True
    except:
        return False

def createMetaRecord(cursor, conn):
    global mtaTable
    num = getMetaCounter(cursor)
    if num == False:
        try:
            chars = string.ascii_uppercase + string.digits
            prefix = ''.join(random.choice(chars) for _ in range(3))
            cursor.execute("INSERT INTO {tn} (id, fileCounter, filePrefix) VALUES ({id},{fc},'{fp}')" .format(tn=mtaTable, id=1, fc=1, fp=prefix))
            conn.commit()
            return True
        except:
            debug("Create MTA Record Failed")
            return False
    else:
        return True

def getNextFilename(fileExt, cursor):
    global prefix
    padTemplate = "PI-{pre}-{pd:>07}"
    fileNumber = getMetaCounter(cursor)
    newName = padTemplate.format(pre=prefix, pd=fileNumber) + fileExt
    return newName
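
# Example: with filePrefix "A1B" and fileCounter 42, getNextFilename(".jpg", cursor)
# returns "PI-A1B-0000042.jpg" ("{pd:>07}" zero-pads the counter to seven digits).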

def fileSeen(cursor, tableName, fName, fHash):
    bfName = os.path.basename(fName)
    cursor.execute("SELECT fileName, fileHash FROM {tn} WHERE fileHash = '{fh}'" .format(tn=tableName, fh=fHash))
    fileRecord = cursor.fetchone()
    if (fileRecord) and (fileRecord[1] == fHash):
        return True
    else:
        return False

def updateHashTable(cursor, conn, tableName, fName, fHash, rebuild=False):
    try:
        bfName = os.path.basename(fName)
        bfExtn = getFileExtension(bfName)
        if ( rebuild == True ):
            newName = bfName
        else:
            newName = getNextFilename(bfExtn, cursor)
        cursor.execute("INSERT INTO {tn} (fileName, fileHash) VALUES ('{fn}','{fh}') " .format(tn=tableName, fn=newName, fh=fHash))
        conn.commit()
        setMetaCounter(cursor, conn)
        return newName
    except Exception as e:
        if ( "UNIQUE constraint failed" in str(e) ) and ( "fileHash" in str(e) ):
            return True
        else:
            debug(str(e))
            return False

def hash_bytestr_iter(bytesiter, hasher, ashexstr=False):
    for block in bytesiter:
        hasher.update(block)
    return (hasher.hexdigest() if ashexstr else hasher.digest())

def file_as_blockiter(afile, blocksize=65536):
    with afile:
        block = afile.read(blocksize)
        while len(block) > 0:
            yield block
            block = afile.read(blocksize)

def betterHash(fname):
    return hash_bytestr_iter(file_as_blockiter(open(fname, 'rb')), hashlib.sha1(), True)
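
# betterHash() streams the file through SHA-1 in 64 KiB blocks, so large images
# are never loaded into memory in one go; the resulting hex digest is what the
# unique fileHash column de-duplicates on.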

def msg(text):
    print("\t[+] " + text)

def debug(text):
    global DEBUG
    if ( DEBUG == True ):
        msg("DEBUG: " + text)

def copyFile(srcFile, destDir):
    shutil.copy(srcFile, destDir)

def removeFile(srcFile):
    if os.path.isfile(srcFile):
        os.unlink(srcFile)

def getFileExtension(fName):
    return os.path.splitext(fName)[1]

def getExifData(fName):
    bfExtn = getFileExtension(fName).lower()
    try:
        if ( bfExtn == ".jpg" ) or ( bfExtn == ".jpeg" ):
            return Image.open(fName)._getexif()
        else:
            return "NOEXIF"
    except:
        return "NOEXIF"

def getExifField(exif, field):
    if exif == "NOEXIF":
        return exif
    try:
        for (k, v) in exif.items():
            if TAGS.get(k) == field:
                return v
    except:
        return "NOEXIF"
    # falls through and returns None when the requested tag is not present

def makeDirectory(dirPath):
    try:
        if not os.path.exists(dirPath):
            debug("Creating Directory: " + dirPath)
            os.makedirs(dirPath)
        return True
    except:
        debug("Failed Creating Directory")
        return False

def checkDirectory(dirPath):
    try:
        if os.path.exists(dirPath):
            return True
        else:
            return False
    except:
        debug("Error Checking Directory")
        return False

def getBestDate(exif):
    date = getExifField(exif, "DateTimeOriginal")
    if ( date != False ) and ( date != None ):
        return date
    date = getExifField(exif, "DateTimeDigitized")
    if ( date != False ) and ( date != None ):
        return date
    return False
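
# EXIF date fields are strings like "2017:02:11 22:25:00"; processImport() keeps
# the first seven characters ("2017:02") and swaps the colon for os.sep to build
# a year/month destination sub-directory under destDir.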

def processRebuild(dbCursor, dbConnection, fName):
    global imgTable
    fileHash = betterHash(fName)
    status = updateHashTable(dbCursor, dbConnection, imgTable, fName, fileHash, True)
    bfName = os.path.basename(fName)
    if ( status == False ):
        showError("Error in Updating Database")
    elif ( status == True ):
        msg("The file already exists in the database: {fn}".format(fn=bfName))
    elif ( bfName == status ):
        msg("Hash Updated for File: " + status)
    else:
        debug("fName is: " + fName + " and status is: " + status)
        msg("Hash updated, but another error occurred.")

def processImport(dbCursor, dbConnection, fName, destDir):
    global imgTable
    fileHash = betterHash(fName)
    status = updateHashTable(dbCursor, dbConnection, imgTable, fName, fileHash, False)
    if ( status == False ):
        showError("Error in Updating Database")
    elif ( status == True ):
        bfName = os.path.basename(fName)
        msg("The file already exists in the database: {fn}".format(fn=bfName))
    else:
        exif = getExifData(fName)
        date = getBestDate(exif)
        if ( date != False ) and ( date != None ):
            yrmn = date[:7].replace(":", os.sep)
        else:
            yrmn = "NOEXIF"
        newDir = destDir + os.sep + yrmn
        step2 = makeDirectory(newDir)
        if ( step2 != False ):
            newFile = newDir + os.sep + status
            debug("File will be copied to " + newFile)
            shutil.copy(fName, newFile)
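
# Net effect of one import run (paths below are illustrative):
#   <destDir>/2017/02/PI-A1B-0000042.jpg    (EXIF date found)
#   <destDir>/NOEXIF/PI-A1B-0000043.png     (no usable EXIF date)
# Files whose SHA-1 is already recorded in imgData are reported and not copied.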

def setupDatabase(dbFile):
    global imgTable, mtaTable
    try:
        conn = sqlite3.connect(dbFile)
        dbCursor = conn.cursor()
        setupImageTable(dbCursor, conn)
        setupMetaTable(dbCursor, conn)
        return (conn, dbCursor)
    except:
        return (False, False)

def processList(srcDir, destDir, dbConnection, dbCursor, runLimit=0, rebuild=False):
    global extnRegex
    if ( rebuild == True ):
        baseDir = destDir
    else:
        baseDir = srcDir
    count = 0
    for subdir, dirs, files in os.walk(baseDir):
        if ( runLimit > 0 ) and ( count > runLimit ):
            break
        for file in files:
            count = count + 1
            if ( runLimit > 0 ) and ( count > runLimit ):
                break
            filepath = subdir + os.sep + file
            if ( re.match(extnRegex, filepath, re.I) ):
                if ( rebuild == True ):
                    debug("Rebuild Action on : " + filepath)
                    processRebuild(dbCursor, dbConnection, filepath)
                else:
                    debug("Process Action on : " + filepath)
                    processImport(dbCursor, dbConnection, filepath, destDir)

def showBanner(prgName):
    print("\n" + prgName + " Version 0.74 (http://www.siricon.co.uk/deduper)")

def showHelp(prgName, err=""):
    print("Usage: " + prgName + " [options]")
    if ( err != "" ):
        print("ERROR")
        print("\t " + err)
    print("OPTIONS")
    print("\t -h Help")
    print("\t -v Verbose debugging")
    print("\t -s <Source Directory>")
    print("\t -d <Destination Directory>")
    print("\t -l <number> Limit the run to <number> images")
    print("\t -r Rebuild the Database")
    print("LONG OPTIONS")
    print("\t --srcdir=<Source Directory>")
    print("\t --destdir=<Destination Directory>")
    print("\t --limit=<number> Limit the run to <number> images")
    print("\t --rebuild Rebuild the Database")
    sys.exit(2)
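
# Example invocations (the script filename shown is illustrative):
#   ./deduper.py -v -s /path/to/iPhoto/Masters -d /path/to/Processed-Images
#   ./deduper.py --rebuild --destdir=/path/to/Processed-Images --limit=100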

def showError(err):
    print("ERROR")
    msg(err)
    sys.exit(2)

def main(argv, prgName):
    global DEBUG
    global prefix
    """
    BaseDir = '/Volumes/TOSHIBA EXT/Backups.backupdb/Gani* MacBook Pro/2015-01-12-112355/Macintosh HD/Users/gani/Pictures/iPhoto Library.photolibrary/Masters'
    DestDir = '/Users/gani/Documents/Processed-Images'
    sqlite_file = '/Users/gani/Documents/Processed-Images/imgdata.sqlite'
    DEBUG = True
    SMALLRUN = False
    SMALLRUNCOUNT = 54
    """
    DEBUG = False
    runLimit = 0
    srcDir = ''
    destDir = ''
    dbFileName = "imgdata.db"
    rebuildDB = False
    homeDir = os.path.expanduser('~')
    showBanner(prgName)
    try:
        opts, args = getopt.getopt(argv, "hvrl:s:d:", ["rebuild", "srcdir=", "destdir=", "limit=", "verbose"])
    except getopt.GetoptError as err:
        showHelp(prgName, str(err))
    for opt, arg in opts:
        if (opt == "-h"):
            showHelp(prgName)
        if (opt in ("-s", "--srcdir")):
            srcDir = arg
        if (opt in ("-d", "--destdir")):
            destDir = arg
        if (opt in ("-l", "--limit")):
            try:
                runLimit = int(arg)
            except:
                showError("Limit argument is not a number")
                runLimit = 0
        if (opt in ("-r", "--rebuild")):
            rebuildDB = True
        if (opt in ("-v", "--verbose")):
            DEBUG = True
    if srcDir == "" and rebuildDB == False:
        showHelp(prgName)
    if (not checkDirectory(srcDir)) and (rebuildDB == False):
        showError("Source Directory does not exist")
    if destDir == "":
        destDir = homeDir + os.sep + "Documents" + os.sep + "Processed-Images"
        msg("Destination Directory not provided.")
        msg("Assumed to be " + destDir)
    if not checkDirectory(destDir):
        showError("Destination Directory does not exist")
    if rebuildDB == True:
        msg("Database being rebuilt from " + destDir)
        if ( srcDir != "" ):
            msg("Source Directory not processed, as the rebuild action is selected")
    dbFile = destDir + os.sep + dbFileName
    (conn, dbCursor) = setupDatabase(dbFile)
    if (conn == False) or (dbCursor == False):
        showError("Database could not be created or accessed")
    prefix = getMetaPrefix(dbCursor)
    processList(srcDir, destDir, conn, dbCursor, runLimit, rebuildDB)
    conn.close()

if __name__ == "__main__":
    main(sys.argv[1:], os.path.basename(sys.argv[0]))