Created
February 11, 2017 22:25
-
-
Save gBhagavathula/9f582ce613ca4c2aad09c14be71dde8e to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python | |
""" | |
Copyright (C) 2017 Siricon Limited | |
This program is free software: you can redistribute it and/or modify | |
it under the terms of the GNU General Public License as published by | |
the Free Software Foundation, either version 3 of the License, or | |
(at your option) any later version. | |
This program is distributed in the hope that it will be useful, | |
but WITHOUT ANY WARRANTY; without even the implied warranty of | |
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
GNU General Public License for more details. | |
You should have received a copy of the GNU General Public License | |
along with this program. If not, see <http://www.gnu.org/licenses/> | |
""" | |
import hashlib | |
import sqlite3 | |
import os | |
import shutil | |
from PIL import Image | |
from PIL.ExifTags import TAGS | |
import string | |
import random | |
import re | |
import sys | |
import getopt | |
# Module-level configuration.
DEBUG = False           # verbose debug output toggle (set via -v/--verbose)
imgTable = "imgData"    # table holding one row per unique image hash
mtaTable = "metaData"   # single-row table: persistent file counter + prefix
# Accepted image extensions (matched case-insensitively by processList).
# Raw strings prevent invalid-escape warnings for the "\." sequences.
extns = [r".*\.JPG$", r".*\.JPEG$", r".*\.GIF$", r".*\.PNG$"]
extnRegex = "(" + ")|(".join(extns) + ")"
def createTable(cursor, tableName):
    # Create a fresh table containing only an auto-incrementing "id" key;
    # columns are added afterwards by addColumn().
    sql = "CREATE TABLE {tn} ({nf} {ft} PRIMARY KEY AUTOINCREMENT)".format(
        tn=tableName, nf="id", ft="INTEGER")
    cursor.execute(sql)
def addColumn(cursor, tableName, fieldName, fieldType, defValue=False):
    """Add a column to tableName, optionally with a DEFAULT value.

    defValue=False is the "no default" sentinel; any other value --
    now including falsy ones such as 0 or '' (previously `!= False`
    silently dropped them) -- is installed as the column default.
    On failure the error is reported and the program exits via showError().
    """
    try:
        if defValue is not False:
            cursor.execute("ALTER TABLE {tn} ADD COLUMN '{cn}' {ct} DEFAULT '{df}'"
                           .format(tn=tableName, cn=fieldName, ct=fieldType, df=defValue))
        else:
            cursor.execute("ALTER TABLE {tn} ADD COLUMN '{cn}' {ct}"
                           .format(tn=tableName, cn=fieldName, ct=fieldType))
    except Exception as err:
        debug(str(err))
        showError("Fields could not be setup in the database")
def makeUniqueIndex(cursor, tableName, fieldName):
    # Enforce uniqueness on fieldName via a named unique index; on
    # failure report the error and exit through showError().
    stmt = ("CREATE UNIQUE INDEX unique_" + fieldName +
            " on {tn} ({cn})".format(tn=tableName, cn=fieldName))
    try:
        cursor.execute(stmt)
    except Exception as err:
        debug(str(err))
        showError("Unique Index could not be setup")
def setupTable(cursor, tableName, fieldDict, uniqueField=None):
    # Idempotent table setup: if tableName is absent, create it with the
    # columns from fieldDict and (optionally) a unique index.
    cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='{tn}'"
                   .format(tn=tableName))
    row = cursor.fetchone()
    if row and row[0] == tableName:
        debug("Table Exists - " + tableName)
        return
    createTable(cursor, tableName)
    for fieldName, fieldType in fieldDict.items():
        addColumn(cursor, tableName, fieldName, fieldType, False)
    if uniqueField is not None:
        makeUniqueIndex(cursor, tableName, uniqueField)
def setupMetaTable(cursor, conn):
    # Ensure the single-row metadata table (counter + filename prefix)
    # exists, seed its row, and commit.
    global mtaTable
    setupTable(cursor, mtaTable,
               {'fileCounter': 'INTEGER', 'filePrefix': 'TEXT'})
    createMetaRecord(cursor, conn)
    conn.commit()
def setupImageTable(cursor, conn):
    # Ensure the image table exists with a unique index on fileHash so
    # duplicate images are rejected at insert time, then commit.
    global imgTable
    setupTable(cursor, imgTable,
               {'fileName': 'TEXT', 'fileHash': 'TEXT'}, 'fileHash')
    conn.commit()
def getMetaCounter(cursor):
    # Return the current file counter from the metadata row, or False
    # when the row is missing or the query fails.
    global mtaTable
    try:
        cursor.execute("SELECT fileCounter, filePrefix FROM {tn} WHERE id=1"
                       .format(tn=mtaTable))
        row = cursor.fetchone()
        return row[0] if row else False
    except:
        debug("Get Meta Counter Failed")
        return False
def getMetaPrefix(cursor):
    # Return the random filename prefix stored in the metadata row, or
    # False when the row is missing or the query fails.
    global mtaTable
    try:
        cursor.execute("SELECT fileCounter, filePrefix FROM {tn} WHERE id=1"
                       .format(tn=mtaTable))
        row = cursor.fetchone()
        return row[1] if row else False
    except:
        return False
def setMetaCounter(cursor, conn):
    # Bump the persistent file counter by one and commit; True on success,
    # False on any failure.  (If the row is missing getMetaCounter returns
    # False and False + 1 == 1, matching the original's behavior.)
    global mtaTable
    try:
        newCount = getMetaCounter(cursor) + 1
        cursor.execute("UPDATE {tn} SET fileCounter = {fc} WHERE id=1"
                       .format(tn=mtaTable, fc=newCount))
        conn.commit()
        return True
    except:
        return False
def createMetaRecord(cursor, conn):
    # Seed the metadata table with its single row (id=1, counter=1 and a
    # random 3-character A-Z0-9 prefix) unless a row already exists.
    global mtaTable
    if getMetaCounter(cursor) != False:
        return True
    try:
        alphabet = string.ascii_uppercase + string.digits
        newPrefix = ''.join(random.choice(alphabet) for _ in range(3))
        cursor.execute("INSERT INTO {tn} (id, fileCounter, filePrefix) VALUES ({id},{fc},'{fp}')"
                       .format(tn=mtaTable, id=1, fc=1, fp=newPrefix))
        conn.commit()
        return True
    except:
        debug("Create MTA Record Failed")
        return False
def getNextFilename(fileExt, cursor):
    # Build the next sequential name, e.g. "PI-ABC-0000042.jpg", from the
    # module-level prefix and the persistent counter.
    global prefix
    template = "PI-{pre}-{pd:>07}"
    return template.format(pre=prefix, pd=getMetaCounter(cursor)) + fileExt
def fileSeen(cursor, tableName, fName, fHash):
    """Return True when a row with fileHash == fHash exists in tableName.

    NOTE(review): fHash is interpolated into the SQL directly; assumed to
    be a sha1 hexdigest (hex chars only), so injection-safe in practice.
    """
    query = ("SELECT fileName, fileHash FROM {tn} WHERE fileHash = '{fh}'"
             .format(tn=tableName, fh=fHash))
    cursor.execute(query)
    record = cursor.fetchone()
    return bool(record and record[1] == fHash)
def updateHashTable(cursor, conn, tableName, fName, fHash, rebuild=False):
    """Record (fileName, fileHash) in tableName and bump the counter.

    Returns the stored file name on success, True when the hash already
    exists (duplicate image, detected via the unique index on fileHash),
    or False on any other database error.  When rebuild is True the
    original basename is kept instead of a new sequential name.
    """
    try:
        bfName = os.path.basename(fName)
        if rebuild:
            newName = bfName
        else:
            newName = getNextFilename(getFileExtension(bfName), cursor)
        cursor.execute("INSERT INTO {tn} (fileName, fileHash) VALUES ('{fn}','{fh}') "
                       .format(tn=tableName, fn=newName, fh=fHash))
        conn.commit()
        setMetaCounter(cursor, conn)
        return newName
    except Exception as e:  # was Py2-only "except Exception, e" (syntax error on Py3)
        if "UNIQUE constraint failed" in str(e) and "fileHash" in str(e):
            return True
        debug(str(e))
        return False
def hash_bytestr_iter(bytesiter, hasher, ashexstr=False):
    # Feed every block from bytesiter into hasher; return the finished
    # digest (hex string when ashexstr is True, raw bytes otherwise).
    for chunk in bytesiter:
        hasher.update(chunk)
    if ashexstr:
        return hasher.hexdigest()
    return hasher.digest()
def file_as_blockiter(afile, blocksize=65536):
    # Yield successive chunks of up to blocksize bytes from afile; the
    # "with" owns and closes the handle when the generator finishes.
    with afile:
        while True:
            chunk = afile.read(blocksize)
            if not chunk:
                break
            yield chunk
def betterHash(fname):
    # SHA-1 hex digest of a file's contents, read in streaming blocks so
    # large images never need to fit in memory at once.
    stream = file_as_blockiter(open(fname, 'rb'))
    return hash_bytestr_iter(stream, hashlib.sha1(), True)
def msg(str):
    # Print a user-facing status line with the "[+]" marker.
    # Single-argument print(...) works on both Python 2 and 3 (the
    # original Py2 print statement is a syntax error on Py3).
    # NOTE(review): parameter shadows builtin "str"; name kept so any
    # keyword callers stay compatible.
    print("\t[+] " + str)
def debug(str):
    # Emit a message only when verbose mode (-v/--verbose) is enabled.
    # NOTE(review): parameter shadows builtin "str"; name kept as-is.
    global DEBUG
    if DEBUG:
        msg("DEBUG: " + str)
def copyFile(srcFile, destDir):
    """Copy srcFile into destDir (thin wrapper over shutil.copy)."""
    shutil.copy(srcFile, destDir)
def removeFile(srcFile):
    # Delete srcFile when it is a regular file; silently do nothing for
    # missing paths or directories.
    if not os.path.isfile(srcFile):
        return
    os.remove(srcFile)
def getFileExtension(fName):
    """Return fName's extension including the dot ('' when none)."""
    root, ext = os.path.splitext(fName)
    return ext
def getExifData(fName):
    # Return the raw EXIF dict for JPEG files, or the sentinel string
    # "NOEXIF" for any other format or any read/parse failure.
    ext = getFileExtension(fName).lower()
    if ext not in (".jpg", ".jpeg"):
        return "NOEXIF"
    try:
        return Image.open(fName)._getexif()
    except:
        return "NOEXIF"
def getExifField(exif, field):
    """Look up an EXIF tag value by its human-readable name.

    Returns the tag value, "NOEXIF" when exif is the sentinel or the
    lookup fails, or None when the field is simply absent (the loop
    falls off the end, matching the original behavior).
    """
    if exif == "NOEXIF":
        return exif
    try:
        # .items() works on Python 2 and 3 (was Py2-only .iteritems()).
        for k, v in exif.items():
            if TAGS.get(k) == field:
                return v
    except:
        return "NOEXIF"
def makeDirectory(dirPath):
    # Ensure dirPath exists, creating parent directories as needed.
    # True on success (or when already present), False when creation fails.
    try:
        if os.path.exists(dirPath):
            return True
        debug("Creating Directory: " + dirPath)
        os.makedirs(dirPath)
        return True
    except:
        debug("Failed Creating Directory")
        return False
def checkDirectory(dirPath):
    # Report whether dirPath exists; False (with a debug note) if the
    # existence check itself raises.
    try:
        return bool(os.path.exists(dirPath))
    except:
        debug("Error Checking Directory")
        return False
def getBestDate(exif):
    # Prefer the original capture time, fall back to the digitised time,
    # otherwise return False ("no usable EXIF date").
    for tag in ("DateTimeOriginal", "DateTimeDigitized"):
        value = getExifField(exif, tag)
        if value != False and value != None:
            return value
    return False
def processRebuild(dbCursor, dbConnection, fName):
    # Re-register an already-placed file: hash it and store its existing
    # basename in the database (no renaming, no copying).
    global imgTable
    fileHash = betterHash(fName)
    status = updateHashTable(dbCursor, dbConnection, imgTable, fName, fileHash, True)
    bfName = os.path.basename(fName)
    if status == False:
        showError("Error in Updating Database")
    elif status == True:
        # Hash already present: duplicate content.
        msg("The File already exists in the database {fn}".format(fn=bfName))
    elif status == bfName:
        msg("Hash Updated for File: " + status)
    else:
        # updateHashTable stored a name other than the basename; unexpected.
        debug("fName is: " + fName + " and status is : " + status)
        msg("Hash updated, but other error.")
def processImport(dbCursor, dbConnection, fName, destDir):
    # Import one image: register its hash, derive a dated subdirectory
    # from the EXIF timestamp ("YYYY<sep>MM", or "NOEXIF" when absent),
    # and copy the file there under its new sequential name.
    global imgTable
    fileHash = betterHash(fName)
    status = updateHashTable(dbCursor, dbConnection, imgTable, fName, fileHash, False)
    if status == False:
        showError("Error in Updating Database")
    elif status == True:
        # Duplicate content: report and skip the copy.
        msg("The File already exists in the database {fn}".format(fn=os.path.basename(fName)))
    else:
        date = getBestDate(getExifData(fName))
        if date != False and date != None:
            # EXIF dates look like "YYYY:MM:DD hh:mm:ss"; the first seven
            # chars give "YYYY:MM", converted into a path fragment.
            yrmn = date[:7].replace(":", os.sep)
        else:
            yrmn = "NOEXIF"
        newDir = destDir + os.sep + yrmn
        if makeDirectory(newDir) != False:
            newFile = newDir + os.sep + status
            debug("File will be copied to " + newFile)
            shutil.copy(fName, newFile)
def setupDatabase(dbFile):
    # Open (or create) the SQLite file and ensure both tables exist.
    # Returns (connection, cursor), or (False, False) on any failure.
    global imgTable, mtaTable
    try:
        conn = sqlite3.connect(dbFile)
        cur = conn.cursor()
        setupImageTable(cur, conn)
        setupMetaTable(cur, conn)
        return (conn, cur)
    except:
        return (False, False)
def processList(srcDir, destDir, dbConnection, dbCursor, runLimit=0, rebuild=False):
    # Walk the relevant tree (destDir when rebuilding, srcDir when
    # importing) and dispatch every file whose name matches the image
    # extension regex.  runLimit > 0 caps the number of files examined.
    global extnRegex
    baseDir = destDir if rebuild else srcDir
    count = 0
    for subdir, dirs, files in os.walk(baseDir):
        if runLimit > 0 and count > runLimit:
            break
        for entry in files:       # renamed from "file" (shadowed a builtin)
            count += 1
            if runLimit > 0 and count > runLimit:
                break
            filepath = subdir + os.sep + entry
            if not re.match(extnRegex, filepath, re.I):
                continue
            if rebuild:
                debug("Rebuild Action on : " + filepath)
                processRebuild(dbCursor, dbConnection, filepath)
            else:
                debug("Process Action on : " + filepath)
                processImport(dbCursor, dbConnection, filepath, destDir)
def showBanner(prgName):
    """Print the program name/version banner."""
    banner = "\n" + prgName + " Version 0.74 (http://www.siricon.co.uk/deduper)"
    print(banner)
def showHelp(prgName, err=""):
    # Print usage (and an optional error message) then exit with status 2.
    lines = ["Usage: " + prgName + " [options]"]
    if err != "":
        lines.append("ERROR")
        lines.append("\t " + err)
    lines.extend([
        "OPTIONS",
        "\t -h Help",
        "\t -v Verbose debugging",
        "\t -s <Source Directory>",
        "\t -d <Destination Directory>",
        "\t -l <number> Limit the Run to <number> of images",
        "\t -r Rebuild the Database",
        "LONG OPTIONS",
        "\t --srcdir=<Source Directory>",
        "\t --destdir=<Destination Directory>",
        "\t --limit=<number> Limit the Run to <number> of images",
        "\t --rebuild Rebuild the Database",
    ])
    for line in lines:
        print(line)
    sys.exit(2)
def showError(err):
    """Print a fatal error message via msg() and exit with status 2."""
    print("ERROR")
    msg(err)
    sys.exit(2)
def main(argv, prgName):
    # Entry point: parse command-line options, validate the source and
    # destination directories, open/create the SQLite database, then run
    # either the import pass or the database rebuild pass.
    global DEBUG
    global prefix
    # NOTE(review): the triple-quoted string below is dead example
    # configuration left by the author (it is not a docstring, since it
    # follows the global statements); kept verbatim.
    """
    BaseDir = '/Volumes/TOSHIBA EXT/Backups.backupdb/Gani* MacBook Pro/2015-01-12-112355/Macintosh HD/Users/gani/Pictures/iPhoto Library.photolibrary/Masters'
    DestDir = '/Users/gani/Documents/Processed-Images'
    sqlite_file = '/Users/gani/Documents/Processed-Images/imgdata.sqlite'
    DEBUG = True
    SMALLRUN = False
    SMALLRUNCOUNT = 54
    """
    DEBUG = False          # verbose output off unless -v/--verbose given
    runLimit = 0           # 0 means "no limit on files processed"
    srcDir = ''
    destDir = ''
    dbFileName = "imgdata.db"
    rebuildDB = False
    homeDir = os.path.expanduser('~')
    showBanner(prgName)
    try:
        opts, args = getopt.getopt(argv,"hvrl:s:d:", ["rebuild","srcdir=","destdir=", "limit=","verbose"])
    except getopt.GetoptError as err:
        showHelp(prgName, str(err))
    for opt, arg in opts:
        if (opt == "-h"):
            showHelp(prgName)
        if (opt in ("-s", "--srcdir")):
            srcDir = arg
        if (opt in ("-d", "--destdir")):
            destDir = arg
        if (opt in ("-l", "--limit")):
            try:
                runLimit = int(arg)
            except:
                # showError() exits, so the reset below is unreachable;
                # kept verbatim.
                showError("Limit argument not a number")
                runLimit = 0
        if (opt in ("-r", "--rebuild")):
            rebuildDB = True
        if (opt in ("-v", "--verbose")):
            DEBUG = True
    # A source directory is required unless we rebuild from destDir.
    if srcDir == "" and rebuildDB == False:
        showHelp(prgName)
    if (not checkDirectory(srcDir)) and (rebuildDB == False):
        showError("Source Directory does not Exist")
    if destDir == "":
        # Default destination lives under the user's home directory.
        destDir = homeDir + os.sep + "Documents" + os.sep + "Processed-Images"
        msg("Destination Directory not provided.")
        msg("Assumed to be " + destDir)
    if not checkDirectory(destDir):
        showError("Destination Directory does not Exist")
    if rebuildDB == True:
        msg("Database being Rebuilt from " + destDir)
        if ( srcDir != "" ):
            msg("Source Directory not processed, as rebuild action is selected")
    dbFile = destDir + os.sep + dbFileName
    (conn, dbCursor) = setupDatabase(dbFile)
    if (conn==False) or (dbCursor==False):
        showError("Database could not be created or accessed")
    # The filename prefix is read once into a module global that
    # getNextFilename() consults for every generated name.
    prefix = getMetaPrefix(dbCursor)
    processList(srcDir, destDir, conn, dbCursor, runLimit, rebuildDB)
    conn.close()
if __name__ == "__main__":
    # Script entry: pass the CLI arguments and the bare program name.
    main(sys.argv[1:], os.path.basename(sys.argv[0]))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment