@ronivaldo
Created November 25, 2019 02:23
Backup Duplicate Files
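# Scans a list of directories for media files (jpg, jpeg, mpeg, mpg, avi, mp4),
# records them in a local SQLite database through SQLAlchemy, and then compares
# the directories to find duplicates. Candidates are first grouped by size or
# by name, a hash of the first 1 KB is computed for each candidate, and only
# files that still match get a full-content hash. The reports at the end list
# duplicates inside each directory, duplicates shared between directories,
# files missing from the backup, name/hash mismatches, and empty folders.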
from __future__ import print_function
# tutorial
# https://www.pythoncentral.io/introductory-tutorial-python-sqlalchemy/
# https://chartio.com/resources/tutorials/how-to-execute-raw-sql-in-sqlalchemy/
import os
import sys
from sqlalchemy import Column, ForeignKey, Integer, String, Numeric
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import relationship
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
import hashlib
import base64
import time
Base = declarative_base()
class Directory(Base):
    __tablename__ = 'directory'
    id = Column(Integer, primary_key=True)
    path = Column(String(250), nullable=False)
class File(Base):
    __tablename__ = 'file'
    id = Column(Integer, primary_key=True)
    filename = Column(String(250))
    full_path = Column(String(250))
    dirname = Column(String(250))
    file_size = Column(Integer())
    last_access_time = Column(Integer())
    last_modification_time = Column(Integer())
    last_creation_time = Column(Integer())
    hash_full = Column(String(250))
    hash_1k = Column(String(250))
    directory_id = Column(Integer(), ForeignKey('directory.id'))
    directory = relationship(Directory)
# start from a clean database on every run
try:
    os.remove("duplicate_files.db")
except OSError:
    # the database file does not exist yet - nothing to remove
    pass
engine = create_engine('sqlite:///duplicate_files.db')
# Create all tables in the engine. This is equivalent to "Create Table"
# statements in raw SQL.
Base.metadata.create_all(engine)
DBSession = sessionmaker(bind=engine)
session = DBSession()
def get_file_details(full_path):
    result = {}
    #print(full_path)
    try:
        # if the target is a symlink (soft one), this will
        # dereference it - change the value to the actual target file
        full_path = os.path.realpath(full_path)
        result['full_path'] = full_path
        dirname = os.path.dirname(full_path)
        result['dirname'] = dirname
        file_size = os.path.getsize(full_path)
        result['file_size'] = file_size
        last_access_time = os.path.getatime(full_path)
        result['last_access_time'] = last_access_time
        last_modification_time = os.path.getmtime(full_path)
        # most important
        result['last_modification_time'] = last_modification_time
        last_creation_time = os.path.getctime(full_path)
        result['last_creation_time'] = last_creation_time
    except (OSError,):
        # not accessible (permissions, etc) - pass on
        pass
    return result
extensions = [
    'jpg',
    'jpeg',
    'mpeg',
    'mpg',
    'avi',
    'mp4'
]
# 'Z:\\_BACKUP_\\_fotos_\\2006',
paths = [
    'Z:\\_BACKUP_\\_fotos_\\2005',
    'Z:\\_BACKUP_\\_fotos_\\2006',
]
# the test paths below override the backup paths above
paths = [
    'D:\\TMP\\duplicate_center\\test1',
    'D:\\TMP\\duplicate_center\\test2',
]
path_analysis = {}
session.commit()
print('\nsearching for files in the directories...')
for path in paths:
    #path_analysis[path] = []
    new_directory = Directory(path=path)
    session.add(new_directory)
    print(path)
    for dirpath, dirnames, filenames in os.walk(path):
        #print(filenames)
        for filename in filenames:
            extension = filename.split('.')[-1].lower()
            if extension not in extensions:
                continue
            full_path = os.path.join(dirpath, filename)
            file_details = get_file_details(full_path)
            print('.', end='', flush=True)
            if file_details:
                new_file = File()
                new_file.filename = filename
                new_file.full_path = file_details.get('full_path')
                new_file.dirname = file_details.get('dirname')
                new_file.file_size = file_details.get('file_size')
                new_file.last_access_time = file_details.get('last_access_time')
                new_file.last_modification_time = file_details.get('last_modification_time')
                new_file.last_creation_time = file_details.get('last_creation_time')
                new_file.directory = new_directory
                session.add(new_file)
session.commit()
def chunk_reader(fobj, chunk_size=1024):
    """Generator that reads a file in chunks of bytes"""
    while True:
        chunk = fobj.read(chunk_size)
        if not chunk:
            return
        yield chunk

def get_hash(filename, first_chunk_only=False, hash=hashlib.sha1):
    hashobj = hash()
    file_object = open(filename, 'rb')
    if first_chunk_only:
        hashobj.update(file_object.read(1024))
    else:
        for chunk in chunk_reader(file_object):
            hashobj.update(chunk)
    hashed = hashobj.digest()
    file_object.close()
    return hashed

def localtime(unix_timestamp):
    local_time = time.localtime(unix_timestamp)
    return time.strftime("%Y-%m-%d %H:%M:%S", local_time)
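# The hashing below runs in two stages so that most files never have to be
# read in full: candidates are grouped by size (and later by name), a SHA-1
# of the first 1 KB (hash_1k) is computed for each candidate, and only the
# files that still collide on hash_1k get the full-content hash (hash_full).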
# get all files with the same size and compute hash_1k
print('\nComputing the 1K hash of files with the same size..')
with engine.connect() as con:
    rs = con.execute(
        '''
        select
            file_size,
            count(*) as file_counter
        from
            file
        where
            file_size > 0 and
            hash_1k is null
        group by
            file_size
        having count(*) > 1
        order by 1
        '''
    )
    for row in rs:
        file_size = dict(row)['file_size']
        #print('file_size={}'.format(file_size))
        files = session.query(File).filter_by(file_size=file_size).all()
        for file in files:
            #print(file.full_path)
            #print(file.file_size)
            print('.', end='', flush=True)
            try:
                hash_1k_bin = get_hash(file.full_path, first_chunk_only=True)
                file.hash_1k = base64.b64encode(hash_1k_bin)
                session.add(file)
            except (OSError,):
                # the file may have become inaccessible since it was scanned
                continue
session.commit()
# get all files with the same size and the same hash_1k and compute the full hash
print('\nComputing the full hash of files with the same size and 1K hash..')
with engine.connect() as con:
    rs = con.execute(
        '''
        select
            hash_1k,
            file_size,
            count(*) as hash_counter
        from
            file
        where
            hash_1k is not null and
            hash_full is null
        group by
            hash_1k,
            file_size
        having count(*) > 1
        order by 1
        '''
    )
    for row in rs:
        #print(row)
        hash_1k = dict(row)['hash_1k']
        file_size = dict(row)['file_size']
        #print('hash_1k={}'.format(hash_1k))
        files = session.query(File).filter_by(hash_1k=hash_1k).filter_by(file_size=file_size).all()
        for file in files:
            #print(file.full_path)
            #print(file.file_size)
            print('.', end='', flush=True)
            try:
                hash_full_bin = get_hash(file.full_path, first_chunk_only=False)
                file.hash_full = base64.b64encode(hash_full_bin)
                session.add(file)
            except (OSError,):
                # the file may have become inaccessible since it was scanned
                continue
session.commit()
# get all files with the same name and compute hash_1k
print('\nComputing the 1K hash of files with the same name..')
with engine.connect() as con:
    rs = con.execute(
        '''
        select
            filename,
            count(*) as file_counter
        from
            file
        where
            file_size > 0 and
            hash_1k is null
        group by
            filename
        having count(*) > 1
        order by 1
        '''
    )
    for row in rs:
        filename = dict(row)['filename']
        files = session.query(File).filter_by(filename=filename).all()
        for file in files:
            print('.', end='', flush=True)
            try:
                hash_1k_bin = get_hash(file.full_path, first_chunk_only=True)
                file.hash_1k = base64.b64encode(hash_1k_bin)
                session.add(file)
            except (OSError,):
                # the file may have become inaccessible since it was scanned
                continue
session.commit()
# get all files with the same name and the same hash_1k and compute the full hash
print('\nComputing the full hash of files with the same name and 1K hash..')
with engine.connect() as con:
    rs = con.execute(
        '''
        select
            hash_1k,
            filename,
            count(*) as hash_counter
        from
            file
        where
            hash_1k is not null and
            hash_full is null
        group by
            hash_1k,
            filename
        having count(*) > 1
        order by 1
        '''
    )
    for row in rs:
        #print(row)
        hash_1k = dict(row)['hash_1k']
        filename = dict(row)['filename']
        #print('hash_1k={}'.format(hash_1k))
        files = session.query(File).filter_by(hash_1k=hash_1k).filter_by(filename=filename).all()
        for file in files:
            #print(file.full_path)
            #print(file.file_size)
            print('.', end='', flush=True)
            try:
                hash_full_bin = get_hash(file.full_path, first_chunk_only=False)
                file.hash_full = base64.b64encode(hash_full_bin)
                session.add(file)
            except (OSError,):
                # the file may have become inaccessible since it was scanned
                continue
session.commit()
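# All hashes are stored at this point; the queries below only read the
# database to produce the reports: duplicates inside each directory,
# duplicates present in both directories, files missing from the backup,
# same name with different content (and the reverse), newer copies in the
# current folder, and empty directories.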
# duplicate files within each directory
print('\nDuplicate files within each directory..')
directories = session.query(Directory).all()
for directory in directories:
    with engine.connect() as con:
        rs = con.execute(
            '''
            select
                hash_full,
                count(*)
            from
                file f
            where
                directory_id = {} and
                hash_full is not null and
                not exists (
                    select
                        *
                    from
                        file
                    where
                        directory_id <> f.directory_id and
                        hash_full = f.hash_full
                )
            group by
                hash_full
            having count(*) > 1
            order by 1
            '''.format(directory.id)
        )
        rows = rs.fetchall()
        if len(rows) > 0:
            print('----- duplicates ---------')
            print(directory.path)
            print('--------------------------')
        for row in rows:
            hash_full = dict(row)['hash_full']
            files = session.query(
                File
            ).filter_by(
                directory=directory
            ).filter_by(
                hash_full=hash_full
            ).order_by(
                File.last_creation_time.desc(),
                File.last_modification_time.desc(),
                File.last_access_time.desc(),
            ).all()
            print('{} files'.format(len(files)))
            for file in files:
                print('--')
                print(file.full_path)
                print(localtime(file.last_creation_time))
                print(localtime(file.last_modification_time))
                print(localtime(file.last_access_time))
# duplicate files present in both directories
print('\nDuplicate files present in both directories..')
directory = session.query(Directory).first()
if directory:
    with engine.connect() as con:
        rs = con.execute(
            '''
            select
                hash_full,
                count(*)
            from
                file f
            where
                directory_id = {} and
                hash_full is not null and
                exists (
                    select
                        *
                    from
                        file
                    where
                        directory_id <> f.directory_id and
                        hash_full = f.hash_full
                )
            group by
                hash_full
            having count(*) > 0
            order by 1
            '''.format(directory.id)
        )
        rows = rs.fetchall()
        if len(rows) > 0:
            print('----- duplicates ---------')
            print('--------------------------')
        for row in rows:
            hash_full = dict(row)['hash_full']
            files = session.query(
                File
            ).filter_by(
                hash_full=hash_full
            ).order_by(
                File.last_creation_time.desc(),
                File.last_modification_time.desc(),
                File.last_access_time.desc(),
            ).all()
            print('{} files'.format(len(files)))
            for file in files:
                print('--')
                print(file.full_path)
                print(localtime(file.last_creation_time))
                print(localtime(file.last_modification_time))
                print(localtime(file.last_access_time))
# get the files that are in the current folder but missing from the backup
print('\nFiles that are in the current folder but missing from the backup..')
directory = session.query(Directory).first()
if directory:
    with engine.connect() as con:
        rs = con.execute(
            '''
            select
                distinct filename
            from
                file f
            where
                directory_id = {} and
                not exists (
                    select
                        *
                    from
                        file
                    where
                        directory_id <> f.directory_id and
                        filename = f.filename
                )
            order by filename
            '''.format(directory.id)
        )
        rows = rs.fetchall()
        if len(rows) > 0:
            print('----- missing from the backup ---------')
            print('---------------------------------------')
        for row in rows:
            filename = dict(row)['filename']
            files = session.query(
                File
            ).filter_by(
                directory=directory
            ).filter_by(
                filename=filename
            ).order_by(
                File.dirname.asc()
            ).all()
            print('{} files'.format(len(files)))
            for file in files:
                print('--')
                print(file.full_path)
                print(localtime(file.last_creation_time))
                print(localtime(file.last_modification_time))
                print(localtime(file.last_access_time))
# files with the same name but different hashes
print('\nFiles with the same name but different hashes..')
directories = session.query(Directory).all()
for directory in directories:
    with engine.connect() as con:
        rs = con.execute(
            '''
            select distinct
                file1.filename,
                file1.hash_full
            from
                file as file1,
                file as file2
            where
                file1.directory_id = {} and
                file2.directory_id = file1.directory_id and
                file1.filename = file2.filename and
                file1.hash_full <> file2.hash_full
            '''.format(directory.id)
        )
        rows = rs.fetchall()
        if len(rows) > 0:
            print('----- same name, different hashes ---------')
            print(directory.path)
            print('-------------------------------------------')
        for row in rows:
            filename = dict(row)['filename']
            hash_full = dict(row)['hash_full']
            files = session.query(
                File
            ).filter_by(
                directory=directory
            ).filter_by(
                filename=filename
            ).filter_by(
                hash_full=hash_full
            ).order_by(
                File.dirname.asc()
            ).all()
            for file in files:
                print('--')
                print(file.full_path)
                print(localtime(file.last_creation_time))
                print(localtime(file.last_modification_time))
                print(localtime(file.last_access_time))
# files with the same hash but different names
print('\nFiles with the same hash but different names..')
directories = session.query(Directory).all()
for directory in directories:
    with engine.connect() as con:
        rs = con.execute(
            '''
            select
                hash_full,
                count(*)
            from
                file f
            where
                directory_id = {} and
                hash_full is not null and
                exists (
                    select
                        *
                    from
                        file
                    where
                        directory_id = f.directory_id and
                        hash_full = f.hash_full and
                        filename <> f.filename
                )
            group by
                hash_full
            having count(*) > 1
            order by 1
            '''.format(directory.id)
        )
        rows = rs.fetchall()
        if len(rows) > 0:
            print('----- same hash, different names ---------')
            print(directory.path)
            print('-------------------------------------------')
        for row in rows:
            hash_full = dict(row)['hash_full']
            files = session.query(
                File
            ).filter_by(
                directory=directory
            ).filter_by(
                hash_full=hash_full
            ).order_by(
                File.last_creation_time.desc(),
                File.last_modification_time.desc(),
                File.last_access_time.desc(),
            ).all()
            print('{} files'.format(len(files)))
            for file in files:
                print('--')
                print(file.full_path)
                print(localtime(file.last_creation_time))
                print(localtime(file.last_modification_time))
                print(localtime(file.last_access_time))
# files that are newer in the current folder (the backup holds an older copy) with the same name??
print('\nFiles that are newer in the current folder (the backup holds an older copy) with the same name..')
directory = session.query(Directory).first()
if directory:
    with engine.connect() as con:
        rs = con.execute(
            '''
            select distinct
                file1.full_path,
                file1.hash_full
            from
                file as file1,
                file as file2
            where
                file1.directory_id = {} and
                file2.directory_id <> file1.directory_id and
                file1.filename = file2.filename and
                (
                    file1.hash_1k <> file2.hash_1k or
                    file1.hash_full <> file2.hash_full
                ) and
                file1.last_modification_time > file2.last_modification_time
            '''.format(directory.id)
        )
        rows = rs.fetchall()
        if len(rows) > 0:
            print('----- newer files in the folder ---------')
            print(directory.path)
            print('-------------------------------------------')
        for row in rows:
            full_path = dict(row)['full_path']
            hash_full = dict(row)['hash_full']
            files = session.query(
                File
            ).filter_by(
                directory=directory
            ).filter_by(
                full_path=full_path
            ).filter_by(
                hash_full=hash_full
            ).order_by(
                File.dirname.asc()
            ).all()
            for file in files:
                print('--')
                print(file.full_path)
                print(localtime(file.last_creation_time))
                print(localtime(file.last_modification_time))
                print(localtime(file.last_access_time))
# empty directories??
print('\nempty directories..')
directories = session.query(Directory).all()
for directory in directories:
    print('----- empty directories ---------')
    print(directory.path)
    print('---------------------------------')
    path = directory.path
    for dirpath, dirnames, filenames in os.walk(path):
        if len(filenames) == 0:
            print(dirpath)
# delete by copying to a delete-me folder
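# A possible implementation of the step above, not part of the original
# script: instead of deleting duplicates outright, move them into a
# "delete-me" folder so they can be reviewed before being removed for good.
# The function name and the delete_me_root default are illustrative
# assumptions; nothing in the script calls this function yet.
def move_to_delete_me(file_path, delete_me_root='D:\\TMP\\duplicate_center\\delete-me'):
    import shutil
    # rebuild the original directory layout (minus the drive letter) under
    # the delete-me folder so moved files do not collide with each other
    relative = os.path.splitdrive(file_path)[1].lstrip('\\/')
    target = os.path.join(delete_me_root, relative)
    os.makedirs(os.path.dirname(target), exist_ok=True)
    shutil.move(file_path, target)
    return target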