@ronivaldo
Created November 25, 2019 02:23
Backup Duplicate Files
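# Scans a list of directories for media files (jpg, jpeg, mpeg, mpg, avi, mp4),
# records them in a local SQLite database through SQLAlchemy, and then compares
# the directories to find duplicates. Candidates are first grouped by size or
# by name, a hash of the first 1 KB is computed for each candidate, and only
# files that still match get a full-content hash. The reports at the end list
# duplicates inside each directory, duplicates shared between directories,
# files missing from the backup, name/hash mismatches, and empty folders.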
from __future__ import print_function
# tutorial
# https://www.pythoncentral.io/introductory-tutorial-python-sqlalchemy/
# https://chartio.com/resources/tutorials/how-to-execute-raw-sql-in-sqlalchemy/
import os
import sys
from sqlalchemy import Column, ForeignKey, Integer, String, Numeric
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import relationship
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
import hashlib
import base64
import time
Base = declarative_base()
class Directory(Base):
    __tablename__ = 'directory'
    id = Column(Integer, primary_key=True)
    path = Column(String(250), nullable=False)
class File(Base):
    __tablename__ = 'file'
    id = Column(Integer, primary_key=True)
    filename = Column(String(250))
    full_path = Column(String(250))
    dirname = Column(String(250))
    file_size = Column(Integer())
    last_access_time = Column(Integer())
    last_modification_time = Column(Integer())
    last_creation_time = Column(Integer())
    hash_full = Column(String(250))
    hash_1k = Column(String(250))
    directory_id = Column(Integer(), ForeignKey('directory.id'))
    directory = relationship(Directory)
# start from a clean database on every run
try:
    os.remove("duplicate_files.db")
except OSError:
    # the database file does not exist yet - nothing to remove
    pass
engine = create_engine('sqlite:///duplicate_files.db')
# Create all tables in the engine. This is equivalent to "Create Table"
# statements in raw SQL.
Base.metadata.create_all(engine)
DBSession = sessionmaker(bind=engine)
session = DBSession()
def get_file_details(full_path):
    result = {}
    #print(full_path)
    try:
        # if the target is a symlink (soft one), this will
        # dereference it - change the value to the actual target file
        full_path = os.path.realpath(full_path)
        result['full_path'] = full_path
        dirname = os.path.dirname(full_path)
        result['dirname'] = dirname
        file_size = os.path.getsize(full_path)
        result['file_size'] = file_size
        last_access_time = os.path.getatime(full_path)
        result['last_access_time'] = last_access_time
        last_modification_time = os.path.getmtime(full_path)
        # most important
        result['last_modification_time'] = last_modification_time
        last_creation_time = os.path.getctime(full_path)
        result['last_creation_time'] = last_creation_time
    except (OSError,):
        # not accessible (permissions, etc) - pass on
        pass
    return result
extensions = [
    'jpg',
    'jpeg',
    'mpeg',
    'mpg',
    'avi',
    'mp4'
]
# 'Z:\\_BACKUP_\\_fotos_\\2006',
paths = [
    'Z:\\_BACKUP_\\_fotos_\\2005',
    'Z:\\_BACKUP_\\_fotos_\\2006',
]
# the test paths below override the backup paths above
paths = [
    'D:\\TMP\\duplicate_center\\test1',
    'D:\\TMP\\duplicate_center\\test2',
]
path_analysis = {}
session.commit()
print('\nsearching for files in the directories...')
for path in paths:
    #path_analysis[path] = []
    new_directory = Directory(path=path)
    session.add(new_directory)
    print(path)
    for dirpath, dirnames, filenames in os.walk(path):
        #print(filenames)
        for filename in filenames:
            extension = filename.split('.')[-1].lower()
            if extension not in extensions:
                continue
            full_path = os.path.join(dirpath, filename)
            file_details = get_file_details(full_path)
            print('.', end='', flush=True)
            if file_details:
                new_file = File()
                new_file.filename = filename
                new_file.full_path = file_details.get('full_path')
                new_file.dirname = file_details.get('dirname')
                new_file.file_size = file_details.get('file_size')
                new_file.last_access_time = file_details.get('last_access_time')
                new_file.last_modification_time = file_details.get('last_modification_time')
                new_file.last_creation_time = file_details.get('last_creation_time')
                new_file.directory = new_directory
                session.add(new_file)
session.commit()
def chunk_reader(fobj, chunk_size=1024):
    """Generator that reads a file in chunks of bytes"""
    while True:
        chunk = fobj.read(chunk_size)
        if not chunk:
            return
        yield chunk

def get_hash(filename, first_chunk_only=False, hash=hashlib.sha1):
    hashobj = hash()
    file_object = open(filename, 'rb')
    if first_chunk_only:
        hashobj.update(file_object.read(1024))
    else:
        for chunk in chunk_reader(file_object):
            hashobj.update(chunk)
    hashed = hashobj.digest()
    file_object.close()
    return hashed

def localtime(unix_timestamp):
    local_time = time.localtime(unix_timestamp)
    return time.strftime("%Y-%m-%d %H:%M:%S", local_time)
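# The hashing below runs in two stages so that most files never have to be
# read in full: candidates are grouped by size (and later by name), a SHA-1
# of the first 1 KB (hash_1k) is computed for each candidate, and only the
# files that still collide on hash_1k get the full-content hash (hash_full).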
# get all files with the same size and compute hash_1k
print('\nComputing the 1K hash of files with the same size..')
with engine.connect() as con:
    rs = con.execute(
        '''
        select
            file_size,
            count(*) as file_counter
        from
            file
        where
            file_size > 0 and
            hash_1k is null
        group by
            file_size
        having count(*) > 1
        order by 1
        '''
    )
    for row in rs:
        file_size = dict(row)['file_size']
        #print('file_size={}'.format(file_size))
        files = session.query(File).filter_by(file_size=file_size).all()
        for file in files:
            #print(file.full_path)
            #print(file.file_size)
            print('.', end='', flush=True)
            try:
                hash_1k_bin = get_hash(file.full_path, first_chunk_only=True)
                file.hash_1k = base64.b64encode(hash_1k_bin)
                session.add(file)
            except (OSError,):
                # the file may have become inaccessible since it was scanned
                continue
session.commit()
# get all files with the same size and the same hash_1k and compute the full hash
print('\nComputing the full hash of files with the same size and 1K hash..')
with engine.connect() as con:
    rs = con.execute(
        '''
        select
            hash_1k,
            file_size,
            count(*) as hash_counter
        from
            file
        where
            hash_1k is not null and
            hash_full is null
        group by
            hash_1k,
            file_size
        having count(*) > 1
        order by 1
        '''
    )
    for row in rs:
        #print(row)
        hash_1k = dict(row)['hash_1k']
        file_size = dict(row)['file_size']
        #print('hash_1k={}'.format(hash_1k))
        files = session.query(File).filter_by(hash_1k=hash_1k).filter_by(file_size=file_size).all()
        for file in files:
            #print(file.full_path)
            #print(file.file_size)
            print('.', end='', flush=True)
            try:
                hash_full_bin = get_hash(file.full_path, first_chunk_only=False)
                file.hash_full = base64.b64encode(hash_full_bin)
                session.add(file)
            except (OSError,):
                # the file may have become inaccessible since it was scanned
                continue
session.commit()
# get all files with the same name and compute hash_1k
print('\nComputing the 1K hash of files with the same name..')
with engine.connect() as con:
    rs = con.execute(
        '''
        select
            filename,
            count(*) as file_counter
        from
            file
        where
            file_size > 0 and
            hash_1k is null
        group by
            filename
        having count(*) > 1
        order by 1
        '''
    )
    for row in rs:
        filename = dict(row)['filename']
        files = session.query(File).filter_by(filename=filename).all()
        for file in files:
            print('.', end='', flush=True)
            try:
                hash_1k_bin = get_hash(file.full_path, first_chunk_only=True)
                file.hash_1k = base64.b64encode(hash_1k_bin)
                session.add(file)
            except (OSError,):
                # the file may have become inaccessible since it was scanned
                continue
session.commit()
# get all files with the same name and the same hash_1k and compute the full hash
print('\nComputing the full hash of files with the same name and 1K hash..')
with engine.connect() as con:
    rs = con.execute(
        '''
        select
            hash_1k,
            filename,
            count(*) as hash_counter
        from
            file
        where
            hash_1k is not null and
            hash_full is null
        group by
            hash_1k,
            filename
        having count(*) > 1
        order by 1
        '''
    )
    for row in rs:
        #print(row)
        hash_1k = dict(row)['hash_1k']
        filename = dict(row)['filename']
        #print('hash_1k={}'.format(hash_1k))
        files = session.query(File).filter_by(hash_1k=hash_1k).filter_by(filename=filename).all()
        for file in files:
            #print(file.full_path)
            #print(file.file_size)
            print('.', end='', flush=True)
            try:
                hash_full_bin = get_hash(file.full_path, first_chunk_only=False)
                file.hash_full = base64.b64encode(hash_full_bin)
                session.add(file)
            except (OSError,):
                # the file may have become inaccessible since it was scanned
                continue
session.commit()
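# All hashes are stored at this point; the queries below only read the
# database to produce the reports: duplicates inside each directory,
# duplicates present in both directories, files missing from the backup,
# same name with different content (and the reverse), newer copies in the
# current folder, and empty directories.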
# duplicate files within each directory
print('\nDuplicate files within each directory..')
directories = session.query(Directory).all()
for directory in directories:
    with engine.connect() as con:
        rs = con.execute(
            '''
            select
                hash_full,
                count(*)
            from
                file f
            where
                directory_id = {} and
                hash_full is not null and
                not exists (
                    select
                        *
                    from
                        file
                    where
                        directory_id <> f.directory_id and
                        hash_full = f.hash_full
                )
            group by
                hash_full
            having count(*) > 1
            order by 1
            '''.format(directory.id)
        )
        rows = rs.fetchall()
        if len(rows) > 0:
            print('----- duplicates ---------')
            print(directory.path)
            print('--------------------------')
        for row in rows:
            hash_full = dict(row)['hash_full']
            files = session.query(
                File
            ).filter_by(
                directory=directory
            ).filter_by(
                hash_full=hash_full
            ).order_by(
                File.last_creation_time.desc(),
                File.last_modification_time.desc(),
                File.last_access_time.desc(),
            ).all()
            print('{} files'.format(len(files)))
            for file in files:
                print('--')
                print(file.full_path)
                print(localtime(file.last_creation_time))
                print(localtime(file.last_modification_time))
                print(localtime(file.last_access_time))
# duplicate files present in both directories
print('\nDuplicate files present in both directories..')
directory = session.query(Directory).first()
if directory:
    with engine.connect() as con:
        rs = con.execute(
            '''
            select
                hash_full,
                count(*)
            from
                file f
            where
                directory_id = {} and
                hash_full is not null and
                exists (
                    select
                        *
                    from
                        file
                    where
                        directory_id <> f.directory_id and
                        hash_full = f.hash_full
                )
            group by
                hash_full
            having count(*) > 0
            order by 1
            '''.format(directory.id)
        )
        rows = rs.fetchall()
        if len(rows) > 0:
            print('----- duplicates ---------')
            print('--------------------------')
        for row in rows:
            hash_full = dict(row)['hash_full']
            files = session.query(
                File
            ).filter_by(
                hash_full=hash_full
            ).order_by(
                File.last_creation_time.desc(),
                File.last_modification_time.desc(),
                File.last_access_time.desc(),
            ).all()
            print('{} files'.format(len(files)))
            for file in files:
                print('--')
                print(file.full_path)
                print(localtime(file.last_creation_time))
                print(localtime(file.last_modification_time))
                print(localtime(file.last_access_time))
# get the files that are in the current folder but missing from the backup
print('\nFiles that are in the current folder but missing from the backup..')
directory = session.query(Directory).first()
if directory:
    with engine.connect() as con:
        rs = con.execute(
            '''
            select
                distinct filename
            from
                file f
            where
                directory_id = {} and
                not exists (
                    select
                        *
                    from
                        file
                    where
                        directory_id <> f.directory_id and
                        filename = f.filename
                )
            order by filename
            '''.format(directory.id)
        )
        rows = rs.fetchall()
        if len(rows) > 0:
            print('----- missing from the backup ---------')
            print('---------------------------------------')
        for row in rows:
            filename = dict(row)['filename']
            files = session.query(
                File
            ).filter_by(
                directory=directory
            ).filter_by(
                filename=filename
            ).order_by(
                File.dirname.asc()
            ).all()
            print('{} files'.format(len(files)))
            for file in files:
                print('--')
                print(file.full_path)
                print(localtime(file.last_creation_time))
                print(localtime(file.last_modification_time))
                print(localtime(file.last_access_time))
# files with the same name but different hashes
print('\nFiles with the same name but different hashes..')
directories = session.query(Directory).all()
for directory in directories:
    with engine.connect() as con:
        rs = con.execute(
            '''
            select distinct
                file1.filename,
                file1.hash_full
            from
                file as file1,
                file as file2
            where
                file1.directory_id = {} and
                file2.directory_id = file1.directory_id and
                file1.filename = file2.filename and
                file1.hash_full <> file2.hash_full
            '''.format(directory.id)
        )
        rows = rs.fetchall()
        if len(rows) > 0:
            print('----- same name, different hashes ---------')
            print(directory.path)
            print('-------------------------------------------')
        for row in rows:
            filename = dict(row)['filename']
            hash_full = dict(row)['hash_full']
            files = session.query(
                File
            ).filter_by(
                directory=directory
            ).filter_by(
                filename=filename
            ).filter_by(
                hash_full=hash_full
            ).order_by(
                File.dirname.asc()
            ).all()
            for file in files:
                print('--')
                print(file.full_path)
                print(localtime(file.last_creation_time))
                print(localtime(file.last_modification_time))
                print(localtime(file.last_access_time))
# files with the same hash but different names
print('\nFiles with the same hash but different names..')
directories = session.query(Directory).all()
for directory in directories:
    with engine.connect() as con:
        rs = con.execute(
            '''
            select
                hash_full,
                count(*)
            from
                file f
            where
                directory_id = {} and
                hash_full is not null and
                exists (
                    select
                        *
                    from
                        file
                    where
                        directory_id = f.directory_id and
                        hash_full = f.hash_full and
                        filename <> f.filename
                )
            group by
                hash_full
            having count(*) > 1
            order by 1
            '''.format(directory.id)
        )
        rows = rs.fetchall()
        if len(rows) > 0:
            print('----- same hash, different names ---------')
            print(directory.path)
            print('-------------------------------------------')
        for row in rows:
            hash_full = dict(row)['hash_full']
            files = session.query(
                File
            ).filter_by(
                directory=directory
            ).filter_by(
                hash_full=hash_full
            ).order_by(
                File.last_creation_time.desc(),
                File.last_modification_time.desc(),
                File.last_access_time.desc(),
            ).all()
            print('{} files'.format(len(files)))
            for file in files:
                print('--')
                print(file.full_path)
                print(localtime(file.last_creation_time))
                print(localtime(file.last_modification_time))
                print(localtime(file.last_access_time))
# files that are newer in the current folder (the backup holds an older copy) with the same name??
print('\nFiles that are newer in the current folder (the backup holds an older copy) with the same name..')
directory = session.query(Directory).first()
if directory:
    with engine.connect() as con:
        rs = con.execute(
            '''
            select distinct
                file1.full_path,
                file1.hash_full
            from
                file as file1,
                file as file2
            where
                file1.directory_id = {} and
                file2.directory_id <> file1.directory_id and
                file1.filename = file2.filename and
                (
                    file1.hash_1k <> file2.hash_1k or
                    file1.hash_full <> file2.hash_full
                ) and
                file1.last_modification_time > file2.last_modification_time
            '''.format(directory.id)
        )
        rows = rs.fetchall()
        if len(rows) > 0:
            print('----- newer files in the folder ---------')
            print(directory.path)
            print('-------------------------------------------')
        for row in rows:
            full_path = dict(row)['full_path']
            hash_full = dict(row)['hash_full']
            files = session.query(
                File
            ).filter_by(
                directory=directory
            ).filter_by(
                full_path=full_path
            ).filter_by(
                hash_full=hash_full
            ).order_by(
                File.dirname.asc()
            ).all()
            for file in files:
                print('--')
                print(file.full_path)
                print(localtime(file.last_creation_time))
                print(localtime(file.last_modification_time))
                print(localtime(file.last_access_time))
# empty directories??
print('\nempty directories..')
directories = session.query(Directory).all()
for directory in directories:
    print('----- empty directories ---------')
    print(directory.path)
    print('---------------------------------')
    path = directory.path
    for dirpath, dirnames, filenames in os.walk(path):
        if len(filenames) == 0:
            print(dirpath)
# delete by copying to a delete-me folder
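# A possible implementation of the step above, not part of the original
# script: instead of deleting duplicates outright, move them into a
# "delete-me" folder so they can be reviewed before being removed for good.
# The function name and the delete_me_root default are illustrative
# assumptions; nothing in the script calls this function yet.
def move_to_delete_me(file_path, delete_me_root='D:\\TMP\\duplicate_center\\delete-me'):
    import shutil
    # rebuild the original directory layout (minus the drive letter) under
    # the delete-me folder so moved files do not collide with each other
    relative = os.path.splitdrive(file_path)[1].lstrip('\\/')
    target = os.path.join(delete_me_root, relative)
    os.makedirs(os.path.dirname(target), exist_ok=True)
    shutil.move(file_path, target)
    return target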