# Source: GitHub Gist kitroed/9cccd7a4fda1ea16d766
# Author: @kitroed
# Created: February 20, 2016 14:36
# The idea here is to throw together a quick SQLite database that stores
# the MD5 hash (plus basic metadata) of every file found under a given
# directory tree.
import hashlib
import os
import time
import datetime
import socket
from sqlalchemy import create_engine
from sqlalchemy import Column, Integer, String, DateTime, Boolean, MetaData
from sqlalchemy.orm import sessionmaker
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import relationship, backref
from sqlalchemy.sql.schema import ForeignKey
Base = declarative_base()


# Should make a Host class?
class File(Base):
    """ORM model for one scanned file, stored in the ``files`` table.

    ``full_path`` is the sole primary key, so merging a record whose path
    already exists updates that row instead of inserting a new one.

    NOTE(review): ``host`` is not part of the key, so the same path scanned
    on two different hosts maps to the same row -- confirm this is intended.
    """

    __tablename__ = 'files'

    # Name of the machine the file was scanned on (required).
    host = Column(String(50), nullable=False)
    # Full path of the file; primary key of the table.
    full_path = Column(String, primary_key=True)
    # Hex MD5 digest of the file contents (32 hex characters, required).
    md5_hash = Column(String(32), nullable=False)
    # Directory that contains the file.
    path = Column(String)
    # File size as reported by the filesystem.
    size = Column(Integer)
    # File name without the directory part.
    filename = Column(String)
    # Extension as returned by os.path.splitext (includes the leading dot).
    extension = Column(String)
    # Filesystem modification and creation timestamps.
    modified = Column(DateTime)
    created = Column(DateTime)
    # Whether the file could be read when it was scanned.
    can_read = Column(Boolean)
    # When this row was last refreshed by a scan.
    last_checked = Column(DateTime)

    def __repr__(self):
        """Return a short debug representation with file name and hash."""
        return "<File(Filename='%s' Hash='%s')>" % (self.filename, self.md5_hash)
################################################################################
def _md5_of_file(file_path, chunk_size=1024 * 1024):
    """Return the hex MD5 digest of the file at ``file_path``.

    The file is read in pieces of ``chunk_size`` bytes so that large files
    are never loaded into memory in full, and the ``with`` block closes the
    handle even when a read fails.

    Raises:
        OSError: if the file cannot be opened or read.
    """
    digest = hashlib.md5()
    with open(file_path, 'rb') as handle:
        for chunk in iter(lambda: handle.read(chunk_size), b''):
            digest.update(chunk)
    return digest.hexdigest()


# The SQLite database file is created next to this script.
basedir = os.path.abspath(os.path.dirname(__file__))
engine = create_engine('sqlite:///' + os.path.join(basedir, 'filehashdata.sqlite'),
                       echo=False)
Base.metadata.create_all(engine)

Session = sessionmaker()
Session.configure(bind=engine)
session = Session()

# Root of the directory tree to scan.
path = 'D:\\'
hostname = socket.gethostname()

for dir_path, dir_names, file_names in os.walk(path):
    for file_name in file_names:
        full_path = os.path.join(dir_path, file_name)
        try:
            # A single stat() call supplies the size and both timestamps.
            stat_result = os.stat(full_path)
            modified = datetime.datetime.fromtimestamp(stat_result.st_mtime)
            # NOTE(review): st_ctime is creation time on Windows only; on
            # Unix it is the last metadata change -- confirm if the scan
            # root ever moves off Windows.
            created = datetime.datetime.fromtimestamp(stat_result.st_ctime)
            md5_hash = _md5_of_file(full_path)
        except OSError:
            # PermissionError and FileNotFoundError are OSError subclasses.
            # Files that cannot be read are skipped and get no row, because
            # md5_hash is a required column.
            print("Permission or FileNotFound error when hashing %s" % full_path)
            continue

        file = File(
            full_path=full_path,
            host=hostname,
            path=dir_path,
            filename=file_name,
            extension=os.path.splitext(file_name)[1],
            size=stat_result.st_size,
            modified=modified,
            created=created,
            md5_hash=md5_hash,
            last_checked=datetime.datetime.now(),
            can_read=True,
        )
        print(file)

        # Save info to the database. We use merge since the path is unique:
        # an existing row for the same full_path is updated in place.
        session.merge(file)

    # Commit once per directory so an interrupted scan keeps the rows
    # collected so far without paying for a transaction per file.
    session.commit()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment