#!/usr/bin/env python
# Copyright 2019 ARC Centre of Excellence for Climate Extremes
# author: Scott Wales <scott.wales@unimelb.edu.au>
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import os
import stat
import sqlalchemy as sa
import sqlalchemy.sql.functions as safunc
import tqdm
import sys
import pandas
import pwd
import grp
import datetime
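
# Usage (the script file name 'sqldu.py' is assumed here, it is not given in the source):
#
#     python sqldu.py /path/to/scan
#
# The script walks the given directory, stores per-file metadata in a local
# sqlite database ('sqldu.sqlite'), and prints a per-user/per-group summary of
# total and "old" disk usage.
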
def walk(path, parent_inode=None, progress=None):
    """
    Descend a directory, constructing a list of metadata for each file found
    """
    # Find the parent inode if not supplied
    if parent_inode is None:
        parent_inode = os.stat(path).st_ino

    records = []
    # Loop over each entry in the directory, adding it to the results list
    for entry in os.scandir(path):
        st = entry.stat(follow_symlinks=False)
        records.append({
            'name': entry.name,
            'inode': st.st_ino,
            'size': st.st_size,
            'mtime': st.st_mtime,
            'parent_inode': parent_inode,
            'uid': st.st_uid,
            'gid': st.st_gid,
        })

        # Recurse into directories (symlinks are not followed)
        if entry.is_dir(follow_symlinks=False):
            try:
                records.extend(walk(entry.path, parent_inode=st.st_ino, progress=progress))
            except FileNotFoundError:
                # The directory vanished between listing and recursing; skip it
                pass

    # Update the progress bar once per directory scanned
    if progress is not None:
        progress.update(1)

    # Return metadata of all files under 'path'
    return records

# Scan the tree given on the command line, updating the progress bar as each
# directory is completed
with tqdm.tqdm(desc="Directories Scanned") as pbar:
    records = walk(sys.argv[1], progress=pbar)

# Set up a sqlite database to query the results
engine = sa.create_engine('sqlite:///sqldu.sqlite', echo=False)
metadata = sa.MetaData()
paths = sa.Table('paths', metadata,
                 sa.Column('id', sa.Integer, primary_key=True),
                 sa.Column('name', sa.String),
                 sa.Column('inode', sa.Integer),
                 sa.Column('size', sa.Integer),
                 sa.Column('mtime', sa.Integer),
                 sa.Column('parent_inode', sa.Integer),
                 sa.Column('uid', sa.Integer),
                 sa.Column('gid', sa.Integer),
                 )

# Start from a clean table, then bulk-insert the scan results
metadata.drop_all(engine)
metadata.create_all(engine)
conn = engine.connect()
conn.execute(paths.insert(), records)

# Calculate the POSIX timestamp for "old" files
old_threshold = (pandas.Timestamp.now() - pandas.offsets.MonthBegin(6))
print(f'Considering files modified before {old_threshold} as old')
old_threshold = int(old_threshold.timestamp())
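# For example, run on 2019-02-22 the MonthBegin(6) offset rolls the timestamp
# back to 2018-09-01 00:00, i.e. the start of the month six month-boundaries
# earlier (an illustrative date, not output from an actual run).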

# Create some queries:
# All files, grouped by owner and group
total = (sa.sql.select([paths.c.uid, paths.c.gid,
                        safunc.sum(paths.c.size).label('total_size'),
                        safunc.count().label('total_inodes')])
         .group_by(paths.c.uid, paths.c.gid))
# Files older than 'old_threshold', grouped the same way
old = (sa.sql.select([paths.c.uid.label('ouid'), paths.c.gid.label('ogid'),
                      safunc.sum(paths.c.size).label('old_size'),
                      safunc.count().label('old_inodes')])
       .where(paths.c.mtime < old_threshold)
       .group_by(paths.c.uid, paths.c.gid))
# Outer join so users with no old files still appear in the totals
joined_tables = sa.sql.join(total, old,
                            sa.and_(total.c.uid == old.c.ouid, total.c.gid == old.c.ogid),
                            isouter=True)

# Convert to a Pandas dataframe for post-processing and tidying up
df = pandas.read_sql(
    sa.sql.select([total.c.uid, total.c.gid, total.c.total_size, total.c.total_inodes,
                   old.c.old_size, old.c.old_inodes])
    .select_from(joined_tables),
    conn)
# Map numeric uid/gid to readable owner and group names
df['name'] = df['uid'].apply(lambda u: pwd.getpwuid(u).pw_gecos)
df['group'] = df['gid'].apply(lambda g: grp.getgrgid(g).gr_name)

# Convert byte totals to GB and compute the fraction of data that is "old"
df['total size (GB)'] = df['total_size'] / 1024**3
df['old size (GB)'] = df['old_size'] / 1024**3
df['old fraction'] = df['old_size'] / df['total_size']

# Print output
print(df[['name','group','total size (GB)','total_inodes', 'old size (GB)', 'old_inodes', 'old fraction']])
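
# Example of re-querying the saved database later (an illustrative sketch, not
# part of the original workflow; it only assumes 'sqldu.sqlite' exists in the
# working directory with the 'paths' table populated above):
#
#     import pandas
#     import sqlalchemy as sa
#     engine = sa.create_engine('sqlite:///sqldu.sqlite')
#     top = pandas.read_sql(
#         'SELECT uid, SUM(size) AS bytes FROM paths GROUP BY uid ORDER BY bytes DESC',
#         engine)
#     print(top.head(10))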