Skip to content

Instantly share code, notes, and snippets.

@yarikoptic
Created May 15, 2020 16:39
Show Gist options
  • Save yarikoptic/7284b634d8ab12277a3d316a117cc0db to your computer and use it in GitHub Desktop.
Save yarikoptic/7284b634d8ab12277a3d316a117cc0db to your computer and use it in GitHub Desktop.
A quick-and-dirty helper that times the traversal of a file tree and the collection of os.stat results for its files. See https://github.com/con/pyfscacher/issues/1 for more info.
#!/usr/bin/env python
import os
from os import stat
from os.path import join, islink
from time import time
from pathlib import Path
from functools import wraps
from joblib import Parallel, delayed
def safestat(p):
    """Return the stat result for path *p* without following symlinks.

    Returns ``None`` instead of raising when *p* does not exist
    (``FileNotFoundError``); any other error propagates to the caller.
    """
    try:
        info = stat(p, follow_symlinks=False)
    except FileNotFoundError:
        info = None
    return info
def timeit(f):
    """Decorate *f* so that each call returns ``(result, elapsed_seconds)``.

    The wrapper forwards all positional and keyword arguments to *f* and
    returns a 2-tuple: whatever *f* returned, and the time the call took
    as a float number of seconds.
    """
    # perf_counter is monotonic and high-resolution, unlike wall-clock
    # time(), which can jump (NTP, manual adjustment) and yield skewed or
    # even negative intervals. Imported here so the block stays
    # self-contained without touching the file-level imports.
    from time import perf_counter

    @wraps(f)
    def wrapper(*args, **kwargs):
        started = perf_counter()
        ret = f(*args, **kwargs)
        return ret, perf_counter() - started

    return wrapper
def statfiles(dp, files):
    """Stat each name in *files* relative to the directory path *dp*.

    Returns a dict mapping every file name to the value ``safestat``
    gives for ``join(dp, name)``.
    """
    results = {}
    for name in files:
        results[name] = safestat(join(dp, name))
    return results
def statdir(d='.'):
    """Walk the tree rooted at *d* and stat the files of every directory.

    Returns a list with one ``(dirpath, {filename: stat_result})`` tuple
    per directory visited by ``os.walk(d)``, in walk order. The per-file
    values come from ``statfiles``.
    """
    return [(dp, statfiles(dp, files)) for dp, _, files in os.walk(d)]
# with file descriptor for a dir
def statfilesfd(dp, files):
    """Stat *files* relative to an open file descriptor for directory *dp*.

    The directory is opened once and every name is resolved through
    ``dir_fd``, without following symlinks. Unlike ``statfiles`` this uses
    plain ``stat``, so a missing file raises ``FileNotFoundError``.

    Returns a dict mapping each file name to its stat result.
    """
    fd = os.open(dp, os.O_RDONLY)
    try:
        return {
            val: stat(val, follow_symlinks=False, dir_fd=fd) for val in files
        }
    finally:
        # Close even when a stat call raises, so the descriptor never leaks.
        os.close(fd)
def statdirfd(d='.'):
    """Walk the tree rooted at *d*, statting files via a directory fd.

    Returns a list with one ``(dirpath, {filename: stat_result})`` tuple
    per directory visited by ``os.walk(d)``; the per-directory dict is
    produced by ``statfilesfd``.
    """
    collected = []
    for dirpath, _subdirs, filenames in os.walk(d):
        collected.append((dirpath, statfilesfd(dirpath, filenames)))
    return collected
def statdirp(d='.'):
    """Stat the tree under *d* one top-level subdirectory at a time.

    The files directly in *d* are statted first, then ``statdir`` is run
    on each immediate subdirectory that is not a symlink.

    Returns a list whose first element is the dict
    ``{d: {filename: stat_result}}`` for *d* itself, followed by the
    ``(dirpath, {filename: stat_result})`` tuples that ``statdir`` yields
    for each subdirectory.
    """
    _, dirs, files = next(os.walk(d))
    # os.walk reports bare names; join them with d so the symlink check and
    # the recursive walk refer to the right paths even when d is not the
    # current working directory.
    subdirs = [join(d, name) for name in dirs]
    # exclude symlinked dirs
    subdirs = [path for path in subdirs if not islink(path)]
    result = [{d: statfiles(d, files)}]
    for sub in subdirs:
        result.extend(statdir(sub))
    return result
def statdirp_joblib(d='.'):
    """Stat the tree under *d*, running top-level subdirectories in parallel.

    The files directly in *d* are statted in this process; ``statdir`` is
    then dispatched through joblib (``n_jobs=2``) for each immediate
    subdirectory that is not a symlink.

    Returns a list whose first element is the dict
    ``{d: {filename: stat_result}}`` for *d* itself, followed by the
    ``(dirpath, {filename: stat_result})`` tuples from each subdirectory,
    in the order the subdirectories were listed.
    """
    _, dirs, files = next(os.walk(d))
    # os.walk reports bare names; join them with d so the symlink check and
    # the per-directory walks refer to the right paths even when d is not
    # the current working directory.
    subdirs = [join(d, name) for name in dirs]
    # exclude symlinked dirs
    subdirs = [path for path in subdirs if not islink(path)]
    # now we can parallelize across dirs
    # Takes notable time!  (prefer="threads" is an untried alternative.)
    per_dir = Parallel(n_jobs=2)(delayed(statdir)(sub) for sub in subdirs)
    result = [{d: statfiles(d, files)}]
    for entries in per_dir:
        result.extend(entries)
    return result
def statunsafeplain(d='.'):
    """Stat every file under *d*, raising if a file cannot be statted.

    Returns a list with one inner list per directory visited by
    ``os.walk(d)``; each inner list holds the stat results (symlinks not
    followed) of that directory's files. Errors from ``stat``, such as
    ``FileNotFoundError`` for a file that vanished, propagate.
    """
    # Walk the requested root; the parameter used to be ignored in favour
    # of a hard-coded '.'.
    return [
        [stat(join(dp, val), follow_symlinks=False) for val in files]
        for dp, _, files in os.walk(d)
    ]
def statsafeplain(d='.'):
    """Stat every file under *d* via ``safestat``.

    Returns a list with one inner list per directory visited by
    ``os.walk(d)``; each inner list holds what ``safestat`` returned for
    that directory's files.
    """
    # Walk the requested root; the parameter used to be ignored in favour
    # of a hard-coded '.'.
    return [
        [safestat(join(dp, val)) for val in files]
        for dp, _, files in os.walk(d)
    ]
import sys
# Manual toggle: the first branch times one implementation twice (cold, then
# warm); switch the condition to False to compare statdir against statdirp.
if True:
    # cold/warm
    if len(sys.argv) > 1:
        func_name = sys.argv[1]
        print(f"Using {func_name}")
        # Look the function up by name among the module-level definitions
        # and fail with a readable message instead of a bare KeyError.
        func = globals().get(func_name)
        if not callable(func):
            sys.exit(f"Unknown function {func_name!r}")
    else:
        func = statdirp
    f = timeit(func)
    dfiles1, dt1 = f()
    dfiles, dt2 = f()
    # Sanity check: both runs must have collected identical results.
    assert dfiles1 == dfiles
else:
    # verifying that statdir and statdirp return the same
    dfiles1, dt1 = timeit(statdir)()
    dfiles, dt2 = timeit(statdirp)()
    from pprint import pprint
    if len(dfiles) < 10:
        pprint(dfiles1)
        pprint(dfiles)
    assert len(dfiles1) == len(dfiles)
    # but they aren't exactly the same -- next level walk results have no ./ prefix ATM

# Sum of the lengths of the top-level result entries.
total = sum(len(e) for e in dfiles)
print("Total %d:%d took %.5f cold %.5f warm" % (len(dfiles), total, dt1, dt2))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment