Skip to content

Instantly share code, notes, and snippets.

@phizaz
Created October 5, 2020 15:28
Show Gist options
  • Save phizaz/fb190374a07fec59bf7dc67c4e42c59e to your computer and use it in GitHub Desktop.
Save phizaz/fb190374a07fec59bf7dc67c4e42c59e to your computer and use it in GitHub Desktop.
filelock.py
import errno
import os
import time
from collections import defaultdict
from contextlib import ContextDecorator, contextmanager
from itertools import count
import torch
# File name handed to _FileLock by _get_alloc_right(); that lock is held by
# cuda_round_robin() while it picks a device and creates its slot file.
CUDA_ALLOC_FILE = os.path.expanduser('~/mlkit.alloc')
def global_lock(n=1, delay=3.0, verbose=True, enable=True):
    """Return a context manager backed by the per-user global lock files.

    Args:
        n: number of lock slots; the lock files are ``~/mlkit.lock.<i>``
            for ``i`` in ``range(n)``.
        delay: seconds to sleep between acquisition rounds.
        verbose: whether the lock prints progress messages.
        enable: when false, no file lock is created and the do-nothing
            ``nullcontext()`` from this module is returned instead.
    """
    if not enable:
        return nullcontext()
    lock_path = os.path.expanduser('~/mlkit.lock')
    return _FileLock(None, path=lock_path, n=n, delay=delay, verbose=verbose)
def wait_global_lock(n=1, delay=3.0, verbose=True):
    """Block until one global lock slot is free, then give it straight back.

    The slot is briefly acquired and released immediately, so on return
    the caller holds nothing.
    """
    lock = _FileLock(
        None,
        path=os.path.expanduser('~/mlkit.lock'),
        n=n,
        delay=delay,
        verbose=verbose,
    )
    with lock:
        # Entering the block is the wait; there is nothing to do inside.
        pass
@contextmanager
def cuda_round_robin(devices=[0], verbose=False, enable=True):
    """Yield the name of the least-used CUDA device and mark it as in use.

    Usage is tracked with slot files ``~/mlkit.cuda{dev}.{i}``: one file per
    active user of device ``dev``. The device with the fewest slot files
    (first one wins on ties) is chosen, a new slot file is created for it,
    and the file is removed again when the ``with`` block ends.

    Args:
        devices: non-empty list of ints (CUDA device indices). The default
            list is only read, never mutated.
        verbose: print which slot file was locked/released.
        enable: when false, no files are touched and ``cuda:{devices[0]}``
            is yielded directly.

    Yields:
        The device string, e.g. ``'cuda:1'``.
    """
    assert len(devices) > 0, "no device available"
    if not enable:
        yield f'cuda:{devices[0]}'
        return

    # NOTE(review): the nesting below (allocation lock released *before* the
    # yield) is reconstructed from a paste that lost its indentation; it is
    # the only reading under which several users can hold devices at once.
    with _get_alloc_right(verbose=verbose):
        # Selection and slot creation happen under the allocation lock so two
        # processes cannot pick the same free slot index.
        locks = _list_cuda_locks()
        # min() returns the first minimal element, matching a strict '<' scan.
        min_dev = min(devices, key=lambda d: len(locks[d]))
        taken = set(locks[min_dev])
        slot = next(i for i in count(start=0) if i not in taken)
        fd, path = _lockfile(
            os.path.join(os.path.expanduser('~'), f'mlkit.cuda{min_dev}.{slot}')
        )
        if verbose: print(f'locked {path}')

    try:
        yield f'cuda:{min_dev}'
    finally:
        # Best-effort cleanup: each step is attempted independently so a
        # failing close cannot leave the slot file behind, and only OSError
        # is swallowed (KeyboardInterrupt / SystemExit still propagate).
        try:
            os.close(fd)
        except OSError:
            pass
        try:
            os.unlink(path)
        except OSError:
            pass
        if verbose: print(f'released {path}')
def _lockfile(path):
    """Create `path` exclusively and return ``(fd, path)``.

    The file is opened with ``O_CREAT | O_EXCL | O_RDWR``; any ``OSError``
    from ``os.open`` (e.g. the file already exists) propagates to the
    caller unchanged.
    """
    flags = os.O_CREAT | os.O_EXCL | os.O_RDWR
    descriptor = os.open(path, flags)
    return descriptor, path
def _get_alloc_right(verbose=False):
    """Build the file lock used to serialise CUDA slot allocation.

    The lock is keyed on ``CUDA_ALLOC_FILE`` and retries every 0.1 seconds.
    """
    alloc_lock = _FileLock(CUDA_ALLOC_FILE, delay=0.1, verbose=verbose)
    return alloc_lock
def _list_cuda_locks():
    """Collect the CUDA slot files currently present in the home directory.

    Slot files are named ``mlkit.cuda{dev}.{i}`` with ``dev`` and ``i`` both
    decimal integers. Any other directory entry, including names that merely
    start with ``mlkit.cuda`` such as ``mlkit.cuda0.1.bak``, is ignored
    rather than raising ``ValueError``.

    Returns:
        A ``defaultdict(list)`` mapping device index -> list of slot indices
        in use (in directory-listing order).
    """
    prefix = 'mlkit.cuda'
    locks = defaultdict(list)
    for name in os.listdir(os.path.expanduser('~')):
        if not name.startswith(prefix):
            continue
        # What remains after the prefix must be exactly "<dev>.<i>".
        dev, sep, slot = name[len(prefix):].partition('.')
        if not (sep and dev.isdecimal() and slot.isdecimal()):
            continue
        locks[int(dev)].append(int(slot))
    return locks
class _FileLockException(Exception):
    """Exception type reserved for file-lock errors."""
class _FileLock(ContextDecorator):
    """File-based lock usable in a ``with`` statement or as a decorator.

    The lock has ``n`` slots, each represented by one lock file. Acquiring
    means exclusively creating (``O_CREAT | O_EXCL``) the first slot file
    that does not exist yet; releasing closes and deletes that file. It does
    not rely on msvcrt or fcntl for the locking.
    From: https://github.com/dmfrey/FileLock/blob/master/filelock/filelock.py

    Args:
        file_name: base name of the lock; used only when `path` is None, in
            which case the slot files are ``<cwd>/<file_name>.lock.<i>``.
        n: number of slots (must be at least 1).
        delay: seconds to sleep after a full pass over all slots failed.
        verbose: print progress messages on acquire/release.
        path: explicit path stem; the slot files are then ``<path>.<i>``.

    Raises:
        ValueError: if ``n < 1`` (there would be no slot to ever acquire).
    """

    def __init__(self, file_name, n=1, delay=1.0, verbose=True, path=None):
        # Set first so that __del__ is safe even if construction fails below.
        self.is_locked = False
        self.fd = None
        self.lockfile = None
        if n < 1:
            raise ValueError(f'n must be at least 1, got {n!r}')
        if path is None:
            # use working directory
            stem = os.path.join(os.getcwd(), f'{file_name}.lock')
        else:
            stem = path
        self.all_lockfiles = [f'{stem}.{i}' for i in range(n)]
        self.delay = delay
        self.verbose = verbose

    def acquire(self):
        """Block until one slot file could be created, then hold it.

        Calling this while the lock is already held is a no-op; grabbing a
        second slot would overwrite ``self.fd`` and leak the first one.

        Raises:
            OSError: any error from ``os.open`` other than "already exists".
        """
        if self.is_locked:
            return
        if self.verbose: print('Acquiring for a lockfile')
        while True:
            for lockfile in self.all_lockfiles:
                try:
                    fd = os.open(lockfile, os.O_CREAT | os.O_EXCL | os.O_RDWR)
                except OSError as e:
                    if e.errno != errno.EEXIST:
                        raise
                    continue  # slot is taken, try the next one
                self.fd = fd
                self.lockfile = lockfile
                self.is_locked = True
                if self.verbose: print(f'Lockfile {lockfile} acquired')
                return
            # Every slot was busy; wait before the next round.
            time.sleep(self.delay)

    def release(self):
        """Get rid of the lock by closing and deleting the lockfile.

        When working in a `with` statement, this gets automatically
        called at the end. Errors are reported on stdout and otherwise
        ignored; the lock is considered released either way.
        """
        if not self.is_locked:
            return
        try:
            try:
                os.close(self.fd)
            finally:
                # Remove the file even if closing the descriptor failed, so
                # the slot does not stay blocked for everybody else.
                os.unlink(self.lockfile)
            if self.verbose: print(f'Lockfile {self.lockfile} released')
        except Exception as e:  # ignore errors
            print(f'error releasing lock file {self.lockfile}:', e)
        finally:
            self.is_locked = False

    def __enter__(self):
        """Acquire the lock (unless already held) and return this object."""
        if not self.is_locked:
            self.acquire()
        return self

    def __exit__(self, type, value, traceback):
        """Release the lock if it is held; exceptions are not suppressed."""
        if self.is_locked:
            self.release()

    def __del__(self):
        """Make sure that the instance doesn't leave a lockfile lying around."""
        # getattr: the instance may be only partially initialised.
        if getattr(self, 'is_locked', False):
            self.release()
@contextmanager
def nullcontext():
    """A context manager that yields a null object.

    Any attribute that is not otherwise found on the yielded object resolves
    to ``_nullfn``, so arbitrary method calls on it do nothing.
    """
    class NullCls:
        def __getattr__(self, name):
            # Only reached for attributes normal lookup could not find.
            return _nullfn

    placeholder = NullCls()
    yield placeholder
def _nullfn(*args, **kwargs):
    """Accept any arguments, do nothing, and return ``None``."""
    return None
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment