Skip to content

Instantly share code, notes, and snippets.

@belltailjp
Last active February 16, 2021 07:43
Show Gist options
  • Save belltailjp/eb6a98a799bf1a56bd42132b92db763b to your computer and use it in GitHub Desktop.
import argparse
import glob
import os
import time
import numpy as np
from torch.utils.data import DataLoader
from pfio.cache import MultiprocessFileCache
from pfio.cache import FileCache
class CachedDataset:
    """Minimal map-style dataset backed by a key/value cache.

    Exposes ``__len__`` and ``__getitem__`` so it can be consumed by
    ``torch.utils.data.DataLoader``.
    """

    def __init__(self, cache):
        # The cache must support len() and get(index); here it is a
        # pfio cache, but any mapping-like object works.
        self.cache = cache

    def __getitem__(self, index):
        return self.cache.get(index)

    def __len__(self):
        return len(self.cache)
def main():
    """Benchmark read throughput of pfio file caches through a DataLoader.

    For each (sample count, sample size) configuration, a cache is first
    built with random bytes, preserved to disk, then re-loaded with
    MultiprocessFileCache and read back through a PyTorch DataLoader at
    several worker counts.  Per-sample timings are printed as a Markdown
    table.
    """
    # NOTE: the parser object is intentionally rebound to its parsed
    # result, so `args` ends up holding the Namespace.
    args = argparse.ArgumentParser()
    args.add_argument('--cache-dir', default='/tmp')
    args = args.parse_args()
    cache_dir = args.cache_dir
    # DataLoader worker counts to sweep.
    num_workers = [16, 32, 64, 128]
    # Each (workers, configuration) pair is timed this many times.
    n_trials = 5
    # (N, l) = (number of samples, bytes per sample).  Both settings hold
    # 32 GiB of data in total: many small samples vs. few large ones.
    all_N_l = [
        (32768, 1024 ** 2),
        (1024 ** 2, 32768),
    ]
    print('| # samples | sample size | # workers | mean time per sample (us) | stddev (us) |')
    print('|:---|:---|:---|:---|:---|')
    for i, (N, l) in enumerate(all_N_l):
        # build the cache: fill a single-process FileCache with random
        # payloads, then preserve it to named files on disk.
        cache = FileCache(N, do_pickle=False, dir=cache_dir)
        for j in range(N):
            buf = np.random.bytes(l)
            cache.put(j, buf)
        cache.preserve('cache_data')
        # Load cache: re-open the preserved data with the multiprocess
        # variant so DataLoader worker processes can share it.
        for n_worker in num_workers:
            cache = MultiprocessFileCache(N, do_pickle=False,
                                          dir=cache_dir)
            cache.preload('cache_data')
            ds = CachedDataset(cache)
            times = []
            for _ in range(n_trials):
                loader = DataLoader(ds, collate_fn=lambda x: x,
                                    batch_size=128,
                                    num_workers=n_worker, shuffle=True)
                before = time.time()
                for samples in loader:
                    # Sanity check: every sample read back has the
                    # expected payload size.
                    assert all(len(s) == l for s in samples)
                after = time.time()
                # Normalize wall-clock time to seconds per sample.
                times.append((after - before) / N)
            mean, std = np.mean(times), np.std(times)
            # Report in microseconds per sample, Markdown table row.
            print('| {} | {} | {} | {:.2f} | {:.2f} |'
                  .format(N, l, n_worker, 1e+6 * mean, 1e+6 * std))
        # Remove the preserved cache files before the next configuration
        # re-uses the 'cache_data' name.  NOTE(review): the original
        # indentation was lost in extraction; this cleanup is placed
        # inside the outer loop since preserve() would otherwise collide
        # with leftover files — confirm against the original gist.
        for f in glob.glob('{}/cache_data*'.format(cache_dir)):
            os.remove(f)


if __name__ == '__main__':
    main()

Environment

  • Xeon 6254 x 2
  • DDR4 384GB
  • OS: Ubuntu 18.04 x64 (4.15.0-58-generic)
  • Local storage: local SSD
  • Python: 3.8.6

master

# samples sample size # workers mean time per sample (us) stddev (us)
32768 1048576 16 2030.02 307.05
32768 1048576 32 1876.47 271.42
32768 1048576 64 2151.52 238.03
32768 1048576 128 2059.07 149.95
1048576 32768 16 33.73 0.39
1048576 32768 32 36.07 0.56
1048576 32768 64 35.81 0.19
1048576 32768 128 38.54 0.63

single cache

# samples sample size # workers mean time per sample (us) stddev (us)
32768 1048576 16 2276.69 221.75
32768 1048576 32 2253.60 197.82
32768 1048576 64 1975.38 121.20
32768 1048576 128 1903.31 77.03
1048576 32768 16 33.59 0.21
1048576 32768 32 35.94 0.36
1048576 32768 64 35.97 0.21
1048576 32768 128 38.96 0.14
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment