# In-Place Memory Pinning in PyTorch
# See https://github.com/pytorch/pytorch/issues/32167#issuecomment-753551842
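#
# Benchmarks four strategies for host-to-device transfer of a large tensor:
# pageable ("paged") memory, out-of-place pinning via Tensor.pin_memory(),
# in-place pinning via cudaHostRegister, and shared memory pinned in place
# with cudaHostRegister.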
#
# Copyright 2023 Carlos Alberto da Costa Filho <c.dacostaf@gmail.com>
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the “Software”), to
# deal in the Software without restriction, including without limitation the
# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
# sell copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.

import gc
from time import perf_counter_ns

import psutil
import torch
from torch.cuda import check_error, cudart, empty_cache, nvtx, synchronize

ARRAY_MEMORY_GiB = 3
GiB_to_B = 1024 * 1024 * 1024
BITS_PER_BYTE = 8
DTYPE = torch.float32
# Number of elements occupying ARRAY_MEMORY_GiB at DTYPE's width
n = (ARRAY_MEMORY_GiB * GiB_to_B * BITS_PER_BYTE) // torch.finfo(DTYPE).bits
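# e.g., for 3 GiB of float32 (32 bits per element):
# n = (3 * 2**30 * 8) // 32 = 805_306_368 elements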


class PerfBlock:
    def __init__(self, name: str) -> None:
        self.name = name

    def __enter__(self):
        gc.collect()
        ram = psutil.virtual_memory()
        free_ram, total_ram = ram.available, ram.total
        free_gpu, total_gpu = cudart().cudaMemGetInfo(0)
        print(f"Free RAM: {free_ram / GiB_to_B:.1f} GiB / {total_ram / GiB_to_B:.1f} GiB")
        print(f"Free GPU: {free_gpu / GiB_to_B:.1f} GiB / {total_gpu / GiB_to_B:.1f} GiB")
        print(self.name)
        nvtx.range_push(self.name)
        self.tic = perf_counter_ns()

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.toc = perf_counter_ns()
        nvtx.range_pop()
        print(f"Runtime: {1e-6 * (self.toc - self.tic):.1f} ms")
        print()

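# Each PerfBlock prints free RAM/GPU memory on entry, wraps its body in an
# NVTX range (visible in the nsys timeline), and reports wall-clock runtime.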
free, total = cudart().cudaMemGetInfo(0)
assert free >= ARRAY_MEMORY_GiB * GiB_to_B, "Not enough free GPU memory for the array"
cudart().cudaProfilerStart()
# Paged
print("PAGED")
with PerfBlock("Paged CPU create"):
    x = torch.ones(n, device="cpu", dtype=DTYPE)
    assert not x.is_pinned()
with PerfBlock("Paged CPU to GPU call"):
    x = x.to("cuda", non_blocking=True)
with PerfBlock("Paged CPU to GPU sync"):
    synchronize()
with PerfBlock("Paged CPU to GPU math"):
    x.mul_(2)
    y = x.mean()
    synchronize()
with PerfBlock("Paged CPU to GPU del"):
    del x
    del y
    empty_cache()
    synchronize()
print("PINNED Out-of-place")
with PerfBlock("Pinned oop CPU create"):
x = torch.ones(n, device="cpu", dtype=DTYPE)
assert not x.is_pinned()
with PerfBlock("Pinned oop CPU pin"):
x = x.pin_memory()
assert x.is_pinned()
with PerfBlock("Pinned oop CPU to GPU call"):
x = x.to("cuda", non_blocking=True)
with PerfBlock("Pinned oop CPU to GPU sync"):
synchronize()
with PerfBlock("Pinned oop CPU to GPU math"):
x.mul_(2)
y = x.mean()
synchronize()
with PerfBlock("Pinned oop CPU to GPU del"):
del x
del y
empty_cache()
synchronize()
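# Unlike pin_memory(), which allocates a new page-locked buffer and copies the
# tensor into it, cudaHostRegister page-locks the tensor's existing allocation
# in place, avoiding the extra host-side copy.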
# Pinned in-place
print("PINNED")
with PerfBlock("Pinned CPU create"):
    x = torch.ones(n, device="cpu", dtype=DTYPE)
with PerfBlock("Pinned CPU pin"):
    # See https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY.html
    # flags=0 is cudaHostRegisterDefault, which page-locks the range
    ret = cudart().cudaHostRegister(x.data_ptr(), x.nbytes, 0)
    check_error(ret)  # raises unless cudaHostRegister returned cudaSuccess
    assert x.is_pinned()
    xcpu = x
with PerfBlock("Pinned CPU to GPU call"):
    x = x.to("cuda", non_blocking=True)
with PerfBlock("Pinned CPU to GPU sync"):
    synchronize()
with PerfBlock("Pinned CPU to GPU unpin"):
    ret = cudart().cudaHostUnregister(xcpu.data_ptr())
    del xcpu
    check_error(ret)
with PerfBlock("Pinned CPU to GPU math"):
    x.mul_(2)
    y = x.mean()
    synchronize()
with PerfBlock("Pinned CPU to GPU del"):
    del x
    del y
    empty_cache()
    synchronize()
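# share_memory_() moves the tensor's storage into shared memory (for
# inter-process use); this section checks that shared memory can also be
# page-locked in place with cudaHostRegister.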
# Shared
print("SHARED")
with PerfBlock("Shared CPU create"):
    x = torch.ones(n, device="cpu", dtype=DTYPE)
with PerfBlock("Shared CPU share"):
    x.share_memory_()
    assert x.is_shared()
with PerfBlock("Shared CPU pin"):
    # See https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY.html
    ret = cudart().cudaHostRegister(x.data_ptr(), x.nbytes, 0)
    check_error(ret)
    assert x.is_pinned()
    xcpu = x
with PerfBlock("Shared CPU to GPU call"):
    x = x.to("cuda", non_blocking=True)
with PerfBlock("Shared CPU to GPU sync"):
    synchronize()
with PerfBlock("Shared CPU to GPU unpin"):
    ret = cudart().cudaHostUnregister(xcpu.data_ptr())
    del xcpu
    check_error(ret)
with PerfBlock("Shared CPU to GPU math"):
    x.mul_(2)
    y = x.mean()
    synchronize()
with PerfBlock("Shared CPU to GPU del"):
    del x
    del y
    empty_cache()
    synchronize()
cudart().cudaProfilerStop()
# nsys profile --trace cuda,nvtx --capture-range cudaProfilerApi --output inplace_pin --force-overwrite true python inplace_pin.py