# In-Place Memory Pinning in PyTorch
# See https://github.com/pytorch/pytorch/issues/32167#issuecomment-753551842
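#
# Benchmarks four strategies for host-to-device transfer of a large tensor:
# pageable ("paged") memory, out-of-place pinning via Tensor.pin_memory(),
# in-place pinning via cudaHostRegister, and shared memory pinned in place
# with cudaHostRegister.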
#
# Copyright 2023 Carlos Alberto da Costa Filho <c.dacostaf@gmail.com>
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the “Software”), to
# deal in the Software without restriction, including without limitation the
# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
# sell copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.

import gc
from time import perf_counter_ns

import psutil
import torch
from torch.cuda import check_error, cudart, empty_cache, nvtx, synchronize

ARRAY_MEMORY_GiB = 3
GiB_to_B = 1024 * 1024 * 1024
BITS_PER_BYTE = 8
DTYPE = torch.float32
# Number of elements occupying ARRAY_MEMORY_GiB at DTYPE's width
n = (ARRAY_MEMORY_GiB * GiB_to_B * BITS_PER_BYTE) // torch.finfo(DTYPE).bits
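# e.g., for 3 GiB of float32 (32 bits per element):
# n = (3 * 2**30 * 8) // 32 = 805_306_368 elements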


class PerfBlock:
    def __init__(self, name: str) -> None:
        self.name = name

    def __enter__(self):
        gc.collect()
        ram = psutil.virtual_memory()
        free_ram, total_ram = ram.available, ram.total
        free_gpu, total_gpu = cudart().cudaMemGetInfo(0)
        print(f"Free RAM: {free_ram / GiB_to_B:.1f} GiB / {total_ram / GiB_to_B:.1f} GiB")
        print(f"Free GPU: {free_gpu / GiB_to_B:.1f} GiB / {total_gpu / GiB_to_B:.1f} GiB")
        print(self.name)
        nvtx.range_push(self.name)
        self.tic = perf_counter_ns()

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.toc = perf_counter_ns()
        nvtx.range_pop()
        print(f"Runtime: {1e-6 * (self.toc - self.tic):.1f} ms")
        print()

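# Each PerfBlock prints free RAM/GPU memory on entry, wraps its body in an
# NVTX range (visible in the nsys timeline), and reports wall-clock runtime.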
free, total = cudart().cudaMemGetInfo(0)
assert free >= ARRAY_MEMORY_GiB * GiB_to_B, "Not enough free GPU memory for the array"
cudart().cudaProfilerStart()
# Paged
print("PAGED")
with PerfBlock("Paged CPU create"):
    x = torch.ones(n, device="cpu", dtype=DTYPE)
    assert not x.is_pinned()
with PerfBlock("Paged CPU to GPU call"):
    x = x.to("cuda", non_blocking=True)
with PerfBlock("Paged CPU to GPU sync"):
    synchronize()
with PerfBlock("Paged CPU to GPU math"):
    x.mul_(2)
    y = x.mean()
    synchronize()
with PerfBlock("Paged CPU to GPU del"):
    del x
    del y
    empty_cache()
    synchronize()
print("PINNED Out-of-place")
with PerfBlock("Pinned oop CPU create"):
x = torch.ones(n, device="cpu", dtype=DTYPE)
assert not x.is_pinned()
with PerfBlock("Pinned oop CPU pin"):
x = x.pin_memory()
assert x.is_pinned()
with PerfBlock("Pinned oop CPU to GPU call"):
x = x.to("cuda", non_blocking=True)
with PerfBlock("Pinned oop CPU to GPU sync"):
synchronize()
with PerfBlock("Pinned oop CPU to GPU math"):
x.mul_(2)
y = x.mean()
synchronize()
with PerfBlock("Pinned oop CPU to GPU del"):
del x
del y
empty_cache()
synchronize()
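# Unlike pin_memory(), which allocates a new page-locked buffer and copies the
# tensor into it, cudaHostRegister page-locks the tensor's existing allocation
# in place, avoiding the extra host-side copy.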
# Pinned in-place
print("PINNED")
with PerfBlock("Pinned CPU create"):
    x = torch.ones(n, device="cpu", dtype=DTYPE)
with PerfBlock("Pinned CPU pin"):
    # See https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY.html
    # flags=0 is cudaHostRegisterDefault, which page-locks the range
    ret = cudart().cudaHostRegister(x.data_ptr(), x.nbytes, 0)
    check_error(ret)  # raises unless cudaHostRegister returned cudaSuccess
    assert x.is_pinned()
    xcpu = x
with PerfBlock("Pinned CPU to GPU call"):
    x = x.to("cuda", non_blocking=True)
with PerfBlock("Pinned CPU to GPU sync"):
    synchronize()
with PerfBlock("Pinned CPU to GPU unpin"):
    ret = cudart().cudaHostUnregister(xcpu.data_ptr())
    del xcpu
    check_error(ret)
with PerfBlock("Pinned CPU to GPU math"):
    x.mul_(2)
    y = x.mean()
    synchronize()
with PerfBlock("Pinned CPU to GPU del"):
    del x
    del y
    empty_cache()
    synchronize()
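# share_memory_() moves the tensor's storage into shared memory (for
# inter-process use); this section checks that shared memory can also be
# page-locked in place with cudaHostRegister.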
# Shared
print("SHARED")
with PerfBlock("Shared CPU create"):
    x = torch.ones(n, device="cpu", dtype=DTYPE)
with PerfBlock("Shared CPU share"):
    x.share_memory_()
    assert x.is_shared()
with PerfBlock("Shared CPU pin"):
    # See https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY.html
    ret = cudart().cudaHostRegister(x.data_ptr(), x.nbytes, 0)
    check_error(ret)
    assert x.is_pinned()
    xcpu = x
with PerfBlock("Shared CPU to GPU call"):
    x = x.to("cuda", non_blocking=True)
with PerfBlock("Shared CPU to GPU sync"):
    synchronize()
with PerfBlock("Shared CPU to GPU unpin"):
    ret = cudart().cudaHostUnregister(xcpu.data_ptr())
    del xcpu
    check_error(ret)
with PerfBlock("Shared CPU to GPU math"):
    x.mul_(2)
    y = x.mean()
    synchronize()
with PerfBlock("Shared CPU to GPU del"):
    del x
    del y
    empty_cache()
    synchronize()
cudart().cudaProfilerStop()
# nsys profile --trace cuda,nvtx --capture-range cudaProfilerApi --output inplace_pin --force-overwrite true python inplace_pin.py