
@mkolod
Last active April 28, 2020 18:48
import torch
from time import sleep, time
# Initialize CUDA up front so context-creation overhead isn't counted later
foo = torch.ones(1).cuda()
MB = 1 << 20
# 27 MB tensor
NUM_MB = 27
# Floats
SIZEOF_DTYPE = 4
TENSOR_SIZE = int(NUM_MB * MB / SIZEOF_DTYPE)
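# Theoretical PCIe 3.0 x16 bandwidth: ~15.75 GB/s (8 GT/s x 16 lanes, 128b/130b encoding), expressed in MB/s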
THEORETICAL_V3_X16 = 15.75 * (1 << 30) / MB
PIN_MEMORY = True
data = torch.randn(TENSOR_SIZE)
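# Pinned (page-locked) host memory lets the copy use DMA directly, typically raising H2D throughput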
if PIN_MEMORY:
    data = data.pin_memory()
# Not strictly necessary here, but kept as a reminder in case
# asynchronous work is scheduled before the timed copy
torch.cuda.synchronize()
sleep(1)
start = time()
# The default copy (non_blocking=False) blocks the host, so wall-clock timing is valid here
data = data.cuda()
duration = time() - start
print("Copy duration: {:.2f} ms".format(duration * 1000))
effective_bw = NUM_MB / duration
print("Effective Bandwidth: {:.2f} MB/s".format(effective_bw))
pct_theoretical = effective_bw / THEORETICAL_V3_X16 * 100
print("Percent theoretical PCIe v3 x16 bandwidth: {:.2f}".format(pct_theoretical))

mkolod commented Apr 26, 2020

Sample run:

Copy duration: 3.03 ms
Effective Bandwidth: 8922.64 MB/s
Percent theoretical PCIe v3 x16 bandwidth: 55.32
