Skip to content

Instantly share code, notes, and snippets.

@jcrousse
Created March 8, 2022 20:06
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jcrousse/9e26506ec9f6c12c5ecd203645f91caf to your computer and use it in GitHub Desktop.
Save jcrousse/9e26506ec9f6c12c5ecd203645f91caf to your computer and use it in GitHub Desktop.
Using CUDA streams with CuPy
import cupy as cp
import time
import asyncio
async def predict(N, power):
    """Multiply a random N x N matrix by itself on a dedicated CUDA stream.

    The matrix products are enqueued on a fresh non-blocking stream, then the
    coroutine yields to the event loop for 5 seconds before waiting for the
    stream to finish.

    Args:
        N: Side length of the square float64 matrix.
        power: Number of factors in the product; ``power - 1`` timed
            ``matmul`` calls are issued.

    Returns:
        A ``(cpu_time, gpu_time)`` tuple in seconds. ``cpu_time`` is the time
        spent issuing the timed ``matmul`` calls. ``gpu_time`` is the time
        from the end of that loop until ``synchronize()`` returns, so it
        includes the 5-second ``asyncio.sleep``.
    """
    compute_stream = cp.cuda.stream.Stream(non_blocking=True)
    # Scope the stream with a context manager instead of ``use()`` so the
    # previous current stream is restored once the work is enqueued. The block
    # deliberately ends before the ``await``: the current-stream setting must
    # not be held across a suspension point where other coroutines run.
    with compute_stream:
        d_mat = cp.random.randn(N * N, dtype=cp.float64).reshape(N, N)
        d_ret = d_mat
        # Untimed product whose result is discarded (presumably a warm-up so
        # one-time initialisation cost stays out of the measurement).
        cp.matmul(d_ret, d_mat)
        # perf_counter is monotonic, unlike time.time(), so the measured
        # intervals cannot be skewed by system clock adjustments.
        start = time.perf_counter()
        for _ in range(power - 1):
            d_ret = cp.matmul(d_ret, d_mat)
        pre_synch = time.perf_counter()
    # Hand control back to the event loop so other coroutines can run.
    await asyncio.sleep(5)
    # Wait until everything queued on this stream has completed.
    compute_stream.synchronize()
    cpu_time = pre_synch - start
    gpu_time = time.perf_counter() - pre_synch
    print(f"CPU time: {cpu_time}, GPU time: {gpu_time}")
    return cpu_time, gpu_time
async def main(n):
    """Compare one ``predict`` request against four concurrent ones.

    Runs a single ``predict(1024, n)`` to get a baseline, then gathers four
    such requests concurrently and prints the elapsed time next to four times
    the baseline.

    Args:
        n: The ``power`` argument forwarded to every ``predict`` call.
    """
    # Baseline: one request on its own.
    issue_secs, wait_secs = await predict(1024, n)
    baseline = round(issue_secs + wait_secs, 1)

    # Four requests scheduled together on the event loop.
    began = time.time()
    await asyncio.gather(*[predict(1024, n) for _ in range(4)])
    elapsed = round(time.time() - began, 1)

    # Concurrent wall time as a percentage of four back-to-back baselines.
    percent = round(elapsed / (baseline * 4) * 100)
    report = (
        f"Treated one request of size {n} in {issue_secs + wait_secs}\n "
        f"Treated 4 requests of size {n} in {elapsed} seconds, instead "
        f"of {4 * baseline}, ({percent}% of sequential operations)"
    )
    print(report)
if __name__ == "__main__":
    # Run the comparison for each power in turn; every call gets its own
    # event loop.
    for size in (32, 512):
        asyncio.run(main(size))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment