Created September 30, 2020 17:35
PyTorch CUDA speed test for various data types, with and without AMP
#!/usr/bin/env python

# Any copyright is dedicated to the Public Domain.
# https://creativecommons.org/publicdomain/zero/1.0/

# Written by Francois Fleuret <francois@fleuret.org>
# Modified by François Lagunas <francois.lagunas@m4x.org>

import time, torch

if torch.cuda.is_available():
    device = torch.device('cuda')
    sync = torch.cuda.synchronize
else:
    device = torch.device('cpu')
    sync = lambda: None  # no synchronization needed on CPU

# Matrix dimensions and number of timed matrix multiplications
d1, d2, d3 = 2048 * 8 * 4, 2048 * 4, 2048 * 4
iterations = 100

def test(prefix):
    for t in [torch.float32, torch.float16, torch.bfloat16]:
        try:
            a = torch.rand(d1, d2, device=device, dtype=t)
            b = torch.rand(d2, d3, device=device, dtype=t)

            # Synchronize before and after the loop so the timing covers the
            # actual GPU work rather than just the asynchronous kernel launches.
            sync()
            start_time = time.perf_counter()
            for i in range(iterations):
                c = torch.mm(a, b)
            sync()
            duration = time.perf_counter() - start_time

            nb_flop = float(iterations * d1 * d2 * d3 * 2)  # 1 multiply-and-add is 2 ops
            speed = nb_flop / duration

            # Scale to a human-readable unit (K/M/G/T/P flops)
            for u in ['', 'K', 'M', 'G', 'T', 'P']:
                if speed < 1e3:
                    break
                speed /= 1e3

            print(f'{prefix} {speed:.02f} {u}flops with {t} on {device}')
        except Exception:
            print(f'{prefix} {t} is not available on {device}')

test("AMP off")

with torch.cuda.amp.autocast():
    test("AMP on")

# Results on a RTX 3090
# AMP off 34.66 Tflops with torch.float32 on cuda
# AMP off 77.28 Tflops with torch.float16 on cuda
# AMP off 78.06 Tflops with torch.bfloat16 on cuda
# AMP on 73.69 Tflops with torch.float32 on cuda
# AMP on 76.48 Tflops with torch.float16 on cuda
# AMP on 74.64 Tflops with torch.bfloat16 on cuda
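For reference, here is the arithmetic behind the reported numbers, worked out from the dimensions in the script (a sketch, not part of the original gist): each torch.mm multiplies a 65536×8192 matrix by an 8192×8192 matrix, and the factor of 2 counts each multiply-and-add as two operations.

# Worked FLOP count for one benchmarked matmul (dimensions taken from the script above).
d1, d2, d3 = 2048 * 8 * 4, 2048 * 4, 2048 * 4   # 65536, 8192, 8192
flop_per_mm = 2 * d1 * d2 * d3                   # ≈ 8.8e12, i.e. ≈ 8.8 TFLOP per matmul
total_flop = 100 * flop_per_mm                   # ≈ 8.8e14 over the 100 iterations
print(f'{flop_per_mm / 1e12:.1f} TFLOP per mm, {total_flop / 1e12:.0f} TFLOP total')
# At the 34.66 Tflops reported for float32 on the RTX 3090, the timed loop
# therefore runs for roughly 8.8e14 / 34.66e12 ≈ 25 seconds.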
Thanks for this modified script. Here are some additional numbers (on CUDA 11.0 though, not 11.1):
Results on V100 32GB PCIe:
AMP off 12.75 Tflops with torch.float32 on cuda
AMP off 85.36 Tflops with torch.float16 on cuda
AMP off torch.bfloat16 is not available on cuda
AMP on 78.11 Tflops with torch.float32 on cuda
AMP on 88.55 Tflops with torch.float16 on cuda
AMP on 86.93 Tflops with torch.bfloat16 on cuda
Results on A100 PCIe:
AMP off 72.51 Tflops with torch.float32 on cuda
AMP off 213.91 Tflops with torch.float16 on cuda
AMP off 207.26 Tflops with torch.bfloat16 on cuda
AMP on 197.10 Tflops with torch.float32 on cuda
AMP on 202.85 Tflops with torch.float16 on cuda
AMP on 192.95 Tflops with torch.bfloat16 on cuda
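One plausible reading of the "AMP on" rows (an assumption on my part, not stated in the gist or the comments): under torch.cuda.amp.autocast(), torch.mm executes in the autocast dtype, which defaults to float16, regardless of the input dtype. That would explain why the three "AMP on" numbers cluster around the float16 throughput on every card, and why bfloat16 appears to work under AMP even on the V100, which reports it as unavailable with AMP off. A minimal check, assuming a CUDA device is available:

import torch

# Under autocast, matmuls run in the autocast dtype (float16 by default),
# so float32 or bfloat16 inputs are downcast before the multiplication.
a = torch.rand(16, 16, device='cuda', dtype=torch.float32)
b = torch.rand(16, 16, device='cuda', dtype=torch.float32)
with torch.cuda.amp.autocast():
    c = torch.mm(a, b)
print(c.dtype)  # expected: torch.float16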