# nomaddo/test.py

## test.py
import numpy as np
import time

from videocore.assembler import qpu
from videocore.driver import Driver

@qpu
def hello_world(asm):
    """Emit a QPU loop used to time a TMU0 request/load pair.

    The program moves one uniform into r0 and loads 3000000 into r2 as a
    loop counter.  Each pass writes r0 to tmu0_s, issues a nop carrying the
    'load tmu0' signal, subtracts 1 from r2 with flags updated, and branches
    back to L.loop through jzc.  After the loop the thread is finished with
    exit().

    The instruction and register names used below (mov, ldi, nop, r0, ...)
    are not imported in this file; presumably the @qpu decorator brings them
    into scope -- confirm against the py-videocore sources.

    Args:
        asm: presumably the assembler context supplied by the @qpu
            machinery; it is not referenced directly in this body.
    """
    # r0 <- first uniform (the host script passes a buffer address as the
    # only uniform); r2 <- iteration count.
    mov(r0, uniform)
    ldi(r2, 3000000)

    # Loop head; target of the jzc below.
    L.loop
    # Hand the value in r0 to tmu0_s.
    mov(tmu0_s, r0)
    # Experiment 1: between the tmu0_s write and the 'load tmu0' signal.
    # Inserting four nops here barely changed the reported time:
    # 0.6456s -> 0.6461s
    # for i in range(0, 4):
    #     nop()
    nop(sig='load tmu0')
    # Experiment 2: after the 'load tmu0' signal.
    # Inserting four nops here increased the reported time noticeably:
    # 0.6456s -> 0.8060s
    # for i in range(0, 4):
    #     nop()
    # Decrement the counter and update the flags the branch tests.
    isub(r2, r2, 1, set_flags=True)
    # NOTE(review): jzc presumably means "jump if zero flag clear", i.e. keep
    # looping while r2 != 0 -- confirm against the assembler documentation.
    jzc(L.loop)
    # NOTE(review): these three nops presumably fill the branch delay slots
    # that follow jzc -- confirm against the VideoCore IV reference guide.
    nop()
    nop()
    nop()

    # Finish the thread
    exit()

with Driver() as drv:
    # Sixteen random values in [0, 1), stored as float32.
    samples = np.random.random(16).astype('float32')

    # Copy the values into driver-managed shared memory (for DMA transfer);
    # the address of that buffer is the single uniform given to the kernel.
    shared_buf = drv.copy(np.r_[samples])

    # Time one single-thread launch. drv.program(hello_world) is evaluated
    # between the two clock reads, so it is part of the measured interval.
    t_begin = time.perf_counter()
    drv.execute(
        n_threads=1,
        program=drv.program(hello_world),
        uniforms=[shared_buf.address]
    )
    t_end = time.perf_counter()

    # Report the elapsed wall-clock seconds with four decimals.
    elapsed = t_end - t_begin
    print(f'{elapsed:.4f}')
	import numpy as np
	import time

	from videocore.assembler import qpu
	from videocore.driver import Driver

	@qpu
	def hello_world(asm):
	    """Emit a QPU loop used to time a TMU0 request/load pair.

	    The program moves one uniform into r0 and loads 3000000 into r2 as a
	    loop counter.  Each pass writes r0 to tmu0_s, issues a nop carrying
	    the 'load tmu0' signal, subtracts 1 from r2 with flags updated, and
	    branches back to L.loop through jzc.  After the loop the thread is
	    finished with exit().

	    The instruction and register names used below (mov, ldi, nop, r0,
	    ...) are not imported in this file; presumably the @qpu decorator
	    brings them into scope -- confirm against the py-videocore sources.

	    Args:
	        asm: presumably the assembler context supplied by the @qpu
	            machinery; it is not referenced directly in this body.
	    """
	    # r0 <- first uniform; r2 <- iteration count.
	    mov(r0, uniform)
	    ldi(r2, 3000000)

	    # Loop head; target of the jzc below.
	    L.loop
	    # Hand the value in r0 to tmu0_s.
	    mov(tmu0_s, r0)
	    # 1.
	    # if we insert nops here, 0.6456s -> 0.6461s
	    # for i in range(0, 4):
	    #     nop()
	    nop(sig='load tmu0')
	    # 2.
	    # if we insert nops here, 0.6456s -> 0.8060s
	    # for i in range(0, 4):
	    #     nop()
	    # Decrement the counter and update the flags the branch tests.
	    isub(r2, r2, 1, set_flags=True)
	    # NOTE(review): jzc presumably means "jump if zero flag clear", i.e.
	    # keep looping while r2 != 0 -- confirm against the assembler docs.
	    jzc(L.loop)
	    # NOTE(review): these three nops presumably fill the branch delay
	    # slots that follow jzc -- confirm against the VideoCore IV guide.
	    nop()
	    nop()
	    nop()

	    # Finish the thread
	    exit()

	with Driver() as drv:
	    # Input vectors: sixteen random values in [0, 1), stored as float32.
	    a = np.random.random(16).astype('float32')

	    # Copy vectors to shared memory for DMA transfer; the address of the
	    # resulting buffer is the single uniform given to the kernel.
	    inp = drv.copy(np.r_[a])

	    # Run the program. drv.program(hello_world) is evaluated between the
	    # two clock reads, so it is part of the measured interval.
	    start = time.perf_counter()
	    drv.execute(
	        n_threads=1,
	        program=drv.program(hello_world),
	        uniforms=[inp.address]
	    )
	    end = time.perf_counter()

	    # Report the elapsed wall-clock seconds with four decimals.
	    print('{:.4f}'.format(end - start))