chudur-budur/gist:46e4f4d33b175962c85988af6cb33dd4

## gistfile1.txt
// Registers three statically shaped f32 buffers via gpu.host_register,
// fills each of them with 0.0, and then runs linalg.matmul on them.
//   %A : 24x96 matmul input
//   %B : 96x64 matmul input
//   %C : 24x64 matmul output
// NOTE(review): %A and %B are filled with 0.0 before the matmul, so whatever
// the caller stored in them is overwritten -- confirm this is intended.
func @matmul(%A: memref<24x96xf32>, %B: memref<96x64xf32>, %C: memref<24x64xf32>) {
  // gpu.host_register is applied to unranked memrefs here, so each buffer is
  // cast to memref<*xf32> right before it is registered.
  %A_unranked = memref.cast %A : memref<24x96xf32> to memref<*xf32>
  gpu.host_register %A_unranked : memref<*xf32>

  %B_unranked = memref.cast %B : memref<96x64xf32> to memref<*xf32>
  gpu.host_register %B_unranked : memref<*xf32>

  %C_unranked = memref.cast %C : memref<24x64xf32> to memref<*xf32>
  gpu.host_register %C_unranked : memref<*xf32>

  // Initialize all three buffers with the same scalar.
  %zero = constant 0.0 : f32
  linalg.fill(%A, %zero) : memref<24x96xf32>, f32
  linalg.fill(%B, %zero) : memref<96x64xf32>, f32
  linalg.fill(%C, %zero) : memref<24x64xf32>, f32

  // (24x96) x (96x64) -> (24x64)
  linalg.matmul
    ins(%A, %B : memref<24x96xf32>, memref<96x64xf32>)
    outs(%C : memref<24x64xf32>)

  return
}
	// Registers three statically shaped f32 buffers via gpu.host_register,
	// fills each of them with 0.0, and then runs linalg.matmul on them.
	//   %A : 24x96 matmul input
	//   %B : 96x64 matmul input
	//   %C : 24x64 matmul output
	// NOTE(review): a func named @matmul is also defined earlier in this file;
	// if both copies end up in the same module the symbol is defined twice --
	// confirm which copy is meant to be kept.
	// NOTE(review): %A and %B are filled with 0.0 before the matmul, so whatever
	// the caller stored in them is overwritten -- confirm this is intended.
	func @matmul(%A: memref<24x96xf32>, %B: memref<96x64xf32>, %C: memref<24x64xf32>) {
		// gpu.host_register is applied to unranked memrefs here, so each buffer
		// is cast to memref<*xf32> right before it is registered.
		%A_unranked = memref.cast %A : memref<24x96xf32> to memref<*xf32>
		gpu.host_register %A_unranked : memref<*xf32>

		%B_unranked = memref.cast %B : memref<96x64xf32> to memref<*xf32>
		gpu.host_register %B_unranked : memref<*xf32>

		%C_unranked = memref.cast %C : memref<24x64xf32> to memref<*xf32>
		gpu.host_register %C_unranked : memref<*xf32>

		// Initialize all three buffers with the same scalar.
		%zero = constant 0.0 : f32
		linalg.fill(%A, %zero) : memref<24x96xf32>, f32
		linalg.fill(%B, %zero) : memref<96x64xf32>, f32
		linalg.fill(%C, %zero) : memref<24x64xf32>, f32

		// (24x96) x (96x64) -> (24x64)
		linalg.matmul
			ins(%A, %B : memref<24x96xf32>, memref<96x64xf32>)
			outs(%C : memref<24x64xf32>)

		return
	}