Alex Yang aleozlx

## __blog_cutlass_matmul.py
C = np.zeros((3, 4), dtype=int)
block_size = (3, 2)
div_up = lambda a, b: (a + b - 1) // b

### CUDA Grid
for m in range(0, C.shape[0], block_size[0]):
    for n in range(0, C.shape[1], block_size[1]):

### Main loop in the CUDA kernel
### Smaller K is favorable to satisfy the shared memory bandwidth

## __blog_matmul1.py
# Matrix product built as a sum of outer products: for every index along
# A's columns / B's rows, add the outer product of that column of A with the
# matching row of B into the accumulator C.
C = np.zeros((3, 4), dtype=int)

inner_dim = A.shape[1]
for idx in range(inner_dim):
    rank_one = np.outer(A[:, idx], B[idx, :])
    C += rank_one
print("C =\n", C)

"""
OUTPUT
=============
C =

## __blog_matmul_naive1.py
# Naive matrix product: every entry C[i, j] is the dot product of row i of A
# with column j of B.  np.ndindex walks the (i, j) pairs in row-major order,
# the same order as two nested range() loops.
C = np.empty((3, 4), dtype=int)

for i, j in np.ndindex(A.shape[0], B.shape[1]):
    C[i, j] = np.dot(A[i], B[:, j])
print("C =\n", C)

"""
OUTPUT
=============

## __blog_matmul_naive.py
import numpy as np
# Seed NumPy's global RNG so the demo matrices are identical on every run.
np.random.seed(123)

# A is 3x5 and B is 5x4, both filled with integers drawn from [0, 5).
# A is drawn before B so the random stream is consumed in a fixed order.
A = np.random.randint(5, size=15).reshape(3, 5)
B = np.random.randint(5, size=20).reshape(5, 4)

print("A =\n", A)
print("B =\n", B)

# Reference result computed by NumPy's built-in matrix product.
C = A @ B

## __blog_learning_cutlass_pt1_test_sobel.cpp
void test_sobel() {
  // Configure the convolution kernel
  using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop<
    ElementInputA, LayoutInputA,
    ElementInputB, LayoutInputB,
    ElementOutput, LayoutOutput,
    ElementAccumulator,
    cutlass::arch::OpClassTensorOp,
    cutlass::arch::Sm75,
    cutlass::gemm::GemmShape<128, 128, 32>,              // Threadblock tile shape,

## __blog_learning_cutlass_dtype.cpp
// Scalar types passed to the convolution kernel template: both inputs are
// cutlass::half_t, while accumulation and the output use float.
using ElementInputA = cutlass::half_t;
using ElementInputB = cutlass::half_t;
using ElementAccumulator = float;
using ElementOutput = float;

// Tensor layouts: both inputs and the output use TensorNHWC.
using LayoutInputA = cutlass::layout::TensorNHWC;
using LayoutInputB = cutlass::layout::TensorNHWC;
using LayoutOutput = cutlass::layout::TensorNHWC;

## __blog_imgio.py
from pathlib import Path
from PIL import Image, ImageOps

def write_bin():
    """Dump a 390x390 grayscale copy of ``buildings_original.jpg`` as raw bytes.

    The JPEG is read from the directory containing this script, converted to
    grayscale and resized to a square.  The pixel values are then written to
    ``buildings_original.bin`` in the same directory, one byte per pixel.
    Pixels are emitted with ``x`` as the outer index and ``y`` as the inner
    index, i.e. column by column.
    """
    side = 390
    here = Path(__file__).parent

    source = Image.open(str(here / "buildings_original.jpg"))
    gray = ImageOps.grayscale(source).resize((side, side))

    # x varies slowest, matching the order the consumer of the .bin expects.
    pixels = bytearray()
    for x in range(side):
        for y in range(side):
            pixels.append(gray.getpixel((x, y)))

    with open(str(here / "buildings_original.bin"), 'wb') as out:
        out.write(pixels)

## __blog_gs-play.sh
# Receive the VP9 RTP stream on UDP port 5000, depayload and decode it, and
# show it in an X video window.  The caps after udpsrc describe the incoming
# packets; `payload` is the RTP payload-type field of application/x-rtp caps.
gst-launch-1.0 udpsrc port=5000 ! \
  application/x-rtp,encoding-name=VP9,payload=96 ! \
  rtpvp9depay ! queue ! \
  avdec_vp9 ! \
  xvimagesink sync=false async=false

## __blog_gs-stream.sh
# Capture 1920x1080 NV12 video at 30 fps from the camera, run it through the
# custom CUDA filter library, encode it as VP9 and send it as RTP over UDP to
# port 5000 on CLIENT_IP.
# CLIENT_IP must be set in the environment; if it is unset or empty the shell
# stops with an error instead of launching the pipeline with an empty host.
gst-launch-1.0 nvarguscamerasrc ! \
  "video/x-raw(memory:NVMM), width=1920, height=1080, format=NV12, framerate=30/1" ! \
  nvivafilter cuda-process=true customer-lib-name="/home/alex/Videos/nvsample_cudaprocess/libcustom_cuda_filter.so" ! \
  nvv4l2vp9enc ! \
  rtpvp9pay mtu=1400 ! \
  udpsink host="${CLIENT_IP:?CLIENT_IP must be set to the receiver address}" port=5000 sync=false async=false

## __blog_gs-collect.sh
# Record 1920x1080 NV12 camera video at 30 fps, VP9-encoded, into a Matroska
# file under /mnt/extra whose name carries the start date and time.
# -e makes gst-launch send EOS on shutdown so the file is finalized properly.
gst-launch-1.0 nvarguscamerasrc ! \
  "video/x-raw(memory:NVMM), width=1920, height=1080, format=NV12, framerate=30/1" ! \
  nvv4l2vp9enc ! \
  matroskamux ! \
  filesink location="/mnt/extra/v-$(date +%F-%H-%M-%S).mkv" \
  -e
	C = np.zeros((3, 4), dtype=int)
	block_size = (3, 2)
	div_up = lambda a, b: (a + b - 1) // b

	### CUDA Grid
	for m in range(0, C.shape[0], block_size[0]):
	for n in range(0, C.shape[1], block_size[1]):

	### Main loop in the CUDA kernel
	### Smaller K is favorable to satisfy the shared memory bandwidth
	# Matrix product built as a sum of outer products: for every index k along
	# A's columns / B's rows, add outer(A[:, k], B[k, :]) into C.
	C = np.zeros((3, 4), dtype=int)

	for k in range(A.shape[1]):
	    # The accumulation is the loop body; the result is printed once afterwards.
	    C += np.outer(A[:, k], B[k, :])
	print("C =\n", C)

	"""
	OUTPUT
	=============
	C =
	# Naive matrix product: every entry C[i, j] is the dot product of row i of A
	# with column j of B.
	C = np.empty((3, 4), dtype=int)

	for i in range(A.shape[0]):
	    for j in range(B.shape[1]):
	        C[i, j] = np.dot(A[i, :], B[:, j])
	print("C =\n", C)

	"""
	OUTPUT
	=============
	import numpy as np
	# Seed NumPy's global RNG so the demo matrices are identical on every run.
	np.random.seed(123)

	# A is 3x5 and B is 5x4, both filled with integers drawn from [0, 5).
	# A is drawn before B so the random stream is consumed in a fixed order.
	A = np.random.randint(5, size=15).reshape(3, 5)
	B = np.random.randint(5, size=20).reshape(5, 4)

	print("A =\n", A)
	print("B =\n", B)

	# Reference result computed by NumPy's built-in matrix product.
	C = A @ B
	void test_sobel() {
	// Configure the convolution kernel
	using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop<
	ElementInputA, LayoutInputA,
	ElementInputB, LayoutInputB,
	ElementOutput, LayoutOutput,
	ElementAccumulator,
	cutlass::arch::OpClassTensorOp,
	cutlass::arch::Sm75,
	cutlass::gemm::GemmShape<128, 128, 32>, // Threadblock tile shape,
	// Scalar types: both inputs are cutlass::half_t, while accumulation and the
	// output use float.
	using ElementInputA = cutlass::half_t;
	using ElementInputB = cutlass::half_t;
	using ElementAccumulator = float;
	using ElementOutput = float;

	// Tensor layouts: both inputs and the output use TensorNHWC.
	using LayoutInputA = cutlass::layout::TensorNHWC;
	using LayoutInputB = cutlass::layout::TensorNHWC;
	using LayoutOutput = cutlass::layout::TensorNHWC;
	from pathlib import Path
	from PIL import Image, ImageOps

	def write_bin():
	    """Dump a 390x390 grayscale copy of ``buildings_original.jpg`` as raw bytes.

	    The JPEG is read from the directory containing this script, converted to
	    grayscale and resized to a square.  The pixel values are then written to
	    ``buildings_original.bin`` in the same directory, one byte per pixel.
	    Pixels are emitted with ``x`` as the outer index and ``y`` as the inner
	    index, i.e. column by column.
	    """
	    base_dir = Path(__file__).parent
	    a = 390
	    im = ImageOps.grayscale(Image.open(str(base_dir / "buildings_original.jpg"))).resize((a, a))
	    # x is the outer loop, so the bytes are laid out column by column.
	    data = bytearray([im.getpixel((x, y)) for x in range(a) for y in range(a)])
	    with open(str(base_dir / "buildings_original.bin"), 'wb') as f:
	        f.write(data)
	# Receive the VP9 RTP stream on UDP port 5000, depayload and decode it, and
	# show it in an X video window.  `payload` is the RTP payload-type field of
	# application/x-rtp caps.
	gst-launch-1.0 udpsrc port=5000 ! \
	  application/x-rtp,encoding-name=VP9,payload=96 ! \
	  rtpvp9depay ! queue ! \
	  avdec_vp9 ! \
	  xvimagesink sync=false async=false
	# Capture 1920x1080 NV12 video at 30 fps from the camera, run it through the
	# custom CUDA filter library, encode it as VP9 and send it as RTP over UDP to
	# port 5000 on CLIENT_IP.
	# CLIENT_IP must be set in the environment; if it is unset or empty the shell
	# stops with an error instead of launching the pipeline with an empty host.
	gst-launch-1.0 nvarguscamerasrc ! \
	  "video/x-raw(memory:NVMM), width=1920, height=1080, format=NV12, framerate=30/1" ! \
	  nvivafilter cuda-process=true customer-lib-name="/home/alex/Videos/nvsample_cudaprocess/libcustom_cuda_filter.so" ! \
	  nvv4l2vp9enc ! \
	  rtpvp9pay mtu=1400 ! \
	  udpsink host="${CLIENT_IP:?CLIENT_IP must be set to the receiver address}" port=5000 sync=false async=false
	# Record 1920x1080 NV12 camera video at 30 fps, VP9-encoded, into a Matroska
	# file under /mnt/extra whose name carries the start date and time.
	# -e makes gst-launch send EOS on shutdown so the file is finalized properly.
	gst-launch-1.0 nvarguscamerasrc ! \
	  "video/x-raw(memory:NVMM), width=1920, height=1080, format=NV12, framerate=30/1" ! \
	  nvv4l2vp9enc ! \
	  matroskamux ! \
	  filesink location="/mnt/extra/v-$(date +%F-%H-%M-%S).mkv" \
	  -e