Skip to content

Instantly share code, notes, and snippets.

View aleozlx's full-sized avatar

Alex Yang aleozlx

View GitHub Profile
# Result matrix, plus the tile extent used as the step of the loops below.
C = np.zeros(shape=(3, 4), dtype=int)
block_size = (3, 2)


def div_up(a, b):
    """Return ``a`` divided by ``b``, rounded up (for positive integer ``b``)."""
    return (a + b - 1) // b
### CUDA Grid
for m in range(0, C.shape[0], block_size[0]):
for n in range(0, C.shape[1], block_size[1]):
### Main loop in the CUDA kernel
### Smaller K is favorable to satisfy the shared memory bandwidth
# Matrix product as a sum of rank-1 updates: for every index k along the
# shared dimension, add the outer product of column k of A and row k of B.
# The accumulator's shape and dtype come from the operands, so the snippet
# works for any compatible A and B rather than only 3x5 by 5x4 integers.
C = np.zeros((A.shape[0], B.shape[1]), dtype=np.result_type(A, B))
for k in range(A.shape[1]):
    C += np.outer(A[:, k], B[k, :])
print("C =\n", C)
"""
OUTPUT
=============
C =
# Matrix product by definition: C[i, j] is the dot product of row i of A
# and column j of B. The output's shape and dtype come from the operands,
# so non-integer inputs are not truncated on assignment.
C = np.empty((A.shape[0], B.shape[1]), dtype=np.result_type(A, B))
for i in range(A.shape[0]):
    for j in range(B.shape[1]):
        C[i, j] = np.dot(A[i, :], B[:, j])
print("C =\n", C)
"""
OUTPUT
=============
import numpy as np
# Fixed seed so the operands are the same on every run.
np.random.seed(123)

# Left operand: 3x5 integers drawn from [0, 5).
A = np.random.randint(5, size=(3, 5))
print("A =\n", A)

# Right operand: 5x4 integers drawn from [0, 5).
B = np.random.randint(5, size=(5, 4))
print("B =\n", B)

# Reference product computed by NumPy.
C = A @ B
void test_sobel() {
// Configure the convolution kernel
using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop<
ElementInputA, LayoutInputA,
ElementInputB, LayoutInputB,
ElementOutput, LayoutOutput,
ElementAccumulator,
cutlass::arch::OpClassTensorOp,
cutlass::arch::Sm75,
cutlass::gemm::GemmShape<128, 128, 32>, // Threadblock tile shape,
// Element types: both inputs are cutlass::half_t, while accumulation and
// the output use float.
using ElementAccumulator = float;
using ElementInputA = cutlass::half_t;
using ElementInputB = cutlass::half_t;
using ElementOutput = float;
// Tensor layouts: both inputs and the output use TensorNHWC.
using LayoutInputA = cutlass::layout::TensorNHWC;
using LayoutInputB = cutlass::layout::TensorNHWC;
using LayoutOutput = cutlass::layout::TensorNHWC;
from pathlib import Path
from PIL import Image, ImageOps
def write_bin(size=390, src_name="buildings_original.jpg",
              dst_name="buildings_original.bin"):
    """Dump a square grayscale copy of an image as raw bytes.

    The source image is looked up next to this script, converted to
    grayscale, resized to ``size`` x ``size`` pixels and written to
    ``dst_name`` in the same directory, one value per pixel.

    The output is ordered with ``x`` as the outer index and ``y`` as the
    inner index, so the pixel at ``(x, y)`` lands at offset
    ``x * size + y``.

    Args:
        size: Edge length in pixels of the resized image.
        src_name: File name of the input image, relative to this script.
        dst_name: File name of the raw output, relative to this script.
    """
    base_dir = Path(__file__).parent

    # The context manager closes the source file explicitly; the grayscale
    # and resized copies are independent images that outlive it.
    with Image.open(base_dir / src_name) as source:
        im = ImageOps.grayscale(source).resize((size, size))

    pixels = bytes(
        im.getpixel((x, y)) for x in range(size) for y in range(size)
    )
    (base_dir / dst_name).write_bytes(pixels)
# Receiver: take RTP packets from UDP port 5000, depayload and decode the
# VP9 stream, and display it with xvimagesink.
# The caps field is spelled "payload" (it was misspelled "playload").
gst-launch-1.0 udpsrc port=5000 ! \
application/x-rtp,encoding-name=VP9,payload=96 ! \
rtpvp9depay ! queue ! \
avdec_vp9 ! \
xvimagesink sync=false async=false
# Sender: capture 1920x1080 NV12 frames at 30 fps from nvarguscamerasrc,
# pass them through nvivafilter with a custom CUDA library, encode to VP9,
# packetize as RTP (mtu=1400) and send over UDP to ${CLIENT_IP}, port 5000.
# CLIENT_IP must be set in the environment before running this command.
gst-launch-1.0 nvarguscamerasrc ! \
"video/x-raw(memory:NVMM), width=1920, height=1080, format=NV12, framerate=30/1" ! \
nvivafilter cuda-process=true customer-lib-name="/home/alex/Videos/nvsample_cudaprocess/libcustom_cuda_filter.so" ! \
nvv4l2vp9enc ! \
rtpvp9pay mtu=1400 ! \
udpsink host=${CLIENT_IP} port=5000 sync=false async=false
# Recorder: capture 1920x1080 NV12 frames at 30 fps from nvarguscamerasrc,
# encode to VP9, mux into Matroska and write to a file under /mnt/extra
# whose name carries the current date and time.
# -e makes gst-launch send EOS on shutdown so the muxer can finish the file.
gst-launch-1.0 nvarguscamerasrc ! "video/x-raw(memory:NVMM), width=1920, height=1080, format=NV12, framerate=30/1" ! \
nvv4l2vp9enc ! matroskamux ! \
filesink location="/mnt/extra/v-$(date +%F-%H-%M-%S).mkv" -e