Last active
January 27, 2020 04:34
-
-
Save christophernhill/de9e934a2315fd2551a794d40255d301 to your computer and use it in GitHub Desktop.
Code fragments for Satori Speed Up Green Up Hackathon during IAP 2020
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Demo for profiling a Python program on the GPU using the command:
#
#   nvprof --print-gpu-trace python cuml_tsne.py
from sklearn.datasets import load_digits
from cuml.manifold import TSNE

# Load the digits dataset a single time and unpack it into the feature
# matrix X and the label vector y (y is not used by the embedding below).
X, y = load_digits(return_X_y=True)

# Fit t-SNE with two output components and transform X in one step.
tsne = TSNE(n_components=2)
X_hat = tsne.fit_transform(X)

# The code is based on https://medium.com/rapids-ai/tsne-with-gpus-hours-to-seconds-9d9c17c941db
# and needs sklearn and cuml, which are both available for ppc64le in conda channels.
#
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Gists for the Speed-Up, Green-Up Satori hackathon, IAP 2020
- vector-add.cu
  Andrew Kirby's code for the nvprof example
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <stdio.h>
#include <stdlib.h>
#define N 1048576 | |
/*
 * Element-wise integer vector addition: c[i] = a[i] + b[i].
 *
 * Each thread derives one global index from its 1D block and thread
 * coordinates and handles that single element. Threads whose index is
 * at or beyond N (the file-level element count) do nothing, so the
 * launch may contain more threads than elements.
 */
__global__ void add_vectors(int *a, int *b, int *c){
    const int gid = threadIdx.x + blockIdx.x * blockDim.x;

    // Surplus threads in the last block have no element to process.
    if (gid >= N) {
        return;
    }

    c[gid] = a[gid] + b[gid];
}
/*
 * Evaluate a CUDA runtime call and terminate the program with a diagnostic
 * (file, line, and the runtime's error string) if it did not succeed.
 */
#define CUDA_CHECK(call)                                                    \
    do {                                                                    \
        cudaError_t err_ = (call);                                          \
        if (err_ != cudaSuccess) {                                          \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,   \
                    cudaGetErrorString(err_));                              \
            exit(EXIT_FAILURE);                                             \
        }                                                                   \
    } while (0)

/*
 * Vector-add driver: fills two host arrays of N ints, copies them to the
 * device, launches add_vectors with one thread per element, copies the sum
 * back and checks it against the host inputs.
 *
 * Returns 0 when every element matches, EXIT_FAILURE otherwise. Any failing
 * CUDA runtime call terminates the process through CUDA_CHECK.
 */
int main(){
    size_t bytes = N*sizeof(int);

    // Host buffers.
    int *A = (int*)malloc(bytes);
    int *B = (int*)malloc(bytes);
    int *C = (int*)malloc(bytes);
    if (A == NULL || B == NULL || C == NULL) {
        fprintf(stderr, "host allocation of %zu bytes failed\n", bytes);
        free(A);   // free(NULL) is a no-op, so this is safe for any subset
        free(B);
        free(C);
        return EXIT_FAILURE;
    }

    // Device buffers.
    int *d_A, *d_B, *d_C;
    CUDA_CHECK(cudaMalloc(&d_A, bytes));
    CUDA_CHECK(cudaMalloc(&d_B, bytes));
    CUDA_CHECK(cudaMalloc(&d_C, bytes));

    for(int i=0; i<N; i++){
        A[i] = 1;
        B[i] = 2;
    }

    CUDA_CHECK(cudaMemcpy(d_A, A, bytes, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_B, B, bytes, cudaMemcpyHostToDevice));

    // Integer ceiling division: enough blocks to cover all N elements
    // without going through floating point.
    int thr_per_blk = 256;
    int blk_in_grid = (N + thr_per_blk - 1) / thr_per_blk;

    add_vectors<<< blk_in_grid, thr_per_blk >>>(d_A, d_B, d_C);
    // A kernel launch returns nothing; launch-configuration errors are
    // only visible through cudaGetLastError().
    CUDA_CHECK(cudaGetLastError());

    // Blocking copy on the default stream; errors raised while the kernel
    // was executing are reported by this call.
    CUDA_CHECK(cudaMemcpy(C, d_C, bytes, cudaMemcpyDeviceToHost));

    // Verify the device result against the host inputs.
    int mismatches = 0;
    for(int i=0; i<N; i++){
        if (C[i] != A[i] + B[i]) {
            mismatches++;
        }
    }
    if (mismatches != 0) {
        fprintf(stderr, "verification failed: %d of %d elements are wrong\n",
                mismatches, N);
    }

    free(A);
    free(B);
    free(C);

    CUDA_CHECK(cudaFree(d_A));
    CUDA_CHECK(cudaFree(d_B));
    CUDA_CHECK(cudaFree(d_C));

    return mismatches == 0 ? 0 : EXIT_FAILURE;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment