Skip to content

Instantly share code, notes, and snippets.

@christophernhill
Last active January 27, 2020 04:34
Show Gist options
  • Save christophernhill/de9e934a2315fd2551a794d40255d301 to your computer and use it in GitHub Desktop.
Code fragments for Satori Speed Up Green Up Hackathon during IAP 2020
# demo for profiling python using command
#
# nvprof --print-gpu-trace python cuml_tsne.py
#
# Loads the sklearn digits dataset and embeds it into 2 dimensions with the
# GPU-accelerated t-SNE implementation from RAPIDS cuML.
from sklearn.datasets import load_digits
from cuml.manifold import TSNE

# Load the dataset once and unpack features/labels (the original called
# load_digits() twice, reloading the dataset unnecessarily).
digits = load_digits()
X, y = digits.data, digits.target

# 2-D embedding of the digit feature vectors, computed on the GPU.
tsne = TSNE(n_components = 2)
X_hat = tsne.fit_transform(X)
# code is based on https://medium.com/rapids-ai/tsne-with-gpus-hours-to-seconds-9d9c17c941db
# and needs sklearn and cuml, which are both available for ppc64le in conda channels.
#
Gists for speed-up, green-up Satori hackathon IAP 2020
• vector-add.cu
Andrew Kirby code for nvprof example
#include <stdio.h>
#define N 1048576
// Element-wise vector addition: c[i] = a[i] + b[i] for i in [0, N).
// Expects a 1-D launch with at least N total threads; the bounds check
// guards the tail block when gridDim.x * blockDim.x > N.
__global__ void add_vectors(int *a, int *b, int *c){
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid < N)
        c[tid] = a[tid] + b[tid];
}
// Check every CUDA runtime call. Kernel launches do not return an error
// directly: launch-configuration errors surface via cudaGetLastError(),
// and in-kernel faults surface at the next synchronizing call.
#define CUDA_CHECK(call)                                                  \
    do {                                                                  \
        cudaError_t err_ = (call);                                        \
        if (err_ != cudaSuccess) {                                        \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__, \
                    cudaGetErrorString(err_));                            \
            return 1;                                                     \
        }                                                                 \
    } while (0)

// Host driver: fills two N-element vectors (all 1s and all 2s), adds them
// on the GPU, and verifies every result element on the host.
// Returns 0 on success, 1 on any CUDA error, allocation failure, or a
// wrong result.
int main(){
    size_t bytes = N*sizeof(int);

    // Host buffers — the original never checked malloc for failure.
    int *A = (int*)malloc(bytes);
    int *B = (int*)malloc(bytes);
    int *C = (int*)malloc(bytes);
    if (A == NULL || B == NULL || C == NULL) {
        fprintf(stderr, "host allocation failed\n");
        return 1;
    }

    // Device buffers.
    int *d_A, *d_B, *d_C;
    CUDA_CHECK(cudaMalloc(&d_A, bytes));
    CUDA_CHECK(cudaMalloc(&d_B, bytes));
    CUDA_CHECK(cudaMalloc(&d_C, bytes));

    for(int i=0; i<N; i++){
        A[i] = 1;
        B[i] = 2;
    }

    CUDA_CHECK(cudaMemcpy(d_A, A, bytes, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_B, B, bytes, cudaMemcpyHostToDevice));

    // Integer ceil-division replaces the float round-trip ceil(float(N)/t),
    // which is exact for all int N and avoids float precision concerns.
    int thr_per_blk = 256;
    int blk_in_grid = (N + thr_per_blk - 1) / thr_per_blk;
    add_vectors<<< blk_in_grid, thr_per_blk >>>(d_A, d_B, d_C);
    CUDA_CHECK(cudaGetLastError());  // catch launch-configuration errors

    // Blocking D2H copy synchronizes with the kernel, so C is valid after
    // this call and any in-kernel fault is reported here.
    CUDA_CHECK(cudaMemcpy(C, d_C, bytes, cudaMemcpyDeviceToHost));

    // Verify on the host: every element must be 1 + 2 = 3. The original
    // never checked the result, so silent kernel failures went unnoticed.
    for(int i=0; i<N; i++){
        if (C[i] != 3) {
            fprintf(stderr, "verification failed at %d: got %d\n", i, C[i]);
            return 1;
        }
    }
    printf("vector add of %d elements OK\n", N);

    free(A);
    free(B);
    free(C);
    CUDA_CHECK(cudaFree(d_A));
    CUDA_CHECK(cudaFree(d_B));
    CUDA_CHECK(cudaFree(d_C));
    return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment