Skip to content

Instantly share code, notes, and snippets.

@christophernhill
Last active January 27, 2020 04:34
Show Gist options
  • Save christophernhill/de9e934a2315fd2551a794d40255d301 to your computer and use it in GitHub Desktop.
Code fragments for Satori Speed Up Green Up Hackathon during IAP 2020
# demo for profiling python using command
#
# nvprof --print-gpu-trace python cuml_tsne.py
#
# Loads the sklearn digits dataset and embeds it into 2 dimensions with the
# GPU-accelerated t-SNE implementation from RAPIDS cuML.
from sklearn.datasets import load_digits
from cuml.manifold import TSNE

# Load the dataset once and unpack features/labels (the original called
# load_digits() twice, reloading the dataset unnecessarily).
digits = load_digits()
X, y = digits.data, digits.target

# 2-D embedding of the digit feature vectors, computed on the GPU.
tsne = TSNE(n_components = 2)
X_hat = tsne.fit_transform(X)
# code is based on https://medium.com/rapids-ai/tsne-with-gpus-hours-to-seconds-9d9c17c941db
# and needs sklearn and cuml, which are both available for ppc64le in conda channels.
#
Gists for speed-up, green-up Satori hackathon IAP 2020
• vector-add.cu
Andrew Kirby code for nvprof example
#include <stdio.h>
#define N 1048576
// Element-wise vector addition: c[i] = a[i] + b[i] for i in [0, N).
// Expects a 1-D launch with at least N total threads; the bounds check
// guards the tail block when gridDim.x * blockDim.x > N.
__global__ void add_vectors(int *a, int *b, int *c){
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid < N)
        c[tid] = a[tid] + b[tid];
}
// Check every CUDA runtime call. Kernel launches do not return an error
// directly: launch-configuration errors surface via cudaGetLastError(),
// and in-kernel faults surface at the next synchronizing call.
#define CUDA_CHECK(call)                                                  \
    do {                                                                  \
        cudaError_t err_ = (call);                                        \
        if (err_ != cudaSuccess) {                                        \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__, \
                    cudaGetErrorString(err_));                            \
            return 1;                                                     \
        }                                                                 \
    } while (0)

// Host driver: fills two N-element vectors (all 1s and all 2s), adds them
// on the GPU, and verifies every result element on the host.
// Returns 0 on success, 1 on any CUDA error, allocation failure, or a
// wrong result.
int main(){
    size_t bytes = N*sizeof(int);

    // Host buffers — the original never checked malloc for failure.
    int *A = (int*)malloc(bytes);
    int *B = (int*)malloc(bytes);
    int *C = (int*)malloc(bytes);
    if (A == NULL || B == NULL || C == NULL) {
        fprintf(stderr, "host allocation failed\n");
        return 1;
    }

    // Device buffers.
    int *d_A, *d_B, *d_C;
    CUDA_CHECK(cudaMalloc(&d_A, bytes));
    CUDA_CHECK(cudaMalloc(&d_B, bytes));
    CUDA_CHECK(cudaMalloc(&d_C, bytes));

    for(int i=0; i<N; i++){
        A[i] = 1;
        B[i] = 2;
    }

    CUDA_CHECK(cudaMemcpy(d_A, A, bytes, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_B, B, bytes, cudaMemcpyHostToDevice));

    // Integer ceil-division replaces the float round-trip ceil(float(N)/t),
    // which is exact for all int N and avoids float precision concerns.
    int thr_per_blk = 256;
    int blk_in_grid = (N + thr_per_blk - 1) / thr_per_blk;
    add_vectors<<< blk_in_grid, thr_per_blk >>>(d_A, d_B, d_C);
    CUDA_CHECK(cudaGetLastError());  // catch launch-configuration errors

    // Blocking D2H copy synchronizes with the kernel, so C is valid after
    // this call and any in-kernel fault is reported here.
    CUDA_CHECK(cudaMemcpy(C, d_C, bytes, cudaMemcpyDeviceToHost));

    // Verify on the host: every element must be 1 + 2 = 3. The original
    // never checked the result, so silent kernel failures went unnoticed.
    for(int i=0; i<N; i++){
        if (C[i] != 3) {
            fprintf(stderr, "verification failed at %d: got %d\n", i, C[i]);
            return 1;
        }
    }
    printf("vector add of %d elements OK\n", N);

    free(A);
    free(B);
    free(C);
    CUDA_CHECK(cudaFree(d_A));
    CUDA_CHECK(cudaFree(d_B));
    CUDA_CHECK(cudaFree(d_C));
    return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment