Last active
September 2, 2019 09:26
-
-
Save juniorprincewang/d4a8120fa754a2701b5f7c7a76777877 to your computer and use it in GitHub Desktop.
CUDA contexts created by threads are supported by mps. \n first `make threadMigration_kernel64.ptx` then `make`
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
NVCC=/usr/local/cuda/bin/nvcc | |
LIBS=-lcuda | |
INCLUDE=~/NVIDIA_CUDA-9.1_Samples/common/inc | |
all:main | |
threadMigration_kernel64.ptx:threadMigration_kernel.cu | |
$(NVCC) -ptx -o $@ $< $(LIBS) -arch=sm_35 | |
main:threadMigration.cpp | |
$(NVCC) $(LIBS) -I $(INCLUDE) -o threadMigration threadMigration.cpp |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* | |
* Copyright 1993-2015 NVIDIA Corporation. All rights reserved. | |
* | |
* Please refer to the NVIDIA end user license agreement (EULA) associated | |
* with this source code for terms and conditions that govern your use of | |
* this software. Any use, reproduction, disclosure, or distribution of | |
* this software and related documentation outside the terms of the EULA | |
* is strictly prohibited. | |
* | |
*/ | |
/****************************************************************************** | |
* | |
* Module: threadMigration.cpp | |
* | |
* Description: | |
* Simple sample demonstrating multi-GPU/multithread functionality using | |
* the CUDA Context Management API. This API allows the a CUDA context to be | |
* associated with a CPU process. A host thread may have only one device context | |
* current at a time. | |
* | |
* Refer to the CUDA programming guide 4.5.3.3 on Context Management | |
* | |
******************************************************************************/ | |
#define MAXTHREADS 256 | |
#define NUM_INTS 32 | |
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) | |
// Windows threads use different data structures | |
#include <windows.h> | |
DWORD rgdwThreadIds[MAXTHREADS]; | |
HANDLE rghThreads[MAXTHREADS]; | |
CRITICAL_SECTION g_cs; | |
#define ENTERCRITICALSECTION EnterCriticalSection(&g_cs); | |
#define LEAVECRITICALSECTION LeaveCriticalSection(&g_cs); | |
#define STRICMP stricmp | |
#else | |
// Includes POSIX thread headers for Linux thread support | |
#include <pthread.h> | |
#include <stdint.h> | |
pthread_t rghThreads[MAXTHREADS]; | |
pthread_mutex_t g_mutex; | |
#define ENTERCRITICALSECTION pthread_mutex_lock(&g_mutex); | |
#define LEAVECRITICALSECTION pthread_mutex_unlock(&g_mutex); | |
#define STRICMP strcasecmp | |
#endif | |
#include <stdlib.h> | |
#include <stdio.h> | |
#include <cuda.h> | |
#include <cuda_runtime_api.h> | |
#include <helper_cuda_drvapi.h> | |
#include <iostream> | |
#include <cstring> | |
#include <time.h> | |
#include <sys/time.h> | |
#define USECPSEC 1000000ULL | |
using namespace std; | |
int NumThreads; | |
int ThreadLaunchCount; | |
typedef struct _CUDAContext_st | |
{ | |
CUcontext hcuContext; | |
CUmodule hcuModule; | |
CUfunction hcuFunction; | |
CUdeviceptr dptr; | |
CUdevice hcuDevice; | |
int deviceID; | |
int threadNum; | |
unsigned long long time; | |
} CUDAContext; | |
CUDAContext g_ThreadParams[MAXTHREADS]; | |
//define input ptx file for different platforms | |
#if defined(_WIN64) || defined(__LP64__) | |
#define PTX_FILE "threadMigration_kernel64.ptx" | |
#define CUBIN_FILE "threadMigration_kernel64.cubin" | |
#else | |
#define PTX_FILE "threadMigration_kernel32.ptx" | |
#define CUBIN_FILE "threadMigration_kernel32.cubin" | |
#endif | |
bool gbAutoQuit = false; | |
//////////////////////////////////////////////////////////////////////////////// | |
// declaration, forward | |
bool runTest(int argc, char **argv); | |
#define CLEANUP_ON_ERROR(dptr, hcuModule, hcuContext, status) \ | |
if ( dptr ) cuMemFree( dptr ); \ | |
if ( hcuModule ) cuModuleUnload( hcuModule ); \ | |
if ( hcuContext ) cuCtxDestroy( hcuContext ); \ | |
return NULL; | |
// return status; | |
#define THREAD_QUIT \ | |
printf("Error\n"); \ | |
return 0; | |
bool inline | |
findModulePath(const char *module_file, string &module_path, char **argv, string &ptx_source) | |
{ | |
// char *actual_path = sdkFindFilePath(module_file, argv[0]); | |
char *actual_path = sdkFindFilePath(module_file, NULL); | |
if (actual_path) | |
{ | |
module_path = actual_path; | |
} | |
else | |
{ | |
printf("> findModulePath file not found: <%s> \n", module_file); | |
return false; | |
} | |
if (module_path.empty()) | |
{ | |
printf("> findModulePath could not find file: <%s> \n", module_file); | |
return false; | |
} | |
else | |
{ | |
printf("> findModulePath found file at <%s>\n", module_path.c_str()); | |
if (module_path.rfind(".ptx") != string::npos) | |
{ | |
FILE *fp; | |
FOPEN(fp, module_path.c_str(), "rb"); | |
fseek(fp, 0, SEEK_END); | |
int file_size = ftell(fp); | |
char *buf = new char[file_size+1]; | |
fseek(fp, 0, SEEK_SET); | |
fread(buf, sizeof(char), file_size, fp); | |
fclose(fp); | |
buf[file_size] = '\0'; | |
ptx_source = buf; | |
delete[] buf; | |
} | |
return true; | |
} | |
} | |
unsigned long long dtime_usec(unsigned long long start) | |
{ | |
timeval tv; | |
gettimeofday(&tv, 0); | |
return ((tv.tv_sec*USECPSEC)+tv.tv_usec)-start; | |
} | |
// ThreadProc launches the CUDA kernel on a CUDA context. | |
// We have more than one thread that talks to a CUDA context | |
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) | |
DWORD WINAPI ThreadProc(CUDAContext *pParams) | |
#else | |
void *ThreadProc(CUDAContext *pParams) | |
#endif | |
{ | |
int wrong = 0; | |
int *pInt = 0; | |
unsigned delay_t_r=5; | |
unsigned long long difft = dtime_usec(0); | |
CUcontext hcuContext = 0; | |
CUmodule hcuModule = 0; | |
CUfunction hcuFunction = 0; | |
CUdeviceptr dptr = 0; | |
CUdevprop devProps; | |
CUresult status; | |
int deviceID = pParams->deviceID; | |
CUdevice hcuDevice = pParams->hcuDevice; | |
/*status = cuInit(0); | |
if (CUDA_SUCCESS != status) { | |
printf("init failed\n"); | |
return NULL; | |
} | |
*/ | |
printf("get device %d\n", deviceID); | |
// cuCtxCreate: Function works on floating contexts and current context | |
status = cuCtxCreate(&hcuContext, 0, hcuDevice); | |
if (CUDA_SUCCESS != status) | |
{ | |
fprintf(stderr, "cuCtxCreate for <deviceID=%d> failed %d\n", deviceID, status); | |
CLEANUP_ON_ERROR(dptr, hcuModule, hcuContext, status); | |
} | |
status = CUDA_ERROR_INVALID_IMAGE; | |
if (CUDA_SUCCESS != cuDeviceGetProperties(&devProps, hcuDevice)) | |
{ | |
printf("cuDeviceGetProperties FAILED\n"); | |
CLEANUP_ON_ERROR(dptr, hcuModule, hcuContext, status); | |
} | |
string module_path, ptx_source; | |
if (!findModulePath(PTX_FILE, module_path, NULL, ptx_source)) | |
{ | |
if (!findModulePath(CUBIN_FILE, module_path, NULL, ptx_source)) | |
{ | |
fprintf(stderr, "> findModulePath could not find <threadMigration> ptx or cubin\n"); | |
CLEANUP_ON_ERROR(dptr, hcuModule, hcuContext, status); | |
} | |
} | |
printf("module path=%s\n", module_path.c_str()); | |
status = cuModuleLoad(&hcuModule, module_path.c_str()); | |
if (CUDA_SUCCESS != status) | |
{ | |
fprintf(stderr, "cuModuleLoad failed %d\n", status); | |
CLEANUP_ON_ERROR(dptr, hcuModule, hcuContext, status); | |
} | |
status = cuModuleGetFunction(&hcuFunction, hcuModule, "delay_kernel"); | |
if (CUDA_SUCCESS != status) | |
{ | |
fprintf(stderr, "cuModuleGetFunction failed %d\n", status); | |
CLEANUP_ON_ERROR(dptr, hcuModule, hcuContext, status); | |
} | |
// Here we must release the CUDA context from the thread context | |
status = cuCtxPopCurrent(NULL); | |
if (CUDA_SUCCESS != status) | |
{ | |
fprintf(stderr, "cuCtxPopCurrent failed %d\n", status); | |
CLEANUP_ON_ERROR(dptr, hcuModule, hcuContext, status); | |
} | |
printf("<CUDA Device=%d, Context=%p, Thread=%d> - ThreadProc() Launched...\n", | |
pParams->deviceID, hcuContext, pParams->threadNum); | |
// cuCtxPushCurrent: Attach the caller CUDA context to the thread context. | |
status = cuCtxPushCurrent(hcuContext); | |
if (CUDA_SUCCESS != status) | |
{ | |
THREAD_QUIT; | |
} | |
// There are two ways to launch CUDA kernels via the Driver API. | |
// In this CUDA Sample, we illustrate both ways to pass parameters | |
// and specify parameters. By default we use the simpler method. | |
// This is the new CUDA 4.0 API for Kernel Parameter passing and Kernel Launching (simpler method) | |
void *args[5] = { &delay_t_r }; | |
// new CUDA 4.0 Driver API Kernel launch call | |
status = cuLaunchKernel(hcuFunction, 1, 1, 1, | |
1, 1, 1, | |
0, | |
NULL, args, NULL); | |
if (CUDA_SUCCESS != status) | |
{ | |
fprintf(stderr, "cuLaunch failed %d\n", status); | |
THREAD_QUIT; | |
} | |
checkCudaErrors(cuCtxSynchronize()); | |
difft = dtime_usec(difft); | |
printf("kernel duration: %fs\n", difft/(float)USECPSEC); | |
// cuCtxPopCurrent: Detach the current CUDA context from the calling thread. | |
checkCudaErrors(cuCtxPopCurrent(NULL)); | |
printf("<CUDA Device=%d, Context=%p, Thread=%d> - ThreadProc() Finished!\n\n", | |
pParams->deviceID, hcuContext, pParams->threadNum); | |
return 0; | |
} | |
bool FinalErrorCheck(CUDAContext *pContext, int NumThreads, int deviceCount) | |
{ | |
if (ThreadLaunchCount != NumThreads*deviceCount) | |
{ | |
printf("<Expected=%d, Actual=%d> ThreadLaunchCounts(s)\n", | |
NumThreads*deviceCount, ThreadLaunchCount); | |
return false; | |
} | |
else | |
{ | |
for (int iDevice = 0; iDevice < deviceCount; iDevice++) | |
{ | |
// cuCtxDestroy called on current context or a floating context | |
if (CUDA_SUCCESS != cuCtxDestroy(pContext[iDevice].hcuContext)) | |
return false; | |
} | |
return true; | |
} | |
} | |
int | |
main(int argc, char **argv) | |
{ | |
printf("Starting threadMigration\n"); | |
bool bTestResult = runTest(argc, argv); | |
exit(bTestResult ? EXIT_SUCCESS : EXIT_FAILURE); | |
} | |
bool | |
runTest(int argc, char **argv) | |
{ | |
printf("[ threadMigration ] API test...\n"); | |
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) | |
InitializeCriticalSection(&g_cs); | |
#else | |
pthread_mutex_init(&g_mutex, NULL); | |
#endif | |
// By default, we will launch 2 CUDA threads for each device | |
NumThreads = 2; | |
if (argc > 1) | |
{ | |
// If we are doing the QAtest or automated testing, we quit without prompting | |
if (checkCmdLineFlag(argc, (const char **)argv, "qatest") || | |
checkCmdLineFlag(argc, (const char **)argv, "noprompt")) | |
{ | |
gbAutoQuit = true; | |
} | |
if (checkCmdLineFlag(argc, (const char **)argv, "numthreads")) | |
{ | |
NumThreads = getCmdLineArgumentInt(argc, (const char **) argv, "numthreads"); | |
if (NumThreads < 1 || NumThreads > 15) | |
{ | |
printf("Usage: \"threadMigration -n=<threads>\", <threads> ranges 1-15\n"); | |
return 1; | |
} | |
} | |
} | |
int deviceCount; | |
int hcuDevice = 0; | |
CUresult status; | |
status = cuInit(0); | |
if (CUDA_SUCCESS != status) | |
return false; | |
status = cuDeviceGetCount(&deviceCount); | |
if (CUDA_SUCCESS != status) | |
return false; | |
printf("> %d CUDA device(s), %d Thread(s)/device to launched\n\n", deviceCount, NumThreads); | |
if (deviceCount == 0) | |
{ | |
return false; | |
} | |
int ihThread = 0; | |
int ThreadIndex = 0; | |
CUDAContext *pContext = (CUDAContext*) malloc(sizeof(CUDAContext)*NumThreads); | |
for (int iDevice = 0; iDevice < deviceCount; iDevice++) | |
{ | |
char szName[256]; | |
status = cuDeviceGet(&hcuDevice, iDevice); | |
if (CUDA_SUCCESS != status) | |
return false; | |
status = cuDeviceGetName(szName, 256, hcuDevice); | |
if (CUDA_SUCCESS != status) | |
return false; | |
CUdevprop devProps; | |
if (CUDA_SUCCESS == cuDeviceGetProperties(&devProps, hcuDevice)) | |
{ | |
int major = 0, minor = 0; | |
checkCudaErrors(cuDeviceComputeCapability(&major, &minor, hcuDevice)); | |
printf("Device %d: \"%s\" (Compute %d.%d)\n", iDevice, szName, major, minor); | |
printf("\tsharedMemPerBlock: %d\n", devProps.sharedMemPerBlock); | |
printf("\tconstantMemory : %d\n", devProps.totalConstantMemory); | |
printf("\tregsPerBlock : %d\n", devProps.regsPerBlock); | |
printf("\tclockRate : %d\n", devProps.clockRate); | |
printf("\n"); | |
} | |
for (int iThread = 0; iThread < NumThreads; iThread++, ihThread++) | |
{ | |
g_ThreadParams[ThreadIndex].deviceID = iDevice; | |
g_ThreadParams[ThreadIndex].threadNum = iThread; | |
g_ThreadParams[ThreadIndex].hcuDevice = hcuDevice; | |
// Launch (NumThreads) for each CUDA context | |
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) | |
rghThreads[ThreadIndex] = CreateThread(NULL, 0, | |
(LPTHREAD_START_ROUTINE) ThreadProc, | |
&g_ThreadParams[ThreadIndex], | |
0, &rgdwThreadIds[ThreadIndex]); | |
#else // Assume we are running linux | |
pthread_create(&rghThreads[ThreadIndex], NULL, | |
(void *(*)(void *)) ThreadProc, &g_ThreadParams[ThreadIndex]); | |
#endif | |
ThreadIndex += 1; | |
} | |
} | |
// Wait until all workers are done | |
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) | |
WaitForMultipleObjects(ThreadIndex, rghThreads, TRUE, INFINITE); | |
#else | |
for (int i = 0; i < ThreadIndex; i++) | |
{ | |
pthread_join(rghThreads[i], NULL); | |
} | |
#endif | |
bool ret_status = FinalErrorCheck(pContext, NumThreads, deviceCount); | |
free(pContext); | |
return ret_status; | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <time.h> | |
#include <sys/time.h> | |
#define APPRX_CLKS_PER_SEC 1000000000ULL | |
extern "C" __global__ void delay_kernel(unsigned seconds){ | |
unsigned long long dt = clock64(); | |
while (clock64() < (dt + (seconds*APPRX_CLKS_PER_SEC))); | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment