Skip to content

Instantly share code, notes, and snippets.

@me-vlad
Created March 4, 2016 01:38
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 2 You must be signed in to fork a gist
  • Save me-vlad/4388b4a9436cb4fad3c8 to your computer and use it in GitHub Desktop.
Save me-vlad/4388b4a9436cb4fad3c8 to your computer and use it in GitHub Desktop.
ffmpeg nvidia cuda patches
diff -ruN ffmpeg-orig/libavcodec/Makefile ffmpeg/libavcodec/Makefile
--- ffmpeg-orig/libavcodec/Makefile 2016-03-04 00:17:21.166198276 +0200
+++ ffmpeg/libavcodec/Makefile 2016-03-04 00:18:17.583433360 +0200
@@ -102,7 +102,7 @@
motion_est.o ratecontrol.o \
mpegvideoencdsp.o
OBJS-$(CONFIG_MSS34DSP) += mss34dsp.o
-OBJS-$(CONFIG_NVENC) += nvenc.o
+OBJS-$(CONFIG_NVENC) += nvenc.o nvenc_ptx.o
OBJS-$(CONFIG_PIXBLOCKDSP) += pixblockdsp.o
OBJS-$(CONFIG_QPELDSP) += qpeldsp.o
OBJS-$(CONFIG_QSV) += qsv.o
diff -ruN ffmpeg-orig/libavcodec/nvenc.c ffmpeg/libavcodec/nvenc.c
--- ffmpeg-orig/libavcodec/nvenc.c 2016-03-04 00:17:21.113193357 +0200
+++ ffmpeg/libavcodec/nvenc.c 2016-03-04 00:26:51.381033223 +0200
@@ -32,15 +32,11 @@
#include "libavutil/avassert.h"
#include "libavutil/opt.h"
#include "libavutil/mem.h"
+#include "libavutil/cudautils.h"
#include "avcodec.h"
#include "internal.h"
#include "thread.h"
-#if defined(_WIN32)
-#define CUDAAPI __stdcall
-#else
-#define CUDAAPI
-#endif
#if defined(_WIN32)
#define LOAD_FUNC(l, s) GetProcAddress(l, s)
@@ -50,28 +46,19 @@
#define DL_CLOSE_FUNC(l) dlclose(l)
#endif
-typedef enum cudaError_enum {
- CUDA_SUCCESS = 0
-} CUresult;
-typedef int CUdevice;
-typedef void* CUcontext;
-
-typedef CUresult(CUDAAPI *PCUINIT)(unsigned int Flags);
-typedef CUresult(CUDAAPI *PCUDEVICEGETCOUNT)(int *count);
-typedef CUresult(CUDAAPI *PCUDEVICEGET)(CUdevice *device, int ordinal);
-typedef CUresult(CUDAAPI *PCUDEVICEGETNAME)(char *name, int len, CUdevice dev);
-typedef CUresult(CUDAAPI *PCUDEVICECOMPUTECAPABILITY)(int *major, int *minor, CUdevice dev);
-typedef CUresult(CUDAAPI *PCUCTXCREATE)(CUcontext *pctx, unsigned int flags, CUdevice dev);
-typedef CUresult(CUDAAPI *PCUCTXPOPCURRENT)(CUcontext *pctx);
-typedef CUresult(CUDAAPI *PCUCTXDESTROY)(CUcontext ctx);
+#define BLOCKSX 128
+#define THREADSX 128
typedef NVENCSTATUS (NVENCAPI* PNVENCODEAPICREATEINSTANCE)(NV_ENCODE_API_FUNCTION_LIST *functionList);
typedef struct NvencInputSurface
{
NV_ENC_INPUT_PTR input_surface;
+ CUdeviceptr dptr;
+ void* hRes;
int width;
int height;
+ size_t pitch;
int lockCount;
@@ -107,24 +94,11 @@
typedef struct NvencDynLoadFunctions
{
- PCUINIT cu_init;
- PCUDEVICEGETCOUNT cu_device_get_count;
- PCUDEVICEGET cu_device_get;
- PCUDEVICEGETNAME cu_device_get_name;
- PCUDEVICECOMPUTECAPABILITY cu_device_compute_capability;
- PCUCTXCREATE cu_ctx_create;
- PCUCTXPOPCURRENT cu_ctx_pop_current;
- PCUCTXDESTROY cu_ctx_destroy;
-
NV_ENCODE_API_FUNCTION_LIST nvenc_funcs;
- int nvenc_device_count;
- CUdevice nvenc_devices[16];
#if defined(_WIN32)
- HMODULE cuda_lib;
HMODULE nvenc_lib;
#else
- void* cuda_lib;
void* nvenc_lib;
#endif
} NvencDynLoadFunctions;
@@ -140,14 +114,18 @@
AVClass *avclass;
NvencDynLoadFunctions nvenc_dload_funcs;
+ CudaDynLoadFunctions* cuda_dload_funcs;
NV_ENC_INITIALIZE_PARAMS init_encode_params;
NV_ENC_CONFIG encode_config;
CUcontext cu_context;
+ CUmodule cu_module;
+ CUfunction cu_func_interleaveChroma;
int max_surface_count;
NvencInputSurface *input_surfaces;
NvencOutputSurface *output_surfaces;
+ NvencInputSurface transferSurf;
NvencDataList output_surface_queue;
NvencDataList output_surface_ready_queue;
@@ -164,8 +142,10 @@
int twopass;
int gpu;
int buffer_delay;
+ int aq;
} NvencContext;
+
static const NvencValuePair nvenc_h264_level_pairs[] = {
{ "auto", NV_ENC_LEVEL_AUTOSELECT },
{ "1" , NV_ENC_LEVEL_H264_1 },
@@ -330,79 +310,18 @@
return res->u.timestamp;
}
-#define CHECK_LOAD_FUNC(t, f, s) \
-do { \
- (f) = (t)LOAD_FUNC(dl_fn->cuda_lib, s); \
- if (!(f)) { \
- av_log(avctx, AV_LOG_FATAL, "Failed loading %s from CUDA library\n", s); \
- goto error; \
- } \
-} while (0)
-
-static av_cold int nvenc_dyload_cuda(AVCodecContext *avctx)
-{
- NvencContext *ctx = avctx->priv_data;
- NvencDynLoadFunctions *dl_fn = &ctx->nvenc_dload_funcs;
-
- if (dl_fn->cuda_lib)
- return 1;
-
-#if defined(_WIN32)
- dl_fn->cuda_lib = LoadLibrary(TEXT("nvcuda.dll"));
-#else
- dl_fn->cuda_lib = dlopen("libcuda.so", RTLD_LAZY);
-#endif
-
- if (!dl_fn->cuda_lib) {
- av_log(avctx, AV_LOG_FATAL, "Failed loading CUDA library\n");
- goto error;
- }
-
- CHECK_LOAD_FUNC(PCUINIT, dl_fn->cu_init, "cuInit");
- CHECK_LOAD_FUNC(PCUDEVICEGETCOUNT, dl_fn->cu_device_get_count, "cuDeviceGetCount");
- CHECK_LOAD_FUNC(PCUDEVICEGET, dl_fn->cu_device_get, "cuDeviceGet");
- CHECK_LOAD_FUNC(PCUDEVICEGETNAME, dl_fn->cu_device_get_name, "cuDeviceGetName");
- CHECK_LOAD_FUNC(PCUDEVICECOMPUTECAPABILITY, dl_fn->cu_device_compute_capability, "cuDeviceComputeCapability");
- CHECK_LOAD_FUNC(PCUCTXCREATE, dl_fn->cu_ctx_create, "cuCtxCreate_v2");
- CHECK_LOAD_FUNC(PCUCTXPOPCURRENT, dl_fn->cu_ctx_pop_current, "cuCtxPopCurrent_v2");
- CHECK_LOAD_FUNC(PCUCTXDESTROY, dl_fn->cu_ctx_destroy, "cuCtxDestroy_v2");
-
- return 1;
-
-error:
-
- if (dl_fn->cuda_lib)
- DL_CLOSE_FUNC(dl_fn->cuda_lib);
-
- dl_fn->cuda_lib = NULL;
-
- return 0;
-}
-
-static av_cold int check_cuda_errors(AVCodecContext *avctx, CUresult err, const char *func)
-{
- if (err != CUDA_SUCCESS) {
- av_log(avctx, AV_LOG_FATAL, ">> %s - failed with error code 0x%x\n", func, err);
- return 0;
- }
- return 1;
-}
-#define check_cuda_errors(f) if (!check_cuda_errors(avctx, f, #f)) goto error
-
static av_cold int nvenc_check_cuda(AVCodecContext *avctx)
{
- int device_count = 0;
- CUdevice cu_device = 0;
- char gpu_name[128];
- int smminor = 0, smmajor = 0;
- int i, smver, target_smver;
+ int target_smver;
NvencContext *ctx = avctx->priv_data;
- NvencDynLoadFunctions *dl_fn = &ctx->nvenc_dload_funcs;
+
+ if (!init_cuda())
+ return 0;
switch (avctx->codec->id) {
case AV_CODEC_ID_H264:
- target_smver = avctx->pix_fmt == AV_PIX_FMT_YUV444P ? 0x52 : 0x30;
+ target_smver = 0x30;
break;
case AV_CODEC_ID_H265:
target_smver = 0x52;
@@ -412,49 +331,19 @@
goto error;
}
- if (!nvenc_dyload_cuda(avctx))
- return 0;
-
- if (dl_fn->nvenc_device_count > 0)
- return 1;
-
- check_cuda_errors(dl_fn->cu_init(0));
-
- check_cuda_errors(dl_fn->cu_device_get_count(&device_count));
-
- if (!device_count) {
- av_log(avctx, AV_LOG_FATAL, "No CUDA capable devices found\n");
+ if (!is_gpu_feature_available(ctx->gpu, target_smver))
+ {
+ av_log(avctx, AV_LOG_FATAL, "NVENC with Codec %s Not Available at requested GPU %d \n", (avctx->codec->id == AV_CODEC_ID_H264)? "H264" : "H265", ctx->gpu);
goto error;
}
-
- av_log(avctx, AV_LOG_VERBOSE, "%d CUDA capable devices found\n", device_count);
-
- dl_fn->nvenc_device_count = 0;
-
- for (i = 0; i < device_count; ++i) {
- check_cuda_errors(dl_fn->cu_device_get(&cu_device, i));
- check_cuda_errors(dl_fn->cu_device_get_name(gpu_name, sizeof(gpu_name), cu_device));
- check_cuda_errors(dl_fn->cu_device_compute_capability(&smmajor, &smminor, cu_device));
-
- smver = (smmajor << 4) | smminor;
-
- av_log(avctx, AV_LOG_VERBOSE, "[ GPU #%d - < %s > has Compute SM %d.%d, NVENC %s ]\n", i, gpu_name, smmajor, smminor, (smver >= target_smver) ? "Available" : "Not Available");
-
- if (smver >= target_smver)
- dl_fn->nvenc_devices[dl_fn->nvenc_device_count++] = cu_device;
- }
-
- if (!dl_fn->nvenc_device_count) {
- av_log(avctx, AV_LOG_FATAL, "No NVENC capable devices found\n");
- goto error;
+ else
+ {
+ av_log(avctx, AV_LOG_VERBOSE, "NVENC with Codec %s Available at requested GPU %d \n", (avctx->codec->id == AV_CODEC_ID_H264) ? "H264" : "H265", ctx->gpu);
}
return 1;
error:
-
- dl_fn->nvenc_device_count = 0;
-
return 0;
}
@@ -488,23 +377,18 @@
}
nvEncodeAPICreateInstance = (PNVENCODEAPICREATEINSTANCE)LOAD_FUNC(dl_fn->nvenc_lib, "NvEncodeAPICreateInstance");
-
if (!nvEncodeAPICreateInstance) {
av_log(avctx, AV_LOG_FATAL, "Failed to load nvenc entrypoint\n");
goto error;
}
dl_fn->nvenc_funcs.version = NV_ENCODE_API_FUNCTION_LIST_VER;
-
nvstatus = nvEncodeAPICreateInstance(&dl_fn->nvenc_funcs);
-
if (nvstatus != NV_ENC_SUCCESS) {
av_log(avctx, AV_LOG_FATAL, "Failed to create nvenc instance\n");
goto error;
}
- av_log(avctx, AV_LOG_VERBOSE, "Nvenc initialized successfully\n");
-
return 1;
error:
@@ -512,7 +396,6 @@
DL_CLOSE_FUNC(dl_fn->nvenc_lib);
dl_fn->nvenc_lib = NULL;
-
return 0;
}
@@ -523,29 +406,16 @@
DL_CLOSE_FUNC(dl_fn->nvenc_lib);
dl_fn->nvenc_lib = NULL;
-
- dl_fn->nvenc_device_count = 0;
-
- DL_CLOSE_FUNC(dl_fn->cuda_lib);
- dl_fn->cuda_lib = NULL;
-
- dl_fn->cu_init = NULL;
- dl_fn->cu_device_get_count = NULL;
- dl_fn->cu_device_get = NULL;
- dl_fn->cu_device_get_name = NULL;
- dl_fn->cu_device_compute_capability = NULL;
- dl_fn->cu_ctx_create = NULL;
- dl_fn->cu_ctx_pop_current = NULL;
- dl_fn->cu_ctx_destroy = NULL;
-
+ deinit_cuda();
av_log(avctx, AV_LOG_VERBOSE, "Nvenc unloaded\n");
}
static av_cold int nvenc_encode_init(AVCodecContext *avctx)
{
NV_ENC_OPEN_ENCODE_SESSION_EX_PARAMS encode_session_params = { 0 };
+ NV_ENC_REGISTER_RESOURCE registerParams = { 0 };
NV_ENC_PRESET_CONFIG preset_config = { 0 };
- CUcontext cu_context_curr;
+ CudaDynLoadFunctions *p_cuda;
CUresult cu_res;
GUID encoder_preset = NV_ENC_PRESET_HQ_GUID;
GUID codec;
@@ -558,6 +428,7 @@
int res = 0;
int dw, dh;
int qp_inter_p;
+ extern char color_ptx[];
NvencContext *ctx = avctx->priv_data;
NvencDynLoadFunctions *dl_fn = &ctx->nvenc_dload_funcs;
@@ -575,28 +446,18 @@
encode_session_params.version = NV_ENC_OPEN_ENCODE_SESSION_EX_PARAMS_VER;
encode_session_params.apiVersion = NVENCAPI_VERSION;
- if (ctx->gpu >= dl_fn->nvenc_device_count) {
- av_log(avctx, AV_LOG_FATAL, "Requested GPU %d, but only %d GPUs are available!\n", ctx->gpu, dl_fn->nvenc_device_count);
- res = AVERROR(EINVAL);
- goto error;
- }
-
- ctx->cu_context = NULL;
- cu_res = dl_fn->cu_ctx_create(&ctx->cu_context, 4, dl_fn->nvenc_devices[ctx->gpu]); // CU_CTX_SCHED_BLOCKING_SYNC=4, avoid CPU spins
+ cu_res = get_cuda_context(&ctx->cu_context, ctx->gpu);
+ p_cuda = get_cuda_dl_func();
if (cu_res != CUDA_SUCCESS) {
av_log(avctx, AV_LOG_FATAL, "Failed creating CUDA context for NVENC: 0x%x\n", (int)cu_res);
res = AVERROR_EXTERNAL;
goto error;
}
+ av_log(avctx, AV_LOG_VERBOSE, "NVENC : Cuda Context created %p\n", (void *)ctx->cu_context); /* %p: the context handle is pointer-sized, an (int) cast would truncate it on 64-bit builds */
- cu_res = dl_fn->cu_ctx_pop_current(&cu_context_curr);
-
- if (cu_res != CUDA_SUCCESS) {
- av_log(avctx, AV_LOG_FATAL, "Failed popping CUDA context: 0x%x\n", (int)cu_res);
- res = AVERROR_EXTERNAL;
- goto error;
- }
+ __cu(p_cuda->cu_module_load_data(&ctx->cu_module, color_ptx));
+ __cu(p_cuda->cu_module_get_function(&ctx->cu_func_interleaveChroma, ctx->cu_module, "interleaveChroma"));
encode_session_params.device = ctx->cu_context;
encode_session_params.deviceType = NV_ENC_DEVICE_TYPE_CUDA;
@@ -866,6 +727,15 @@
ctx->encode_config.frameFieldMode = NV_ENC_PARAMS_FRAME_FIELD_MODE_FRAME;
}
+ if (ctx->aq)
+ {
+ ctx->encode_config.rcParams.enableAQ = 1;
+ }
+ else
+ {
+ ctx->encode_config.rcParams.enableAQ = 0;
+ }
+
switch (avctx->codec->id) {
case AV_CODEC_ID_H264:
ctx->encode_config.encodeCodecConfig.h264Config.h264VUIParameters.colourDescriptionPresentFlag = 1;
@@ -885,9 +755,6 @@
if (!ctx->profile) {
switch (avctx->profile) {
- case FF_PROFILE_H264_HIGH_444_PREDICTIVE:
- ctx->encode_config.profileGUID = NV_ENC_H264_PROFILE_HIGH_444_GUID;
- break;
case FF_PROFILE_H264_BASELINE:
ctx->encode_config.profileGUID = NV_ENC_H264_PROFILE_BASELINE_GUID;
break;
@@ -913,9 +780,6 @@
} else if (!strcmp(ctx->profile, "baseline")) {
ctx->encode_config.profileGUID = NV_ENC_H264_PROFILE_BASELINE_GUID;
avctx->profile = FF_PROFILE_H264_BASELINE;
- } else if (!strcmp(ctx->profile, "high444p")) {
- ctx->encode_config.profileGUID = NV_ENC_H264_PROFILE_HIGH_444_GUID;
- avctx->profile = FF_PROFILE_H264_HIGH_444_PREDICTIVE;
} else {
av_log(avctx, AV_LOG_FATAL, "Profile \"%s\" is unknown! Supported profiles: high, main, baseline\n", ctx->profile);
res = AVERROR(EINVAL);
@@ -923,13 +787,7 @@
}
}
- // force setting profile as high444p if input is AV_PIX_FMT_YUV444P
- if (avctx->pix_fmt == AV_PIX_FMT_YUV444P) {
- ctx->encode_config.profileGUID = NV_ENC_H264_PROFILE_HIGH_444_GUID;
- avctx->profile = FF_PROFILE_H264_HIGH_444_PREDICTIVE;
- }
-
- ctx->encode_config.encodeCodecConfig.h264Config.chromaFormatIDC = avctx->profile == FF_PROFILE_H264_HIGH_444_PREDICTIVE ? 3 : 1;
+ ctx->encode_config.encodeCodecConfig.h264Config.chromaFormatIDC = 1;
if (ctx->level) {
res = input_string_to_uint32(avctx, nvenc_h264_level_pairs, ctx->level, &ctx->encode_config.encodeCodecConfig.h264Config.level);
@@ -988,6 +846,8 @@
goto error;
}
+ av_log(avctx, AV_LOG_VERBOSE, "Nvenc initialized successfully\n");
+
ctx->input_surfaces = av_malloc(ctx->max_surface_count * sizeof(*ctx->input_surfaces));
if (!ctx->input_surfaces) {
@@ -1002,28 +862,32 @@
goto error;
}
+ // Allocation for temp surface used for sys mem -> device mem transfer
+ if (avctx->pix_fmt == AV_PIX_FMT_YUV420P)
+ {
+ ctx->transferSurf.width = (avctx->width + 31) & ~31;
+ ctx->transferSurf.height = (avctx->height + 31) & ~31;
+ p_cuda->cu_mem_alloc_pitch(&ctx->transferSurf.dptr,
+ &ctx->transferSurf.pitch,
+ ctx->transferSurf.width,
+ ctx->transferSurf.height/ 2, 16);
+ }
+
for (surfaceCount = 0; surfaceCount < ctx->max_surface_count; ++surfaceCount) {
- NV_ENC_CREATE_INPUT_BUFFER allocSurf = { 0 };
NV_ENC_CREATE_BITSTREAM_BUFFER allocOut = { 0 };
- allocSurf.version = NV_ENC_CREATE_INPUT_BUFFER_VER;
allocOut.version = NV_ENC_CREATE_BITSTREAM_BUFFER_VER;
- allocSurf.width = (avctx->width + 31) & ~31;
- allocSurf.height = (avctx->height + 31) & ~31;
-
- allocSurf.memoryHeap = NV_ENC_MEMORY_HEAP_SYSMEM_CACHED;
+ ctx->input_surfaces[surfaceCount].width = (avctx->width + 31) & ~31;
+ ctx->input_surfaces[surfaceCount].height = (avctx->height + 31) & ~31;
switch (avctx->pix_fmt) {
case AV_PIX_FMT_YUV420P:
- allocSurf.bufferFmt = NV_ENC_BUFFER_FORMAT_YV12_PL;
- break;
-
case AV_PIX_FMT_NV12:
- allocSurf.bufferFmt = NV_ENC_BUFFER_FORMAT_NV12_PL;
- break;
-
- case AV_PIX_FMT_YUV444P:
- allocSurf.bufferFmt = NV_ENC_BUFFER_FORMAT_YUV444_PL;
+ ctx->input_surfaces[surfaceCount].format = NV_ENC_BUFFER_FORMAT_NV12_PL;
+ p_cuda->cu_mem_alloc_pitch(&ctx->input_surfaces[surfaceCount].dptr,
+ &ctx->input_surfaces[surfaceCount].pitch,
+ ctx->input_surfaces[surfaceCount].width,
+ ctx->input_surfaces[surfaceCount].height * 3 / 2, 16);
break;
default:
@@ -1032,18 +896,21 @@
goto error;
}
- nv_status = p_nvenc->nvEncCreateInputBuffer(ctx->nvencoder, &allocSurf);
+ registerParams.version = NV_ENC_REGISTER_RESOURCE_VER,
+ registerParams.resourceType = NV_ENC_INPUT_RESOURCE_TYPE_CUDADEVICEPTR,
+ registerParams.width = ctx->input_surfaces[surfaceCount].width,
+ registerParams.height = ctx->input_surfaces[surfaceCount].height,
+ registerParams.pitch = ctx->input_surfaces[surfaceCount].pitch,
+ registerParams.bufferFormat = ctx->input_surfaces[surfaceCount].format;
+ registerParams.resourceToRegister = (void*)ctx->input_surfaces[surfaceCount].dptr,
+ nv_status = p_nvenc->nvEncRegisterResource(ctx->nvencoder, &registerParams);
if (nv_status != NV_ENC_SUCCESS) {
- av_log(avctx, AV_LOG_FATAL, "CreateInputBuffer failed\n");
+ av_log(avctx, AV_LOG_FATAL, "RegisterResource failed\n");
res = AVERROR_EXTERNAL;
goto error;
}
-
+ ctx->input_surfaces[surfaceCount].hRes = registerParams.registeredResource;
ctx->input_surfaces[surfaceCount].lockCount = 0;
- ctx->input_surfaces[surfaceCount].input_surface = allocSurf.inputBuffer;
- ctx->input_surfaces[surfaceCount].format = allocSurf.bufferFmt;
- ctx->input_surfaces[surfaceCount].width = allocSurf.width;
- ctx->input_surfaces[surfaceCount].height = allocSurf.height;
/* 1MB is large enough to hold most output frames. NVENC increases this automaticaly if it's not enough. */
allocOut.size = 1024 * 1024;
@@ -1106,21 +973,22 @@
return 0;
error:
-
for (i = 0; i < surfaceCount; ++i) {
- p_nvenc->nvEncDestroyInputBuffer(ctx->nvencoder, ctx->input_surfaces[i].input_surface);
+ p_nvenc->nvEncUnregisterResource(ctx->nvencoder, ctx->input_surfaces[i].hRes);
+ p_cuda->cu_mem_free(ctx->input_surfaces[i].dptr);
+
if (ctx->output_surfaces[i].output_surface)
p_nvenc->nvEncDestroyBitstreamBuffer(ctx->nvencoder, ctx->output_surfaces[i].output_surface);
}
+ p_cuda->cu_mem_free(ctx->transferSurf.dptr);
if (ctx->nvencoder)
p_nvenc->nvEncDestroyEncoder(ctx->nvencoder);
if (ctx->cu_context)
- dl_fn->cu_ctx_destroy(ctx->cu_context);
+ release_cuda_context(&ctx->cu_context, ctx->gpu);
nvenc_unload_nvenc(avctx);
-
ctx->nvencoder = NULL;
ctx->cu_context = NULL;
@@ -1132,6 +1000,7 @@
NvencContext *ctx = avctx->priv_data;
NvencDynLoadFunctions *dl_fn = &ctx->nvenc_dload_funcs;
NV_ENCODE_API_FUNCTION_LIST *p_nvenc = &dl_fn->nvenc_funcs;
+ CudaDynLoadFunctions *p_cuda = get_cuda_dl_func();
int i;
av_freep(&ctx->timestamp_list.data);
@@ -1139,16 +1008,24 @@
av_freep(&ctx->output_surface_queue.data);
for (i = 0; i < ctx->max_surface_count; ++i) {
- p_nvenc->nvEncDestroyInputBuffer(ctx->nvencoder, ctx->input_surfaces[i].input_surface);
+ p_nvenc->nvEncUnregisterResource(ctx->nvencoder, ctx->input_surfaces[i].hRes);
+ p_cuda->cu_mem_free(ctx->input_surfaces[i].dptr);
p_nvenc->nvEncDestroyBitstreamBuffer(ctx->nvencoder, ctx->output_surfaces[i].output_surface);
}
+ /* Free the YUV420P chroma staging surface allocated in nvenc_encode_init(); it was only released on init's error path. */
+ if (ctx->transferSurf.dptr) {
+ p_cuda->cu_mem_free(ctx->transferSurf.dptr);
+ ctx->transferSurf.dptr = 0;
+ }
ctx->max_surface_count = 0;
- p_nvenc->nvEncDestroyEncoder(ctx->nvencoder);
+ if (ctx->nvencoder)
+ p_nvenc->nvEncDestroyEncoder(ctx->nvencoder);
+
ctx->nvencoder = NULL;
- dl_fn->cu_ctx_destroy(ctx->cu_context);
- ctx->cu_context = NULL;
+ if (ctx->cu_context)
+ release_cuda_context(&ctx->cu_context, ctx->gpu);
nvenc_unload_nvenc(avctx);
@@ -1163,6 +1035,7 @@
uint32_t slice_mode_data;
uint32_t *slice_offsets;
+ char picType = 'X';
NV_ENC_LOCK_BITSTREAM lock_params = { 0 };
NVENCSTATUS nv_status;
int res = 0;
@@ -1215,12 +1088,15 @@
FF_DISABLE_DEPRECATION_WARNINGS
case NV_ENC_PIC_TYPE_I:
avctx->coded_frame->pict_type = AV_PICTURE_TYPE_I;
+ picType = 'I';
break;
case NV_ENC_PIC_TYPE_P:
avctx->coded_frame->pict_type = AV_PICTURE_TYPE_P;
+ picType = 'P';
break;
case NV_ENC_PIC_TYPE_B:
avctx->coded_frame->pict_type = AV_PICTURE_TYPE_B;
+ picType = 'B';
break;
case NV_ENC_PIC_TYPE_BI:
avctx->coded_frame->pict_type = AV_PICTURE_TYPE_BI;
@@ -1234,6 +1110,8 @@
#endif
}
+ av_log(avctx, AV_LOG_VERBOSE, "FRAME STATISTICS: Frame No. %d PicType %c Frame AvgQP %d SATD Cost %d Size %d bytes\r", lock_params.frameIdx, picType, lock_params.frameAvgQP, lock_params.frameSatd, lock_params.bitstreamSizeInBytes);
+
pkt->pts = lock_params.outputTimeStamp;
pkt->dts = timestamp_queue_dequeue(&ctx->timestamp_list);
@@ -1261,6 +1139,61 @@
return res;
}
+/* Launch the "interleaveChroma" CUDA kernel with the given plane pointers and geometry; as written it always returns 0. NOTE(review): whether __cu() bails out on a CUDA error depends on its definition (presumably in libavutil/cudautils.h) -- confirm. */
+static int call_interleavechroma_kernel(CudaDynLoadFunctions* dl_func, CUfunction func,
+ CUdeviceptr cb_dptr, CUdeviceptr cr_dptr, CUdeviceptr nv12chroma_dptr, int width, int height, int srcStride, int dstStride)
+{
+ void *args_uchar[] = { &cb_dptr, &cr_dptr, &nv12chroma_dptr, &width, &height, &srcStride, &dstStride}; /* seven kernel arguments, same order as param_0..param_6 of the PTX entry */
+ __cu(dl_func->cu_launch_kernel(func, BLOCKSX, 1, 1, THREADSX, 1, 1, 0, NULL, args_uchar, NULL)); /* presumably mirrors cuLaunchKernel: BLOCKSX x 1 x 1 grid, THREADSX x 1 x 1 block, no shared memory, NULL stream -- confirm */
+ /* No explicit synchronization is issued after the launch in this function. */
+ return 0;
+}
+/* Fill the CUDA input surface inSurf from frame: luma goes to inSurf->dptr, chroma to inSurf->dptr + pitch*height. Returns 0, or AVERROR(EINVAL) for a frame->format other than NV12/YUV420P. NOTE(review): error behaviour of __cu()/cuMemCpy2d is defined outside this file -- confirm. */
+static int nvenc_copy_to_inputbuffer(NvencContext *ctx, const AVFrame* frame, NvencInputSurface *inSurf)
+{
+ CudaDynLoadFunctions *p_cuda = get_cuda_dl_func();
+ if (frame->format == AV_PIX_FMT_NV12) {
+ /* NV12 input: both planes are copied unchanged, no chroma interleaving needed. */
+ // If frame->opaque passes check_nvinfo() and carries a non-zero dptr[0], use those device pointers as the source
+ if (frame->opaque && check_nvinfo(frame->opaque) &&
+ ((ffnvinfo*)(frame->opaque))->dptr[0]) {
+ ffnvinfo* info = (ffnvinfo*)frame->opaque;
+ /* device -> device: luma plane, then the interleaved chroma plane (height/2 rows) */
+ __cu(cuMemCpy2d(NULL, info->dptr[0], info->linesize[0], NULL, inSurf->dptr, inSurf->pitch, frame->width, frame->height, CU_MEMORYTYPE_DEVICE, CU_MEMORYTYPE_DEVICE));
+ __cu(cuMemCpy2d(NULL, info->dptr[1], info->linesize[1], NULL, inSurf->dptr + inSurf->pitch*inSurf->height, inSurf->pitch, frame->width, frame->height/2, CU_MEMORYTYPE_DEVICE, CU_MEMORYTYPE_DEVICE));
+ }
+ else
+ { /* host -> device: source is frame->data[] with frame->linesize[] */
+ __cu(cuMemCpy2d(frame->data[0], (CUdeviceptr)NULL, frame->linesize[0], NULL, inSurf->dptr, inSurf->pitch, frame->width, frame->height, CU_MEMORYTYPE_HOST, CU_MEMORYTYPE_DEVICE));
+ __cu(cuMemCpy2d(frame->data[1], (CUdeviceptr)NULL, frame->linesize[1], NULL, inSurf->dptr + inSurf->pitch*inSurf->height, inSurf->pitch, frame->width, frame->height / 2, CU_MEMORYTYPE_HOST, CU_MEMORYTYPE_DEVICE));
+ }
+ }
+ else if (frame->format == AV_PIX_FMT_YUV420P) {
+ // Planar input: luma is copied directly, the two chroma planes are interleaved on the GPU by the interleaveChroma kernel
+ if (frame->opaque && check_nvinfo(frame->opaque) &&
+ ((ffnvinfo*)(frame->opaque))->dptr[0]) {
+ ffnvinfo* info = (ffnvinfo*)frame->opaque;
+ /* device-resident planes: the kernel reads info->dptr[1] and info->dptr[2] directly */
+ __cu(cuMemCpy2d(NULL, info->dptr[0], info->linesize[0], NULL, inSurf->dptr, inSurf->pitch, frame->width, frame->height, CU_MEMORYTYPE_DEVICE, CU_MEMORYTYPE_DEVICE));
+ call_interleavechroma_kernel(p_cuda, ctx->cu_func_interleaveChroma, info->dptr[1], info->dptr[2], inSurf->dptr + inSurf->pitch*inSurf->height, (frame->width+31) & ~31 , frame->height, (info->linesize[1]<<1), inSurf->pitch); /* return value not checked */
+ }
+ else
+ { /* host planes: chroma is first staged in ctx->transferSurf (second plane at offset pitch*height/4, row pitch transferSurf.pitch/2) */
+ __cu(cuMemCpy2d(frame->data[0], (CUdeviceptr)NULL, frame->linesize[0], NULL, inSurf->dptr, inSurf->pitch, frame->width, frame->height, CU_MEMORYTYPE_HOST, CU_MEMORYTYPE_DEVICE));
+ __cu(cuMemCpy2d(frame->data[1], (CUdeviceptr)NULL, frame->linesize[1], NULL, ctx->transferSurf.dptr, ctx->transferSurf.pitch / 2, ctx->transferSurf.width / 2, frame->height / 2, CU_MEMORYTYPE_HOST, CU_MEMORYTYPE_DEVICE)); /* NOTE(review): if the 7th argument is the row width in bytes, this copies the padded width/2, not frame->width/2 -- confirm linesize[1] always covers it */
+ __cu(cuMemCpy2d(frame->data[2], (CUdeviceptr)NULL, frame->linesize[2], NULL, ctx->transferSurf.dptr + ctx->transferSurf.pitch*ctx->transferSurf.height / 4, ctx->transferSurf.pitch / 2, ctx->transferSurf.width / 2, frame->height / 2, CU_MEMORYTYPE_HOST, CU_MEMORYTYPE_DEVICE));
+ /* Interleave the two staged planes into the chroma area of inSurf. */
+ call_interleavechroma_kernel(p_cuda, ctx->cu_func_interleaveChroma, ctx->transferSurf.dptr, ctx->transferSurf.dptr + ctx->transferSurf.pitch*ctx->transferSurf.height/4, inSurf->dptr + inSurf->pitch*inSurf->height, (frame->width + 31) & ~31, frame->height, ctx->transferSurf.pitch, inSurf->pitch);
+ }
+ }
+ else {
+ av_log(NULL, AV_LOG_FATAL, "Invalid pixel format!\n");
+ return AVERROR(EINVAL);
+ }
+ /* Reached after the NV12 and YUV420P branches. */
+ return 0;
+}
+
static int nvenc_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
const AVFrame *frame, int *got_packet)
{
@@ -1276,7 +1209,7 @@
pic_params.version = NV_ENC_PIC_PARAMS_VER;
if (frame) {
- NV_ENC_LOCK_INPUT_BUFFER lockBufferParams = { 0 };
+ NV_ENC_MAP_INPUT_RESOURCE mapParams = { 0 };
NvencInputSurface *inSurf = NULL;
for (i = 0; i < ctx->max_surface_count; ++i) {
@@ -1290,69 +1223,27 @@
inSurf->lockCount = 1;
- lockBufferParams.version = NV_ENC_LOCK_INPUT_BUFFER_VER;
- lockBufferParams.inputBuffer = inSurf->input_surface;
-
- nv_status = p_nvenc->nvEncLockInputBuffer(ctx->nvencoder, &lockBufferParams);
+ mapParams.version = NV_ENC_MAP_INPUT_RESOURCE_VER;
+ mapParams.registeredResource = inSurf->hRes;
+ nv_status = p_nvenc->nvEncMapInputResource(ctx->nvencoder, &mapParams);
if (nv_status != NV_ENC_SUCCESS) {
- av_log(avctx, AV_LOG_ERROR, "Failed locking nvenc input buffer\n");
+ av_log(avctx, AV_LOG_ERROR, "Failed mapping nvenc input buffer\n");
return 0;
}
- if (avctx->pix_fmt == AV_PIX_FMT_YUV420P) {
- uint8_t *buf = lockBufferParams.bufferDataPtr;
-
- av_image_copy_plane(buf, lockBufferParams.pitch,
- frame->data[0], frame->linesize[0],
- avctx->width, avctx->height);
-
- buf += inSurf->height * lockBufferParams.pitch;
-
- av_image_copy_plane(buf, lockBufferParams.pitch >> 1,
- frame->data[2], frame->linesize[2],
- avctx->width >> 1, avctx->height >> 1);
-
- buf += (inSurf->height * lockBufferParams.pitch) >> 2;
-
- av_image_copy_plane(buf, lockBufferParams.pitch >> 1,
- frame->data[1], frame->linesize[1],
- avctx->width >> 1, avctx->height >> 1);
- } else if (avctx->pix_fmt == AV_PIX_FMT_NV12) {
- uint8_t *buf = lockBufferParams.bufferDataPtr;
-
- av_image_copy_plane(buf, lockBufferParams.pitch,
- frame->data[0], frame->linesize[0],
- avctx->width, avctx->height);
-
- buf += inSurf->height * lockBufferParams.pitch;
-
- av_image_copy_plane(buf, lockBufferParams.pitch,
- frame->data[1], frame->linesize[1],
- avctx->width, avctx->height >> 1);
- } else if (avctx->pix_fmt == AV_PIX_FMT_YUV444P) {
- uint8_t *buf = lockBufferParams.bufferDataPtr;
-
- av_image_copy_plane(buf, lockBufferParams.pitch,
- frame->data[0], frame->linesize[0],
- avctx->width, avctx->height);
-
- buf += inSurf->height * lockBufferParams.pitch;
-
- av_image_copy_plane(buf, lockBufferParams.pitch,
- frame->data[1], frame->linesize[1],
- avctx->width, avctx->height);
-
- buf += inSurf->height * lockBufferParams.pitch;
+ inSurf->input_surface = mapParams.mappedResource;
+ if (inSurf->format != mapParams.mappedBufferFmt) {
+ av_log(avctx, AV_LOG_ERROR, "Incompatible buffer format!\n");
+ return 0;
+ }
- av_image_copy_plane(buf, lockBufferParams.pitch,
- frame->data[2], frame->linesize[2],
- avctx->width, avctx->height);
- } else {
- av_log(avctx, AV_LOG_FATAL, "Invalid pixel format!\n");
- return AVERROR(EINVAL);
+ if (nvenc_copy_to_inputbuffer(ctx, frame, inSurf) != 0) {
+ p_nvenc->nvEncUnmapInputResource(ctx->nvencoder, inSurf->input_surface);
+ av_log(avctx, AV_LOG_ERROR, "Failed to copy data to NVENC input buffer!\n");
+ return 0;
}
- nv_status = p_nvenc->nvEncUnlockInputBuffer(ctx->nvencoder, inSurf->input_surface);
+ nv_status = p_nvenc->nvEncUnmapInputResource(ctx->nvencoder, inSurf->input_surface);
if (nv_status != NV_ENC_SUCCESS) {
av_log(avctx, AV_LOG_FATAL, "Failed unlocking input buffer!\n");
return AVERROR_EXTERNAL;
@@ -1470,7 +1361,6 @@
static const enum AVPixelFormat pix_fmts_nvenc[] = {
AV_PIX_FMT_YUV420P,
AV_PIX_FMT_NV12,
- AV_PIX_FMT_YUV444P,
AV_PIX_FMT_NONE
};
@@ -1478,13 +1368,14 @@
#define VE AV_OPT_FLAG_VIDEO_PARAM | AV_OPT_FLAG_ENCODING_PARAM
static const AVOption options[] = {
{ "preset", "Set the encoding preset (one of slow = hq 2pass, medium = hq, fast = hp, hq, hp, bd, ll, llhq, llhp, default)", OFFSET(preset), AV_OPT_TYPE_STRING, { .str = "medium" }, 0, 0, VE },
- { "profile", "Set the encoding profile (high, main, baseline or high444p)", OFFSET(profile), AV_OPT_TYPE_STRING, { .str = "main" }, 0, 0, VE },
+ { "profile", "Set the encoding profile (high, main, baseline)", OFFSET(profile), AV_OPT_TYPE_STRING, { .str = "main" }, 0, 0, VE },
{ "level", "Set the encoding level restriction (auto, 1.0, 1.0b, 1.1, 1.2, ..., 4.2, 5.0, 5.1)", OFFSET(level), AV_OPT_TYPE_STRING, { .str = "auto" }, 0, 0, VE },
{ "tier", "Set the encoding tier (main or high)", OFFSET(tier), AV_OPT_TYPE_STRING, { .str = "main" }, 0, 0, VE },
{ "cbr", "Use cbr encoding mode", OFFSET(cbr), AV_OPT_TYPE_BOOL, { .i64 = 0 }, 0, 1, VE },
{ "2pass", "Use 2pass encoding mode", OFFSET(twopass), AV_OPT_TYPE_BOOL, { .i64 = -1 }, -1, 1, VE },
{ "gpu", "Selects which NVENC capable GPU to use. First GPU is 0, second is 1, and so on.", OFFSET(gpu), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, INT_MAX, VE },
{ "delay", "Delays frame output by the given amount of frames.", OFFSET(buffer_delay), AV_OPT_TYPE_INT, { .i64 = INT_MAX }, 0, INT_MAX, VE },
+ { "enableaq", "set to 1 to enable AQ ", OFFSET(aq), AV_OPT_TYPE_BOOL, { .i64 = 0 }, 0, 1, VE },
{ NULL }
};
diff -ruN ffmpeg-orig/libavcodec/nvenc_ptx.c ffmpeg/libavcodec/nvenc_ptx.c
--- ffmpeg-orig/libavcodec/nvenc_ptx.c 1970-01-01 03:00:00.000000000 +0300
+++ ffmpeg/libavcodec/nvenc_ptx.c 2016-03-04 00:18:17.585433546 +0200
@@ -0,0 +1,240 @@
+/*
+ * Select the PTX variant whose .address_size matches the host pointer width.
+ * The width is derived from UINTPTR_MAX instead of compiler/architecture
+ * macros, so 64-bit targets such as aarch64 or ppc64le Linux, and compilers
+ * other than MSVC/GCC, are classified correctly.
+ */
+#include <stdint.h>
+#if !defined(UINTPTR_MAX)
+#error "nvenc_ptx.c: cannot determine pointer width (UINTPTR_MAX undefined)"
+#elif UINTPTR_MAX > 0xFFFFFFFFu
+#define ENVIRONMENT64
+#else
+#define ENVIRONMENT32
+#endif
+/* Exactly one of ENVIRONMENT32 / ENVIRONMENT64 is defined past this point. */
+#ifdef ENVIRONMENT32 /* 32-bit variant: PTX declares .address_size 32 and takes the three pointer parameters as .u32 */
+const char color_ptx[] = \
+ "//\n" /* PTX source text of the interleaveChroma kernel, kept as one string constant */
+ "// Generated by NVIDIA NVVM Compiler\n"
+ "//\n"
+ "// Compiler Build ID: CL-19830389\n"
+ "// Cuda compilation tools, release 8.0, V8.0.0\n"
+ "// Based on LLVM 3.4svn\n"
+ "//\n"
+ "\n"
+ ".version 4.3\n"
+ ".target sm_30\n"
+ ".address_size 32\n"
+ "\n"
+ "// .globl interleaveChroma\n"
+ "\n"
+ ".visible .entry interleaveChroma(\n"
+ ".param .u32 interleaveChroma_param_0,\n"
+ ".param .u32 interleaveChroma_param_1,\n"
+ ".param .u32 interleaveChroma_param_2,\n"
+ ".param .u32 interleaveChroma_param_3,\n"
+ ".param .u32 interleaveChroma_param_4,\n"
+ ".param .u32 interleaveChroma_param_5,\n"
+ ".param .u32 interleaveChroma_param_6\n"
+ ")\n"
+ "{\n"
+ ".reg .pred %p<5>;\n"
+ ".reg .b32 %r<57>;\n"
+ "\n"
+ "\n"
+ "ld.param.u32 %r15, [interleaveChroma_param_0];\n"
+ "ld.param.u32 %r16, [interleaveChroma_param_1];\n"
+ "ld.param.u32 %r17, [interleaveChroma_param_2];\n"
+ "ld.param.u32 %r18, [interleaveChroma_param_3];\n"
+ "ld.param.u32 %r21, [interleaveChroma_param_4];\n"
+ "ld.param.u32 %r19, [interleaveChroma_param_5];\n"
+ "ld.param.u32 %r20, [interleaveChroma_param_6];\n"
+ "shr.s32 %r1, %r21, 1;\n"
+ "mov.u32 %r55, %ctaid.x;\n"
+ "setp.ge.s32 %p1, %r55, %r1;\n"
+ "@%p1 bra BB0_6;\n"
+ "\n"
+ "cvta.to.global.u32 %r3, %r17;\n"
+ "cvta.to.global.u32 %r4, %r16;\n"
+ "cvta.to.global.u32 %r5, %r15;\n"
+ "mov.u32 %r6, %tid.x;\n"
+ "shr.s32 %r7, %r18, 3;\n"
+ "mov.u32 %r8, %ntid.x;\n"
+ "\n"
+ "BB0_2:\n"
+ "setp.ge.s32 %p2, %r6, %r7;\n"
+ "@%p2 bra BB0_5;\n"
+ "\n"
+ "mul.lo.s32 %r22, %r55, %r19;\n"
+ "mul.lo.s32 %r23, %r55, %r20;\n"
+ "shr.s32 %r10, %r23, 2;\n"
+ "shr.s32 %r24, %r22, 2;\n"
+ "shr.u32 %r11, %r24, 1;\n"
+ "mov.u32 %r56, %r6;\n"
+ "\n"
+ "BB0_4:\n"
+ "mov.u32 %r12, %r56;\n"
+ "add.s32 %r25, %r12, %r11;\n"
+ "shl.b32 %r26, %r25, 2;\n"
+ "add.s32 %r27, %r5, %r26;\n"
+ "add.s32 %r28, %r4, %r26;\n"
+ "ld.global.u32 %r29, [%r28];\n"
+ "and.b32 %r30, %r29, 65280;\n"
+ "shl.b32 %r31, %r30, 16;\n"
+ "ld.global.u32 %r32, [%r27];\n"
+ "shl.b32 %r33, %r32, 8;\n"
+ "and.b32 %r34, %r33, 16711680;\n"
+ "shl.b32 %r35, %r29, 8;\n"
+ "and.b32 %r36, %r35, 65280;\n"
+ "and.b32 %r37, %r32, 255;\n"
+ "or.b32 %r38, %r34, %r37;\n"
+ "or.b32 %r39, %r38, %r31;\n"
+ "or.b32 %r40, %r39, %r36;\n"
+ "shl.b32 %r41, %r12, 1;\n"
+ "add.s32 %r42, %r41, %r10;\n"
+ "shl.b32 %r43, %r42, 2;\n"
+ "add.s32 %r44, %r3, %r43;\n"
+ "st.global.u32 [%r44], %r40;\n"
+ "and.b32 %r45, %r29, -16777216;\n"
+ "and.b32 %r46, %r32, -16777216;\n"
+ "shr.u32 %r47, %r46, 8;\n"
+ "or.b32 %r48, %r45, %r47;\n"
+ "and.b32 %r49, %r29, 16711680;\n"
+ "shr.u32 %r50, %r49, 8;\n"
+ "bfe.u32 %r51, %r32, 16, 8;\n"
+ "or.b32 %r52, %r48, %r51;\n"
+ "or.b32 %r53, %r52, %r50;\n"
+ "st.global.u32 [%r44+4], %r53;\n"
+ "add.s32 %r13, %r8, %r12;\n"
+ "setp.lt.s32 %p3, %r13, %r7;\n"
+ "mov.u32 %r56, %r13;\n"
+ "@%p3 bra BB0_4;\n"
+ "\n"
+ "BB0_5:\n"
+ "mov.u32 %r54, %nctaid.x;\n"
+ "add.s32 %r55, %r54, %r55;\n"
+ "setp.lt.s32 %p4, %r55, %r1;\n"
+ "@%p4 bra BB0_2;\n"
+ "\n"
+ "BB0_6:\n"
+ "ret;\n"
+ "}\n"
+ "\n"
+ "\n"
+; /* NOTE(review): nvenc.c declares this symbol as non-const "extern char color_ptx[]" -- the qualifiers should agree */
+#elif defined ENVIRONMENT64 /* 64-bit variant: PTX declares .address_size 64 and takes the three pointer parameters as .u64 */
+const char color_ptx[] = \
+ "//\n" /* same kernel and parameter order as the 32-bit text above */
+ "// Generated by NVIDIA NVVM Compiler\n"
+ "//\n"
+ "// Compiler Build ID: CL-19830389\n"
+ "// Cuda compilation tools, release 8.0, V8.0.0\n"
+ "// Based on LLVM 3.4svn\n"
+ "//\n"
+ "\n"
+ ".version 4.3\n"
+ ".target sm_30\n"
+ ".address_size 64\n"
+ "\n"
+ "// .globl interleaveChroma\n"
+ "\n"
+ ".visible .entry interleaveChroma(\n"
+ ".param .u64 interleaveChroma_param_0,\n"
+ ".param .u64 interleaveChroma_param_1,\n"
+ ".param .u64 interleaveChroma_param_2,\n"
+ ".param .u32 interleaveChroma_param_3,\n"
+ ".param .u32 interleaveChroma_param_4,\n"
+ ".param .u32 interleaveChroma_param_5,\n"
+ ".param .u32 interleaveChroma_param_6\n"
+ ")\n"
+ "{\n"
+ ".reg .pred %p<5>;\n"
+ ".reg .b32 %r<47>;\n"
+ ".reg .b64 %rd<14>;\n"
+ "\n"
+ "\n"
+ "ld.param.u64 %rd4, [interleaveChroma_param_0];\n"
+ "ld.param.u64 %rd5, [interleaveChroma_param_1];\n"
+ "ld.param.u64 %rd6, [interleaveChroma_param_2];\n"
+ "ld.param.u32 %r12, [interleaveChroma_param_3];\n"
+ "ld.param.u32 %r15, [interleaveChroma_param_4];\n"
+ "ld.param.u32 %r13, [interleaveChroma_param_5];\n"
+ "ld.param.u32 %r14, [interleaveChroma_param_6];\n"
+ "shr.s32 %r1, %r15, 1;\n"
+ "mov.u32 %r45, %ctaid.x;\n"
+ "setp.ge.s32 %p1, %r45, %r1;\n"
+ "@%p1 bra BB0_6;\n"
+ "\n"
+ "cvta.to.global.u64 %rd1, %rd6;\n"
+ "cvta.to.global.u64 %rd2, %rd5;\n"
+ "cvta.to.global.u64 %rd3, %rd4;\n"
+ "mov.u32 %r3, %tid.x;\n"
+ "shr.s32 %r4, %r12, 3;\n"
+ "mov.u32 %r5, %ntid.x;\n"
+ "\n"
+ "BB0_2:\n"
+ "setp.ge.s32 %p2, %r3, %r4;\n"
+ "@%p2 bra BB0_5;\n"
+ "\n"
+ "mul.lo.s32 %r16, %r45, %r13;\n"
+ "mul.lo.s32 %r17, %r45, %r14;\n"
+ "shr.s32 %r7, %r17, 2;\n"
+ "shr.s32 %r18, %r16, 2;\n"
+ "shr.u32 %r8, %r18, 1;\n"
+ "mov.u32 %r46, %r3;\n"
+ "\n"
+ "BB0_4:\n"
+ "mov.u32 %r9, %r46;\n"
+ "add.s32 %r19, %r9, %r8;\n"
+ "mul.wide.u32 %rd7, %r19, 4;\n"
+ "add.s64 %rd8, %rd3, %rd7;\n"
+ "add.s64 %rd9, %rd2, %rd7;\n"
+ "ld.global.u32 %r20, [%rd9];\n"
+ "and.b32 %r21, %r20, 65280;\n"
+ "shl.b32 %r22, %r21, 16;\n"
+ "ld.global.u32 %r23, [%rd8];\n"
+ "shl.b32 %r24, %r23, 8;\n"
+ "and.b32 %r25, %r24, 16711680;\n"
+ "shl.b32 %r26, %r20, 8;\n"
+ "and.b32 %r27, %r26, 65280;\n"
+ "and.b32 %r28, %r23, 255;\n"
+ "or.b32 %r29, %r25, %r28;\n"
+ "or.b32 %r30, %r29, %r22;\n"
+ "or.b32 %r31, %r30, %r27;\n"
+ "shl.b32 %r32, %r9, 1;\n"
+ "add.s32 %r33, %r32, %r7;\n"
+ "mul.wide.u32 %rd10, %r33, 4;\n"
+ "add.s64 %rd11, %rd1, %rd10;\n"
+ "st.global.u32 [%rd11], %r31;\n"
+ "and.b32 %r34, %r20, -16777216;\n"
+ "and.b32 %r35, %r23, -16777216;\n"
+ "shr.u32 %r36, %r35, 8;\n"
+ "or.b32 %r37, %r34, %r36;\n"
+ "and.b32 %r38, %r20, 16711680;\n"
+ "shr.u32 %r39, %r38, 8;\n"
+ "bfe.u32 %r40, %r23, 16, 8;\n"
+ "or.b32 %r41, %r37, %r40;\n"
+ "or.b32 %r42, %r41, %r39;\n"
+ "add.s32 %r43, %r33, 1;\n"
+ "mul.wide.u32 %rd12, %r43, 4;\n"
+ "add.s64 %rd13, %rd1, %rd12;\n"
+ "st.global.u32 [%rd13], %r42;\n"
+ "add.s32 %r10, %r5, %r9;\n"
+ "setp.lt.s32 %p3, %r10, %r4;\n"
+ "mov.u32 %r46, %r10;\n"
+ "@%p3 bra BB0_4;\n"
+ "\n"
+ "BB0_5:\n"
+ "mov.u32 %r44, %nctaid.x;\n"
+ "add.s32 %r45, %r44, %r45;\n"
+ "setp.lt.s32 %p4, %r45, %r1;\n"
+ "@%p4 bra BB0_2;\n"
+ "\n"
+ "BB0_6:\n"
+ "ret;\n"
+ "}\n"
+ "\n"
+ "\n"
+; /* NOTE(review): nvenc.c declares this symbol as non-const "extern char color_ptx[]" -- the qualifiers should agree */
+#endif /* ENVIRONMENT32 / ENVIRONMENT64; color_ptx stays undefined if neither macro is set */
diff -ruN ffmpeg-orig/configure ffmpeg/configure
--- ffmpeg-orig/configure 2016-03-04 00:17:20.652150576 +0200
+++ ffmpeg/configure 2016-03-04 02:36:28.264332984 +0200
@@ -278,6 +278,7 @@
--enable-mmal enable decoding via MMAL [no]
--enable-netcdf enable NetCDF, needed for sofalizer filter [no]
--enable-nvenc enable NVIDIA NVENC support [no]
+ --enable-nvresize enable NVIDIA CUDA accelerated resizer [no]
--enable-openal enable OpenAL 1.1 capture support [no]
--enable-opencl enable OpenCL code
--enable-opengl enable OpenGL rendering [no]
@@ -1502,6 +1503,7 @@
mmal
netcdf
nvenc
+ nvresize
openal
opencl
opengl
@@ -5325,6 +5327,7 @@
frei0r_src_filter_extralibs='$ldl'
ladspa_filter_extralibs='$ldl'
nvenc_encoder_extralibs='$ldl'
+nvresize_filter_extralibs='$ldl'
if ! disabled network; then
check_func getaddrinfo $network_extralibs
diff -ruN ffmpeg-orig/libavcodec/Makefile ffmpeg/libavcodec/Makefile
--- ffmpeg-orig/libavcodec/Makefile 2016-03-04 00:17:21.166198276 +0200
+++ ffmpeg/libavcodec/Makefile 2016-03-04 00:18:17.583433360 +0200
@@ -102,7 +102,7 @@
motion_est.o ratecontrol.o \
mpegvideoencdsp.o
OBJS-$(CONFIG_MSS34DSP) += mss34dsp.o
-OBJS-$(CONFIG_NVENC) += nvenc.o
+OBJS-$(CONFIG_NVENC) += nvenc.o nvenc_ptx.o
OBJS-$(CONFIG_PIXBLOCKDSP) += pixblockdsp.o
OBJS-$(CONFIG_QPELDSP) += qpeldsp.o
OBJS-$(CONFIG_QSV) += qsv.o
diff -ruN ffmpeg-orig/libavcodec/nvenc.c ffmpeg/libavcodec/nvenc.c
--- ffmpeg-orig/libavcodec/nvenc.c 2016-03-04 00:17:21.113193357 +0200
+++ ffmpeg/libavcodec/nvenc.c 2016-03-04 00:26:51.381033223 +0200
@@ -32,15 +32,11 @@
#include "libavutil/avassert.h"
#include "libavutil/opt.h"
#include "libavutil/mem.h"
+#include "libavutil/cudautils.h"
#include "avcodec.h"
#include "internal.h"
#include "thread.h"
-#if defined(_WIN32)
-#define CUDAAPI __stdcall
-#else
-#define CUDAAPI
-#endif
#if defined(_WIN32)
#define LOAD_FUNC(l, s) GetProcAddress(l, s)
@@ -50,28 +46,19 @@
#define DL_CLOSE_FUNC(l) dlclose(l)
#endif
-typedef enum cudaError_enum {
- CUDA_SUCCESS = 0
-} CUresult;
-typedef int CUdevice;
-typedef void* CUcontext;
-
-typedef CUresult(CUDAAPI *PCUINIT)(unsigned int Flags);
-typedef CUresult(CUDAAPI *PCUDEVICEGETCOUNT)(int *count);
-typedef CUresult(CUDAAPI *PCUDEVICEGET)(CUdevice *device, int ordinal);
-typedef CUresult(CUDAAPI *PCUDEVICEGETNAME)(char *name, int len, CUdevice dev);
-typedef CUresult(CUDAAPI *PCUDEVICECOMPUTECAPABILITY)(int *major, int *minor, CUdevice dev);
-typedef CUresult(CUDAAPI *PCUCTXCREATE)(CUcontext *pctx, unsigned int flags, CUdevice dev);
-typedef CUresult(CUDAAPI *PCUCTXPOPCURRENT)(CUcontext *pctx);
-typedef CUresult(CUDAAPI *PCUCTXDESTROY)(CUcontext ctx);
+#define BLOCKSX 128
+#define THREADSX 128
typedef NVENCSTATUS (NVENCAPI* PNVENCODEAPICREATEINSTANCE)(NV_ENCODE_API_FUNCTION_LIST *functionList);
typedef struct NvencInputSurface
{
NV_ENC_INPUT_PTR input_surface;
+ CUdeviceptr dptr;
+ void* hRes;
int width;
int height;
+ size_t pitch;
int lockCount;
@@ -107,24 +94,11 @@
typedef struct NvencDynLoadFunctions
{
- PCUINIT cu_init;
- PCUDEVICEGETCOUNT cu_device_get_count;
- PCUDEVICEGET cu_device_get;
- PCUDEVICEGETNAME cu_device_get_name;
- PCUDEVICECOMPUTECAPABILITY cu_device_compute_capability;
- PCUCTXCREATE cu_ctx_create;
- PCUCTXPOPCURRENT cu_ctx_pop_current;
- PCUCTXDESTROY cu_ctx_destroy;
-
NV_ENCODE_API_FUNCTION_LIST nvenc_funcs;
- int nvenc_device_count;
- CUdevice nvenc_devices[16];
#if defined(_WIN32)
- HMODULE cuda_lib;
HMODULE nvenc_lib;
#else
- void* cuda_lib;
void* nvenc_lib;
#endif
} NvencDynLoadFunctions;
@@ -140,14 +114,18 @@
AVClass *avclass;
NvencDynLoadFunctions nvenc_dload_funcs;
+ CudaDynLoadFunctions* cuda_dload_funcs;
NV_ENC_INITIALIZE_PARAMS init_encode_params;
NV_ENC_CONFIG encode_config;
CUcontext cu_context;
+ CUmodule cu_module;
+ CUfunction cu_func_interleaveChroma;
int max_surface_count;
NvencInputSurface *input_surfaces;
NvencOutputSurface *output_surfaces;
+ NvencInputSurface transferSurf;
NvencDataList output_surface_queue;
NvencDataList output_surface_ready_queue;
@@ -164,8 +142,10 @@
int twopass;
int gpu;
int buffer_delay;
+ int aq;
} NvencContext;
+
static const NvencValuePair nvenc_h264_level_pairs[] = {
{ "auto", NV_ENC_LEVEL_AUTOSELECT },
{ "1" , NV_ENC_LEVEL_H264_1 },
@@ -330,79 +310,18 @@
return res->u.timestamp;
}
-#define CHECK_LOAD_FUNC(t, f, s) \
-do { \
- (f) = (t)LOAD_FUNC(dl_fn->cuda_lib, s); \
- if (!(f)) { \
- av_log(avctx, AV_LOG_FATAL, "Failed loading %s from CUDA library\n", s); \
- goto error; \
- } \
-} while (0)
-
-static av_cold int nvenc_dyload_cuda(AVCodecContext *avctx)
-{
- NvencContext *ctx = avctx->priv_data;
- NvencDynLoadFunctions *dl_fn = &ctx->nvenc_dload_funcs;
-
- if (dl_fn->cuda_lib)
- return 1;
-
-#if defined(_WIN32)
- dl_fn->cuda_lib = LoadLibrary(TEXT("nvcuda.dll"));
-#else
- dl_fn->cuda_lib = dlopen("libcuda.so", RTLD_LAZY);
-#endif
-
- if (!dl_fn->cuda_lib) {
- av_log(avctx, AV_LOG_FATAL, "Failed loading CUDA library\n");
- goto error;
- }
-
- CHECK_LOAD_FUNC(PCUINIT, dl_fn->cu_init, "cuInit");
- CHECK_LOAD_FUNC(PCUDEVICEGETCOUNT, dl_fn->cu_device_get_count, "cuDeviceGetCount");
- CHECK_LOAD_FUNC(PCUDEVICEGET, dl_fn->cu_device_get, "cuDeviceGet");
- CHECK_LOAD_FUNC(PCUDEVICEGETNAME, dl_fn->cu_device_get_name, "cuDeviceGetName");
- CHECK_LOAD_FUNC(PCUDEVICECOMPUTECAPABILITY, dl_fn->cu_device_compute_capability, "cuDeviceComputeCapability");
- CHECK_LOAD_FUNC(PCUCTXCREATE, dl_fn->cu_ctx_create, "cuCtxCreate_v2");
- CHECK_LOAD_FUNC(PCUCTXPOPCURRENT, dl_fn->cu_ctx_pop_current, "cuCtxPopCurrent_v2");
- CHECK_LOAD_FUNC(PCUCTXDESTROY, dl_fn->cu_ctx_destroy, "cuCtxDestroy_v2");
-
- return 1;
-
-error:
-
- if (dl_fn->cuda_lib)
- DL_CLOSE_FUNC(dl_fn->cuda_lib);
-
- dl_fn->cuda_lib = NULL;
-
- return 0;
-}
-
-static av_cold int check_cuda_errors(AVCodecContext *avctx, CUresult err, const char *func)
-{
- if (err != CUDA_SUCCESS) {
- av_log(avctx, AV_LOG_FATAL, ">> %s - failed with error code 0x%x\n", func, err);
- return 0;
- }
- return 1;
-}
-#define check_cuda_errors(f) if (!check_cuda_errors(avctx, f, #f)) goto error
-
static av_cold int nvenc_check_cuda(AVCodecContext *avctx)
{
- int device_count = 0;
- CUdevice cu_device = 0;
- char gpu_name[128];
- int smminor = 0, smmajor = 0;
- int i, smver, target_smver;
+ int target_smver;
NvencContext *ctx = avctx->priv_data;
- NvencDynLoadFunctions *dl_fn = &ctx->nvenc_dload_funcs;
+
+ if (!init_cuda())
+ return 0;
switch (avctx->codec->id) {
case AV_CODEC_ID_H264:
- target_smver = avctx->pix_fmt == AV_PIX_FMT_YUV444P ? 0x52 : 0x30;
+ target_smver = 0x30;
break;
case AV_CODEC_ID_H265:
target_smver = 0x52;
@@ -412,49 +331,19 @@
goto error;
}
- if (!nvenc_dyload_cuda(avctx))
- return 0;
-
- if (dl_fn->nvenc_device_count > 0)
- return 1;
-
- check_cuda_errors(dl_fn->cu_init(0));
-
- check_cuda_errors(dl_fn->cu_device_get_count(&device_count));
-
- if (!device_count) {
- av_log(avctx, AV_LOG_FATAL, "No CUDA capable devices found\n");
+ if (!is_gpu_feature_available(ctx->gpu, target_smver))
+ {
+ av_log(avctx, AV_LOG_FATAL, "NVENC with Codec %s Not Available at requested GPU %d \n", (avctx->codec->id == AV_CODEC_ID_H264)? "H264" : "H265", ctx->gpu);
goto error;
}
-
- av_log(avctx, AV_LOG_VERBOSE, "%d CUDA capable devices found\n", device_count);
-
- dl_fn->nvenc_device_count = 0;
-
- for (i = 0; i < device_count; ++i) {
- check_cuda_errors(dl_fn->cu_device_get(&cu_device, i));
- check_cuda_errors(dl_fn->cu_device_get_name(gpu_name, sizeof(gpu_name), cu_device));
- check_cuda_errors(dl_fn->cu_device_compute_capability(&smmajor, &smminor, cu_device));
-
- smver = (smmajor << 4) | smminor;
-
- av_log(avctx, AV_LOG_VERBOSE, "[ GPU #%d - < %s > has Compute SM %d.%d, NVENC %s ]\n", i, gpu_name, smmajor, smminor, (smver >= target_smver) ? "Available" : "Not Available");
-
- if (smver >= target_smver)
- dl_fn->nvenc_devices[dl_fn->nvenc_device_count++] = cu_device;
- }
-
- if (!dl_fn->nvenc_device_count) {
- av_log(avctx, AV_LOG_FATAL, "No NVENC capable devices found\n");
- goto error;
+ else
+ {
+ av_log(avctx, AV_LOG_VERBOSE, "NVENC with Codec %s Available at requested GPU %d \n", (avctx->codec->id == AV_CODEC_ID_H264) ? "H264" : "H265", ctx->gpu);
}
return 1;
error:
-
- dl_fn->nvenc_device_count = 0;
-
return 0;
}
@@ -488,23 +377,18 @@
}
nvEncodeAPICreateInstance = (PNVENCODEAPICREATEINSTANCE)LOAD_FUNC(dl_fn->nvenc_lib, "NvEncodeAPICreateInstance");
-
if (!nvEncodeAPICreateInstance) {
av_log(avctx, AV_LOG_FATAL, "Failed to load nvenc entrypoint\n");
goto error;
}
dl_fn->nvenc_funcs.version = NV_ENCODE_API_FUNCTION_LIST_VER;
-
nvstatus = nvEncodeAPICreateInstance(&dl_fn->nvenc_funcs);
-
if (nvstatus != NV_ENC_SUCCESS) {
av_log(avctx, AV_LOG_FATAL, "Failed to create nvenc instance\n");
goto error;
}
- av_log(avctx, AV_LOG_VERBOSE, "Nvenc initialized successfully\n");
-
return 1;
error:
@@ -512,7 +396,6 @@
DL_CLOSE_FUNC(dl_fn->nvenc_lib);
dl_fn->nvenc_lib = NULL;
-
return 0;
}
@@ -523,29 +406,16 @@
DL_CLOSE_FUNC(dl_fn->nvenc_lib);
dl_fn->nvenc_lib = NULL;
-
- dl_fn->nvenc_device_count = 0;
-
- DL_CLOSE_FUNC(dl_fn->cuda_lib);
- dl_fn->cuda_lib = NULL;
-
- dl_fn->cu_init = NULL;
- dl_fn->cu_device_get_count = NULL;
- dl_fn->cu_device_get = NULL;
- dl_fn->cu_device_get_name = NULL;
- dl_fn->cu_device_compute_capability = NULL;
- dl_fn->cu_ctx_create = NULL;
- dl_fn->cu_ctx_pop_current = NULL;
- dl_fn->cu_ctx_destroy = NULL;
-
+ deinit_cuda();
av_log(avctx, AV_LOG_VERBOSE, "Nvenc unloaded\n");
}
static av_cold int nvenc_encode_init(AVCodecContext *avctx)
{
NV_ENC_OPEN_ENCODE_SESSION_EX_PARAMS encode_session_params = { 0 };
+ NV_ENC_REGISTER_RESOURCE registerParams = { 0 };
NV_ENC_PRESET_CONFIG preset_config = { 0 };
- CUcontext cu_context_curr;
+ CudaDynLoadFunctions *p_cuda;
CUresult cu_res;
GUID encoder_preset = NV_ENC_PRESET_HQ_GUID;
GUID codec;
@@ -558,6 +428,7 @@
int res = 0;
int dw, dh;
int qp_inter_p;
+ extern const char color_ptx[]; /* PTX image; defined as const char[] in nvenc_ptx.c */
NvencContext *ctx = avctx->priv_data;
NvencDynLoadFunctions *dl_fn = &ctx->nvenc_dload_funcs;
@@ -575,28 +446,18 @@
encode_session_params.version = NV_ENC_OPEN_ENCODE_SESSION_EX_PARAMS_VER;
encode_session_params.apiVersion = NVENCAPI_VERSION;
- if (ctx->gpu >= dl_fn->nvenc_device_count) {
- av_log(avctx, AV_LOG_FATAL, "Requested GPU %d, but only %d GPUs are available!\n", ctx->gpu, dl_fn->nvenc_device_count);
- res = AVERROR(EINVAL);
- goto error;
- }
-
- ctx->cu_context = NULL;
- cu_res = dl_fn->cu_ctx_create(&ctx->cu_context, 4, dl_fn->nvenc_devices[ctx->gpu]); // CU_CTX_SCHED_BLOCKING_SYNC=4, avoid CPU spins
+ cu_res = get_cuda_context(&ctx->cu_context, ctx->gpu);
+ p_cuda = get_cuda_dl_func();
if (cu_res != CUDA_SUCCESS) {
av_log(avctx, AV_LOG_FATAL, "Failed creating CUDA context for NVENC: 0x%x\n", (int)cu_res);
res = AVERROR_EXTERNAL;
goto error;
}
+ av_log(avctx, AV_LOG_VERBOSE, "NVENC : Cuda Context created %p\n", (void *)ctx->cu_context); /* %p: handle is a pointer, an int cast truncates on 64-bit */
- cu_res = dl_fn->cu_ctx_pop_current(&cu_context_curr);
-
- if (cu_res != CUDA_SUCCESS) {
- av_log(avctx, AV_LOG_FATAL, "Failed popping CUDA context: 0x%x\n", (int)cu_res);
- res = AVERROR_EXTERNAL;
- goto error;
- }
+ __cu(p_cuda->cu_module_load_data(&ctx->cu_module, color_ptx));
+ __cu(p_cuda->cu_module_get_function(&ctx->cu_func_interleaveChroma, ctx->cu_module, "interleaveChroma"));
encode_session_params.device = ctx->cu_context;
encode_session_params.deviceType = NV_ENC_DEVICE_TYPE_CUDA;
@@ -866,6 +727,15 @@
ctx->encode_config.frameFieldMode = NV_ENC_PARAMS_FRAME_FIELD_MODE_FRAME;
}
+ if (ctx->aq)
+ {
+ ctx->encode_config.rcParams.enableAQ = 1;
+ }
+ else
+ {
+ ctx->encode_config.rcParams.enableAQ = 0;
+ }
+
switch (avctx->codec->id) {
case AV_CODEC_ID_H264:
ctx->encode_config.encodeCodecConfig.h264Config.h264VUIParameters.colourDescriptionPresentFlag = 1;
@@ -885,9 +755,6 @@
if (!ctx->profile) {
switch (avctx->profile) {
- case FF_PROFILE_H264_HIGH_444_PREDICTIVE:
- ctx->encode_config.profileGUID = NV_ENC_H264_PROFILE_HIGH_444_GUID;
- break;
case FF_PROFILE_H264_BASELINE:
ctx->encode_config.profileGUID = NV_ENC_H264_PROFILE_BASELINE_GUID;
break;
@@ -913,9 +780,6 @@
} else if (!strcmp(ctx->profile, "baseline")) {
ctx->encode_config.profileGUID = NV_ENC_H264_PROFILE_BASELINE_GUID;
avctx->profile = FF_PROFILE_H264_BASELINE;
- } else if (!strcmp(ctx->profile, "high444p")) {
- ctx->encode_config.profileGUID = NV_ENC_H264_PROFILE_HIGH_444_GUID;
- avctx->profile = FF_PROFILE_H264_HIGH_444_PREDICTIVE;
} else {
av_log(avctx, AV_LOG_FATAL, "Profile \"%s\" is unknown! Supported profiles: high, main, baseline\n", ctx->profile);
res = AVERROR(EINVAL);
@@ -923,13 +787,7 @@
}
}
- // force setting profile as high444p if input is AV_PIX_FMT_YUV444P
- if (avctx->pix_fmt == AV_PIX_FMT_YUV444P) {
- ctx->encode_config.profileGUID = NV_ENC_H264_PROFILE_HIGH_444_GUID;
- avctx->profile = FF_PROFILE_H264_HIGH_444_PREDICTIVE;
- }
-
- ctx->encode_config.encodeCodecConfig.h264Config.chromaFormatIDC = avctx->profile == FF_PROFILE_H264_HIGH_444_PREDICTIVE ? 3 : 1;
+ ctx->encode_config.encodeCodecConfig.h264Config.chromaFormatIDC = 1;
if (ctx->level) {
res = input_string_to_uint32(avctx, nvenc_h264_level_pairs, ctx->level, &ctx->encode_config.encodeCodecConfig.h264Config.level);
@@ -988,6 +846,8 @@
goto error;
}
+ av_log(avctx, AV_LOG_VERBOSE, "Nvenc initialized successfully\n");
+
ctx->input_surfaces = av_malloc(ctx->max_surface_count * sizeof(*ctx->input_surfaces));
if (!ctx->input_surfaces) {
@@ -1002,28 +862,32 @@
goto error;
}
+ // Allocation for temp surface used for sys mem -> device mem transfer
+ if (avctx->pix_fmt == AV_PIX_FMT_YUV420P)
+ {
+ ctx->transferSurf.width = (avctx->width + 31) & ~31;
+ ctx->transferSurf.height = (avctx->height + 31) & ~31;
+ p_cuda->cu_mem_alloc_pitch(&ctx->transferSurf.dptr,
+ &ctx->transferSurf.pitch,
+ ctx->transferSurf.width,
+ ctx->transferSurf.height/ 2, 16);
+ }
+
for (surfaceCount = 0; surfaceCount < ctx->max_surface_count; ++surfaceCount) {
- NV_ENC_CREATE_INPUT_BUFFER allocSurf = { 0 };
NV_ENC_CREATE_BITSTREAM_BUFFER allocOut = { 0 };
- allocSurf.version = NV_ENC_CREATE_INPUT_BUFFER_VER;
allocOut.version = NV_ENC_CREATE_BITSTREAM_BUFFER_VER;
- allocSurf.width = (avctx->width + 31) & ~31;
- allocSurf.height = (avctx->height + 31) & ~31;
-
- allocSurf.memoryHeap = NV_ENC_MEMORY_HEAP_SYSMEM_CACHED;
+ ctx->input_surfaces[surfaceCount].width = (avctx->width + 31) & ~31;
+ ctx->input_surfaces[surfaceCount].height = (avctx->height + 31) & ~31;
switch (avctx->pix_fmt) {
case AV_PIX_FMT_YUV420P:
- allocSurf.bufferFmt = NV_ENC_BUFFER_FORMAT_YV12_PL;
- break;
-
case AV_PIX_FMT_NV12:
- allocSurf.bufferFmt = NV_ENC_BUFFER_FORMAT_NV12_PL;
- break;
-
- case AV_PIX_FMT_YUV444P:
- allocSurf.bufferFmt = NV_ENC_BUFFER_FORMAT_YUV444_PL;
+ ctx->input_surfaces[surfaceCount].format = NV_ENC_BUFFER_FORMAT_NV12_PL;
+ p_cuda->cu_mem_alloc_pitch(&ctx->input_surfaces[surfaceCount].dptr,
+ &ctx->input_surfaces[surfaceCount].pitch,
+ ctx->input_surfaces[surfaceCount].width,
+ ctx->input_surfaces[surfaceCount].height * 3 / 2, 16);
break;
default:
@@ -1032,18 +896,21 @@
goto error;
}
- nv_status = p_nvenc->nvEncCreateInputBuffer(ctx->nvencoder, &allocSurf);
+ registerParams.version = NV_ENC_REGISTER_RESOURCE_VER;
+ registerParams.resourceType = NV_ENC_INPUT_RESOURCE_TYPE_CUDADEVICEPTR;
+ registerParams.width = ctx->input_surfaces[surfaceCount].width;
+ registerParams.height = ctx->input_surfaces[surfaceCount].height;
+ registerParams.pitch = ctx->input_surfaces[surfaceCount].pitch;
+ registerParams.bufferFormat = ctx->input_surfaces[surfaceCount].format;
+ registerParams.resourceToRegister = (void*)ctx->input_surfaces[surfaceCount].dptr;
+ nv_status = p_nvenc->nvEncRegisterResource(ctx->nvencoder, &registerParams);
if (nv_status != NV_ENC_SUCCESS) {
- av_log(avctx, AV_LOG_FATAL, "CreateInputBuffer failed\n");
+ av_log(avctx, AV_LOG_FATAL, "RegisterResource failed\n");
res = AVERROR_EXTERNAL;
goto error;
}
-
+ ctx->input_surfaces[surfaceCount].hRes = registerParams.registeredResource;
ctx->input_surfaces[surfaceCount].lockCount = 0;
- ctx->input_surfaces[surfaceCount].input_surface = allocSurf.inputBuffer;
- ctx->input_surfaces[surfaceCount].format = allocSurf.bufferFmt;
- ctx->input_surfaces[surfaceCount].width = allocSurf.width;
- ctx->input_surfaces[surfaceCount].height = allocSurf.height;
/* 1MB is large enough to hold most output frames. NVENC increases this automaticaly if it's not enough. */
allocOut.size = 1024 * 1024;
@@ -1106,21 +973,22 @@
return 0;
error:
-
for (i = 0; i < surfaceCount; ++i) {
- p_nvenc->nvEncDestroyInputBuffer(ctx->nvencoder, ctx->input_surfaces[i].input_surface);
+ p_nvenc->nvEncUnregisterResource(ctx->nvencoder, ctx->input_surfaces[i].hRes);
+ p_cuda->cu_mem_free(ctx->input_surfaces[i].dptr);
+
if (ctx->output_surfaces[i].output_surface)
p_nvenc->nvEncDestroyBitstreamBuffer(ctx->nvencoder, ctx->output_surfaces[i].output_surface);
}
+ p_cuda->cu_mem_free(ctx->transferSurf.dptr);
if (ctx->nvencoder)
p_nvenc->nvEncDestroyEncoder(ctx->nvencoder);
if (ctx->cu_context)
- dl_fn->cu_ctx_destroy(ctx->cu_context);
+ release_cuda_context(&ctx->cu_context, ctx->gpu);
nvenc_unload_nvenc(avctx);
-
ctx->nvencoder = NULL;
ctx->cu_context = NULL;
@@ -1132,6 +1000,7 @@
NvencContext *ctx = avctx->priv_data;
NvencDynLoadFunctions *dl_fn = &ctx->nvenc_dload_funcs;
NV_ENCODE_API_FUNCTION_LIST *p_nvenc = &dl_fn->nvenc_funcs;
+ CudaDynLoadFunctions *p_cuda = get_cuda_dl_func();
int i;
av_freep(&ctx->timestamp_list.data);
@@ -1139,16 +1008,19 @@
av_freep(&ctx->output_surface_queue.data);
for (i = 0; i < ctx->max_surface_count; ++i) {
- p_nvenc->nvEncDestroyInputBuffer(ctx->nvencoder, ctx->input_surfaces[i].input_surface);
+ p_nvenc->nvEncUnregisterResource(ctx->nvencoder, ctx->input_surfaces[i].hRes);
+ p_cuda->cu_mem_free(ctx->input_surfaces[i].dptr);
p_nvenc->nvEncDestroyBitstreamBuffer(ctx->nvencoder, ctx->output_surfaces[i].output_surface);
}
ctx->max_surface_count = 0;
- p_nvenc->nvEncDestroyEncoder(ctx->nvencoder);
+ if (ctx->nvencoder)
+ p_nvenc->nvEncDestroyEncoder(ctx->nvencoder);
+
ctx->nvencoder = NULL;
- dl_fn->cu_ctx_destroy(ctx->cu_context);
- ctx->cu_context = NULL;
+ if (ctx->cu_context)
+ release_cuda_context(&ctx->cu_context, ctx->gpu);
nvenc_unload_nvenc(avctx);
@@ -1163,6 +1035,7 @@
uint32_t slice_mode_data;
uint32_t *slice_offsets;
+ char picType = 'X';
NV_ENC_LOCK_BITSTREAM lock_params = { 0 };
NVENCSTATUS nv_status;
int res = 0;
@@ -1215,12 +1088,15 @@
FF_DISABLE_DEPRECATION_WARNINGS
case NV_ENC_PIC_TYPE_I:
avctx->coded_frame->pict_type = AV_PICTURE_TYPE_I;
+ picType = 'I';
break;
case NV_ENC_PIC_TYPE_P:
avctx->coded_frame->pict_type = AV_PICTURE_TYPE_P;
+ picType = 'P';
break;
case NV_ENC_PIC_TYPE_B:
avctx->coded_frame->pict_type = AV_PICTURE_TYPE_B;
+ picType = 'B';
break;
case NV_ENC_PIC_TYPE_BI:
avctx->coded_frame->pict_type = AV_PICTURE_TYPE_BI;
@@ -1234,6 +1110,8 @@
#endif
}
+ av_log(avctx, AV_LOG_VERBOSE, "FRAME STATISTICS: Frame No. %d PicType %c Frame AvgQP %d SATD Cost %d Size %d bytes\r", lock_params.frameIdx, picType, lock_params.frameAvgQP, lock_params.frameSatd, lock_params.bitstreamSizeInBytes);
+
pkt->pts = lock_params.outputTimeStamp;
pkt->dts = timestamp_queue_dequeue(&ctx->timestamp_list);
@@ -1261,6 +1139,61 @@
return res;
}
+
+static int call_interleavechroma_kernel(CudaDynLoadFunctions* dl_func, CUfunction func,
+ CUdeviceptr cb_dptr, CUdeviceptr cr_dptr, CUdeviceptr nv12chroma_dptr, int width, int height, int srcStride, int dstStride)
+{
+ void *args_uchar[] = { &cb_dptr, &cr_dptr, &nv12chroma_dptr, &width, &height, &srcStride, &dstStride};
+ __cu(dl_func->cu_launch_kernel(func, BLOCKSX, 1, 1, THREADSX, 1, 1, 0, NULL, args_uchar, NULL));
+
+ return 0;
+}
+
+static int nvenc_copy_to_inputbuffer(NvencContext *ctx, const AVFrame* frame, NvencInputSurface *inSurf)
+{
+ CudaDynLoadFunctions *p_cuda = get_cuda_dl_func();
+ if (frame->format == AV_PIX_FMT_NV12) {
+
+ // check opaque field, if there's already a deviceptr
+ if (frame->opaque && check_nvinfo(frame->opaque) &&
+ ((ffnvinfo*)(frame->opaque))->dptr[0]) {
+ ffnvinfo* info = (ffnvinfo*)frame->opaque;
+
+ __cu(cuMemCpy2d(NULL, info->dptr[0], info->linesize[0], NULL, inSurf->dptr, inSurf->pitch, frame->width, frame->height, CU_MEMORYTYPE_DEVICE, CU_MEMORYTYPE_DEVICE));
+ __cu(cuMemCpy2d(NULL, info->dptr[1], info->linesize[1], NULL, inSurf->dptr + inSurf->pitch*inSurf->height, inSurf->pitch, frame->width, frame->height/2, CU_MEMORYTYPE_DEVICE, CU_MEMORYTYPE_DEVICE));
+ }
+ else
+ {
+ __cu(cuMemCpy2d(frame->data[0], (CUdeviceptr)NULL, frame->linesize[0], NULL, inSurf->dptr, inSurf->pitch, frame->width, frame->height, CU_MEMORYTYPE_HOST, CU_MEMORYTYPE_DEVICE));
+ __cu(cuMemCpy2d(frame->data[1], (CUdeviceptr)NULL, frame->linesize[1], NULL, inSurf->dptr + inSurf->pitch*inSurf->height, inSurf->pitch, frame->width, frame->height / 2, CU_MEMORYTYPE_HOST, CU_MEMORYTYPE_DEVICE));
+ }
+ }
+ else if (frame->format == AV_PIX_FMT_YUV420P) {
+ // check opaque field, if there's already a deviceptr
+ if (frame->opaque && check_nvinfo(frame->opaque) &&
+ ((ffnvinfo*)(frame->opaque))->dptr[0]) {
+ ffnvinfo* info = (ffnvinfo*)frame->opaque;
+
+ __cu(cuMemCpy2d(NULL, info->dptr[0], info->linesize[0], NULL, inSurf->dptr, inSurf->pitch, frame->width, frame->height, CU_MEMORYTYPE_DEVICE, CU_MEMORYTYPE_DEVICE));
+ call_interleavechroma_kernel(p_cuda, ctx->cu_func_interleaveChroma, info->dptr[1], info->dptr[2], inSurf->dptr + inSurf->pitch*inSurf->height, (frame->width+31) & ~31 , frame->height, (info->linesize[1]<<1), inSurf->pitch);
+ }
+ else
+ {
+ __cu(cuMemCpy2d(frame->data[0], (CUdeviceptr)NULL, frame->linesize[0], NULL, inSurf->dptr, inSurf->pitch, frame->width, frame->height, CU_MEMORYTYPE_HOST, CU_MEMORYTYPE_DEVICE));
+ __cu(cuMemCpy2d(frame->data[1], (CUdeviceptr)NULL, frame->linesize[1], NULL, ctx->transferSurf.dptr, ctx->transferSurf.pitch / 2, ctx->transferSurf.width / 2, frame->height / 2, CU_MEMORYTYPE_HOST, CU_MEMORYTYPE_DEVICE));
+ __cu(cuMemCpy2d(frame->data[2], (CUdeviceptr)NULL, frame->linesize[2], NULL, ctx->transferSurf.dptr + ctx->transferSurf.pitch*ctx->transferSurf.height / 4, ctx->transferSurf.pitch / 2, ctx->transferSurf.width / 2, frame->height / 2, CU_MEMORYTYPE_HOST, CU_MEMORYTYPE_DEVICE));
+
+ call_interleavechroma_kernel(p_cuda, ctx->cu_func_interleaveChroma, ctx->transferSurf.dptr, ctx->transferSurf.dptr + ctx->transferSurf.pitch*ctx->transferSurf.height/4, inSurf->dptr + inSurf->pitch*inSurf->height, (frame->width + 31) & ~31, frame->height, ctx->transferSurf.pitch, inSurf->pitch);
+ }
+ }
+ else {
+ av_log(NULL, AV_LOG_FATAL, "Invalid pixel format!\n");
+ return AVERROR(EINVAL);
+ }
+
+ return 0;
+}
+
static int nvenc_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
const AVFrame *frame, int *got_packet)
{
@@ -1276,7 +1209,7 @@
pic_params.version = NV_ENC_PIC_PARAMS_VER;
if (frame) {
- NV_ENC_LOCK_INPUT_BUFFER lockBufferParams = { 0 };
+ NV_ENC_MAP_INPUT_RESOURCE mapParams = { 0 };
NvencInputSurface *inSurf = NULL;
for (i = 0; i < ctx->max_surface_count; ++i) {
@@ -1290,69 +1223,31 @@
inSurf->lockCount = 1;
- lockBufferParams.version = NV_ENC_LOCK_INPUT_BUFFER_VER;
- lockBufferParams.inputBuffer = inSurf->input_surface;
-
- nv_status = p_nvenc->nvEncLockInputBuffer(ctx->nvencoder, &lockBufferParams);
+ mapParams.version = NV_ENC_MAP_INPUT_RESOURCE_VER;
+ mapParams.registeredResource = inSurf->hRes;
+ nv_status = p_nvenc->nvEncMapInputResource(ctx->nvencoder, &mapParams);
if (nv_status != NV_ENC_SUCCESS) {
- av_log(avctx, AV_LOG_ERROR, "Failed locking nvenc input buffer\n");
+ av_log(avctx, AV_LOG_ERROR, "Failed mapping nvenc input buffer\n");
return 0;
}
- if (avctx->pix_fmt == AV_PIX_FMT_YUV420P) {
- uint8_t *buf = lockBufferParams.bufferDataPtr;
-
- av_image_copy_plane(buf, lockBufferParams.pitch,
- frame->data[0], frame->linesize[0],
- avctx->width, avctx->height);
-
- buf += inSurf->height * lockBufferParams.pitch;
-
- av_image_copy_plane(buf, lockBufferParams.pitch >> 1,
- frame->data[2], frame->linesize[2],
- avctx->width >> 1, avctx->height >> 1);
-
- buf += (inSurf->height * lockBufferParams.pitch) >> 2;
-
- av_image_copy_plane(buf, lockBufferParams.pitch >> 1,
- frame->data[1], frame->linesize[1],
- avctx->width >> 1, avctx->height >> 1);
- } else if (avctx->pix_fmt == AV_PIX_FMT_NV12) {
- uint8_t *buf = lockBufferParams.bufferDataPtr;
-
- av_image_copy_plane(buf, lockBufferParams.pitch,
- frame->data[0], frame->linesize[0],
- avctx->width, avctx->height);
-
- buf += inSurf->height * lockBufferParams.pitch;
-
- av_image_copy_plane(buf, lockBufferParams.pitch,
- frame->data[1], frame->linesize[1],
- avctx->width, avctx->height >> 1);
- } else if (avctx->pix_fmt == AV_PIX_FMT_YUV444P) {
- uint8_t *buf = lockBufferParams.bufferDataPtr;
-
- av_image_copy_plane(buf, lockBufferParams.pitch,
- frame->data[0], frame->linesize[0],
- avctx->width, avctx->height);
-
- buf += inSurf->height * lockBufferParams.pitch;
-
- av_image_copy_plane(buf, lockBufferParams.pitch,
- frame->data[1], frame->linesize[1],
- avctx->width, avctx->height);
-
- buf += inSurf->height * lockBufferParams.pitch;
+ inSurf->input_surface = mapParams.mappedResource;
+ if (inSurf->format != mapParams.mappedBufferFmt) {
+ av_log(avctx, AV_LOG_ERROR, "Incompatible buffer format!\n");
+ // Undo the mapping and the lockCount = 1 reservation made above, then report the failure
+ p_nvenc->nvEncUnmapInputResource(ctx->nvencoder, inSurf->input_surface);
+ inSurf->lockCount = 0;
+ return AVERROR_EXTERNAL;
+ }
- av_image_copy_plane(buf, lockBufferParams.pitch,
- frame->data[2], frame->linesize[2],
- avctx->width, avctx->height);
- } else {
- av_log(avctx, AV_LOG_FATAL, "Invalid pixel format!\n");
- return AVERROR(EINVAL);
+ if (nvenc_copy_to_inputbuffer(ctx, frame, inSurf) != 0) {
+ p_nvenc->nvEncUnmapInputResource(ctx->nvencoder, inSurf->input_surface);
+ inSurf->lockCount = 0;
+ av_log(avctx, AV_LOG_ERROR, "Failed to copy data to NVENC input buffer!\n");
+ return AVERROR_EXTERNAL;
}
- nv_status = p_nvenc->nvEncUnlockInputBuffer(ctx->nvencoder, inSurf->input_surface);
+ nv_status = p_nvenc->nvEncUnmapInputResource(ctx->nvencoder, inSurf->input_surface);
if (nv_status != NV_ENC_SUCCESS) {
av_log(avctx, AV_LOG_FATAL, "Failed unlocking input buffer!\n");
return AVERROR_EXTERNAL;
@@ -1470,7 +1361,6 @@
static const enum AVPixelFormat pix_fmts_nvenc[] = {
AV_PIX_FMT_YUV420P,
AV_PIX_FMT_NV12,
- AV_PIX_FMT_YUV444P,
AV_PIX_FMT_NONE
};
@@ -1478,13 +1368,14 @@
#define VE AV_OPT_FLAG_VIDEO_PARAM | AV_OPT_FLAG_ENCODING_PARAM
static const AVOption options[] = {
{ "preset", "Set the encoding preset (one of slow = hq 2pass, medium = hq, fast = hp, hq, hp, bd, ll, llhq, llhp, default)", OFFSET(preset), AV_OPT_TYPE_STRING, { .str = "medium" }, 0, 0, VE },
- { "profile", "Set the encoding profile (high, main, baseline or high444p)", OFFSET(profile), AV_OPT_TYPE_STRING, { .str = "main" }, 0, 0, VE },
+ { "profile", "Set the encoding profile (high, main, baseline)", OFFSET(profile), AV_OPT_TYPE_STRING, { .str = "main" }, 0, 0, VE },
{ "level", "Set the encoding level restriction (auto, 1.0, 1.0b, 1.1, 1.2, ..., 4.2, 5.0, 5.1)", OFFSET(level), AV_OPT_TYPE_STRING, { .str = "auto" }, 0, 0, VE },
{ "tier", "Set the encoding tier (main or high)", OFFSET(tier), AV_OPT_TYPE_STRING, { .str = "main" }, 0, 0, VE },
{ "cbr", "Use cbr encoding mode", OFFSET(cbr), AV_OPT_TYPE_BOOL, { .i64 = 0 }, 0, 1, VE },
{ "2pass", "Use 2pass encoding mode", OFFSET(twopass), AV_OPT_TYPE_BOOL, { .i64 = -1 }, -1, 1, VE },
{ "gpu", "Selects which NVENC capable GPU to use. First GPU is 0, second is 1, and so on.", OFFSET(gpu), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, INT_MAX, VE },
{ "delay", "Delays frame output by the given amount of frames.", OFFSET(buffer_delay), AV_OPT_TYPE_INT, { .i64 = INT_MAX }, 0, INT_MAX, VE },
+ { "enableaq", "set to 1 to enable AQ ", OFFSET(aq), AV_OPT_TYPE_BOOL, { .i64 = 0 }, 0, 1, VE },
{ NULL }
};
diff -ruN ffmpeg-orig/libavcodec/nvenc_ptx.c ffmpeg/libavcodec/nvenc_ptx.c
--- ffmpeg-orig/libavcodec/nvenc_ptx.c 1970-01-01 03:00:00.000000000 +0300
+++ ffmpeg/libavcodec/nvenc_ptx.c 2016-03-04 00:18:17.585433546 +0200
@@ -0,0 +1,240 @@
+#if _WIN32 || _WIN64
+#if _WIN64
+#define ENVIRONMENT64
+#else
+#define ENVIRONMENT32
+#endif
+#endif
+// Check GCC/Clang: every 64-bit pointer target needs the 64-bit PTX, not only x86_64/ppc64
+#if __GNUC__
+#if __x86_64__ || __ppc64__ || __aarch64__ || __LP64__ || _WIN64
+#define ENVIRONMENT64
+#else
+#define ENVIRONMENT32
+#endif
+#endif
+#ifdef ENVIRONMENT32
+const char color_ptx[] = \
+ "//\n"
+ "// Generated by NVIDIA NVVM Compiler\n"
+ "//\n"
+ "// Compiler Build ID: CL-19830389\n"
+ "// Cuda compilation tools, release 8.0, V8.0.0\n"
+ "// Based on LLVM 3.4svn\n"
+ "//\n"
+ "\n"
+ ".version 4.3\n"
+ ".target sm_30\n"
+ ".address_size 32\n"
+ "\n"
+ "// .globl interleaveChroma\n"
+ "\n"
+ ".visible .entry interleaveChroma(\n"
+ ".param .u32 interleaveChroma_param_0,\n"
+ ".param .u32 interleaveChroma_param_1,\n"
+ ".param .u32 interleaveChroma_param_2,\n"
+ ".param .u32 interleaveChroma_param_3,\n"
+ ".param .u32 interleaveChroma_param_4,\n"
+ ".param .u32 interleaveChroma_param_5,\n"
+ ".param .u32 interleaveChroma_param_6\n"
+ ")\n"
+ "{\n"
+ ".reg .pred %p<5>;\n"
+ ".reg .b32 %r<57>;\n"
+ "\n"
+ "\n"
+ "ld.param.u32 %r15, [interleaveChroma_param_0];\n"
+ "ld.param.u32 %r16, [interleaveChroma_param_1];\n"
+ "ld.param.u32 %r17, [interleaveChroma_param_2];\n"
+ "ld.param.u32 %r18, [interleaveChroma_param_3];\n"
+ "ld.param.u32 %r21, [interleaveChroma_param_4];\n"
+ "ld.param.u32 %r19, [interleaveChroma_param_5];\n"
+ "ld.param.u32 %r20, [interleaveChroma_param_6];\n"
+ "shr.s32 %r1, %r21, 1;\n"
+ "mov.u32 %r55, %ctaid.x;\n"
+ "setp.ge.s32 %p1, %r55, %r1;\n"
+ "@%p1 bra BB0_6;\n"
+ "\n"
+ "cvta.to.global.u32 %r3, %r17;\n"
+ "cvta.to.global.u32 %r4, %r16;\n"
+ "cvta.to.global.u32 %r5, %r15;\n"
+ "mov.u32 %r6, %tid.x;\n"
+ "shr.s32 %r7, %r18, 3;\n"
+ "mov.u32 %r8, %ntid.x;\n"
+ "\n"
+ "BB0_2:\n"
+ "setp.ge.s32 %p2, %r6, %r7;\n"
+ "@%p2 bra BB0_5;\n"
+ "\n"
+ "mul.lo.s32 %r22, %r55, %r19;\n"
+ "mul.lo.s32 %r23, %r55, %r20;\n"
+ "shr.s32 %r10, %r23, 2;\n"
+ "shr.s32 %r24, %r22, 2;\n"
+ "shr.u32 %r11, %r24, 1;\n"
+ "mov.u32 %r56, %r6;\n"
+ "\n"
+ "BB0_4:\n"
+ "mov.u32 %r12, %r56;\n"
+ "add.s32 %r25, %r12, %r11;\n"
+ "shl.b32 %r26, %r25, 2;\n"
+ "add.s32 %r27, %r5, %r26;\n"
+ "add.s32 %r28, %r4, %r26;\n"
+ "ld.global.u32 %r29, [%r28];\n"
+ "and.b32 %r30, %r29, 65280;\n"
+ "shl.b32 %r31, %r30, 16;\n"
+ "ld.global.u32 %r32, [%r27];\n"
+ "shl.b32 %r33, %r32, 8;\n"
+ "and.b32 %r34, %r33, 16711680;\n"
+ "shl.b32 %r35, %r29, 8;\n"
+ "and.b32 %r36, %r35, 65280;\n"
+ "and.b32 %r37, %r32, 255;\n"
+ "or.b32 %r38, %r34, %r37;\n"
+ "or.b32 %r39, %r38, %r31;\n"
+ "or.b32 %r40, %r39, %r36;\n"
+ "shl.b32 %r41, %r12, 1;\n"
+ "add.s32 %r42, %r41, %r10;\n"
+ "shl.b32 %r43, %r42, 2;\n"
+ "add.s32 %r44, %r3, %r43;\n"
+ "st.global.u32 [%r44], %r40;\n"
+ "and.b32 %r45, %r29, -16777216;\n"
+ "and.b32 %r46, %r32, -16777216;\n"
+ "shr.u32 %r47, %r46, 8;\n"
+ "or.b32 %r48, %r45, %r47;\n"
+ "and.b32 %r49, %r29, 16711680;\n"
+ "shr.u32 %r50, %r49, 8;\n"
+ "bfe.u32 %r51, %r32, 16, 8;\n"
+ "or.b32 %r52, %r48, %r51;\n"
+ "or.b32 %r53, %r52, %r50;\n"
+ "st.global.u32 [%r44+4], %r53;\n"
+ "add.s32 %r13, %r8, %r12;\n"
+ "setp.lt.s32 %p3, %r13, %r7;\n"
+ "mov.u32 %r56, %r13;\n"
+ "@%p3 bra BB0_4;\n"
+ "\n"
+ "BB0_5:\n"
+ "mov.u32 %r54, %nctaid.x;\n"
+ "add.s32 %r55, %r54, %r55;\n"
+ "setp.lt.s32 %p4, %r55, %r1;\n"
+ "@%p4 bra BB0_2;\n"
+ "\n"
+ "BB0_6:\n"
+ "ret;\n"
+ "}\n"
+ "\n"
+ "\n"
+;
+#elif defined ENVIRONMENT64
+const char color_ptx[] = \
+ "//\n"
+ "// Generated by NVIDIA NVVM Compiler\n"
+ "//\n"
+ "// Compiler Build ID: CL-19830389\n"
+ "// Cuda compilation tools, release 8.0, V8.0.0\n"
+ "// Based on LLVM 3.4svn\n"
+ "//\n"
+ "\n"
+ ".version 4.3\n"
+ ".target sm_30\n"
+ ".address_size 64\n"
+ "\n"
+ "// .globl interleaveChroma\n"
+ "\n"
+ ".visible .entry interleaveChroma(\n"
+ ".param .u64 interleaveChroma_param_0,\n"
+ ".param .u64 interleaveChroma_param_1,\n"
+ ".param .u64 interleaveChroma_param_2,\n"
+ ".param .u32 interleaveChroma_param_3,\n"
+ ".param .u32 interleaveChroma_param_4,\n"
+ ".param .u32 interleaveChroma_param_5,\n"
+ ".param .u32 interleaveChroma_param_6\n"
+ ")\n"
+ "{\n"
+ ".reg .pred %p<5>;\n"
+ ".reg .b32 %r<47>;\n"
+ ".reg .b64 %rd<14>;\n"
+ "\n"
+ "\n"
+ "ld.param.u64 %rd4, [interleaveChroma_param_0];\n"
+ "ld.param.u64 %rd5, [interleaveChroma_param_1];\n"
+ "ld.param.u64 %rd6, [interleaveChroma_param_2];\n"
+ "ld.param.u32 %r12, [interleaveChroma_param_3];\n"
+ "ld.param.u32 %r15, [interleaveChroma_param_4];\n"
+ "ld.param.u32 %r13, [interleaveChroma_param_5];\n"
+ "ld.param.u32 %r14, [interleaveChroma_param_6];\n"
+ "shr.s32 %r1, %r15, 1;\n"
+ "mov.u32 %r45, %ctaid.x;\n"
+ "setp.ge.s32 %p1, %r45, %r1;\n"
+ "@%p1 bra BB0_6;\n"
+ "\n"
+ "cvta.to.global.u64 %rd1, %rd6;\n"
+ "cvta.to.global.u64 %rd2, %rd5;\n"
+ "cvta.to.global.u64 %rd3, %rd4;\n"
+ "mov.u32 %r3, %tid.x;\n"
+ "shr.s32 %r4, %r12, 3;\n"
+ "mov.u32 %r5, %ntid.x;\n"
+ "\n"
+ "BB0_2:\n"
+ "setp.ge.s32 %p2, %r3, %r4;\n"
+ "@%p2 bra BB0_5;\n"
+ "\n"
+ "mul.lo.s32 %r16, %r45, %r13;\n"
+ "mul.lo.s32 %r17, %r45, %r14;\n"
+ "shr.s32 %r7, %r17, 2;\n"
+ "shr.s32 %r18, %r16, 2;\n"
+ "shr.u32 %r8, %r18, 1;\n"
+ "mov.u32 %r46, %r3;\n"
+ "\n"
+ "BB0_4:\n"
+ "mov.u32 %r9, %r46;\n"
+ "add.s32 %r19, %r9, %r8;\n"
+ "mul.wide.u32 %rd7, %r19, 4;\n"
+ "add.s64 %rd8, %rd3, %rd7;\n"
+ "add.s64 %rd9, %rd2, %rd7;\n"
+ "ld.global.u32 %r20, [%rd9];\n"
+ "and.b32 %r21, %r20, 65280;\n"
+ "shl.b32 %r22, %r21, 16;\n"
+ "ld.global.u32 %r23, [%rd8];\n"
+ "shl.b32 %r24, %r23, 8;\n"
+ "and.b32 %r25, %r24, 16711680;\n"
+ "shl.b32 %r26, %r20, 8;\n"
+ "and.b32 %r27, %r26, 65280;\n"
+ "and.b32 %r28, %r23, 255;\n"
+ "or.b32 %r29, %r25, %r28;\n"
+ "or.b32 %r30, %r29, %r22;\n"
+ "or.b32 %r31, %r30, %r27;\n"
+ "shl.b32 %r32, %r9, 1;\n"
+ "add.s32 %r33, %r32, %r7;\n"
+ "mul.wide.u32 %rd10, %r33, 4;\n"
+ "add.s64 %rd11, %rd1, %rd10;\n"
+ "st.global.u32 [%rd11], %r31;\n"
+ "and.b32 %r34, %r20, -16777216;\n"
+ "and.b32 %r35, %r23, -16777216;\n"
+ "shr.u32 %r36, %r35, 8;\n"
+ "or.b32 %r37, %r34, %r36;\n"
+ "and.b32 %r38, %r20, 16711680;\n"
+ "shr.u32 %r39, %r38, 8;\n"
+ "bfe.u32 %r40, %r23, 16, 8;\n"
+ "or.b32 %r41, %r37, %r40;\n"
+ "or.b32 %r42, %r41, %r39;\n"
+ "add.s32 %r43, %r33, 1;\n"
+ "mul.wide.u32 %rd12, %r43, 4;\n"
+ "add.s64 %rd13, %rd1, %rd12;\n"
+ "st.global.u32 [%rd13], %r42;\n"
+ "add.s32 %r10, %r5, %r9;\n"
+ "setp.lt.s32 %p3, %r10, %r4;\n"
+ "mov.u32 %r46, %r10;\n"
+ "@%p3 bra BB0_4;\n"
+ "\n"
+ "BB0_5:\n"
+ "mov.u32 %r44, %nctaid.x;\n"
+ "add.s32 %r45, %r44, %r45;\n"
+ "setp.lt.s32 %p4, %r45, %r1;\n"
+ "@%p4 bra BB0_2;\n"
+ "\n"
+ "BB0_6:\n"
+ "ret;\n"
+ "}\n"
+ "\n"
+ "\n"
+;
+#endif
diff -ruN ffmpeg-orig/libavfilter/allfilters.c ffmpeg/libavfilter/allfilters.c
--- ffmpeg-orig/libavfilter/allfilters.c 2016-03-04 00:17:21.071189460 +0200
+++ ffmpeg/libavfilter/allfilters.c 2016-03-04 00:30:07.377997402 +0200
@@ -225,6 +225,7 @@
REGISTER_FILTER(NOFORMAT, noformat, vf);
REGISTER_FILTER(NOISE, noise, vf);
REGISTER_FILTER(NULL, null, vf);
+ REGISTER_FILTER(NVRESIZE, nvresize, vf);
REGISTER_FILTER(OCR, ocr, vf);
REGISTER_FILTER(OCV, ocv, vf);
REGISTER_FILTER(OVERLAY, overlay, vf);
diff -ruN ffmpeg-orig/libavfilter/Makefile ffmpeg/libavfilter/Makefile
--- ffmpeg-orig/libavfilter/Makefile 2016-03-04 00:17:21.079190202 +0200
+++ ffmpeg/libavfilter/Makefile 2016-03-04 02:41:05.358919913 +0200
@@ -204,6 +204,7 @@
OBJS-$(CONFIG_NOFORMAT_FILTER) += vf_format.o
OBJS-$(CONFIG_NOISE_FILTER) += vf_noise.o
OBJS-$(CONFIG_NULL_FILTER) += vf_null.o
+OBJS-$(CONFIG_NVRESIZE_FILTER) += vf_nvresize.o vf_nvresize_ptx.o
OBJS-$(CONFIG_OCR_FILTER) += vf_ocr.o
OBJS-$(CONFIG_OCV_FILTER) += vf_libopencv.o
OBJS-$(CONFIG_OPENCL) += deshake_opencl.o unsharp_opencl.o
diff -ruN ffmpeg-orig/libavfilter/vf_nvresize.c ffmpeg/libavfilter/vf_nvresize.c
--- ffmpeg-orig/libavfilter/vf_nvresize.c 1970-01-01 03:00:00.000000000 +0300
+++ ffmpeg/libavfilter/vf_nvresize.c 2016-03-04 02:27:15.262271696 +0200
@@ -0,0 +1,669 @@
+/*
+ * Copyright (c) 2011 Roger Pau Monné <roger.pau@entel.upc.edu>
+ * Copyright (c) 2011 Stefano Sabatini
+ * Copyright (c) 2013 Paul B Mahol
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+
+#include "libavutil/avassert.h"
+#include "libavutil/avstring.h"
+#include "libavutil/eval.h"
+#include "libavutil/mathematics.h"
+#include "libavutil/opt.h"
+#include "libavutil/pixdesc.h"
+#include "libavutil/parseutils.h"
+#include "libavutil/cudautils.h"
+
+#include "avfilter.h"
+#include "drawutils.h"
+#include "formats.h"
+#include "internal.h"
+#include "video.h"
+/* Helper macro, output limit and the CUDA thread-block shape used for kernel launches. */
+#define DIV_UP(a, b) ( ((a) + (b) - 1) / (b) ) /* integer division rounded up (for positive operands) */
+#define MAX_OUTPUT 16 /* upper bound of the "outputs" option and length of the outtex[] array */
+#define BLOCKX 32 /* threads per block in x, see call_resize_kernel() */
+#define BLOCKY 16 /* threads per block in y, see call_resize_kernel() */
+/* One pitched device buffer together with the picture size it is used for. */
+typedef struct cu_tex {
+ int w; /* picture width; compared against AVFrame.width before uploads */
+ int h; /* picture height; compared against AVFrame.height before uploads */
+ size_t pitch; /* row pitch reported by cu_mem_alloc_pitch() or taken from an incoming ffnvinfo */
+ CUdeviceptr dptr; /* device address of the first plane; 0 while nothing is allocated */
+} cu_tex;
+/* Private state of one nvresize filter instance. */
+typedef struct NVResizeContext {
+ const AVClass *class;
+
+ /**
+ * Number of output pads ("outputs" option). The size requested for output i
+ * is kept in outtex[i].w / outtex[i].h, where config_output() reads:
+ * 0 = original width/height
+ * negative = derive from the other dimension, keeping the input aspect
+ */
+ int nb_outputs;
+
+ char *size_str; /* "size"/"s" option: sizes separated by '|', parsed in init() */
+ int force_original_aspect_ratio; /* 0 = off, 1 = take the smaller size, other = take the larger size */
+ int readback_FB; /* non-zero: download each result into the output AVFrame */
+ int gpu; /* GPU index passed to get_cuda_context() / release_cuda_context() */
+
+ int cuda_inited; /* NOTE(review): no code visible in this section reads or writes this field */
+
+ CUcontext cu_ctx; /* obtained in init() when still unset, released in uninit() */
+ CudaDynLoadFunctions* cu_dl_func; /* function table returned by get_cuda_dl_func() */
+ CUmodule cu_module; /* module loaded from the embedded resize_ptx text */
+ CUfunction cu_func_uchar; /* kernel "Subsample_Bilinear_uchar" */
+ CUfunction cu_func_uchar2; /* kernel "Subsample_Bilinear_uchar2" */
+ CUfunction cu_func_uchar4; /* kernel "Subsample_Bilinear_uchar4" */
+ CUtexref cu_tex_uchar; /* texture reference "uchar_tex" */
+ CUtexref cu_tex_uchar2; /* texture reference "uchar2_tex" */
+ CUtexref cu_tex_uchar4; /* texture reference "uchar4_tex" */
+ cu_tex intex; /* input picture on the device */
+ cu_tex outtex[MAX_OUTPUT]; /* one result buffer per output */
+
+} NVResizeContext;
+/* Option table. OFFSET() locates a field inside NVResizeContext; FLAGS marks video filtering options. */
+#define OFFSET(x) offsetof(NVResizeContext, x)
+#define FLAGS AV_OPT_FLAG_FILTERING_PARAM | AV_OPT_FLAG_VIDEO_PARAM
+/* AVOption initializers are positional: name, help, offset, type, default, min, max, flags[, unit]. */
+static const AVOption nvresize_options[] = {
+    { "outputs", "set number of outputs", OFFSET(nb_outputs), AV_OPT_TYPE_INT, { .i64 = 1 }, 1, MAX_OUTPUT, FLAGS },
+    { "readback", "read result back to FB", OFFSET(readback_FB), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, 1, FLAGS },
+    { "size", "set video size", OFFSET(size_str), AV_OPT_TYPE_STRING, {.str = NULL}, 0, 0, FLAGS },
+    { "s", "set video size", OFFSET(size_str), AV_OPT_TYPE_STRING, {.str = NULL}, 0, 0, FLAGS },
+    { "gpu", "Selects which NVENC capable GPU to use. First GPU is 0, second is 1, and so on.", OFFSET(gpu), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, INT_MAX, FLAGS },
+    { "force_original_aspect_ratio", "decrease or increase w/h if necessary to keep the original AR", OFFSET(force_original_aspect_ratio), AV_OPT_TYPE_INT, { .i64 = 0}, 0, 2, FLAGS, "force_oar" },
+    { NULL }
+};
+
+AVFILTER_DEFINE_CLASS(nvresize);
+/* Advertise the pixel formats the CUDA resize path handles: planar and semi-planar
+ * YUV 4:2:0, planar YUV 4:4:4 and the four packed 32-bit RGB orders. */
+static int query_formats(AVFilterContext *ctx)
+{
+    static const enum AVPixelFormat supported[] = {
+        /* YUV layouts */
+        AV_PIX_FMT_YUV420P, AV_PIX_FMT_YUV444P, AV_PIX_FMT_NV12,
+        /* packed RGB with alpha, 8 bits per channel */
+        AV_PIX_FMT_ARGB, AV_PIX_FMT_RGBA, AV_PIX_FMT_ABGR, AV_PIX_FMT_BGRA,
+        AV_PIX_FMT_NONE, /* list terminator */
+    };
+    AVFilterFormats *formats = ff_make_format_list((const int *)supported);
+
+    /* A NULL list is reported to the caller as an allocation failure. */
+    if (formats)
+        return ff_set_common_formats(ctx, formats);
+
+    return AVERROR(ENOMEM);
+}
+/* Configure output pad "outN": resolve its size against the input, publish it on the link
+ * and allocate the pitched device buffer for it. Returns 0, or AVERROR(EINVAL) when the
+ * resolved size does not fit in an int. */
+static int config_output(AVFilterLink *outlink)
+{
+    AVFilterContext *ctx = outlink->src;
+    AVFilterLink *inlink = outlink->src->inputs[0];
+    NVResizeContext *s = ctx->priv;
+
+    int outIdx = atoi(outlink->srcpad->name + 3); /* pads are named "out%d" in init() */
+    int64_t w, h;
+    int factor_w, factor_h;
+
+    w = s->outtex[outIdx].w;
+    h = s->outtex[outIdx].h;
+
+    // Check if it is requested that the result has to be divisible by some
+    // factor (w or h = -n with n being the factor).
+    factor_w = 1;
+    factor_h = 1;
+    if (w < -1)
+        factor_w = -w;
+    if (h < -1)
+        factor_h = -h;
+
+    if (w < 0 && h < 0)
+        s->outtex[outIdx].w = s->outtex[outIdx].h = 0;
+
+    if (!(w = s->outtex[outIdx].w))
+        w = inlink->w;
+    if (!(h = s->outtex[outIdx].h))
+        h = inlink->h;
+
+    // Make sure that the result is divisible by the factor we determined
+    // earlier. With the default factor of 1 this changes nothing.
+    if (w < 0)
+        w = av_rescale(h, inlink->w, inlink->h * factor_w) * factor_w;
+    if (h < 0)
+        h = av_rescale(w, inlink->h, inlink->w * factor_h) * factor_h;
+
+    // Note that force_original_aspect_ratio may overwrite the previously set
+    // dimensions so that they are not divisible by the set factors anymore.
+    if (s->force_original_aspect_ratio) {
+        int tmp_w = av_rescale(h, inlink->w, inlink->h);
+        int tmp_h = av_rescale(w, inlink->h, inlink->w);
+
+        if (s->force_original_aspect_ratio == 1) {
+            w = FFMIN(tmp_w, w);
+            h = FFMIN(tmp_h, h);
+        } else {
+            w = FFMAX(tmp_w, w);
+            h = FFMAX(tmp_h, h);
+        }
+    }
+
+    // Stop instead of truncating: w/h are stored in int fields and multiplied as int below.
+    if (w > INT_MAX || h > INT_MAX ||
+        (h * inlink->w) > INT_MAX ||
+        (w * inlink->h) > INT_MAX) {
+        av_log(ctx, AV_LOG_ERROR, "Rescaled value for width or height is too big.\n");
+        return AVERROR(EINVAL);
+    }
+
+    s->outtex[outIdx].w = outlink->w = w;
+    s->outtex[outIdx].h = outlink->h = h;
+
+    if (inlink->sample_aspect_ratio.num){
+        outlink->sample_aspect_ratio = av_mul_q((AVRational){outlink->h * inlink->w, outlink->w * inlink->h}, inlink->sample_aspect_ratio);
+    } else
+        outlink->sample_aspect_ratio = inlink->sample_aspect_ratio;
+
+    // create output device memory
+    switch(outlink->format) {
+    case AV_PIX_FMT_YUV420P:
+    case AV_PIX_FMT_NV12:
+        __cu(s->cu_dl_func->cu_mem_alloc_pitch(&s->outtex[outIdx].dptr,
+            &s->outtex[outIdx].pitch, s->outtex[outIdx].w, s->outtex[outIdx].h*3/2, 16));
+        break;
+    case AV_PIX_FMT_YUV444P:
+        __cu(s->cu_dl_func->cu_mem_alloc_pitch(&s->outtex[outIdx].dptr,
+            &s->outtex[outIdx].pitch, s->outtex[outIdx].w, s->outtex[outIdx].h*3, 16));
+        break;
+    case AV_PIX_FMT_ARGB:
+    case AV_PIX_FMT_RGBA:
+    case AV_PIX_FMT_ABGR:
+    case AV_PIX_FMT_BGRA:
+        __cu(s->cu_dl_func->cu_mem_alloc_pitch(&s->outtex[outIdx].dptr,
+            &s->outtex[outIdx].pitch, s->outtex[outIdx].w*4, s->outtex[outIdx].h, 16));
+        break;
+    }
+
+    return 0;
+}
+/* Filter init: create the nb_outputs output pads, parse the '|'-separated size list (sorted
+ * widest first; outputs left without a size get 0 = input size) and set up CUDA: context,
+ * PTX module, the three resize kernels and their texture references. */
+static av_cold int init(AVFilterContext *ctx)
+{
+    extern char resize_ptx[];
+    NVResizeContext *s = ctx->priv;
+    int ret;
+    int i, j;
+    int count = 0;
+    for (i = 0; i < s->nb_outputs; i++) {
+        char name[32];
+        AVFilterPad pad = { 0 };
+
+        snprintf(name, sizeof(name), "out%d", i);
+        pad.type = ctx->filter->inputs[0].type;
+        pad.name = av_strdup(name);
+        pad.config_props = config_output;
+        if (!pad.name)
+            return AVERROR(ENOMEM);
+
+        ff_insert_outpad(ctx, i, &pad);
+    }
+
+    // parse size parameters here ('|' is overwritten in place; outtex[] holds at most MAX_OUTPUT sizes)
+    if (s->size_str) {
+        char *head = s->size_str;
+        while (head) {
+            char *found = strchr(head, '|');
+            if (found)
+                *found = 0;
+            if (count >= MAX_OUTPUT) {
+                av_log(ctx, AV_LOG_ERROR, "Too many sizes, at most %d are supported\n", MAX_OUTPUT);
+                return AVERROR(EINVAL);
+            }
+            if ((ret = av_parse_video_size(&s->outtex[count].w, &s->outtex[count].h, head)) < 0) {
+                av_log(ctx, AV_LOG_ERROR, "Invalid size '%s'\n", head);
+                return ret;
+            }
+            count++;
+            head = found ? found + 1 : NULL;
+        }
+    }
+
+    // sort the requested sizes by descending width
+    for (i = 0; i < count; i++) {
+        for (j = i+1; j < count; j++) {
+            int tempH, tempW;
+            if (s->outtex[i].w < s->outtex[j].w) {
+                tempW = s->outtex[i].w; tempH = s->outtex[i].h;
+                s->outtex[i].w = s->outtex[j].w; s->outtex[i].h = s->outtex[j].h;
+                s->outtex[j].w = tempW; s->outtex[j].h = tempH;
+            }
+        }
+    }
+
+    if (count < s->nb_outputs) {
+        int offset = s->nb_outputs - count;
+        for (i = s->nb_outputs-1; i >= offset; i--) {
+            s->outtex[i].w = s->outtex[i-offset].w;
+            s->outtex[i].h = s->outtex[i-offset].h;
+        }
+        for (i = 0; i < offset; i++) {
+            s->outtex[i].w = s->outtex[i].h = 0;
+        }
+    }
+
+    // init cuda_context
+    if (!s->cu_ctx) {
+        init_cuda();
+        get_cuda_context(&s->cu_ctx, s->gpu);
+    }
+    s->cu_dl_func = get_cuda_dl_func();
+
+    __cu(s->cu_dl_func->cu_module_load_data(&s->cu_module, resize_ptx));
+
+    // load functions
+    __cu(s->cu_dl_func->cu_module_get_function(&s->cu_func_uchar, s->cu_module, "Subsample_Bilinear_uchar"));
+    __cu(s->cu_dl_func->cu_module_get_function(&s->cu_func_uchar2, s->cu_module, "Subsample_Bilinear_uchar2"));
+    __cu(s->cu_dl_func->cu_module_get_function(&s->cu_func_uchar4, s->cu_module, "Subsample_Bilinear_uchar4"));
+    __cu(s->cu_dl_func->cu_module_get_texref(&s->cu_tex_uchar, s->cu_module, "uchar_tex"));
+    __cu(s->cu_dl_func->cu_module_get_texref(&s->cu_tex_uchar2, s->cu_module, "uchar2_tex"));
+    __cu(s->cu_dl_func->cu_module_get_texref(&s->cu_tex_uchar4, s->cu_module, "uchar4_tex"));
+
+    __cu(s->cu_dl_func->cu_texref_set_flags(s->cu_tex_uchar, CU_TRSF_READ_AS_INTEGER));
+    __cu(s->cu_dl_func->cu_texref_set_flags(s->cu_tex_uchar2, CU_TRSF_READ_AS_INTEGER));
+    __cu(s->cu_dl_func->cu_texref_set_flags(s->cu_tex_uchar4, CU_TRSF_READ_AS_INTEGER));
+    __cu(s->cu_dl_func->cu_texref_set_filtermode(s->cu_tex_uchar, CU_TR_FILTER_MODE_LINEAR));
+    __cu(s->cu_dl_func->cu_texref_set_filtermode(s->cu_tex_uchar2, CU_TR_FILTER_MODE_LINEAR));
+    __cu(s->cu_dl_func->cu_texref_set_filtermode(s->cu_tex_uchar4, CU_TR_FILTER_MODE_LINEAR));
+
+    return 0;
+}
+/* Upload the planes of host frame src into device buffer dst (aborts via av_assert0 when the sizes differ); chroma planes go after the pitch*h bytes of luma. Returns 0, or -1 for a pixel format not handled below. NOTE(review): the meaning of the cuMemCpy2d arguments is inferred from these call sites only. */
+static int copy_from_avframe(NVResizeContext *s, AVFrame* src, cu_tex* dst)
+{
+ av_assert0(src->width == dst->w && src->height == dst->h);
+
+ switch (src->format) {
+ case AV_PIX_FMT_YUV420P:
+ // copy Y channel
+ __cu(cuMemCpy2d(src->data[0], (CUdeviceptr)NULL, src->linesize[0], NULL, dst->dptr, dst->pitch, src->width, src->height, CU_MEMORYTYPE_HOST, CU_MEMORYTYPE_DEVICE));
+ // copy U channel (half width and height, written with half the luma pitch)
+ __cu(cuMemCpy2d(src->data[1], (CUdeviceptr)NULL, src->linesize[1], NULL, dst->dptr + dst->pitch*dst->h, dst->pitch / 2, src->width / 2, src->height / 2, CU_MEMORYTYPE_HOST, CU_MEMORYTYPE_DEVICE));
+ // copy V channel (starts at 5/4 of the luma plane size)
+ __cu(cuMemCpy2d(src->data[2], (CUdeviceptr)NULL, src->linesize[2], NULL, dst->dptr + dst->pitch*dst->h * 5 / 4, dst->pitch / 2, src->width / 2, src->height / 2, CU_MEMORYTYPE_HOST, CU_MEMORYTYPE_DEVICE));
+
+ break;
+
+ case AV_PIX_FMT_YUV444P:
+ // copy Y channel
+ __cu(cuMemCpy2d(src->data[0], (CUdeviceptr)NULL, src->linesize[0], NULL, dst->dptr, dst->pitch, src->width, src->height, CU_MEMORYTYPE_HOST, CU_MEMORYTYPE_DEVICE));
+ // copy U channel (full size, second plane)
+ __cu(cuMemCpy2d(src->data[1], (CUdeviceptr)NULL, src->linesize[1], NULL, dst->dptr + dst->pitch*dst->h, dst->pitch, src->width, src->height, CU_MEMORYTYPE_HOST, CU_MEMORYTYPE_DEVICE));
+ // copy V channel (full size, third plane)
+ __cu(cuMemCpy2d(src->data[2], (CUdeviceptr)NULL, src->linesize[2], NULL, dst->dptr + dst->pitch*dst->h * 2, dst->pitch, src->width, src->height, CU_MEMORYTYPE_HOST, CU_MEMORYTYPE_DEVICE));
+ break;
+
+ case AV_PIX_FMT_NV12:
+ // copy Y channel
+ __cu(cuMemCpy2d(src->data[0], (CUdeviceptr)NULL, src->linesize[0], NULL, dst->dptr, dst->pitch, src->width, src->height, CU_MEMORYTYPE_HOST, CU_MEMORYTYPE_DEVICE));
+ // copy interleaved UV channel (full width in bytes, half height)
+ __cu(cuMemCpy2d(src->data[1], (CUdeviceptr)NULL, src->linesize[1], NULL, dst->dptr + dst->pitch*dst->h, dst->pitch, src->width, src->height / 2, CU_MEMORYTYPE_HOST, CU_MEMORYTYPE_DEVICE));
+ break;
+
+ case AV_PIX_FMT_ARGB:
+ case AV_PIX_FMT_RGBA:
+ case AV_PIX_FMT_ABGR:
+ case AV_PIX_FMT_BGRA:
+ // copy the packed 32-bit plane (4 bytes per pixel)
+ __cu(cuMemCpy2d(src->data[0], (CUdeviceptr)NULL, src->linesize[0], NULL, dst->dptr, dst->pitch, src->width * 4, src->height, CU_MEMORYTYPE_HOST, CU_MEMORYTYPE_DEVICE));
+
+ break;
+
+ default:
+ av_log(NULL, AV_LOG_FATAL, "Unsupported input format: %s!\n", av_get_pix_fmt_name(src->format));
+ return -1;
+ }
+ return 0;
+}
+/* Download device buffer src into the planes of host frame dst, using the same plane layout as copy_from_avframe(). Returns 0, or -1 for a pixel format not handled below. NOTE(review): the size check below is commented out, so matching sizes are the caller's responsibility. */
+static int copy_to_avframe(NVResizeContext* s, cu_tex* src, AVFrame* dst)
+{
+ //av_assert0(src->w == dst->width && src->h == dst->height);
+
+ switch (dst->format) {
+ case AV_PIX_FMT_YUV420P:
+ // copy Y channel
+ __cu(cuMemCpy2d(NULL, src->dptr, src->pitch, dst->data[0], (CUdeviceptr)NULL, dst->linesize[0], dst->width, dst->height, CU_MEMORYTYPE_DEVICE, CU_MEMORYTYPE_HOST));
+ // copy U channel (half width and height, read with half the luma pitch)
+ __cu(cuMemCpy2d(NULL, src->dptr + src->pitch*src->h, src->pitch / 2, dst->data[1], (CUdeviceptr)NULL, dst->linesize[1], dst->width / 2, dst->height / 2, CU_MEMORYTYPE_DEVICE, CU_MEMORYTYPE_HOST));
+ // copy V channel (starts at 5/4 of the luma plane size)
+ __cu(cuMemCpy2d(NULL, src->dptr + src->pitch*src->h * 5 / 4, src->pitch / 2, dst->data[2], (CUdeviceptr)NULL, dst->linesize[2], dst->width / 2, dst->height / 2, CU_MEMORYTYPE_DEVICE, CU_MEMORYTYPE_HOST));
+ break;
+
+ case AV_PIX_FMT_YUV444P:
+ // copy Y channel
+ __cu(cuMemCpy2d(NULL, src->dptr, src->pitch, dst->data[0], (CUdeviceptr)NULL, dst->linesize[0], dst->width, dst->height, CU_MEMORYTYPE_DEVICE, CU_MEMORYTYPE_HOST));
+ // copy U channel (full size, second plane)
+ __cu(cuMemCpy2d(NULL, src->dptr + src->pitch*src->h, src->pitch, dst->data[1], (CUdeviceptr)NULL, dst->linesize[1], dst->width, dst->height, CU_MEMORYTYPE_DEVICE, CU_MEMORYTYPE_HOST));
+ // copy V channel (full size, third plane)
+ __cu(cuMemCpy2d(NULL, src->dptr + src->pitch*src->h * 2, src->pitch, dst->data[2], (CUdeviceptr)NULL, dst->linesize[2], dst->width, dst->height, CU_MEMORYTYPE_DEVICE, CU_MEMORYTYPE_HOST));
+
+ break;
+
+ case AV_PIX_FMT_NV12:
+ // copy Y channel
+ __cu(cuMemCpy2d(NULL, src->dptr, src->pitch, dst->data[0], (CUdeviceptr)NULL, dst->linesize[0], dst->width, dst->height, CU_MEMORYTYPE_DEVICE, CU_MEMORYTYPE_HOST));
+ // copy interleaved UV channel (full width in bytes, half height)
+ __cu(cuMemCpy2d(NULL, src->dptr + src->pitch*src->h, src->pitch, dst->data[1], (CUdeviceptr)NULL, dst->linesize[1], dst->width, dst->height / 2, CU_MEMORYTYPE_DEVICE, CU_MEMORYTYPE_HOST));
+ break;
+
+ case AV_PIX_FMT_ARGB:
+ case AV_PIX_FMT_RGBA:
+ case AV_PIX_FMT_ABGR:
+ case AV_PIX_FMT_BGRA:
+ // copy the packed 32-bit plane (4 bytes per pixel)
+ __cu(cuMemCpy2d(NULL, src->dptr, src->pitch, dst->data[0], (CUdeviceptr)NULL, dst->linesize[0], dst->width * 4, dst->height, CU_MEMORYTYPE_DEVICE, CU_MEMORYTYPE_HOST));
+
+ break;
+
+ default:
+ av_log(NULL, AV_LOG_FATAL, "Unsupported output format: %s!\n", av_get_pix_fmt_name(dst->format));
+ return -1;
+ }
+ return 0;
+}
+/* Bind the pitched source plane to texture reference tex and launch func with BLOCKX x BLOCKY
+ * thread blocks covering dst_width x dst_height. Returns 0 (what __cu does on failure is not visible here). */
+static int call_resize_kernel(CudaDynLoadFunctions* dl_func, CUfunction func, CUtexref tex, int channels,
+                              CUdeviceptr src_dptr, int src_width, int src_height, int src_pitch,
+                              CUdeviceptr dst_dptr, int dst_width, int dst_height, int dst_pitch)
+{
+    /* Kernel parameters: destination pointer, width, height, pitch, then source width, height. */
+    void *kernel_args[] = { &dst_dptr, &dst_width, &dst_height, &dst_pitch, &src_width, &src_height };
+    CUDA_ARRAY_DESCRIPTOR tex_desc;
+    tex_desc.Format = CU_AD_FORMAT_UNSIGNED_INT8;
+    tex_desc.NumChannels = channels;
+    tex_desc.Width = src_width;
+    tex_desc.Height = src_height;
+    __cu(dl_func->cu_texref_set_address_2D(tex, &tex_desc, src_dptr, src_pitch));
+    __cu(dl_func->cu_launch_kernel(func, DIV_UP(dst_width, BLOCKX), DIV_UP(dst_height, BLOCKY), 1,
+                                   BLOCKX, BLOCKY, 1, 0, NULL, kernel_args, NULL));
+    return 0;
+}
+/* Produce dst from src on the device for the given pixel format: a device-to-device copy when the sizes match (for YUV420P the pitches must match too), otherwise one Subsample_Bilinear kernel launch per plane. Returns 0, or -1 for a format not handled below; results of the kernel helper are not checked. */
+static int do_cuda_resize(NVResizeContext *s, cu_tex* src, cu_tex* dst, int format)
+{
+ switch (format) {
+ case AV_PIX_FMT_YUV420P:
+ if (src->w == dst->w && src->h == dst->h && src->pitch == dst->pitch) {
+ __cu(cuMemCpy2d(NULL, src->dptr, src->pitch, NULL, dst->dptr, dst->pitch, src->pitch, src->h*3/2, CU_MEMORYTYPE_DEVICE, CU_MEMORYTYPE_DEVICE));
+ // presumably full pitch-wide rows are copied because the chroma planes are stored with pitch/2 (see below)
+ }
+ else {
+ call_resize_kernel(s->cu_dl_func, s->cu_func_uchar, s->cu_tex_uchar, 1,
+ src->dptr, src->w, src->h, src->pitch,
+ dst->dptr, dst->w, dst->h, dst->pitch);
+
+ call_resize_kernel(s->cu_dl_func, s->cu_func_uchar, s->cu_tex_uchar, 1,
+ src->dptr+src->pitch*src->h, src->w/2, src->h/2, src->pitch/2,
+ dst->dptr+dst->pitch*dst->h, dst->w/2, dst->h/2, dst->pitch/2);
+
+ call_resize_kernel(s->cu_dl_func, s->cu_func_uchar, s->cu_tex_uchar, 1,
+ src->dptr+src->pitch*src->h*5/4, src->w/2, src->h/2, src->pitch/2,
+ dst->dptr+dst->pitch*dst->h*5/4, dst->w/2, dst->h/2, dst->pitch/2);
+ }
+
+ break;
+
+ case AV_PIX_FMT_YUV444P:
+ if (src->w == dst->w && src->h == dst->h) {
+ __cu(cuMemCpy2d(NULL, src->dptr, src->pitch, NULL, dst->dptr, dst->pitch, src->w, src->h*3, CU_MEMORYTYPE_DEVICE, CU_MEMORYTYPE_DEVICE));
+ }
+ else {
+ call_resize_kernel(s->cu_dl_func, s->cu_func_uchar, s->cu_tex_uchar, 1,
+ src->dptr, src->w, src->h, src->pitch,
+ dst->dptr, dst->w, dst->h, dst->pitch);
+
+ call_resize_kernel(s->cu_dl_func, s->cu_func_uchar, s->cu_tex_uchar, 1,
+ src->dptr+src->pitch*src->h, src->w, src->h, src->pitch,
+ dst->dptr+dst->pitch*dst->h, dst->w, dst->h, dst->pitch);
+
+ call_resize_kernel(s->cu_dl_func, s->cu_func_uchar, s->cu_tex_uchar, 1,
+ src->dptr+src->pitch*src->h*2, src->w, src->h, src->pitch,
+ dst->dptr+dst->pitch*dst->h*2, dst->w, dst->h, dst->pitch);
+ }
+
+ break;
+
+ case AV_PIX_FMT_NV12:
+ if (src->w == dst->w && src->h == dst->h) {
+ __cu(cuMemCpy2d(NULL, src->dptr, src->pitch, NULL, dst->dptr, dst->pitch, src->w, src->h*3/2, CU_MEMORYTYPE_DEVICE, CU_MEMORYTYPE_DEVICE));
+ }
+ else {
+ call_resize_kernel(s->cu_dl_func, s->cu_func_uchar, s->cu_tex_uchar, 1,
+ src->dptr, src->w, src->h, src->pitch,
+ dst->dptr, dst->w, dst->h, dst->pitch);
+
+ call_resize_kernel(s->cu_dl_func, s->cu_func_uchar2, s->cu_tex_uchar2, 2,
+ src->dptr+src->pitch*src->h, src->w/2, src->h/2, src->pitch,
+ dst->dptr+dst->pitch*dst->h, dst->w/2, dst->h/2, dst->pitch/2); // dst pitch counted in 2-byte UV pairs
+ }
+
+ break;
+
+ case AV_PIX_FMT_ARGB:
+ case AV_PIX_FMT_RGBA:
+ case AV_PIX_FMT_ABGR:
+ case AV_PIX_FMT_BGRA:
+ if (src->w == dst->w && src->h == dst->h) {
+ __cu(cuMemCpy2d(NULL, src->dptr, src->pitch, NULL, dst->dptr, dst->pitch, src->w*4, src->h, CU_MEMORYTYPE_DEVICE, CU_MEMORYTYPE_DEVICE));
+
+ }
+ else {
+ call_resize_kernel(s->cu_dl_func, s->cu_func_uchar4, s->cu_tex_uchar4, 4,
+ src->dptr, src->w, src->h, src->pitch,
+ dst->dptr, dst->w, dst->h, dst->pitch/4); // presumably dst pitch counted in 4-byte pixels
+ }
+
+ break;
+
+ default:
+ av_log(NULL, AV_LOG_FATAL, "Unsupported input format: %s!\n", av_get_pix_fmt_name(format));
+ return -1;
+ }
+
+ return 0;
+}
+/* Choose the buffer that output target is resized from: the filter input for the first request,
+ * else the previous source or an earlier output whose width is less than 4x the target width. */
+static cu_tex* find_resize_src(NVResizeContext* s, cu_tex* source, cu_tex* target)
+{
+    cu_tex *fallback;
+    int idx, target_idx;
+
+    /* Nothing has been chosen yet: read from the filter input. */
+    if (!source)
+        return &s->intex;
+
+    /* Keep the current source while it is less than four times as wide as the target. */
+    if (source->w < target->w * 4)
+        return source;
+
+    /* Otherwise take the nearest earlier output that is narrow enough. */
+    target_idx = target - s->outtex;
+    for (idx = target_idx - 1; idx >= 0; idx--)
+        if (s->outtex[idx].w < target->w * 4)
+            return &s->outtex[idx];
+
+    fallback = target_idx ? &s->outtex[target_idx - 1] : source;
+    av_log(NULL, AV_LOG_WARNING, "Output resolution %dx%d differs too much from the previous level %dx%d, "
+           "might cause artificial\n", target->w, target->h, fallback->w, fallback->h);
+    return fallback;
+}
+/* Input callback: put the frame on the GPU (device pointers attached via in->opaque are used
+ * directly, otherwise it is uploaded), resize it for each live output and pass the results on. Frees in; returns 0 or a negative error. */
+static int filter_frame(AVFilterLink *inlink, AVFrame *in)
+{
+    AVFilterContext *ctx = inlink->dst;
+    NVResizeContext *s = ctx->priv;
+    int i, ret = 0;
+    cu_tex* resize_src = NULL;
+    ffnvinfo* info;
+
+    // copy input to gpu
+    if (in->opaque && check_nvinfo(in->opaque) && ((ffnvinfo*)(in->opaque))->dptr[0]) {
+        info = (ffnvinfo*)in->opaque;
+        s->intex.dptr = info->dptr[0];
+        s->intex.pitch = info->linesize[0];
+        s->intex.w = in->width;
+        s->intex.h = in->height;
+    }
+    else {
+        // Size changed (or first frame): drop the old upload buffer. NOTE(review): a pointer borrowed from an earlier ffnvinfo frame would be freed here too.
+        if (in->width != s->intex.w || in->height != s->intex.h) {
+            if (s->intex.dptr) {
+                __cu(s->cu_dl_func->cu_mem_free(s->intex.dptr));
+            }
+            s->intex.w = in->width;
+            s->intex.h = in->height;
+            s->intex.dptr = (CUdeviceptr)NULL;
+        }
+        if (!s->intex.dptr) {
+            switch (in->format) {
+            case AV_PIX_FMT_YUV420P:
+            case AV_PIX_FMT_NV12:
+                __cu(s->cu_dl_func->cu_mem_alloc_pitch(&s->intex.dptr, &s->intex.pitch, s->intex.w, s->intex.h*3/2, 16));
+                break;
+            case AV_PIX_FMT_YUV444P:
+                __cu(s->cu_dl_func->cu_mem_alloc_pitch(&s->intex.dptr, &s->intex.pitch, s->intex.w, s->intex.h*3, 16));
+                break;
+            case AV_PIX_FMT_ARGB:
+            case AV_PIX_FMT_RGBA:
+            case AV_PIX_FMT_ABGR:
+            case AV_PIX_FMT_BGRA:
+                __cu(s->cu_dl_func->cu_mem_alloc_pitch(&s->intex.dptr, &s->intex.pitch, s->intex.w*4, s->intex.h, 16));
+                break;
+            default:
+                av_log(NULL, AV_LOG_FATAL, "Unsupported input format: %s!\n", av_get_pix_fmt_name(in->format));
+                av_frame_free(&in);
+                return -1;
+            }
+        }
+        copy_from_avframe(s, in, &s->intex);
+    }
+
+    for (i = 0; i < ctx->nb_outputs; i++) {
+        AVFrame *out;
+        if (ctx->outputs[i]->status)
+            continue;
+
+        out = ff_get_video_buffer(ctx->outputs[i], ctx->outputs[i]->w, ctx->outputs[i]->h);
+        if (!out) {
+            ret = AVERROR(ENOMEM);
+            break;
+        }
+        av_frame_copy_props(out, in);
+
+        // resize on the device, then describe the result planes in a fresh ffnvinfo
+        resize_src = find_resize_src(s, resize_src, &s->outtex[i]);
+        do_cuda_resize(s, resize_src, &s->outtex[i], in->format);
+        info = init_nvinfo();
+        switch (out->format) {
+        case AV_PIX_FMT_YUV444P:
+            info->dptr[0] = s->outtex[i].dptr;
+            info->dptr[1] = s->outtex[i].dptr + s->outtex[i].pitch*s->outtex[i].h;
+            info->dptr[2] = s->outtex[i].dptr + s->outtex[i].pitch*s->outtex[i].h*2;
+            info->linesize[0] = info->linesize[1] = info->linesize[2] = s->outtex[i].pitch;
+            break;
+        case AV_PIX_FMT_YUV420P:
+            info->dptr[0] = s->outtex[i].dptr;
+            info->dptr[1] = s->outtex[i].dptr + s->outtex[i].pitch*s->outtex[i].h;
+            info->dptr[2] = s->outtex[i].dptr + s->outtex[i].pitch*s->outtex[i].h*5/4;
+            info->linesize[0] = s->outtex[i].pitch;
+            info->linesize[1] = info->linesize[2] = s->outtex[i].pitch/2;
+            break;
+        case AV_PIX_FMT_NV12:
+            info->dptr[0] = s->outtex[i].dptr;
+            info->dptr[1] = s->outtex[i].dptr + s->outtex[i].pitch*s->outtex[i].h;
+            info->linesize[0] = info->linesize[1] = s->outtex[i].pitch;
+            break;
+        case AV_PIX_FMT_ARGB:
+        case AV_PIX_FMT_RGBA:
+        case AV_PIX_FMT_ABGR:
+        case AV_PIX_FMT_BGRA:
+            info->dptr[0] = s->outtex[i].dptr;
+            info->linesize[0] = s->outtex[i].pitch;
+            break;
+        default:
+            break;
+        }
+
+        out->opaque = (void*)info;
+        if (s->readback_FB)
+            copy_to_avframe(s, &s->outtex[i], out);
+
+        if ((ret = ff_filter_frame(ctx->outputs[i], out)) < 0)
+            break;
+    }
+
+    av_frame_free(&in);
+    return ret;
+}
+
+/* Filter teardown: free the pad names and the per-output device buffers, then release the CUDA
+ * context. NOTE(review): s->intex.dptr is not freed here; it may be borrowed from an upstream ffnvinfo. */
+static av_cold void uninit(AVFilterContext *ctx)
+{
+    NVResizeContext *s = ctx->priv;
+
+    /* Only ctx->nb_outputs pads exist; init() may have failed before creating all s->nb_outputs. */
+    for (unsigned i = 0; i < ctx->nb_outputs; i++) {
+        av_freep(&ctx->output_pads[i].name);
+        if(s->outtex[i].dptr) s->cu_dl_func->cu_mem_free(s->outtex[i].dptr);
+    }
+    if(s->cu_ctx) release_cuda_context(&s->cu_ctx, s->gpu);
+    av_log(ctx, AV_LOG_INFO, "nvresize::uninit\n");
+}
+/* The single input pad; every frame arriving on it is handed to filter_frame(). */
+static const AVFilterPad nvresize_inputs[] = {
+ {
+ .name = "default",
+ .type = AVMEDIA_TYPE_VIDEO,
+ .filter_frame = filter_frame,
+ },
+ { NULL }
+};
+/* Filter definition registered as "nvresize". */
+AVFilter ff_vf_nvresize = {
+ .name = "nvresize",
+ .description = NULL_IF_CONFIG_SMALL("GPU accelerated video resizer."),
+ .inputs = nvresize_inputs,
+ .outputs = NULL, /* no static output pads: init() inserts "out%d" pads */
+ .flags = AVFILTER_FLAG_DYNAMIC_OUTPUTS,
+ .priv_class = &nvresize_class,
+ .init = init,
+ .uninit = uninit,
+ .query_formats = query_formats,
+ .priv_size = sizeof(NVResizeContext),
+};
diff -ruN ffmpeg-orig/libavfilter/vf_nvresize_ptx.c ffmpeg/libavfilter/vf_nvresize_ptx.c
--- ffmpeg-orig/libavfilter/vf_nvresize_ptx.c 1970-01-01 03:00:00.000000000 +0300
+++ ffmpeg/libavfilter/vf_nvresize_ptx.c 2016-03-04 00:18:17.589433917 +0200
@@ -0,0 +1,659 @@
+#if _WIN32 || _WIN64 // Check Windows: _WIN64 selects ENVIRONMENT64, otherwise ENVIRONMENT32
+#if _WIN64
+#define ENVIRONMENT64
+#else
+#define ENVIRONMENT32
+#endif
+#endif
+// Check GCC: only x86_64 and ppc64 count as 64-bit; NOTE(review): any other GCC target (e.g. another 64-bit CPU) gets ENVIRONMENT32 - confirm this is intended
+#if __GNUC__
+#if __x86_64__ || __ppc64__
+#define ENVIRONMENT64
+#else
+#define ENVIRONMENT32
+#endif
+#endif
+#ifdef ENVIRONMENT32
+const char resize_ptx[] = \
+ "//\n"
+ "// Generated by NVIDIA NVVM Compiler\n"
+ "//\n"
+ "// Compiler Build ID: CL-19324607\n"
+ "// Cuda compilation tools, release 7.0, V7.0.27\n"
+ "// Based on LLVM 3.4svn\n"
+ "//\n"
+ "\n"
+ ".version 4.2\n"
+ ".target sm_30\n"
+ ".address_size 32\n"
+ "\n"
+ "// .globl Subsample_Bilinear_uchar\n"
+ ".global .texref uchar_tex;\n"
+ ".global .texref uchar2_tex;\n"
+ ".global .texref uchar4_tex;\n"
+ "\n"
+ ".visible .entry Subsample_Bilinear_uchar(\n"
+ ".param .u32 Subsample_Bilinear_uchar_param_0,\n"
+ ".param .u32 Subsample_Bilinear_uchar_param_1,\n"
+ ".param .u32 Subsample_Bilinear_uchar_param_2,\n"
+ ".param .u32 Subsample_Bilinear_uchar_param_3,\n"
+ ".param .u32 Subsample_Bilinear_uchar_param_4,\n"
+ ".param .u32 Subsample_Bilinear_uchar_param_5\n"
+ ")\n"
+ "{\n"
+ ".reg .pred %p<4>;\n"
+ ".reg .f32 %f<27>;\n"
+ ".reg .s32 %r<43>;\n"
+ ".reg .s64 %rd<2>;\n"
+ "\n"
+ "\n"
+ "ld.param.u32 %r3, [Subsample_Bilinear_uchar_param_0];\n"
+ "ld.param.u32 %r4, [Subsample_Bilinear_uchar_param_1];\n"
+ "ld.param.u32 %r5, [Subsample_Bilinear_uchar_param_2];\n"
+ "ld.param.u32 %r6, [Subsample_Bilinear_uchar_param_3];\n"
+ "ld.param.u32 %r7, [Subsample_Bilinear_uchar_param_4];\n"
+ "ld.param.u32 %r8, [Subsample_Bilinear_uchar_param_5];\n"
+ "mov.u32 %r9, %ctaid.x;\n"
+ "mov.u32 %r10, %ntid.x;\n"
+ "mov.u32 %r11, %tid.x;\n"
+ "mad.lo.s32 %r1, %r10, %r9, %r11;\n"
+ "mov.u32 %r12, %ntid.y;\n"
+ "mov.u32 %r13, %ctaid.y;\n"
+ "mov.u32 %r14, %tid.y;\n"
+ "mad.lo.s32 %r2, %r12, %r13, %r14;\n"
+ "setp.lt.s32 %p1, %r2, %r5;\n"
+ "setp.lt.s32 %p2, %r1, %r4;\n"
+ "and.pred %p3, %p1, %p2;\n"
+ "@!%p3 bra BB0_2;\n"
+ "bra.uni BB0_1;\n"
+ "\n"
+ "BB0_1:\n"
+ "cvta.to.global.u32 %r15, %r3;\n"
+ "cvt.rn.f32.s32 %f1, %r4;\n"
+ "cvt.rn.f32.s32 %f2, %r7;\n"
+ "div.rn.f32 %f3, %f2, %f1;\n"
+ "cvt.rn.f32.s32 %f4, %r5;\n"
+ "cvt.rn.f32.s32 %f5, %r8;\n"
+ "div.rn.f32 %f6, %f5, %f4;\n"
+ "cvt.rn.f32.s32 %f7, %r1;\n"
+ "add.f32 %f8, %f7, 0f3F000000;\n"
+ "mul.f32 %f9, %f8, %f3;\n"
+ "cvt.rn.f32.s32 %f10, %r2;\n"
+ "add.f32 %f11, %f10, 0f3F000000;\n"
+ "mul.f32 %f12, %f11, %f6;\n"
+ "add.f32 %f13, %f3, 0fBF800000;\n"
+ "mul.f32 %f14, %f13, 0f3F000000;\n"
+ "cvt.sat.f32.f32 %f15, %f14;\n"
+ "add.f32 %f16, %f6, 0fBF800000;\n"
+ "mul.f32 %f17, %f16, 0f3F000000;\n"
+ "cvt.sat.f32.f32 %f18, %f17;\n"
+ "add.f32 %f19, %f15, 0f3F000000;\n"
+ "div.rn.f32 %f20, %f15, %f19;\n"
+ "add.f32 %f21, %f18, 0f3F000000;\n"
+ "div.rn.f32 %f22, %f18, %f21;\n"
+ "sub.f32 %f23, %f9, %f20;\n"
+ "sub.f32 %f24, %f12, %f22;\n"
+ "tex.2d.v4.u32.f32 {%r16, %r17, %r18, %r19}, [uchar_tex, {%f23, %f24}];\n"
+ "and.b32 %r20, %r16, 255;\n"
+ "add.f32 %f25, %f9, %f20;\n"
+ "tex.2d.v4.u32.f32 {%r21, %r22, %r23, %r24}, [uchar_tex, {%f25, %f24}];\n"
+ "and.b32 %r25, %r21, 255;\n"
+ "add.f32 %f26, %f12, %f22;\n"
+ "tex.2d.v4.u32.f32 {%r26, %r27, %r28, %r29}, [uchar_tex, {%f23, %f26}];\n"
+ "and.b32 %r30, %r26, 255;\n"
+ "tex.2d.v4.u32.f32 {%r31, %r32, %r33, %r34}, [uchar_tex, {%f25, %f26}];\n"
+ "and.b32 %r35, %r31, 255;\n"
+ "add.s32 %r36, %r20, %r25;\n"
+ "add.s32 %r37, %r36, %r30;\n"
+ "add.s32 %r38, %r37, %r35;\n"
+ "add.s32 %r39, %r38, 2;\n"
+ "shr.u32 %r40, %r39, 2;\n"
+ "mad.lo.s32 %r41, %r2, %r6, %r1;\n"
+ "add.s32 %r42, %r15, %r41;\n"
+ "st.global.u8 [%r42], %r40;\n"
+ "\n"
+ "BB0_2:\n"
+ "ret;\n"
+ "}\n"
+ "\n"
+ "// .globl Subsample_Bilinear_uchar2\n"
+ ".visible .entry Subsample_Bilinear_uchar2(\n"
+ ".param .u32 Subsample_Bilinear_uchar2_param_0,\n"
+ ".param .u32 Subsample_Bilinear_uchar2_param_1,\n"
+ ".param .u32 Subsample_Bilinear_uchar2_param_2,\n"
+ ".param .u32 Subsample_Bilinear_uchar2_param_3,\n"
+ ".param .u32 Subsample_Bilinear_uchar2_param_4,\n"
+ ".param .u32 Subsample_Bilinear_uchar2_param_5\n"
+ ")\n"
+ "{\n"
+ ".reg .pred %p<4>;\n"
+ ".reg .s16 %rs<3>;\n"
+ ".reg .f32 %f<27>;\n"
+ ".reg .s32 %r<53>;\n"
+ ".reg .s64 %rd<2>;\n"
+ "\n"
+ "\n"
+ "ld.param.u32 %r3, [Subsample_Bilinear_uchar2_param_0];\n"
+ "ld.param.u32 %r4, [Subsample_Bilinear_uchar2_param_1];\n"
+ "ld.param.u32 %r5, [Subsample_Bilinear_uchar2_param_2];\n"
+ "ld.param.u32 %r6, [Subsample_Bilinear_uchar2_param_3];\n"
+ "ld.param.u32 %r7, [Subsample_Bilinear_uchar2_param_4];\n"
+ "ld.param.u32 %r8, [Subsample_Bilinear_uchar2_param_5];\n"
+ "mov.u32 %r9, %ctaid.x;\n"
+ "mov.u32 %r10, %ntid.x;\n"
+ "mov.u32 %r11, %tid.x;\n"
+ "mad.lo.s32 %r1, %r10, %r9, %r11;\n"
+ "mov.u32 %r12, %ntid.y;\n"
+ "mov.u32 %r13, %ctaid.y;\n"
+ "mov.u32 %r14, %tid.y;\n"
+ "mad.lo.s32 %r2, %r12, %r13, %r14;\n"
+ "setp.lt.s32 %p1, %r2, %r5;\n"
+ "setp.lt.s32 %p2, %r1, %r4;\n"
+ "and.pred %p3, %p1, %p2;\n"
+ "@!%p3 bra BB1_2;\n"
+ "bra.uni BB1_1;\n"
+ "\n"
+ "BB1_1:\n"
+ "cvta.to.global.u32 %r15, %r3;\n"
+ "cvt.rn.f32.s32 %f1, %r4;\n"
+ "cvt.rn.f32.s32 %f2, %r7;\n"
+ "div.rn.f32 %f3, %f2, %f1;\n"
+ "cvt.rn.f32.s32 %f4, %r5;\n"
+ "cvt.rn.f32.s32 %f5, %r8;\n"
+ "div.rn.f32 %f6, %f5, %f4;\n"
+ "cvt.rn.f32.s32 %f7, %r1;\n"
+ "add.f32 %f8, %f7, 0f3F000000;\n"
+ "mul.f32 %f9, %f8, %f3;\n"
+ "cvt.rn.f32.s32 %f10, %r2;\n"
+ "add.f32 %f11, %f10, 0f3F000000;\n"
+ "mul.f32 %f12, %f11, %f6;\n"
+ "add.f32 %f13, %f3, 0fBF800000;\n"
+ "mul.f32 %f14, %f13, 0f3F000000;\n"
+ "cvt.sat.f32.f32 %f15, %f14;\n"
+ "add.f32 %f16, %f6, 0fBF800000;\n"
+ "mul.f32 %f17, %f16, 0f3F000000;\n"
+ "cvt.sat.f32.f32 %f18, %f17;\n"
+ "add.f32 %f19, %f15, 0f3F000000;\n"
+ "div.rn.f32 %f20, %f15, %f19;\n"
+ "add.f32 %f21, %f18, 0f3F000000;\n"
+ "div.rn.f32 %f22, %f18, %f21;\n"
+ "sub.f32 %f23, %f9, %f20;\n"
+ "sub.f32 %f24, %f12, %f22;\n"
+ "tex.2d.v4.u32.f32 {%r16, %r17, %r18, %r19}, [uchar2_tex, {%f23, %f24}];\n"
+ "add.f32 %f25, %f9, %f20;\n"
+ "tex.2d.v4.u32.f32 {%r20, %r21, %r22, %r23}, [uchar2_tex, {%f25, %f24}];\n"
+ "add.f32 %f26, %f12, %f22;\n"
+ "tex.2d.v4.u32.f32 {%r24, %r25, %r26, %r27}, [uchar2_tex, {%f23, %f26}];\n"
+ "tex.2d.v4.u32.f32 {%r28, %r29, %r30, %r31}, [uchar2_tex, {%f25, %f26}];\n"
+ "and.b32 %r32, %r16, 255;\n"
+ "and.b32 %r33, %r20, 255;\n"
+ "and.b32 %r34, %r24, 255;\n"
+ "and.b32 %r35, %r28, 255;\n"
+ "add.s32 %r36, %r32, %r33;\n"
+ "add.s32 %r37, %r36, %r34;\n"
+ "add.s32 %r38, %r37, %r35;\n"
+ "add.s32 %r39, %r38, 2;\n"
+ "shr.u32 %r40, %r39, 2;\n"
+ "and.b32 %r41, %r17, 255;\n"
+ "and.b32 %r42, %r21, 255;\n"
+ "and.b32 %r43, %r25, 255;\n"
+ "and.b32 %r44, %r29, 255;\n"
+ "add.s32 %r45, %r41, %r42;\n"
+ "add.s32 %r46, %r45, %r43;\n"
+ "add.s32 %r47, %r46, %r44;\n"
+ "add.s32 %r48, %r47, 2;\n"
+ "shr.u32 %r49, %r48, 2;\n"
+ "mad.lo.s32 %r50, %r2, %r6, %r1;\n"
+ "shl.b32 %r51, %r50, 1;\n"
+ "add.s32 %r52, %r15, %r51;\n"
+ "cvt.u16.u32 %rs1, %r49;\n"
+ "cvt.u16.u32 %rs2, %r40;\n"
+ "st.global.v2.u8 [%r52], {%rs2, %rs1};\n"
+ "\n"
+ "BB1_2:\n"
+ "ret;\n"
+ "}\n"
+ "\n"
+ "// .globl Subsample_Bilinear_uchar4\n"
+ ".visible .entry Subsample_Bilinear_uchar4(\n"
+ ".param .u32 Subsample_Bilinear_uchar4_param_0,\n"
+ ".param .u32 Subsample_Bilinear_uchar4_param_1,\n"
+ ".param .u32 Subsample_Bilinear_uchar4_param_2,\n"
+ ".param .u32 Subsample_Bilinear_uchar4_param_3,\n"
+ ".param .u32 Subsample_Bilinear_uchar4_param_4,\n"
+ ".param .u32 Subsample_Bilinear_uchar4_param_5\n"
+ ")\n"
+ "{\n"
+ ".reg .pred %p<4>;\n"
+ ".reg .s16 %rs<5>;\n"
+ ".reg .f32 %f<27>;\n"
+ ".reg .s32 %r<71>;\n"
+ ".reg .s64 %rd<2>;\n"
+ "\n"
+ "\n"
+ "ld.param.u32 %r3, [Subsample_Bilinear_uchar4_param_0];\n"
+ "ld.param.u32 %r4, [Subsample_Bilinear_uchar4_param_1];\n"
+ "ld.param.u32 %r5, [Subsample_Bilinear_uchar4_param_2];\n"
+ "ld.param.u32 %r6, [Subsample_Bilinear_uchar4_param_3];\n"
+ "ld.param.u32 %r7, [Subsample_Bilinear_uchar4_param_4];\n"
+ "ld.param.u32 %r8, [Subsample_Bilinear_uchar4_param_5];\n"
+ "mov.u32 %r9, %ctaid.x;\n"
+ "mov.u32 %r10, %ntid.x;\n"
+ "mov.u32 %r11, %tid.x;\n"
+ "mad.lo.s32 %r1, %r10, %r9, %r11;\n"
+ "mov.u32 %r12, %ntid.y;\n"
+ "mov.u32 %r13, %ctaid.y;\n"
+ "mov.u32 %r14, %tid.y;\n"
+ "mad.lo.s32 %r2, %r12, %r13, %r14;\n"
+ "setp.lt.s32 %p1, %r2, %r5;\n"
+ "setp.lt.s32 %p2, %r1, %r4;\n"
+ "and.pred %p3, %p1, %p2;\n"
+ "@!%p3 bra BB2_2;\n"
+ "bra.uni BB2_1;\n"
+ "\n"
+ "BB2_1:\n"
+ "cvta.to.global.u32 %r15, %r3;\n"
+ "cvt.rn.f32.s32 %f1, %r4;\n"
+ "cvt.rn.f32.s32 %f2, %r7;\n"
+ "div.rn.f32 %f3, %f2, %f1;\n"
+ "cvt.rn.f32.s32 %f4, %r5;\n"
+ "cvt.rn.f32.s32 %f5, %r8;\n"
+ "div.rn.f32 %f6, %f5, %f4;\n"
+ "cvt.rn.f32.s32 %f7, %r1;\n"
+ "add.f32 %f8, %f7, 0f3F000000;\n"
+ "mul.f32 %f9, %f8, %f3;\n"
+ "cvt.rn.f32.s32 %f10, %r2;\n"
+ "add.f32 %f11, %f10, 0f3F000000;\n"
+ "mul.f32 %f12, %f11, %f6;\n"
+ "add.f32 %f13, %f3, 0fBF800000;\n"
+ "mul.f32 %f14, %f13, 0f3F000000;\n"
+ "cvt.sat.f32.f32 %f15, %f14;\n"
+ "add.f32 %f16, %f6, 0fBF800000;\n"
+ "mul.f32 %f17, %f16, 0f3F000000;\n"
+ "cvt.sat.f32.f32 %f18, %f17;\n"
+ "add.f32 %f19, %f15, 0f3F000000;\n"
+ "div.rn.f32 %f20, %f15, %f19;\n"
+ "add.f32 %f21, %f18, 0f3F000000;\n"
+ "div.rn.f32 %f22, %f18, %f21;\n"
+ "sub.f32 %f23, %f9, %f20;\n"
+ "sub.f32 %f24, %f12, %f22;\n"
+ "tex.2d.v4.u32.f32 {%r16, %r17, %r18, %r19}, [uchar4_tex, {%f23, %f24}];\n"
+ "add.f32 %f25, %f9, %f20;\n"
+ "tex.2d.v4.u32.f32 {%r20, %r21, %r22, %r23}, [uchar4_tex, {%f25, %f24}];\n"
+ "add.f32 %f26, %f12, %f22;\n"
+ "tex.2d.v4.u32.f32 {%r24, %r25, %r26, %r27}, [uchar4_tex, {%f23, %f26}];\n"
+ "tex.2d.v4.u32.f32 {%r28, %r29, %r30, %r31}, [uchar4_tex, {%f25, %f26}];\n"
+ "and.b32 %r32, %r16, 255;\n"
+ "and.b32 %r33, %r20, 255;\n"
+ "and.b32 %r34, %r24, 255;\n"
+ "and.b32 %r35, %r28, 255;\n"
+ "add.s32 %r36, %r32, %r33;\n"
+ "add.s32 %r37, %r36, %r34;\n"
+ "add.s32 %r38, %r37, %r35;\n"
+ "add.s32 %r39, %r38, 2;\n"
+ "shr.u32 %r40, %r39, 2;\n"
+ "and.b32 %r41, %r17, 255;\n"
+ "and.b32 %r42, %r21, 255;\n"
+ "and.b32 %r43, %r25, 255;\n"
+ "and.b32 %r44, %r29, 255;\n"
+ "add.s32 %r45, %r41, %r42;\n"
+ "add.s32 %r46, %r45, %r43;\n"
+ "add.s32 %r47, %r46, %r44;\n"
+ "add.s32 %r48, %r47, 2;\n"
+ "shr.u32 %r49, %r48, 2;\n"
+ "and.b32 %r50, %r18, 255;\n"
+ "and.b32 %r51, %r22, 255;\n"
+ "and.b32 %r52, %r26, 255;\n"
+ "and.b32 %r53, %r30, 255;\n"
+ "add.s32 %r54, %r50, %r51;\n"
+ "add.s32 %r55, %r54, %r52;\n"
+ "add.s32 %r56, %r55, %r53;\n"
+ "add.s32 %r57, %r56, 2;\n"
+ "shr.u32 %r58, %r57, 2;\n"
+ "and.b32 %r59, %r19, 255;\n"
+ "and.b32 %r60, %r23, 255;\n"
+ "and.b32 %r61, %r27, 255;\n"
+ "and.b32 %r62, %r31, 255;\n"
+ "add.s32 %r63, %r59, %r60;\n"
+ "add.s32 %r64, %r63, %r61;\n"
+ "add.s32 %r65, %r64, %r62;\n"
+ "add.s32 %r66, %r65, 2;\n"
+ "shr.u32 %r67, %r66, 2;\n"
+ "mad.lo.s32 %r68, %r2, %r6, %r1;\n"
+ "shl.b32 %r69, %r68, 2;\n"
+ "add.s32 %r70, %r15, %r69;\n"
+ "cvt.u16.u32 %rs1, %r67;\n"
+ "cvt.u16.u32 %rs2, %r58;\n"
+ "cvt.u16.u32 %rs3, %r49;\n"
+ "cvt.u16.u32 %rs4, %r40;\n"
+ "st.global.v4.u8 [%r70], {%rs4, %rs3, %rs2, %rs1};\n"
+ "\n"
+ "BB2_2:\n"
+ "ret;\n"
+ "}\n"
+ "\n"
+ "\n"
+;
+#elif defined ENVIRONMENT64
+const char resize_ptx[] = \
+ "//\n"
+ "// Generated by NVIDIA NVVM Compiler\n"
+ "//\n"
+ "// Compiler Build ID: CL-19324607\n"
+ "// Cuda compilation tools, release 7.0, V7.0.27\n"
+ "// Based on LLVM 3.4svn\n"
+ "//\n"
+ "\n"
+ ".version 4.2\n"
+ ".target sm_30\n"
+ ".address_size 64\n"
+ "\n"
+ "// .globl Subsample_Bilinear_uchar\n"
+ ".global .texref uchar_tex;\n"
+ ".global .texref uchar2_tex;\n"
+ ".global .texref uchar4_tex;\n"
+ "\n"
+ ".visible .entry Subsample_Bilinear_uchar(\n"
+ ".param .u64 Subsample_Bilinear_uchar_param_0,\n"
+ ".param .u32 Subsample_Bilinear_uchar_param_1,\n"
+ ".param .u32 Subsample_Bilinear_uchar_param_2,\n"
+ ".param .u32 Subsample_Bilinear_uchar_param_3,\n"
+ ".param .u32 Subsample_Bilinear_uchar_param_4,\n"
+ ".param .u32 Subsample_Bilinear_uchar_param_5\n"
+ ")\n"
+ "{\n"
+ ".reg .pred %p<4>;\n"
+ ".reg .f32 %f<27>;\n"
+ ".reg .s32 %r<40>;\n"
+ ".reg .s64 %rd<6>;\n"
+ "\n"
+ "\n"
+ "ld.param.u64 %rd1, [Subsample_Bilinear_uchar_param_0];\n"
+ "ld.param.u32 %r3, [Subsample_Bilinear_uchar_param_1];\n"
+ "ld.param.u32 %r4, [Subsample_Bilinear_uchar_param_2];\n"
+ "ld.param.u32 %r5, [Subsample_Bilinear_uchar_param_3];\n"
+ "ld.param.u32 %r6, [Subsample_Bilinear_uchar_param_4];\n"
+ "ld.param.u32 %r7, [Subsample_Bilinear_uchar_param_5];\n"
+ "mov.u32 %r8, %ctaid.x;\n"
+ "mov.u32 %r9, %ntid.x;\n"
+ "mov.u32 %r10, %tid.x;\n"
+ "mad.lo.s32 %r1, %r9, %r8, %r10;\n"
+ "mov.u32 %r11, %ntid.y;\n"
+ "mov.u32 %r12, %ctaid.y;\n"
+ "mov.u32 %r13, %tid.y;\n"
+ "mad.lo.s32 %r2, %r11, %r12, %r13;\n"
+ "setp.lt.s32 %p1, %r2, %r4;\n"
+ "setp.lt.s32 %p2, %r1, %r3;\n"
+ "and.pred %p3, %p1, %p2;\n"
+ "@!%p3 bra BB0_2;\n"
+ "bra.uni BB0_1;\n"
+ "\n"
+ "BB0_1:\n"
+ "cvta.to.global.u64 %rd2, %rd1;\n"
+ "cvt.rn.f32.s32 %f1, %r3;\n"
+ "cvt.rn.f32.s32 %f2, %r6;\n"
+ "div.rn.f32 %f3, %f2, %f1;\n"
+ "cvt.rn.f32.s32 %f4, %r4;\n"
+ "cvt.rn.f32.s32 %f5, %r7;\n"
+ "div.rn.f32 %f6, %f5, %f4;\n"
+ "cvt.rn.f32.s32 %f7, %r1;\n"
+ "add.f32 %f8, %f7, 0f3F000000;\n"
+ "mul.f32 %f9, %f8, %f3;\n"
+ "cvt.rn.f32.s32 %f10, %r2;\n"
+ "add.f32 %f11, %f10, 0f3F000000;\n"
+ "mul.f32 %f12, %f11, %f6;\n"
+ "add.f32 %f13, %f3, 0fBF800000;\n"
+ "mul.f32 %f14, %f13, 0f3F000000;\n"
+ "cvt.sat.f32.f32 %f15, %f14;\n"
+ "add.f32 %f16, %f6, 0fBF800000;\n"
+ "mul.f32 %f17, %f16, 0f3F000000;\n"
+ "cvt.sat.f32.f32 %f18, %f17;\n"
+ "add.f32 %f19, %f15, 0f3F000000;\n"
+ "div.rn.f32 %f20, %f15, %f19;\n"
+ "add.f32 %f21, %f18, 0f3F000000;\n"
+ "div.rn.f32 %f22, %f18, %f21;\n"
+ "sub.f32 %f23, %f9, %f20;\n"
+ "sub.f32 %f24, %f12, %f22;\n"
+ "tex.2d.v4.u32.f32 {%r14, %r15, %r16, %r17}, [uchar_tex, {%f23, %f24}];\n"
+ "and.b32 %r18, %r14, 255;\n"
+ "add.f32 %f25, %f9, %f20;\n"
+ "tex.2d.v4.u32.f32 {%r19, %r20, %r21, %r22}, [uchar_tex, {%f25, %f24}];\n"
+ "and.b32 %r23, %r19, 255;\n"
+ "add.f32 %f26, %f12, %f22;\n"
+ "tex.2d.v4.u32.f32 {%r24, %r25, %r26, %r27}, [uchar_tex, {%f23, %f26}];\n"
+ "and.b32 %r28, %r24, 255;\n"
+ "tex.2d.v4.u32.f32 {%r29, %r30, %r31, %r32}, [uchar_tex, {%f25, %f26}];\n"
+ "and.b32 %r33, %r29, 255;\n"
+ "add.s32 %r34, %r18, %r23;\n"
+ "add.s32 %r35, %r34, %r28;\n"
+ "add.s32 %r36, %r35, %r33;\n"
+ "add.s32 %r37, %r36, 2;\n"
+ "shr.u32 %r38, %r37, 2;\n"
+ "mad.lo.s32 %r39, %r2, %r5, %r1;\n"
+ "cvt.s64.s32 %rd4, %r39;\n"
+ "add.s64 %rd5, %rd2, %rd4;\n"
+ "st.global.u8 [%rd5], %r38;\n"
+ "\n"
+ "BB0_2:\n"
+ "ret;\n"
+ "}\n"
+ "\n"
+ "// .globl Subsample_Bilinear_uchar2\n"
+ ".visible .entry Subsample_Bilinear_uchar2(\n"
+ ".param .u64 Subsample_Bilinear_uchar2_param_0,\n"
+ ".param .u32 Subsample_Bilinear_uchar2_param_1,\n"
+ ".param .u32 Subsample_Bilinear_uchar2_param_2,\n"
+ ".param .u32 Subsample_Bilinear_uchar2_param_3,\n"
+ ".param .u32 Subsample_Bilinear_uchar2_param_4,\n"
+ ".param .u32 Subsample_Bilinear_uchar2_param_5\n"
+ ")\n"
+ "{\n"
+ ".reg .pred %p<4>;\n"
+ ".reg .s16 %rs<3>;\n"
+ ".reg .f32 %f<27>;\n"
+ ".reg .s32 %r<49>;\n"
+ ".reg .s64 %rd<6>;\n"
+ "\n"
+ "\n"
+ "ld.param.u64 %rd1, [Subsample_Bilinear_uchar2_param_0];\n"
+ "ld.param.u32 %r3, [Subsample_Bilinear_uchar2_param_1];\n"
+ "ld.param.u32 %r4, [Subsample_Bilinear_uchar2_param_2];\n"
+ "ld.param.u32 %r5, [Subsample_Bilinear_uchar2_param_3];\n"
+ "ld.param.u32 %r6, [Subsample_Bilinear_uchar2_param_4];\n"
+ "ld.param.u32 %r7, [Subsample_Bilinear_uchar2_param_5];\n"
+ "mov.u32 %r8, %ctaid.x;\n"
+ "mov.u32 %r9, %ntid.x;\n"
+ "mov.u32 %r10, %tid.x;\n"
+ "mad.lo.s32 %r1, %r9, %r8, %r10;\n"
+ "mov.u32 %r11, %ntid.y;\n"
+ "mov.u32 %r12, %ctaid.y;\n"
+ "mov.u32 %r13, %tid.y;\n"
+ "mad.lo.s32 %r2, %r11, %r12, %r13;\n"
+ "setp.lt.s32 %p1, %r2, %r4;\n"
+ "setp.lt.s32 %p2, %r1, %r3;\n"
+ "and.pred %p3, %p1, %p2;\n"
+ "@!%p3 bra BB1_2;\n"
+ "bra.uni BB1_1;\n"
+ "\n"
+ "BB1_1:\n"
+ "cvta.to.global.u64 %rd2, %rd1;\n"
+ "cvt.rn.f32.s32 %f1, %r3;\n"
+ "cvt.rn.f32.s32 %f2, %r6;\n"
+ "div.rn.f32 %f3, %f2, %f1;\n"
+ "cvt.rn.f32.s32 %f4, %r4;\n"
+ "cvt.rn.f32.s32 %f5, %r7;\n"
+ "div.rn.f32 %f6, %f5, %f4;\n"
+ "cvt.rn.f32.s32 %f7, %r1;\n"
+ "add.f32 %f8, %f7, 0f3F000000;\n"
+ "mul.f32 %f9, %f8, %f3;\n"
+ "cvt.rn.f32.s32 %f10, %r2;\n"
+ "add.f32 %f11, %f10, 0f3F000000;\n"
+ "mul.f32 %f12, %f11, %f6;\n"
+ "add.f32 %f13, %f3, 0fBF800000;\n"
+ "mul.f32 %f14, %f13, 0f3F000000;\n"
+ "cvt.sat.f32.f32 %f15, %f14;\n"
+ "add.f32 %f16, %f6, 0fBF800000;\n"
+ "mul.f32 %f17, %f16, 0f3F000000;\n"
+ "cvt.sat.f32.f32 %f18, %f17;\n"
+ "add.f32 %f19, %f15, 0f3F000000;\n"
+ "div.rn.f32 %f20, %f15, %f19;\n"
+ "add.f32 %f21, %f18, 0f3F000000;\n"
+ "div.rn.f32 %f22, %f18, %f21;\n"
+ "sub.f32 %f23, %f9, %f20;\n"
+ "sub.f32 %f24, %f12, %f22;\n"
+ "tex.2d.v4.u32.f32 {%r14, %r15, %r16, %r17}, [uchar2_tex, {%f23, %f24}];\n"
+ "add.f32 %f25, %f9, %f20;\n"
+ "tex.2d.v4.u32.f32 {%r18, %r19, %r20, %r21}, [uchar2_tex, {%f25, %f24}];\n"
+ "add.f32 %f26, %f12, %f22;\n"
+ "tex.2d.v4.u32.f32 {%r22, %r23, %r24, %r25}, [uchar2_tex, {%f23, %f26}];\n"
+ "tex.2d.v4.u32.f32 {%r26, %r27, %r28, %r29}, [uchar2_tex, {%f25, %f26}];\n"
+ "and.b32 %r30, %r14, 255;\n"
+ "and.b32 %r31, %r18, 255;\n"
+ "and.b32 %r32, %r22, 255;\n"
+ "and.b32 %r33, %r26, 255;\n"
+ "add.s32 %r34, %r30, %r31;\n"
+ "add.s32 %r35, %r34, %r32;\n"
+ "add.s32 %r36, %r35, %r33;\n"
+ "add.s32 %r37, %r36, 2;\n"
+ "shr.u32 %r38, %r37, 2;\n"
+ "and.b32 %r39, %r15, 255;\n"
+ "and.b32 %r40, %r19, 255;\n"
+ "and.b32 %r41, %r23, 255;\n"
+ "and.b32 %r42, %r27, 255;\n"
+ "add.s32 %r43, %r39, %r40;\n"
+ "add.s32 %r44, %r43, %r41;\n"
+ "add.s32 %r45, %r44, %r42;\n"
+ "add.s32 %r46, %r45, 2;\n"
+ "shr.u32 %r47, %r46, 2;\n"
+ "mad.lo.s32 %r48, %r2, %r5, %r1;\n"
+ "mul.wide.s32 %rd4, %r48, 2;\n"
+ "add.s64 %rd5, %rd2, %rd4;\n"
+ "cvt.u16.u32 %rs1, %r47;\n"
+ "cvt.u16.u32 %rs2, %r38;\n"
+ "st.global.v2.u8 [%rd5], {%rs2, %rs1};\n"
+ "\n"
+ "BB1_2:\n"
+ "ret;\n"
+ "}\n"
+ "\n"
+ "// .globl Subsample_Bilinear_uchar4\n"
+ ".visible .entry Subsample_Bilinear_uchar4(\n"
+ ".param .u64 Subsample_Bilinear_uchar4_param_0,\n"
+ ".param .u32 Subsample_Bilinear_uchar4_param_1,\n"
+ ".param .u32 Subsample_Bilinear_uchar4_param_2,\n"
+ ".param .u32 Subsample_Bilinear_uchar4_param_3,\n"
+ ".param .u32 Subsample_Bilinear_uchar4_param_4,\n"
+ ".param .u32 Subsample_Bilinear_uchar4_param_5\n"
+ ")\n"
+ "{\n"
+ ".reg .pred %p<4>;\n"
+ ".reg .s16 %rs<5>;\n"
+ ".reg .f32 %f<27>;\n"
+ ".reg .s32 %r<67>;\n"
+ ".reg .s64 %rd<6>;\n"
+ "\n"
+ "\n"
+ "ld.param.u64 %rd1, [Subsample_Bilinear_uchar4_param_0];\n"
+ "ld.param.u32 %r3, [Subsample_Bilinear_uchar4_param_1];\n"
+ "ld.param.u32 %r4, [Subsample_Bilinear_uchar4_param_2];\n"
+ "ld.param.u32 %r5, [Subsample_Bilinear_uchar4_param_3];\n"
+ "ld.param.u32 %r6, [Subsample_Bilinear_uchar4_param_4];\n"
+ "ld.param.u32 %r7, [Subsample_Bilinear_uchar4_param_5];\n"
+ "mov.u32 %r8, %ctaid.x;\n"
+ "mov.u32 %r9, %ntid.x;\n"
+ "mov.u32 %r10, %tid.x;\n"
+ "mad.lo.s32 %r1, %r9, %r8, %r10;\n"
+ "mov.u32 %r11, %ntid.y;\n"
+ "mov.u32 %r12, %ctaid.y;\n"
+ "mov.u32 %r13, %tid.y;\n"
+ "mad.lo.s32 %r2, %r11, %r12, %r13;\n"
+ "setp.lt.s32 %p1, %r2, %r4;\n"
+ "setp.lt.s32 %p2, %r1, %r3;\n"
+ "and.pred %p3, %p1, %p2;\n"
+ "@!%p3 bra BB2_2;\n"
+ "bra.uni BB2_1;\n"
+ "\n"
+ "BB2_1:\n"
+ "cvta.to.global.u64 %rd2, %rd1;\n"
+ "cvt.rn.f32.s32 %f1, %r3;\n"
+ "cvt.rn.f32.s32 %f2, %r6;\n"
+ "div.rn.f32 %f3, %f2, %f1;\n"
+ "cvt.rn.f32.s32 %f4, %r4;\n"
+ "cvt.rn.f32.s32 %f5, %r7;\n"
+ "div.rn.f32 %f6, %f5, %f4;\n"
+ "cvt.rn.f32.s32 %f7, %r1;\n"
+ "add.f32 %f8, %f7, 0f3F000000;\n"
+ "mul.f32 %f9, %f8, %f3;\n"
+ "cvt.rn.f32.s32 %f10, %r2;\n"
+ "add.f32 %f11, %f10, 0f3F000000;\n"
+ "mul.f32 %f12, %f11, %f6;\n"
+ "add.f32 %f13, %f3, 0fBF800000;\n"
+ "mul.f32 %f14, %f13, 0f3F000000;\n"
+ "cvt.sat.f32.f32 %f15, %f14;\n"
+ "add.f32 %f16, %f6, 0fBF800000;\n"
+ "mul.f32 %f17, %f16, 0f3F000000;\n"
+ "cvt.sat.f32.f32 %f18, %f17;\n"
+ "add.f32 %f19, %f15, 0f3F000000;\n"
+ "div.rn.f32 %f20, %f15, %f19;\n"
+ "add.f32 %f21, %f18, 0f3F000000;\n"
+ "div.rn.f32 %f22, %f18, %f21;\n"
+ "sub.f32 %f23, %f9, %f20;\n"
+ "sub.f32 %f24, %f12, %f22;\n"
+ "tex.2d.v4.u32.f32 {%r14, %r15, %r16, %r17}, [uchar4_tex, {%f23, %f24}];\n"
+ "add.f32 %f25, %f9, %f20;\n"
+ "tex.2d.v4.u32.f32 {%r18, %r19, %r20, %r21}, [uchar4_tex, {%f25, %f24}];\n"
+ "add.f32 %f26, %f12, %f22;\n"
+ "tex.2d.v4.u32.f32 {%r22, %r23, %r24, %r25}, [uchar4_tex, {%f23, %f26}];\n"
+ "tex.2d.v4.u32.f32 {%r26, %r27, %r28, %r29}, [uchar4_tex, {%f25, %f26}];\n"
+ "and.b32 %r30, %r14, 255;\n"
+ "and.b32 %r31, %r18, 255;\n"
+ "and.b32 %r32, %r22, 255;\n"
+ "and.b32 %r33, %r26, 255;\n"
+ "add.s32 %r34, %r30, %r31;\n"
+ "add.s32 %r35, %r34, %r32;\n"
+ "add.s32 %r36, %r35, %r33;\n"
+ "add.s32 %r37, %r36, 2;\n"
+ "shr.u32 %r38, %r37, 2;\n"
+ "and.b32 %r39, %r15, 255;\n"
+ "and.b32 %r40, %r19, 255;\n"
+ "and.b32 %r41, %r23, 255;\n"
+ "and.b32 %r42, %r27, 255;\n"
+ "add.s32 %r43, %r39, %r40;\n"
+ "add.s32 %r44, %r43, %r41;\n"
+ "add.s32 %r45, %r44, %r42;\n"
+ "add.s32 %r46, %r45, 2;\n"
+ "shr.u32 %r47, %r46, 2;\n"
+ "and.b32 %r48, %r16, 255;\n"
+ "and.b32 %r49, %r20, 255;\n"
+ "and.b32 %r50, %r24, 255;\n"
+ "and.b32 %r51, %r28, 255;\n"
+ "add.s32 %r52, %r48, %r49;\n"
+ "add.s32 %r53, %r52, %r50;\n"
+ "add.s32 %r54, %r53, %r51;\n"
+ "add.s32 %r55, %r54, 2;\n"
+ "shr.u32 %r56, %r55, 2;\n"
+ "and.b32 %r57, %r17, 255;\n"
+ "and.b32 %r58, %r21, 255;\n"
+ "and.b32 %r59, %r25, 255;\n"
+ "and.b32 %r60, %r29, 255;\n"
+ "add.s32 %r61, %r57, %r58;\n"
+ "add.s32 %r62, %r61, %r59;\n"
+ "add.s32 %r63, %r62, %r60;\n"
+ "add.s32 %r64, %r63, 2;\n"
+ "shr.u32 %r65, %r64, 2;\n"
+ "mad.lo.s32 %r66, %r2, %r5, %r1;\n"
+ "mul.wide.s32 %rd4, %r66, 4;\n"
+ "add.s64 %rd5, %rd2, %rd4;\n"
+ "cvt.u16.u32 %rs1, %r65;\n"
+ "cvt.u16.u32 %rs2, %r56;\n"
+ "cvt.u16.u32 %rs3, %r47;\n"
+ "cvt.u16.u32 %rs4, %r38;\n"
+ "st.global.v4.u8 [%rd5], {%rs4, %rs3, %rs2, %rs1};\n"
+ "\n"
+ "BB2_2:\n"
+ "ret;\n"
+ "}\n"
+ "\n"
+ "\n"
+;
+#endif
diff -ruN ffmpeg-orig/libavutil/cudautils.c ffmpeg/libavutil/cudautils.c
--- ffmpeg-orig/libavutil/cudautils.c 1970-01-01 03:00:00.000000000 +0300
+++ ffmpeg/libavutil/cudautils.c 2016-03-04 00:18:17.589433917 +0200
@@ -0,0 +1,288 @@
+/*
+*
+* This file is part of FFmpeg.
+*
+* FFmpeg is free software; you can redistribute it and/or
+* modify it under the terms of the GNU Lesser General Public
+* License as published by the Free Software Foundation; either
+* version 2.1 of the License, or (at your option) any later version.
+*
+* FFmpeg is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+* Lesser General Public License for more details.
+*
+* You should have received a copy of the GNU Lesser General Public
+* License along with FFmpeg; if not, write to the Free Software
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+*/
+
+#include "cudautils.h"
+#include "common.h"
+#include "log.h"
+
+#define FF_NVINFO_VERSION 1 /* value stored in ffnvinfo.version by init_nvinfo() */
+static const NVGUID NV_INFO_GUID = { 0x2cab9a64, 0x7095, 0x11e5, { 0xad, 0x1d, 0x94, 0xde, 0x80, 0x65, 0xb7, 0x74 } }; /* tag marking a buffer as an ffnvinfo; only ever read */
+static CudaContext cudaCtx = { { NULL }, { 0 }, { NULL }, 0, { 0 }, { "" }, { 0 } }; /* process-wide state; cu_devices[] holds ints, so it is initialised with 0, not NULL */
+
+int dyload_cuda(void); /* loads the driver library and resolves its entry points; 1 on success, 0 on failure */
+int check_cuda(void); /* enumerates the devices into cudaCtx; 1 on success, 0 on failure */
+int check_cuda_errors(CUresult err, const char *func); /* 1 if err is CUDA_SUCCESS, else logs and returns 0 */
+
+
+#define CHECK_LOAD_FUNC(t, f, s) \
+do { \
+ (f) = (t)LOAD_FUNC(dl_fn->cuda_lib, s); \
+ if (!(f)) { \
+ av_log(NULL, AV_LOG_FATAL, "Failed loading %s from CUDA library\n", s); \
+ goto error; \
+ } \
+} while (0)
+
+int check_cuda_errors(CUresult err, const char *func)
+{
+ if (err != CUDA_SUCCESS) {
+ av_log(NULL, AV_LOG_FATAL, ">> %s - failed with error code 0x%x\n", func, err);
+ return 0;
+ }
+ return 1;
+}
+#define check_cuda_errors(f) if (!check_cuda_errors(f, #f)) goto error
+
+
+int dyload_cuda(void)
+{
+    /* Load the CUDA driver library once and resolve every entry point of the table; returns 1 on success, 0 on failure. */
+    CudaDynLoadFunctions *dl_fn = &cudaCtx.cuda_dload_funcs;
+    if (dl_fn->cuda_lib) /* already loaded by an earlier call */
+        return 1;
+
+#if defined(_WIN32)
+    dl_fn->cuda_lib = LoadLibrary(TEXT("nvcuda.dll"));
+#else
+    /* Try the versioned runtime name first; the unversioned name is commonly shipped only with development files. */
+    dl_fn->cuda_lib = dlopen("libcuda.so.1", RTLD_LAZY);
+    if (!dl_fn->cuda_lib)
+        dl_fn->cuda_lib = dlopen("libcuda.so", RTLD_LAZY);
+#endif
+    if (!dl_fn->cuda_lib) {
+        av_log(NULL, AV_LOG_FATAL, "Failed loading CUDA library\n");
+        goto error;
+    }
+
+    CHECK_LOAD_FUNC(PCUINIT, dl_fn->cu_init, "cuInit");
+    CHECK_LOAD_FUNC(PCUDEVICEGETCOUNT, dl_fn->cu_device_get_count, "cuDeviceGetCount");
+    CHECK_LOAD_FUNC(PCUDEVICEGET, dl_fn->cu_device_get, "cuDeviceGet");
+    CHECK_LOAD_FUNC(PCUDEVICEGETNAME, dl_fn->cu_device_get_name, "cuDeviceGetName");
+    CHECK_LOAD_FUNC(PCUDEVICECOMPUTECAPABILITY, dl_fn->cu_device_compute_capability, "cuDeviceComputeCapability");
+    CHECK_LOAD_FUNC(PCUCTXCREATE, dl_fn->cu_ctx_create, "cuCtxCreate_v2");
+    CHECK_LOAD_FUNC(PCUCTXPOPCURRENT, dl_fn->cu_ctx_pop_current, "cuCtxPopCurrent_v2");
+    CHECK_LOAD_FUNC(PCUCTXDESTROY, dl_fn->cu_ctx_destroy, "cuCtxDestroy_v2");
+    CHECK_LOAD_FUNC(PCUMODULELOADDATA, dl_fn->cu_module_load_data, "cuModuleLoadData");
+    CHECK_LOAD_FUNC(PCUMODULEGETFUNCTION, dl_fn->cu_module_get_function, "cuModuleGetFunction");
+    CHECK_LOAD_FUNC(PCUMODULEGETTEXREF, dl_fn->cu_module_get_texref, "cuModuleGetTexRef");
+    CHECK_LOAD_FUNC(PCUTEXREFSETFLAGS, dl_fn->cu_texref_set_flags, "cuTexRefSetFlags");
+    CHECK_LOAD_FUNC(PCUTEXREFSETFILTERMODE, dl_fn->cu_texref_set_filtermode, "cuTexRefSetFilterMode");
+    CHECK_LOAD_FUNC(PCUTEXREFSETADDRESS2D, dl_fn->cu_texref_set_address_2D, "cuTexRefSetAddress2D_v3");
+    CHECK_LOAD_FUNC(PCUMEMALLOCPITCH, dl_fn->cu_mem_alloc_pitch, "cuMemAllocPitch_v2");
+    CHECK_LOAD_FUNC(PCUMEMCPY2D, dl_fn->cu_mem_cpy_2D, "cuMemcpy2D_v2");
+    CHECK_LOAD_FUNC(PCUMEMCPY2DASYNC, dl_fn->cu_mem_cpy_2D_async, "cuMemcpy2DAsync_v2");
+    CHECK_LOAD_FUNC(PCUMEMFREE, dl_fn->cu_mem_free, "cuMemFree_v2");
+    CHECK_LOAD_FUNC(PCULAUNCHKERNEL, dl_fn->cu_launch_kernel, "cuLaunchKernel");
+
+    av_log(NULL, AV_LOG_VERBOSE, "CUDA Library and Function loaded successfully\n");
+    return 1;
+error:
+    if (dl_fn->cuda_lib) /* a symbol was missing: drop the half-usable library */
+        DL_CLOSE_FUNC(dl_fn->cuda_lib);
+    dl_fn->cuda_lib = NULL;
+    return 0;
+}
+
+
+int check_cuda(void)
+{
+    /* Initialise the driver and record handle, name and SM version of each device in cudaCtx; returns 1 on success, 0 on failure. */
+    CudaDynLoadFunctions *dl_fn = &cudaCtx.cuda_dload_funcs;
+    int device_count = 0;
+    int smminor = 0, smmajor = 0;
+    int i;
+
+    if (!dyload_cuda())
+        return 0;
+
+    check_cuda_errors(dl_fn->cu_init(0));
+    check_cuda_errors(dl_fn->cu_device_get_count(&device_count));
+
+    if (device_count <= 0) {
+        av_log(NULL, AV_LOG_FATAL, "No CUDA capable devices found\n");
+        goto error;
+    }
+    av_log(NULL, AV_LOG_VERBOSE, "%d CUDA capable devices found\n", device_count);
+
+    /* The per-GPU tables hold MAX_NUM_GPU entries; devices beyond that are ignored instead of overflowing them. */
+    if (device_count > MAX_NUM_GPU)
+        device_count = MAX_NUM_GPU;
+
+    for (i = 0; i < device_count; ++i) {
+        check_cuda_errors(dl_fn->cu_device_get(&cudaCtx.cu_devices[i], i));
+        check_cuda_errors(dl_fn->cu_device_get_name(cudaCtx.gpu_name[i], sizeof(cudaCtx.gpu_name[i]), cudaCtx.cu_devices[i]));
+        check_cuda_errors(dl_fn->cu_device_compute_capability(&smmajor, &smminor, cudaCtx.cu_devices[i]));
+        cudaCtx.smver[i] = (smmajor << 4) | smminor; /* packed as major in the high bits, minor in the low nibble */
+        av_log(NULL, AV_LOG_VERBOSE, "[ GPU #%d - < %s > has Compute SM %d.%d]\n", i, cudaCtx.gpu_name[i], smmajor, smminor);
+    }
+    cudaCtx.device_count = device_count;
+    return 1;
+
+error:
+    cudaCtx.device_count = 0;
+    return 0;
+}
+
+int init_cuda(void)
+{
+    /*
+     * Device enumeration runs only while no devices are recorded; once a
+     * non-zero count is stored this returns 1 without probing again.
+     */
+    if (cudaCtx.device_count != 0)
+        return 1;
+    return check_cuda() ? 1 : 0;
+}
+
+void deinit_cuda(void)
+{
+    /*
+     * Unload the CUDA driver library and reset the global state, unless a
+     * context handed out by get_cuda_context() is still alive.
+     */
+    CudaDynLoadFunctions *dl_fn = &cudaCtx.cuda_dload_funcs;
+    int i;
+
+    for (i = 0; i < cudaCtx.device_count && i < MAX_NUM_GPU; i++) {
+        if (cudaCtx.cuda_context_arr[i] != NULL)
+            return;
+    }
+
+    /* The close primitive must not be handed a NULL handle (library never loaded). */
+    if (dl_fn->cuda_lib) {
+        DL_CLOSE_FUNC(dl_fn->cuda_lib);
+        av_log(NULL, AV_LOG_VERBOSE, "Cuda Library unloaded\n");
+    }
+    dl_fn->cuda_lib = NULL;
+    dl_fn->cu_init = NULL;
+    dl_fn->cu_device_get_count = NULL;
+    dl_fn->cu_device_get = NULL;
+    dl_fn->cu_device_get_name = NULL;
+    dl_fn->cu_device_compute_capability = NULL;
+    dl_fn->cu_ctx_create = NULL;
+    dl_fn->cu_ctx_pop_current = NULL;
+    dl_fn->cu_ctx_destroy = NULL;
+    dl_fn->cu_module_load_data = NULL;
+    dl_fn->cu_module_get_function = NULL;
+    dl_fn->cu_module_get_texref = NULL;
+    dl_fn->cu_texref_set_flags = NULL;
+    dl_fn->cu_texref_set_filtermode = NULL;
+    dl_fn->cu_texref_set_address_2D = NULL;
+    dl_fn->cu_mem_alloc_pitch = NULL;
+    dl_fn->cu_mem_cpy_2D = NULL;
+    dl_fn->cu_mem_cpy_2D_async = NULL;
+    dl_fn->cu_mem_free = NULL;
+    dl_fn->cu_launch_kernel = NULL;
+    /* Without this reset init_cuda() would report success while every entry point above is NULL. */
+    cudaCtx.device_count = 0;
+}
+
+int is_gpu_feature_available(int gpu, int target_smver)
+{
+    /*
+     * smver[] entries are packed as (major << 4) | minor by check_cuda(), so
+     * target_smver must use the same encoding. Returns 1 if GPU index gpu was
+     * enumerated and meets the target, 0 otherwise (including bad indices).
+     */
+    if (gpu < 0 || gpu >= MAX_NUM_GPU || gpu >= cudaCtx.device_count)
+        return 0;
+
+    return cudaCtx.smver[gpu] >= target_smver;
+}
+
+
+int get_cuda_context(CUcontext *ctx, int gpu)
+{
+    /* Store the shared context of GPU index gpu in *ctx (created on first use) and take one reference on it. */
+    /* Returns CUDA_SUCCESS (0) or a non-zero CUresult value; *ctx is NULL on failure. */
+    CudaDynLoadFunctions *dl_fn = &cudaCtx.cuda_dload_funcs;
+    CUcontext new_ctx = NULL;
+    CUresult cu_res;
+
+    if (!ctx)
+        return 1; /* numeric value of CUDA_ERROR_INVALID_VALUE, which this header does not name */
+    *ctx = NULL;
+    if (gpu < 0 || gpu >= MAX_NUM_GPU || gpu >= cudaCtx.device_count)
+        return 1; /* unknown device index (also covers "CUDA not initialised": device_count is 0) */
+    if (cudaCtx.cuda_context_arr[gpu] == NULL) {
+        cu_res = dl_fn->cu_ctx_create(&new_ctx, 4, cudaCtx.cu_devices[gpu]); /* flag 4 is CU_CTX_SCHED_BLOCKING_SYNC in the CUDA driver API */
+        if (cu_res != CUDA_SUCCESS)
+            return cu_res;
+        cudaCtx.cuda_context_arr[gpu] = new_ctx;
+    }
+    *ctx = cudaCtx.cuda_context_arr[gpu];
+    cudaCtx.cuda_context_count[gpu] += 1;
+    av_log(NULL, AV_LOG_VERBOSE, "cudalib : Cuda Context created 0x%p\n", *ctx);
+    return CUDA_SUCCESS;
+}
+
+void release_cuda_context(CUcontext *ctx, int gpu)
+{
+    /* Drop one reference on the shared context of GPU index gpu; the context is destroyed when the last reference goes. */
+    CudaDynLoadFunctions *dl_fn = &cudaCtx.cuda_dload_funcs;
+
+    if (ctx)
+        *ctx = NULL; /* invalidate the caller's handle (assigning the parameter itself had no effect) */
+    if (gpu < 0 || gpu >= MAX_NUM_GPU || cudaCtx.cuda_context_count[gpu] == 0)
+        return; /* bad index or unbalanced release: do not let the unsigned counter wrap around */
+    if (--cudaCtx.cuda_context_count[gpu] == 0) {
+        dl_fn->cu_ctx_destroy(cudaCtx.cuda_context_arr[gpu]);
+        cudaCtx.cuda_context_arr[gpu] = NULL;
+    }
+}
+
+int check_nvinfo(void* ptr)
+{
+    /* A buffer counts as an ffnvinfo when it is non-NULL and its leading guid member equals NV_INFO_GUID; returns 1 or 0. */
+    const ffnvinfo *candidate = ptr;
+    if (candidate == NULL)
+        return 0;
+    return memcmp(&candidate->guid, &NV_INFO_GUID, sizeof(candidate->guid)) == 0;
+}
+
+ffnvinfo* init_nvinfo(void)
+{
+    ffnvinfo *info = av_mallocz(sizeof(*info)); /* new descriptor, returned to the caller */
+    if (!info) return NULL; /* allocation failed: report it instead of writing through NULL */
+    *info = (ffnvinfo){ .guid = NV_INFO_GUID, .version = FF_NVINFO_VERSION }; /* stamp GUID and version; all other members stay zero */
+    return info;
+}
+
+CudaDynLoadFunctions* get_cuda_dl_func(void)
+{
+    return &cudaCtx.cuda_dload_funcs; /* the single process-wide table; its entries are NULL until dyload_cuda() has filled them */
+}
+
+CUresult cuMemCpy2d(const void *srcHost, CUdeviceptr srcDevice, size_t srcPitch, void *dstHost, CUdeviceptr dstDevice, size_t dstPitch, size_t width, size_t height, CUmemorytype srcMemoryType, CUmemorytype dstMemoryType)
+{
+    /* Fill a CUDA_MEMCPY2D from the arguments (offsets and array handles stay zero) and submit it through cu_mem_cpy_2D_async with a NULL stream; returns that call's CUresult. */
+    const CUDA_MEMCPY2D copy_desc = {
+        .srcMemoryType = srcMemoryType,
+        .srcHost       = srcHost,
+        .srcDevice     = srcDevice,
+        .srcPitch      = srcPitch,
+        .dstMemoryType = dstMemoryType,
+        .dstHost       = dstHost,
+        .dstDevice     = dstDevice,
+        .dstPitch      = dstPitch,
+        .WidthInBytes  = width,
+        .Height        = height,
+    };
+    return get_cuda_dl_func()->cu_mem_cpy_2D_async(&copy_desc, NULL);
+}
diff -ruN ffmpeg-orig/libavutil/cudautils.h ffmpeg/libavutil/cudautils.h
--- ffmpeg-orig/libavutil/cudautils.h 1970-01-01 03:00:00.000000000 +0300
+++ ffmpeg/libavutil/cudautils.h 2016-03-04 00:18:17.589433917 +0200
@@ -0,0 +1,216 @@
+/*
+*
+* This file is part of FFmpeg.
+*
+* FFmpeg is free software; you can redistribute it and/or
+* modify it under the terms of the GNU Lesser General Public
+* License as published by the Free Software Foundation; either
+* version 2.1 of the License, or (at your option) any later version.
+*
+* FFmpeg is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+* Lesser General Public License for more details.
+*
+* You should have received a copy of the GNU Lesser General Public
+* License along with FFmpeg; if not, write to the Free Software
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+*/
+
+#if defined(_WIN32)
+#include <windows.h>
+#else
+#include <dlfcn.h>
+#endif
+
+#include "common.h"
+
+
+/*
+ * Platform glue: calling convention of the CUDA driver entry points and the
+ * primitives used to resolve symbols from / unload the driver library.
+ */
+#if defined(_WIN32)
+#define CUDAAPI __stdcall
+#define LOAD_FUNC(l, s) GetProcAddress(l, s)
+#define DL_CLOSE_FUNC(l) FreeLibrary(l)
+#else
+#define CUDAAPI
+#define LOAD_FUNC(l, s) dlsym(l, s)
+#define DL_CLOSE_FUNC(l) dlclose(l)
+#endif
+
+#define MAX_NUM_GPU 16 /* capacity of every per-GPU array in CudaContext */
+/* Texture-reference flag bits. NOTE(review): presumably the values taken by cu_texref_set_flags - confirm against the CUDA driver API. */
+#define CU_TRSF_READ_AS_INTEGER 0x01
+#define CU_TRSF_NORMALIZED_COORDINATES 0x02
+#define CU_TRSF_SRGB 0x10
+
+#define __cu(a) do { \
+ CUresult ret; \
+ if ((ret = (a)) != CUDA_SUCCESS) { \
+ av_log(NULL, AV_LOG_FATAL, "[%s:%d]%s has returned CUDA error %d\n", __FILE__, __LINE__, #a, ret); \
+ return AVERROR_EXTERNAL;\
+ }} while (0)
+
+
+typedef int CUdevice; /* device handle, as filled in through cu_device_get */
+typedef void* CUcontext; /* this and the handle types below are declared as untyped pointers in this header */
+typedef void* CUmodule;
+typedef void* CUfunction;
+typedef void* CUtexref;
+typedef void* CUstream;
+typedef void* CUarray;
+#if defined(_WIN64) || defined(__LP64__) /* device address type: unsigned long long on these targets, unsigned int otherwise */
+typedef unsigned long long CUdeviceptr;
+#else
+typedef unsigned int CUdeviceptr;
+#endif
+
+typedef enum cudaError_enum { /* status type returned by every driver entry point typedef'd in this header */
+ CUDA_SUCCESS = 0 /* the only code named here; callers treat any other value as a failure */
+} CUresult;
+
+typedef enum CUfilter_mode_enum { /* argument type of the cu_texref_set_filtermode entry point */
+ CU_TR_FILTER_MODE_POINT = 0, /**< Point filter mode */
+ CU_TR_FILTER_MODE_LINEAR = 1 /**< Linear filter mode */
+} CUfilter_mode;
+
+typedef enum CUarray_format_enum { /* element format; type of CUDA_ARRAY_DESCRIPTOR.Format */
+ CU_AD_FORMAT_UNSIGNED_INT8 = 0x01, /**< Unsigned 8-bit integers */
+ CU_AD_FORMAT_UNSIGNED_INT16 = 0x02, /**< Unsigned 16-bit integers */
+ CU_AD_FORMAT_UNSIGNED_INT32 = 0x03, /**< Unsigned 32-bit integers */
+ CU_AD_FORMAT_SIGNED_INT8 = 0x08, /**< Signed 8-bit integers */
+ CU_AD_FORMAT_SIGNED_INT16 = 0x09, /**< Signed 16-bit integers */
+ CU_AD_FORMAT_SIGNED_INT32 = 0x0a, /**< Signed 32-bit integers */
+ CU_AD_FORMAT_HALF = 0x10, /**< 16-bit floating point */
+ CU_AD_FORMAT_FLOAT = 0x20 /**< 32-bit floating point */
+} CUarray_format;
+
+typedef struct CUDA_ARRAY_DESCRIPTOR_st /* 2D layout description handed to cu_texref_set_address_2D */
+{
+ size_t Width; /**< Width of array */
+ size_t Height; /**< Height of array */
+
+ CUarray_format Format; /**< Array format */
+ unsigned int NumChannels; /**< Channels per array element */
+} CUDA_ARRAY_DESCRIPTOR;
+
+typedef enum CUmemorytype_enum { /* kind of memory behind each side of a CUDA_MEMCPY2D (srcMemoryType / dstMemoryType) */
+ CU_MEMORYTYPE_HOST = 0x01, /**< Host memory */
+ CU_MEMORYTYPE_DEVICE = 0x02, /**< Device memory */
+ CU_MEMORYTYPE_ARRAY = 0x03, /**< Array memory */
+ CU_MEMORYTYPE_UNIFIED = 0x04 /**< Unified device or host memory */
+} CUmemorytype;
+
+typedef struct CUDA_MEMCPY2D_st { /* parameter block consumed by the cu_mem_cpy_2D and cu_mem_cpy_2D_async entry points */
+ size_t srcXInBytes; /**< Source X in bytes */
+ size_t srcY; /**< Source Y */
+
+ CUmemorytype srcMemoryType; /**< Source memory type (host, device, array) */
+ const void *srcHost; /**< Source host pointer */
+ CUdeviceptr srcDevice; /**< Source device pointer */
+ CUarray srcArray; /**< Source array reference */
+ size_t srcPitch; /**< Source pitch (ignored when src is array) */
+
+ size_t dstXInBytes; /**< Destination X in bytes */
+ size_t dstY; /**< Destination Y */
+
+ CUmemorytype dstMemoryType; /**< Destination memory type (host, device, array) */
+ void *dstHost; /**< Destination host pointer */
+ CUdeviceptr dstDevice; /**< Destination device pointer */
+ CUarray dstArray; /**< Destination array reference */
+ size_t dstPitch; /**< Destination pitch (ignored when dst is array) */
+
+ size_t WidthInBytes; /**< Width of 2D memory copy in bytes */
+ size_t Height; /**< Height of 2D memory copy */
+} CUDA_MEMCPY2D;
+
+/* Function-pointer types for the driver entry points stored in CudaDynLoadFunctions; all use the CUDAAPI calling convention and return CUresult. */
+typedef CUresult(CUDAAPI *PCUINIT)(unsigned int Flags);
+typedef CUresult(CUDAAPI *PCUDEVICEGETCOUNT)(int *count);
+typedef CUresult(CUDAAPI *PCUDEVICEGET)(CUdevice *device, int ordinal);
+typedef CUresult(CUDAAPI *PCUDEVICEGETNAME)(char *name, int len, CUdevice dev);
+typedef CUresult(CUDAAPI *PCUDEVICECOMPUTECAPABILITY)(int *major, int *minor, CUdevice dev);
+typedef CUresult(CUDAAPI *PCUCTXCREATE)(CUcontext *pctx, unsigned int flags, CUdevice dev);
+typedef CUresult(CUDAAPI *PCUCTXPOPCURRENT)(CUcontext *pctx);
+typedef CUresult(CUDAAPI *PCUCTXDESTROY)(CUcontext ctx);
+typedef CUresult(CUDAAPI *PCUMODULELOADDATA)(CUmodule *module, const void *image);
+typedef CUresult(CUDAAPI *PCUMODULEGETFUNCTION)(CUfunction *hfunc, CUmodule hmod, const char *name);
+typedef CUresult(CUDAAPI *PCUMODULEGETTEXREF)(CUtexref *pTexRef, CUmodule hmod, const char *name);
+typedef CUresult(CUDAAPI *PCUTEXREFSETFLAGS)(CUtexref hTexRef, unsigned int Flags);
+typedef CUresult(CUDAAPI *PCUTEXREFSETFILTERMODE)(CUtexref hTexRef, CUfilter_mode fm);
+typedef CUresult(CUDAAPI *PCUTEXREFSETADDRESS2D)(CUtexref hTexRef, const CUDA_ARRAY_DESCRIPTOR *desc, CUdeviceptr dptr, size_t Pitch);
+typedef CUresult(CUDAAPI *PCUMEMALLOCPITCH)(CUdeviceptr* dptr, size_t* pPitch, size_t WidthInBytes, size_t Height, unsigned int ElementSizeBytes);
+typedef CUresult(CUDAAPI *PCUMEMCPY2D)(const CUDA_MEMCPY2D *pCopy);
+typedef CUresult(CUDAAPI *PCUMEMCPY2DASYNC)(const CUDA_MEMCPY2D *pCopy, CUstream hStream);
+typedef CUresult(CUDAAPI *PCUMEMFREE)(CUdeviceptr dptr);
+typedef CUresult(CUDAAPI *PCULAUNCHKERNEL)(CUfunction f, unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, unsigned int sharedMemBytes, CUstream hStream, void **kernelParams, void **extra);
+
+
+typedef struct CudaDynLoadFunctions{ /* table of driver entry points resolved at run time, plus the handle of the library they come from */
+ PCUINIT cu_init;
+ PCUDEVICEGETCOUNT cu_device_get_count;
+ PCUDEVICEGET cu_device_get;
+ PCUDEVICEGETNAME cu_device_get_name;
+ PCUDEVICECOMPUTECAPABILITY cu_device_compute_capability;
+ PCUCTXCREATE cu_ctx_create;
+ PCUCTXPOPCURRENT cu_ctx_pop_current;
+ PCUCTXDESTROY cu_ctx_destroy;
+ PCUMODULELOADDATA cu_module_load_data;
+ PCUMODULEGETFUNCTION cu_module_get_function;
+ PCUMODULEGETTEXREF cu_module_get_texref;
+ PCUTEXREFSETFLAGS cu_texref_set_flags;
+ PCUTEXREFSETFILTERMODE cu_texref_set_filtermode;
+ PCUTEXREFSETADDRESS2D cu_texref_set_address_2D;
+ PCUMEMALLOCPITCH cu_mem_alloc_pitch;
+ PCUMEMCPY2D cu_mem_cpy_2D;
+ PCUMEMCPY2DASYNC cu_mem_cpy_2D_async;
+ PCUMEMFREE cu_mem_free;
+ PCULAUNCHKERNEL cu_launch_kernel;
+ /* Handle of the loaded driver library; NULL while nothing is loaded. */
+#if defined(_WIN32)
+ HMODULE cuda_lib;
+#else
+ void* cuda_lib;
+#endif
+} CudaDynLoadFunctions;
+
+typedef struct CudaContext{ /* global CUDA bookkeeping; the per-GPU arrays are indexed by device ordinal */
+ /* Shared context per GPU together with the number of references handed out for it. */
+ CUcontext cuda_context_arr[MAX_NUM_GPU];
+ unsigned int cuda_context_count[MAX_NUM_GPU];
+ CudaDynLoadFunctions cuda_dload_funcs; /* entry-point table returned by get_cuda_dl_func() */
+
+ int device_count; /* number of enumerated devices; 0 means "not probed yet" to init_cuda() */
+ CUdevice cu_devices[MAX_NUM_GPU]; /* device handles */
+ char gpu_name[MAX_NUM_GPU][128]; /* device names */
+ int smver[MAX_NUM_GPU]; /* compute capability packed as (major << 4) | minor */
+} CudaContext;
+
+typedef struct _NVGUID { /* 16-byte identifier split into 32/16/16-bit fields and 8 raw bytes */
+ uint32_t Data1;
+ uint16_t Data2;
+ uint16_t Data3;
+ uint8_t Data4[8];
+} NVGUID;
+
+typedef struct _ffnvinfo { /* descriptor created by init_nvinfo() and recognised by check_nvinfo() */
+ NVGUID guid; /* first member: the tag check_nvinfo() compares */
+ uint32_t version; /* set to FF_NVINFO_VERSION by init_nvinfo() */
+ //CUcontext cudaCtx;
+ void* vxCtx; /* NOTE(review): purpose not visible in this file - confirm with the users of this struct */
+ CUdeviceptr dptr[8]; /* device pointers; presumably one per image plane - TODO confirm */
+ uint32_t linesize[8]; /* presumably the pitch matching each dptr entry - TODO confirm */
+} ffnvinfo;
+
+int init_cuda(void); /* probes the CUDA devices on first use; returns 1 when devices are available, 0 otherwise */
+void deinit_cuda(void); /* unloads the driver library unless a context from get_cuda_context() is still alive */
+int get_cuda_context(CUcontext *ctx, int gpu); /* stores the shared context of GPU index gpu in *ctx, creating it on first use; returns CUDA_SUCCESS (0) or the failing CUresult */
+void release_cuda_context(CUcontext *ctx, int gpu); /* drops one reference for GPU index gpu; the context is destroyed when the count reaches zero */
+int is_gpu_feature_available(int gpu, int target_smver); /* 1 if the GPU's SM version, packed as (major << 4) | minor, is >= target_smver */
+int check_nvinfo(void* ptr); /* 1 if ptr is non-NULL and starts with the ffnvinfo GUID, else 0 */
+ffnvinfo* init_nvinfo(void); /* allocates a zeroed ffnvinfo with GUID and version filled in */
+CudaDynLoadFunctions* get_cuda_dl_func(void); /* pointer to the global entry-point table */
+/* 2D copy helper: builds a CUDA_MEMCPY2D from its arguments and submits it via cu_mem_cpy_2D_async on a NULL stream. */
+CUresult cuMemCpy2d(const void *srcHost, CUdeviceptr srcDevice, size_t srcPitch, void *dstHost, CUdeviceptr dstDevice, size_t dstPitch, size_t width, size_t height, CUmemorytype srcMemoryType, CUmemorytype dstMemoryType);
diff -ruN ffmpeg-orig/libavutil/Makefile ffmpeg/libavutil/Makefile
--- ffmpeg-orig/libavutil/Makefile 2016-03-04 00:17:21.063188717 +0200
+++ ffmpeg/libavutil/Makefile 2016-03-04 00:18:17.589433917 +0200
@@ -21,6 +21,7 @@
common.h \
cpu.h \
crc.h \
+ cudautils.h \
des.h \
display.h \
downmix_info.h \
@@ -98,6 +99,7 @@
color_utils.o \
cpu.o \
crc.o \
+ cudautils.o \
des.o \
display.o \
downmix_info.o \
diff -ruN ffmpeg-orig/configure ffmpeg/configure
--- ffmpeg-orig/configure 2016-03-04 00:17:20.652150576 +0200
+++ ffmpeg/configure 2016-03-04 02:36:28.264332984 +0200
@@ -278,6 +278,7 @@
--enable-mmal enable decoding via MMAL [no]
--enable-netcdf enable NetCDF, needed for sofalizer filter [no]
--enable-nvenc enable NVIDIA NVENC support [no]
+ --enable-nvresize enable NVIDIA CUDA accelerated resizer [no]
--enable-openal enable OpenAL 1.1 capture support [no]
--enable-opencl enable OpenCL code
--enable-opengl enable OpenGL rendering [no]
@@ -1502,6 +1503,7 @@
mmal
netcdf
nvenc
+ nvresize
openal
opencl
opengl
@@ -5325,6 +5327,7 @@
frei0r_src_filter_extralibs='$ldl'
ladspa_filter_extralibs='$ldl'
nvenc_encoder_extralibs='$ldl'
+nvresize_filter_extralibs='$ldl'
if ! disabled network; then
check_func getaddrinfo $network_extralibs
diff -ruN ffmpeg-orig/libavfilter/allfilters.c ffmpeg/libavfilter/allfilters.c
--- ffmpeg-orig/libavfilter/allfilters.c 2016-03-04 00:17:21.071189460 +0200
+++ ffmpeg/libavfilter/allfilters.c 2016-03-04 00:30:07.377997402 +0200
@@ -225,6 +225,7 @@
REGISTER_FILTER(NOFORMAT, noformat, vf);
REGISTER_FILTER(NOISE, noise, vf);
REGISTER_FILTER(NULL, null, vf);
+ REGISTER_FILTER(NVRESIZE, nvresize, vf);
REGISTER_FILTER(OCR, ocr, vf);
REGISTER_FILTER(OCV, ocv, vf);
REGISTER_FILTER(OVERLAY, overlay, vf);
diff -ruN ffmpeg-orig/libavfilter/Makefile ffmpeg/libavfilter/Makefile
--- ffmpeg-orig/libavfilter/Makefile 2016-03-04 00:17:21.079190202 +0200
+++ ffmpeg/libavfilter/Makefile 2016-03-04 02:41:05.358919913 +0200
@@ -204,6 +204,7 @@
OBJS-$(CONFIG_NOFORMAT_FILTER) += vf_format.o
OBJS-$(CONFIG_NOISE_FILTER) += vf_noise.o
OBJS-$(CONFIG_NULL_FILTER) += vf_null.o
+OBJS-$(CONFIG_NVRESIZE_FILTER) += vf_nvresize.o vf_nvresize_ptx.o
OBJS-$(CONFIG_OCR_FILTER) += vf_ocr.o
OBJS-$(CONFIG_OCV_FILTER) += vf_libopencv.o
OBJS-$(CONFIG_OPENCL) += deshake_opencl.o unsharp_opencl.o
diff -ruN ffmpeg-orig/libavfilter/vf_nvresize.c ffmpeg/libavfilter/vf_nvresize.c
--- ffmpeg-orig/libavfilter/vf_nvresize.c 1970-01-01 03:00:00.000000000 +0300
+++ ffmpeg/libavfilter/vf_nvresize.c 2016-03-04 02:27:15.262271696 +0200
@@ -0,0 +1,669 @@
+/*
+ * Copyright (c) 2011 Roger Pau Monné <roger.pau@entel.upc.edu>
+ * Copyright (c) 2011 Stefano Sabatini
+ * Copyright (c) 2013 Paul B Mahol
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+
+#include "libavutil/avassert.h"
+#include "libavutil/avstring.h"
+#include "libavutil/eval.h"
+#include "libavutil/mathematics.h"
+#include "libavutil/opt.h"
+#include "libavutil/pixdesc.h"
+#include "libavutil/parseutils.h"
+#include "libavutil/cudautils.h"
+
+#include "avfilter.h"
+#include "drawutils.h"
+#include "formats.h"
+#include "internal.h"
+#include "video.h"
+
+/* Integer division of a by b rounded up (intended for a >= 0, b > 0);
+ * used to derive the kernel grid size from the destination size. */
+#define DIV_UP(a, b) ( ((a) + (b) - 1) / (b) )
+/* Capacity of NVResizeContext.outtex[] and upper limit of the "outputs" option. */
+#define MAX_OUTPUT 16
+/* Thread-block dimensions (x, y) passed to cu_launch_kernel for the resize kernels. */
+#define BLOCKX 32
+#define BLOCKY 16
+
+/* One image kept in CUDA device memory. All planes of a frame live in a single
+ * pitched allocation: the first plane starts at dptr, the following planes are
+ * addressed at dptr + pitch * h (and beyond) by the copy/resize helpers. */
+typedef struct cu_tex {
+ int w; // width in pixels (before config_output() may also hold 0 or a negative "special" request)
+ int h; // height in pixels (same special values as w)
+ size_t pitch; // row stride of the first plane in bytes, as returned by cu_mem_alloc_pitch
+ CUdeviceptr dptr; // device address of the allocation; 0/NULL while nothing is allocated
+} cu_tex;
+
+/* Private state of one nvresize filter instance. */
+typedef struct NVResizeContext {
+ const AVClass *class;
+
+ /**
+ * Number of output pads ("outputs" option, 1..MAX_OUTPUT).
+ * Each output i takes its requested size from outtex[i].w / outtex[i].h,
+ * where config_output() understands these special values:
+ * 0 = original width/height
+ * -1 = keep original aspect
+ * -N = try to keep aspect but make sure it is divisible by N
+ */
+ int nb_outputs;
+
+ char *size_str; // "size"/"s" option: one or more sizes separated by '|'
+ int force_original_aspect_ratio; // 0 = off, 1 = shrink to fit, other non-zero = grow to fit
+ int readback_FB; // non-zero: copy every result back into the host AVFrame
+ int gpu; // index of the GPU whose CUDA context is used
+
+ int cuda_inited; // NOTE(review): not referenced in the visible part of this file
+
+ CUcontext cu_ctx; // CUDA context obtained in init(), released in uninit()
+ CudaDynLoadFunctions* cu_dl_func; // table of CUDA driver entry points
+ CUmodule cu_module; // module loaded from the embedded PTX (resize_ptx)
+ CUfunction cu_func_uchar; // kernel Subsample_Bilinear_uchar (1 channel)
+ CUfunction cu_func_uchar2; // kernel Subsample_Bilinear_uchar2 (2 channels)
+ CUfunction cu_func_uchar4; // kernel Subsample_Bilinear_uchar4 (4 channels)
+ CUtexref cu_tex_uchar; // texture reference "uchar_tex"
+ CUtexref cu_tex_uchar2; // texture reference "uchar2_tex"
+ CUtexref cu_tex_uchar4; // texture reference "uchar4_tex"
+ cu_tex intex; // input frame on the device (uploaded here or supplied via AVFrame.opaque)
+ cu_tex outtex[MAX_OUTPUT]; // per-output device buffers, ordered by init() with the widest requested size first
+
+} NVResizeContext;
+
+#define OFFSET(x) offsetof(NVResizeContext, x)
+// Parenthesized so the macro stays one operand wherever it is expanded.
+#define FLAGS (AV_OPT_FLAG_FILTERING_PARAM | AV_OPT_FLAG_VIDEO_PARAM)
+
+/*
+ * User-visible options of the nvresize filter.
+ * Field order per entry: name, help, offset, type, default, min, max, flags, unit.
+ * The string options carry an explicit min AND max of 0; with only one of them
+ * present FLAGS would be stored as "max" and the option's flags would be 0.
+ */
+static const AVOption nvresize_options[] = {
+    { "outputs", "set number of outputs", OFFSET(nb_outputs), AV_OPT_TYPE_INT, { .i64 = 1 }, 1, MAX_OUTPUT, FLAGS },
+    { "readback", "read result back to FB", OFFSET(readback_FB), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, 1, FLAGS },
+    { "size", "set video size", OFFSET(size_str), AV_OPT_TYPE_STRING, { .str = NULL }, 0, 0, FLAGS },
+    { "s", "set video size", OFFSET(size_str), AV_OPT_TYPE_STRING, { .str = NULL }, 0, 0, FLAGS },
+    { "gpu", "Selects which NVENC capable GPU to use. First GPU is 0, second is 1, and so on.", OFFSET(gpu), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, INT_MAX, FLAGS },
+    { "force_original_aspect_ratio", "decrease or increase w/h if necessary to keep the original AR", OFFSET(force_original_aspect_ratio), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, 2, FLAGS, "force_oar" },
+    { NULL }
+};
+
+AVFILTER_DEFINE_CLASS(nvresize);
+
+/**
+ * Announce the pixel formats this filter can process on all of its links.
+ *
+ * @return the result of ff_set_common_formats(), or AVERROR(ENOMEM) when the
+ *         format list cannot be allocated
+ */
+static int query_formats(AVFilterContext *ctx)
+{
+    // Planar/semi-planar YUV plus the four packed 32-bit RGB layouts.
+    static const enum AVPixelFormat supported_fmts[] = {
+        AV_PIX_FMT_YUV420P,
+        AV_PIX_FMT_YUV444P,
+        AV_PIX_FMT_NV12,
+        AV_PIX_FMT_ARGB,
+        AV_PIX_FMT_RGBA,
+        AV_PIX_FMT_ABGR,
+        AV_PIX_FMT_BGRA,
+        AV_PIX_FMT_NONE,
+    };
+    AVFilterFormats *list = ff_make_format_list((const int*)supported_fmts);
+
+    return list ? ff_set_common_formats(ctx, list) : AVERROR(ENOMEM);
+}
+
+/**
+ * Configure one output link: resolve the requested size (including the special
+ * values 0, -1 and -N) against the input link, publish it on the link together
+ * with the adjusted sample aspect ratio, and allocate the device buffer that
+ * will receive the resized frames for this output.
+ *
+ * @param outlink output link being configured; its source pad is named "out<N>"
+ * @return 0 on success, AVERROR(EINVAL) when the pad index, the resulting size
+ *         or the pixel format cannot be handled
+ */
+static int config_output(AVFilterLink *outlink)
+{
+    AVFilterContext *ctx = outlink->src;
+    AVFilterLink *inlink = outlink->src->inputs[0];
+    NVResizeContext *s = ctx->priv;
+
+    // init() names the pads "out<N>"; skip the 3-character prefix to get N.
+    int outIdx = atoi(outlink->srcpad->name + 3);
+    int64_t w, h;
+    int factor_w, factor_h;
+
+    if (outIdx < 0 || outIdx >= MAX_OUTPUT) {
+        av_log(ctx, AV_LOG_ERROR, "Invalid output pad '%s'\n", outlink->srcpad->name);
+        return AVERROR(EINVAL);
+    }
+
+    w = s->outtex[outIdx].w;
+    h = s->outtex[outIdx].h;
+
+    // Check whether the result is requested to be divisible by some factor
+    // (w or h = -n, with n being the factor).
+    factor_w = 1;
+    factor_h = 1;
+    if (w < -1) {
+        factor_w = -w;
+    }
+    if (h < -1) {
+        factor_h = -h;
+    }
+
+    // Both dimensions "automatic" means: keep the input size.
+    if (w < 0 && h < 0)
+        s->outtex[outIdx].w = s->outtex[outIdx].h = 0;
+
+    if (!(w = s->outtex[outIdx].w))
+        w = inlink->w;
+    if (!(h = s->outtex[outIdx].h))
+        h = inlink->h;
+
+    // Make sure that the result is divisible by the factor determined above.
+    // If no factor was set, nothing changes because the default factor is 1.
+    if (w < 0)
+        w = av_rescale(h, inlink->w, inlink->h * factor_w) * factor_w;
+    if (h < 0)
+        h = av_rescale(w, inlink->h, inlink->w * factor_h) * factor_h;
+
+    // Note that force_original_aspect_ratio may overwrite the previously set
+    // dimensions so that they are no longer divisible by the set factors.
+    if (s->force_original_aspect_ratio) {
+        int tmp_w = av_rescale(h, inlink->w, inlink->h);
+        int tmp_h = av_rescale(w, inlink->h, inlink->w);
+
+        if (s->force_original_aspect_ratio == 1) {
+            w = FFMIN(tmp_w, w);
+            h = FFMIN(tmp_h, h);
+        } else {
+            w = FFMAX(tmp_w, w);
+            h = FFMAX(tmp_h, h);
+        }
+    }
+
+    // Refuse sizes that do not fit the int fields written below instead of
+    // silently truncating them and allocating a buffer of the wrong size.
+    if (w > INT_MAX || h > INT_MAX ||
+        (h * inlink->w) > INT_MAX ||
+        (w * inlink->h) > INT_MAX) {
+        av_log(ctx, AV_LOG_ERROR, "Rescaled value for width or height is too big.\n");
+        return AVERROR(EINVAL);
+    }
+
+    s->outtex[outIdx].w = outlink->w = w;
+    s->outtex[outIdx].h = outlink->h = h;
+
+    if (inlink->sample_aspect_ratio.num) {
+        outlink->sample_aspect_ratio = av_mul_q((AVRational){outlink->h * inlink->w, outlink->w * inlink->h}, inlink->sample_aspect_ratio);
+    } else
+        outlink->sample_aspect_ratio = inlink->sample_aspect_ratio;
+
+    // Create the output device memory; every format keeps all of its planes
+    // in a single pitched allocation.
+    switch (outlink->format) {
+    case AV_PIX_FMT_YUV420P:
+    case AV_PIX_FMT_NV12:
+        // luma rows followed by half as many rows of chroma data
+        __cu(s->cu_dl_func->cu_mem_alloc_pitch(&s->outtex[outIdx].dptr,
+            &s->outtex[outIdx].pitch, s->outtex[outIdx].w, s->outtex[outIdx].h*3/2, 16));
+        break;
+
+    case AV_PIX_FMT_YUV444P:
+        // three full-size planes stacked vertically
+        __cu(s->cu_dl_func->cu_mem_alloc_pitch(&s->outtex[outIdx].dptr,
+            &s->outtex[outIdx].pitch, s->outtex[outIdx].w, s->outtex[outIdx].h*3, 16));
+        break;
+
+    case AV_PIX_FMT_ARGB:
+    case AV_PIX_FMT_RGBA:
+    case AV_PIX_FMT_ABGR:
+    case AV_PIX_FMT_BGRA:
+        // one packed plane with four bytes per pixel
+        __cu(s->cu_dl_func->cu_mem_alloc_pitch(&s->outtex[outIdx].dptr,
+            &s->outtex[outIdx].pitch, s->outtex[outIdx].w*4, s->outtex[outIdx].h, 16));
+        break;
+
+    default:
+        av_log(ctx, AV_LOG_ERROR, "Unsupported output format: %s!\n", av_get_pix_fmt_name(outlink->format));
+        return AVERROR(EINVAL);
+    }
+
+    return 0;
+}
+
+/**
+ * Filter initialisation: create the "out<N>" output pads, parse the
+ * '|'-separated size list into outtex[], order the requested sizes from widest
+ * to narrowest, then acquire the CUDA context and load the resize kernels and
+ * texture references from the embedded PTX image.
+ *
+ * @return 0 on success, a negative AVERROR code on allocation failure, on an
+ *         invalid/too long size list, or when CUDA cannot be set up
+ */
+static av_cold int init(AVFilterContext *ctx)
+{
+    // Declared const to match the definition in vf_nvresize_ptx.c.
+    extern const char resize_ptx[];
+    NVResizeContext *s = ctx->priv;
+    int ret;
+    int i, j;
+    int count = 0;
+
+    for (i = 0; i < s->nb_outputs; i++) {
+        char name[32];
+        AVFilterPad pad = { 0 };
+
+        snprintf(name, sizeof(name), "out%d", i);
+        pad.type = ctx->filter->inputs[0].type;
+        pad.name = av_strdup(name);
+        pad.config_props = config_output;
+        if (!pad.name)
+            return AVERROR(ENOMEM);
+
+        ff_insert_outpad(ctx, i, &pad);
+    }
+
+    // Parse the size list: "WxH|WxH|...". Each entry is terminated in place
+    // for parsing and the separator is put back afterwards so the option
+    // string stays intact.
+    if (s->size_str) {
+        char *head = s->size_str;
+
+        for (;;) {
+            char *sep = strchr(head, '|');
+
+            // outtex[] has MAX_OUTPUT slots; a longer list would write past it.
+            if (count >= MAX_OUTPUT) {
+                av_log(ctx, AV_LOG_ERROR, "Too many sizes specified, at most %d are supported\n", MAX_OUTPUT);
+                return AVERROR(EINVAL);
+            }
+
+            if (sep)
+                *sep = 0;
+            if ((ret = av_parse_video_size(&s->outtex[count].w, &s->outtex[count].h, head)) < 0) {
+                av_log(ctx, AV_LOG_ERROR, "Invalid size '%s'\n", head);
+                return ret;
+            }
+            count++;
+
+            if (!sep)
+                break;
+            *sep = '|';
+            head = sep + 1;
+        }
+    }
+
+    // Order the parsed sizes by width, widest first, so that each output can
+    // be produced from the input or from a previous, larger output.
+    for (i = 0; i < count; i++) {
+        for (j = i+1; j < count; j++) {
+            if (s->outtex[i].w < s->outtex[j].w) {
+                FFSWAP(int, s->outtex[i].w, s->outtex[j].w);
+                FFSWAP(int, s->outtex[i].h, s->outtex[j].h);
+            }
+        }
+    }
+
+    // Fewer sizes than outputs: move the given sizes to the last outputs and
+    // let the leading outputs keep the input size (0 = original size).
+    if (count < s->nb_outputs) {
+        int offset = s->nb_outputs - count;
+        for (i = s->nb_outputs-1; i >= offset; i--) {
+            s->outtex[i].w = s->outtex[i-offset].w;
+            s->outtex[i].h = s->outtex[i-offset].h;
+        }
+        for (i = 0; i < offset; i++) {
+            s->outtex[i].w = s->outtex[i].h = 0;
+        }
+    }
+
+    // init cuda_context
+    if (!s->cu_ctx) {
+        init_cuda();
+        get_cuda_context(&s->cu_ctx, s->gpu);
+        // Everything below needs a context; stop here instead of failing in
+        // the first driver call.
+        if (!s->cu_ctx) {
+            av_log(ctx, AV_LOG_ERROR, "Failed to acquire a CUDA context on GPU %d\n", s->gpu);
+            return AVERROR_EXTERNAL;
+        }
+    }
+    s->cu_dl_func = get_cuda_dl_func();
+    if (!s->cu_dl_func) {
+        av_log(ctx, AV_LOG_ERROR, "CUDA driver functions are not available\n");
+        return AVERROR_EXTERNAL;
+    }
+
+    __cu(s->cu_dl_func->cu_module_load_data(&s->cu_module, resize_ptx));
+
+    // load functions
+    __cu(s->cu_dl_func->cu_module_get_function(&s->cu_func_uchar, s->cu_module, "Subsample_Bilinear_uchar"));
+    __cu(s->cu_dl_func->cu_module_get_function(&s->cu_func_uchar2, s->cu_module, "Subsample_Bilinear_uchar2"));
+    __cu(s->cu_dl_func->cu_module_get_function(&s->cu_func_uchar4, s->cu_module, "Subsample_Bilinear_uchar4"));
+    __cu(s->cu_dl_func->cu_module_get_texref(&s->cu_tex_uchar, s->cu_module, "uchar_tex"));
+    __cu(s->cu_dl_func->cu_module_get_texref(&s->cu_tex_uchar2, s->cu_module, "uchar2_tex"));
+    __cu(s->cu_dl_func->cu_module_get_texref(&s->cu_tex_uchar4, s->cu_module, "uchar4_tex"));
+
+    // textures return raw integer samples and use linear filtering
+    __cu(s->cu_dl_func->cu_texref_set_flags(s->cu_tex_uchar, CU_TRSF_READ_AS_INTEGER));
+    __cu(s->cu_dl_func->cu_texref_set_flags(s->cu_tex_uchar2, CU_TRSF_READ_AS_INTEGER));
+    __cu(s->cu_dl_func->cu_texref_set_flags(s->cu_tex_uchar4, CU_TRSF_READ_AS_INTEGER));
+    __cu(s->cu_dl_func->cu_texref_set_filtermode(s->cu_tex_uchar, CU_TR_FILTER_MODE_LINEAR));
+    __cu(s->cu_dl_func->cu_texref_set_filtermode(s->cu_tex_uchar2, CU_TR_FILTER_MODE_LINEAR));
+    __cu(s->cu_dl_func->cu_texref_set_filtermode(s->cu_tex_uchar4, CU_TR_FILTER_MODE_LINEAR));
+
+    return 0;
+}
+
+/**
+ * Upload a host frame into a device buffer of the same dimensions.
+ * The planes are stored one after another inside dst's single allocation;
+ * for YUV420P the two chroma planes use half of dst's pitch.
+ *
+ * @param s   filter context (not used directly by the statements below)
+ * @param src host frame to read from
+ * @param dst device buffer; must have the same width/height as src (asserted)
+ * @return 0 on success, -1 for an unsupported pixel format
+ */
+static int copy_from_avframe(NVResizeContext *s, AVFrame* src, cu_tex* dst)
+{
+    const CUdeviceptr base = dst->dptr;
+    const size_t pitch = dst->pitch;
+    // size in bytes of the first plane, i.e. the offset of the second one
+    const size_t plane0 = dst->pitch*dst->h;
+    const int fw = src->width;
+    const int fh = src->height;
+
+    av_assert0(src->width == dst->w && src->height == dst->h);
+
+    switch (src->format) {
+    case AV_PIX_FMT_YUV420P:
+        // Y, then U and V at quarter size
+        __cu(cuMemCpy2d(src->data[0], (CUdeviceptr)NULL, src->linesize[0], NULL, base, pitch,
+                        fw, fh, CU_MEMORYTYPE_HOST, CU_MEMORYTYPE_DEVICE));
+        __cu(cuMemCpy2d(src->data[1], (CUdeviceptr)NULL, src->linesize[1], NULL, base + plane0, pitch / 2,
+                        fw / 2, fh / 2, CU_MEMORYTYPE_HOST, CU_MEMORYTYPE_DEVICE));
+        __cu(cuMemCpy2d(src->data[2], (CUdeviceptr)NULL, src->linesize[2], NULL, base + plane0 * 5 / 4, pitch / 2,
+                        fw / 2, fh / 2, CU_MEMORYTYPE_HOST, CU_MEMORYTYPE_DEVICE));
+        return 0;
+
+    case AV_PIX_FMT_YUV444P:
+        // three full-size planes: Y, U, V
+        __cu(cuMemCpy2d(src->data[0], (CUdeviceptr)NULL, src->linesize[0], NULL, base, pitch,
+                        fw, fh, CU_MEMORYTYPE_HOST, CU_MEMORYTYPE_DEVICE));
+        __cu(cuMemCpy2d(src->data[1], (CUdeviceptr)NULL, src->linesize[1], NULL, base + plane0, pitch,
+                        fw, fh, CU_MEMORYTYPE_HOST, CU_MEMORYTYPE_DEVICE));
+        __cu(cuMemCpy2d(src->data[2], (CUdeviceptr)NULL, src->linesize[2], NULL, base + plane0 * 2, pitch,
+                        fw, fh, CU_MEMORYTYPE_HOST, CU_MEMORYTYPE_DEVICE));
+        return 0;
+
+    case AV_PIX_FMT_NV12:
+        // Y, then the interleaved UV plane with half the rows
+        __cu(cuMemCpy2d(src->data[0], (CUdeviceptr)NULL, src->linesize[0], NULL, base, pitch,
+                        fw, fh, CU_MEMORYTYPE_HOST, CU_MEMORYTYPE_DEVICE));
+        __cu(cuMemCpy2d(src->data[1], (CUdeviceptr)NULL, src->linesize[1], NULL, base + plane0, pitch,
+                        fw, fh / 2, CU_MEMORYTYPE_HOST, CU_MEMORYTYPE_DEVICE));
+        return 0;
+
+    case AV_PIX_FMT_ARGB:
+    case AV_PIX_FMT_RGBA:
+    case AV_PIX_FMT_ABGR:
+    case AV_PIX_FMT_BGRA:
+        // single packed plane, four bytes per pixel
+        __cu(cuMemCpy2d(src->data[0], (CUdeviceptr)NULL, src->linesize[0], NULL, base, pitch,
+                        fw * 4, fh, CU_MEMORYTYPE_HOST, CU_MEMORYTYPE_DEVICE));
+        return 0;
+
+    default:
+        break;
+    }
+
+    av_log(NULL, AV_LOG_FATAL, "Unsupported input format: %s!\n", av_get_pix_fmt_name(src->format));
+    return -1;
+}
+
+/**
+ * Download a device buffer into a host frame, plane by plane.
+ * The plane layout inside src matches the one written by the upload helper:
+ * planes follow each other in one allocation, YUV420P chroma uses pitch / 2.
+ * The copy extents are taken from dst; the two sizes are not cross-checked.
+ *
+ * @param s   filter context (not used directly by the statements below)
+ * @param src device buffer to read from
+ * @param dst host frame to fill
+ * @return 0 on success, -1 for an unsupported pixel format
+ */
+static int copy_to_avframe(NVResizeContext* s, cu_tex* src, AVFrame* dst)
+{
+    const CUdeviceptr base = src->dptr;
+    const size_t pitch = src->pitch;
+    // size in bytes of the first plane, i.e. the offset of the second one
+    const size_t plane0 = src->pitch*src->h;
+    const int fw = dst->width;
+    const int fh = dst->height;
+
+    switch (dst->format) {
+    case AV_PIX_FMT_YUV420P:
+        // Y, then U and V at quarter size
+        __cu(cuMemCpy2d(NULL, base, pitch, dst->data[0], (CUdeviceptr)NULL, dst->linesize[0],
+                        fw, fh, CU_MEMORYTYPE_DEVICE, CU_MEMORYTYPE_HOST));
+        __cu(cuMemCpy2d(NULL, base + plane0, pitch / 2, dst->data[1], (CUdeviceptr)NULL, dst->linesize[1],
+                        fw / 2, fh / 2, CU_MEMORYTYPE_DEVICE, CU_MEMORYTYPE_HOST));
+        __cu(cuMemCpy2d(NULL, base + plane0 * 5 / 4, pitch / 2, dst->data[2], (CUdeviceptr)NULL, dst->linesize[2],
+                        fw / 2, fh / 2, CU_MEMORYTYPE_DEVICE, CU_MEMORYTYPE_HOST));
+        return 0;
+
+    case AV_PIX_FMT_YUV444P:
+        // three full-size planes: Y, U, V
+        __cu(cuMemCpy2d(NULL, base, pitch, dst->data[0], (CUdeviceptr)NULL, dst->linesize[0],
+                        fw, fh, CU_MEMORYTYPE_DEVICE, CU_MEMORYTYPE_HOST));
+        __cu(cuMemCpy2d(NULL, base + plane0, pitch, dst->data[1], (CUdeviceptr)NULL, dst->linesize[1],
+                        fw, fh, CU_MEMORYTYPE_DEVICE, CU_MEMORYTYPE_HOST));
+        __cu(cuMemCpy2d(NULL, base + plane0 * 2, pitch, dst->data[2], (CUdeviceptr)NULL, dst->linesize[2],
+                        fw, fh, CU_MEMORYTYPE_DEVICE, CU_MEMORYTYPE_HOST));
+        return 0;
+
+    case AV_PIX_FMT_NV12:
+        // Y, then the interleaved UV plane with half the rows
+        __cu(cuMemCpy2d(NULL, base, pitch, dst->data[0], (CUdeviceptr)NULL, dst->linesize[0],
+                        fw, fh, CU_MEMORYTYPE_DEVICE, CU_MEMORYTYPE_HOST));
+        __cu(cuMemCpy2d(NULL, base + plane0, pitch, dst->data[1], (CUdeviceptr)NULL, dst->linesize[1],
+                        fw, fh / 2, CU_MEMORYTYPE_DEVICE, CU_MEMORYTYPE_HOST));
+        return 0;
+
+    case AV_PIX_FMT_ARGB:
+    case AV_PIX_FMT_RGBA:
+    case AV_PIX_FMT_ABGR:
+    case AV_PIX_FMT_BGRA:
+        // single packed plane, four bytes per pixel
+        __cu(cuMemCpy2d(NULL, base, pitch, dst->data[0], (CUdeviceptr)NULL, dst->linesize[0],
+                        fw * 4, fh, CU_MEMORYTYPE_DEVICE, CU_MEMORYTYPE_HOST));
+        return 0;
+
+    default:
+        break;
+    }
+
+    av_log(NULL, AV_LOG_FATAL, "Unsupported output format: %s!\n", av_get_pix_fmt_name(dst->format));
+    return -1;
+}
+
+/**
+ * Bind one source plane to a texture reference and launch a resize kernel
+ * that writes the destination plane.
+ *
+ * @param dl_func    table of CUDA driver entry points
+ * @param func       kernel to launch
+ * @param tex        texture reference the kernel samples from
+ * @param channels   number of 8-bit channels per texel of the source plane
+ * @param src_dptr   device address of the source plane
+ * @param src_width  source plane width in texels
+ * @param src_height source plane height in rows
+ * @param src_pitch  source pitch handed to cu_texref_set_address_2D
+ * @param dst_dptr   device address of the destination plane
+ * @param dst_width  destination width, forwarded to the kernel
+ * @param dst_height destination height, forwarded to the kernel
+ * @param dst_pitch  destination pitch, forwarded to the kernel
+ * @return 0 once both driver calls have been issued
+ */
+static int call_resize_kernel(CudaDynLoadFunctions* dl_func, CUfunction func, CUtexref tex, int channels,
+                              CUdeviceptr src_dptr, int src_width, int src_height, int src_pitch,
+                              CUdeviceptr dst_dptr, int dst_width, int dst_height, int dst_pitch)
+{
+    CUDA_ARRAY_DESCRIPTOR src_desc;
+    // Kernel parameters in declaration order: destination first, then source size.
+    void *kernel_args[] = { &dst_dptr, &dst_width, &dst_height, &dst_pitch, &src_width, &src_height };
+    // Enough blocks of BLOCKX x BLOCKY threads to cover every destination element.
+    const int grid_x = DIV_UP(dst_width, BLOCKX);
+    const int grid_y = DIV_UP(dst_height, BLOCKY);
+
+    src_desc.Format = CU_AD_FORMAT_UNSIGNED_INT8;
+    src_desc.NumChannels = channels;
+    src_desc.Width = src_width;
+    src_desc.Height = src_height;
+
+    __cu(dl_func->cu_texref_set_address_2D(tex, &src_desc, src_dptr, src_pitch));
+
+    __cu(dl_func->cu_launch_kernel(func, grid_x, grid_y, 1,
+                                   BLOCKX, BLOCKY, 1, 0, NULL, kernel_args, NULL));
+
+    return 0;
+}
+
+/**
+ * Produce dst from src on the GPU. When both buffers have the same size the
+ * data is copied device-to-device, otherwise each plane is resampled with the
+ * kernel matching its channel count (1 for planar data, 2 for the NV12 chroma
+ * plane, 4 for packed 32-bit formats).
+ *
+ * A non-zero result of call_resize_kernel() is reported to the caller instead
+ * of being dropped, so a failed plane no longer looks like success.
+ *
+ * @param s      filter context providing the kernels and texture references
+ * @param src    source device buffer
+ * @param dst    destination device buffer
+ * @param format pixel format shared by src and dst
+ * @return 0 on success, -1 on an unsupported format or a failed kernel call
+ */
+static int do_cuda_resize(NVResizeContext *s, cu_tex* src, cu_tex* dst, int format)
+{
+    switch (format) {
+    case AV_PIX_FMT_YUV420P:
+        if (src->w == dst->w && src->h == dst->h && src->pitch == dst->pitch) {
+            // Identical layout: copy whole pitched rows so the half-pitch
+            // chroma planes are carried along.
+            __cu(cuMemCpy2d(NULL, src->dptr, src->pitch, NULL, dst->dptr, dst->pitch, src->pitch, src->h*3/2, CU_MEMORYTYPE_DEVICE, CU_MEMORYTYPE_DEVICE));
+        }
+        else {
+            // Y plane
+            if (call_resize_kernel(s->cu_dl_func, s->cu_func_uchar, s->cu_tex_uchar, 1,
+                                   src->dptr, src->w, src->h, src->pitch,
+                                   dst->dptr, dst->w, dst->h, dst->pitch))
+                return -1;
+
+            // U plane
+            if (call_resize_kernel(s->cu_dl_func, s->cu_func_uchar, s->cu_tex_uchar, 1,
+                                   src->dptr+src->pitch*src->h, src->w/2, src->h/2, src->pitch/2,
+                                   dst->dptr+dst->pitch*dst->h, dst->w/2, dst->h/2, dst->pitch/2))
+                return -1;
+
+            // V plane
+            if (call_resize_kernel(s->cu_dl_func, s->cu_func_uchar, s->cu_tex_uchar, 1,
+                                   src->dptr+src->pitch*src->h*5/4, src->w/2, src->h/2, src->pitch/2,
+                                   dst->dptr+dst->pitch*dst->h*5/4, dst->w/2, dst->h/2, dst->pitch/2))
+                return -1;
+        }
+
+        break;
+
+    case AV_PIX_FMT_YUV444P:
+        if (src->w == dst->w && src->h == dst->h) {
+            __cu(cuMemCpy2d(NULL, src->dptr, src->pitch, NULL, dst->dptr, dst->pitch, src->w, src->h*3, CU_MEMORYTYPE_DEVICE, CU_MEMORYTYPE_DEVICE));
+        }
+        else {
+            // Y plane
+            if (call_resize_kernel(s->cu_dl_func, s->cu_func_uchar, s->cu_tex_uchar, 1,
+                                   src->dptr, src->w, src->h, src->pitch,
+                                   dst->dptr, dst->w, dst->h, dst->pitch))
+                return -1;
+
+            // U plane
+            if (call_resize_kernel(s->cu_dl_func, s->cu_func_uchar, s->cu_tex_uchar, 1,
+                                   src->dptr+src->pitch*src->h, src->w, src->h, src->pitch,
+                                   dst->dptr+dst->pitch*dst->h, dst->w, dst->h, dst->pitch))
+                return -1;
+
+            // V plane
+            if (call_resize_kernel(s->cu_dl_func, s->cu_func_uchar, s->cu_tex_uchar, 1,
+                                   src->dptr+src->pitch*src->h*2, src->w, src->h, src->pitch,
+                                   dst->dptr+dst->pitch*dst->h*2, dst->w, dst->h, dst->pitch))
+                return -1;
+        }
+
+        break;
+
+    case AV_PIX_FMT_NV12:
+        if (src->w == dst->w && src->h == dst->h) {
+            __cu(cuMemCpy2d(NULL, src->dptr, src->pitch, NULL, dst->dptr, dst->pitch, src->w, src->h*3/2, CU_MEMORYTYPE_DEVICE, CU_MEMORYTYPE_DEVICE));
+        }
+        else {
+            // Y plane
+            if (call_resize_kernel(s->cu_dl_func, s->cu_func_uchar, s->cu_tex_uchar, 1,
+                                   src->dptr, src->w, src->h, src->pitch,
+                                   dst->dptr, dst->w, dst->h, dst->pitch))
+                return -1;
+
+            // Interleaved UV plane: two channels per texel, so the destination
+            // pitch is passed as pitch/2.
+            if (call_resize_kernel(s->cu_dl_func, s->cu_func_uchar2, s->cu_tex_uchar2, 2,
+                                   src->dptr+src->pitch*src->h, src->w/2, src->h/2, src->pitch,
+                                   dst->dptr+dst->pitch*dst->h, dst->w/2, dst->h/2, dst->pitch/2))
+                return -1;
+        }
+
+        break;
+
+    case AV_PIX_FMT_ARGB:
+    case AV_PIX_FMT_RGBA:
+    case AV_PIX_FMT_ABGR:
+    case AV_PIX_FMT_BGRA:
+        if (src->w == dst->w && src->h == dst->h) {
+            __cu(cuMemCpy2d(NULL, src->dptr, src->pitch, NULL, dst->dptr, dst->pitch, src->w*4, src->h, CU_MEMORYTYPE_DEVICE, CU_MEMORYTYPE_DEVICE));
+        }
+        else {
+            // Packed plane: four channels per texel, destination pitch passed as pitch/4.
+            if (call_resize_kernel(s->cu_dl_func, s->cu_func_uchar4, s->cu_tex_uchar4, 4,
+                                   src->dptr, src->w, src->h, src->pitch,
+                                   dst->dptr, dst->w, dst->h, dst->pitch/4))
+                return -1;
+        }
+
+        break;
+
+    default:
+        av_log(NULL, AV_LOG_FATAL, "Unsupported input format: %s!\n", av_get_pix_fmt_name(format));
+        return -1;
+    }
+
+    return 0;
+}
+
+/**
+ * Choose the device buffer from which `target` should be resampled.
+ *
+ * A candidate is accepted when it is less than four times as wide as the
+ * target. The previously used source is tried first, then the outputs that
+ * precede `target` in outtex[], nearest first. If none qualifies, the nearest
+ * preceding output (or `source` when target is the first output) is used and
+ * a warning is logged.
+ *
+ * @param s      filter context owning intex and outtex[]
+ * @param source buffer used for the previous output, NULL for the first call
+ * @param target element of s->outtex that is about to be produced
+ * @return the buffer to read from
+ */
+static cu_tex* find_resize_src(NVResizeContext* s, cu_tex* source, cu_tex* target)
+{
+    cu_tex *fallback;
+    int idx;
+    int k;
+
+    // First output of a frame: always start from the input.
+    if (!source)
+        return &s->intex;
+
+    if (source->w < target->w * 4)
+        return source;
+
+    // position of target inside s->outtex
+    idx = target - s->outtex;
+    k = idx;
+    while (--k >= 0) {
+        if (s->outtex[k].w < target->w * 4)
+            return &s->outtex[k];
+    }
+
+    fallback = idx ? &s->outtex[idx - 1] : source;
+    av_log(NULL, AV_LOG_WARNING, "Output resolution %dx%d differs too much from the previous level %dx%d, "
+        "might cause artificial\n", target->w, target->h, fallback->w, fallback->h);
+
+    return fallback;
+}
+
+/**
+ * Process one input frame: make it available on the GPU, produce every active
+ * output from it and send the results downstream.
+ *
+ * The input is either referenced in place (when AVFrame.opaque carries a valid
+ * ffnvinfo with a device pointer) or uploaded into s->intex, which is
+ * (re)allocated whenever the frame size differs from the current buffer.
+ * Each output frame gets an ffnvinfo describing its device planes in
+ * AVFrame.opaque; with the "readback" option the pixels are also copied into
+ * the host frame.
+ *
+ * @param inlink input link the frame arrived on
+ * @param in     input frame; always consumed by this function
+ * @return 0 on success, a negative error code otherwise
+ */
+static int filter_frame(AVFilterLink *inlink, AVFrame *in)
+{
+    AVFilterContext *ctx = inlink->dst;
+    NVResizeContext *s = ctx->priv;
+    int i;
+    int ret = 0;
+    cu_tex* resize_src = NULL;
+    ffnvinfo* info;
+
+    // copy input to gpu
+    if (in->opaque && check_nvinfo(in->opaque) && ((ffnvinfo*)(in->opaque))->dptr[0]) {
+        // The frame already lives in device memory: just reference it.
+        info = (ffnvinfo*)in->opaque;
+        s->intex.dptr = info->dptr[0];
+        s->intex.pitch = info->linesize[0];
+        s->intex.w = in->width;
+        s->intex.h = in->height;
+    }
+    else {
+        // (Re)create the upload buffer when the frame size changed. Width is
+        // compared with width and height with height, and an existing buffer
+        // is released before it is replaced.
+        // NOTE(review): if intex.dptr was borrowed from an earlier ffnvinfo
+        // frame it is not owned by this filter -- confirm mixed input cannot occur.
+        if (in->width != s->intex.w || in->height != s->intex.h) {
+            if (s->intex.dptr) {
+                __cu(s->cu_dl_func->cu_mem_free(s->intex.dptr));
+            }
+            s->intex.w = in->width;
+            s->intex.h = in->height;
+            s->intex.dptr = (CUdeviceptr)NULL;
+        }
+        if (!s->intex.dptr) {
+            switch (in->format) {
+            case AV_PIX_FMT_YUV420P:
+            case AV_PIX_FMT_NV12:
+                __cu(s->cu_dl_func->cu_mem_alloc_pitch(&s->intex.dptr, &s->intex.pitch, s->intex.w, s->intex.h*3/2, 16));
+                break;
+            case AV_PIX_FMT_YUV444P:
+                __cu(s->cu_dl_func->cu_mem_alloc_pitch(&s->intex.dptr, &s->intex.pitch, s->intex.w, s->intex.h*3, 16));
+                break;
+            case AV_PIX_FMT_ARGB:
+            case AV_PIX_FMT_RGBA:
+            case AV_PIX_FMT_ABGR:
+            case AV_PIX_FMT_BGRA:
+                __cu(s->cu_dl_func->cu_mem_alloc_pitch(&s->intex.dptr, &s->intex.pitch, s->intex.w*4, s->intex.h, 16));
+                break;
+            default:
+                av_log(NULL, AV_LOG_FATAL, "Unsupported input format: %s!\n", av_get_pix_fmt_name(in->format));
+                // this function owns the input frame, so release it on error too
+                av_frame_free(&in);
+                return AVERROR(EINVAL);
+            }
+        }
+        if (copy_from_avframe(s, in, &s->intex) < 0) {
+            av_frame_free(&in);
+            return AVERROR(EINVAL);
+        }
+    }
+
+    for (i = 0; i < ctx->nb_outputs; i++) {
+        cu_tex *dst = &s->outtex[i];
+        AVFrame *out;
+
+        // skip outputs that have been closed downstream
+        if (ctx->outputs[i]->status)
+            continue;
+
+        out = ff_get_video_buffer(ctx->outputs[i], ctx->outputs[i]->w, ctx->outputs[i]->h);
+        if (!out) {
+            ret = AVERROR(ENOMEM);
+            break;
+        }
+        av_frame_copy_props(out, in);
+
+        // Resample from the input or from a previous output of suitable size.
+        resize_src = find_resize_src(s, resize_src, dst);
+        ret = do_cuda_resize(s, resize_src, dst, in->format);
+        if (ret < 0) {
+            av_frame_free(&out);
+            break;
+        }
+
+        // Describe the device-side planes for consumers that can use them directly.
+        info = init_nvinfo();
+        if (!info) {
+            av_frame_free(&out);
+            ret = AVERROR(ENOMEM);
+            break;
+        }
+        switch (out->format) {
+        case AV_PIX_FMT_YUV444P:
+            info->dptr[0] = dst->dptr;
+            info->dptr[1] = dst->dptr + dst->pitch*dst->h;
+            info->dptr[2] = dst->dptr + dst->pitch*dst->h*2;
+            info->linesize[0] = info->linesize[1] = info->linesize[2] = dst->pitch;
+            break;
+
+        case AV_PIX_FMT_YUV420P:
+            info->dptr[0] = dst->dptr;
+            info->dptr[1] = dst->dptr + dst->pitch*dst->h;
+            info->dptr[2] = dst->dptr + dst->pitch*dst->h*5/4;
+            info->linesize[0] = dst->pitch;
+            info->linesize[1] = info->linesize[2] = dst->pitch/2;
+            break;
+
+        case AV_PIX_FMT_NV12:
+            info->dptr[0] = dst->dptr;
+            info->dptr[1] = dst->dptr + dst->pitch*dst->h;
+            info->linesize[0] = info->linesize[1] = dst->pitch;
+            break;
+
+        case AV_PIX_FMT_ARGB:
+        case AV_PIX_FMT_RGBA:
+        case AV_PIX_FMT_ABGR:
+        case AV_PIX_FMT_BGRA:
+            info->dptr[0] = dst->dptr;
+            info->linesize[0] = dst->pitch;
+            break;
+
+        default:
+            break;
+        }
+
+        out->opaque = (void*)info;
+        if (s->readback_FB && copy_to_avframe(s, dst, out) < 0) {
+            av_frame_free(&out);
+            ret = AVERROR(EINVAL);
+            break;
+        }
+
+        // ff_filter_frame() takes ownership of out; report its failure upstream.
+        ret = ff_filter_frame(ctx->outputs[i], out);
+        if (ret < 0)
+            break;
+    }
+
+    av_frame_free(&in);
+    return ret;
+}
+
+
+/**
+ * Release what the filter instance owns: the duplicated pad names, the
+ * per-output device buffers and the CUDA context reference.
+ *
+ * Safe to call after a partially failed init(): only pads that were actually
+ * inserted are touched, and device memory is freed only when the driver
+ * function table is available.
+ * NOTE(review): s->intex.dptr is not freed here because it may point to memory
+ * borrowed from an incoming frame -- confirm ownership before adding a free.
+ */
+static av_cold void uninit(AVFilterContext *ctx)
+{
+    NVResizeContext *s = ctx->priv;
+    int i;
+
+    // ctx->nb_outputs counts the pads really inserted, which can be fewer
+    // than s->nb_outputs when init() bailed out early.
+    for (i = 0; i < ctx->nb_outputs; i++)
+        av_freep(&ctx->output_pads[i].name);
+
+    if (s->cu_dl_func) {
+        for (i = 0; i < s->nb_outputs; i++) {
+            if (s->outtex[i].dptr) {
+                s->cu_dl_func->cu_mem_free(s->outtex[i].dptr);
+                // guard against a second free if uninit runs again
+                s->outtex[i].dptr = (CUdeviceptr)NULL;
+            }
+        }
+    }
+    if (s->cu_ctx)
+        release_cuda_context(&s->cu_ctx, s->gpu);
+
+    av_log(ctx, AV_LOG_INFO, "nvresize::uninit\n");
+}
+
+/* The single, fixed video input; every incoming frame goes to filter_frame(). */
+static const AVFilterPad nvresize_inputs[] = {
+    {
+        .type         = AVMEDIA_TYPE_VIDEO,
+        .name         = "default",
+        .filter_frame = filter_frame,
+    },
+    { NULL }
+};
+
+/*
+ * Filter definition. There is no static output list: the output pads are
+ * created in init() according to the "outputs" option, which is why the
+ * filter is flagged as having dynamic outputs.
+ */
+AVFilter ff_vf_nvresize = {
+    .name          = "nvresize",
+    .description   = NULL_IF_CONFIG_SMALL("GPU accelerated video resizer."),
+    .priv_size     = sizeof(NVResizeContext),
+    .priv_class    = &nvresize_class,
+    .flags         = AVFILTER_FLAG_DYNAMIC_OUTPUTS,
+    .init          = init,
+    .uninit        = uninit,
+    .query_formats = query_formats,
+    .inputs        = nvresize_inputs,
+    .outputs       = NULL,
+};
diff -ruN ffmpeg-orig/libavfilter/vf_nvresize_ptx.c ffmpeg/libavfilter/vf_nvresize_ptx.c
--- ffmpeg-orig/libavfilter/vf_nvresize_ptx.c 1970-01-01 03:00:00.000000000 +0300
+++ ffmpeg/libavfilter/vf_nvresize_ptx.c 2016-03-04 00:18:17.589433917 +0200
@@ -0,0 +1,659 @@
+// Select the PTX image matching the pointer size of the host build:
+// ENVIRONMENT64 for 64-bit targets, ENVIRONMENT32 otherwise.
+// Windows is decided by _WIN64 alone; the GCC-style test only runs for other
+// targets so that at most one of the two macros is ever defined.
+#if _WIN32 || _WIN64
+#if _WIN64
+#define ENVIRONMENT64
+#else
+#define ENVIRONMENT32
+#endif
+// Check GCC
+#elif __GNUC__
+// __LP64__ covers 64-bit targets beyond x86_64 (e.g. aarch64, ppc64le) that
+// the architecture-specific macros alone would classify as 32-bit.
+#if __x86_64__ || __ppc64__ || __LP64__
+#define ENVIRONMENT64
+#else
+#define ENVIRONMENT32
+#endif
+#endif
+#ifdef ENVIRONMENT32
+const char resize_ptx[] = \
+ "//\n"
+ "// Generated by NVIDIA NVVM Compiler\n"
+ "//\n"
+ "// Compiler Build ID: CL-19324607\n"
+ "// Cuda compilation tools, release 7.0, V7.0.27\n"
+ "// Based on LLVM 3.4svn\n"
+ "//\n"
+ "\n"
+ ".version 4.2\n"
+ ".target sm_30\n"
+ ".address_size 32\n"
+ "\n"
+ "// .globl Subsample_Bilinear_uchar\n"
+ ".global .texref uchar_tex;\n"
+ ".global .texref uchar2_tex;\n"
+ ".global .texref uchar4_tex;\n"
+ "\n"
+ ".visible .entry Subsample_Bilinear_uchar(\n"
+ ".param .u32 Subsample_Bilinear_uchar_param_0,\n"
+ ".param .u32 Subsample_Bilinear_uchar_param_1,\n"
+ ".param .u32 Subsample_Bilinear_uchar_param_2,\n"
+ ".param .u32 Subsample_Bilinear_uchar_param_3,\n"
+ ".param .u32 Subsample_Bilinear_uchar_param_4,\n"
+ ".param .u32 Subsample_Bilinear_uchar_param_5\n"
+ ")\n"
+ "{\n"
+ ".reg .pred %p<4>;\n"
+ ".reg .f32 %f<27>;\n"
+ ".reg .s32 %r<43>;\n"
+ ".reg .s64 %rd<2>;\n"
+ "\n"
+ "\n"
+ "ld.param.u32 %r3, [Subsample_Bilinear_uchar_param_0];\n"
+ "ld.param.u32 %r4, [Subsample_Bilinear_uchar_param_1];\n"
+ "ld.param.u32 %r5, [Subsample_Bilinear_uchar_param_2];\n"
+ "ld.param.u32 %r6, [Subsample_Bilinear_uchar_param_3];\n"
+ "ld.param.u32 %r7, [Subsample_Bilinear_uchar_param_4];\n"
+ "ld.param.u32 %r8, [Subsample_Bilinear_uchar_param_5];\n"
+ "mov.u32 %r9, %ctaid.x;\n"
+ "mov.u32 %r10, %ntid.x;\n"
+ "mov.u32 %r11, %tid.x;\n"
+ "mad.lo.s32 %r1, %r10, %r9, %r11;\n"
+ "mov.u32 %r12, %ntid.y;\n"
+ "mov.u32 %r13, %ctaid.y;\n"
+ "mov.u32 %r14, %tid.y;\n"
+ "mad.lo.s32 %r2, %r12, %r13, %r14;\n"
+ "setp.lt.s32 %p1, %r2, %r5;\n"
+ "setp.lt.s32 %p2, %r1, %r4;\n"
+ "and.pred %p3, %p1, %p2;\n"
+ "@!%p3 bra BB0_2;\n"
+ "bra.uni BB0_1;\n"
+ "\n"
+ "BB0_1:\n"
+ "cvta.to.global.u32 %r15, %r3;\n"
+ "cvt.rn.f32.s32 %f1, %r4;\n"
+ "cvt.rn.f32.s32 %f2, %r7;\n"
+ "div.rn.f32 %f3, %f2, %f1;\n"
+ "cvt.rn.f32.s32 %f4, %r5;\n"
+ "cvt.rn.f32.s32 %f5, %r8;\n"
+ "div.rn.f32 %f6, %f5, %f4;\n"
+ "cvt.rn.f32.s32 %f7, %r1;\n"
+ "add.f32 %f8, %f7, 0f3F000000;\n"
+ "mul.f32 %f9, %f8, %f3;\n"
+ "cvt.rn.f32.s32 %f10, %r2;\n"
+ "add.f32 %f11, %f10, 0f3F000000;\n"
+ "mul.f32 %f12, %f11, %f6;\n"
+ "add.f32 %f13, %f3, 0fBF800000;\n"
+ "mul.f32 %f14, %f13, 0f3F000000;\n"
+ "cvt.sat.f32.f32 %f15, %f14;\n"
+ "add.f32 %f16, %f6, 0fBF800000;\n"
+ "mul.f32 %f17, %f16, 0f3F000000;\n"
+ "cvt.sat.f32.f32 %f18, %f17;\n"
+ "add.f32 %f19, %f15, 0f3F000000;\n"
+ "div.rn.f32 %f20, %f15, %f19;\n"
+ "add.f32 %f21, %f18, 0f3F000000;\n"
+ "div.rn.f32 %f22, %f18, %f21;\n"
+ "sub.f32 %f23, %f9, %f20;\n"
+ "sub.f32 %f24, %f12, %f22;\n"
+ "tex.2d.v4.u32.f32 {%r16, %r17, %r18, %r19}, [uchar_tex, {%f23, %f24}];\n"
+ "and.b32 %r20, %r16, 255;\n"
+ "add.f32 %f25, %f9, %f20;\n"
+ "tex.2d.v4.u32.f32 {%r21, %r22, %r23, %r24}, [uchar_tex, {%f25, %f24}];\n"
+ "and.b32 %r25, %r21, 255;\n"
+ "add.f32 %f26, %f12, %f22;\n"
+ "tex.2d.v4.u32.f32 {%r26, %r27, %r28, %r29}, [uchar_tex, {%f23, %f26}];\n"
+ "and.b32 %r30, %r26, 255;\n"
+ "tex.2d.v4.u32.f32 {%r31, %r32, %r33, %r34}, [uchar_tex, {%f25, %f26}];\n"
+ "and.b32 %r35, %r31, 255;\n"
+ "add.s32 %r36, %r20, %r25;\n"
+ "add.s32 %r37, %r36, %r30;\n"
+ "add.s32 %r38, %r37, %r35;\n"
+ "add.s32 %r39, %r38, 2;\n"
+ "shr.u32 %r40, %r39, 2;\n"
+ "mad.lo.s32 %r41, %r2, %r6, %r1;\n"
+ "add.s32 %r42, %r15, %r41;\n"
+ "st.global.u8 [%r42], %r40;\n"
+ "\n"
+ "BB0_2:\n"
+ "ret;\n"
+ "}\n"
+ "\n"
+ "// .globl Subsample_Bilinear_uchar2\n"
+ ".visible .entry Subsample_Bilinear_uchar2(\n"
+ ".param .u32 Subsample_Bilinear_uchar2_param_0,\n"
+ ".param .u32 Subsample_Bilinear_uchar2_param_1,\n"
+ ".param .u32 Subsample_Bilinear_uchar2_param_2,\n"
+ ".param .u32 Subsample_Bilinear_uchar2_param_3,\n"
+ ".param .u32 Subsample_Bilinear_uchar2_param_4,\n"
+ ".param .u32 Subsample_Bilinear_uchar2_param_5\n"
+ ")\n"
+ "{\n"
+ ".reg .pred %p<4>;\n"
+ ".reg .s16 %rs<3>;\n"
+ ".reg .f32 %f<27>;\n"
+ ".reg .s32 %r<53>;\n"
+ ".reg .s64 %rd<2>;\n"
+ "\n"
+ "\n"
+ "ld.param.u32 %r3, [Subsample_Bilinear_uchar2_param_0];\n"
+ "ld.param.u32 %r4, [Subsample_Bilinear_uchar2_param_1];\n"
+ "ld.param.u32 %r5, [Subsample_Bilinear_uchar2_param_2];\n"
+ "ld.param.u32 %r6, [Subsample_Bilinear_uchar2_param_3];\n"
+ "ld.param.u32 %r7, [Subsample_Bilinear_uchar2_param_4];\n"
+ "ld.param.u32 %r8, [Subsample_Bilinear_uchar2_param_5];\n"
+ "mov.u32 %r9, %ctaid.x;\n"
+ "mov.u32 %r10, %ntid.x;\n"
+ "mov.u32 %r11, %tid.x;\n"
+ "mad.lo.s32 %r1, %r10, %r9, %r11;\n"
+ "mov.u32 %r12, %ntid.y;\n"
+ "mov.u32 %r13, %ctaid.y;\n"
+ "mov.u32 %r14, %tid.y;\n"
+ "mad.lo.s32 %r2, %r12, %r13, %r14;\n"
+ "setp.lt.s32 %p1, %r2, %r5;\n"
+ "setp.lt.s32 %p2, %r1, %r4;\n"
+ "and.pred %p3, %p1, %p2;\n"
+ "@!%p3 bra BB1_2;\n"
+ "bra.uni BB1_1;\n"
+ "\n"
+ "BB1_1:\n"
+ "cvta.to.global.u32 %r15, %r3;\n"
+ "cvt.rn.f32.s32 %f1, %r4;\n"
+ "cvt.rn.f32.s32 %f2, %r7;\n"
+ "div.rn.f32 %f3, %f2, %f1;\n"
+ "cvt.rn.f32.s32 %f4, %r5;\n"
+ "cvt.rn.f32.s32 %f5, %r8;\n"
+ "div.rn.f32 %f6, %f5, %f4;\n"
+ "cvt.rn.f32.s32 %f7, %r1;\n"
+ "add.f32 %f8, %f7, 0f3F000000;\n"
+ "mul.f32 %f9, %f8, %f3;\n"
+ "cvt.rn.f32.s32 %f10, %r2;\n"
+ "add.f32 %f11, %f10, 0f3F000000;\n"
+ "mul.f32 %f12, %f11, %f6;\n"
+ "add.f32 %f13, %f3, 0fBF800000;\n"
+ "mul.f32 %f14, %f13, 0f3F000000;\n"
+ "cvt.sat.f32.f32 %f15, %f14;\n"
+ "add.f32 %f16, %f6, 0fBF800000;\n"
+ "mul.f32 %f17, %f16, 0f3F000000;\n"
+ "cvt.sat.f32.f32 %f18, %f17;\n"
+ "add.f32 %f19, %f15, 0f3F000000;\n"
+ "div.rn.f32 %f20, %f15, %f19;\n"
+ "add.f32 %f21, %f18, 0f3F000000;\n"
+ "div.rn.f32 %f22, %f18, %f21;\n"
+ "sub.f32 %f23, %f9, %f20;\n"
+ "sub.f32 %f24, %f12, %f22;\n"
+ "tex.2d.v4.u32.f32 {%r16, %r17, %r18, %r19}, [uchar2_tex, {%f23, %f24}];\n"
+ "add.f32 %f25, %f9, %f20;\n"
+ "tex.2d.v4.u32.f32 {%r20, %r21, %r22, %r23}, [uchar2_tex, {%f25, %f24}];\n"
+ "add.f32 %f26, %f12, %f22;\n"
+ "tex.2d.v4.u32.f32 {%r24, %r25, %r26, %r27}, [uchar2_tex, {%f23, %f26}];\n"
+ "tex.2d.v4.u32.f32 {%r28, %r29, %r30, %r31}, [uchar2_tex, {%f25, %f26}];\n"
+ "and.b32 %r32, %r16, 255;\n"
+ "and.b32 %r33, %r20, 255;\n"
+ "and.b32 %r34, %r24, 255;\n"
+ "and.b32 %r35, %r28, 255;\n"
+ "add.s32 %r36, %r32, %r33;\n"
+ "add.s32 %r37, %r36, %r34;\n"
+ "add.s32 %r38, %r37, %r35;\n"
+ "add.s32 %r39, %r38, 2;\n"
+ "shr.u32 %r40, %r39, 2;\n"
+ "and.b32 %r41, %r17, 255;\n"
+ "and.b32 %r42, %r21, 255;\n"
+ "and.b32 %r43, %r25, 255;\n"
+ "and.b32 %r44, %r29, 255;\n"
+ "add.s32 %r45, %r41, %r42;\n"
+ "add.s32 %r46, %r45, %r43;\n"
+ "add.s32 %r47, %r46, %r44;\n"
+ "add.s32 %r48, %r47, 2;\n"
+ "shr.u32 %r49, %r48, 2;\n"
+ "mad.lo.s32 %r50, %r2, %r6, %r1;\n"
+ "shl.b32 %r51, %r50, 1;\n"
+ "add.s32 %r52, %r15, %r51;\n"
+ "cvt.u16.u32 %rs1, %r49;\n"
+ "cvt.u16.u32 %rs2, %r40;\n"
+ "st.global.v2.u8 [%r52], {%rs2, %rs1};\n"
+ "\n"
+ "BB1_2:\n"
+ "ret;\n"
+ "}\n"
+ "\n"
+ "// .globl Subsample_Bilinear_uchar4\n"
+ ".visible .entry Subsample_Bilinear_uchar4(\n"
+ ".param .u32 Subsample_Bilinear_uchar4_param_0,\n"
+ ".param .u32 Subsample_Bilinear_uchar4_param_1,\n"
+ ".param .u32 Subsample_Bilinear_uchar4_param_2,\n"
+ ".param .u32 Subsample_Bilinear_uchar4_param_3,\n"
+ ".param .u32 Subsample_Bilinear_uchar4_param_4,\n"
+ ".param .u32 Subsample_Bilinear_uchar4_param_5\n"
+ ")\n"
+ "{\n"
+ ".reg .pred %p<4>;\n"
+ ".reg .s16 %rs<5>;\n"
+ ".reg .f32 %f<27>;\n"
+ ".reg .s32 %r<71>;\n"
+ ".reg .s64 %rd<2>;\n"
+ "\n"
+ "\n"
+ "ld.param.u32 %r3, [Subsample_Bilinear_uchar4_param_0];\n"
+ "ld.param.u32 %r4, [Subsample_Bilinear_uchar4_param_1];\n"
+ "ld.param.u32 %r5, [Subsample_Bilinear_uchar4_param_2];\n"
+ "ld.param.u32 %r6, [Subsample_Bilinear_uchar4_param_3];\n"
+ "ld.param.u32 %r7, [Subsample_Bilinear_uchar4_param_4];\n"
+ "ld.param.u32 %r8, [Subsample_Bilinear_uchar4_param_5];\n"
+ "mov.u32 %r9, %ctaid.x;\n"
+ "mov.u32 %r10, %ntid.x;\n"
+ "mov.u32 %r11, %tid.x;\n"
+ "mad.lo.s32 %r1, %r10, %r9, %r11;\n"
+ "mov.u32 %r12, %ntid.y;\n"
+ "mov.u32 %r13, %ctaid.y;\n"
+ "mov.u32 %r14, %tid.y;\n"
+ "mad.lo.s32 %r2, %r12, %r13, %r14;\n"
+ "setp.lt.s32 %p1, %r2, %r5;\n"
+ "setp.lt.s32 %p2, %r1, %r4;\n"
+ "and.pred %p3, %p1, %p2;\n"
+ "@!%p3 bra BB2_2;\n"
+ "bra.uni BB2_1;\n"
+ "\n"
+ "BB2_1:\n"
+ "cvta.to.global.u32 %r15, %r3;\n"
+ "cvt.rn.f32.s32 %f1, %r4;\n"
+ "cvt.rn.f32.s32 %f2, %r7;\n"
+ "div.rn.f32 %f3, %f2, %f1;\n"
+ "cvt.rn.f32.s32 %f4, %r5;\n"
+ "cvt.rn.f32.s32 %f5, %r8;\n"
+ "div.rn.f32 %f6, %f5, %f4;\n"
+ "cvt.rn.f32.s32 %f7, %r1;\n"
+ "add.f32 %f8, %f7, 0f3F000000;\n"
+ "mul.f32 %f9, %f8, %f3;\n"
+ "cvt.rn.f32.s32 %f10, %r2;\n"
+ "add.f32 %f11, %f10, 0f3F000000;\n"
+ "mul.f32 %f12, %f11, %f6;\n"
+ "add.f32 %f13, %f3, 0fBF800000;\n"
+ "mul.f32 %f14, %f13, 0f3F000000;\n"
+ "cvt.sat.f32.f32 %f15, %f14;\n"
+ "add.f32 %f16, %f6, 0fBF800000;\n"
+ "mul.f32 %f17, %f16, 0f3F000000;\n"
+ "cvt.sat.f32.f32 %f18, %f17;\n"
+ "add.f32 %f19, %f15, 0f3F000000;\n"
+ "div.rn.f32 %f20, %f15, %f19;\n"
+ "add.f32 %f21, %f18, 0f3F000000;\n"
+ "div.rn.f32 %f22, %f18, %f21;\n"
+ "sub.f32 %f23, %f9, %f20;\n"
+ "sub.f32 %f24, %f12, %f22;\n"
+ "tex.2d.v4.u32.f32 {%r16, %r17, %r18, %r19}, [uchar4_tex, {%f23, %f24}];\n"
+ "add.f32 %f25, %f9, %f20;\n"
+ "tex.2d.v4.u32.f32 {%r20, %r21, %r22, %r23}, [uchar4_tex, {%f25, %f24}];\n"
+ "add.f32 %f26, %f12, %f22;\n"
+ "tex.2d.v4.u32.f32 {%r24, %r25, %r26, %r27}, [uchar4_tex, {%f23, %f26}];\n"
+ "tex.2d.v4.u32.f32 {%r28, %r29, %r30, %r31}, [uchar4_tex, {%f25, %f26}];\n"
+ "and.b32 %r32, %r16, 255;\n"
+ "and.b32 %r33, %r20, 255;\n"
+ "and.b32 %r34, %r24, 255;\n"
+ "and.b32 %r35, %r28, 255;\n"
+ "add.s32 %r36, %r32, %r33;\n"
+ "add.s32 %r37, %r36, %r34;\n"
+ "add.s32 %r38, %r37, %r35;\n"
+ "add.s32 %r39, %r38, 2;\n"
+ "shr.u32 %r40, %r39, 2;\n"
+ "and.b32 %r41, %r17, 255;\n"
+ "and.b32 %r42, %r21, 255;\n"
+ "and.b32 %r43, %r25, 255;\n"
+ "and.b32 %r44, %r29, 255;\n"
+ "add.s32 %r45, %r41, %r42;\n"
+ "add.s32 %r46, %r45, %r43;\n"
+ "add.s32 %r47, %r46, %r44;\n"
+ "add.s32 %r48, %r47, 2;\n"
+ "shr.u32 %r49, %r48, 2;\n"
+ "and.b32 %r50, %r18, 255;\n"
+ "and.b32 %r51, %r22, 255;\n"
+ "and.b32 %r52, %r26, 255;\n"
+ "and.b32 %r53, %r30, 255;\n"
+ "add.s32 %r54, %r50, %r51;\n"
+ "add.s32 %r55, %r54, %r52;\n"
+ "add.s32 %r56, %r55, %r53;\n"
+ "add.s32 %r57, %r56, 2;\n"
+ "shr.u32 %r58, %r57, 2;\n"
+ "and.b32 %r59, %r19, 255;\n"
+ "and.b32 %r60, %r23, 255;\n"
+ "and.b32 %r61, %r27, 255;\n"
+ "and.b32 %r62, %r31, 255;\n"
+ "add.s32 %r63, %r59, %r60;\n"
+ "add.s32 %r64, %r63, %r61;\n"
+ "add.s32 %r65, %r64, %r62;\n"
+ "add.s32 %r66, %r65, 2;\n"
+ "shr.u32 %r67, %r66, 2;\n"
+ "mad.lo.s32 %r68, %r2, %r6, %r1;\n"
+ "shl.b32 %r69, %r68, 2;\n"
+ "add.s32 %r70, %r15, %r69;\n"
+ "cvt.u16.u32 %rs1, %r67;\n"
+ "cvt.u16.u32 %rs2, %r58;\n"
+ "cvt.u16.u32 %rs3, %r49;\n"
+ "cvt.u16.u32 %rs4, %r40;\n"
+ "st.global.v4.u8 [%r70], {%rs4, %rs3, %rs2, %rs1};\n"
+ "\n"
+ "BB2_2:\n"
+ "ret;\n"
+ "}\n"
+ "\n"
+ "\n"
+;
+#elif defined ENVIRONMENT64
+const char resize_ptx[] = \
+ "//\n"
+ "// Generated by NVIDIA NVVM Compiler\n"
+ "//\n"
+ "// Compiler Build ID: CL-19324607\n"
+ "// Cuda compilation tools, release 7.0, V7.0.27\n"
+ "// Based on LLVM 3.4svn\n"
+ "//\n"
+ "\n"
+ ".version 4.2\n"
+ ".target sm_30\n"
+ ".address_size 64\n"
+ "\n"
+ "// .globl Subsample_Bilinear_uchar\n"
+ ".global .texref uchar_tex;\n"
+ ".global .texref uchar2_tex;\n"
+ ".global .texref uchar4_tex;\n"
+ "\n"
+ ".visible .entry Subsample_Bilinear_uchar(\n"
+ ".param .u64 Subsample_Bilinear_uchar_param_0,\n"
+ ".param .u32 Subsample_Bilinear_uchar_param_1,\n"
+ ".param .u32 Subsample_Bilinear_uchar_param_2,\n"
+ ".param .u32 Subsample_Bilinear_uchar_param_3,\n"
+ ".param .u32 Subsample_Bilinear_uchar_param_4,\n"
+ ".param .u32 Subsample_Bilinear_uchar_param_5\n"
+ ")\n"
+ "{\n"
+ ".reg .pred %p<4>;\n"
+ ".reg .f32 %f<27>;\n"
+ ".reg .s32 %r<40>;\n"
+ ".reg .s64 %rd<6>;\n"
+ "\n"
+ "\n"
+ "ld.param.u64 %rd1, [Subsample_Bilinear_uchar_param_0];\n"
+ "ld.param.u32 %r3, [Subsample_Bilinear_uchar_param_1];\n"
+ "ld.param.u32 %r4, [Subsample_Bilinear_uchar_param_2];\n"
+ "ld.param.u32 %r5, [Subsample_Bilinear_uchar_param_3];\n"
+ "ld.param.u32 %r6, [Subsample_Bilinear_uchar_param_4];\n"
+ "ld.param.u32 %r7, [Subsample_Bilinear_uchar_param_5];\n"
+ "mov.u32 %r8, %ctaid.x;\n"
+ "mov.u32 %r9, %ntid.x;\n"
+ "mov.u32 %r10, %tid.x;\n"
+ "mad.lo.s32 %r1, %r9, %r8, %r10;\n"
+ "mov.u32 %r11, %ntid.y;\n"
+ "mov.u32 %r12, %ctaid.y;\n"
+ "mov.u32 %r13, %tid.y;\n"
+ "mad.lo.s32 %r2, %r11, %r12, %r13;\n"
+ "setp.lt.s32 %p1, %r2, %r4;\n"
+ "setp.lt.s32 %p2, %r1, %r3;\n"
+ "and.pred %p3, %p1, %p2;\n"
+ "@!%p3 bra BB0_2;\n"
+ "bra.uni BB0_1;\n"
+ "\n"
+ "BB0_1:\n"
+ "cvta.to.global.u64 %rd2, %rd1;\n"
+ "cvt.rn.f32.s32 %f1, %r3;\n"
+ "cvt.rn.f32.s32 %f2, %r6;\n"
+ "div.rn.f32 %f3, %f2, %f1;\n"
+ "cvt.rn.f32.s32 %f4, %r4;\n"
+ "cvt.rn.f32.s32 %f5, %r7;\n"
+ "div.rn.f32 %f6, %f5, %f4;\n"
+ "cvt.rn.f32.s32 %f7, %r1;\n"
+ "add.f32 %f8, %f7, 0f3F000000;\n"
+ "mul.f32 %f9, %f8, %f3;\n"
+ "cvt.rn.f32.s32 %f10, %r2;\n"
+ "add.f32 %f11, %f10, 0f3F000000;\n"
+ "mul.f32 %f12, %f11, %f6;\n"
+ "add.f32 %f13, %f3, 0fBF800000;\n"
+ "mul.f32 %f14, %f13, 0f3F000000;\n"
+ "cvt.sat.f32.f32 %f15, %f14;\n"
+ "add.f32 %f16, %f6, 0fBF800000;\n"
+ "mul.f32 %f17, %f16, 0f3F000000;\n"
+ "cvt.sat.f32.f32 %f18, %f17;\n"
+ "add.f32 %f19, %f15, 0f3F000000;\n"
+ "div.rn.f32 %f20, %f15, %f19;\n"
+ "add.f32 %f21, %f18, 0f3F000000;\n"
+ "div.rn.f32 %f22, %f18, %f21;\n"
+ "sub.f32 %f23, %f9, %f20;\n"
+ "sub.f32 %f24, %f12, %f22;\n"
+ "tex.2d.v4.u32.f32 {%r14, %r15, %r16, %r17}, [uchar_tex, {%f23, %f24}];\n"
+ "and.b32 %r18, %r14, 255;\n"
+ "add.f32 %f25, %f9, %f20;\n"
+ "tex.2d.v4.u32.f32 {%r19, %r20, %r21, %r22}, [uchar_tex, {%f25, %f24}];\n"
+ "and.b32 %r23, %r19, 255;\n"
+ "add.f32 %f26, %f12, %f22;\n"
+ "tex.2d.v4.u32.f32 {%r24, %r25, %r26, %r27}, [uchar_tex, {%f23, %f26}];\n"
+ "and.b32 %r28, %r24, 255;\n"
+ "tex.2d.v4.u32.f32 {%r29, %r30, %r31, %r32}, [uchar_tex, {%f25, %f26}];\n"
+ "and.b32 %r33, %r29, 255;\n"
+ "add.s32 %r34, %r18, %r23;\n"
+ "add.s32 %r35, %r34, %r28;\n"
+ "add.s32 %r36, %r35, %r33;\n"
+ "add.s32 %r37, %r36, 2;\n"
+ "shr.u32 %r38, %r37, 2;\n"
+ "mad.lo.s32 %r39, %r2, %r5, %r1;\n"
+ "cvt.s64.s32 %rd4, %r39;\n"
+ "add.s64 %rd5, %rd2, %rd4;\n"
+ "st.global.u8 [%rd5], %r38;\n"
+ "\n"
+ "BB0_2:\n"
+ "ret;\n"
+ "}\n"
+ "\n"
+ "// .globl Subsample_Bilinear_uchar2\n"
+ ".visible .entry Subsample_Bilinear_uchar2(\n"
+ ".param .u64 Subsample_Bilinear_uchar2_param_0,\n"
+ ".param .u32 Subsample_Bilinear_uchar2_param_1,\n"
+ ".param .u32 Subsample_Bilinear_uchar2_param_2,\n"
+ ".param .u32 Subsample_Bilinear_uchar2_param_3,\n"
+ ".param .u32 Subsample_Bilinear_uchar2_param_4,\n"
+ ".param .u32 Subsample_Bilinear_uchar2_param_5\n"
+ ")\n"
+ "{\n"
+ ".reg .pred %p<4>;\n"
+ ".reg .s16 %rs<3>;\n"
+ ".reg .f32 %f<27>;\n"
+ ".reg .s32 %r<49>;\n"
+ ".reg .s64 %rd<6>;\n"
+ "\n"
+ "\n"
+ "ld.param.u64 %rd1, [Subsample_Bilinear_uchar2_param_0];\n"
+ "ld.param.u32 %r3, [Subsample_Bilinear_uchar2_param_1];\n"
+ "ld.param.u32 %r4, [Subsample_Bilinear_uchar2_param_2];\n"
+ "ld.param.u32 %r5, [Subsample_Bilinear_uchar2_param_3];\n"
+ "ld.param.u32 %r6, [Subsample_Bilinear_uchar2_param_4];\n"
+ "ld.param.u32 %r7, [Subsample_Bilinear_uchar2_param_5];\n"
+ "mov.u32 %r8, %ctaid.x;\n"
+ "mov.u32 %r9, %ntid.x;\n"
+ "mov.u32 %r10, %tid.x;\n"
+ "mad.lo.s32 %r1, %r9, %r8, %r10;\n"
+ "mov.u32 %r11, %ntid.y;\n"
+ "mov.u32 %r12, %ctaid.y;\n"
+ "mov.u32 %r13, %tid.y;\n"
+ "mad.lo.s32 %r2, %r11, %r12, %r13;\n"
+ "setp.lt.s32 %p1, %r2, %r4;\n"
+ "setp.lt.s32 %p2, %r1, %r3;\n"
+ "and.pred %p3, %p1, %p2;\n"
+ "@!%p3 bra BB1_2;\n"
+ "bra.uni BB1_1;\n"
+ "\n"
+ "BB1_1:\n"
+ "cvta.to.global.u64 %rd2, %rd1;\n"
+ "cvt.rn.f32.s32 %f1, %r3;\n"
+ "cvt.rn.f32.s32 %f2, %r6;\n"
+ "div.rn.f32 %f3, %f2, %f1;\n"
+ "cvt.rn.f32.s32 %f4, %r4;\n"
+ "cvt.rn.f32.s32 %f5, %r7;\n"
+ "div.rn.f32 %f6, %f5, %f4;\n"
+ "cvt.rn.f32.s32 %f7, %r1;\n"
+ "add.f32 %f8, %f7, 0f3F000000;\n"
+ "mul.f32 %f9, %f8, %f3;\n"
+ "cvt.rn.f32.s32 %f10, %r2;\n"
+ "add.f32 %f11, %f10, 0f3F000000;\n"
+ "mul.f32 %f12, %f11, %f6;\n"
+ "add.f32 %f13, %f3, 0fBF800000;\n"
+ "mul.f32 %f14, %f13, 0f3F000000;\n"
+ "cvt.sat.f32.f32 %f15, %f14;\n"
+ "add.f32 %f16, %f6, 0fBF800000;\n"
+ "mul.f32 %f17, %f16, 0f3F000000;\n"
+ "cvt.sat.f32.f32 %f18, %f17;\n"
+ "add.f32 %f19, %f15, 0f3F000000;\n"
+ "div.rn.f32 %f20, %f15, %f19;\n"
+ "add.f32 %f21, %f18, 0f3F000000;\n"
+ "div.rn.f32 %f22, %f18, %f21;\n"
+ "sub.f32 %f23, %f9, %f20;\n"
+ "sub.f32 %f24, %f12, %f22;\n"
+ "tex.2d.v4.u32.f32 {%r14, %r15, %r16, %r17}, [uchar2_tex, {%f23, %f24}];\n"
+ "add.f32 %f25, %f9, %f20;\n"
+ "tex.2d.v4.u32.f32 {%r18, %r19, %r20, %r21}, [uchar2_tex, {%f25, %f24}];\n"
+ "add.f32 %f26, %f12, %f22;\n"
+ "tex.2d.v4.u32.f32 {%r22, %r23, %r24, %r25}, [uchar2_tex, {%f23, %f26}];\n"
+ "tex.2d.v4.u32.f32 {%r26, %r27, %r28, %r29}, [uchar2_tex, {%f25, %f26}];\n"
+ "and.b32 %r30, %r14, 255;\n"
+ "and.b32 %r31, %r18, 255;\n"
+ "and.b32 %r32, %r22, 255;\n"
+ "and.b32 %r33, %r26, 255;\n"
+ "add.s32 %r34, %r30, %r31;\n"
+ "add.s32 %r35, %r34, %r32;\n"
+ "add.s32 %r36, %r35, %r33;\n"
+ "add.s32 %r37, %r36, 2;\n"
+ "shr.u32 %r38, %r37, 2;\n"
+ "and.b32 %r39, %r15, 255;\n"
+ "and.b32 %r40, %r19, 255;\n"
+ "and.b32 %r41, %r23, 255;\n"
+ "and.b32 %r42, %r27, 255;\n"
+ "add.s32 %r43, %r39, %r40;\n"
+ "add.s32 %r44, %r43, %r41;\n"
+ "add.s32 %r45, %r44, %r42;\n"
+ "add.s32 %r46, %r45, 2;\n"
+ "shr.u32 %r47, %r46, 2;\n"
+ "mad.lo.s32 %r48, %r2, %r5, %r1;\n"
+ "mul.wide.s32 %rd4, %r48, 2;\n"
+ "add.s64 %rd5, %rd2, %rd4;\n"
+ "cvt.u16.u32 %rs1, %r47;\n"
+ "cvt.u16.u32 %rs2, %r38;\n"
+ "st.global.v2.u8 [%rd5], {%rs2, %rs1};\n"
+ "\n"
+ "BB1_2:\n"
+ "ret;\n"
+ "}\n"
+ "\n"
+ "// .globl Subsample_Bilinear_uchar4\n"
+ ".visible .entry Subsample_Bilinear_uchar4(\n"
+ ".param .u64 Subsample_Bilinear_uchar4_param_0,\n"
+ ".param .u32 Subsample_Bilinear_uchar4_param_1,\n"
+ ".param .u32 Subsample_Bilinear_uchar4_param_2,\n"
+ ".param .u32 Subsample_Bilinear_uchar4_param_3,\n"
+ ".param .u32 Subsample_Bilinear_uchar4_param_4,\n"
+ ".param .u32 Subsample_Bilinear_uchar4_param_5\n"
+ ")\n"
+ "{\n"
+ ".reg .pred %p<4>;\n"
+ ".reg .s16 %rs<5>;\n"
+ ".reg .f32 %f<27>;\n"
+ ".reg .s32 %r<67>;\n"
+ ".reg .s64 %rd<6>;\n"
+ "\n"
+ "\n"
+ "ld.param.u64 %rd1, [Subsample_Bilinear_uchar4_param_0];\n"
+ "ld.param.u32 %r3, [Subsample_Bilinear_uchar4_param_1];\n"
+ "ld.param.u32 %r4, [Subsample_Bilinear_uchar4_param_2];\n"
+ "ld.param.u32 %r5, [Subsample_Bilinear_uchar4_param_3];\n"
+ "ld.param.u32 %r6, [Subsample_Bilinear_uchar4_param_4];\n"
+ "ld.param.u32 %r7, [Subsample_Bilinear_uchar4_param_5];\n"
+ "mov.u32 %r8, %ctaid.x;\n"
+ "mov.u32 %r9, %ntid.x;\n"
+ "mov.u32 %r10, %tid.x;\n"
+ "mad.lo.s32 %r1, %r9, %r8, %r10;\n"
+ "mov.u32 %r11, %ntid.y;\n"
+ "mov.u32 %r12, %ctaid.y;\n"
+ "mov.u32 %r13, %tid.y;\n"
+ "mad.lo.s32 %r2, %r11, %r12, %r13;\n"
+ "setp.lt.s32 %p1, %r2, %r4;\n"
+ "setp.lt.s32 %p2, %r1, %r3;\n"
+ "and.pred %p3, %p1, %p2;\n"
+ "@!%p3 bra BB2_2;\n"
+ "bra.uni BB2_1;\n"
+ "\n"
+ "BB2_1:\n"
+ "cvta.to.global.u64 %rd2, %rd1;\n"
+ "cvt.rn.f32.s32 %f1, %r3;\n"
+ "cvt.rn.f32.s32 %f2, %r6;\n"
+ "div.rn.f32 %f3, %f2, %f1;\n"
+ "cvt.rn.f32.s32 %f4, %r4;\n"
+ "cvt.rn.f32.s32 %f5, %r7;\n"
+ "div.rn.f32 %f6, %f5, %f4;\n"
+ "cvt.rn.f32.s32 %f7, %r1;\n"
+ "add.f32 %f8, %f7, 0f3F000000;\n"
+ "mul.f32 %f9, %f8, %f3;\n"
+ "cvt.rn.f32.s32 %f10, %r2;\n"
+ "add.f32 %f11, %f10, 0f3F000000;\n"
+ "mul.f32 %f12, %f11, %f6;\n"
+ "add.f32 %f13, %f3, 0fBF800000;\n"
+ "mul.f32 %f14, %f13, 0f3F000000;\n"
+ "cvt.sat.f32.f32 %f15, %f14;\n"
+ "add.f32 %f16, %f6, 0fBF800000;\n"
+ "mul.f32 %f17, %f16, 0f3F000000;\n"
+ "cvt.sat.f32.f32 %f18, %f17;\n"
+ "add.f32 %f19, %f15, 0f3F000000;\n"
+ "div.rn.f32 %f20, %f15, %f19;\n"
+ "add.f32 %f21, %f18, 0f3F000000;\n"
+ "div.rn.f32 %f22, %f18, %f21;\n"
+ "sub.f32 %f23, %f9, %f20;\n"
+ "sub.f32 %f24, %f12, %f22;\n"
+ "tex.2d.v4.u32.f32 {%r14, %r15, %r16, %r17}, [uchar4_tex, {%f23, %f24}];\n"
+ "add.f32 %f25, %f9, %f20;\n"
+ "tex.2d.v4.u32.f32 {%r18, %r19, %r20, %r21}, [uchar4_tex, {%f25, %f24}];\n"
+ "add.f32 %f26, %f12, %f22;\n"
+ "tex.2d.v4.u32.f32 {%r22, %r23, %r24, %r25}, [uchar4_tex, {%f23, %f26}];\n"
+ "tex.2d.v4.u32.f32 {%r26, %r27, %r28, %r29}, [uchar4_tex, {%f25, %f26}];\n"
+ "and.b32 %r30, %r14, 255;\n"
+ "and.b32 %r31, %r18, 255;\n"
+ "and.b32 %r32, %r22, 255;\n"
+ "and.b32 %r33, %r26, 255;\n"
+ "add.s32 %r34, %r30, %r31;\n"
+ "add.s32 %r35, %r34, %r32;\n"
+ "add.s32 %r36, %r35, %r33;\n"
+ "add.s32 %r37, %r36, 2;\n"
+ "shr.u32 %r38, %r37, 2;\n"
+ "and.b32 %r39, %r15, 255;\n"
+ "and.b32 %r40, %r19, 255;\n"
+ "and.b32 %r41, %r23, 255;\n"
+ "and.b32 %r42, %r27, 255;\n"
+ "add.s32 %r43, %r39, %r40;\n"
+ "add.s32 %r44, %r43, %r41;\n"
+ "add.s32 %r45, %r44, %r42;\n"
+ "add.s32 %r46, %r45, 2;\n"
+ "shr.u32 %r47, %r46, 2;\n"
+ "and.b32 %r48, %r16, 255;\n"
+ "and.b32 %r49, %r20, 255;\n"
+ "and.b32 %r50, %r24, 255;\n"
+ "and.b32 %r51, %r28, 255;\n"
+ "add.s32 %r52, %r48, %r49;\n"
+ "add.s32 %r53, %r52, %r50;\n"
+ "add.s32 %r54, %r53, %r51;\n"
+ "add.s32 %r55, %r54, 2;\n"
+ "shr.u32 %r56, %r55, 2;\n"
+ "and.b32 %r57, %r17, 255;\n"
+ "and.b32 %r58, %r21, 255;\n"
+ "and.b32 %r59, %r25, 255;\n"
+ "and.b32 %r60, %r29, 255;\n"
+ "add.s32 %r61, %r57, %r58;\n"
+ "add.s32 %r62, %r61, %r59;\n"
+ "add.s32 %r63, %r62, %r60;\n"
+ "add.s32 %r64, %r63, 2;\n"
+ "shr.u32 %r65, %r64, 2;\n"
+ "mad.lo.s32 %r66, %r2, %r5, %r1;\n"
+ "mul.wide.s32 %rd4, %r66, 4;\n"
+ "add.s64 %rd5, %rd2, %rd4;\n"
+ "cvt.u16.u32 %rs1, %r65;\n"
+ "cvt.u16.u32 %rs2, %r56;\n"
+ "cvt.u16.u32 %rs3, %r47;\n"
+ "cvt.u16.u32 %rs4, %r38;\n"
+ "st.global.v4.u8 [%rd5], {%rs4, %rs3, %rs2, %rs1};\n"
+ "\n"
+ "BB2_2:\n"
+ "ret;\n"
+ "}\n"
+ "\n"
+ "\n"
+;
+#endif
diff -ruN ffmpeg-orig/libavutil/cudautils.c ffmpeg/libavutil/cudautils.c
--- ffmpeg-orig/libavutil/cudautils.c 1970-01-01 03:00:00.000000000 +0300
+++ ffmpeg/libavutil/cudautils.c 2016-03-04 00:18:17.589433917 +0200
@@ -0,0 +1,288 @@
+/*
+*
+* This file is part of FFmpeg.
+*
+* FFmpeg is free software; you can redistribute it and/or
+* modify it under the terms of the GNU Lesser General Public
+* License as published by the Free Software Foundation; either
+* version 2.1 of the License, or (at your option) any later version.
+*
+* FFmpeg is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+* Lesser General Public License for more details.
+*
+* You should have received a copy of the GNU Lesser General Public
+* License along with FFmpeg; if not, write to the Free Software
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+*/
+
+#include "cudautils.h"
+#include "common.h"
+#include "log.h"
+
+#define FF_NVINFO_VERSION 1
+static NVGUID NV_INFO_GUID = { 0x2cab9a64, 0x7095, 0x11e5, { 0xad, 0x1d, 0x94, 0xde, 0x80, 0x65, 0xb7, 0x74 } };
+static CudaContext cudaCtx = { { NULL }, { 0 }, { NULL }, 0 , { NULL }, { "" }, { 0 } };
+
+int dyload_cuda(void);
+int check_cuda(void);
+int check_cuda_errors(CUresult err, const char *func);
+
+
+/*
+ * Resolve symbol `s` from the opened CUDA library (dl_fn->cuda_lib), cast it
+ * to the function-pointer type `t` and store it in `f`.
+ *
+ * When the symbol is missing, a fatal message is logged and control jumps to
+ * the `error` label. The macro can therefore only be expanded inside a
+ * function that has a local `dl_fn` and an `error:` label.
+ */
+#define CHECK_LOAD_FUNC(t, f, s) \
+do { \
+ (f) = (t)LOAD_FUNC(dl_fn->cuda_lib, s); \
+ if (!(f)) { \
+ av_log(NULL, AV_LOG_FATAL, "Failed loading %s from CUDA library\n", s); \
+ goto error; \
+ } \
+} while (0)
+
+/**
+ * Report the outcome of a CUDA driver call.
+ *
+ * @param err  result code returned by the driver call
+ * @param func textual form of the call, used only in the log message
+ * @return 1 when err is CUDA_SUCCESS, 0 otherwise (a fatal message is logged)
+ */
+int check_cuda_errors(CUresult err, const char *func)
+{
+    if (err != CUDA_SUCCESS) {
+        /* %x expects an unsigned int; the enum's underlying type is not guaranteed */
+        av_log(NULL, AV_LOG_FATAL, ">> %s - failed with error code 0x%x\n", func, (unsigned)err);
+        return 0;
+    }
+    return 1;
+}
+/*
+ * Statement form of the check: evaluates the driver call `f` and jumps to the
+ * enclosing function's `error` label when it fails. Wrapped in do/while(0) so
+ * that it behaves as a single statement, including inside an unbraced
+ * if/else. The inner name is not re-expanded, so it calls the function above.
+ */
+#define check_cuda_errors(f) \
+    do { \
+        if (!check_cuda_errors(f, #f)) \
+            goto error; \
+    } while (0)
+
+
+/**
+ * Open the CUDA driver library and resolve every entry point stored in
+ * cudaCtx.cuda_dload_funcs.
+ *
+ * Does nothing when a library handle is already held. On any failure the
+ * library is closed again and the whole function table is cleared, so no
+ * pointer into an unloaded library is left behind.
+ *
+ * @return 1 on success (or when the library was already loaded), 0 on failure
+ */
+int dyload_cuda(void)
+{
+    CudaDynLoadFunctions *dl_fn = &cudaCtx.cuda_dload_funcs;
+
+    if (dl_fn->cuda_lib)
+        return 1;
+
+#if defined(_WIN32)
+    dl_fn->cuda_lib = LoadLibrary(TEXT("nvcuda.dll"));
+#else
+    dl_fn->cuda_lib = dlopen("libcuda.so", RTLD_LAZY);
+    /* The unversioned name is typically a development symlink; fall back to
+     * the versioned soname so that runtime-only driver installs work too. */
+    if (!dl_fn->cuda_lib)
+        dl_fn->cuda_lib = dlopen("libcuda.so.1", RTLD_LAZY);
+#endif
+
+    if (!dl_fn->cuda_lib) {
+        av_log(NULL, AV_LOG_FATAL, "Failed loading CUDA library\n");
+        goto error;
+    }
+
+    CHECK_LOAD_FUNC(PCUINIT, dl_fn->cu_init, "cuInit");
+    CHECK_LOAD_FUNC(PCUDEVICEGETCOUNT, dl_fn->cu_device_get_count, "cuDeviceGetCount");
+    CHECK_LOAD_FUNC(PCUDEVICEGET, dl_fn->cu_device_get, "cuDeviceGet");
+    CHECK_LOAD_FUNC(PCUDEVICEGETNAME, dl_fn->cu_device_get_name, "cuDeviceGetName");
+    CHECK_LOAD_FUNC(PCUDEVICECOMPUTECAPABILITY, dl_fn->cu_device_compute_capability, "cuDeviceComputeCapability");
+    CHECK_LOAD_FUNC(PCUCTXCREATE, dl_fn->cu_ctx_create, "cuCtxCreate_v2");
+    CHECK_LOAD_FUNC(PCUCTXPOPCURRENT, dl_fn->cu_ctx_pop_current, "cuCtxPopCurrent_v2");
+    CHECK_LOAD_FUNC(PCUCTXDESTROY, dl_fn->cu_ctx_destroy, "cuCtxDestroy_v2");
+    CHECK_LOAD_FUNC(PCUMODULELOADDATA, dl_fn->cu_module_load_data, "cuModuleLoadData");
+    CHECK_LOAD_FUNC(PCUMODULEGETFUNCTION, dl_fn->cu_module_get_function, "cuModuleGetFunction");
+    CHECK_LOAD_FUNC(PCUMODULEGETTEXREF, dl_fn->cu_module_get_texref, "cuModuleGetTexRef");
+    CHECK_LOAD_FUNC(PCUTEXREFSETFLAGS, dl_fn->cu_texref_set_flags, "cuTexRefSetFlags");
+    CHECK_LOAD_FUNC(PCUTEXREFSETFILTERMODE, dl_fn->cu_texref_set_filtermode, "cuTexRefSetFilterMode");
+    CHECK_LOAD_FUNC(PCUTEXREFSETADDRESS2D, dl_fn->cu_texref_set_address_2D, "cuTexRefSetAddress2D_v3");
+    CHECK_LOAD_FUNC(PCUMEMALLOCPITCH, dl_fn->cu_mem_alloc_pitch, "cuMemAllocPitch_v2");
+    CHECK_LOAD_FUNC(PCUMEMCPY2D, dl_fn->cu_mem_cpy_2D, "cuMemcpy2D_v2");
+    CHECK_LOAD_FUNC(PCUMEMCPY2DASYNC, dl_fn->cu_mem_cpy_2D_async, "cuMemcpy2DAsync_v2");
+    CHECK_LOAD_FUNC(PCUMEMFREE, dl_fn->cu_mem_free, "cuMemFree_v2");
+    CHECK_LOAD_FUNC(PCULAUNCHKERNEL, dl_fn->cu_launch_kernel, "cuLaunchKernel");
+
+    av_log(NULL, AV_LOG_VERBOSE, "CUDA Library and Function loaded successfully\n");
+    return 1;
+
+error:
+    if (dl_fn->cuda_lib)
+        DL_CLOSE_FUNC(dl_fn->cuda_lib);
+
+    /* Clears the handle and every entry point resolved before the failure;
+     * those would dangle now that the library has been closed. */
+    *dl_fn = (CudaDynLoadFunctions){ 0 };
+    return 0;
+}
+
+
+/**
+ * Load the CUDA library, initialise the driver and enumerate its devices.
+ *
+ * For every device the handle, name and compute capability are cached in
+ * cudaCtx. The capability is stored as (major << 4) | minor. At most
+ * MAX_NUM_GPU devices are recorded because the per-device tables in cudaCtx
+ * have exactly that many slots.
+ *
+ * @return 1 on success, 0 on failure (cudaCtx.device_count is then 0, except
+ *         when loading the library itself failed, where it is left untouched)
+ */
+int check_cuda(void)
+{
+    CudaDynLoadFunctions *dl_fn = &cudaCtx.cuda_dload_funcs;
+    int device_count = 0;
+    int smminor = 0, smmajor = 0;
+    int i;
+
+    if (!dyload_cuda())
+        return 0;
+
+    check_cuda_errors(dl_fn->cu_init(0));
+    check_cuda_errors(dl_fn->cu_device_get_count(&device_count));
+
+    if (device_count <= 0) {
+        av_log(NULL, AV_LOG_FATAL, "No CUDA capable devices found\n");
+        goto error;
+    }
+
+    av_log(NULL, AV_LOG_VERBOSE, "%d CUDA capable devices found\n", device_count);
+
+    /* Writing past MAX_NUM_GPU entries would overflow cu_devices[],
+     * gpu_name[] and smver[]; ignore any surplus devices instead. */
+    if (device_count > MAX_NUM_GPU) {
+        av_log(NULL, AV_LOG_WARNING, "Only the first %d CUDA devices will be used\n", MAX_NUM_GPU);
+        device_count = MAX_NUM_GPU;
+    }
+
+    for (i = 0; i < device_count; ++i) {
+        check_cuda_errors(dl_fn->cu_device_get(&cudaCtx.cu_devices[i], i));
+        check_cuda_errors(dl_fn->cu_device_get_name(cudaCtx.gpu_name[i], sizeof(cudaCtx.gpu_name[i]), cudaCtx.cu_devices[i]));
+        check_cuda_errors(dl_fn->cu_device_compute_capability(&smmajor, &smminor, cudaCtx.cu_devices[i]));
+
+        cudaCtx.smver[i] = (smmajor << 4) | smminor;
+        av_log(NULL, AV_LOG_VERBOSE, "[ GPU #%d - < %s > has Compute SM %d.%d]\n", i, cudaCtx.gpu_name[i], smmajor, smminor);
+    }
+    cudaCtx.device_count = device_count;
+    return 1;
+
+error:
+    cudaCtx.device_count = 0;
+    return 0;
+}
+
+/**
+ * Make sure the CUDA devices have been enumerated.
+ *
+ * Enumeration through check_cuda() is only attempted while no device is
+ * recorded in cudaCtx; otherwise the cached state is reused.
+ *
+ * @return 1 when devices are known (or enumeration succeeded), 0 otherwise
+ */
+int init_cuda(void)
+{
+    if (cudaCtx.device_count != 0)
+        return 1;
+
+    return check_cuda() ? 1 : 0;
+}
+
+/**
+ * Unload the CUDA library once no device holds a context any more.
+ *
+ * While any entry of cudaCtx.cuda_context_arr is still set, the call is a
+ * no-op. Otherwise the library is closed, every cached entry point is cleared
+ * and the device count is reset, so that a later init_cuda() loads the
+ * library again instead of reporting success with an empty function table.
+ */
+void deinit_cuda(void)
+{
+    CudaDynLoadFunctions *dl_fn = &cudaCtx.cuda_dload_funcs;
+    int i;
+
+    for (i = 0; i < cudaCtx.device_count; i++) {
+        if (cudaCtx.cuda_context_arr[i] != NULL)
+            return;
+    }
+
+    /* never hand a NULL handle to dlclose()/FreeLibrary() */
+    if (dl_fn->cuda_lib) {
+        DL_CLOSE_FUNC(dl_fn->cuda_lib);
+        av_log(NULL, AV_LOG_VERBOSE, "Cuda Library unloaded\n");
+    }
+
+    /* resets the library handle and all function pointers in one go */
+    *dl_fn = (CudaDynLoadFunctions){ 0 };
+    cudaCtx.device_count = 0;
+}
+
+/**
+ * Tell whether a device meets a minimum compute capability.
+ *
+ * @param gpu          index of the device as enumerated by check_cuda()
+ * @param target_smver required capability, encoded like cudaCtx.smver[]
+ *                     ((major << 4) | minor)
+ * @return 1 when the device exists and its capability is >= target_smver,
+ *         0 otherwise (including for an out-of-range index)
+ */
+int is_gpu_feature_available(int gpu, int target_smver)
+{
+    /* reject indices outside both the enumerated range and the table itself */
+    if (gpu < 0 || gpu >= cudaCtx.device_count || gpu >= MAX_NUM_GPU)
+        return 0;
+
+    return cudaCtx.smver[gpu] >= target_smver;
+}
+
+
+/**
+ * Hand out the shared CUDA context of a device, creating it on first use.
+ *
+ * Each successful call increments the per-device reference counter and must
+ * be balanced by release_cuda_context().
+ *
+ * @param ctx receives the context handle; set to NULL on failure
+ * @param gpu index of the device as enumerated by check_cuda()
+ * @return CUDA_SUCCESS (0) on success, otherwise a non-zero CUDA result code
+ */
+int get_cuda_context(CUcontext *ctx, int gpu)
+{
+    /* Numeric values of CUDA_ERROR_INVALID_VALUE, CUDA_ERROR_NOT_INITIALIZED
+     * and CUDA_ERROR_INVALID_DEVICE in the CUDA driver API; the local
+     * CUresult enum only declares CUDA_SUCCESS. */
+    enum { ERR_INVALID_VALUE = 1, ERR_NOT_INITIALIZED = 3, ERR_INVALID_DEVICE = 101 };
+    CudaDynLoadFunctions *dl_fn = &cudaCtx.cuda_dload_funcs;
+    CUresult cu_res;
+    CUcontext get_ctx;
+
+    if (!ctx)
+        return ERR_INVALID_VALUE;
+
+    *ctx = NULL;
+
+    if (gpu < 0 || gpu >= cudaCtx.device_count || gpu >= MAX_NUM_GPU)
+        return ERR_INVALID_DEVICE;
+
+    if (cudaCtx.cuda_context_arr[gpu] == NULL) {
+        if (!dl_fn->cu_ctx_create)
+            return ERR_NOT_INITIALIZED;
+
+        /* flag 4 is CU_CTX_SCHED_BLOCKING_SYNC in the CUDA driver API */
+        cu_res = dl_fn->cu_ctx_create(&get_ctx, 4, cudaCtx.cu_devices[gpu]);
+        if (cu_res != CUDA_SUCCESS)
+            return cu_res;
+
+        cudaCtx.cuda_context_arr[gpu] = get_ctx;
+    }
+
+    *ctx = cudaCtx.cuda_context_arr[gpu];
+    cudaCtx.cuda_context_count[gpu] += 1;
+    av_log(NULL, AV_LOG_VERBOSE, "cudalib : Cuda Context created 0x%p\n", *ctx);
+
+    return CUDA_SUCCESS;
+}
+
+/**
+ * Drop one reference to the shared CUDA context of a device.
+ *
+ * The caller's handle is cleared. When the last reference is released the
+ * context itself is destroyed. Unbalanced releases and out-of-range device
+ * indices are ignored.
+ *
+ * @param ctx caller's handle obtained from get_cuda_context(); may be NULL
+ * @param gpu index of the device the context belongs to
+ */
+void release_cuda_context(CUcontext *ctx, int gpu)
+{
+    CudaDynLoadFunctions *dl_fn = &cudaCtx.cuda_dload_funcs;
+
+    /* invalidate the caller's copy, not the local parameter */
+    if (ctx)
+        *ctx = NULL;
+
+    if (gpu < 0 || gpu >= MAX_NUM_GPU)
+        return;
+
+    /* the counter is unsigned: decrementing it at zero would wrap around */
+    if (cudaCtx.cuda_context_count[gpu] == 0)
+        return;
+
+    cudaCtx.cuda_context_count[gpu] -= 1;
+    if (cudaCtx.cuda_context_count[gpu] == 0) {
+        if (cudaCtx.cuda_context_arr[gpu] != NULL && dl_fn->cu_ctx_destroy)
+            dl_fn->cu_ctx_destroy(cudaCtx.cuda_context_arr[gpu]);
+        cudaCtx.cuda_context_arr[gpu] = NULL;
+    }
+}
+
+/**
+ * Check whether a pointer refers to an ffnvinfo created by init_nvinfo().
+ *
+ * The check compares the guid field at the start of the object with
+ * NV_INFO_GUID.
+ *
+ * @param ptr candidate pointer, may be NULL
+ * @return 1 when ptr is non-NULL and carries the expected GUID, 0 otherwise
+ */
+int check_nvinfo(void* ptr)
+{
+    const ffnvinfo *candidate = ptr;
+
+    if (candidate == NULL)
+        return 0;
+
+    return memcmp(&candidate->guid, &NV_INFO_GUID, sizeof(candidate->guid)) == 0;
+}
+
+/**
+ * Allocate a zeroed ffnvinfo tagged with NV_INFO_GUID and the current
+ * FF_NVINFO_VERSION.
+ *
+ * @return the new object (allocated with av_mallocz), or NULL when the
+ *         allocation fails
+ */
+ffnvinfo* init_nvinfo(void)
+{
+    ffnvinfo *info = av_mallocz(sizeof(*info));
+
+    if (!info)
+        return NULL;
+
+    memcpy(&info->guid, &NV_INFO_GUID, sizeof(info->guid));
+    info->version = FF_NVINFO_VERSION;
+    return info;
+}
+
+/**
+ * Access the table of dynamically loaded CUDA entry points.
+ *
+ * @return pointer to the table inside the file-scope cudaCtx; its members
+ *         are only set after the library has been loaded successfully
+ */
+CudaDynLoadFunctions* get_cuda_dl_func(void)
+{
+    return &cudaCtx.cuda_dload_funcs;
+}
+
+/**
+ * Issue a 2D memory copy through the loaded cu_mem_cpy_2D_async entry point,
+ * passing NULL as the stream.
+ *
+ * Only the fields below are filled in; every other CUDA_MEMCPY2D member
+ * (offsets, array handles) stays zero.
+ *
+ * @param srcHost,srcDevice,srcPitch source pointer(s) and pitch
+ * @param dstHost,dstDevice,dstPitch destination pointer(s) and pitch
+ * @param width  stored in WidthInBytes
+ * @param height stored in Height
+ * @param srcMemoryType,dstMemoryType memory kinds of source and destination
+ * @return the result of the driver call
+ */
+CUresult cuMemCpy2d(const void *srcHost, CUdeviceptr srcDevice, size_t srcPitch, void *dstHost, CUdeviceptr dstDevice, size_t dstPitch, size_t width, size_t height, CUmemorytype srcMemoryType, CUmemorytype dstMemoryType)
+{
+    CUDA_MEMCPY2D copy_desc = {
+        .srcMemoryType = srcMemoryType,
+        .srcHost       = srcHost,
+        .srcDevice     = srcDevice,
+        .srcPitch      = srcPitch,
+        .dstMemoryType = dstMemoryType,
+        .dstHost       = dstHost,
+        .dstDevice     = dstDevice,
+        .dstPitch      = dstPitch,
+        .WidthInBytes  = width,
+        .Height        = height,
+    };
+
+    return get_cuda_dl_func()->cu_mem_cpy_2D_async(&copy_desc, NULL);
+}
diff -ruN ffmpeg-orig/libavutil/cudautils.h ffmpeg/libavutil/cudautils.h
--- ffmpeg-orig/libavutil/cudautils.h 1970-01-01 03:00:00.000000000 +0300
+++ ffmpeg/libavutil/cudautils.h 2016-03-04 00:18:17.589433917 +0200
@@ -0,0 +1,216 @@
+/*
+*
+* This file is part of FFmpeg.
+*
+* FFmpeg is free software; you can redistribute it and/or
+* modify it under the terms of the GNU Lesser General Public
+* License as published by the Free Software Foundation; either
+* version 2.1 of the License, or (at your option) any later version.
+*
+* FFmpeg is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+* Lesser General Public License for more details.
+*
+* You should have received a copy of the GNU Lesser General Public
+* License along with FFmpeg; if not, write to the Free Software
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+*/
+
+#ifndef AVUTIL_CUDAUTILS_H
+#define AVUTIL_CUDAUTILS_H
+
+/*
+ * Hand-written subset of the CUDA driver API declarations together with the
+ * helper interface implemented in cudautils.c. The driver entry points are
+ * resolved at run time through LOAD_FUNC, so the types they need are declared
+ * here rather than taken from a CUDA SDK header.
+ */
+
+#if defined(_WIN32)
+#include <windows.h>
+#else
+#include <dlfcn.h>
+#endif
+
+#include "common.h"
+#include "error.h"
+#include "log.h"
+
+
+#if defined(_WIN32)
+#define CUDAAPI __stdcall
+#else
+#define CUDAAPI
+#endif
+
+/* Platform wrappers for symbol lookup in, and unloading of, a shared library. */
+#if defined(_WIN32)
+#define LOAD_FUNC(l, s) GetProcAddress(l, s)
+#define DL_CLOSE_FUNC(l) FreeLibrary(l)
+#else
+#define LOAD_FUNC(l, s) dlsym(l, s)
+#define DL_CLOSE_FUNC(l) dlclose(l)
+#endif
+
+/* Number of slots in every per-device table of CudaContext. */
+#define MAX_NUM_GPU 16
+
+#define CU_TRSF_READ_AS_INTEGER 0x01
+#define CU_TRSF_NORMALIZED_COORDINATES 0x02
+#define CU_TRSF_SRGB 0x10
+
+/*
+ * Evaluate the CUDA call `a`; on failure log it and make the ENCLOSING
+ * function return AVERROR_EXTERNAL. Only usable in functions returning int.
+ * The temporary has a trailing underscore so that it cannot shadow a variable
+ * named `ret` that appears inside `a`.
+ */
+#define __cu(a) do { \
+    CUresult cu_res_ = (a); \
+    if (cu_res_ != CUDA_SUCCESS) { \
+        av_log(NULL, AV_LOG_FATAL, "[%s:%d]%s has returned CUDA error %d\n", __FILE__, __LINE__, #a, (int)cu_res_); \
+        return AVERROR_EXTERNAL;\
+    }} while (0)
+
+
+typedef int CUdevice;
+typedef void* CUcontext;
+typedef void* CUmodule;
+typedef void* CUfunction;
+typedef void* CUtexref;
+typedef void* CUstream;
+typedef void* CUarray;
+#if defined(_WIN64) || defined(__LP64__)
+typedef unsigned long long CUdeviceptr;
+#else
+typedef unsigned int CUdeviceptr;
+#endif
+
+/* Only the success code is declared; any other value means failure. */
+typedef enum cudaError_enum {
+    CUDA_SUCCESS = 0
+} CUresult;
+
+typedef enum CUfilter_mode_enum {
+    CU_TR_FILTER_MODE_POINT = 0, /**< Point filter mode */
+    CU_TR_FILTER_MODE_LINEAR = 1 /**< Linear filter mode */
+} CUfilter_mode;
+
+typedef enum CUarray_format_enum {
+    CU_AD_FORMAT_UNSIGNED_INT8 = 0x01, /**< Unsigned 8-bit integers */
+    CU_AD_FORMAT_UNSIGNED_INT16 = 0x02, /**< Unsigned 16-bit integers */
+    CU_AD_FORMAT_UNSIGNED_INT32 = 0x03, /**< Unsigned 32-bit integers */
+    CU_AD_FORMAT_SIGNED_INT8 = 0x08, /**< Signed 8-bit integers */
+    CU_AD_FORMAT_SIGNED_INT16 = 0x09, /**< Signed 16-bit integers */
+    CU_AD_FORMAT_SIGNED_INT32 = 0x0a, /**< Signed 32-bit integers */
+    CU_AD_FORMAT_HALF = 0x10, /**< 16-bit floating point */
+    CU_AD_FORMAT_FLOAT = 0x20 /**< 32-bit floating point */
+} CUarray_format;
+
+typedef struct CUDA_ARRAY_DESCRIPTOR_st
+{
+    size_t Width; /**< Width of array */
+    size_t Height; /**< Height of array */
+
+    CUarray_format Format; /**< Array format */
+    unsigned int NumChannels; /**< Channels per array element */
+} CUDA_ARRAY_DESCRIPTOR;
+
+typedef enum CUmemorytype_enum {
+    CU_MEMORYTYPE_HOST = 0x01, /**< Host memory */
+    CU_MEMORYTYPE_DEVICE = 0x02, /**< Device memory */
+    CU_MEMORYTYPE_ARRAY = 0x03, /**< Array memory */
+    CU_MEMORYTYPE_UNIFIED = 0x04 /**< Unified device or host memory */
+} CUmemorytype;
+
+typedef struct CUDA_MEMCPY2D_st {
+    size_t srcXInBytes; /**< Source X in bytes */
+    size_t srcY; /**< Source Y */
+
+    CUmemorytype srcMemoryType; /**< Source memory type (host, device, array) */
+    const void *srcHost; /**< Source host pointer */
+    CUdeviceptr srcDevice; /**< Source device pointer */
+    CUarray srcArray; /**< Source array reference */
+    size_t srcPitch; /**< Source pitch (ignored when src is array) */
+
+    size_t dstXInBytes; /**< Destination X in bytes */
+    size_t dstY; /**< Destination Y */
+
+    CUmemorytype dstMemoryType; /**< Destination memory type (host, device, array) */
+    void *dstHost; /**< Destination host pointer */
+    CUdeviceptr dstDevice; /**< Destination device pointer */
+    CUarray dstArray; /**< Destination array reference */
+    size_t dstPitch; /**< Destination pitch (ignored when dst is array) */
+
+    size_t WidthInBytes; /**< Width of 2D memory copy in bytes */
+    size_t Height; /**< Height of 2D memory copy */
+} CUDA_MEMCPY2D;
+
+
+/* Function-pointer types of the driver entry points loaded by cudautils.c. */
+typedef CUresult(CUDAAPI *PCUINIT)(unsigned int Flags);
+typedef CUresult(CUDAAPI *PCUDEVICEGETCOUNT)(int *count);
+typedef CUresult(CUDAAPI *PCUDEVICEGET)(CUdevice *device, int ordinal);
+typedef CUresult(CUDAAPI *PCUDEVICEGETNAME)(char *name, int len, CUdevice dev);
+typedef CUresult(CUDAAPI *PCUDEVICECOMPUTECAPABILITY)(int *major, int *minor, CUdevice dev);
+typedef CUresult(CUDAAPI *PCUCTXCREATE)(CUcontext *pctx, unsigned int flags, CUdevice dev);
+typedef CUresult(CUDAAPI *PCUCTXPOPCURRENT)(CUcontext *pctx);
+typedef CUresult(CUDAAPI *PCUCTXDESTROY)(CUcontext ctx);
+typedef CUresult(CUDAAPI *PCUMODULELOADDATA)(CUmodule *module, const void *image);
+typedef CUresult(CUDAAPI *PCUMODULEGETFUNCTION)(CUfunction *hfunc, CUmodule hmod, const char *name);
+typedef CUresult(CUDAAPI *PCUMODULEGETTEXREF)(CUtexref *pTexRef, CUmodule hmod, const char *name);
+typedef CUresult(CUDAAPI *PCUTEXREFSETFLAGS)(CUtexref hTexRef, unsigned int Flags);
+typedef CUresult(CUDAAPI *PCUTEXREFSETFILTERMODE)(CUtexref hTexRef, CUfilter_mode fm);
+typedef CUresult(CUDAAPI *PCUTEXREFSETADDRESS2D)(CUtexref hTexRef, const CUDA_ARRAY_DESCRIPTOR *desc, CUdeviceptr dptr, size_t Pitch);
+typedef CUresult(CUDAAPI *PCUMEMALLOCPITCH)(CUdeviceptr* dptr, size_t* pPitch, size_t WidthInBytes, size_t Height, unsigned int ElementSizeBytes);
+typedef CUresult(CUDAAPI *PCUMEMCPY2D)(const CUDA_MEMCPY2D *pCopy);
+typedef CUresult(CUDAAPI *PCUMEMCPY2DASYNC)(const CUDA_MEMCPY2D *pCopy, CUstream hStream);
+typedef CUresult(CUDAAPI *PCUMEMFREE)(CUdeviceptr dptr);
+typedef CUresult(CUDAAPI *PCULAUNCHKERNEL)(CUfunction f, unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, unsigned int sharedMemBytes, CUstream hStream, void **kernelParams, void **extra);
+
+
+/* Resolved driver entry points plus the handle of the library they come from. */
+typedef struct CudaDynLoadFunctions{
+    PCUINIT cu_init;
+    PCUDEVICEGETCOUNT cu_device_get_count;
+    PCUDEVICEGET cu_device_get;
+    PCUDEVICEGETNAME cu_device_get_name;
+    PCUDEVICECOMPUTECAPABILITY cu_device_compute_capability;
+    PCUCTXCREATE cu_ctx_create;
+    PCUCTXPOPCURRENT cu_ctx_pop_current;
+    PCUCTXDESTROY cu_ctx_destroy;
+    PCUMODULELOADDATA cu_module_load_data;
+    PCUMODULEGETFUNCTION cu_module_get_function;
+    PCUMODULEGETTEXREF cu_module_get_texref;
+    PCUTEXREFSETFLAGS cu_texref_set_flags;
+    PCUTEXREFSETFILTERMODE cu_texref_set_filtermode;
+    PCUTEXREFSETADDRESS2D cu_texref_set_address_2D;
+    PCUMEMALLOCPITCH cu_mem_alloc_pitch;
+    PCUMEMCPY2D cu_mem_cpy_2D;
+    PCUMEMCPY2DASYNC cu_mem_cpy_2D_async;
+    PCUMEMFREE cu_mem_free;
+    PCULAUNCHKERNEL cu_launch_kernel;
+
+#if defined(_WIN32)
+    HMODULE cuda_lib;
+#else
+    void* cuda_lib;
+#endif
+} CudaDynLoadFunctions;
+
+/*
+ * Process-wide CUDA state. The member order is relied upon by the positional
+ * initializer of the instance in cudautils.c.
+ */
+typedef struct CudaContext{
+
+    CUcontext cuda_context_arr[MAX_NUM_GPU];       /* shared context per device, NULL when absent */
+    unsigned int cuda_context_count[MAX_NUM_GPU];  /* references handed out per device */
+    CudaDynLoadFunctions cuda_dload_funcs;
+
+    int device_count;
+    CUdevice cu_devices[MAX_NUM_GPU];
+    char gpu_name[MAX_NUM_GPU][128];
+    int smver[MAX_NUM_GPU];                        /* (major << 4) | minor */
+} CudaContext;
+
+typedef struct _NVGUID {
+    uint32_t Data1;
+    uint16_t Data2;
+    uint16_t Data3;
+    uint8_t Data4[8];
+} NVGUID;
+
+/* GUID-tagged descriptor; check_nvinfo() recognises it by its guid field. */
+typedef struct _ffnvinfo {
+    NVGUID guid;
+    uint32_t version;
+    //CUcontext cudaCtx;
+    void* vxCtx;
+    CUdeviceptr dptr[8];
+    uint32_t linesize[8];
+} ffnvinfo;
+
+/* The int-returning helpers below report 1 on success and 0 on failure,
+ * except get_cuda_context(), which returns a CUDA result code. */
+int init_cuda(void);
+void deinit_cuda(void);
+int get_cuda_context(CUcontext *ctx, int gpu);
+void release_cuda_context(CUcontext *ctx, int gpu);
+int is_gpu_feature_available(int gpu, int target_smver);
+int check_nvinfo(void* ptr);
+ffnvinfo* init_nvinfo(void);
+CudaDynLoadFunctions* get_cuda_dl_func(void);
+
+CUresult cuMemCpy2d(const void *srcHost, CUdeviceptr srcDevice, size_t srcPitch, void *dstHost, CUdeviceptr dstDevice, size_t dstPitch, size_t width, size_t height, CUmemorytype srcMemoryType, CUmemorytype dstMemoryType);
+
+#endif /* AVUTIL_CUDAUTILS_H */
diff -ruN ffmpeg-orig/libavutil/Makefile ffmpeg/libavutil/Makefile
--- ffmpeg-orig/libavutil/Makefile 2016-03-04 00:17:21.063188717 +0200
+++ ffmpeg/libavutil/Makefile 2016-03-04 00:18:17.589433917 +0200
@@ -21,6 +21,7 @@
common.h \
cpu.h \
crc.h \
+ cudautils.h \
des.h \
display.h \
downmix_info.h \
@@ -98,6 +99,7 @@
color_utils.o \
cpu.o \
crc.o \
+ cudautils.o \
des.o \
display.o \
downmix_info.o \
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment