Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

added vmaf_cuda_fex_synchronize, fixed cuda fex flush functions #1332

Draft
wants to merge 5 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions libvmaf/include/libvmaf/libvmaf_cuda.h
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,17 @@ int vmaf_cuda_preallocate_pictures(VmafContext *vmaf,
*/
int vmaf_cuda_fetch_preallocated_picture(VmafContext *vmaf, VmafPicture* pic);

/**
* Synchronizes all CUDA feature extractors within the VmafContext
* with the CPU using their flush function. All feature scores will
* be written when this function returns.
*
* @param vmaf VMAF context allocated with `vmaf_init()` and
* initialized with `vmaf_cuda_preallocate_pictures()`.
* @return 0 on success, or < 0 (a negative errno code) on error.
*/
int vmaf_cuda_fex_synchronize(VmafContext *vmaf);

#ifdef __cplusplus
}
#endif
Expand Down
44 changes: 32 additions & 12 deletions libvmaf/src/feature/cuda/integer_adm_cuda.c
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,10 @@
#include "cuda/integer_adm_cuda.h"
#include "picture_cuda.h"
#include <unistd.h>

#include <assert.h>
#ifdef HAVE_NVTX
#include "nvtx3/nvToolsExt.h"
MorkTheOrk marked this conversation as resolved.
Show resolved Hide resolved
#endif

#define RES_BUFFER_SIZE 4 * 3 * 2

Expand All @@ -55,7 +57,7 @@ typedef struct AdmStateCuda {
int dst_stride, CUstream c_stream);
CUstream str, host_stream;
void* write_score_parameters;
CUevent ref_event, dis_event, finished;
CUevent ref_event, dis_event, finished, scores_written;
VmafDictionary *feature_name_dict;

// adm_dwt kernels
Expand Down Expand Up @@ -642,7 +644,6 @@ typedef struct write_score_parameters_adm {

static int write_scores(write_score_parameters_adm* params)
{

VmafFeatureCollector *feature_collector = params->feature_collector;
AdmStateCuda *s = params->s;
unsigned index = params->index;
Expand Down Expand Up @@ -715,7 +716,11 @@ static int write_scores(write_score_parameters_adm* params)
s->feature_name_dict, "integer_adm_scale3", scores[6] / scores[7],
index);

if (!s->debug) return err;
if (!s->debug) {

return err;
}


err |= vmaf_feature_collector_append_with_dict(feature_collector,
s->feature_name_dict, "integer_adm", score, index);
Expand Down Expand Up @@ -749,7 +754,6 @@ static int write_scores(write_score_parameters_adm* params)

err |= vmaf_feature_collector_append_with_dict(feature_collector,
s->feature_name_dict, "integer_adm_den_scale3", scores[7], index);

return err;
}

Expand Down Expand Up @@ -1015,9 +1019,10 @@ static int init_fex_cuda(VmafFeatureExtractor *fex, enum VmafPixelFormat pix_fmt
CHECK_CUDA(cuCtxPushCurrent(fex->cu_state->ctx));
CHECK_CUDA(cuStreamCreateWithPriority(&s->str, CU_STREAM_NON_BLOCKING, 0));
CHECK_CUDA(cuStreamCreateWithPriority(&s->host_stream, CU_STREAM_NON_BLOCKING, 0));
CHECK_CUDA(cuEventCreate(&s->finished, CU_EVENT_DEFAULT));
CHECK_CUDA(cuEventCreate(&s->ref_event, CU_EVENT_DEFAULT));
CHECK_CUDA(cuEventCreate(&s->dis_event, CU_EVENT_DEFAULT));
CHECK_CUDA(cuEventCreate(&s->finished, CU_EVENT_DISABLE_TIMING));
CHECK_CUDA(cuEventCreate(&s->ref_event, CU_EVENT_DISABLE_TIMING));
CHECK_CUDA(cuEventCreate(&s->dis_event, CU_EVENT_DISABLE_TIMING));
CHECK_CUDA(cuEventCreate(&s->scores_written, CU_EVENT_DISABLE_TIMING));


CUmodule adm_cm_module, adm_csf_den_module, adm_csf_module, adm_decouple_module, adm_dwt_module;
Expand Down Expand Up @@ -1155,10 +1160,7 @@ static int extract_fex_cuda(VmafFeatureExtractor *fex,

// this is done to ensure that the CPU does not overwrite the buffer params for 'write_scores
CHECK_CUDA(cuStreamSynchronize(s->str));
// CHECK_CUDA(cuEventSynchronize(s->finished));
CHECK_CUDA(cuCtxPushCurrent(fex->cu_state->ctx));
CHECK_CUDA(cuEventDestroy(s->finished));
CHECK_CUDA(cuEventCreate(&s->finished, CU_EVENT_DEFAULT));
CHECK_CUDA(cuCtxPopCurrent(NULL));

// current implementation is limited by the 16-bit data pipeline, thus
Expand All @@ -1179,6 +1181,7 @@ static int extract_fex_cuda(VmafFeatureExtractor *fex,
data->w = ref_pic->w[0];
CHECK_CUDA(cuStreamWaitEvent(s->host_stream, s->finished, CU_EVENT_WAIT_DEFAULT));
CHECK_CUDA(cuLaunchHostFunc(s->host_stream, (CUhostFn)write_scores, data));
CHECK_CUDA(cuEventRecord(s->scores_written, s->host_stream));
return 0;
}

Expand Down Expand Up @@ -1215,15 +1218,32 @@ static int close_fex_cuda(VmafFeatureExtractor *fex)
if (s->write_score_parameters) free(s->write_score_parameters);

ret |= vmaf_dictionary_free(&s->feature_name_dict);

if(s->ref_event) CHECK_CUDA(cuEventDestroy(s->ref_event));
if(s->dis_event) CHECK_CUDA(cuEventDestroy(s->dis_event));
if(s->finished) CHECK_CUDA(cuEventDestroy(s->finished));
if(s->scores_written) CHECK_CUDA(cuEventDestroy(s->scores_written));

//cuStreamDestroy(s->str);

return ret;
}

static int flush_fex_cuda(VmafFeatureExtractor *fex,
VmafFeatureCollector *feature_collector)
{
#ifdef HAVE_NVTX
nvtxRangePushA("flush adm_cuda");
#endif
AdmStateCuda *s = fex->priv;
int ret = 0;
CHECK_CUDA(cuStreamSynchronize(s->str));
return 1;
CHECK_CUDA(cuStreamSynchronize(s->host_stream));
CHECK_CUDA(cuEventSynchronize(s->scores_written));
#ifdef HAVE_NVTX
nvtxRangePop();
#endif
return (ret < 0) ? ret : !ret;
}

static const char *provided_features[] = {
Expand Down
71 changes: 52 additions & 19 deletions libvmaf/src/feature/cuda/integer_motion_cuda.c
Original file line number Diff line number Diff line change
Expand Up @@ -31,9 +31,12 @@
#include "picture.h"
#include "picture_cuda.h"
#include "cuda_helper.cuh"
#ifdef HAVE_NVTX
#include "nvtx3/nvToolsExt.h"
#endif

typedef struct MotionStateCuda {
CUevent event, finished;
CUevent event, finished, scores_written;
CUfunction funcbpc8, funcbpc16;
CUstream str, host_stream;
VmafCudaBuffer* blur[2];
Expand All @@ -44,6 +47,8 @@ typedef struct MotionStateCuda {
double score;
bool debug;
bool motion_force_zero;
bool flushed;
bool closed;
void (*calculate_motion_score)(const VmafPicture* src, VmafCudaBuffer* src_blurred,
const VmafCudaBuffer* prev_blurred, VmafCudaBuffer* sad,
unsigned width, unsigned height,
Expand Down Expand Up @@ -136,12 +141,15 @@ static int init_fex_cuda(VmafFeatureExtractor *fex, enum VmafPixelFormat pix_fmt
unsigned bpc, unsigned w, unsigned h)
{
MotionStateCuda *s = fex->priv;
s->flushed = true;
s->closed = false;

CHECK_CUDA(cuCtxPushCurrent(fex->cu_state->ctx));
CHECK_CUDA(cuStreamCreateWithPriority(&s->str, CU_STREAM_NON_BLOCKING, 0));
CHECK_CUDA(cuStreamCreateWithPriority(&s->host_stream, CU_STREAM_NON_BLOCKING, 0));
CHECK_CUDA(cuEventCreate(&s->event, CU_EVENT_DEFAULT));
CHECK_CUDA(cuEventCreate(&s->finished, CU_EVENT_DEFAULT));
CHECK_CUDA(cuEventCreate(&s->event, CU_EVENT_DISABLE_TIMING));
CHECK_CUDA(cuEventCreate(&s->finished, CU_EVENT_DISABLE_TIMING));
CHECK_CUDA(cuEventCreate(&s->scores_written, CU_EVENT_BLOCKING_SYNC));

CUmodule module;
CHECK_CUDA(cuModuleLoadData(&module, src_motion_score_ptx));
Expand Down Expand Up @@ -202,20 +210,39 @@ static int init_fex_cuda(VmafFeatureExtractor *fex, enum VmafPixelFormat pix_fmt
return -ENOMEM;
}

// if called twice in a row, finalize FEX and close
static int flush_fex_cuda(VmafFeatureExtractor *fex,
VmafFeatureCollector *feature_collector)
{
#ifdef HAVE_NVTX
nvtxRangePushA("flush motion_cuda");
#endif

MotionStateCuda *s = fex->priv;
int ret = 0;
CHECK_CUDA(cuStreamSynchronize(s->str));
CHECK_CUDA(cuStreamSynchronize(s->host_stream));

if (s->index > 0) {
ret = vmaf_feature_collector_append(feature_collector,
"VMAF_integer_feature_motion2_score",
s->score, s->index);
if(!s->flushed) {
CHECK_CUDA(cuStreamSynchronize(s->str));
CHECK_CUDA(cuStreamSynchronize(s->host_stream));
while (cuEventQuery(s->scores_written) != CUDA_SUCCESS)
{
continue;
}
CHECK_CUDA(cuEventSynchronize(s->scores_written));

}
else {
if (s->index > 0 && !s->closed) {
ret = vmaf_feature_collector_append(feature_collector,
"VMAF_integer_feature_motion2_score",
s->score, s->index);
}
s->closed = true;
}
s->flushed = true;

#ifdef HAVE_NVTX
nvtxRangePop();
#endif
return (ret < 0) ? ret : !ret;
}

Expand All @@ -242,7 +269,7 @@ static int write_scores(write_score_parameters_moco* params)
}
if (err) return err;

if (params->index == 1)
if (params->index == 1)
return 0;

err = vmaf_feature_collector_append(feature_collector,
Expand All @@ -257,13 +284,14 @@ static int extract_fex_cuda(VmafFeatureExtractor *fex, VmafPicture *ref_pic,
VmafFeatureCollector *feature_collector)
{
MotionStateCuda *s = fex->priv;

if(s->closed) {
return -ESHUTDOWN; // TODO: proper error code here
}
s->flushed = false;
// this is done to ensure that the CPU does not overwrite the buffer params for 'write_scores
CHECK_CUDA(cuStreamSynchronize(s->str));
// CHECK_CUDA(cuEventSynchronize(s->finished));
CHECK_CUDA(cuEventSynchronize(s->finished));
CHECK_CUDA(cuCtxPushCurrent(fex->cu_state->ctx));
CHECK_CUDA(cuEventDestroy(s->finished));
CHECK_CUDA(cuEventCreate(&s->finished, CU_EVENT_DEFAULT));
CHECK_CUDA(cuCtxPopCurrent(NULL));

int err = 0;
Expand All @@ -286,10 +314,7 @@ static int extract_fex_cuda(VmafFeatureExtractor *fex, VmafPicture *ref_pic,
CHECK_CUDA(cuEventRecord(s->event, vmaf_cuda_picture_get_stream(ref_pic)));
// This event ensures the input buffer is consumed
CHECK_CUDA(cuStreamWaitEvent(s->str, s->event, CU_EVENT_WAIT_DEFAULT));
CHECK_CUDA(cuCtxPushCurrent(fex->cu_state->ctx));
CHECK_CUDA(cuEventDestroy(s->event));
CHECK_CUDA(cuEventCreate(&s->event, CU_EVENT_DEFAULT));
CHECK_CUDA(cuCtxPopCurrent(NULL));


if (index == 0) {
err = vmaf_feature_collector_append(feature_collector,
Expand All @@ -311,11 +336,13 @@ static int extract_fex_cuda(VmafFeatureExtractor *fex, VmafPicture *ref_pic,
CHECK_CUDA(cuStreamWaitEvent(s->host_stream, s->finished, CU_EVENT_WAIT_DEFAULT));

write_score_parameters_moco* params = s->write_score_parameters;
cuEventSynchronize(s->scores_written);
params->feature_collector = feature_collector;
params->h = ref_pic->h[0];
params->w = ref_pic->w[0];
params->index = index;
CHECK_CUDA(cuLaunchHostFunc(s->host_stream, write_scores, s->write_score_parameters));
CHECK_CUDA(cuEventRecord(s->scores_written, s->host_stream));
return 0;
}

Expand All @@ -342,6 +369,12 @@ static int close_fex_cuda(VmafFeatureExtractor *fex)
if(s->write_score_parameters) {
free(s->write_score_parameters);
}


if(s->event) CHECK_CUDA(cuEventDestroy(s->event));
if(s->finished) CHECK_CUDA(cuEventDestroy(s->finished));
if(s->scores_written) CHECK_CUDA(cuEventDestroy(s->scores_written));

return ret;
}

Expand Down
36 changes: 27 additions & 9 deletions libvmaf/src/feature/cuda/integer_vif_cuda.c
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,10 @@
#include "cuda/integer_vif_cuda.h"
#include "picture_cuda.h"

#ifdef HAVE_NVTX
#include "nvtx3/nvToolsExt.h"
#endif

#if ARCH_X86
#include "x86/vif_avx2.h"
#if HAVE_AVX512
Expand All @@ -42,7 +46,7 @@

typedef struct VifStateCuda {
VifBufferCuda buf;
CUevent event, finished;
CUevent event, finished, scores_written;
CUstream str, host_stream;
bool debug;
double vif_enhn_gain_limit;
Expand Down Expand Up @@ -99,9 +103,9 @@ static int init_fex_cuda(VmafFeatureExtractor *fex, enum VmafPixelFormat pix_fmt
CHECK_CUDA(cuCtxPushCurrent(fex->cu_state->ctx));
CHECK_CUDA(cuStreamCreateWithPriority(&s->str, CU_STREAM_NON_BLOCKING, 0));
CHECK_CUDA(cuStreamCreateWithPriority(&s->host_stream, CU_STREAM_NON_BLOCKING, 0));
CHECK_CUDA(cuEventCreate(&s->event, CU_EVENT_DEFAULT));
CHECK_CUDA(cuEventCreate(&s->finished, CU_EVENT_DEFAULT));

CHECK_CUDA(cuEventCreate(&s->event, CU_EVENT_DISABLE_TIMING));
CHECK_CUDA(cuEventCreate(&s->finished, CU_EVENT_DISABLE_TIMING));
CHECK_CUDA(cuEventCreate(&s->scores_written, CU_EVENT_DISABLE_TIMING));
// make this static
CUmodule filter1d_module;
CHECK_CUDA(cuModuleLoadData(&filter1d_module, src_filter1d_ptx));
Expand Down Expand Up @@ -453,8 +457,6 @@ static int extract_fex_cuda(VmafFeatureExtractor *fex,
// before the GPU has finished writing to it.
CHECK_CUDA(cuStreamSynchronize(s->str));
CHECK_CUDA(cuCtxPushCurrent(fex->cu_state->ctx));
CHECK_CUDA(cuEventDestroy(s->finished));
CHECK_CUDA(cuEventCreate(&s->finished, CU_EVENT_DEFAULT));
CHECK_CUDA(cuCtxPopCurrent(NULL));

CHECK_CUDA(cuMemsetD8Async(s->buf.accum_data->data, 0, sizeof(vif_accums) * 4, s->str));
Expand All @@ -478,7 +480,7 @@ static int extract_fex_cuda(VmafFeatureExtractor *fex,
CHECK_CUDA(cuStreamWaitEvent(s->str, s->event, CU_EVENT_WAIT_DEFAULT));
CHECK_CUDA(cuCtxPushCurrent(fex->cu_state->ctx));
CHECK_CUDA(cuEventDestroy(s->event));
CHECK_CUDA(cuEventCreate(&s->event, CU_EVENT_DEFAULT));
CHECK_CUDA(cuEventCreate(&s->event, CU_EVENT_DISABLE_TIMING));
CHECK_CUDA(cuCtxPopCurrent(NULL));
}
}
Expand All @@ -496,7 +498,8 @@ static int extract_fex_cuda(VmafFeatureExtractor *fex,
write_score_parameters_vif *data = s->buf.cpu_param_buf;
data->feature_collector = feature_collector;
data->index = index;
CHECK_CUDA(cuLaunchHostFunc(s->str, write_scores, data));
CHECK_CUDA(cuLaunchHostFunc(s->host_stream, (CUhostFn)write_scores, data));
CHECK_CUDA(cuEventRecord(s->scores_written, s->host_stream));
return 0;
}

Expand All @@ -517,17 +520,32 @@ static int close_fex_cuda(VmafFeatureExtractor *fex)
if (s->buf.accum_host) {
ret |= vmaf_cuda_buffer_host_free(fex->cu_state, s->buf.accum_host);
}

if(s->event) CHECK_CUDA(cuEventDestroy(s->event));
if(s->finished) CHECK_CUDA(cuEventDestroy(s->finished));
if(s->scores_written) CHECK_CUDA(cuEventDestroy(s->scores_written));

ret |= vmaf_dictionary_free(&s->feature_name_dict);
return ret;
}

static int flush_fex_cuda(VmafFeatureExtractor *fex,
VmafFeatureCollector *feature_collector)
{
#ifdef HAVE_NVTX
nvtxRangePushA("flush vif_cuda");
#endif
VifStateCuda *s = fex->priv;

int ret = 0;
CHECK_CUDA(cuStreamSynchronize(s->str));
return 1;
CHECK_CUDA(cuStreamSynchronize(s->host_stream));
CHECK_CUDA(cuEventSynchronize(s->scores_written));
#ifdef HAVE_NVTX
nvtxRangePop();
#endif

return (ret < 0) ? ret : !ret;
}

static const char *provided_features[] = {
Expand Down
Loading