@@ -20,6 +20,13 @@ static bool g_cuda =
2020 return new CudaDeviceInterface (device);
2121 });
2222
23+ // BT.709 full range color conversion matrix for YUV to RGB conversion.
24+ // See Note [YUV -> RGB Color Conversion, color space and color range] below.
25+ constexpr Npp32f bt709FullRangeColorTwist[3 ][4 ] = {
26+ {1 .0f , 0 .0f , 1 .5748f , 0 .0f },
27+ {1 .0f , -0 .187324273f , -0 .468124273f , -128 .0f },
28+ {1 .0f , 1 .8556f , 0 .0f , -128 .0f }};
29+
2330// We reuse cuda contexts across VideoDeoder instances. This is because
2431// creating a cuda context is expensive. The cache mechanism is as follows:
2532// 1. There is a cache of size MAX_CONTEXTS_PER_GPU_IN_CACHE cuda contexts for
@@ -312,21 +319,54 @@ void CudaDeviceInterface::convertAVFrameToFrameOutput(
312319 static_cast <int >(getFFMPEGCompatibleDeviceIndex (device_)));
313320
314321 NppiSize oSizeROI = {width, height};
315- Npp8u* input [2 ] = {avFrame->data [0 ], avFrame->data [1 ]};
322+ Npp8u* yuvData [2 ] = {avFrame->data [0 ], avFrame->data [1 ]};
316323
317324 NppStatus status;
318325
326+ // For background, see
327+ // Note [YUV -> RGB Color Conversion, color space and color range]
319328 if (avFrame->colorspace == AVColorSpace::AVCOL_SPC_BT709) {
320- status = nppiNV12ToRGB_709CSC_8u_P2C3R_Ctx (
321- input,
322- avFrame->linesize [0 ],
323- static_cast <Npp8u*>(dst.data_ptr ()),
324- dst.stride (0 ),
325- oSizeROI,
326- nppCtx);
329+ if (avFrame->color_range == AVColorRange::AVCOL_RANGE_JPEG) {
330+ // NPP provides a pre-defined color conversion function for BT.709 full
331+ // range: nppiNV12ToRGB_709HDTV_8u_P2C3R_Ctx. But it's not closely
332+ // matching the results we have on CPU. So we're using a custom color
333+ // conversion matrix, which provides more accurate results. See the note
334+ // mentioned above for details, and headaches.
335+
336+ int srcStep[2 ] = {avFrame->linesize [0 ], avFrame->linesize [1 ]};
337+
338+ status = nppiNV12ToRGB_8u_ColorTwist32f_P2C3R_Ctx (
339+ yuvData,
340+ srcStep,
341+ static_cast <Npp8u*>(dst.data_ptr ()),
342+ dst.stride (0 ),
343+ oSizeROI,
344+ bt709FullRangeColorTwist,
345+ nppCtx);
346+ } else {
347+ // If not full range, we assume studio limited range.
348+ // The color conversion matrix for BT.709 limited range should be:
349+ // static const Npp32f bt709LimitedRangeColorTwist[3][4] = {
350+ // {1.16438356f, 0.0f, 1.79274107f, -16.0f},
351+ // {1.16438356f, -0.213248614f, -0.5329093290f, -128.0f},
352+ // {1.16438356f, 2.11240179f, 0.0f, -128.0f}
353+ // };
354+ // We get very close results to CPU with that, but using the pre-defined
355+ // nppiNV12ToRGB_709CSC_8u_P2C3R_Ctx seems to be even more accurate.
356+ status = nppiNV12ToRGB_709CSC_8u_P2C3R_Ctx (
357+ yuvData,
358+ avFrame->linesize [0 ],
359+ static_cast <Npp8u*>(dst.data_ptr ()),
360+ dst.stride (0 ),
361+ oSizeROI,
362+ nppCtx);
363+ }
327364 } else {
365+ // TODO we're assuming BT.601 color space (and probably limited range) by
366+ // calling nppiNV12ToRGB_8u_P2C3R_Ctx. We should handle BT.601 full range,
367+ // and other color-spaces like 2020.
328368 status = nppiNV12ToRGB_8u_P2C3R_Ctx (
329- input ,
369+ yuvData ,
330370 avFrame->linesize [0 ],
331371 static_cast <Npp8u*>(dst.data_ptr ()),
332372 dst.stride (0 ),
@@ -362,3 +402,123 @@ std::optional<const AVCodec*> CudaDeviceInterface::findCodec(
362402}
363403
364404} // namespace facebook::torchcodec
405+
406+ /* clang-format off */
407+ // Note: [YUV -> RGB Color Conversion, color space and color range]
408+ //
409+ // The frames we get from the decoder (FFmpeg decoder, or NVCUVID) are in YUV
410+ // format. We need to convert them to RGB. This note attempts to describe this
411+ // process. There may be some inaccuracies and approximations that experts will
412+ // notice, but our goal is only to provide a good enough understanding of the
413+ // process for torchcodec developers to implement and maintain it.
414+ // On CPU, filtergraph and swscale handle everything for us. With CUDA, we have
415+ // to do a lot of the heavy lifting ourselves.
416+ //
417+ // Color space and color range
418+ // ---------------------------
419+ // Two main characteristics of a frame will affect the conversion process:
420+ // 1. Color space: This basically defines what YUV values correspond to which
421+ // physical wavelength. No need to go into details here,the point is that
422+ // videos can come in different color spaces, the most common ones being
423+ // BT.601 and BT.709, but there are others.
424+ // In FFmpeg this is represented with AVColorSpace:
425+ // https://ffmpeg.org/doxygen/4.0/pixfmt_8h.html#aff71a069509a1ad3ff54d53a1c894c85
426+ // 2. Color range: This defines the range of YUV values. There is:
427+ // - full range, also called PC range: AVCOL_RANGE_JPEG
428+ // - and the "limited" range, also called studio or TV range: AVCOL_RANGE_MPEG
429+ // https://ffmpeg.org/doxygen/4.0/pixfmt_8h.html#a3da0bf691418bc22c4bcbe6583ad589a
430+ //
431+ // Color space and color range are independent concepts, so we can have a BT.709
432+ // with full range, and another one with limited range. Same for BT.601.
433+ //
434+ // In the first version of this note we'll focus on the full color range. It
435+ // will later be updated to account for the limited range.
436+ //
437+ // Color conversion matrix
438+ // -----------------------
439+ // YUV -> RGB conversion is defined as the reverse process of the RGB -> YUV,
440+ // So this is where we'll start.
441+ // At the core of a RGB -> YUV conversion are the "luma coefficients", which are
442+ // specific to a given color space and defined by the color space standard. In
443+ // FFmpeg they can be found here:
444+ // https://github.com/FFmpeg/FFmpeg/blob/7d606ef0ccf2946a4a21ab1ec23486cadc21864b/libavutil/csp.c#L46-L56
445+ //
446+ // For example, the BT.709 coefficients are: kr=0.2126, kg=0.7152, kb=0.0722
447+ // Coefficients must sum to 1.
448+ //
449+ // Conventionally Y is in [0, 1] range, and U and V are in [-0.5, 0.5] range
450+ // (that's mathematically, in practice they are represented in integer range).
451+ // The conversion is defined as:
452+ // https://en.wikipedia.org/wiki/YCbCr#R'G'B'_to_Y%E2%80%B2PbPr
453+ // Y = kr*R + kg*G + kb*B
454+ // U = (B - Y) * 0.5 / (1 - kb) = (B - Y) / u_scale where u_scale = 2 * (1 - kb)
455+ // V = (R - Y) * 0.5 / (1 - kr) = (R - Y) / v_scale where v_scale = 2 * (1 - kr)
456+ //
457+ // Putting all this into matrix form, we get:
458+ // [Y] = [kr kg kb ] [R]
459+ // [U] [-kr/u_scale -kg/u_scale (1-kb)/u_scale] [G]
460+ // [V] [(1-kr)/v_scale -kg/v_scale -kb)/v_scale ] [B]
461+ //
462+ //
463+ // Now, to convert YUV to RGB, we just need to invert this matrix:
464+ // ```py
465+ // import torch
466+ // kr, kg, kb = 0.2126, 0.7152, 0.0722 # BT.709 luma coefficients
467+ // u_scale = 2 * (1 - kb)
468+ // v_scale = 2 * (1 - kr)
469+ //
470+ // rgb_to_yuv = torch.tensor([
471+ // [kr, kg, kb],
472+ // [-kr/u_scale, -kg/u_scale, (1-kb)/u_scale],
473+ // [(1-kr)/v_scale, -kg/v_scale, -kb/v_scale]
474+ // ])
475+ //
476+ // yuv_to_rgb_full = torch.linalg.inv(rgb_to_yuv)
477+ // print("YUV->RGB matrix (Full Range):")
478+ // print(yuv_to_rgb_full)
479+ // ```
480+ // And we get:
481+ // tensor([[ 1.0000e+00, -3.3142e-09, 1.5748e+00],
482+ // [ 1.0000e+00, -1.8732e-01, -4.6812e-01],
483+ // [ 1.0000e+00, 1.8556e+00, 4.6231e-09]])
484+ //
485+ // Which matches https://en.wikipedia.org/wiki/YCbCr#ITU-R_BT.709_conversion
486+ //
487+ // Color conversion in NPP
488+ // -----------------------
489+ // https://docs.nvidia.com/cuda/npp/image_color_conversion.html.
490+ //
491+ // NPP provides different ways to convert YUV to RGB:
492+ // - pre-defined color conversion functions like
493+ // nppiNV12ToRGB_709CSC_8u_P2C3R_Ctx and nppiNV12ToRGB_709HDTV_8u_P2C3R_Ctx
494+ // which are for BT.709 limited and full range, respectively.
495+ // - generic color conversion functions that accept a custom color conversion
496+ // matrix, called ColorTwist, like nppiNV12ToRGB_8u_ColorTwist32f_P2C3R_Ctx
497+ //
498+ // We use the pre-defined functions or the color twist functions depending on
499+ // which one we find to be closer to the CPU results.
500+ //
501+ // The color twist functionality is *partially* described in a section named
502+ // "YUVToRGBColorTwist". Importantly:
503+ //
504+ // - The `nppiNV12ToRGB_8u_ColorTwist32f_P2C3R_Ctx` function takes the YUV data
505+ // and the color-conversion matrix as input. The function itself and the
506+ // matrix assume different ranges for YUV values:
507+ // - The **matrix coefficient** must assume that Y is in [0, 1] and U,V are in
508+ // [-0.5, 0.5]. That's how we defined our matrix above.
509+ // - The function `nppiNV12ToRGB_8u_ColorTwist32f_P2C3R_Ctx` however expects all
510+ // of the input Y, U, V to be in [0, 255]. That's how the data comes out of
511+ // the decoder.
512+ // - But *internally*, `nppiNV12ToRGB_8u_ColorTwist32f_P2C3R_Ctx` needs U and V to
513+ // be centered around 0, i.e. in [-128, 127]. So we need to apply a -128
514+ // offset to U and V. Y doesn't need to be offset. The offset can be applied
515+ // by adding a 4th column to the matrix.
516+ //
517+ //
518+ // So our conversion matrix becomes the following, with new offset column:
519+ // tensor([[ 1.0000e+00, -3.3142e-09, 1.5748e+00, 0]
520+ // [ 1.0000e+00, -1.8732e-01, -4.6812e-01, -128]
521+ // [ 1.0000e+00, 1.8556e+00, 4.6231e-09 , -128]])
522+ //
523+ // And that's what we need to pass for BT701, full range.
524+ /* clang-format on */
0 commit comments