140 changes: 2 additions & 138 deletions src/torchcodec/_core/BetaCudaDeviceInterface.cpp
@@ -674,149 +674,13 @@ void BetaCudaDeviceInterface::flush() {
std::swap(readyFrames_, emptyQueue);
}

UniqueAVFrame BetaCudaDeviceInterface::transferCpuFrameToGpuNV12(
UniqueAVFrame& cpuFrame) {
// This is called in the context of the CPU fallback: the frame was decoded on
// the CPU, and in this function we convert that frame into NV12 format and
// send it to the GPU.
// We do that in 2 steps:
// - First we convert the input CPU frame into an intermediate NV12 CPU frame
// using sws_scale.
// - Then we allocate GPU memory and copy the NV12 CPU frame to the GPU. This
// is what we return

TORCH_CHECK(cpuFrame != nullptr, "CPU frame cannot be null");

int width = cpuFrame->width;
int height = cpuFrame->height;

// intermediate NV12 CPU frame. It's not on the GPU yet.
UniqueAVFrame nv12CpuFrame(av_frame_alloc());
TORCH_CHECK(nv12CpuFrame != nullptr, "Failed to allocate NV12 CPU frame");

nv12CpuFrame->format = AV_PIX_FMT_NV12;
nv12CpuFrame->width = width;
nv12CpuFrame->height = height;

int ret = av_frame_get_buffer(nv12CpuFrame.get(), 0);
TORCH_CHECK(
ret >= 0,
"Failed to allocate NV12 CPU frame buffer: ",
getFFMPEGErrorStringFromErrorCode(ret));

SwsFrameContext swsFrameContext(
width,
height,
static_cast<AVPixelFormat>(cpuFrame->format),
width,
height);

if (!swsContext_ || prevSwsFrameContext_ != swsFrameContext) {
swsContext_ = createSwsContext(
swsFrameContext, cpuFrame->colorspace, AV_PIX_FMT_NV12, SWS_BILINEAR);
prevSwsFrameContext_ = swsFrameContext;
}

int convertedHeight = sws_scale(
swsContext_.get(),
cpuFrame->data,
cpuFrame->linesize,
0,
height,
nv12CpuFrame->data,
nv12CpuFrame->linesize);
TORCH_CHECK(
convertedHeight == height, "sws_scale failed for CPU->NV12 conversion");

int ySize = width * height;
TORCH_CHECK(
ySize % 2 == 0,
"Y plane size must be even. Please report on TorchCodec repo.");
int uvSize = ySize / 2; // NV12: UV plane is half the size of Y plane
size_t totalSize = static_cast<size_t>(ySize + uvSize);

uint8_t* cudaBuffer = nullptr;
cudaError_t err =
cudaMalloc(reinterpret_cast<void**>(&cudaBuffer), totalSize);
TORCH_CHECK(
err == cudaSuccess,
"Failed to allocate CUDA memory: ",
cudaGetErrorString(err));

UniqueAVFrame gpuFrame(av_frame_alloc());
TORCH_CHECK(gpuFrame != nullptr, "Failed to allocate GPU AVFrame");

gpuFrame->format = AV_PIX_FMT_CUDA;
gpuFrame->width = width;
gpuFrame->height = height;
gpuFrame->data[0] = cudaBuffer;
gpuFrame->data[1] = cudaBuffer + ySize;
gpuFrame->linesize[0] = width;
gpuFrame->linesize[1] = width;

// Note that we use cudaMemcpy2D here instead of cudaMemcpy because the
// linesizes (strides) may be different than the widths for the input CPU
// frame. That's precisely what cudaMemcpy2D is for.
err = cudaMemcpy2D(
gpuFrame->data[0],
gpuFrame->linesize[0],
nv12CpuFrame->data[0],
nv12CpuFrame->linesize[0],
width,
height,
cudaMemcpyHostToDevice);
TORCH_CHECK(
err == cudaSuccess,
"Failed to copy Y plane to GPU: ",
cudaGetErrorString(err));

TORCH_CHECK(
height % 2 == 0,
"height must be even. Please report on TorchCodec repo.");
err = cudaMemcpy2D(
gpuFrame->data[1],
gpuFrame->linesize[1],
nv12CpuFrame->data[1],
nv12CpuFrame->linesize[1],
width,
height / 2,
cudaMemcpyHostToDevice);
TORCH_CHECK(
err == cudaSuccess,
"Failed to copy UV plane to GPU: ",
cudaGetErrorString(err));

ret = av_frame_copy_props(gpuFrame.get(), cpuFrame.get());
TORCH_CHECK(
ret >= 0,
"Failed to copy frame properties: ",
getFFMPEGErrorStringFromErrorCode(ret));

// We're almost done, but we need to make sure the CUDA memory is freed
// properly. Usually, AVFrame data is freed when av_frame_free() is called
// (upon UniqueAVFrame destruction), but since we allocated the CUDA memory
// ourselves, FFmpeg doesn't know how to free it. The recommended way to deal
// with this is to associate the opaque_ref field of the AVFrame with a `free`
// callback that will then be called by av_frame_free().
gpuFrame->opaque_ref = av_buffer_create(
nullptr, // data - we don't need any
0, // data size
cudaBufferFreeCallback, // callback triggered by av_frame_free()
cudaBuffer, // parameter to callback
0); // flags
TORCH_CHECK(
gpuFrame->opaque_ref != nullptr,
"Failed to create GPU memory cleanup reference");

return gpuFrame;
}

void BetaCudaDeviceInterface::convertAVFrameToFrameOutput(
UniqueAVFrame& avFrame,
FrameOutput& frameOutput,
std::optional<torch::Tensor> preAllocatedOutputTensor) {
UniqueAVFrame gpuFrame =
cpuFallback_ ? transferCpuFrameToGpuNV12(avFrame) : std::move(avFrame);
cpuFallback_ ? transferCpuFrameToGpuNV12(avFrame, swsCtx_, device_)
: std::move(avFrame);

// TODONVDEC P2: we may need to handle 10bit videos the same way the CUDA
// ffmpeg interface does it with maybeConvertAVFrameToNV12OrRGB24().
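For reference, the function moved out of this file depends on NV12's plane layout: a full-resolution Y plane followed by one interleaved, 2x2-subsampled UV plane in a single contiguous buffer. A minimal standalone sketch of that sizing math (names here are illustrative only):

```cpp
// Illustration of the NV12 sizing used by transferCpuFrameToGpuNV12: one
// contiguous buffer holds the Y plane (width * height bytes) followed by the
// interleaved CbCr plane, which is half the Y size due to 2x2 subsampling.
#include <cassert>
#include <cstddef>

struct Nv12Layout {
  size_t ySize;    // luma plane: 1 byte per pixel
  size_t uvSize;   // interleaved CbCr: (w/2) * (h/2) * 2 bytes == ySize / 2
  size_t uvOffset; // UV plane starts immediately after Y
  size_t total;
};

Nv12Layout nv12Layout(int width, int height) {
  assert(width % 2 == 0 && height % 2 == 0); // NV12 requires even dimensions
  size_t ySize = static_cast<size_t>(width) * static_cast<size_t>(height);
  return {ySize, ySize / 2, /*uvOffset=*/ySize, /*total=*/ySize + ySize / 2};
}
```

This mirrors the ySize/uvSize arithmetic and the `data[1] = cudaBuffer + ySize` offset in the moved code.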
9 changes: 5 additions & 4 deletions src/torchcodec/_core/BetaCudaDeviceInterface.h
@@ -20,6 +20,7 @@
#include "src/torchcodec/_core/DeviceInterface.h"
#include "src/torchcodec/_core/FFMPEGCommon.h"
#include "src/torchcodec/_core/NVDECCache.h"
#include "src/torchcodec/_core/SwsContext.h"

#include <map>
#include <memory>
@@ -81,8 +82,6 @@ class BetaCudaDeviceInterface : public DeviceInterface {
unsigned int pitch,
const CUVIDPARSERDISPINFO& dispInfo);

UniqueAVFrame transferCpuFrameToGpuNV12(UniqueAVFrame& cpuFrame);

CUvideoparser videoParser_ = nullptr;
UniqueCUvideodecoder decoder_;
CUVIDEOFORMAT videoFormat_ = {};
@@ -101,8 +100,10 @@

std::unique_ptr<DeviceInterface> cpuFallback_;
bool nvcuvidAvailable_ = false;
UniqueSwsContext swsContext_;
SwsFrameContext prevSwsFrameContext_;

// Swscale context cache for GPU transfer during CPU fallback.
// Used to convert CPU frames to NV12 before transferring to GPU.
SwsScaler swsCtx_;
};

} // namespace facebook::torchcodec
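SwsContext.h and SwsContext.cpp are added by this PR but not shown in this diff, so the exact SwsScaler interface is not visible here. Judging only from the two call sites (getOrCreateContext(frame, outputDims, colorspace, format, flags) returning a raw context passed straight to sws_scale), a caching wrapper might look roughly like this sketch; the member layout and the reuse of SwsFrameContext and createSwsContext from the removed inline logic are assumptions:

```cpp
// Hypothetical sketch, not the actual SwsContext.h from this PR.
class SwsScaler {
 public:
  SwsContext* getOrCreateContext(
      const UniqueAVFrame& srcFrame,
      const FrameDims& outputDims,
      AVColorSpace colorspace,
      AVPixelFormat outputFormat,
      int flags) {
    SwsFrameContext current(
        srcFrame->width,
        srcFrame->height,
        static_cast<AVPixelFormat>(srcFrame->format),
        outputDims.width,
        outputDims.height);
    // Recreate only when the frame geometry or format changed; otherwise
    // reuse the cached context for performance, as the old inline logic did.
    if (!context_ || prevFrameContext_ != current) {
      context_ = createSwsContext(current, colorspace, outputFormat, flags);
      prevFrameContext_ = current;
    }
    return context_.get();
  }

 private:
  UniqueSwsContext context_;
  SwsFrameContext prevFrameContext_;
};
```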
1 change: 1 addition & 0 deletions src/torchcodec/_core/CMakeLists.txt
@@ -96,6 +96,7 @@ function(make_torchcodec_libraries
Encoder.cpp
ValidationUtils.cpp
Transform.cpp
SwsContext.cpp
)

if(ENABLE_CUDA)
133 changes: 133 additions & 0 deletions src/torchcodec/_core/CUDACommon.cpp
@@ -327,4 +327,137 @@ int getDeviceIndex(const torch::Device& device) {
return deviceIndex;
}

// Callback for freeing CUDA memory associated with AVFrame
void cudaBufferFreeCallback(void* opaque, [[maybe_unused]] uint8_t* data) {
cudaFree(opaque);
}

UniqueAVFrame transferCpuFrameToGpuNV12(
UniqueAVFrame& cpuFrame,
SwsScaler& swsCtx,
[[maybe_unused]] const torch::Device& device) {
// This function converts a CPU frame to NV12 format and transfers it to GPU.
// We do that in 2 steps:
// - First we convert the input CPU frame into an intermediate NV12 CPU frame
// using sws_scale.
// - Then we allocate GPU memory and copy the NV12 CPU frame to the GPU. This
// is what we return.

TORCH_CHECK(cpuFrame != nullptr, "CPU frame cannot be null");

int width = cpuFrame->width;
int height = cpuFrame->height;

// Intermediate NV12 CPU frame. It's not on the GPU yet.
UniqueAVFrame nv12CpuFrame(av_frame_alloc());
TORCH_CHECK(nv12CpuFrame != nullptr, "Failed to allocate NV12 CPU frame");

nv12CpuFrame->format = AV_PIX_FMT_NV12;
nv12CpuFrame->width = width;
nv12CpuFrame->height = height;

int ret = av_frame_get_buffer(nv12CpuFrame.get(), 0);
TORCH_CHECK(
ret >= 0,
"Failed to allocate NV12 CPU frame buffer: ",
getFFMPEGErrorStringFromErrorCode(ret));

FrameDims outputDims(height, width);
auto swsContext = swsCtx.getOrCreateContext(
cpuFrame, outputDims, cpuFrame->colorspace, AV_PIX_FMT_NV12, SWS_BILINEAR);

int convertedHeight = sws_scale(
swsContext,
cpuFrame->data,
cpuFrame->linesize,
0,
height,
nv12CpuFrame->data,
nv12CpuFrame->linesize);
TORCH_CHECK(
convertedHeight == height, "sws_scale failed for CPU->NV12 conversion");

int ySize = width * height;
TORCH_CHECK(
ySize % 2 == 0,
"Y plane size must be even. Please report on TorchCodec repo.");
int uvSize = ySize / 2; // NV12: UV plane is half the size of Y plane
size_t totalSize = static_cast<size_t>(ySize + uvSize);

uint8_t* cudaBuffer = nullptr;
cudaError_t err =
cudaMalloc(reinterpret_cast<void**>(&cudaBuffer), totalSize);
TORCH_CHECK(
err == cudaSuccess,
"Failed to allocate CUDA memory: ",
cudaGetErrorString(err));

UniqueAVFrame gpuFrame(av_frame_alloc());
TORCH_CHECK(gpuFrame != nullptr, "Failed to allocate GPU AVFrame");

gpuFrame->format = AV_PIX_FMT_CUDA;
gpuFrame->width = width;
gpuFrame->height = height;
gpuFrame->data[0] = cudaBuffer;
gpuFrame->data[1] = cudaBuffer + ySize;
gpuFrame->linesize[0] = width;
gpuFrame->linesize[1] = width;

// Note that we use cudaMemcpy2D here instead of cudaMemcpy because the
// linesizes (strides) may be different than the widths for the input CPU
// frame. That's precisely what cudaMemcpy2D is for.
err = cudaMemcpy2D(
gpuFrame->data[0],
gpuFrame->linesize[0],
nv12CpuFrame->data[0],
nv12CpuFrame->linesize[0],
width,
height,
cudaMemcpyHostToDevice);
TORCH_CHECK(
err == cudaSuccess,
"Failed to copy Y plane to GPU: ",
cudaGetErrorString(err));

TORCH_CHECK(
height % 2 == 0,
"height must be even. Please report on TorchCodec repo.");
err = cudaMemcpy2D(
gpuFrame->data[1],
gpuFrame->linesize[1],
nv12CpuFrame->data[1],
nv12CpuFrame->linesize[1],
width,
height / 2,
cudaMemcpyHostToDevice);
TORCH_CHECK(
err == cudaSuccess,
"Failed to copy UV plane to GPU: ",
cudaGetErrorString(err));

ret = av_frame_copy_props(gpuFrame.get(), cpuFrame.get());
TORCH_CHECK(
ret >= 0,
"Failed to copy frame properties: ",
getFFMPEGErrorStringFromErrorCode(ret));

// We're almost done, but we need to make sure the CUDA memory is freed
// properly. Usually, AVFrame data is freed when av_frame_free() is called
// (upon UniqueAVFrame destruction), but since we allocated the CUDA memory
// ourselves, FFmpeg doesn't know how to free it. The recommended way to deal
// with this is to associate the opaque_ref field of the AVFrame with a `free`
// callback that will then be called by av_frame_free().
gpuFrame->opaque_ref = av_buffer_create(
nullptr, // data - we don't need any
0, // data size
cudaBufferFreeCallback, // callback triggered by av_frame_free()
cudaBuffer, // parameter to callback
0); // flags
TORCH_CHECK(
gpuFrame->opaque_ref != nullptr,
"Failed to create GPU memory cleanup reference");

return gpuFrame;
}

} // namespace facebook::torchcodec
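The cudaMemcpy2D rationale in the comments above is worth a standalone illustration: unlike cudaMemcpy, it copies a fixed number of bytes per row while honoring independent source and destination pitches, which is what padded FFmpeg linesizes require. A minimal sketch with illustrative sizes:

```cpp
// Copy a 640x480 byte plane whose host rows are padded to a 672-byte stride
// into a tightly packed device buffer (pitch == width).
#include <cuda_runtime.h>
#include <cstdint>
#include <vector>

int main() {
  const int width = 640, height = 480;
  const int srcPitch = 672; // e.g. an FFmpeg linesize with row padding

  std::vector<uint8_t> host(static_cast<size_t>(srcPitch) * height, 0);

  uint8_t* devBuf = nullptr;
  cudaError_t err = cudaMalloc(
      reinterpret_cast<void**>(&devBuf), static_cast<size_t>(width) * height);
  if (err != cudaSuccess) {
    return 1;
  }

  // Copies `height` rows of `width` bytes each: rows are srcPitch bytes apart
  // on the host but width bytes apart on the device. A plain cudaMemcpy would
  // either copy the padding or misalign every row after the first.
  err = cudaMemcpy2D(
      devBuf, width, host.data(), srcPitch, width, height,
      cudaMemcpyHostToDevice);

  cudaFree(devBuf);
  return err == cudaSuccess ? 0 : 1;
}
```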
8 changes: 8 additions & 0 deletions src/torchcodec/_core/CUDACommon.h
@@ -13,6 +13,7 @@

#include "src/torchcodec/_core/FFMPEGCommon.h"
#include "src/torchcodec/_core/Frame.h"
#include "src/torchcodec/_core/SwsContext.h"

extern "C" {
#include <libavutil/hwcontext_cuda.h>
@@ -48,4 +49,11 @@

int getDeviceIndex(const torch::Device& device);

// Convert CPU frame to NV12 and transfer to GPU for GPU-accelerated color
// conversion. Used during CPU fallback to move color conversion to GPU.
UniqueAVFrame transferCpuFrameToGpuNV12(
UniqueAVFrame& cpuFrame,
SwsScaler& swsCtx,
const torch::Device& device);

} // namespace facebook::torchcodec
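The opaque_ref cleanup pattern used by the function declared above can be shown in isolation: FFmpeg frees AVFrame data it allocated itself, but it cannot know how to release an externally owned CUDA buffer, so a zero-sized AVBufferRef with a custom free callback is attached and av_frame_free() triggers that callback. The helper names below are illustrative only:

```cpp
// Minimal sketch of attaching CUDA cleanup to an AVFrame via opaque_ref.
extern "C" {
#include <libavutil/buffer.h>
#include <libavutil/frame.h>
}
#include <cuda_runtime.h>

static void freeCudaBuffer(void* opaque, uint8_t* /*data*/) {
  cudaFree(opaque); // `opaque` carries the raw CUDA pointer
}

AVFrame* wrapCudaBuffer(uint8_t* cudaBuffer) {
  AVFrame* frame = av_frame_alloc();
  if (!frame) {
    return nullptr;
  }
  frame->opaque_ref = av_buffer_create(
      nullptr, 0,     // no CPU-visible payload; this buffer exists only
      freeCudaBuffer, // so this callback runs inside av_frame_free()
      cudaBuffer,     // handed back to the callback as `opaque`
      0);             // flags
  if (!frame->opaque_ref) {
    av_frame_free(&frame);
    return nullptr;
  }
  return frame;
}
```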
28 changes: 5 additions & 23 deletions src/torchcodec/_core/CpuDeviceInterface.cpp
@@ -215,35 +215,17 @@ int CpuDeviceInterface::convertAVFrameToTensorUsingSwScale(
const UniqueAVFrame& avFrame,
torch::Tensor& outputTensor,
const FrameDims& outputDims) {
enum AVPixelFormat frameFormat =
static_cast<enum AVPixelFormat>(avFrame->format);

// We need to compare the current frame context with our previous frame
// context. If they are different, then we need to re-create our colorspace
// conversion objects. We create our colorspace conversion objects late so
// that we don't have to depend on the unreliable metadata in the header.
// And we sometimes re-create them because it's possible for frame
// resolution to change mid-stream. Finally, we want to reuse the colorspace
// conversion objects as much as possible for performance reasons.
SwsFrameContext swsFrameContext(
avFrame->width,
avFrame->height,
frameFormat,
outputDims.width,
outputDims.height);

if (!swsContext_ || prevSwsFrameContext_ != swsFrameContext) {
swsContext_ = createSwsContext(
swsFrameContext, avFrame->colorspace, AV_PIX_FMT_RGB24, swsFlags_);
prevSwsFrameContext_ = swsFrameContext;
}
// Get or create swscale context. The SwsScaler class manages caching
// and recreation logic internally based on frame properties.
auto swsContext = swsCtx_.getOrCreateContext(
avFrame, outputDims, avFrame->colorspace, AV_PIX_FMT_RGB24, swsFlags_);

uint8_t* pointers[4] = {
outputTensor.data_ptr<uint8_t>(), nullptr, nullptr, nullptr};
int expectedOutputWidth = outputTensor.sizes()[1];
int linesizes[4] = {expectedOutputWidth * 3, 0, 0, 0};
int resultHeight = sws_scale(
swsContext_.get(),
swsContext,
avFrame->data,
avFrame->linesize,
0,
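For reference, the destination setup in the hunk above reflects that packed RGB24 occupies a single plane: only pointers[0] and linesizes[0] are populated, with a stride of 3 bytes per pixel. A standalone sketch under that assumption (the helper name is hypothetical):

```cpp
// Hypothetical helper mirroring convertAVFrameToTensorUsingSwScale's
// destination setup: packed RGB24 uses only plane 0, stride = width * 3.
extern "C" {
#include <libavutil/frame.h>
#include <libswscale/swscale.h>
}
#include <cstdint>

bool scaleToRgb24(
    SwsContext* ctx,
    const AVFrame* src,
    uint8_t* dst,
    int dstWidth,
    int dstHeight) {
  uint8_t* dstPlanes[4] = {dst, nullptr, nullptr, nullptr};
  int dstStrides[4] = {dstWidth * 3, 0, 0, 0};
  // sws_scale returns the height of the output slice; a full conversion
  // should yield exactly dstHeight rows.
  int outHeight = sws_scale(
      ctx, src->data, src->linesize, 0, src->height, dstPlanes, dstStrides);
  return outHeight == dstHeight;
}
```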