
Commit 25e42e1

Address some TODONVDEC (#960)
1 parent fdd8833 commit 25e42e1

8 files changed: 49 additions, 63 deletions


src/torchcodec/_core/BetaCudaDeviceInterface.cpp

Lines changed: 5 additions & 5 deletions

@@ -216,12 +216,11 @@ BetaCudaDeviceInterface::~BetaCudaDeviceInterface() {
     // unclear.
     flush();
     unmapPreviousFrame();
-    NVDECCache::getCache(device_.index())
-        .returnDecoder(&videoFormat_, std::move(decoder_));
+    NVDECCache::getCache(device_).returnDecoder(
+        &videoFormat_, std::move(decoder_));
   }

   if (videoParser_) {
-    // TODONVDEC P2: consider caching this? Does DALI do that?
     cuvidDestroyVideoParser(videoParser_);
     videoParser_ = nullptr;
   }

@@ -362,11 +361,12 @@ int BetaCudaDeviceInterface::streamPropertyChange(CUVIDEOFORMAT* videoFormat) {
   }

   if (!decoder_) {
-    decoder_ = NVDECCache::getCache(device_.index()).getDecoder(videoFormat);
+    decoder_ = NVDECCache::getCache(device_).getDecoder(videoFormat);

     if (!decoder_) {
       // TODONVDEC P2: consider re-configuring an existing decoder instead of
-      // re-creating one. See docs, see DALI.
+      // re-creating one. See docs, see DALI. Re-configuration doesn't seem to
+      // be enabled in DALI by default.
       decoder_ = createDecoder(videoFormat);
     }
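
For context, the destructor in this hunk hands the decoder back to a per-GPU cache instead of destroying it. Below is a minimal standalone sketch of that return-to-cache-on-destruction pattern; Pool, Decoder, and Interface are invented names for illustration, not torchcodec's actual types.

#include <cstdio>
#include <memory>
#include <utility>
#include <vector>

struct Decoder {
  int id;
};

// A tiny stand-in for NVDECCache: decoders handed back here can be reused.
struct Pool {
  std::vector<std::unique_ptr<Decoder>> free_;

  void returnDecoder(std::unique_ptr<Decoder> d) {
    free_.push_back(std::move(d)); // keep it around for the next consumer
  }

  std::unique_ptr<Decoder> get() {
    if (free_.empty()) {
      return nullptr; // the caller creates a fresh decoder in this case
    }
    auto d = std::move(free_.back());
    free_.pop_back();
    return d;
  }
};

struct Interface {
  Pool& pool_;
  std::unique_ptr<Decoder> decoder_;

  explicit Interface(Pool& pool) : pool_(pool), decoder_(pool.get()) {
    if (!decoder_) {
      decoder_ = std::make_unique<Decoder>(Decoder{42});
    }
  }

  ~Interface() {
    // Like the destructor in the diff: return the decoder to the cache
    // instead of destroying it, so the next Interface can reuse it.
    if (decoder_) {
      pool_.returnDecoder(std::move(decoder_));
    }
  }
};

int main() {
  Pool pool;
  { Interface first(pool); } // creates decoder 42, returns it on destruction
  Interface second(pool);    // picks up the cached decoder instead of creating one
  std::printf("reused decoder id: %d\n", second.decoder_->id); // prints 42
  return 0;
}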

src/torchcodec/_core/CUDACommon.cpp

Lines changed: 21 additions & 6 deletions

@@ -5,14 +5,12 @@
 // LICENSE file in the root directory of this source tree.

 #include "src/torchcodec/_core/CUDACommon.h"
+#include "src/torchcodec/_core/Cache.h" // for PerGpuCache

 namespace facebook::torchcodec {

 namespace {

-// Pytorch can only handle up to 128 GPUs.
-// https://github.com/pytorch/pytorch/blob/e30c55ee527b40d67555464b9e402b4b7ce03737/c10/cuda/CUDAMacros.h#L44
-const int MAX_CUDA_GPUS = 128;
 // Set to -1 to have an infinitely sized cache. Set it to 0 to disable caching.
 // Set to a positive number to have a cache of that size.
 const int MAX_CONTEXTS_PER_GPU_IN_CACHE = -1;

@@ -249,7 +247,7 @@ torch::Tensor convertNV12FrameToRGB(
 }

 UniqueNppContext getNppStreamContext(const torch::Device& device) {
-  torch::DeviceIndex nonNegativeDeviceIndex = getNonNegativeDeviceIndex(device);
+  int deviceIndex = getDeviceIndex(device);

   UniqueNppContext nppCtx = g_cached_npp_ctxs.get(device);
   if (nppCtx) {

@@ -266,13 +264,13 @@ UniqueNppContext getNppStreamContext(const torch::Device& device) {

   nppCtx = std::make_unique<NppStreamContext>();
   cudaDeviceProp prop{};
-  cudaError_t err = cudaGetDeviceProperties(&prop, nonNegativeDeviceIndex);
+  cudaError_t err = cudaGetDeviceProperties(&prop, deviceIndex);
   TORCH_CHECK(
       err == cudaSuccess,
       "cudaGetDeviceProperties failed: ",
       cudaGetErrorString(err));

-  nppCtx->nCudaDeviceId = nonNegativeDeviceIndex;
+  nppCtx->nCudaDeviceId = deviceIndex;
   nppCtx->nMultiProcessorCount = prop.multiProcessorCount;
   nppCtx->nMaxThreadsPerMultiProcessor = prop.maxThreadsPerMultiProcessor;
   nppCtx->nMaxThreadsPerBlock = prop.maxThreadsPerBlock;

@@ -312,4 +310,21 @@ void validatePreAllocatedTensorShape(
   }
 }

+int getDeviceIndex(const torch::Device& device) {
+  // PyTorch uses int8_t as its torch::DeviceIndex, but FFmpeg and CUDA
+  // libraries use int. So we use int, too.
+  int deviceIndex = static_cast<int>(device.index());
+  TORCH_CHECK(
+      deviceIndex >= -1 && deviceIndex < MAX_CUDA_GPUS,
+      "Invalid device index = ",
+      deviceIndex);
+
+  if (deviceIndex == -1) {
+    TORCH_CHECK(
+        cudaGetDevice(&deviceIndex) == cudaSuccess,
+        "Failed to get current CUDA device.");
+  }
+  return deviceIndex;
+}
+
 } // namespace facebook::torchcodec
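
The new getDeviceIndex() resolves the -1 index that torch::Device("cuda") reports into a concrete device ordinal. Below is a minimal standalone sketch of the same fallback using only the CUDA runtime; resolveDeviceIndex is an invented name, not the committed function.

#include <cstdio>
#include <cuda_runtime.h>

// -1 means "whichever device is current"; a non-negative index is used as-is.
int resolveDeviceIndex(int requestedIndex) {
  if (requestedIndex >= 0) {
    return requestedIndex; // explicit index, e.g. from torch::Device("cuda:1")
  }
  int currentIndex = 0;
  if (cudaGetDevice(&currentIndex) != cudaSuccess) {
    std::fprintf(stderr, "Failed to get current CUDA device.\n");
    return 0; // fall back to device 0 in this toy example
  }
  return currentIndex;
}

int main() {
  // torch::Device("cuda") yields index -1; the CUDA runtime tells us which
  // device that actually maps to.
  std::printf("resolved index: %d\n", resolveDeviceIndex(-1));
  return 0;
}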

src/torchcodec/_core/CUDACommon.h

Lines changed: 6 additions & 1 deletion

@@ -11,7 +11,6 @@
 #include <npp.h>
 #include <torch/types.h>

-#include "src/torchcodec/_core/Cache.h"
 #include "src/torchcodec/_core/FFMPEGCommon.h"
 #include "src/torchcodec/_core/Frame.h"

@@ -22,6 +21,10 @@ extern "C" {

 namespace facebook::torchcodec {

+// Pytorch can only handle up to 128 GPUs.
+// https://github.com/pytorch/pytorch/blob/e30c55ee527b40d67555464b9e402b4b7ce03737/c10/cuda/CUDAMacros.h#L44
+constexpr int MAX_CUDA_GPUS = 128;
+
 void initializeCudaContextWithPytorch(const torch::Device& device);

 // Unique pointer type for NPP stream context

@@ -43,4 +46,6 @@ void validatePreAllocatedTensorShape(
     const std::optional<torch::Tensor>& preAllocatedOutputTensor,
     const UniqueAVFrame& avFrame);

+int getDeviceIndex(const torch::Device& device);
+
 } // namespace facebook::torchcodec

src/torchcodec/_core/Cache.h

Lines changed: 6 additions & 20 deletions

@@ -95,30 +95,16 @@ class PerGpuCache {
   std::vector<std::unique_ptr<Cache<T, D>>> cache_;
 };

-// Note: this function is inline for convenience, not performance. Because the
-// rest of this file is template functions, they must all be defined in this
-// header. This function is not a template function, and should, in principle,
-// be defined in a .cpp file to preserve the One Definition Rule. That's
-// annoying for such a small amount of code, so we just inline it. If this file
-// grows, and there are more such functions, we should break them out into a
-// .cpp file.
-inline torch::DeviceIndex getNonNegativeDeviceIndex(
-    const torch::Device& device) {
-  torch::DeviceIndex deviceIndex = device.index();
-  // For single GPU machines libtorch returns -1 for the device index. So for
-  // that case we set the device index to 0. That's used in per-gpu cache
-  // implementation and during initialization of CUDA and FFmpeg contexts
-  // which require non negative indices.
-  deviceIndex = std::max<at::DeviceIndex>(deviceIndex, 0);
-  TORCH_CHECK(deviceIndex >= 0, "Device index out of range");
-  return deviceIndex;
-}
+// Forward declaration of getDeviceIndex which exists in CUDACommon.h
+// This avoids circular dependency between Cache.h and CUDACommon.cpp which also
+// needs to include Cache.h
+int getDeviceIndex(const torch::Device& device);

 template <typename T, typename D>
 bool PerGpuCache<T, D>::addIfCacheHasCapacity(
     const torch::Device& device,
     element_type&& obj) {
-  torch::DeviceIndex deviceIndex = getNonNegativeDeviceIndex(device);
+  int deviceIndex = getDeviceIndex(device);
   TORCH_CHECK(
       static_cast<size_t>(deviceIndex) < cache_.size(),
       "Device index out of range");

@@ -128,7 +114,7 @@ bool PerGpuCache<T, D>::addIfCacheHasCapacity(
 template <typename T, typename D>
 typename PerGpuCache<T, D>::element_type PerGpuCache<T, D>::get(
     const torch::Device& device) {
-  torch::DeviceIndex deviceIndex = getNonNegativeDeviceIndex(device);
+  int deviceIndex = getDeviceIndex(device);
   TORCH_CHECK(
       static_cast<size_t>(deviceIndex) < cache_.size(),
       "Device index out of range");

src/torchcodec/_core/CudaDeviceInterface.cpp

Lines changed: 3 additions & 6 deletions

@@ -32,9 +32,6 @@ static bool g_cuda = registerDeviceInterface(
 // from
 // the cache. If the cache is empty we create a new cuda context.

-// Pytorch can only handle up to 128 GPUs.
-// https://github.com/pytorch/pytorch/blob/e30c55ee527b40d67555464b9e402b4b7ce03737/c10/cuda/CUDAMacros.h#L44
-const int MAX_CUDA_GPUS = 128;
 // Set to -1 to have an infinitely sized cache. Set it to 0 to disable caching.
 // Set to a positive number to have a cache of that size.
 const int MAX_CONTEXTS_PER_GPU_IN_CACHE = -1;

@@ -54,7 +51,7 @@ int getFlagsAVHardwareDeviceContextCreate() {
 UniqueAVBufferRef getHardwareDeviceContext(const torch::Device& device) {
   enum AVHWDeviceType type = av_hwdevice_find_type_by_name("cuda");
   TORCH_CHECK(type != AV_HWDEVICE_TYPE_NONE, "Failed to find cuda device");
-  torch::DeviceIndex nonNegativeDeviceIndex = getNonNegativeDeviceIndex(device);
+  int deviceIndex = getDeviceIndex(device);

   UniqueAVBufferRef hardwareDeviceCtx = g_cached_hw_device_ctxs.get(device);
   if (hardwareDeviceCtx) {

@@ -68,9 +65,9 @@ UniqueAVBufferRef getHardwareDeviceContext(const torch::Device& device) {
   // So we ensure the deviceIndex is not negative.
   // We set the device because we may be called from a different thread than
   // the one that initialized the cuda context.
-  cudaSetDevice(nonNegativeDeviceIndex);
+  cudaSetDevice(deviceIndex);
   AVBufferRef* hardwareDeviceCtxRaw = nullptr;
-  std::string deviceOrdinal = std::to_string(nonNegativeDeviceIndex);
+  std::string deviceOrdinal = std::to_string(deviceIndex);

   int err = av_hwdevice_ctx_create(
       &hardwareDeviceCtxRaw,

src/torchcodec/_core/NVDECCache.cpp

Lines changed: 3 additions & 13 deletions

@@ -7,6 +7,7 @@
 #include <torch/types.h>
 #include <mutex>

+#include "src/torchcodec/_core/CUDACommon.h"
 #include "src/torchcodec/_core/FFMPEGCommon.h"
 #include "src/torchcodec/_core/NVDECCache.h"

@@ -19,20 +20,9 @@ extern "C" {

 namespace facebook::torchcodec {

-NVDECCache& NVDECCache::getCache(int deviceIndex) {
-  const int MAX_CUDA_GPUS = 128;
-  TORCH_CHECK(
-      deviceIndex >= -1 && deviceIndex < MAX_CUDA_GPUS,
-      "Invalid device index = ",
-      deviceIndex);
+NVDECCache& NVDECCache::getCache(const torch::Device& device) {
   static NVDECCache cacheInstances[MAX_CUDA_GPUS];
-  if (deviceIndex == -1) {
-    // TODO NVDEC P3: Unify with existing getNonNegativeDeviceIndex()
-    TORCH_CHECK(
-        cudaGetDevice(&deviceIndex) == cudaSuccess,
-        "Failed to get current CUDA device.");
-  }
-  return cacheInstances[deviceIndex];
+  return cacheInstances[getDeviceIndex(device)];
 }

 UniqueCUvideodecoder NVDECCache::getDecoder(CUVIDEOFORMAT* videoFormat) {
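
getCache() above keeps one statically allocated cache instance per possible GPU and indexes into it with the resolved device index. A minimal sketch of that per-device singleton pattern follows; PerGpuState, getState, and kMaxGpus are illustrative names, not the committed code.

#include <cstdio>

constexpr int kMaxGpus = 128; // mirrors MAX_CUDA_GPUS in CUDACommon.h

struct PerGpuState {
  int hits = 0;
};

PerGpuState& getState(int deviceIndex) {
  // A function-local static array: initialized once, thread-safe since C++11,
  // and each device index always maps to the same instance.
  static PerGpuState instances[kMaxGpus];
  return instances[deviceIndex];
}

int main() {
  getState(0).hits++;
  getState(0).hits++;
  getState(1).hits++;
  std::printf("device 0: %d, device 1: %d\n", getState(0).hits, getState(1).hits); // 2, 1
  return 0;
}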

src/torchcodec/_core/NVDECCache.h

Lines changed: 2 additions & 6 deletions

@@ -11,6 +11,7 @@
 #include <mutex>

 #include <cuda.h>
+#include <torch/types.h>
 #include "src/torchcodec/_core/nvcuvid_include/cuviddec.h"
 #include "src/torchcodec/_core/nvcuvid_include/nvcuvid.h"

@@ -36,7 +37,7 @@ using UniqueCUvideodecoder =
 // per GPU device, and it is accessed through the static getCache() method.
 class NVDECCache {
  public:
-  static NVDECCache& getCache(int deviceIndex);
+  static NVDECCache& getCache(const torch::Device& device);

   // Get decoder from cache - returns nullptr if none available
   UniqueCUvideodecoder getDecoder(CUVIDEOFORMAT* videoFormat);

@@ -68,11 +69,6 @@ class NVDECCache {
   CacheKey(const CacheKey&) = default;
   CacheKey& operator=(const CacheKey&) = default;

-  // TODONVDEC P2: we only implement operator< which is enough for std::map,
-  // but:
-  // - we should consider using std::unordered_map
-  // - we should consider a more sophisticated and potentially less strict
-  //   cache key comparison logic
   bool operator<(const CacheKey& other) const {
     return std::tie(
         codecType,
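
CacheKey::operator< above compares all fields lexicographically via std::tie, which gives std::map the strict weak ordering it requires. A small sketch of the idiom; the Key fields here are illustrative, not the real CacheKey layout.

#include <cstdio>
#include <map>
#include <tuple>

struct Key {
  int codec;
  int width;
  int height;

  bool operator<(const Key& other) const {
    // Lexicographic comparison over all fields, in declaration order.
    return std::tie(codec, width, height) <
        std::tie(other.codec, other.width, other.height);
  }
};

int main() {
  std::map<Key, const char*> cache;
  cache[{1, 1920, 1080}] = "h264 1080p decoder";
  cache[{1, 1280, 720}] = "h264 720p decoder";
  std::printf("%zu entries\n", cache.size()); // prints 2
  return 0;
}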

test/utils.py

Lines changed: 3 additions & 6 deletions

@@ -41,9 +41,6 @@ def unsplit_device_str(device_str: str) -> str:
     # It is used:
     # - before calling `.to(device)` where device can't be "cuda:0:beta"
     # - before calling add_video_stream(device=device, device_variant=device_variant)
-    #
-    # TODONVDEC P2: Find a less clunky way to test the BETA CUDA interface. It
-    # will ultimately depend on how we want to publicly expose it.
     if device_str == "cuda:0:beta":
         return "cuda", "beta"
     else:

@@ -750,7 +747,7 @@ def sample_format(self) -> str:


 def supports_approximate_mode(asset: TestVideo) -> bool:
-    # TODONVDEC P2: open an issue about his. That's actually not related to
-    # NVDEC at all, those don't support approximate mode because they don't set
-    # a duration. CPU decoder fails too!
+    # Those are missing the `duration` field so they fail in approximate mode (on all devices).
+    # TODO: we should address this, see
+    # https://github.com/meta-pytorch/torchcodec/issues/945
     return asset not in (AV1_VIDEO, TEST_SRC_2_720P_VP9, TEST_SRC_2_720P_VP8)
