Rework frame ordering and pts matching

NicolasHug · NicolasHug · commit 5605c9025565 · 2025-10-01T13:43:00.000+01:00
diff --git a/src/torchcodec/_core/BetaCudaDeviceInterface.cpp b/src/torchcodec/_core/BetaCudaDeviceInterface.cpp
@@ -41,10 +41,17 @@ pfnSequenceCallback(void* pUserData, CUVIDEOFORMAT* videoFormat) {
 }
 
 static int CUDAAPI
-pfnDecodePictureCallback(void* pUserData, CUVIDPICPARAMS* pPicParams) {
+pfnDecodePictureCallback(void* pUserData, CUVIDPICPARAMS* picParams) {
   BetaCudaDeviceInterface* decoder =
       static_cast<BetaCudaDeviceInterface*>(pUserData);
-  return decoder->frameReadyForDecoding(pPicParams);
+  return decoder->frameReadyForDecoding(picParams);
+}
+
+static int CUDAAPI
+pfnDisplayPictureCallback(void* pUserData, CUVIDPARSERDISPINFO* dispInfo) {
+  BetaCudaDeviceInterface* decoder =
+      static_cast<BetaCudaDeviceInterface*>(pUserData);
+  return decoder->frameReadyInDisplayOrder(dispInfo);
 }
 
 static UniqueCUvideodecoder createDecoder(CUVIDEOFORMAT* videoFormat) {
@@ -203,7 +210,7 @@ void BetaCudaDeviceInterface::initializeInterface(AVStream* avStream) {
   parserParams.pUserData = this;
   parserParams.pfnSequenceCallback = pfnSequenceCallback;
   parserParams.pfnDecodePicture = pfnDecodePictureCallback;
-  parserParams.pfnDisplayPicture = nullptr;
+  parserParams.pfnDisplayPicture = pfnDisplayPictureCallback;
 
   CUresult result = cuvidCreateVideoParser(&videoParser_, &parserParams);
   TORCH_CHECK(
@@ -259,10 +266,6 @@ int BetaCudaDeviceInterface::sendPacket(ReferenceAVPacket& packet) {
     cuvidPacket.flags = CUVID_PKT_TIMESTAMP;
     cuvidPacket.timestamp = packet->pts;
 
-    // Like DALI: store packet PTS in queue to later assign to frames as they
-    // come out
-    packetsPtsQueue.push(packet->pts);
-
   } else {
     // End of stream packet
     cuvidPacket.flags = CUVID_PKT_ENDOFSTREAM;
@@ -312,68 +315,40 @@ void BetaCudaDeviceInterface::applyBSF(ReferenceAVPacket& packet) {
 // ready to be decoded, i.e. the parser received all the necessary packets for a
 // given frame. It means we can send that frame to be decoded by the hardware
 // NVDEC decoder by calling cuvidDecodePicture which is non-blocking.
-int BetaCudaDeviceInterface::frameReadyForDecoding(CUVIDPICPARAMS* pPicParams) {
+int BetaCudaDeviceInterface::frameReadyForDecoding(CUVIDPICPARAMS* picParams) {
   if (isFlushing_) {
     return 0;
   }
 
-  TORCH_CHECK(pPicParams != nullptr, "Invalid picture parameters");
+  TORCH_CHECK(picParams != nullptr, "Invalid picture parameters");
   TORCH_CHECK(decoder_, "Decoder not initialized before picture decode");
 
   // Send frame to be decoded by NVDEC - non-blocking call.
-  CUresult result = cuvidDecodePicture(*decoder_.get(), pPicParams);
+  CUresult result = cuvidDecodePicture(*decoder_.get(), picParams);
   if (result != CUDA_SUCCESS) {
-    return 0; // Yes, you're reading that right, 0 mean error.
+    return 0; // Yes, you're reading that right, 0 means error.
   }
 
-  // The frame was sent to be decoded on the NVDEC hardware. Now we store some
-  // relevant info into our frame buffer so that we can retrieve the decoded
-  // frame later when receiveFrame() is called.
-  // Importantly we need to 'guess' the PTS of that frame. The heuristic we use
-  // (like in DALI) is that the frames are ready to be decoded in the same order
-  // as the packets were sent to the parser. So we assign the PTS of the frame
-  // by popping the PTS of the oldest packet in our packetsPtsQueue (note:
-  // oldest doesn't necessarily mean lowest PTS!).
-
-  TORCH_CHECK(
-      // TODONVDEC P0 the queue may be empty, handle that.
-      !packetsPtsQueue.empty(),
-      "PTS queue is empty when decoding a frame");
-  int64_t guessedPts = packetsPtsQueue.front();
-  packetsPtsQueue.pop();
-
-  // Field values taken from DALI
-  CUVIDPARSERDISPINFO dispInfo = {};
-  dispInfo.picture_index = pPicParams->CurrPicIdx;
-  dispInfo.progressive_frame = !pPicParams->field_pic_flag;
-  dispInfo.top_field_first = pPicParams->bottom_field_flag ^ 1;
-  dispInfo.repeat_first_field = 0;
-  dispInfo.timestamp = guessedPts;
-
-  FrameBuffer::Slot* slot = frameBuffer_.findEmptySlot();
-  slot->dispInfo = dispInfo;
-  slot->guessedPts = guessedPts;
-  slot->occupied = true;
+  frameBuffer_.markAsBeingDecoded(/*slotId=*/picParams->CurrPicIdx);
+  return 1;
+}
 
+int BetaCudaDeviceInterface::frameReadyInDisplayOrder(
+    CUVIDPARSERDISPINFO* dispInfo) {
+  frameBuffer_.markSlotReadyAndSetInfo(
+      /*slotId=*/dispInfo->picture_index, dispInfo);
   return 1;
 }
 
-// Moral equivalent of avcodec_receive_frame(). Here, we look for a decoded
-// frame with the exact desired PTS in our frame buffer. This logic is only
-// valid in exact seek_mode, for now.
-int BetaCudaDeviceInterface::receiveFrame(
-    UniqueAVFrame& avFrame,
-    int64_t desiredPts) {
-  FrameBuffer::Slot* slot = frameBuffer_.findFrameWithExactPts(desiredPts);
+// Moral equivalent of avcodec_receive_frame().
+int BetaCudaDeviceInterface::receiveFrame(UniqueAVFrame& avFrame) {
+  FrameBuffer::Slot* slot = frameBuffer_.findReadySlotWithLowestPts();
   if (slot == nullptr) {
     // No frame found, instruct caller to try again later after sending more
     // packets.
     return AVERROR(EAGAIN);
   }
 
-  slot->occupied = false;
-  slot->guessedPts = -1;
-
   CUVIDPROCPARAMS procParams = {};
   CUVIDPARSERDISPINFO dispInfo = slot->dispInfo;
   procParams.progressive_frame = dispInfo.progressive_frame;
@@ -382,6 +357,8 @@ int BetaCudaDeviceInterface::receiveFrame(
   CUdeviceptr framePtr = 0;
   unsigned int pitch = 0;
 
+  frameBuffer_.free(slot->slotId);
+
   // We know the frame we want was sent to the hardware decoder, but now we need
   // to "map" it to an "output surface" before we can use its data. This is a
   // blocking calls that waits until the frame is fully decoded and ready to be
@@ -435,7 +412,7 @@ UniqueAVFrame BetaCudaDeviceInterface::convertCudaFrameToAVFrame(
   avFrame->width = width;
   avFrame->height = height;
   avFrame->format = AV_PIX_FMT_CUDA;
-  avFrame->pts = dispInfo.timestamp; // == guessedPts
+  avFrame->pts = dispInfo.timestamp;
 
   unsigned int frameRateNum = videoFormat_.frame_rate.numerator;
   unsigned int frameRateDen = videoFormat_.frame_rate.denominator;
@@ -498,13 +475,7 @@ void BetaCudaDeviceInterface::flush() {
 
   isFlushing_ = false;
 
-  for (auto& slot : frameBuffer_) {
-    slot.occupied = false;
-    slot.guessedPts = -1;
-  }
-
-  std::queue<int64_t> empty;
-  packetsPtsQueue.swap(empty);
+  frameBuffer_.clear();
 
   eofSent_ = false;
 }
@@ -538,26 +509,52 @@ void BetaCudaDeviceInterface::convertAVFrameToFrameOutput(
       preAllocatedOutputTensor);
 }
 
-BetaCudaDeviceInterface::FrameBuffer::Slot*
-BetaCudaDeviceInterface::FrameBuffer::findEmptySlot() {
-  for (auto& slot : frameBuffer_) {
-    if (!slot.occupied) {
-      return &slot;
-    }
-  }
-  frameBuffer_.emplace_back();
-  return &frameBuffer_.back();
+void BetaCudaDeviceInterface::FrameBuffer::markAsBeingDecoded(int slotId) {
+  auto it = frameBuffer_.find(slotId);
+  TORCH_CHECK(
+      it == frameBuffer_.end(),
+      "Slot ",
+      slotId,
+      " is already occupied. This should never happen.");
+
+  frameBuffer_.emplace(slotId, Slot(slotId, SlotState::BEING_DECODED));
+}
+
+void BetaCudaDeviceInterface::FrameBuffer::markSlotReadyAndSetInfo(
+    int slotId,
+    CUVIDPARSERDISPINFO* dispInfo) {
+  auto it = frameBuffer_.find(slotId);
+  TORCH_CHECK(
+      it != frameBuffer_.end(),
+      "Could not find matching slot with slotId ",
+      slotId,
+      ". This should never happen.");
+
+  it->second.state = SlotState::READY_FOR_OUTPUT;
+  it->second.dispInfo = *dispInfo;
+}
+
+void BetaCudaDeviceInterface::FrameBuffer::free(int slotId) {
+  auto it = frameBuffer_.find(slotId);
+  TORCH_CHECK(
+      it != frameBuffer_.end(),
+      "Tried to free non-existing slot with slotId ",
+      slotId,
+      ". This should never happen.");
+  frameBuffer_.erase(it);
 }
 
 BetaCudaDeviceInterface::FrameBuffer::Slot*
-BetaCudaDeviceInterface::FrameBuffer::findFrameWithExactPts(
-    int64_t desiredPts) {
-  for (auto& slot : frameBuffer_) {
-    if (slot.occupied && slot.guessedPts == desiredPts) {
-      return &slot;
+BetaCudaDeviceInterface::FrameBuffer::findReadySlotWithLowestPts() {
+  Slot* outputSlot = nullptr;
+  for (auto& [_, slot] : frameBuffer_) {
+    if (slot.state == SlotState::READY_FOR_OUTPUT &&
+        (outputSlot == nullptr ||
+         slot.dispInfo.timestamp < outputSlot->dispInfo.timestamp)) {
+      outputSlot = &slot;
     }
   }
-  return nullptr;
+  return outputSlot;
 }
 
 } // namespace facebook::torchcodec
diff --git a/src/torchcodec/_core/BetaCudaDeviceInterface.h b/src/torchcodec/_core/BetaCudaDeviceInterface.h
@@ -52,49 +52,49 @@ class BetaCudaDeviceInterface : public DeviceInterface {
   }
 
   int sendPacket(ReferenceAVPacket& packet) override;
-  int receiveFrame(UniqueAVFrame& avFrame, int64_t desiredPts) override;
+  int receiveFrame(UniqueAVFrame& avFrame) override;
   void flush() override;
 
   // NVDEC callback functions (must be public for C callbacks)
   int streamPropertyChange(CUVIDEOFORMAT* videoFormat);
-  int frameReadyForDecoding(CUVIDPICPARAMS* pPicParams);
+  int frameReadyForDecoding(CUVIDPICPARAMS* picParams);
+  int frameReadyInDisplayOrder(CUVIDPARSERDISPINFO* dispInfo);
 
  private:
   // Apply bitstream filter, modifies packet in-place
   void applyBSF(ReferenceAVPacket& packet);
 
   class FrameBuffer {
    public:
+    enum class SlotState { BEING_DECODED, READY_FOR_OUTPUT };
+
     struct Slot {
       CUVIDPARSERDISPINFO dispInfo;
-      int64_t guessedPts;
-      bool occupied = false;
+      SlotState state;
+      int slotId;
 
-      Slot() : guessedPts(-1), occupied(false) {
+      explicit Slot(int id, SlotState s) : state(s), slotId(id) {
         std::memset(&dispInfo, 0, sizeof(dispInfo));
+        TORCH_CHECK(
+            state == SlotState::BEING_DECODED,
+            "Programmer: are you sure you want to create a slot that is not BEING_DECODED?");
       }
     };
 
-    // TODONVDEC P1: init size should probably be min_num_decode_surfaces from
-    // video format
-    FrameBuffer() : frameBuffer_(4) {}
-
+    FrameBuffer() = default;
     ~FrameBuffer() = default;
 
-    Slot* findEmptySlot();
-    Slot* findFrameWithExactPts(int64_t desiredPts);
+    void markAsBeingDecoded(int slotId);
+    void markSlotReadyAndSetInfo(int slotId, CUVIDPARSERDISPINFO* dispInfo);
+    void free(int slotId);
+    Slot* findReadySlotWithLowestPts();
 
-    // Iterator support for range-based for loops
-    auto begin() {
-      return frameBuffer_.begin();
-    }
-
-    auto end() {
-      return frameBuffer_.end();
+    void clear() {
+      frameBuffer_.clear();
     }
 
    private:
-    std::vector<Slot> frameBuffer_;
+    std::unordered_map<int, Slot> frameBuffer_;
   };
 
   UniqueAVFrame convertCudaFrameToAVFrame(
@@ -108,8 +108,6 @@ class BetaCudaDeviceInterface : public DeviceInterface {
 
   FrameBuffer frameBuffer_;
 
-  std::queue<int64_t> packetsPtsQueue;
-
   bool eofSent_ = false;
 
   // Flush flag to prevent decode operations during flush (like DALI's
diff --git a/src/torchcodec/_core/DeviceInterface.h b/src/torchcodec/_core/DeviceInterface.h
@@ -87,9 +87,7 @@ class DeviceInterface {
   // Moral equivalent of avcodec_receive_frame()
   // Returns AVSUCCESS on success, AVERROR(EAGAIN) if no frame ready,
   // AVERROR_EOF if end of stream, or other AVERROR on failure
-  virtual int receiveFrame(
-      [[maybe_unused]] UniqueAVFrame& avFrame,
-      [[maybe_unused]] int64_t desiredPts) {
+  virtual int receiveFrame([[maybe_unused]] UniqueAVFrame& avFrame) {
     TORCH_CHECK(
         false,
         "Send/receive packet decoding not implemented for this device interface");
diff --git a/src/torchcodec/_core/SingleStreamDecoder.cpp b/src/torchcodec/_core/SingleStreamDecoder.cpp
@@ -1146,7 +1146,7 @@ UniqueAVFrame SingleStreamDecoder::decodeAVFrame(
 
   while (true) {
     if (useCustomInterface) {
-      status = deviceInterface_->receiveFrame(avFrame, cursor_);
+      status = deviceInterface_->receiveFrame(avFrame);
     } else {
       status =
           avcodec_receive_frame(streamInfo.codecContext.get(), avFrame.get());
diff --git a/src/torchcodec/decoders/_video_decoder.py b/src/torchcodec/decoders/_video_decoder.py
@@ -153,12 +153,6 @@ def __init__(
                 device_variant = device_split[2]
                 device = ":".join(device_split[0:2])
 
-        # TODONVDEC P0 Support approximate mode. Not ideal to validate that here
-        # either, but validating this at a lower level forces to add yet another
-        # (temprorary) validation API to the device inteface
-        if device_variant == "beta" and seek_mode != "exact":
-            raise ValueError("Seek mode must be exact for BETA CUDA interface.")
-
         core.add_video_stream(
             self._decoder,
             stream_index=stream_index,
diff --git a/test/test_decoders.py b/test/test_decoders.py