@@ -108,7 +108,7 @@ static UniqueCUvideodecoder createDecoder(CUVIDEOFORMAT* videoFormat) {
108108 " vs supported:" ,
109109 caps.nMaxMBCount );
110110
111- // Decoder creation parameters, taken from DALI
111+ // Decoder creation parameters, most are taken from DALI
112112 CUVIDDECODECREATEINFO decoderParams = {};
113113 decoderParams.bitDepthMinus8 = videoFormat->bit_depth_luma_minus8 ;
114114 decoderParams.ChromaFormat = videoFormat->chroma_format ;
@@ -124,7 +124,12 @@ static UniqueCUvideodecoder createDecoder(CUVIDEOFORMAT* videoFormat) {
124124 decoderParams.ulTargetWidth =
125125 videoFormat->display_area .right - videoFormat->display_area .left ;
126126 decoderParams.ulNumDecodeSurfaces = videoFormat->min_num_decode_surfaces ;
127- decoderParams.ulNumOutputSurfaces = 2 ;
127+ // We should only ever need 1 output surface, since we process frames
128+ // sequentially, and we always unmap the previous frame before mapping a new
129+ // one.
130+ // TODONVDEC P3: set this to 2, allow for 2 frames to be mapped at a time, and
131+ // benchmark to see if this makes any difference.
132+ decoderParams.ulNumOutputSurfaces = 1 ;
128133 decoderParams.display_area .left = videoFormat->display_area .left ;
129134 decoderParams.display_area .right = videoFormat->display_area .right ;
130135 decoderParams.display_area .top = videoFormat->display_area .top ;
@@ -166,10 +171,14 @@ BetaCudaDeviceInterface::BetaCudaDeviceInterface(const torch::Device& device)
166171}
167172
168173BetaCudaDeviceInterface::~BetaCudaDeviceInterface () {
169- // TODONVDEC P0: we probably need to free the frames that have been decoded by
170- // NVDEC but not yet "mapped" - i.e. those that are still in readyFrames_?
171-
172174 if (decoder_) {
175+ // DALI doesn't seem to do any particular cleanup of the decoder before
176+ // sending it to the cache, so we probably don't need to do anything either.
177+ // Just to be safe, we flush.
178+ // What happens to those decode surfaces that haven't yet been mapped is
179+ // unclear.
180+ flush ();
181+ unmapPreviousFrame ();
173182 NVDECCache::getCache (device_.index ())
174183 .returnDecoder (&videoFormat_, std::move (decoder_));
175184 }
@@ -320,7 +329,7 @@ int BetaCudaDeviceInterface::streamPropertyChange(CUVIDEOFORMAT* videoFormat) {
320329 decoder_ = NVDECCache::getCache (device_.index ()).getDecoder (videoFormat);
321330
322331 if (!decoder_) {
323- // TODONVDEC P0 : consider re-configuring an existing decoder instead of
332+ // TODONVDEC P2 : consider re-configuring an existing decoder instead of
324333 // re-creating one. See docs, see DALI.
325334 decoder_ = createDecoder (videoFormat);
326335 }
@@ -341,13 +350,20 @@ int BetaCudaDeviceInterface::sendPacket(ReferenceAVPacket& packet) {
341350 packet.get () && packet->data && packet->size > 0 ,
342351 " sendPacket received an empty packet, this is unexpected, please report." );
343352
344- applyBSF (packet);
353+ // Apply BSF if needed. We want applyBSF to return a *new* filtered packet, or
354+ // the original one if no BSF is needed. This new filtered packet must be
355+ // allocated outside of applyBSF: if it were allocated inside applyBSF, it
356+ // would be destroyed at the end of the function, leaving us with a dangling
357+ // reference.
358+ AutoAVPacket filteredAutoPacket;
359+ ReferenceAVPacket filteredPacket (filteredAutoPacket);
360+ ReferenceAVPacket& packetToSend = applyBSF (packet, filteredPacket);
345361
346362 CUVIDSOURCEDATAPACKET cuvidPacket = {};
347- cuvidPacket.payload = packet ->data ;
348- cuvidPacket.payload_size = packet ->size ;
363+ cuvidPacket.payload = packetToSend ->data ;
364+ cuvidPacket.payload_size = packetToSend ->size ;
349365 cuvidPacket.flags = CUVID_PKT_TIMESTAMP;
350- cuvidPacket.timestamp = packet ->pts ;
366+ cuvidPacket.timestamp = packetToSend ->pts ;
351367
352368 return sendCuvidPacket (cuvidPacket);
353369}
@@ -366,9 +382,11 @@ int BetaCudaDeviceInterface::sendCuvidPacket(
366382 return result == CUDA_SUCCESS ? AVSUCCESS : AVERROR_EXTERNAL;
367383}
368384
369- void BetaCudaDeviceInterface::applyBSF (ReferenceAVPacket& packet) {
385+ ReferenceAVPacket& BetaCudaDeviceInterface::applyBSF (
386+ ReferenceAVPacket& packet,
387+ ReferenceAVPacket& filteredPacket) {
370388 if (!bitstreamFilter_) {
371- return ;
389+ return packet ;
372390 }
373391
374392 int retVal = av_bsf_send_packet (bitstreamFilter_.get (), packet.get ());
@@ -377,41 +395,26 @@ void BetaCudaDeviceInterface::applyBSF(ReferenceAVPacket& packet) {
377395 " Failed to send packet to bitstream filter: " ,
378396 getFFMPEGErrorStringFromErrorCode (retVal));
379397
380- // Create a temporary packet to receive the filtered data
381398 // TODO P1: the docs mention there can theoretically be multiple output
382399 // packets for a single input, i.e. we may need to call av_bsf_receive_packet
383400 // more than once. We should figure out whether that applies to the BSF we're
384401 // using.
385- AutoAVPacket filteredAutoPacket;
386- ReferenceAVPacket filteredPacket (filteredAutoPacket);
387402 retVal = av_bsf_receive_packet (bitstreamFilter_.get (), filteredPacket.get ());
388403 TORCH_CHECK (
389404 retVal >= AVSUCCESS,
390405 " Failed to receive packet from bitstream filter: " ,
391406 getFFMPEGErrorStringFromErrorCode (retVal));
392407
393- // Free the original packet's data which isn't needed anymore, and move the
394- // fields of the filtered packet into the original packet. The filtered packet
395- // fields are re-set by av_packet_move_ref, so when it goes out of scope and
396- // gets destructed, it's not going to affect the original packet.
397- packet.reset (filteredPacket);
398- // TODONVDEC P0: consider cleaner ways to do this. Maybe we should let
399- // applyBSF return a new packet, and maybe that new packet needs to be a field
400- // on the interface to avoid complex lifetime issues.
408+ return filteredPacket;
401409}
402410
403411// Parser triggers this callback within cuvidParseVideoData when a frame is
404412// ready to be decoded, i.e. the parser received all the necessary packets for a
405413// given frame. It means we can send that frame to be decoded by the hardware
406414// NVDEC decoder by calling cuvidDecodePicture which is non-blocking.
407415int BetaCudaDeviceInterface::frameReadyForDecoding (CUVIDPICPARAMS* picParams) {
408- if (isFlushing_) {
409- return 0 ;
410- }
411-
412416 TORCH_CHECK (picParams != nullptr , " Invalid picture parameters" );
413417 TORCH_CHECK (decoder_, " Decoder not initialized before picture decode" );
414-
415418 // Send frame to be decoded by NVDEC - non-blocking call.
416419 CUresult result = cuvidDecodePicture (*decoder_.get (), picParams);
417420
@@ -432,6 +435,7 @@ int BetaCudaDeviceInterface::receiveFrame(UniqueAVFrame& avFrame) {
432435 // packets, or to stop if EOF was already sent.
433436 return eofSent_ ? AVERROR_EOF : AVERROR (EAGAIN);
434437 }
438+
435439 CUVIDPARSERDISPINFO dispInfo = readyFrames_.front ();
436440 readyFrames_.pop ();
437441
@@ -450,34 +454,41 @@ int BetaCudaDeviceInterface::receiveFrame(UniqueAVFrame& avFrame) {
450454 // to "map" it to an "output surface" before we can use its data. This is a
451455 // blocking calls that waits until the frame is fully decoded and ready to be
452456 // used.
457+ // When a frame is mapped to an output surface, it needs to be unmapped
458+ // eventually, so that the decoder can re-use the output surface. Failing to
459+ // unmap will cause map to eventually fail. DALI unmaps frames almost
460+ // immediately after mapping them: they do the color-conversion in-between,
461+ // which involves a copy of the data, so that works.
462+ // We, OTOH, will do the color-conversion later, outside of ReceiveFrame(). So
463+ // we unmap here: just before mapping a new frame. At that point we know that
464+ // the previously-mapped frame is no longer needed: it was either
465+ // color-converted (with a copy), or that's a frame that was discarded in
466+ // SingleStreamDecoder. Either way, the underlying output surface can be
467+ // safely re-used.
468+ unmapPreviousFrame ();
453469 CUresult result = cuvidMapVideoFrame (
454470 *decoder_.get (), dispInfo.picture_index , &framePtr, &pitch, &procParams);
455-
456471 if (result != CUDA_SUCCESS) {
457472 return AVERROR_EXTERNAL;
458473 }
474+ previouslyMappedFrame_ = framePtr;
459475
460476 avFrame = convertCudaFrameToAVFrame (framePtr, pitch, dispInfo);
461477
462- // Unmap the frame so that the decoder can reuse its corresponding output
463- // surface. Whether this is blocking is unclear?
464- cuvidUnmapVideoFrame (*decoder_.get (), framePtr);
465- // TODONVDEC P0: Get clarity on this:
466- // We assume that the framePtr is still valid after unmapping. That framePtr
467- // is now part of the avFrame, which we'll return to the caller, and the
468- // caller will immediately use it for color-conversion, at which point a copy
469- // happens. After the copy, it doesn't matter whether framePtr is still valid.
470- // And we'll return to this function (and to cuvidUnmapVideoFrame()) *after*
471- // the copy is made, so there should be no risk of overwriting the data before
472- // the copy.
473- // Buuuut yeah, we need get more clarity on what actually happens, and on
474- // what's needed. IIUC DALI makes the color-conversion copy immediately after
475- // cuvidMapVideoFrame() and *before* cuvidUnmapVideoFrame() with a synchronize
476- // in between. So maybe we should do the same.
477-
478478 return AVSUCCESS;
479479}
480480
481+ void BetaCudaDeviceInterface::unmapPreviousFrame () {
482+ if (previouslyMappedFrame_ == 0 ) {
483+ return ;
484+ }
485+ CUresult result =
486+ cuvidUnmapVideoFrame (*decoder_.get (), previouslyMappedFrame_);
487+ TORCH_CHECK (
488+ result == CUDA_SUCCESS, " Failed to unmap previous frame: " , result);
489+ previouslyMappedFrame_ = 0 ;
490+ }
491+
481492UniqueAVFrame BetaCudaDeviceInterface::convertCudaFrameToAVFrame (
482493 CUdeviceptr framePtr,
483494 unsigned int pitch,
@@ -554,16 +565,17 @@ UniqueAVFrame BetaCudaDeviceInterface::convertCudaFrameToAVFrame(
554565}
555566
556567void BetaCudaDeviceInterface::flush () {
557- isFlushing_ = true ;
558-
568+ // The NVCUVID docs mention that after seeking, i.e. when flush() is called,
569+ // we should send a packet with the CUVID_PKT_DISCONTINUITY flag. The docs
570+ // don't say whether this should be an empty packet, or whether it should be a
571+ // flag on the next non-empty packet. It doesn't matter: neither work :)
572+ // Sending an EOF packet, however, does work. So we do that. And we re-set the
573+ // eofSent_ flag to false because that's not a true EOF notification.
559574 sendEOFPacket ();
560-
561- isFlushing_ = false ;
575+ eofSent_ = false ;
562576
563577 std::queue<CUVIDPARSERDISPINFO> emptyQueue;
564578 std::swap (readyFrames_, emptyQueue);
565-
566- eofSent_ = false ;
567579}
568580
569581void BetaCudaDeviceInterface::convertAVFrameToFrameOutput (
0 commit comments