@@ -47,8 +47,7 @@ void CpuDeviceInterface::initializeVideo(
4747 // We calculate this value during initialization but we don't refer to it until
4848 // getColorConversionLibrary() is called. Calculating this value during
4949 // initialization saves us from having to save all of the transforms.
50- areTransformsSwScaleCompatible_ = transforms.empty () ||
51- (transforms.size () == 1 && transforms[0 ]->isResize ());
50+ areTransformsSwScaleCompatible_ = transforms.empty ();
5251
5352 // Note that we do not expose this capability in the public API, only through
5453 // the core API.
@@ -58,16 +57,6 @@ void CpuDeviceInterface::initializeVideo(
5857 userRequestedSwScale_ = videoStreamOptions_.colorConversionLibrary ==
5958 ColorConversionLibrary::SWSCALE;
6059
61- // We can only use swscale when we have a single resize transform. Note that
62- // we actually decide on whether or not to actually use swscale at the last
63- // possible moment, when we actually convert the frame. This is because we
64- // need to know the actual frame dimensions.
65- if (transforms.size () == 1 && transforms[0 ]->isResize ()) {
66- auto resize = dynamic_cast <ResizeTransform*>(transforms[0 ].get ());
67- TORCH_CHECK (resize != nullptr , " ResizeTransform expected but not found!" )
68- swsFlags_ = resize->getSwsFlags ();
69- }
70-
7160 // If we have any transforms, replace filters_ with the filter strings from
7261 // the transforms. As noted above, we decide between swscale and filtergraph
7362 // when we actually decode a frame.
@@ -81,7 +70,18 @@ void CpuDeviceInterface::initializeVideo(
8170 first = false ;
8271 }
8372 if (!transforms.empty ()) {
84- filters_ = filters.str ();
73+ // Note [Transform and Format Conversion Order]
74+ // We have to ensure that all user filters happen AFTER the explicit format
75+ // conversion. That is, we want the filters to be applied in RGB24, not the
76+ // pixel format of the input frame.
77+ //
78+ // The output frame will always be in RGB24, as we specify the sink node with
79+ // AV_PIX_FMT_RGB24. Filtergraph will automatically insert a format
80+ // conversion to ensure the output frame matches the pixel format
81+ // specified in the sink. But by default, it will insert it after the user
82+ // filters. We need an explicit format conversion to get the behavior we
83+ // want.
84+ filters_ = " format=rgb24," + filters.str ();
8585 }
8686
8787 initialized_ = true ;
@@ -238,6 +238,11 @@ int CpuDeviceInterface::convertAVFrameToTensorUsingSwScale(
238238 enum AVPixelFormat frameFormat =
239239 static_cast <enum AVPixelFormat>(avFrame->format );
240240
241+ TORCH_CHECK (
242+ avFrame->height == outputDims.height &&
243+ avFrame->width == outputDims.width ,
244+ " Input dimensions are not equal to output dimensions; resize for sws_scale() is not yet supported." );
245+
241246 // We need to compare the current frame context with our previous frame
242247 // context. If they are different, then we need to re-create our colorspace
243248 // conversion objects. We create our colorspace conversion objects late so
@@ -254,7 +259,16 @@ int CpuDeviceInterface::convertAVFrameToTensorUsingSwScale(
254259
255260 if (!swsContext_ || prevSwsFrameContext_ != swsFrameContext) {
256261 swsContext_ = createSwsContext (
257- swsFrameContext, avFrame->colorspace , AV_PIX_FMT_RGB24, swsFlags_);
262+ swsFrameContext,
263+ avFrame->colorspace ,
264+
265+ // See [Transform and Format Conversion Order] for more on the output
266+ // pixel format.
267+ /* outputFormat=*/ AV_PIX_FMT_RGB24,
268+
269+ // We don't set any flags because we don't yet use sws_scale() for
270+ // resizing.
271+ /* swsFlags=*/ 0 );
258272 prevSwsFrameContext_ = swsFrameContext;
259273 }
260274
@@ -276,17 +290,17 @@ int CpuDeviceInterface::convertAVFrameToTensorUsingSwScale(
276290torch::Tensor CpuDeviceInterface::convertAVFrameToTensorUsingFilterGraph (
277291 const UniqueAVFrame& avFrame,
278292 const FrameDims& outputDims) {
279- enum AVPixelFormat frameFormat =
293+ enum AVPixelFormat avFrameFormat =
280294 static_cast <enum AVPixelFormat>(avFrame->format );
281295
282296 FiltersContext filtersContext (
283297 avFrame->width ,
284298 avFrame->height ,
285- frameFormat ,
299+ avFrameFormat ,
286300 avFrame->sample_aspect_ratio ,
287301 outputDims.width ,
288302 outputDims.height ,
289- AV_PIX_FMT_RGB24,
303+ /* outputFormat= */ AV_PIX_FMT_RGB24,
290304 filters_,
291305 timeBase_);
292306
0 commit comments