Merged
Changes from 7 commits
1 change: 1 addition & 0 deletions src/torchcodec/_core/CMakeLists.txt
@@ -96,6 +96,7 @@ function(make_torchcodec_libraries
Encoder.cpp
ValidationUtils.cpp
Transform.cpp
Metadata.cpp
)

if(ENABLE_CUDA)
110 changes: 110 additions & 0 deletions src/torchcodec/_core/Metadata.cpp
@@ -0,0 +1,110 @@
// Copyright (c) Meta Platforms, Inc. and affiliates.
// All rights reserved.
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include "Metadata.h"

namespace facebook::torchcodec {

std::optional<double> StreamMetadata::getDurationSeconds(
SeekMode seekMode) const {
switch (seekMode) {
case SeekMode::custom_frame_mappings:
case SeekMode::exact:
// In exact mode, use the scanned content value
if (endStreamPtsSecondsFromContent.has_value() &&
beginStreamPtsSecondsFromContent.has_value()) {
return endStreamPtsSecondsFromContent.value() -
beginStreamPtsSecondsFromContent.value();
}
return std::nullopt;
Contributor:

One of the benefits of using the seek mode is that we can know that a scan should have been performed, and the metadata from the scan should definitely be there. I think we should instead make this:

  case SeekMode::exact:
    return endStreamPtsSecondsFromContent.value() -
            beginStreamPtsSecondsFromContent.value();

That is, we don't check if the value is present, and we don't return nullopt. The unconditional call to .value() will throw an exception if we somehow get into a situation where we're in exact seek mode and there is no value. But that is what we want. Such a situation should never happen, and we want an exception thrown.

Contributor Author (@mollyxu, Nov 5, 2025):

Would you recommend modifying the logic such that exact mode accesses the values directly but we keep the checks for custom_frame_mappings? Previously the Python logic had no knowledge of seek mode, so custom_frame_mappings was often grouped with exact because scans are performed.

In custom_frame_mappings, a scan only happens on specific streams when add_video_stream() is called (see test_get_metadata in test_metadata.py). Removing the checks would lead to cases in which not all streams in the container have scanned metadata.

Or should custom_frame_mappings streams that are missing the scanned information behave the same as in approximate mode?

Contributor (@Dan-Flores, Nov 6, 2025):

We only allow one stream to be added and scanned via add_video_stream, and I believe (but do correct me if I am mistaken!) that when we access any stream metadata, it's from the stream currently stored as activeStreamIndex_. This stream index is updated by the custom_frame_mappings equivalent "scan" function.

> Removing the checks would lead to cases in which not all streams in the container would have scanned metadata.

Is it possible to access the stream metadata of a stream other than activeStreamIndex_? If not, it should be safe to assume that the metadata will be present, and treat it as exact mode.

Contributor Author:

> when we access any stream metadata, it's from the stream currently stored as activeStreamIndex_

I think in get_container_metadata in _metadata.py we call _get_stream_json_metadata for each stream in the container. It is at this step that we trigger the custom_frame_mappings logic in Metadata.cpp for streams whose scanned metadata doesn't exist because they are not at activeStreamIndex_:

for stream_index in range(container_dict["numStreams"]):
    stream_dict = json.loads(_get_stream_json_metadata(decoder, stream_index))

The error seems to come not from accessing a non-active stream but rather from how we store metadata.
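To illustrate the situation being described — only the active stream carries scanned metadata, yet the container-metadata loop visits every stream — here is a toy Python sketch; the function and field names are illustrative stand-ins, not the real torchcodec API:

```python
import json

# Toy stand-in for _get_stream_json_metadata: only the "active" stream has
# scanned (FromContent) metadata; the others only have header metadata.
def get_stream_json_metadata(stream_index, active_stream_index):
    meta = {"streamIndex": stream_index, "durationSecondsFromHeader": 10.0}
    if stream_index == active_stream_index:
        meta["beginStreamPtsSecondsFromContent"] = 0.0
        meta["endStreamPtsSecondsFromContent"] = 10.0
    return json.dumps(meta)

num_streams, active = 3, 1
streams = [
    json.loads(get_stream_json_metadata(i, active)) for i in range(num_streams)
]
# The per-container loop still visits every stream, but only the active one
# has the scanned fields that unconditional .value() access would require.
scanned = [s for s in streams if "endStreamPtsSecondsFromContent" in s]
assert len(scanned) == 1 and scanned[0]["streamIndex"] == active
```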

case SeekMode::approximate:
if (durationSecondsFromHeader.has_value()) {
return durationSecondsFromHeader.value();
}
if (numFramesFromHeader.has_value() && averageFpsFromHeader.has_value() &&
averageFpsFromHeader.value() != 0.0) {
return static_cast<double>(numFramesFromHeader.value()) /
averageFpsFromHeader.value();
}
return std::nullopt;
}
return std::nullopt;
}

double StreamMetadata::getBeginStreamSeconds(SeekMode seekMode) const {
switch (seekMode) {
case SeekMode::custom_frame_mappings:
case SeekMode::exact:
if (beginStreamPtsSecondsFromContent.has_value()) {
return beginStreamPtsSecondsFromContent.value();
}
return 0.0;
case SeekMode::approximate:
return 0.0;
}
return 0.0;
}

std::optional<double> StreamMetadata::getEndStreamSeconds(
SeekMode seekMode) const {
switch (seekMode) {
case SeekMode::custom_frame_mappings:
case SeekMode::exact:
if (endStreamPtsSecondsFromContent.has_value()) {
return endStreamPtsSecondsFromContent.value();
}
return getDurationSeconds(seekMode);
case SeekMode::approximate:
return getDurationSeconds(seekMode);
}
return std::nullopt;
}

std::optional<int64_t> StreamMetadata::getNumFrames(SeekMode seekMode) const {
switch (seekMode) {
case SeekMode::custom_frame_mappings:
case SeekMode::exact:
if (numFramesFromContent.has_value()) {
return numFramesFromContent.value();
}
return std::nullopt;
case SeekMode::approximate: {
if (numFramesFromHeader.has_value()) {
return numFramesFromHeader.value();
}
if (averageFpsFromHeader.has_value() &&
durationSecondsFromHeader.has_value()) {
return static_cast<int64_t>(
averageFpsFromHeader.value() * durationSecondsFromHeader.value());
}
return std::nullopt;
}
}
return std::nullopt;
}

std::optional<double> StreamMetadata::getAverageFps(SeekMode seekMode) const {
switch (seekMode) {
case SeekMode::custom_frame_mappings:
case SeekMode::exact:
if (getNumFrames(seekMode).has_value() &&
Contributor:

Nit: let's define:

auto numFrames = getNumFrames(seekMode);

And use it inside the expressions.

beginStreamPtsSecondsFromContent.has_value() &&
endStreamPtsSecondsFromContent.has_value() &&
(beginStreamPtsSecondsFromContent.value() !=
endStreamPtsSecondsFromContent.value())) {
return static_cast<double>(
getNumFrames(seekMode).value() /
(endStreamPtsSecondsFromContent.value() -
beginStreamPtsSecondsFromContent.value()));
}
return averageFpsFromHeader;
case SeekMode::approximate:
return averageFpsFromHeader;
}
return std::nullopt;
}

} // namespace facebook::torchcodec
9 changes: 9 additions & 0 deletions src/torchcodec/_core/Metadata.h
Expand Up @@ -18,6 +18,8 @@ extern "C" {

namespace facebook::torchcodec {

enum class SeekMode { exact, approximate, custom_frame_mappings };

struct StreamMetadata {
// Common (video and audio) fields derived from the AVStream.
int streamIndex;
@@ -52,6 +54,13 @@ struct StreamMetadata {
std::optional<int64_t> sampleRate;
std::optional<int64_t> numChannels;
std::optional<std::string> sampleFormat;

// Computed methods with fallback logic
std::optional<double> getDurationSeconds(SeekMode seekMode) const;
double getBeginStreamSeconds(SeekMode seekMode) const;
std::optional<double> getEndStreamSeconds(SeekMode seekMode) const;
std::optional<int64_t> getNumFrames(SeekMode seekMode) const;
std::optional<double> getAverageFps(SeekMode seekMode) const;
};

struct ContainerMetadata {
61 changes: 13 additions & 48 deletions src/torchcodec/_core/SingleStreamDecoder.cpp
@@ -367,6 +367,10 @@ ContainerMetadata SingleStreamDecoder::getContainerMetadata() const {
return containerMetadata_;
}

SeekMode SingleStreamDecoder::getSeekMode() const {
return seekMode_;
}

torch::Tensor SingleStreamDecoder::getKeyFrameIndices() {
validateActiveStream(AVMEDIA_TYPE_VIDEO);
validateScannedAllStreams("getKeyFrameIndices");
@@ -611,7 +615,7 @@ FrameOutput SingleStreamDecoder::getFrameAtIndexInternal(
const auto& streamMetadata =
containerMetadata_.allStreamMetadata[activeStreamIndex_];

std::optional<int64_t> numFrames = getNumFrames(streamMetadata);
std::optional<int64_t> numFrames = streamMetadata.getNumFrames(seekMode_);
if (numFrames.has_value()) {
// If the frameIndex is negative, we convert it to a positive index
frameIndex = frameIndex >= 0 ? frameIndex : frameIndex + numFrames.value();
@@ -705,7 +709,7 @@ FrameBatchOutput SingleStreamDecoder::getFramesInRange(

// Note that if we do not have the number of frames available in our
// metadata, then we assume that the upper part of the range is valid.
std::optional<int64_t> numFrames = getNumFrames(streamMetadata);
std::optional<int64_t> numFrames = streamMetadata.getNumFrames(seekMode_);
if (numFrames.has_value()) {
TORCH_CHECK(
stop <= numFrames.value(),
@@ -779,8 +783,9 @@ FrameBatchOutput SingleStreamDecoder::getFramesPlayedAt(
const auto& streamMetadata =
containerMetadata_.allStreamMetadata[activeStreamIndex_];

double minSeconds = getMinSeconds(streamMetadata);
std::optional<double> maxSeconds = getMaxSeconds(streamMetadata);
double minSeconds = streamMetadata.getBeginStreamSeconds(seekMode_);
std::optional<double> maxSeconds =
streamMetadata.getEndStreamSeconds(seekMode_);

// The frame played at timestamp t and the one played at timestamp `t +
// eps` are probably the same frame, with the same index. The easiest way to
@@ -857,7 +862,7 @@ FrameBatchOutput SingleStreamDecoder::getFramesPlayedInRange(
return frameBatchOutput;
}

double minSeconds = getMinSeconds(streamMetadata);
double minSeconds = streamMetadata.getBeginStreamSeconds(seekMode_);
TORCH_CHECK(
startSeconds >= minSeconds,
"Start seconds is " + std::to_string(startSeconds) +
@@ -866,7 +871,8 @@ FrameBatchOutput SingleStreamDecoder::getFramesPlayedInRange(

// Note that if we can't determine the maximum seconds from the metadata,
// then we assume upper range is valid.
std::optional<double> maxSeconds = getMaxSeconds(streamMetadata);
std::optional<double> maxSeconds =
streamMetadata.getEndStreamSeconds(seekMode_);
if (maxSeconds.has_value()) {
TORCH_CHECK(
startSeconds < maxSeconds.value(),
@@ -1439,47 +1445,6 @@ int64_t SingleStreamDecoder::getPts(int64_t frameIndex) {
// STREAM AND METADATA APIS
// --------------------------------------------------------------------------

std::optional<int64_t> SingleStreamDecoder::getNumFrames(
const StreamMetadata& streamMetadata) {
switch (seekMode_) {
case SeekMode::custom_frame_mappings:
case SeekMode::exact:
return streamMetadata.numFramesFromContent.value();
case SeekMode::approximate: {
return streamMetadata.numFramesFromHeader;
}
default:
TORCH_CHECK(false, "Unknown SeekMode");
}
}

double SingleStreamDecoder::getMinSeconds(
const StreamMetadata& streamMetadata) {
switch (seekMode_) {
case SeekMode::custom_frame_mappings:
case SeekMode::exact:
return streamMetadata.beginStreamPtsSecondsFromContent.value();
case SeekMode::approximate:
return 0;
default:
TORCH_CHECK(false, "Unknown SeekMode");
}
}

std::optional<double> SingleStreamDecoder::getMaxSeconds(
const StreamMetadata& streamMetadata) {
switch (seekMode_) {
case SeekMode::custom_frame_mappings:
case SeekMode::exact:
return streamMetadata.endStreamPtsSecondsFromContent.value();
case SeekMode::approximate: {
return streamMetadata.durationSecondsFromHeader;
}
default:
TORCH_CHECK(false, "Unknown SeekMode");
}
}

// --------------------------------------------------------------------------
// VALIDATION UTILS
// --------------------------------------------------------------------------
@@ -1529,7 +1494,7 @@ void SingleStreamDecoder::validateFrameIndex(

// Note that if we do not have the number of frames available in our
// metadata, then we assume that the frameIndex is valid.
std::optional<int64_t> numFrames = getNumFrames(streamMetadata);
std::optional<int64_t> numFrames = streamMetadata.getNumFrames(seekMode_);
if (numFrames.has_value()) {
if (frameIndex >= numFrames.value()) {
throw std::out_of_range(
10 changes: 4 additions & 6 deletions src/torchcodec/_core/SingleStreamDecoder.h
@@ -16,6 +16,7 @@
#include "DeviceInterface.h"
#include "FFMPEGCommon.h"
#include "Frame.h"
#include "Metadata.h"
#include "StreamOptions.h"
#include "Transform.h"

@@ -30,8 +31,6 @@ class SingleStreamDecoder {
// CONSTRUCTION API
// --------------------------------------------------------------------------

enum class SeekMode { exact, approximate, custom_frame_mappings };

// Creates a SingleStreamDecoder from the video at videoFilePath.
explicit SingleStreamDecoder(
const std::string& videoFilePath,
@@ -60,6 +59,9 @@ class SingleStreamDecoder {
// Returns the metadata for the container.
ContainerMetadata getContainerMetadata() const;

// Returns the seek mode of this decoder.
SeekMode getSeekMode() const;

// Returns the key frame indices as a tensor. The tensor is 1D and contains
// int64 values, where each value is the frame index for a key frame.
torch::Tensor getKeyFrameIndices();
@@ -312,10 +314,6 @@
// index. Note that this index may be truncated for some files.
int getBestStreamIndex(AVMediaType mediaType);

std::optional<int64_t> getNumFrames(const StreamMetadata& streamMetadata);
double getMinSeconds(const StreamMetadata& streamMetadata);
std::optional<double> getMaxSeconds(const StreamMetadata& streamMetadata);

// --------------------------------------------------------------------------
// VALIDATION UTILS
// --------------------------------------------------------------------------