Merge pull request #25874 from cudawarped:videoio_ffmpeg_fix_encapsulate_ts

videoio: fix cv::VideoWriter with FFmpeg encapsulation timestamps #25874 Fix https://github.com/opencv/opencv/issues/25873 by modifying `cv::VideoWriter` to use provided presentation indices (pts). ### Pull Request Readiness Checklist See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request - [x] I agree to contribute to the project under Apache 2 License. - [x] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV - [x] The PR is proposed to the proper branch - [x] There is a reference to the original bug report and related work - [x] There is accuracy test, performance test and test data in opencv_extra repository, if applicable Patch to opencv_extra has the same branch name. - [x] The feature is well documented and sample code can be built with the project CMake
2024-11-27 20:50:25 +08:00 · 2024-07-22 17:41:39 +03:00 · 2024-07-22 17:41:39 +03:00 · c9b57819b1
commit c9b57819b1
parent 89dd4ee137
3 changed files with 88 additions and 32 deletions
--- a/modules/videoio/include/opencv2/videoio.hpp
+++ b/modules/videoio/include/opencv2/videoio.hpp
@ -211,6 +211,8 @@ enum VideoCaptureProperties {
       CAP_PROP_CODEC_EXTRADATA_INDEX = 68, //!< Positive index indicates that returning extra data is supported by the video back end.  This can be retrieved as cap.retrieve(data, <returned index>).  E.g. When reading from a h264 encoded RTSP stream, the FFmpeg backend could return the SPS and/or PPS if available (if sent in reply to a DESCRIBE request), from calls to cap.retrieve(data, <returned index>).
       CAP_PROP_FRAME_TYPE = 69, //!< (read-only) FFmpeg back-end only - Frame type ascii code (73 = 'I', 80 = 'P', 66 = 'B' or 63 = '?' if unknown) of the most recently read frame.
       CAP_PROP_N_THREADS = 70, //!< (**open-only**) Set the maximum number of threads to use. Use 0 to use as many threads as CPU cores (applicable for FFmpeg back-end only).
+       CAP_PROP_PTS = 71, //!<  (read-only) FFmpeg back-end only - presentation timestamp of the most recently read frame using the FPS time base.  e.g. fps = 25, VideoCapture::get(\ref CAP_PROP_PTS) = 3, presentation time = 3/25 seconds.
+       CAP_PROP_DTS_DELAY = 72, //!<  (read-only) FFmpeg back-end only - maximum difference between presentation (pts) and decompression timestamps (dts) using the FPS time base.  The delay is at its maximum on the first frame (frame_num = 0); e.g. if VideoCapture::get(\ref CAP_PROP_PTS) = 0 and VideoCapture::get(\ref CAP_PROP_DTS_DELAY) = 2 for that frame, then dts = -2.  Non-zero values usually imply the stream is encoded using B-frames, which are not decoded in presentation order.
 #ifndef CV_DOXYGEN
       CV__CAP_PROP_LATEST
 #endif
@ -230,8 +232,10 @@ enum VideoWriterProperties {
  VIDEOWRITER_PROP_HW_DEVICE       = 7, //!< (**open-only**) Hardware device index (select GPU if multiple available). Device enumeration is acceleration type specific.
  VIDEOWRITER_PROP_HW_ACCELERATION_USE_OPENCL= 8, //!< (**open-only**) If non-zero, create new OpenCL context and bind it to current thread. The OpenCL context created with Video Acceleration context attached it (if not attached yet) for optimized GPU data copy between cv::UMat and HW accelerated encoder.
  VIDEOWRITER_PROP_RAW_VIDEO = 9, //!< (**open-only**) Set to non-zero to enable encapsulation of an encoded raw video stream. Each raw encoded video frame should be passed to VideoWriter::write() as single row or column of a \ref CV_8UC1 Mat. \note If the key frame interval is not 1 then it must be manually specified by the user. This can either be performed during initialization passing \ref VIDEOWRITER_PROP_KEY_INTERVAL as one of the extra encoder params  to \ref VideoWriter::VideoWriter(const String &, int, double, const Size &, const std::vector< int > &params) or afterwards by setting the \ref VIDEOWRITER_PROP_KEY_FLAG with \ref VideoWriter::set() before writing each frame. FFMpeg backend only.
-  VIDEOWRITER_PROP_KEY_INTERVAL = 10, //!< (**open-only**) Set the key frame interval using raw video encapsulation (\ref VIDEOWRITER_PROP_RAW_VIDEO != 0). Defaults to 1 when not set. FFMpeg backend only.
-  VIDEOWRITER_PROP_KEY_FLAG = 11, //!< Set to non-zero to signal that the following frames are key frames or zero if not, when encapsulating raw video (\ref VIDEOWRITER_PROP_RAW_VIDEO != 0). FFMpeg backend only.
+  VIDEOWRITER_PROP_KEY_INTERVAL = 10, //!< (**open-only**) Set the key frame interval using raw video encapsulation (\ref VIDEOWRITER_PROP_RAW_VIDEO != 0). Defaults to 1 when not set. FFmpeg back-end only.
+  VIDEOWRITER_PROP_KEY_FLAG = 11, //!< Set to non-zero to signal that the following frames are key frames or zero if not, when encapsulating raw video (\ref VIDEOWRITER_PROP_RAW_VIDEO != 0). FFmpeg back-end only.
+  VIDEOWRITER_PROP_PTS = 12, //!< Specifies the frame presentation timestamp for each frame using the FPS time base. This property is **only** necessary when encapsulating **externally** encoded video where the decoding order differs from the presentation order, such as in GOP patterns with bi-directional B-frames. The value should be provided by your external encoder and for video sources with fixed frame rates it is equivalent to dividing the current frame's presentation time (\ref CAP_PROP_POS_MSEC) by the frame duration (1000.0 / VideoCapture::get(\ref CAP_PROP_FPS)). It can be queried from the resulting encapsulated video file using VideoCapture::get(\ref CAP_PROP_PTS). FFmpeg back-end only.
+  VIDEOWRITER_PROP_DTS_DELAY = 13, //!< Specifies the maximum difference between presentation (pts) and decompression timestamps (dts) using the FPS time base. This property is necessary **only** when encapsulating **externally** encoded video where the decoding order differs from the presentation order, such as in GOP patterns with bi-directional B-frames. The value should be calculated based on the specific GOP pattern used during encoding. For example, in a GOP with presentation order IBP and decoding order IPB, this value would be 1, as the B-frame is the second frame presented but the third to be decoded. It can be queried from the resulting encapsulated video file using VideoCapture::get(\ref CAP_PROP_DTS_DELAY). Non-zero values usually imply the stream is encoded using B-frames. FFmpeg back-end only.
 #ifndef CV_DOXYGEN
  CV__VIDEOWRITER_PROP_LATEST
 #endif
--- a/modules/videoio/src/cap_ffmpeg_impl.hpp
+++ b/modules/videoio/src/cap_ffmpeg_impl.hpp
@ -560,6 +560,8 @@ struct CvCapture_FFMPEG
    AVFrame         * picture;
    AVFrame           rgb_picture;
    int64_t           picture_pts;
+    int64_t           pts_in_fps_time_base;
+    int64_t           dts_delay_in_fps_time_base;

    AVPacket          packet;
    Image_FFMPEG      frame;
@ -615,6 +617,8 @@ void CvCapture_FFMPEG::init()
    video_st = 0;
    picture = 0;
    picture_pts = AV_NOPTS_VALUE_;
+    pts_in_fps_time_base = 0;
+    dts_delay_in_fps_time_base = 0;
    first_frame_number = -1;
    memset( &rgb_picture, 0, sizeof(rgb_picture) );
    memset( &frame, 0, sizeof(frame) );
@ -1581,13 +1585,26 @@ bool CvCapture_FFMPEG::grabFrame()

    if (valid) {
        if (picture_pts == AV_NOPTS_VALUE_) {
-            if (!rawMode)
+            int64_t dts = 0;
+            if (!rawMode) {
                picture_pts = picture->CV_FFMPEG_PTS_FIELD != AV_NOPTS_VALUE_ && picture->CV_FFMPEG_PTS_FIELD != 0 ? picture->CV_FFMPEG_PTS_FIELD : picture->pkt_dts;
+                if(frame_number == 0) dts = picture->pkt_dts;
+            }
            else {
                const AVPacket& packet_raw = packet.data != 0 ? packet : packet_filtered;
                picture_pts = packet_raw.pts != AV_NOPTS_VALUE_ && packet_raw.pts != 0 ? packet_raw.pts : packet_raw.dts;
+                if (frame_number == 0) dts = packet_raw.dts;
                if (picture_pts < 0) picture_pts = 0;
            }
+#if LIBAVCODEC_BUILD >= CALC_FFMPEG_VERSION(54, 1, 0) || LIBAVFORMAT_BUILD >= CALC_FFMPEG_VERSION(52, 111, 0)
+            AVRational frame_rate = video_st->avg_frame_rate;
+#else
+            AVRational frame_rate = video_st->r_frame_rate;
+#endif
+            if (picture_pts != AV_NOPTS_VALUE_)
+                pts_in_fps_time_base = av_rescale_q(picture_pts, video_st->time_base, AVRational{ frame_rate.den, frame_rate.num });
+            if (frame_number == 0 && dts != AV_NOPTS_VALUE_)
+                dts_delay_in_fps_time_base = -av_rescale_q(dts, video_st->time_base, AVRational{ frame_rate.den, frame_rate.num });
            frame_number++;
        }
    }
@ -1855,6 +1872,11 @@ double CvCapture_FFMPEG::getProperty( int property_id ) const
    case CAP_PROP_N_THREADS:
        if (!rawMode)
            return static_cast<double>(context->thread_count);
+        break;
+    case CAP_PROP_PTS:
+        return static_cast<double>(pts_in_fps_time_base);
+    case CAP_PROP_DTS_DELAY:
+        return static_cast<double>(dts_delay_in_fps_time_base);
    default:
        break;
    }
@ -2107,6 +2129,8 @@ struct CvVideoWriter_FFMPEG
    bool              encode_video;
    int               idr_period;
    bool              key_frame;
+    int               pts_index;
+    int               b_frame_dts_delay;
 };

 static const char * icvFFMPEGErrStr(int err)
@ -2175,6 +2199,8 @@ void CvVideoWriter_FFMPEG::init()
    encode_video = true;
    idr_period = 0;
    key_frame = false;
+    pts_index = -1;
+    b_frame_dts_delay = 0;
 }

 /**
@ -2343,7 +2369,7 @@ static AVCodecContext * icv_configure_video_stream_FFMPEG(AVFormatContext *oc,
 static const int OPENCV_NO_FRAMES_WRITTEN_CODE = 1000;

 static int icv_av_encapsulate_video_FFMPEG(AVFormatContext* oc, AVStream* video_st, AVCodecContext* c,
-    uint8_t* data, int sz, const int frame_idx, const bool key_frame)
+    uint8_t* data, int sz, const int frame_idx, const int pts_index, const int b_frame_dts_delay, const bool key_frame)
 {
 #if LIBAVFORMAT_BUILD < CALC_FFMPEG_VERSION(57, 0, 0)
    AVPacket pkt_;
@ -2354,7 +2380,9 @@ static int icv_av_encapsulate_video_FFMPEG(AVFormatContext* oc, AVStream* video_
 #endif
    if(key_frame)
        pkt->flags |= PKT_FLAG_KEY;
-    pkt->pts = frame_idx;
+    pkt->pts = pts_index == -1 ? frame_idx : pts_index;
+    pkt->dts = frame_idx - b_frame_dts_delay;
+    pkt->duration = 1;
    pkt->size = sz;
    pkt->data = data;
    av_packet_rescale_ts(pkt, c->time_base, video_st->time_base);
@ -2449,7 +2477,7 @@ bool CvVideoWriter_FFMPEG::writeFrame( const unsigned char* data, int step, int
    if (!encode_video) {
        CV_Assert(cn == 1 && ((width > 0 && height == 1) || (width == 1 && height > 0 && step == 1)));
        const bool set_key_frame = key_frame ? key_frame : idr_period ? frame_idx % idr_period == 0 : 1;
-        bool ret = icv_av_encapsulate_video_FFMPEG(oc, video_st, context, (uint8_t*)data, width, frame_idx, set_key_frame);
+        bool ret = icv_av_encapsulate_video_FFMPEG(oc, video_st, context, (uint8_t*)data, width, frame_idx, pts_index, b_frame_dts_delay, set_key_frame);
        frame_idx++;
        return ret;
    }
@ -2651,6 +2679,12 @@ bool CvVideoWriter_FFMPEG::setProperty(int property_id, double value)
    case VIDEOWRITER_PROP_KEY_FLAG:
        key_frame = static_cast<bool>(value);
        break;
+    case VIDEOWRITER_PROP_PTS:
+        pts_index = static_cast<int>(value);
+        break;
+    case VIDEOWRITER_PROP_DTS_DELAY:
+        b_frame_dts_delay = static_cast<int>(value);
+        break;
    default:
        return false;
    }
--- a/modules/videoio/test/test_ffmpeg.cpp
+++ b/modules/videoio/test/test_ffmpeg.cpp
@ -293,9 +293,13 @@ const videoio_container_get_params_t videoio_container_get_params[] =

 INSTANTIATE_TEST_CASE_P(/**/, videoio_container_get, testing::ValuesIn(videoio_container_get_params));

-typedef tuple<string, string, int, int> videoio_encapsulate_params_t;
+typedef tuple<string, string, int, int, bool, bool> videoio_encapsulate_params_t;
 typedef testing::TestWithParam< videoio_encapsulate_params_t > videoio_encapsulate;

+#if defined(WIN32)  // remove when FFmpeg wrapper includes PR25874
+#define WIN32_WAIT_FOR_FFMPEG_WRAPPER_UPDATE
+#endif
+
 TEST_P(videoio_encapsulate, write)
 {
    const VideoCaptureAPIs api = CAP_FFMPEG;
@ -307,6 +311,8 @@ TEST_P(videoio_encapsulate, write)
    const int idrPeriod = get<2>(GetParam());
    const int nFrames = get<3>(GetParam());
    const string fileNameOut = tempfile(cv::format("test_encapsulated_stream.%s", ext.c_str()).c_str());
+    const bool setPts = get<4>(GetParam());
+    const bool tsWorking = get<5>(GetParam());

    // Use VideoWriter to encapsulate encoded video read with VideoReader
    {
@ -320,12 +326,16 @@ TEST_P(videoio_encapsulate, write)
        capRaw.retrieve(extraData, codecExtradataIndex);
        const int fourcc = static_cast<int>(capRaw.get(CAP_PROP_FOURCC));
        const bool mpeg4 = (fourcc == fourccFromString("FMP4"));
-
        VideoWriter container(fileNameOut, api, fourcc, fps, { width, height }, { VideoWriterProperties::VIDEOWRITER_PROP_RAW_VIDEO, 1, VideoWriterProperties::VIDEOWRITER_PROP_KEY_INTERVAL, idrPeriod });
        ASSERT_TRUE(container.isOpened());
        Mat rawFrame;
        for (int i = 0; i < nFrames; i++) {
            ASSERT_TRUE(capRaw.read(rawFrame));
+#if !defined(WIN32_WAIT_FOR_FFMPEG_WRAPPER_UPDATE)
+            if (setPts && i == 0) {
+                ASSERT_TRUE(container.set(VIDEOWRITER_PROP_DTS_DELAY, capRaw.get(CAP_PROP_DTS_DELAY)));
+            }
+#endif
            ASSERT_FALSE(rawFrame.empty());
            if (i == 0 && mpeg4) {
                Mat tmp = rawFrame.clone();
@ -336,6 +346,11 @@ TEST_P(videoio_encapsulate, write)
                memcpy(rawFrame.data, extraData.data, extraData.total());
                memcpy(rawFrame.data + extraData.total(), tmp.data, tmp.total());
            }
+#if !defined(WIN32_WAIT_FOR_FFMPEG_WRAPPER_UPDATE)
+            if (setPts) {
+                ASSERT_TRUE(container.set(VIDEOWRITER_PROP_PTS, capRaw.get(CAP_PROP_PTS)));
+            }
+#endif
            container.write(rawFrame);
        }
        container.release();
@ -362,11 +377,15 @@ TEST_P(videoio_encapsulate, write)
            ASSERT_TRUE(capActual.read(actual));
            ASSERT_FALSE(actual.empty());
            ASSERT_EQ(0, cvtest::norm(reference, actual, NORM_INF));
-
            ASSERT_TRUE(capActualRaw.grab());
            const bool keyFrameActual = capActualRaw.get(CAP_PROP_LRF_HAS_KEY_FRAME) == 1.;
            const bool keyFrameReference = idrPeriod ? i % idrPeriod == 0 : 1;
            ASSERT_EQ(keyFrameReference, keyFrameActual);
+#if !defined(WIN32_WAIT_FOR_FFMPEG_WRAPPER_UPDATE)
+            if (tsWorking) {
+                ASSERT_EQ(round(capReference.get(CAP_PROP_POS_MSEC)), round(capActual.get(CAP_PROP_POS_MSEC)));
+            }
+#endif
        }
    }

@ -375,30 +394,29 @@ TEST_P(videoio_encapsulate, write)

 const videoio_encapsulate_params_t videoio_encapsulate_params[] =
 {
-    videoio_encapsulate_params_t("video/big_buck_bunny.h264", "avi", 125, 125),
-    videoio_encapsulate_params_t("video/big_buck_bunny.h265", "mp4", 125, 125),
-    videoio_encapsulate_params_t("video/big_buck_bunny.wmv", "wmv", 12, 13),
-    videoio_encapsulate_params_t("video/big_buck_bunny.mp4", "mp4", 12, 13),
-    videoio_encapsulate_params_t("video/big_buck_bunny.mjpg.avi", "mp4", 0, 4),
-    videoio_encapsulate_params_t("video/big_buck_bunny.mov", "mp4", 12, 13),
-    videoio_encapsulate_params_t("video/big_buck_bunny.avi", "mp4", 125, 125),
-    videoio_encapsulate_params_t("video/big_buck_bunny.mpg", "mp4", 12, 13),
-    videoio_encapsulate_params_t("video/VID00003-20100701-2204.wmv", "wmv", 12, 13),
-    videoio_encapsulate_params_t("video/VID00003-20100701-2204.mpg", "mp4", 12,13),
-    videoio_encapsulate_params_t("video/VID00003-20100701-2204.avi", "mp4", 12, 13),
-    videoio_encapsulate_params_t("video/VID00003-20100701-2204.3GP", "mp4", 51, 52),
-    videoio_encapsulate_params_t("video/sample_sorenson.avi", "mp4", 12, 13),
-    videoio_encapsulate_params_t("video/sample_322x242_15frames.yuv420p.libxvid.mp4", "mp4", 3, 4),
-    videoio_encapsulate_params_t("video/sample_322x242_15frames.yuv420p.mpeg2video.mp4", "mp4", 12, 13),
-    videoio_encapsulate_params_t("video/sample_322x242_15frames.yuv420p.mjpeg.mp4", "mp4", 0, 5),
-    videoio_encapsulate_params_t("video/sample_322x242_15frames.yuv420p.libx264.mp4", "avi", 15, 15),
-    videoio_encapsulate_params_t("../cv/tracking/faceocc2/data/faceocc2.webm", "webm", 128, 129),
-    videoio_encapsulate_params_t("../cv/video/1920x1080.avi", "mp4", 12, 13),
-    videoio_encapsulate_params_t("../cv/video/768x576.avi", "avi", 15, 16)
+    videoio_encapsulate_params_t("video/big_buck_bunny.h264", "avi", 125, 125, false, false), // tsWorking = false: no timestamp information
+    videoio_encapsulate_params_t("video/big_buck_bunny.h265", "mp4", 125, 125, false, false), // tsWorking = false: no timestamp information
+    videoio_encapsulate_params_t("video/big_buck_bunny.wmv", "wmv", 12, 13, false, true),
+    videoio_encapsulate_params_t("video/big_buck_bunny.mp4", "mp4", 12, 13, false, true),
+    videoio_encapsulate_params_t("video/big_buck_bunny.mjpg.avi", "mp4", 0, 4, false, true),
+    videoio_encapsulate_params_t("video/big_buck_bunny.mov", "mp4", 12, 13, false, true),
+    videoio_encapsulate_params_t("video/big_buck_bunny.avi", "mp4", 125, 125, false, false), // tsWorking = false: PTS not available for all frames
+    videoio_encapsulate_params_t("video/big_buck_bunny.mpg", "mp4", 12, 13, true, true),
+    videoio_encapsulate_params_t("video/VID00003-20100701-2204.wmv", "wmv", 12, 13, false, true),
+    videoio_encapsulate_params_t("video/VID00003-20100701-2204.mpg", "mp4", 12, 13, false, false), // tsWorking = false: PTS not available for all frames
+    videoio_encapsulate_params_t("video/VID00003-20100701-2204.avi", "mp4", 12, 13, false, false), // tsWorking = false: Unable to correctly set PTS when writing
+    videoio_encapsulate_params_t("video/VID00003-20100701-2204.3GP", "mp4", 51, 52, false, false), // tsWorking = false: Source with variable fps
+    videoio_encapsulate_params_t("video/sample_sorenson.avi", "mp4", 12, 13, false, true),
+    videoio_encapsulate_params_t("video/sample_322x242_15frames.yuv420p.libxvid.mp4", "mp4", 3, 4, false, true),
+    videoio_encapsulate_params_t("video/sample_322x242_15frames.yuv420p.mpeg2video.mp4", "mpg", 12, 13, false, true),
+    videoio_encapsulate_params_t("video/sample_322x242_15frames.yuv420p.mjpeg.mp4", "mp4", 0, 5, false, true),
+    videoio_encapsulate_params_t("video/sample_322x242_15frames.yuv420p.libx264.mp4", "ts", 15, 15, true, true),
+    videoio_encapsulate_params_t("../cv/tracking/faceocc2/data/faceocc2.webm", "webm", 128, 129, false, true),
+    videoio_encapsulate_params_t("../cv/video/1920x1080.avi", "mp4", 12, 13, false, true),
+    videoio_encapsulate_params_t("../cv/video/768x576.avi", "avi", 15, 16, false, true),
    // Not supported by FFmpeg:
-    //videoio_encapsulate_params_t("video/sample_322x242_15frames.yuv420p.libx265.mp4", "mp4", 15, 15),
-    //videoio_encapsulate_params_t("video/sample_322x242_15frames.yuv420p.libvpx-vp9.mp4", "mp4", 15, 15),
-
+    //videoio_encapsulate_params_t("video/sample_322x242_15frames.yuv420p.libx265.mp4", "mp4", 15, 15, true, true),
+    //videoio_encapsulate_params_t("video/sample_322x242_15frames.yuv420p.libvpx-vp9.mp4", "mp4", 15, 15, false, true),
 };

 INSTANTIATE_TEST_CASE_P(/**/, videoio_encapsulate, testing::ValuesIn(videoio_encapsulate_params));