diff --git a/libs/scrap/examples/benchmark.rs b/libs/scrap/examples/benchmark.rs index eccf502d9..dd1380995 100644 --- a/libs/scrap/examples/benchmark.rs +++ b/libs/scrap/examples/benchmark.rs @@ -101,10 +101,8 @@ fn test_vpx( let config = EncoderCfg::VPX(VpxEncoderConfig { width: width as _, height: height as _, - timebase: [1, 1000], bitrate: bitrate_k as _, codec: codec_id, - num_threads: (num_cpus::get() / 2) as _, }); let mut encoder = VpxEncoder::new(config).unwrap(); let mut vpxs = vec![]; diff --git a/libs/scrap/examples/record-screen.rs b/libs/scrap/examples/record-screen.rs index 10ad66e09..48f73052a 100644 --- a/libs/scrap/examples/record-screen.rs +++ b/libs/scrap/examples/record-screen.rs @@ -101,10 +101,8 @@ fn main() -> io::Result<()> { let mut vpx = vpx_encode::VpxEncoder::new(EncoderCfg::VPX(vpx_encode::VpxEncoderConfig { width, height, - timebase: [1, 1000], bitrate: args.flag_bv, codec: vpx_codec, - num_threads: 0, })) .unwrap(); diff --git a/libs/scrap/src/common/aom.rs b/libs/scrap/src/common/aom.rs index b9748efda..2c614feec 100644 --- a/libs/scrap/src/common/aom.rs +++ b/libs/scrap/src/common/aom.rs @@ -78,7 +78,7 @@ mod webrtc { } else { // Use 2 threads for low res on ARM. #[cfg(any(target_arch = "arm", target_arch = "aarch64", target_os = "android"))] - if (width * height >= 320 * 180 && number_of_cores > 2) { + if width * height >= 320 * 180 && number_of_cores > 2 { return 2; } // 1 thread less than VGA. diff --git a/libs/scrap/src/common/vpxcodec.rs b/libs/scrap/src/common/vpxcodec.rs index 801527811..090df2cc2 100644 --- a/libs/scrap/src/common/vpxcodec.rs +++ b/libs/scrap/src/common/vpxcodec.rs @@ -10,13 +10,14 @@ use hbb_common::ResultType; use crate::codec::EncoderApi; use crate::{GoogleImage, STRIDE_ALIGN}; -use super::vpx::{vp8e_enc_control_id::*, vpx_codec_err_t::*, *}; +use super::vpx::{vpx_codec_err_t::*, *}; use crate::{generate_call_macro, generate_call_ptr_macro, Error, Result}; use hbb_common::bytes::Bytes; -use std::os::raw::{c_int, c_uint}; +use std::os::raw::c_uint; use std::{ptr, slice}; generate_call_macro!(call_vpx, false); +generate_call_macro!(call_vpx_allow_err, true); generate_call_ptr_macro!(call_vpx_ptr); #[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)] @@ -53,40 +54,11 @@ impl EncoderApi for VpxEncoder { VpxVideoCodecId::VP8 => call_vpx_ptr!(vpx_codec_vp8_cx()), VpxVideoCodecId::VP9 => call_vpx_ptr!(vpx_codec_vp9_cx()), }; - let mut c = unsafe { std::mem::MaybeUninit::zeroed().assume_init() }; - call_vpx!(vpx_codec_enc_config_default(i, &mut c, 0)); - // https://www.webmproject.org/docs/encoder-parameters/ - // default: c.rc_min_quantizer = 0, c.rc_max_quantizer = 63 - // try rc_resize_allowed later - - c.g_w = config.width; - c.g_h = config.height; - c.g_timebase.num = config.timebase[0]; - c.g_timebase.den = config.timebase[1]; - c.rc_target_bitrate = config.bitrate; - c.rc_undershoot_pct = 95; - c.rc_dropframe_thresh = 25; - c.g_threads = if config.num_threads == 0 { - num_cpus::get() as _ - } else { - config.num_threads + let c = match config.codec { + VpxVideoCodecId::VP8 => webrtc::vp8::enc_cfg(i, &config)?, + VpxVideoCodecId::VP9 => webrtc::vp9::enc_cfg(i, &config)?, }; - c.g_error_resilient = VPX_ERROR_RESILIENT_DEFAULT; - // https://developers.google.com/media/vp9/bitrate-modes/ - // Constant Bitrate mode (CBR) is recommended for live streaming with VP9. - c.rc_end_usage = vpx_rc_mode::VPX_CBR; - // c.kf_min_dist = 0; - // c.kf_max_dist = 999999; - c.kf_mode = vpx_kf_mode::VPX_KF_DISABLED; // reduce bandwidth a lot - - /* - The VPX encoder supports two-pass encoding for rate control purposes. - In two-pass encoding, the entire encoding process is performed twice. - The first pass generates new control parameters for the second pass. - - This approach enables the best PSNR at the same bit rate. - */ let mut ctx = Default::default(); call_vpx!(vpx_codec_enc_init_ver( @@ -96,50 +68,9 @@ impl EncoderApi for VpxEncoder { 0, VPX_ENCODER_ABI_VERSION as _ )); - - if config.codec == VpxVideoCodecId::VP9 { - // set encoder internal speed settings - // in ffmpeg, it is --speed option - /* - set to 0 or a positive value 1-16, the codec will try to adapt its - complexity depending on the time it spends encoding. Increasing this - number will make the speed go up and the quality go down. - Negative values mean strict enforcement of this - while positive values are adaptive - */ - /* https://developers.google.com/media/vp9/live-encoding - Speed 5 to 8 should be used for live / real-time encoding. - Lower numbers (5 or 6) are higher quality but require more CPU power. - Higher numbers (7 or 8) will be lower quality but more manageable for lower latency - use cases and also for lower CPU power devices such as mobile. - */ - call_vpx!(vpx_codec_control_(&mut ctx, VP8E_SET_CPUUSED as _, 7,)); - // set row level multi-threading - /* - as some people in comments and below have already commented, - more recent versions of libvpx support -row-mt 1 to enable tile row - multi-threading. This can increase the number of tiles by up to 4x in VP9 - (since the max number of tile rows is 4, regardless of video height). - To enable this, use -tile-rows N where N is the number of tile rows in - log2 units (so -tile-rows 1 means 2 tile rows and -tile-rows 2 means 4 tile - rows). The total number of active threads will then be equal to - $tile_rows * $tile_columns - */ - call_vpx!(vpx_codec_control_( - &mut ctx, - VP9E_SET_ROW_MT as _, - 1 as c_int - )); - - call_vpx!(vpx_codec_control_( - &mut ctx, - VP9E_SET_TILE_COLUMNS as _, - 4 as c_int - )); - } else if config.codec == VpxVideoCodecId::VP8 { - // https://github.com/webmproject/libvpx/blob/972149cafeb71d6f08df89e91a0130d6a38c4b15/vpx/vp8cx.h#L172 - // https://groups.google.com/a/webmproject.org/g/webm-discuss/c/DJhSrmfQ61M - call_vpx!(vpx_codec_control_(&mut ctx, VP8E_SET_CPUUSED as _, 12,)); + match config.codec { + VpxVideoCodecId::VP8 => webrtc::vp8::set_control(&mut ctx, &c)?, + VpxVideoCodecId::VP9 => webrtc::vp9::set_control(&mut ctx, &c)?, } Ok(Self { @@ -287,13 +218,10 @@ pub struct VpxEncoderConfig { pub width: c_uint, /// The height (in pixels). pub height: c_uint, - /// The timebase numerator and denominator (in seconds). - pub timebase: [c_int; 2], /// The target bitrate (in kilobits per second). pub bitrate: c_uint, /// The codec pub codec: VpxVideoCodecId, - pub num_threads: u32, } #[derive(Clone, Copy, Debug)] @@ -489,3 +417,370 @@ impl Drop for Image { } unsafe impl Send for vpx_codec_ctx_t {} + +mod webrtc { + use super::*; + + const K_QP_MAX: u32 = 25; // worth adjusting + const MODE: VideoCodecMode = VideoCodecMode::KScreensharing; + const K_RTP_TICKS_PER_SECOND: i32 = 90000; + const NUMBER_OF_TEMPORAL_LAYERS: u32 = 1; + const DENOISING_ON: bool = true; + const FRAME_DROP_ENABLED: bool = false; + + #[allow(dead_code)] + #[derive(Debug, PartialEq, Eq)] + enum VideoCodecMode { + KRealtimeVideo, + KScreensharing, + } + + #[allow(dead_code)] + #[derive(Debug, PartialEq, Eq)] + enum VideoCodecComplexity { + KComplexityLow = -1, + KComplexityNormal = 0, + KComplexityHigh = 1, + KComplexityHigher = 2, + KComplexityMax = 3, + } + + // https://webrtc.googlesource.com/src/+/refs/heads/main/modules/video_coding/codecs/vp9/libvpx_vp9_encoder.cc + pub mod vp9 { + use super::*; + const SVC: bool = false; + // https://webrtc.googlesource.com/src/+/refs/heads/main/api/video_codecs/video_encoder.cc#35 + const KEY_FRAME_INTERVAL: u32 = 3000; + const ADAPTIVE_QP_MODE: bool = true; + + pub fn enc_cfg( + i: *const vpx_codec_iface_t, + cfg: &VpxEncoderConfig, + ) -> ResultType { + let mut c: vpx_codec_enc_cfg_t = + unsafe { std::mem::MaybeUninit::zeroed().assume_init() }; + call_vpx!(vpx_codec_enc_config_default(i, &mut c, 0)); + + // kProfile0 + c.g_bit_depth = vpx_bit_depth::VPX_BITS_8; + c.g_profile = 0; + c.g_input_bit_depth = 8; + + c.g_w = cfg.width; + c.g_h = cfg.height; + c.rc_target_bitrate = cfg.bitrate; // in kbit/s + c.g_error_resilient = if SVC { VPX_ERROR_RESILIENT_DEFAULT } else { 0 }; + c.g_timebase.num = 1; + c.g_timebase.den = K_RTP_TICKS_PER_SECOND; + c.g_lag_in_frames = 0; + c.rc_dropframe_thresh = if FRAME_DROP_ENABLED { 30 } else { 0 }; + c.rc_end_usage = vpx_rc_mode::VPX_CBR; + c.g_pass = vpx_enc_pass::VPX_RC_ONE_PASS; + c.rc_min_quantizer = if MODE == VideoCodecMode::KScreensharing { + 8 + } else { + 2 + }; + c.rc_max_quantizer = K_QP_MAX; + c.rc_undershoot_pct = 50; + c.rc_overshoot_pct = 50; + c.rc_buf_initial_sz = 500; + c.rc_buf_optimal_sz = 600; + c.rc_buf_sz = 1000; + // Key-frame interval is enforced manually by this wrapper. + c.kf_mode = vpx_kf_mode::VPX_KF_DISABLED; + // TODO(webm:1592): work-around for libvpx issue, as it can still + // put some key-frames at will even in VPX_KF_DISABLED kf_mode. + c.kf_max_dist = KEY_FRAME_INTERVAL; + c.kf_min_dist = c.kf_max_dist; + c.rc_resize_allowed = 0; + // Determine number of threads based on the image size and #cores. + c.g_threads = number_of_threads(c.g_w, c.g_h, num_cpus::get()); + + c.temporal_layering_mode = + vp9e_temporal_layering_mode::VP9E_TEMPORAL_LAYERING_MODE_NOLAYERING as _; + c.ts_number_layers = 1; + c.ts_rate_decimator[0] = 1; + c.ts_periodicity = 1; + c.ts_layer_id[0] = 0; + + Ok(c) + } + + pub fn set_control(ctx: *mut vpx_codec_ctx_t, cfg: &vpx_codec_enc_cfg_t) -> ResultType<()> { + use vp8e_enc_control_id::*; + + macro_rules! call_ctl { + ($ctx:expr, $vpxe:expr, $arg:expr) => {{ + call_vpx_allow_err!(vpx_codec_control_($ctx, $vpxe as i32, $arg)); + }}; + } + call_ctl!( + ctx, + VP8E_SET_MAX_INTRA_BITRATE_PCT, + max_intra_target(cfg.rc_buf_optimal_sz) + ); + call_ctl!(ctx, VP9E_SET_AQ_MODE, if ADAPTIVE_QP_MODE { 3 } else { 0 }); + call_ctl!(ctx, VP9E_SET_FRAME_PARALLEL_DECODING, 0); + call_ctl!(ctx, VP9E_SET_SVC_GF_TEMPORAL_REF, 0); + call_ctl!( + ctx, + VP8E_SET_CPUUSED, + get_default_performance_flags(cfg.g_w, cfg.g_h).0 + ); + call_ctl!(ctx, VP9E_SET_TILE_COLUMNS, cfg.g_threads >> 1); + // Turn on row-based multithreading. + call_ctl!(ctx, VP9E_SET_ROW_MT, 1); + let denoising = DENOISING_ON + && allow_denoising() + && get_default_performance_flags(cfg.g_w, cfg.g_h).1; + call_ctl!( + ctx, + VP9E_SET_NOISE_SENSITIVITY, + if denoising { 1 } else { 0 } + ); + if MODE == VideoCodecMode::KScreensharing { + call_ctl!(ctx, VP9E_SET_TUNE_CONTENT, 1); + } + // Enable encoder skip of static/low content blocks. + call_ctl!(ctx, VP8E_SET_STATIC_THRESHOLD, 1); + + Ok(()) + } + + // return (base_layer_speed, allow_denoising) + fn get_default_performance_flags(width: u32, height: u32) -> (u32, bool) { + if cfg!(any( + target_arch = "arm", + target_arch = "aarch64", + target_os = "android" + )) { + (8, true) + } else if width * height < 352 * 288 { + (5, true) + } else if width * height < 1920 * 1080 { + (7, true) + } else { + (9, false) + } + } + + fn allow_denoising() -> bool { + // Do not enable the denoiser on ARM since optimization is pending. + // Denoiser is on by default on other platforms. + if cfg!(any( + target_arch = "arm", + target_arch = "aarch64", + target_os = "android" + )) { + false + } else { + true + } + } + + fn number_of_threads(width: u32, height: u32, number_of_cores: usize) -> u32 { + // Keep the number of encoder threads equal to the possible number of column + // tiles, which is (1, 2, 4, 8). See comments below for VP9E_SET_TILE_COLUMNS. + if width * height >= 1280 * 720 && number_of_cores > 4 { + return 4; + } else if width * height >= 640 * 360 && number_of_cores > 2 { + return 2; + } else { + // Use 2 threads for low res on ARM. + #[cfg(any(target_arch = "arm", target_arch = "aarch64", target_os = "android"))] + if width * height >= 320 * 180 && number_of_cores > 2 { + return 2; + } + // 1 thread less than VGA. + return 1; + } + } + } + + // https://webrtc.googlesource.com/src/+/refs/heads/main/modules/video_coding/codecs/vp8/libvpx_vp8_encoder.cc + pub mod vp8 { + use super::*; + // https://webrtc.googlesource.com/src/+/refs/heads/main/api/video_codecs/video_encoder.cc#23 + const DISABLE_KEY_FRAME_INTERVAL: bool = true; + const KEY_FRAME_INTERVAL: u32 = 3000; + const COMPLEXITY: VideoCodecComplexity = VideoCodecComplexity::KComplexityNormal; + const K_TOKEN_PARTITIONS: vp8e_token_partitions = + vp8e_token_partitions::VP8_ONE_TOKENPARTITION; + + pub fn enc_cfg( + i: *const vpx_codec_iface_t, + cfg: &VpxEncoderConfig, + ) -> ResultType { + let mut c: vpx_codec_enc_cfg_t = + unsafe { std::mem::MaybeUninit::zeroed().assume_init() }; + call_vpx!(vpx_codec_enc_config_default(i, &mut c, 0)); + + c.g_w = cfg.width; + c.g_h = cfg.height; + c.g_timebase.num = 1; + c.g_timebase.den = K_RTP_TICKS_PER_SECOND; + c.g_lag_in_frames = 0; + c.g_error_resilient = if NUMBER_OF_TEMPORAL_LAYERS > 1 { + VPX_ERROR_RESILIENT_DEFAULT + } else { + 0 + }; + c.rc_end_usage = vpx_rc_mode::VPX_CBR; + c.g_pass = vpx_enc_pass::VPX_RC_ONE_PASS; + c.rc_resize_allowed = 0; + c.rc_min_quantizer = if MODE == VideoCodecMode::KScreensharing { + 12 + } else { + 2 + }; + c.rc_max_quantizer = K_QP_MAX; + c.rc_undershoot_pct = 100; + c.rc_overshoot_pct = 15; + c.rc_buf_initial_sz = 500; + c.rc_buf_optimal_sz = 600; + c.rc_buf_sz = 1000; + if !DISABLE_KEY_FRAME_INTERVAL && KEY_FRAME_INTERVAL > 0 { + c.kf_mode = vpx_kf_mode::VPX_KF_AUTO; + c.kf_max_dist = KEY_FRAME_INTERVAL; + } else { + c.kf_mode = vpx_kf_mode::VPX_KF_DISABLED; + } + c.g_threads = number_of_threads(c.g_w, c.g_h, num_cpus::get()); + c.rc_target_bitrate = cfg.bitrate; + c.rc_dropframe_thresh = if FRAME_DROP_ENABLED { 30 } else { 0 }; + + Ok(c) + } + + pub fn set_control(ctx: *mut vpx_codec_ctx_t, cfg: &vpx_codec_enc_cfg_t) -> ResultType<()> { + use vp8e_enc_control_id::*; + + macro_rules! call_ctl { + ($ctx:expr, $vpxe:expr, $arg:expr) => {{ + call_vpx_allow_err!(vpx_codec_control_($ctx, $vpxe as i32, $arg)); + }}; + } + call_ctl!( + ctx, + VP8E_SET_STATIC_THRESHOLD, + if MODE == VideoCodecMode::KScreensharing { + 100 + } else { + 1 + } + ); + call_ctl!( + ctx, + VP8E_SET_CPUUSED, + get_cpu_speed(cfg.g_w, cfg.g_h, num_cpus::get()) + ); + + call_ctl!(ctx, VP8E_SET_TOKEN_PARTITIONS, K_TOKEN_PARTITIONS); + call_ctl!( + ctx, + VP8E_SET_MAX_INTRA_BITRATE_PCT, + max_intra_target(cfg.rc_buf_optimal_sz) + ); + call_ctl!( + ctx, + VP8E_SET_SCREEN_CONTENT_MODE, + if MODE == VideoCodecMode::KScreensharing { + 2 // On with more aggressive rate control. + } else { + 0 + } + ); + + Ok(()) + } + + fn get_cpu_speed_default() -> i32 { + match COMPLEXITY { + VideoCodecComplexity::KComplexityHigh => -5, + VideoCodecComplexity::KComplexityHigher => -4, + VideoCodecComplexity::KComplexityMax => -3, + _ => -6, + } + } + + fn get_cpu_speed(width: u32, height: u32, number_of_cores: usize) -> i32 { + if cfg!(any( + target_arch = "arm", + target_arch = "aarch64", + target_os = "android" + )) { + if number_of_cores <= 3 { + -12 + } else if width * height <= 352 * 288 { + -8 + } else if width * height <= 640 * 480 { + -10 + } else { + -12 + } + } else { + let cpu_speed_default = get_cpu_speed_default(); + if width * height < 352 * 288 { + if cpu_speed_default < -4 { + -4 + } else { + cpu_speed_default + } + } else { + cpu_speed_default + } + } + } + + fn number_of_threads(width: u32, height: u32, cpus: usize) -> u32 { + if cfg!(target_os = "android") { + if width * height >= 320 * 180 { + if cpus >= 4 { + // 3 threads for CPUs with 4 and more cores since most of times only 4 + // cores will be active. + 3 + } else if cpus == 3 || cpus == 2 { + 2 + } else { + 1 + } + } else { + 1 + } + } else { + if width * height >= 1920 * 1080 && cpus > 8 { + 8 // 8 threads for 1080p on high perf machines. + } else if width * height > 1280 * 960 && cpus >= 6 { + // 3 threads for 1080p. + return 3; + } else if width * height > 640 * 480 && cpus >= 3 { + // Default 2 threads for qHD/HD, but allow 3 if core count is high enough, + // as this will allow more margin for high-core/low clock machines or if + // not built with highest optimization. + if cpus >= 6 { + 3 + } else { + 2 + } + } else { + // 1 thread for VGA or less. + 1 + } + } + } + } + + fn max_intra_target(optimal_buffer_size: u32) -> u32 { + const MAX_FRAMERATE: u32 = 60; // TODO + let scale_par: f32 = 0.5; + let target_pct: u32 = + ((optimal_buffer_size as f32) * scale_par * MAX_FRAMERATE as f32 / 10.0) as u32; + let min_intra_size: u32 = 300; + if target_pct < min_intra_size { + min_intra_size + } else { + target_pct + } + } +} diff --git a/src/server/video_service.rs b/src/server/video_service.rs index f1053c5c9..d8e69ab13 100644 --- a/src/server/video_service.rs +++ b/src/server/video_service.rs @@ -540,14 +540,12 @@ fn run(sp: GenericService) -> ResultType<()> { EncoderCfg::VPX(VpxEncoderConfig { width: c.width as _, height: c.height as _, - timebase: [1, 1000], // Output timestamp precision bitrate, codec: if name == scrap::CodecName::VP8 { VpxVideoCodecId::VP8 } else { VpxVideoCodecId::VP9 }, - num_threads: (num_cpus::get() / 2) as _, }) } scrap::CodecName::AV1 => EncoderCfg::AOM(AomEncoderConfig {