From bbab522274f857279773d1c618ce7915b9300003 Mon Sep 17 00:00:00 2001 From: Khyber Sen Date: Tue, 20 May 2025 07:40:01 -0400 Subject: [PATCH 1/3] perf: `fn rav1d_create_lf_mask_intra`: store as `u16`s Previously, two adjacent 8-bit stores were done. By doing them as `u16`s instead of `[u8]`s/`[u8; 2]`s, we can get a single 16-bit store to be emitted instead. --- src/lf_mask.rs | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/lf_mask.rs b/src/lf_mask.rs index b75e5eb42..4b21e57d6 100644 --- a/src/lf_mask.rs +++ b/src/lf_mask.rs @@ -19,6 +19,7 @@ use parking_lot::RwLock; use std::cmp; use std::ffi::c_int; use std::mem::MaybeUninit; +use zerocopy::FromBytes; #[repr(C)] pub struct Av1FilterLUT { @@ -449,6 +450,9 @@ pub(crate) fn rav1d_create_lf_mask_intra( let bx4 = bx & 31; let by4 = by & 31; + let filter_level_yuv = filter_level.0.map(|a| a[0][0]); + let [filter_level_y, filter_level_uv] = *<[u16; 2]>::ref_from(&filter_level_yuv).unwrap(); + if bw4 != 0 && bh4 != 0 { let mut level_cache_off = by * b4_stride + bx; for _y in 0..bh4 { @@ -456,8 +460,7 @@ pub(crate) fn rav1d_create_lf_mask_intra( let idx = 4 * (level_cache_off + x); // `0.., ..2` is for Y let lvl = &mut *level_cache.index_mut((idx + 0.., ..2)); - lvl[0] = filter_level[0][0][0]; - lvl[1] = filter_level[1][0][0]; + *u16::mut_from(lvl).unwrap() = filter_level_y; } level_cache_off += b4_stride; } @@ -494,8 +497,7 @@ pub(crate) fn rav1d_create_lf_mask_intra( let idx = 4 * (level_cache_off + x); // `2.., ..2` is for UV let lvl = &mut *level_cache.index_mut((idx + 2.., ..2)); - lvl[0] = filter_level[2][0][0]; - lvl[1] = filter_level[3][0][0]; + *u16::mut_from(lvl).unwrap() = filter_level_uv; } level_cache_off += b4_stride; } From f7fd7df743733abd2abe4f165d3d5296fd87368f Mon Sep 17 00:00:00 2001 From: Khyber Sen Date: Tue, 27 May 2025 04:29:39 -0400 Subject: [PATCH 2/3] perf: `fn rav1d_create_lf_mask_inter`: store as `u16`s Same as for `fn rav1d_create_lf_mask_intra`. --- src/lf_mask.rs | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/lf_mask.rs b/src/lf_mask.rs index 4b21e57d6..2e3bc2a0f 100644 --- a/src/lf_mask.rs +++ b/src/lf_mask.rs @@ -549,6 +549,9 @@ pub(crate) fn rav1d_create_lf_mask_inter( let bx4 = bx & 31; let by4 = by & 31; + let filter_level_yuv = filter_level.0.map(|a| a[r#ref][is_gmv]); + let [filter_level_y, filter_level_uv] = *<[u16; 2]>::ref_from(&filter_level_yuv).unwrap(); + if bw4 != 0 && bh4 != 0 { let mut level_cache_off = by * b4_stride + bx; for _y in 0..bh4 { @@ -556,8 +559,7 @@ pub(crate) fn rav1d_create_lf_mask_inter( let idx = 4 * (level_cache_off + x); // `0.., ..2` is for Y let lvl = &mut *level_cache.index_mut((idx + 0.., ..2)); - lvl[0] = filter_level[0][r#ref][is_gmv]; - lvl[1] = filter_level[1][r#ref][is_gmv]; + *u16::mut_from(lvl).unwrap() = filter_level_y; } level_cache_off += b4_stride; } @@ -605,8 +607,7 @@ pub(crate) fn rav1d_create_lf_mask_inter( let idx = 4 * (level_cache_off + x); // `2.., ..2` is for UV let lvl = &mut *level_cache.index_mut((idx + 2.., ..2)); - lvl[0] = filter_level[2][r#ref][is_gmv]; - lvl[1] = filter_level[3][r#ref][is_gmv]; + *u16::mut_from(lvl).unwrap() = filter_level_uv; } level_cache_off += b4_stride; } From d5d6d73f57e313fd4f4390a9041e595cbfbc1543 Mon Sep 17 00:00:00 2001 From: Khyber Sen Date: Fri, 13 Jun 2025 03:45:49 -0400 Subject: [PATCH 3/3] `struct Rav1dFrameContextLf::level`: make an `AlignedVec2` so we can safely treat it as `u16`s --- src/align.rs | 1 + src/decode.rs | 3 +-- src/internal.rs | 3 ++- src/lf_apply.rs | 9 +++++---- src/lf_mask.rs | 5 +++-- src/loopfilter.rs | 9 +++++---- 6 files changed, 17 insertions(+), 13 deletions(-) diff --git a/src/align.rs b/src/align.rs index dc665d726..eba1ce8a3 100644 --- a/src/align.rs +++ b/src/align.rs @@ -263,6 +263,7 @@ impl Default for AlignedVec { } } +pub type AlignedVec2 = AlignedVec>; pub type AlignedVec32 = AlignedVec>; pub type AlignedVec64 = AlignedVec>; diff --git a/src/decode.rs b/src/decode.rs index ce47a5c6c..f138335df 100644 --- a/src/decode.rs +++ b/src/decode.rs @@ -4538,8 +4538,7 @@ pub(crate) fn rav1d_decode_frame_init(c: &Rav1dContext, fc: &Rav1dFrameContext) f.lf.mask.resize_with(num_sb128 as usize, Default::default); // over-allocate by 3 bytes since some of the SIMD implementations // index this from the level type and can thus over-read by up to 3 bytes. - f.lf.level - .resize_with(4 * num_sb128 as usize * 32 * 32 + 3, Default::default); // TODO: Fallible allocation + f.lf.level.resize(4 * num_sb128 as usize * 32 * 32 + 3, 0); // TODO: Fallible allocation if c.fc.len() > 1 { // TODO: Fallible allocation f.frame_thread diff --git a/src/internal.rs b/src/internal.rs index 855e4c78f..504a60ff3 100644 --- a/src/internal.rs +++ b/src/internal.rs @@ -1,5 +1,6 @@ use crate::align::Align16; use crate::align::Align64; +use crate::align::AlignedVec2; use crate::align::AlignedVec64; use crate::cdef::Rav1dCdefDSPContext; use crate::cdf::CdfContext; @@ -701,7 +702,7 @@ impl TxLpfRightEdge { #[derive(Default)] #[repr(C)] pub struct Rav1dFrameContextLf { - pub level: DisjointMut>, + pub level: DisjointMut>, pub mask: Vec, /* len = w*h */ pub lr_mask: Vec, pub lim_lut: Align16, diff --git a/src/lf_apply.rs b/src/lf_apply.rs index 52b00fcfe..7ae9eaf06 100644 --- a/src/lf_apply.rs +++ b/src/lf_apply.rs @@ -1,5 +1,6 @@ #![deny(unsafe_op_in_unsafe_fn)] +use crate::align::AlignedVec2; use crate::align::AlignedVec64; use crate::disjoint_mut::DisjointMut; use crate::include::common::bitdepth::BitDepth; @@ -368,7 +369,7 @@ pub(crate) fn rav1d_copy_lpf( fn filter_plane_cols_y( f: &Rav1dFrameData, have_left: bool, - lvl: WithOffset<&DisjointMut>>, + lvl: WithOffset<&DisjointMut>>, mask: &[[[RelaxedAtomic; 2]; 3]; 32], y_dst: Rav1dPictureDataComponentOffset, w: usize, @@ -405,7 +406,7 @@ fn filter_plane_cols_y( fn filter_plane_rows_y( f: &Rav1dFrameData, have_top: bool, - lvl: WithOffset<&DisjointMut>>, + lvl: WithOffset<&DisjointMut>>, b4_stride: usize, mask: &[[[RelaxedAtomic; 2]; 3]; 32], y_dst: Rav1dPictureDataComponentOffset, @@ -437,7 +438,7 @@ fn filter_plane_rows_y( fn filter_plane_cols_uv( f: &Rav1dFrameData, have_left: bool, - lvl: WithOffset<&DisjointMut>>, + lvl: WithOffset<&DisjointMut>>, mask: &[[[RelaxedAtomic; 2]; 2]; 32], u_dst: Rav1dPictureDataComponentOffset, v_dst: Rav1dPictureDataComponentOffset, @@ -480,7 +481,7 @@ fn filter_plane_cols_uv( fn filter_plane_rows_uv( f: &Rav1dFrameData, have_top: bool, - lvl: WithOffset<&DisjointMut>>, + lvl: WithOffset<&DisjointMut>>, b4_stride: usize, mask: &[[[RelaxedAtomic; 2]; 2]; 32], u_dst: Rav1dPictureDataComponentOffset, diff --git a/src/lf_mask.rs b/src/lf_mask.rs index 2e3bc2a0f..fd16db417 100644 --- a/src/lf_mask.rs +++ b/src/lf_mask.rs @@ -1,4 +1,5 @@ use crate::align::Align16; +use crate::align::AlignedVec2; use crate::align::ArrayDefault; use crate::ctx::CaseSet; use crate::disjoint_mut::DisjointMut; @@ -426,7 +427,7 @@ fn mask_edges_chroma( #[inline(never)] pub(crate) fn rav1d_create_lf_mask_intra( lflvl: &Av1Filter, - level_cache: &DisjointMut>, + level_cache: &DisjointMut>, b4_stride: ptrdiff_t, filter_level: &Align16<[[[u8; 2]; 8]; 4]>, b: Bxy, @@ -520,7 +521,7 @@ pub(crate) fn rav1d_create_lf_mask_intra( #[inline(never)] pub(crate) fn rav1d_create_lf_mask_inter( lflvl: &Av1Filter, - level_cache: &DisjointMut>, + level_cache: &DisjointMut>, b4_stride: ptrdiff_t, filter_level: &Align16<[[[u8; 2]; 8]; 4]>, r#ref: usize, diff --git a/src/loopfilter.rs b/src/loopfilter.rs index bc81ba286..bedfd7803 100644 --- a/src/loopfilter.rs +++ b/src/loopfilter.rs @@ -1,6 +1,7 @@ #![deny(unsafe_op_in_unsafe_fn)] use crate::align::Align16; +use crate::align::AlignedVec2; use crate::cpu::CpuFlags; use crate::disjoint_mut::DisjointMut; use crate::ffi_safe::FFISafe; @@ -34,7 +35,7 @@ wrap_fn_ptr!(pub unsafe extern "C" fn loopfilter_sb( w: c_int, bitdepth_max: c_int, _dst: *const FFISafe, - _lvl: *const FFISafe>>>, + _lvl: *const FFISafe>>>, ) -> ()); impl loopfilter_sb::Fn { @@ -43,7 +44,7 @@ impl loopfilter_sb::Fn { f: &Rav1dFrameData, dst: Rav1dPictureDataComponentOffset, mask: &[u32; 3], - lvl: WithOffset<&DisjointMut>>, + lvl: WithOffset<&DisjointMut>>, w: usize, ) { let dst_ptr = dst.as_mut_ptr::().cast(); @@ -289,7 +290,7 @@ enum YUV { fn loop_filter_sb128_rust( mut dst: Rav1dPictureDataComponentOffset, vmask: &[u32; 3], - mut lvl: WithOffset<&DisjointMut>>, + mut lvl: WithOffset<&DisjointMut>>, b4_stride: usize, lut: &Align16, _wh: c_int, @@ -367,7 +368,7 @@ unsafe extern "C" fn loop_filter_sb128_c_erased, - lvl: *const FFISafe>>>, + lvl: *const FFISafe>>>, ) { // SAFETY: Was passed as `FFISafe::new(_)` in `loopfilter_sb::Fn::call`. let dst = *unsafe { FFISafe::get(dst) };