Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ raw-cpuid = "11.0.1"
strum = { version = "0.26", features = ["derive"] }
to_method = "1.1.0"
zerocopy = { version = "0.7.32", features = ["derive"] }
bytemuck = { version = "1.23.0" }

[build-dependencies]
cc = "1.0.79"
Expand Down
45 changes: 25 additions & 20 deletions src/lf_mask.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ use crate::src::levels::SegmentId;
use crate::src::levels::TxfmSize;
use crate::src::relaxed_atomic::RelaxedAtomic;
use crate::src::tables::dav1d_txfm_dimensions;
use bytemuck;
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We're already using zerocopy, so if you can do these operations with zerocopy instead of bytemuck that would be much preferred.

Copy link
Author

@coderkalyan coderkalyan May 19, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Let me see if zerocopy can do the same thing. If not it's also a "relatively safe" unsafe mem::transmute call.

use libc::ptrdiff_t;
use parking_lot::RwLock;
use std::cmp;
Expand Down Expand Up @@ -149,7 +150,7 @@ fn decomp_tx(
};
}

#[inline]
#[inline(always)]
fn mask_edges_inter(
masks: &[[[[RelaxedAtomic<u16>; 2]; 3]; 32]; 2],
by4: usize,
Expand Down Expand Up @@ -261,7 +262,7 @@ fn mask_edges_inter(
a[..w4].copy_from_slice(txa_slice);
}

#[inline]
#[inline(always)]
fn mask_edges_intra(
masks: &[[[[RelaxedAtomic<u16>; 2]; 3]; 32]; 2],
by4: usize,
Expand Down Expand Up @@ -452,12 +453,13 @@ pub(crate) fn rav1d_create_lf_mask_intra(
if bw4 != 0 && bh4 != 0 {
let mut level_cache_off = by * b4_stride + bx;
for _y in 0..bh4 {
let idx = 4 * level_cache_off;
let lvl = &mut *level_cache.index_mut((idx + 0.., ..4 * bw4));
let lvl: &mut [[u8; 4]] = bytemuck::cast_slice_mut(lvl);
for x in 0..bw4 {
let idx = 4 * (level_cache_off + x);
// `0.., ..2` is for Y
let lvl = &mut *level_cache.index_mut((idx + 0.., ..2));
lvl[0] = filter_level[0][0][0];
lvl[1] = filter_level[1][0][0];
lvl[x][0] = filter_level[0][0][0];
lvl[x][1] = filter_level[1][0][0];
Comment on lines +461 to +462
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

i was also looking at this function yesterday, you might want to keep an eye out for ways to convince llvm/rustc that this store can be one 16-bit store instead of a pair of 8-bit stores (i took a "get out and push" approach while proving it out, myself: iximeow@e133a89)

the factor by which the loop is unrolled is partially a question of the size of the loop, so where this produces pairs of 5-byte stores for each iteration:

   f10e9: e9 72 ff ff ff        jmp    f1060
   f10ee: 66 90                 xchg   %ax,%ax
   f10f0: 4a 8d 04 32           lea    (%rdx,%r14,1),%rax
   f10f4: 45 31 ff              xor    %r15d,%r15d
   f10f7: 66 0f 1f 84 00 00 00  nopw   0x0(%rax,%rax,1)
   f10fe: 00 00
   f1100: 42 88 74 b8 f3        mov    %sil,-0xd(%rax,%r15,4)
   f1105: 42 88 5c b8 f4        mov    %bl,-0xc(%rax,%r15,4)
   f110a: 42 88 74 b8 f7        mov    %sil,-0x9(%rax,%r15,4)
   f110f: 42 88 5c b8 f8        mov    %bl,-0x8(%rax,%r15,4)
   f1114: 42 88 74 b8 fb        mov    %sil,-0x5(%rax,%r15,4)
   f1119: 42 88 5c b8 fc        mov    %bl,-0x4(%rax,%r15,4)
   f111e: 42 88 74 b8 ff        mov    %sil,-0x1(%rax,%r15,4)
   f1123: 42 88 1c b8           mov    %bl,(%rax,%r15,4)
   f1127: 49 83 c7 04           add    $0x4,%r15
   f112b: 4d 39 fb              cmp    %r15,%r11
   f112e: 75 d0                 jne    f1100 

i'd seen slightly better still throughput with the iteration being expressed as one 4-byte store:

   f10f9: 72 2e                 jb     f1129 <_ZN5rav1d3src7lf_mask26rav1d_create_lf_mask_inter17h52008c9dbd379820E+0x1b9>
   f10fb: 4d 89 cb              mov    %r9,%r11
   f10fe: 66 90                 xchg   %ax,%ax
   f1100: 66 89 0f              mov    %cx,(%rdi)
   f1103: 66 89 4f 04           mov    %cx,0x4(%rdi)
   f1107: 66 89 4f 08           mov    %cx,0x8(%rdi)
   f110b: 66 89 4f 0c           mov    %cx,0xc(%rdi)
   f110f: 66 89 4f 10           mov    %cx,0x10(%rdi)
   f1113: 66 89 4f 14           mov    %cx,0x14(%rdi)
   f1117: 66 89 4f 18           mov    %cx,0x18(%rdi)
   f111b: 66 89 4f 1c           mov    %cx,0x1c(%rdi)
   f111f: 48 83 c7 20           add    $0x20,%rdi
   f1123: 49 83 c3 f8           add    $0xfffffffffffffff8,%r11
   f1127: 75 d7                 jne    f1100

that said i'd initially taken an approach of re-slicing from lvl_buf in the innermost loop instead of incrementing x - that worked well too, except it kept maintaining the slice length (an additional sub each loop iteration) where your approach here doesn't do that. it is unfortunate that you get the slightly wider `mov %sil, -0xd(%base,%index,4)`-style stores though..

i also had a hard time precisely measuring this one. on a 9950x, my change was an improvement on the order of 0.5% for the 10-bit video. on the 8-bit video where i've better numbers: ~1% fewer loads/stores dispatched (presumably mostly because of the loop unrolling), 0.6% fewer instructions in total, 0.25% fewer total cycles.

i would expect your change to have similar results, maybe less change overall because of the slightly less aggressive loop unrolling.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for the help! I didn't notice, let me try to convince LLVM to fuse the store similar to what you're doing. Your numbers sound right - they're exactly what I would estimate based on the performance improvement to this function alone. Let me look at instruction/cycle counts, it may be more useful than wall clock as there's just too much noise to measure < 1% improvements consistently.

If you'd like to share thoughts/work together on completing this fix, let me know!

Copy link
Author

@coderkalyan coderkalyan May 19, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

On 10-bit Chimera for 4000 frames, I'm getting 1.8% fewer instructions and 0.5-1.2% better wall clock compared to baseline now. The former seems to be reproducible/consistent; the latter is still affected by noise. This seems better than your results, which is suspicious — I will take a closer look.

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

i think those observations make sense for the 10-bit video, and in particular on a 16-core processor. i noticed this weekend that dav1d and rav1d both struggle to keep threads busy past ~12 threads and for some inputs more threads past 18 start to harm throughput. haven't looked too much into that since the overall behavior is very consistent between the two, but it definitely seems like the overall work-scheduling situation could be.... much better

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

i was also looking at this function yesterday, you might want to keep an eye out for ways to convince llvm/rustc that this store can be one 16-bit store instead of a pair of 8-bit stores (i took a "get out and push" approach while proving it out, myself: iximeow@e133a89)

I think I found a way to do this simpler and safer. I'll open a PR for just that part. Thanks for finding out the 2 u8 stores aren't being coalesced! I haven't looked at the rest of the loop in detail yet, but the stores I should be able to fix.

}
level_cache_off += b4_stride;
}
Expand Down Expand Up @@ -490,12 +492,13 @@ pub(crate) fn rav1d_create_lf_mask_intra(

let mut level_cache_off = (by >> ss_ver) * b4_stride + (bx >> ss_hor);
for _y in 0..cbh4 {
let idx = 4 * level_cache_off;
let lvl = &mut *level_cache.index_mut((idx + 0.., ..4 * cbw4));
let lvl: &mut [[u8; 4]] = bytemuck::cast_slice_mut(lvl);
for x in 0..cbw4 {
let idx = 4 * (level_cache_off + x);
// `2.., ..2` is for UV
let lvl = &mut *level_cache.index_mut((idx + 2.., ..2));
lvl[0] = filter_level[2][0][0];
lvl[1] = filter_level[3][0][0];
lvl[x][2] = filter_level[2][0][0];
lvl[x][3] = filter_level[3][0][0];
}
level_cache_off += b4_stride;
}
Expand Down Expand Up @@ -550,12 +553,13 @@ pub(crate) fn rav1d_create_lf_mask_inter(
if bw4 != 0 && bh4 != 0 {
let mut level_cache_off = by * b4_stride + bx;
for _y in 0..bh4 {
let idx = 4 * level_cache_off;
let lvl = &mut *level_cache.index_mut((idx + 0.., ..4 * bw4));
let lvl: &mut [[u8; 4]] = bytemuck::cast_slice_mut(lvl);
for x in 0..bw4 {
let idx = 4 * (level_cache_off + x);
// `0.., ..2` is for Y
let lvl = &mut *level_cache.index_mut((idx + 0.., ..2));
lvl[0] = filter_level[0][r#ref][is_gmv];
lvl[1] = filter_level[1][r#ref][is_gmv];
// 0, 1 is for Y
lvl[x][0] = filter_level[0][r#ref][is_gmv];
lvl[x][1] = filter_level[1][r#ref][is_gmv];
}
level_cache_off += b4_stride;
}
Expand Down Expand Up @@ -599,12 +603,13 @@ pub(crate) fn rav1d_create_lf_mask_inter(

let mut level_cache_off = (by >> ss_ver) * b4_stride + (bx >> ss_hor);
for _y in 0..cbh4 {
let idx = 4 * level_cache_off;
let lvl = &mut *level_cache.index_mut((idx + 0.., ..4 * cbw4));
let lvl: &mut [[u8; 4]] = bytemuck::cast_slice_mut(lvl);
for x in 0..cbw4 {
let idx = 4 * (level_cache_off + x);
// `2.., ..2` is for UV
let lvl = &mut *level_cache.index_mut((idx + 2.., ..2));
lvl[0] = filter_level[2][r#ref][is_gmv];
lvl[1] = filter_level[3][r#ref][is_gmv];
// 2, 3 is for UV
lvl[x][2] = filter_level[2][r#ref][is_gmv];
lvl[x][3] = filter_level[3][r#ref][is_gmv];
}
level_cache_off += b4_stride;
}
Expand Down
Loading