In a Reddit post, I’ve seen code like this:
#![feature(core_intrinsics)]
const N: usize = 65_536;
/// Element-wise maximum of two fixed-size arrays, written back into `x`.
///
/// For every index `i`, `x[i]` is replaced by `y[i]` when `y[i] > x[i]` and is
/// left unchanged otherwise (including when the comparison is false because
/// one side is NaN).
///
/// NOTE(review): the body tells the optimizer that both arrays start on a
/// 64-byte boundary, but nothing here checks it and the `[f64; N]` type does
/// not promise it; callers have to guarantee that alignment themselves.
// Presumably kept out of line so the function appears as its own symbol in the emitted assembly.
#[inline(never)]
pub fn max_array(x: &mut[f64; N], y: &[f64; N]) {
// Both parameters are `[f64; N]`, so the lengths always match; this check is only compiled into debug builds.
debug_assert!(x.len() == y.len());
// SAFETY: NOT established by this function. `assume` is a pure optimizer hint:
// if either pointer is not a multiple of 64, the behaviour is undefined.
unsafe {
std::intrinsics::assume(x.as_ptr() as usize % 64 == 0);
std::intrinsics::assume(y.as_ptr() as usize % 64 == 0);
}
for i in 0 .. x.len() {
// Keep the larger of the two values in `x[i]`.
x[i] = if y[i] > x[i] { y[i] } else { x[i] };
}
}
Compiling that code like this, with the latest nightly:
rustc -C opt-level=3 -C target-feature=+avx512f --emit asm temp.rs
You get:
_ZN4temp9max_array17ha1ddfb82e16f921aE:
xorl %eax, %eax
.align 16, 0x90
.LBB0_1:
vmovapd (%rdx,%rax,8), %zmm0
vmovapd 64(%rdx,%rax,8), %zmm1
vmovapd 128(%rdx,%rax,8), %zmm2
vmovapd 192(%rdx,%rax,8), %zmm3
vmaxpd (%rcx,%rax,8), %zmm0, %zmm0
vmaxpd 64(%rcx,%rax,8), %zmm1, %zmm1
vmaxpd 128(%rcx,%rax,8), %zmm2, %zmm2
vmaxpd 192(%rcx,%rax,8), %zmm3, %zmm3
vmovapd %zmm0, (%rcx,%rax,8)
vmovapd %zmm1, 64(%rcx,%rax,8)
vmovapd %zmm2, 128(%rcx,%rax,8)
vmovapd %zmm3, 192(%rcx,%rax,8)
addq $32, %rax
cmpq $65536, %rax
jne .LBB0_1
retq
Without the assumes you get:
_ZN4temp9max_array17ha1ddfb82e16f921aE:
xorl %eax, %eax
.align 16, 0x90
.LBB0_1:
vmovupd (%rdx,%rax,8), %zmm0
vmovupd 64(%rdx,%rax,8), %zmm1
vmovupd 128(%rdx,%rax,8), %zmm2
vmovupd 192(%rdx,%rax,8), %zmm3
vmaxpd (%rcx,%rax,8), %zmm0, %zmm0
vmaxpd 64(%rcx,%rax,8), %zmm1, %zmm1
vmaxpd 128(%rcx,%rax,8), %zmm2, %zmm2
vmaxpd 192(%rcx,%rax,8), %zmm3, %zmm3
vmovupd %zmm0, (%rcx,%rax,8)
vmovupd %zmm1, 64(%rcx,%rax,8)
vmovupd %zmm2, 128(%rcx,%rax,8)
vmovupd %zmm3, 192(%rcx,%rax,8)
addq $32, %rax
cmpq $65536, %rax
jne .LBB0_1
retq
The version with the assumes uses vmovapd (aligned moves) instead of vmovupd (unaligned moves).
I can think of a simple way to avoid unsafe code and still allow maximum performance for numeric code: introducing a third, optional argument for arrays and vectors:
let a = [0f64; 1024; 64];
let v = vec![0f64; 1024; 64];
fn foo(a: &[f64; 1024; 64], b: Vec<f64, 64>) {...}
The third argument is a compile-time constant that represents the alignment of the chunk of memory. If this value is omitted, the alignment is assumed to be the default one used today. A function that requires a certain alignment can be called with a larger alignment too.
I think adding the memory alignment to the type system like that allows both safety and performance. And since the extra argument is optional, I think most code can ignore it (unless the alignment is smaller than the standard one, but that could even be forbidden for simplicity).
Is this idea overkill, and too much complexity for its use cases?