Optimization comparison: Vec vs array and for vs while

leonardo · April 1, 2022, 11:41am

Hi, this code shows two versions of a short function that computes totient values from zero to a given N, based on a variant of a sieve. The version 1 uses a Vec and a for loop, the second a boxed array and a while loop. Version 2 works with a compile-time constant length N. So it's reasonable for the version 2 to be a little more optimizable. But here the second version avoids all but one array bound test and the division by zero test.

So I think such difference is excessive. So I think Rustc should add a Vec/slice length range analisys pass to propagate the length of vectors in simple cases (like this, here the Vec length never changes after its creation). Another optimization pass could be added to rustc for the for-step_by that here is pessimizing the code compared to the while loop. for loops are very common and I think they are worth having a more focused optimization before high-level semantics gets lots by LLVM.

#[inline(never)]
pub fn euler_phi1(n: usize) -> Vec<u32> {
    let mut phi = vec![0_u32; n + 1];
    phi[1] = 1;

    for p in (3 .. n + 1).step_by(2) {
        if phi[p] == 0 {
            let mut i = (n / p - 1) | 1;
            while i > 0 {
                if phi[i] != 0 {
                    let mut x = i * p;
                    let mut f = phi[i] * (p as u32 ^ 1);

                    while x <= n {
                        phi[x] = f;
                        x *= p;
                        f = f.wrapping_mul(p as u32);
                    }
                }

                if i <= 2 { break; }
                i -= 2;
            }
        }
    }

    for i in 1 .. n / 2 + 1 {
        phi[i << 1] = if (i & 1) != 0 { phi[i] } else { phi[i] << 1 };
    }
    phi
}


const N: usize = 100_000; // Input.
type Phis = Box<[u32; N + 1]>;

#[inline(never)]
pub fn euler_phi2() -> Phis {
    let mut phi: Phis = vec![0_u32; N + 1]
                        .into_boxed_slice()
                        .try_into()
                        .unwrap();
    phi[1] = 1;

    let mut p = 3;
    while p < N + 1 {
        if phi[p] == 0 {
            let mut i = (N / p - 1) | 1;
            while i > 0 {
                if phi[i] != 0 { // Bound test.
                    let mut x = i * p;
                    let mut f = phi[i] * (p as u32 ^ 1);

                    while x <= N {
                        phi[x] = f;
                        x *= p;
                        f = f.wrapping_mul(p as u32);
                    }
                }

                if i <= 2 { break; }
                i -= 2;
            }
        }
        p += 2;
    }

    for i in 1 .. N / 2 + 1 {
        phi[i << 1] = if (i & 1) != 0 { phi[i] } else { phi[i] << 1 };
    }
    phi
}


fn main() {
    let p1 = euler_phi1(N);
    let p2 = euler_phi2();
    assert_eq!(&p1[..], &p2[..]);
}

euler_phi1:
        push    r15
        push    r14
        push    r13
        push    r12
        push    rbx
        lea     r15, [rsi + 1]
        mov     ecx, 4
        xor     r12d, r12d
        mov     rax, r15
        mul     rcx
        mov     r13, rax
        setno   al
        jo      .LBB0_47
        mov     rbx, rsi
        mov     r14, rdi
        mov     r12b, al
        shl     r12, 2
        test    r13, r13
        je      .LBB0_2
        mov     rdi, r13
        mov     rsi, r12
        call    qword ptr [rip + __rust_alloc_zeroed@GOTPCREL]
        mov     rcx, rax
        test    rcx, rcx
        je      .LBB0_48
.LBB0_5:
        mov     qword ptr [r14], rcx
        mov     qword ptr [r14 + 8], r15
        mov     qword ptr [r14 + 16], r15
        cmp     r15, 1
        jbe     .LBB0_6
        mov     dword ptr [rcx + 4], 1
        mov     r8d, 3
        xor     r9d, r9d
.LBB0_8:
        test    r9b, 1
        je      .LBB0_15
        lea     rax, [r8 + 1]
        inc     r8
        je      .LBB0_16
        cmp     r8, r15
        jae     .LBB0_16
        inc     rax
        mov     rdi, r8
        mov     r8, rax
        jmp     .LBB0_12
.LBB0_15:
        cmp     r8, r15
        mov     rax, r8
        adc     rax, 0
        mov     rdi, r8
        cmp     r8, r15
        mov     r8, rax
        jae     .LBB0_16
.LBB0_12:
        cmp     rdi, r15
        jae     .LBB0_13
        mov     r9b, 1
        cmp     dword ptr [rcx + 4*rdi], 0
        jne     .LBB0_8
        test    rdi, rdi
        je      .LBB0_33
        mov     rax, rbx
        or      rax, rdi
        shr     rax, 32
        je      .LBB0_24
        mov     rax, rbx
        xor     edx, edx
        div     rdi
        dec     rax
        or      rax, 1
        cmp     rax, r15
        jb      .LBB0_27
        jmp     .LBB0_34
.LBB0_24:
        mov     eax, ebx
        xor     edx, edx
        div     edi
        dec     rax
        or      rax, 1
        cmp     rax, r15
        jae     .LBB0_34
.LBB0_27:
        mov     r10d, edi
        xor     r10d, 1
.LBB0_28:
        mov     edx, dword ptr [rcx + 4*rax]
        test    edx, edx
        je      .LBB0_37
        mov     rsi, rax
        imul    rsi, rdi
        cmp     rsi, rbx
        ja      .LBB0_37
        imul    edx, r10d
.LBB0_31:
        cmp     rsi, r15
        jae     .LBB0_32
        mov     dword ptr [rcx + 4*rsi], edx
        imul    edx, edi
        imul    rsi, rdi
        cmp     rsi, rbx
        jbe     .LBB0_31
.LBB0_37:
        cmp     rax, 3
        jb      .LBB0_8
        add     rax, -2
        cmp     rax, r15
        jb      .LBB0_28
.LBB0_34:
        lea     rdx, [rip + .Lanon.f5a6578b9a75e6d29108a9d55f85dd98.4]
        mov     rdi, rax
        mov     rsi, r15
        call    qword ptr [rip + core::panicking::panic_bounds_check@GOTPCREL]
        ud2
.LBB0_32:
        lea     rdx, [rip + .Lanon.f5a6578b9a75e6d29108a9d55f85dd98.5]
        mov     rdi, rsi
        mov     rsi, r15
        call    qword ptr [rip + core::panicking::panic_bounds_check@GOTPCREL]
        ud2
.LBB0_2:
        mov     rcx, r12
        test    rcx, rcx
        jne     .LBB0_5
.LBB0_48:
        mov     rdi, r13
        mov     rsi, r12
        call    qword ptr [rip + alloc::alloc::handle_alloc_error@GOTPCREL]
        ud2
.LBB0_16:
        cmp     rbx, 2
        jb      .LBB0_46
        shr     rbx
        mov     edi, 1
.LBB0_18:
        test    dil, 1
        jne     .LBB0_40
        cmp     rdi, r15
        jae     .LBB0_20
        mov     edx, dword ptr [rcx + 4*rdi]
        add     edx, edx
        jmp     .LBB0_43
.LBB0_40:
        cmp     rdi, r15
        jae     .LBB0_41
        mov     edx, dword ptr [rcx + 4*rdi]
.LBB0_43:
        lea     rax, [rdi + rdi]
        cmp     rax, r15
        jae     .LBB0_44
        lea     rsi, [rdi + 1]
        mov     dword ptr [rcx + 4*rax], edx
        cmp     rdi, rbx
        mov     rdi, rsi
        jne     .LBB0_18
.LBB0_46:
        mov     rax, r14
        pop     rbx
        pop     r12
        pop     r13
        pop     r14
        pop     r15
        ret
.LBB0_13:
        lea     rdx, [rip + .Lanon.f5a6578b9a75e6d29108a9d55f85dd98.2]
        mov     rsi, r15
        call    qword ptr [rip + core::panicking::panic_bounds_check@GOTPCREL]
        ud2
.LBB0_44:
        lea     rdx, [rip + .Lanon.f5a6578b9a75e6d29108a9d55f85dd98.8]
        mov     rdi, rax
        mov     rsi, r15
        call    qword ptr [rip + core::panicking::panic_bounds_check@GOTPCREL]
        ud2
.LBB0_33:
        lea     rdi, [rip + str.0]
        lea     rdx, [rip + .Lanon.f5a6578b9a75e6d29108a9d55f85dd98.3]
        mov     esi, 25
        call    qword ptr [rip + core::panicking::panic@GOTPCREL]
        ud2
.LBB0_20:
        lea     rdx, [rip + .Lanon.f5a6578b9a75e6d29108a9d55f85dd98.7]
        mov     rsi, r15
        call    qword ptr [rip + core::panicking::panic_bounds_check@GOTPCREL]
        ud2
.LBB0_41:
        lea     rdx, [rip + .Lanon.f5a6578b9a75e6d29108a9d55f85dd98.6]
        mov     rsi, r15
        call    qword ptr [rip + core::panicking::panic_bounds_check@GOTPCREL]
        ud2
.LBB0_6:
        lea     rdx, [rip + .Lanon.f5a6578b9a75e6d29108a9d55f85dd98.1]
        mov     edi, 1
        mov     rsi, r15
        call    qword ptr [rip + core::panicking::panic_bounds_check@GOTPCREL]
        ud2
.LBB0_47:
        call    qword ptr [rip + alloc::raw_vec::capacity_overflow@GOTPCREL]
        ud2


example::euler_phi2:
        push    rax
        mov     edi, 400004
        mov     esi, 4
        call    qword ptr [rip + __rust_alloc_zeroed@GOTPCREL]
        test    rax, rax
        je      .LBB1_16
        mov     rcx, rax
        mov     dword ptr [rax + 4], 1
        mov     esi, 3
        jmp     .LBB1_2
.LBB1_3:
        lea     rax, [rsi + 2]
        cmp     rsi, 99999
        mov     rsi, rax
        jae     .LBB1_4
.LBB1_2:
        cmp     dword ptr [rcx + 4*rsi], 0
        jne     .LBB1_3
        mov     eax, 100000
        xor     edx, edx
        div     esi
        dec     rax
        or      rax, 1
        cmp     rax, 100000
        ja      .LBB1_15
        mov     r8d, esi
        xor     r8d, 1
.LBB1_9:
        mov     edi, dword ptr [rcx + 4*rax]
        test    edi, edi
        je      .LBB1_13
        mov     rdx, rax
        imul    rdx, rsi
        cmp     rdx, 100000
        ja      .LBB1_13
        imul    edi, r8d
.LBB1_12:
        mov     dword ptr [rcx + 4*rdx], edi
        imul    edi, esi
        imul    rdx, rsi
        cmp     rdx, 100001
        jb      .LBB1_12
.LBB1_13:
        cmp     rax, 3
        jb      .LBB1_3
        add     rax, -2
        cmp     rax, 100001
        jb      .LBB1_9
.LBB1_15:
        lea     rdx, [rip + .Lanon.f5a6578b9a75e6d29108a9d55f85dd98.9]
        mov     esi, 100001
        mov     rdi, rax
        call    qword ptr [rip + core::panicking::panic_bounds_check@GOTPCREL]
        ud2
.LBB1_4:
        mov     rax, -50000
.LBB1_5:
        mov     edx, eax
        neg     dl
        and     dl, 1
        shlx    esi, dword ptr [rcx + 4*rax + 200004], edx
        mov     dword ptr [rcx + 8*rax + 400008], esi
        lea     esi, [rax + 82]
        not     sil
        and     sil, 1
        shlx    esi, dword ptr [rcx + 4*rax + 200008], esi
        mov     dword ptr [rcx + 8*rax + 400016], esi
        shlx    esi, dword ptr [rcx + 4*rax + 200012], edx
        mov     dword ptr [rcx + 8*rax + 400024], esi
        lea     esi, [rax + 84]
        not     sil
        and     sil, 1
        shlx    esi, dword ptr [rcx + 4*rax + 200016], esi
        mov     dword ptr [rcx + 8*rax + 400032], esi
        shlx    edx, dword ptr [rcx + 4*rax + 200020], edx
        mov     dword ptr [rcx + 8*rax + 400040], edx
        add     rax, 5
        jne     .LBB1_5
        mov     rax, rcx
        pop     rcx
        ret
.LBB1_16:
        mov     edi, 400004
        mov     esi, 4
        call    qword ptr [rip + alloc::alloc::handle_alloc_error@GOTPCREL]
        ud2


.Lanon.f5a6578b9a75e6d29108a9d55f85dd98.0:
        .ascii  "/app/example.rs"

.Lanon.f5a6578b9a75e6d29108a9d55f85dd98.1:
        .quad   .Lanon.f5a6578b9a75e6d29108a9d55f85dd98.0
        .asciz  "\017\000\000\000\000\000\000\000\004\000\000\000\005\000\000"

.Lanon.f5a6578b9a75e6d29108a9d55f85dd98.2:
        .quad   .Lanon.f5a6578b9a75e6d29108a9d55f85dd98.0
        .asciz  "\017\000\000\000\000\000\000\000\007\000\000\000\f\000\000"

.Lanon.f5a6578b9a75e6d29108a9d55f85dd98.3:
        .quad   .Lanon.f5a6578b9a75e6d29108a9d55f85dd98.0
        .asciz  "\017\000\000\000\000\000\000\000\b\000\000\000\032\000\000"

str.0:
        .ascii  "attempt to divide by zero"

.Lanon.f5a6578b9a75e6d29108a9d55f85dd98.4:
        .quad   .Lanon.f5a6578b9a75e6d29108a9d55f85dd98.0
        .asciz  "\017\000\000\000\000\000\000\000\n\000\000\000\024\000\000"

.Lanon.f5a6578b9a75e6d29108a9d55f85dd98.5:
        .quad   .Lanon.f5a6578b9a75e6d29108a9d55f85dd98.0
        .asciz  "\017\000\000\000\000\000\000\000\017\000\000\000\031\000\000"

.Lanon.f5a6578b9a75e6d29108a9d55f85dd98.6:
        .quad   .Lanon.f5a6578b9a75e6d29108a9d55f85dd98.0
        .asciz  "\017\000\000\000\000\000\000\000\034\000\000\000)\000\000"

.Lanon.f5a6578b9a75e6d29108a9d55f85dd98.7:
        .quad   .Lanon.f5a6578b9a75e6d29108a9d55f85dd98.0
        .asciz  "\017\000\000\000\000\000\000\000\034\000\000\0009\000\000"

.Lanon.f5a6578b9a75e6d29108a9d55f85dd98.8:
        .quad   .Lanon.f5a6578b9a75e6d29108a9d55f85dd98.0
        .asciz  "\017\000\000\000\000\000\000\000\034\000\000\000\t\000\000"

.Lanon.f5a6578b9a75e6d29108a9d55f85dd98.9:
        .quad   .Lanon.f5a6578b9a75e6d29108a9d55f85dd98.0
        .asciz  "\017\000\000\000\000\000\000\0002\000\000\000\024\000\000"

CAD97 · April 1, 2022, 1:49pm

Interesting note: adding let phi = &mut phi[..n + 1]; (plus a rename to still return vec) reduces the vec code to only one panic_bounds_check in the asm, like the box version. It does still appear to get worse codegen, however.

scottmcm · April 1, 2022, 6:02pm

"Reslicing" like this is basically always the right thing to do before a bunch of indexing. See this recent post for another example of it: Mir optimization pass that implements auto-vectorization - #6 by scottmcm

And I don't know if it's still true, but at least 1½ years ago it was the case that indexing through Vec optimized worse than through a slice (We all know `iter` is faster than `loop`, but why? - #3 by scottmcm - The Rust Programming Language Forum), which just gives another reason that you should reslice it.

Note that you can opt into more focused optimization for them by changing your for p in STUFF { to STUFF.for_each(|p| {.

This loop is kinda weird, BTW.

Anything | 1 will be > 0, and you break at the bottom before i can get down to 0.

So I think it should be a loop?

But more importantly, I think you can reduce this report down to something more specific, by asking whether LLVM more directly whether it knows certain things.

For example, I'm extra confident about the while condition because if you ask LLVM to compile

pub fn demo_lower(n: usize, p: usize) -> bool {
    assert!(p >= 3);

    let i = (n / p - 1) | 1;
    i > 0
}

the output is clearly "oh yeah, that either asserts or returns true".

Whereas if you ask it about

pub fn demo_upper(n: usize, p: usize) -> bool {
    assert!(p >= 3);

    let i = (n / p - 1) | 1;
    i < (n + 1)
}

Then it obviously has no idea

define noundef zeroext i1 @_ZN7example10demo_upper17hed0063195a0d922bE(i64 %n, i64 %p) unnamed_addr #0 !dbg !13 {
  %_4 = icmp ult i64 %p, 3, !dbg !14
  br i1 %_4, label %bb1, label %bb3, !dbg !15

bb1:                                              ; preds = %start
  tail call void @_ZN4core9panicking5panic17hef60c19188bfa7ddE([0 x i8]* noalias noundef nonnull readonly align 1 bitcast (<{ [24 x i8] }>* @alloc14 to [0 x i8]*), i64 24, %"core::panic::location::Location"* noalias noundef readonly align 8 dereferenceable(24) bitcast (<{ i8*, [16 x i8] }>* @alloc16 to %"core::panic::location::Location"*)) #2, !dbg !15
  unreachable, !dbg !15

bb3:                                              ; preds = %start
  %_8 = udiv i64 %n, %p, !dbg !16
  %_7 = add nsw i64 %_8, -1, !dbg !17
  %i = or i64 %_7, 1, !dbg !17
  %_13 = add i64 %n, 1, !dbg !18
  %0 = icmp ult i64 %i, %_13, !dbg !20
  ret i1 %0, !dbg !21
}

And thus the problem isn't that it doesn't know the bounds, it's that regardless of whether it knows the vec/slice bound or not, the indexing logic you're using is too complicated for it to figure out.

So you might consider other ways to write that logic that it might understand better, or consider filing an LLVM issue about it if you think that LLVM should understand what you're doing.

(Repro for those two snippits: https://rust.godbolt.org/z/qYcnM1E5G)

tschuett · April 1, 2022, 8:36pm

LLVM/clang has llvm-opt-report: https://reviews.llvm.org/D25262

I wonder if rustc could write remarks like, e.g.: too many runtime checks, ...

leonardo · April 3, 2022, 9:15am

Right. I think it's a leftover from a precedent version of the code. Thank you for the help and notes.

system · July 2, 2022, 9:15am

This topic was automatically closed 90 days after the last reply. New replies are no longer allowed.

Topic		Replies	Views
MIR: Optimize for-loop on integer ranges	11	1614	February 10, 2023
About optimizations of `for` loops compiler	6	1143	August 25, 2023
More about slice -> array conversions language design	3	701	July 28, 2020
Potential Compiler Optimization Issue with &Vec<const T> vs &[const T] compiler	5	637	October 9, 2024
[x-post from users forum] Long compile times for a vector with 50K u64 values internals	2	973	March 11, 2020

Optimization comparison: Vec vs array and for vs while

Related topics