I commonly try to appease the borrow checker by changing |a: &mut T| { ... }
into |a: T| -> T { ... }
and I always worried about the optimization cost, but it never really bit me, so I assumed that the compiler was desugaring both into the same code. I decided to look into it directly and found out that it actually doesn't, so I wanted to ask for some opinions and thoughts on an optimization pass that would change the following f
into f_opt
.
// Foreign function imported from the native library `my_c_library`.
// `main` hands it the computed array element; presumably it serves as an
// opaque sink so the result cannot be optimized away -- TODO confirm.
#[link(name = "my_c_library")]
extern "C" {
// Takes a `u64` by value and returns a `bool`; being a foreign function,
// it can only be called from inside an `unsafe` block.
fn cfunc(x: u64) -> bool;
}
// 1024 x u64 = 8 KiB payload that is moved around by value in this example.
type A = [u64; 1024];

// Not only does the compiler not find tail call opts here,
// it can't optimize it into f_opt
//
// Recurses `nest` levels deep, then writes 30 into slot 500 and returns the
// array. The array is passed and returned by value at every level.
fn f(nest: u32, mut a: A) -> A {
    if nest > 0 {
        // Count *down* towards the `nest == 0` base case. Recursing with
        // `nest + 1` moves away from it, so any non-zero `nest` would keep
        // recursing until the stack (or the u32 counter) overflows.
        // Note: I guess tail call opt should have kicked in here...
        f(nest - 1, a)
    } else {
        a[500] = 30;
        a
    }
}

// Since A is Copy we can avoid memcpy without changing the semantics
// if we in-place modify the argument.
//
// Same contract as `f`: returns `a` with slot 500 set to 30 after `nest`
// levels of recursion. The recursion itself only threads a `&mut A` through,
// so the by-value array is touched once on entry and once on return.
#[inline(always)]
fn f_opt(nest: u32, mut a: A) -> A {
    // In-place worker; named differently from the outer `f` so it does not
    // shadow it inside this function.
    fn f_in_place(nest: u32, a: &mut A) {
        if nest > 0 {
            // Count down towards the base case, mirroring `f`.
            f_in_place(nest - 1, a);
        } else {
            a[500] = 30;
        }
    }

    f_in_place(nest, &mut a);
    a
}
pub fn main() {
let a = [0; 1024];
let a = f(1000, a);
// let a = f_opt(1000, a);
unsafe { cfunc(a[500]); }
}
To avoid dependence on godbolt (x86):
f_opt
begets simply
example::main:
mov edi, 30
jmp qword ptr [rip + cfunc@GOTPCREL]
while f
compiles into the much more long-winded
example::f:
push rbp
push r14
push rbx
mov eax, 8192
call __rust_probestack
sub rsp, rax
mov rax, rdx
mov rbx, rdi
test esi, esi
je .LBB0_1
mov ebp, esi
inc ebp
mov r14, rsp
mov edx, 8192
mov rdi, r14
mov rsi, rax
call qword ptr [rip + memcpy@GOTPCREL]
mov rdi, rbx
mov esi, ebp
mov rdx, r14
call example::f
jmp .LBB0_2
.LBB0_1:
mov qword ptr [rax + 4000], 30
mov edx, 8192
mov rdi, rbx
mov rsi, rax
call qword ptr [rip + memcpy@GOTPCREL]
.LBB0_2:
mov rax, rbx
add rsp, 8192
pop rbx
pop r14
pop rbp
ret
example::main:
push rbx
mov eax, 16384
call __rust_probestack
sub rsp, rax
lea rbx, [rsp + 8192]
mov edx, 8192
mov rdi, rbx
xor esi, esi
call qword ptr [rip + memset@GOTPCREL]
mov rdi, rsp
mov esi, 1000
mov rdx, rbx
call example::f
mov rdi, qword ptr [rsp + 4000]
add rsp, 16384
pop rbx
jmp qword ptr [rip + cfunc@GOTPCREL]
PS. My point is not the missed tail-call optimization, although it feels vaguely related to my main point. PS2. I would also appreciate some links to related/similar optimization passes that the compiler already performs.
Thanks