Goal: Get a u64 from 8 contiguous elements of a &[u8], and check its value
(All of the following code can be checked in the playground; additional, even worse u128 examples are available)
Good one
To avoid unaligned access, I use the following code:
// Reads 8 consecutive bytes of `buf` starting at `pos` without any unaligned
// pointer cast, reassembles them as a little-endian u64, and compares against
// a fixed magic value. Returns false when fewer than 8 bytes are available.
fn u64_fetch1(buf: &[u8], pos: usize) -> bool {
// `get` with a range bounds-checks, and the 8-element slice pattern proves
// to the compiler that the sub-slice has exactly 8 bytes.
let Some([b0, b1, b2, b3, b4, b5, b6, b7]) = buf.get(pos..(pos + 8)) else {
return false;
};
// Byte-wise reassembly; the optimizer recognizes this as one LE u64 load
// (see the single qword access in the assembly below).
let target = u64::from_le_bytes([*b0, *b1, *b2, *b3, *b4, *b5, *b6, *b7]);
target == 0x1234567812345678
}
The optimization works fine (built with the release profile for x86-64):
u64_fetch1:
cmp rdx, -8
setae al
lea rcx, [rdx + 8]
cmp rcx, rsi
seta cl
or cl, al
je .LBB2_2
xor eax, eax
ret
.LBB2_2:
movabs rax, 1311768465173141112
cmp qword ptr [rdi + rdx], rax
sete al
ret
There is only one memory access with qword width.
Bad one
However, if I add a redundant check in the code:
// Identical to u64_fetch1 except for an extra, deliberately redundant check
// of the first byte before the full 8-byte fetch. This is the version whose
// codegen regresses to multiple narrow loads (see the assembly below).
fn u64_fetch2(buf: &[u8], pos: usize) -> bool {
// Single-byte bounds-checked access to buf[pos].
let Some(b0) = buf.get(pos) else {
return false;
};
// This one is redundant: the low byte of the magic value below is also
// 0x78, so the full comparison subsumes this test.
if *b0 != 0x78 {
return false;
}
// Same bounds-checked 8-byte fetch as in u64_fetch1.
let Some([b0, b1, b2, b3, b4, b5, b6, b7]) = buf.get(pos..(pos + 8)) else {
return false;
};
let target = u64::from_le_bytes([*b0, *b1, *b2, *b3, *b4, *b5, *b6, *b7]);
target == 0x1234567812345678
}
And the generated assembly is
u64_fetch2:
cmp rdx, rsi
jae .LBB3_2
cmp byte ptr [rdi + rdx], 120
jne .LBB3_2
cmp rdx, -8
setae al
lea rcx, [rdx + 8]
cmp rcx, rsi
seta cl
or cl, al
je .LBB3_5
.LBB3_2:
xor eax, eax
ret
.LBB3_5:
movzx eax, byte ptr [rdi + rdx + 1]
movzx ecx, byte ptr [rdi + rdx + 2]
mov esi, dword ptr [rdi + rdx + 4]
shl rsi, 32
movzx edx, byte ptr [rdi + rdx + 3]
shl edx, 24
shl ecx, 16
shl eax, 8
or eax, ecx
or eax, edx
or rax, rsi
movabs rcx, 1311768465173140992
cmp rax, rcx
sete al
ret
There are five memory accesses in total: four byte-width accesses and one dword-width access.
In fact, from the generated code above, the optimizer does know the check is redundant: after checking the first byte, the remaining seven bytes are combined and compared against a constant whose low byte is masked out (0x1234567812345600), instead of re-accessing the first byte.
Does the optimizer assume that a non-volatile access to immutable memory has side effects? If not, why doesn't it remove the first check so that only one memory access is needed?