Hey there,
I just ran some benchmarks with the nom crate and observed a fundamental performance decrease when using the tag!
macro, which internally works like &slice_a[..max_len] == &slice_b[..max_len]
. So, have there been any optimizations done on this in the past?
Usually this could even be faster than memcmp
— for example, if pa (the tag) is known at compile time, we could implement something like this:
// Compare `len` bytes at `pa` and `pb` using the widest loads available,
// falling back to `memcmp` for lengths above 16.
//
// NOTE: the loads MUST be `read_unaligned()`. `pa`/`pb` are arbitrary byte
// pointers with no alignment guarantee, so dereferencing them cast to
// `*const u16`/`u32`/`u64` (as in `*(pa as *const u64)`) is undefined
// behavior in Rust. `read_unaligned` is sound and compiles to the same
// single mov on x86, so the performance argument is unchanged.
match len {
    1 => (pa as *const u8).read_unaligned() == (pb as *const u8).read_unaligned(),
    2 => (pa as *const u16).read_unaligned() == (pb as *const u16).read_unaligned(),
    3 => {
        (pa as *const u16).read_unaligned() == (pb as *const u16).read_unaligned() &&
        (pa.offset(2) as *const u8).read_unaligned() == (pb.offset(2) as *const u8).read_unaligned()
    }
    4 => (pa as *const u32).read_unaligned() == (pb as *const u32).read_unaligned(),
    5 => {
        (pa as *const u32).read_unaligned() == (pb as *const u32).read_unaligned() &&
        (pa.offset(4) as *const u8).read_unaligned() == (pb.offset(4) as *const u8).read_unaligned()
    }
    6 => {
        (pa as *const u32).read_unaligned() == (pb as *const u32).read_unaligned() &&
        (pa.offset(4) as *const u16).read_unaligned() == (pb.offset(4) as *const u16).read_unaligned()
    }
    7 => {
        (pa as *const u32).read_unaligned() == (pb as *const u32).read_unaligned() &&
        (pa.offset(4) as *const u16).read_unaligned() == (pb.offset(4) as *const u16).read_unaligned() &&
        (pa.offset(6) as *const u8).read_unaligned() == (pb.offset(6) as *const u8).read_unaligned()
    }
    8 => (pa as *const u64).read_unaligned() == (pb as *const u64).read_unaligned(),
    9 => {
        (pa as *const u64).read_unaligned() == (pb as *const u64).read_unaligned() &&
        (pa.offset(8) as *const u8).read_unaligned() == (pb.offset(8) as *const u8).read_unaligned()
    }
    10 => {
        (pa as *const u64).read_unaligned() == (pb as *const u64).read_unaligned() &&
        (pa.offset(8) as *const u16).read_unaligned() == (pb.offset(8) as *const u16).read_unaligned()
    }
    11 => {
        (pa as *const u64).read_unaligned() == (pb as *const u64).read_unaligned() &&
        (pa.offset(8) as *const u16).read_unaligned() == (pb.offset(8) as *const u16).read_unaligned() &&
        (pa.offset(10) as *const u8).read_unaligned() == (pb.offset(10) as *const u8).read_unaligned()
    }
    12 => {
        (pa as *const u64).read_unaligned() == (pb as *const u64).read_unaligned() &&
        (pa.offset(8) as *const u32).read_unaligned() == (pb.offset(8) as *const u32).read_unaligned()
    }
    13 => {
        (pa as *const u64).read_unaligned() == (pb as *const u64).read_unaligned() &&
        (pa.offset(8) as *const u32).read_unaligned() == (pb.offset(8) as *const u32).read_unaligned() &&
        (pa.offset(12) as *const u8).read_unaligned() == (pb.offset(12) as *const u8).read_unaligned()
    }
    14 => {
        (pa as *const u64).read_unaligned() == (pb as *const u64).read_unaligned() &&
        (pa.offset(8) as *const u32).read_unaligned() == (pb.offset(8) as *const u32).read_unaligned() &&
        (pa.offset(12) as *const u16).read_unaligned() == (pb.offset(12) as *const u16).read_unaligned()
    }
    15 => {
        (pa as *const u64).read_unaligned() == (pb as *const u64).read_unaligned() &&
        (pa.offset(8) as *const u32).read_unaligned() == (pb.offset(8) as *const u32).read_unaligned() &&
        (pa.offset(12) as *const u16).read_unaligned() == (pb.offset(12) as *const u16).read_unaligned() &&
        (pa.offset(14) as *const u8).read_unaligned() == (pb.offset(14) as *const u8).read_unaligned()
    }
    16 => {
        (pa as *const u64).read_unaligned() == (pb as *const u64).read_unaligned() &&
        (pa.offset(8) as *const u64).read_unaligned() == (pb.offset(8) as *const u64).read_unaligned()
    }
    // Larger tags: defer to the libc routine.
    _ => memcmp(pa, pb, len) == 0,
}
This could automatically be implemented for arrays up to a specific size, with some offsets and a mixture of casts. What I want to say is that comparing cast u64 values, as in C, is much faster than calling memcmp.
This discussion seems to be related to issue #16913 and also to this crate.