Hey there,
I just ran some benchmarks with the nom crate and noticed a fundamental performance degradation when using the `tag!` macro, which internally boils down to a comparison like `&slice_a[..max_len] == &slice_b[..max_len]`. Have there been any optimizations done on this in the past?
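For reference, here is a minimal sketch of the check that comparison reduces to (the function name is hypothetical):

```rust
/// Hypothetical helper: does `input` start with `tag`?
/// The slice comparison below is what tag! effectively executes, and it
/// ends up in a generic memcmp-style routine even for tiny tags.
fn starts_with_tag(input: &[u8], tag: &[u8]) -> bool {
    let max_len = tag.len();
    input.len() >= max_len && &input[..max_len] == &tag[..max_len]
}
```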
This could often be even faster than memcmp, for example if we implemented something like the following when `len` is known at compile time:
```rust
use core::ptr::read_unaligned;

extern "C" {
    fn memcmp(s1: *const u8, s2: *const u8, n: usize) -> i32;
}

/// Unaligned load of a `T` at byte offset `off` from `p`.
unsafe fn load<T>(p: *const u8, off: isize) -> T {
    read_unaligned(p.offset(off) as *const T)
}

/// Byte equality specialized on `len`; `read_unaligned` is used because
/// the pointers carry no alignment guarantee.
unsafe fn eq_bytes(pa: *const u8, pb: *const u8, len: usize) -> bool {
    match len {
        1 => load::<u8>(pa, 0) == load::<u8>(pb, 0),
        2 => load::<u16>(pa, 0) == load::<u16>(pb, 0),
        3 => load::<u16>(pa, 0) == load::<u16>(pb, 0)
            && load::<u8>(pa, 2) == load::<u8>(pb, 2),
        4 => load::<u32>(pa, 0) == load::<u32>(pb, 0),
        5 => load::<u32>(pa, 0) == load::<u32>(pb, 0)
            && load::<u8>(pa, 4) == load::<u8>(pb, 4),
        6 => load::<u32>(pa, 0) == load::<u32>(pb, 0)
            && load::<u16>(pa, 4) == load::<u16>(pb, 4),
        7 => load::<u32>(pa, 0) == load::<u32>(pb, 0)
            && load::<u16>(pa, 4) == load::<u16>(pb, 4)
            && load::<u8>(pa, 6) == load::<u8>(pb, 6),
        8 => load::<u64>(pa, 0) == load::<u64>(pb, 0),
        9 => load::<u64>(pa, 0) == load::<u64>(pb, 0)
            && load::<u8>(pa, 8) == load::<u8>(pb, 8),
        10 => load::<u64>(pa, 0) == load::<u64>(pb, 0)
            && load::<u16>(pa, 8) == load::<u16>(pb, 8),
        11 => load::<u64>(pa, 0) == load::<u64>(pb, 0)
            && load::<u16>(pa, 8) == load::<u16>(pb, 8)
            && load::<u8>(pa, 10) == load::<u8>(pb, 10),
        12 => load::<u64>(pa, 0) == load::<u64>(pb, 0)
            && load::<u32>(pa, 8) == load::<u32>(pb, 8),
        13 => load::<u64>(pa, 0) == load::<u64>(pb, 0)
            && load::<u32>(pa, 8) == load::<u32>(pb, 8)
            && load::<u8>(pa, 12) == load::<u8>(pb, 12),
        14 => load::<u64>(pa, 0) == load::<u64>(pb, 0)
            && load::<u32>(pa, 8) == load::<u32>(pb, 8)
            && load::<u16>(pa, 12) == load::<u16>(pb, 12),
        15 => load::<u64>(pa, 0) == load::<u64>(pb, 0)
            && load::<u32>(pa, 8) == load::<u32>(pb, 8)
            && load::<u16>(pa, 12) == load::<u16>(pb, 12)
            && load::<u8>(pa, 14) == load::<u8>(pb, 14),
        16 => load::<u64>(pa, 0) == load::<u64>(pb, 0)
            && load::<u64>(pa, 8) == load::<u64>(pb, 8),
        // Fall back to libc memcmp for longer inputs.
        _ => memcmp(pa, pb, len) == 0,
    }
}
```
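A safe wrapper around this could look as follows (a sketch; it assumes the caller wants equality of the first `len` bytes of both slices):

```rust
/// Hypothetical safe entry point: bounds are checked once up front,
/// then the specialized unsafe comparison runs on raw pointers.
fn tag_matches(a: &[u8], b: &[u8], len: usize) -> bool {
    a.len() >= len && b.len() >= len
        && unsafe { eq_bytes(a.as_ptr(), b.as_ptr(), len) }
}
```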
This could be generated automatically for arrays up to a specific size, with the right offsets and a mixture of casts. My point is that comparing cast u64 values in C is much faster than calling memcmp.
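As a sketch of the "automatically for arrays" idea, a const-generic wrapper over fixed-size arrays would let the compiler see the length at compile time and fold the match down to the single relevant arm (names hypothetical):

```rust
/// Hypothetical: with N known at compile time, the match in
/// eq_bytes collapses to one arm of wide integer compares.
fn eq_array<const N: usize>(a: &[u8; N], b: &[u8; N]) -> bool {
    unsafe { eq_bytes(a.as_ptr(), b.as_ptr(), N) }
}

// e.g. eq_array(b"HTTP/1.1", other) can compile to a single
// u64 load-and-compare on typical targets.
```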
This discussion seems to be related to #16913 and also to this crate.