That is: checking whether a u8 is a valid UTF-8 first byte and, if so, how many bytes in total the sequence it starts would occupy. This seems like a simple operation that would be a good counterpart to `char::len_utf8`.
(This operation might have a specific name I am not aware of, however.)
It's something I have had to implement for myself multiple times, but it's too small to warrant a whole crate, so I think it would feel at home in the standard library. Finally, most UTF-8 validation can already be done easily by encoding the byte(s) into types that already provide these operations, but this piece of the puzzle is missing.
Here is how I normally implement it:
/// Returns the total byte length of the UTF-8 sequence whose first byte is `b`,
/// or `None` if `b` cannot begin a valid UTF-8 sequence.
pub const fn utf8_prefix_len(b: u8) -> Option<usize> {
    // 0xc0/0xc1 would only start overlong two-byte forms, and 0xf5..=0xff
    // never occur in valid UTF-8, so none of them can be a first byte.
    if b == 0xc0 || b == 0xc1 || b >= 0xf5 {
        return None;
    }
    // The run of leading ones in the top five bits encodes the length:
    // 0 => ASCII (one byte), 1 => continuation byte (not a first byte),
    // 2..=4 => lead byte of a sequence of that many bytes.
    match (b & 0b1111_1000).leading_ones() {
        0 => Some(1),
        1 => None,
        n => Some(n as usize),
    }
}
/// Total byte length of the UTF-8 sequence whose first byte is `b`,
/// without validating `b`.
///
/// # Safety
/// `b` must be a valid UTF-8 first byte; otherwise the result is wrong
/// (no memory unsafety, but the contract is violated).
pub const unsafe fn utf8_prefix_len_unchecked(b: u8) -> usize {
    // ASCII has zero leading ones but is one byte long; multi-byte leads
    // encode their length as the run of leading ones in the high nibble.
    match (b & 0b1111_0000).leading_ones() {
        0 => 1,
        n => n as usize,
    }
}
/// Returns `true` if `b` can be the first byte of a valid UTF-8 sequence.
pub const fn is_first_utf8_byte(b: u8) -> bool {
    // Bytes that never appear in valid UTF-8: overlong two-byte leads
    // (0xc0, 0xc1) and everything at or above 0xf5.
    if matches!(b, 0xc0 | 0xc1 | 0xf5..=0xff) {
        false
    } else {
        // Anything shaped 0b10xx_xxxx is a continuation byte, not a first byte.
        (b & 0b1100_0000) != 0b1000_0000
    }
}
(Alternatively, `n | (1 >> n)` instead of the if-else in `utf8_prefix_len_unchecked` — the shift yields 1 only when `n == 0` — but I don't think it's any faster with rustc.)
Test:
/// Exercises `utf8_prefix_len`, `utf8_prefix_len_unchecked`, and
/// `is_first_utf8_byte` against bytes produced by `char::encode_utf8`
/// for 1- to 4-byte characters, plus every byte that can never be a
/// first byte.
#[test]
fn test_utf8_prefix_inspection() {
    let mut bytes = [0u8; 4];
    // Asserts that $b is accepted as the first byte of an $n-byte sequence.
    macro_rules! test_if_first_byte {
        ($b: expr, $n: expr) => {
            assert_eq!(utf8_prefix_len($b), Some($n));
            // Sound: $b comes from `encode_utf8`, so it is a valid first byte.
            unsafe {assert_eq!(utf8_prefix_len_unchecked($b), $n)};
            assert!(is_first_utf8_byte($b));
        }
    }
    // Asserts that $b is rejected as a first byte.
    macro_rules! test_if_not_first_byte {
        ($b: expr) => {
            assert_eq!(utf8_prefix_len($b), None);
            assert!(!is_first_utf8_byte($b));
        }
    }
    for c1 in ['0', 'a', 'Z', '-'] {
        c1.encode_utf8(&mut bytes);
        test_if_first_byte!(bytes[0], 1);
    }
    for c2 in ['าซ', 'ฮฃ', 'ร', 'ล'] {
        c2.encode_utf8(&mut bytes);
        test_if_first_byte!(bytes[0], 2);
        test_if_not_first_byte!(bytes[1]);
    }
    for c3 in ['โค', '๊ค', 'โคก', '๊ญ'] {
        c3.encode_utf8(&mut bytes);
        test_if_first_byte!(bytes[0], 3);
        test_if_not_first_byte!(bytes[1]);
        test_if_not_first_byte!(bytes[2]);
    }
    for c4 in ['๐', '๐', '๐ฌ', '๐ฆ'] {
        c4.encode_utf8(&mut bytes);
        test_if_first_byte!(bytes[0], 4);
        test_if_not_first_byte!(bytes[1]);
        test_if_not_first_byte!(bytes[2]);
        test_if_not_first_byte!(bytes[3]);
    }
    // All bytes that can never start a sequence: overlong leads 0xc0/0xc1
    // and the full 0xf5..=0xff range. (The original list was missing 0xfd.)
    for exception in [0xc0, 0xc1, 0xf5, 0xf6, 0xf7, 0xf8,
                      0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff] {
        test_if_not_first_byte!(exception);
    }
}
The other operation I'd recommend is the ability to decode the UTF-8 sequence at the start of a byte slice into a `u32` codepoint.
Code:
/// Decodes the `n`-byte UTF-8 sequence at the start of `b` into a codepoint.
///
/// # Safety
/// `b` must contain at least `n` bytes (`n` in `1..=4`) holding a valid
/// UTF-8 sequence of that length; otherwise the raw-pointer reads may go
/// out of bounds or the result is meaningless.
const unsafe fn _decode_as_codepoint(b: &[u8], n: usize) -> u32 {
    // Raw-pointer read keeps the fn const and skips bounds checks;
    // the caller guarantees the index is in bounds.
    macro_rules! g {
        ($i: literal) => { *b.as_ptr().add($i) }
    }
    match n {
        1 => {
            g!(0) as u32
        },
        2 => {
            // 5 payload bits in the lead byte, 6 in the continuation.
            (((g!(0) & 0b0001_1111) as u32) << 6) |
            (((g!(1) & 0b0011_1111) as u32) << 0)
        },
        3 => {
            // 4 payload bits in the lead byte.
            (((g!(0) & 0b0000_1111) as u32) << 12) |
            (((g!(1) & 0b0011_1111) as u32) << 6) |
            (((g!(2) & 0b0011_1111) as u32) << 0)
        },
        _ => {
            // A 4-byte lead carries only 3 payload bits (RFC 3629); the
            // previous 4-bit mask here happened to be equivalent for valid
            // leads 0xf0..=0xf4 but was wrong per the spec.
            (((g!(0) & 0b0000_0111) as u32) << 18) |
            (((g!(1) & 0b0011_1111) as u32) << 12) |
            (((g!(2) & 0b0011_1111) as u32) << 6) |
            (((g!(3) & 0b0011_1111) as u32) << 0)
        },
    }
}
/// Decodes the UTF-8 sequence at the start of `b` into its codepoint.
///
/// Returns `None` if `b` is empty or too short, does not start with a valid
/// first byte, has malformed continuation bytes, or encodes a value that is
/// not a Unicode scalar (overlong form, surrogate, or above U+10FFFF).
pub const fn decode_as_codepoint(b: &[u8]) -> Option<u32> {
    if let [b0, ..] = b {
        if let Some(n) = utf8_prefix_len(*b0) {
            if b.len() >= n {
                // Every byte after the first must be shaped 0b10xx_xxxx.
                let mut i = 1;
                while i < n {
                    if (b[i] & 0b1100_0000) != 0b1000_0000 {
                        return None;
                    }
                    i += 1;
                }
                // Sound: the first n bytes are structurally an n-byte sequence.
                let cp = unsafe { _decode_as_codepoint(b, n) };
                // Minimum value for each length — anything below is an
                // overlong encoding (previously accepted, e.g. E0 80 80).
                let min = match n {
                    1 => 0,
                    2 => 0x80,
                    3 => 0x800,
                    _ => 0x1_0000,
                };
                // Surrogates and values above U+10FFFF are never valid UTF-8
                // (previously accepted, e.g. ED A0 80 and F4 90 80 80).
                if cp < min || (0xD800 <= cp && cp <= 0xDFFF) || cp > 0x10_FFFF {
                    return None;
                }
                return Some(cp);
            }
        }
    }
    return None;
}
/// Decodes the UTF-8 sequence at the start of `b` into its codepoint,
/// with no validation of any kind.
///
/// # Safety
/// `b` must begin with a complete, valid UTF-8 encoded character; no bounds
/// checking is performed on the slice.
pub const unsafe fn decode_as_codepoint_unchecked(b: &[u8]) -> u32 {
    let len = utf8_prefix_len_unchecked(b[0]);
    _decode_as_codepoint(b, len)
}
/// Round-trips characters through `char::encode_utf8` and both decoders,
/// then checks that malformed inputs are rejected by the checked decoder.
#[test]
fn test_codepoint_decoding() {
    let mut bytes = [0u8; 4];
    // Valid characters of every encoded length (1..=4 bytes).
    for c in ['0', 'a', 'าซ', 'ฮฃ', 'โคก', '๊ญ', '๐', '๐'] {
        let expected = c as u32;
        c.encode_utf8(&mut bytes);
        assert_eq!(decode_as_codepoint(&bytes), Some(expected));
        // Sound: `bytes` starts with a valid encoded character.
        assert_eq!(unsafe { decode_as_codepoint_unchecked(&bytes) }, expected);
    }
    // Structurally invalid inputs: empty, bare continuation byte, truncated
    // sequences, and bad continuation bytes.
    let invalid: [&[u8]; 7] = [
        &[],
        &[0b1000_0000],
        &[0b1100_0000],
        &[0b1100_0011, 0xff],
        &[0b1110_0001],
        &[0b1111_0101, 0x80],
        &[0b1111_0011, 0xff, 0x80, 0x80],
    ];
    for b in invalid {
        assert_eq!(decode_as_codepoint(b), None);
    }
}
Again, this is a bridge between existing standard-library features: you can easily encode characters into byte arrays, but not the other way around without creating an extraneous `String` and pulling the characters back out of its iterator. I think this one warrants a more involved discussion, though.