Feature Request: Utf8 prefix inspection for u8

I.e. Checking if a u8 is a utf8 first byte and how many bytes would follow it. This seems like a simple operation that would be a good counterpart to char::utf8_len (this operation might have a specific name I am not aware of, however)

It's something I had to implement for myself multiple times, but it's too small to warrant a whole crate, so I think it would feel at home in the standard library. Finally, most utf8 validation can already be done easily by encoding the byte(s) into types that can already handle the operations, but this part of the equation is missing.

Here is how I normally implement it:

pub const fn utf8_prefix_len(b: u8) -> Option<usize> {
  match b {
    0xc0 | 0xc1 | 0xf5..=0xff => None,
    _ => {
      match (b & 0b1111_1000).leading_ones() {
        0 => Some(1),
        1 => None,
        n => Some(n as usize)
      }
    }
  }
}

// unsafe: gives wrong results if input is not a first byte
pub const unsafe fn utf8_prefix_len_unchecked(b: u8) -> usize {
  let n = (b & 0b1111_0000).leading_ones();
  if n == 0 {1} else {n as usize}
}

pub const fn is_first_utf8_byte(b: u8) -> bool {
  match b {
    0xc0 | 0xc1 | 0xf5..=0xff => false,
    _ => (b & 0b1100_0000) != 0b1000_0000
  }
}

(alternatively n | (1 >> n) instead of the if-else in utf8_prefix_len_unchecked, but I don't think it's any faster on rustc)

Test:
#[test]
fn test_utf8_prefix_inspection() {
  let mut bytes = [0u8; 4];

  macro_rules! test_if_first_byte {
    ($b: expr, $n: expr) => {
      assert_eq!(utf8_prefix_len($b), Some($n));
      unsafe {assert_eq!(utf8_prefix_len_unchecked($b), $n)};
      assert!(is_first_utf8_byte($b));
    }
  }
  macro_rules! test_if_not_first_byte {
    ($b: expr) => {
      assert_eq!(utf8_prefix_len($b), None);
      assert!(!is_first_utf8_byte($b));
    }
  }
  
  for c1 in ['0', 'a', 'Z', '-'] {
    c1.encode_utf8(&mut bytes);
    test_if_first_byte!(bytes[0], 1);
  }
  for c2 in ['าซ', 'ฮฃ', 'รƒ', 'ลž'] {
    c2.encode_utf8(&mut bytes);
    test_if_first_byte!(bytes[0], 2);
    test_if_not_first_byte!(bytes[1]);
  }
  for c3 in ['โ‚ค', '๊ค‚', 'โคก', '๊ญ'] {
    c3.encode_utf8(&mut bytes);
    test_if_first_byte!(bytes[0], 3);
    test_if_not_first_byte!(bytes[1]);
    test_if_not_first_byte!(bytes[2]);
  }
  for c4 in ['๐‘’', '๐œ‘', '๐’ฌ', '๐’ฆ'] {
    c4.encode_utf8(&mut bytes);
    test_if_first_byte!(bytes[0], 4);
    test_if_not_first_byte!(bytes[1]);
    test_if_not_first_byte!(bytes[2]);
    test_if_not_first_byte!(bytes[3]);
  }
  for exception in  [0xc0, 0xc1, 0xf5, 0xf6, 0xf7, 0xf8,
                     0xf9, 0xfa, 0xfb, 0xfc, 0xfe, 0xff] {
    test_if_not_first_byte!(exception);       
  }
}

Perhaps the other operation I'd recommend would be the ability to decode a byte slice into a u32.

Code:
const unsafe fn _decode_as_codepoint(b: &[u8], n: usize) -> u32 {
  macro_rules! g {
    ($i: literal) => { *b.as_ptr().add($i) }
  }
  match n {
    1 => {
      g!(0) as u32
    },
    2 => {
      (((g!(0) & 0b0001_1111) as u32) << 6) |
      (((g!(1) & 0b0011_1111) as u32) << 0)
    },
    3 => {
      (((g!(0) & 0b0000_1111) as u32) << 12) |
      (((g!(1) & 0b0011_1111) as u32) <<  6) |
      (((g!(2) & 0b0011_1111) as u32) <<  0)
    },
    _ => {
      (((g!(0) & 0b0000_1111) as u32) << 18) |
      (((g!(1) & 0b0011_1111) as u32) << 12) |
      (((g!(2) & 0b0011_1111) as u32) <<  6) |
      (((g!(3) & 0b0011_1111) as u32) <<  0)
    },
  }
}

pub const fn decode_as_codepoint(b: &[u8]) -> Option<u32> {
  if let [b0, ..] = b {
    if let Some(n) = utf8_prefix_len(*b0) {
      if b.len() >= n {
        let mut i = 1;
        while i < n {
          if (b[i] & 0b1100_0000) != 0b1000_0000 {
            return None;
          }
          i += 1;
        }
        return Some(unsafe{_decode_as_codepoint(b, n)});
      }
    }
  }
  return None;
}

// unsafe: no bounds checking, assumes valid utf8
pub const unsafe fn decode_as_codepoint_unchecked(b: &[u8]) -> u32 {
  _decode_as_codepoint(b, utf8_prefix_len_unchecked(b[0]))
}

#[test]
fn test_codepoint_decoding() {
  let mut bytes = [0u8; 4];

  macro_rules! test_decode {
    ($c: literal) => {
      let u = $c as u32;
      $c.encode_utf8(&mut bytes);
      assert_eq!(decode_as_codepoint(&bytes), Some(u));
      assert_eq!(unsafe{decode_as_codepoint_unchecked(&bytes)}, u);
    }
  }

  macro_rules! test_invalid_decode {
    ([$($b: literal),*]) => {
      assert_eq!(decode_as_codepoint(&[$($b),*]), None);
    }
  }

  test_decode!('0');
  test_decode!('a');
  test_decode!('าซ');
  test_decode!('ฮฃ');
  test_decode!('โคก');
  test_decode!('๊ญ');
  test_decode!('๐‘’');
  test_decode!('๐œ‘');
  
  test_invalid_decode!([]);
  test_invalid_decode!([0b1000_0000]);
  test_invalid_decode!([0b1100_0000]);
  test_invalid_decode!([0b1100_0011, 0xff]);
  test_invalid_decode!([0b1110_0001]);
  test_invalid_decode!([0b1111_0101, 0x80]);
  test_invalid_decode!([0b1111_0011, 0xff, 0x80, 0x80]);
}

Again because it's a bridge between the features in the standard library, as you can easily encode characters into byte arrays but not the other way around without creating an extraneous string and getting its parts from an iterator. But I think this one is a bit more involved of a discussion.

2 Likes

There is utf8_char_width in nightly which returns 0 for an invalid UTF8 prefix and the length of the UTF8 sequence for a valid prefix byte.

5 Likes

It's gated behind an internals feature, but I suppose we could expose it somehow. Good find.

2 Likes

You can use the bstr crate (by BurntSushi, author of ripgrep) for this. It features a utf8_chunks method that gets you an iterator where the first item is exactly the UTF-8 prefix you're talking about.

2 Likes

That's a neat crate, but I am often dealing with meta/const/type programming and so I'd like it to be a constant function. It also took me a bit but I eventually realized that I could access the char width by taking the length of the (valid) items of the iterator, not exactly direct but it would work, hehe. (the chunks themselves are normally sufficient and often in non-const contexts but I do need specifically the width if, say, I need to generate an array or other fixed-size container out of it, specifically in my latest use case I wanted a function that read a const string and generated a char array out of it, which needed a const function to count up the number of prefixes/chunks)

Maybe once const traits are available it will be easier for me to use crates for this type of thing. That all said, the internal utf8_char_width function does solve my issue and it made me go "doh" I didn't think to implement it with an array, hehe. Hopefully it will be considered to be elevated as a proper method for u8 (altho I am not picky of what form it will take if it does get stabilized)

1 Like

I forgot about overlongs, etc so I updated my implementation just for the sake of correctness.1

The array version is more to my liking though, I will probably just copy it in a local crate for now. I also took a shot at implementing _decode_as_codepoint as (many) arrays, as an experiment.

Code:
macro_rules! gen_byte_array {
  (|$b: ident| -> $T: ty {$ex: expr}, $d: expr) => {{
    const fn byte_op($b: u8) -> $T  {
      $ex
    }
    let mut u = [$d; 256];
    let mut i = 0;
    while i < 256 {
      u[i] = byte_op(i as u8);
      i += 1;
    }
    u
  }};

  (|$b: ident| -> $T: ty {$ex: expr}) => {
    gen_byte_array!(|$b| -> $T {$ex}, 0)
  }
}

macro_rules! count_idents {
  ($a: ident) => {0};
  ($a: ident, $b: ident) => {1};
  ($a: ident, $b: ident, $c: ident) => {2};
  ($a: ident, $b: ident, $c: ident, $d: ident) => {3};
}

macro_rules! gen_utf8_arrays {
  ($($bn: ident),+) => {
    gen_utf8_arrays!(step 1:
      count_idents!($($bn),*); $($bn),*);
  };

  (step 1: $c: expr; $($bn: ident),+) => {
    gen_utf8_arrays!(step 2:
      0b0011_1111 >> $c, ($c as u32) * 6; $($bn),*);
  };

  (step 2: $B: expr, $S: expr; $b0: ident $(,$bn: ident)*) => {
    const $b0: [u32; 256] = {
      gen_byte_array!(|b| -> u32 {
        ((b & $B) as u32) << $S
      })
    };
    gen_utf8_arrays!(step 3: $S - 6; $($bn),*);
  };


  (step 3: $n: expr; $b: ident $(,$bn: ident)*) => {
    const $b: [u32; 256] = {
      gen_byte_array!(|b| -> u32 {
        ((b & 0b0011_1111) as u32) << $n
      })
    };
    gen_utf8_arrays!(step 3: $n - 6; $($bn),*);
  };

  (step 3: $n: expr;) => {}
}

gen_utf8_arrays! {
  BYTE_2_1,
  BYTE_2_2
}

gen_utf8_arrays! {
  BYTE_3_1,
  BYTE_3_2,
  BYTE_3_3
}

gen_utf8_arrays! {
  BYTE_4_1,
  BYTE_4_2,
  BYTE_4_3,
  BYTE_4_4
}

const unsafe fn _decode_as_codepoint(b: &[u8], n: usize) -> u32 {
  macro_rules! g {
    ($i: literal) => { *b.as_ptr().add($i) as usize }
  }
  match n {
    1 => {
      g!(0) as u32
    },
    2 => {
      BYTE_2_1[g!(0)] | BYTE_2_2[g!(1)]
    },
    3 => {
      BYTE_3_1[g!(0)] | BYTE_3_2[g!(1)]
                      | BYTE_3_3[g!(2)]
    },
    _ => {
      BYTE_4_1[g!(0)] | BYTE_4_2[g!(1)]
                      | BYTE_4_3[g!(2)]
                      | BYTE_4_4[g!(3)]
    },
  }
}

Macros saving the day once again and letting me avoid typing 9 kilobytes of inline integer arrays, hehe.

The post this was a reply to got deleted but I did spend a lot of effort on this before I noticed so hopefully the bump will be forgiven.


1 Honestly I just sorta forgot about it because I commonly only use this with pre-validated byte strings, so I only use the unsafe version.