Let's look at single-byte to UTF-8 decode:
pub fn decode_to_utf8_raw(
    &mut self,
    src: &[u8],
    dst: &mut [u8],
    _last: bool,
) -> (DecoderResult, usize, usize) {
    let mut source = ByteSource::new(src);
    let mut dest = Utf8Destination::new(dst);
    'outermost: loop {
        match dest.copy_ascii_from_check_space_bmp(&mut source) {
            CopyAsciiResult::Stop(ret) => return ret,
            CopyAsciiResult::GoOn((mut non_ascii, mut handle)) => 'middle: loop {
                // Start non-boilerplate
                //
                // Since the non-ASCIIness of `non_ascii` is hidden from
                // the optimizer, it can't figure out that it's OK to
                // statically omit the bound check when accessing
                // `[u16; 128]` with an index
                // `non_ascii as usize - 0x80usize`.
                let mapped =
                    unsafe { *(self.table.get_unchecked(non_ascii as usize - 0x80usize)) };
                // let mapped = self.table[non_ascii as usize - 0x80usize];
                if mapped == 0u16 {
                    return (
                        DecoderResult::Malformed(1, 0),
                        source.consumed(),
                        handle.written(),
                    );
                }
                let dest_again = handle.write_bmp_excl_ascii(mapped);
So we get a read-only input slice and a writable output slice. We wrap these in ByteSource and Utf8Destination, which keep track of our current position (index) within these slices.
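Neither wrapper is quoted in this post. In shape, each is just a slice paired with a position index, roughly like this (a sketch: the field names match the code quoted below, the rest is my assumption):

pub struct ByteSource<'a> {
    slice: &'a [u8], // the read-only input
    pos: usize,      // how far we've read
}

impl<'a> ByteSource<'a> {
    pub fn new(src: &'a [u8]) -> ByteSource<'a> {
        ByteSource { slice: src, pos: 0 }
    }
    pub fn consumed(&self) -> usize {
        self.pos // how many input bytes have been read so far
    }
}

pub struct Utf8Destination<'a> {
    slice: &'a mut [u8], // the writable output
    pos: usize,          // how far we've written
}

impl<'a> Utf8Destination<'a> {
    pub fn new(dst: &'a mut [u8]) -> Utf8Destination<'a> {
        Utf8Destination { slice: dst, pos: 0 }
    }
}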
At the top of the loop, we call dest.copy_ascii_from_check_space_bmp(&mut source), which copies ASCII from src to dst (until the first non-ASCII byte or until space runs out) and advances the position indices within ByteSource and Utf8Destination.
If we didn't run out of space, we get non_ascii (a u8) and mut handle. The handle borrows the Utf8Destination and represents an assertion that the destination has space for at least one Basic Multilingual Plane character encoded as UTF-8 (i.e. at least 3 bytes of space). Writing through the handle consumes the handle, so that there is at most one write transaction per space check.
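The handle type itself isn't quoted in full in this post; a minimal sketch of its shape, inferred from the signatures quoted here (treat the details as assumptions):

pub struct Utf8BmpHandle<'a, 'b: 'a> {
    dest: &'a mut Utf8Destination<'b>,
}

impl<'a, 'b: 'a> Utf8BmpHandle<'a, 'b> {
    // Only the space-checking method creates handles, and only after
    // verifying that at least 3 bytes remain in the destination.
    fn new(dest: &'a mut Utf8Destination<'b>) -> Utf8BmpHandle<'a, 'b> {
        Utf8BmpHandle { dest }
    }
    pub fn written(&self) -> usize {
        self.dest.pos
    }
}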
On the last line quoted here, we call handle.write_bmp_excl_ascii(mapped), which looks like this:
pub fn write_bmp_excl_ascii(self, bmp: u16) -> &'a mut Utf8Destination<'b> {
    self.dest.write_bmp_excl_ascii(bmp);
    self.dest
}
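Note that write_bmp_excl_ascii takes self by value, so the write consumes the handle and it can't be used again; what comes back is the borrowed destination, which the decoder binds as dest_again in the loop quoted earlier.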
That, in turn, calls into this stuff:
#[inline(always)]
fn write_code_unit(&mut self, u: u8) {
    unsafe {
        // OK, because we checked before handing out a handle.
        *(self.slice.get_unchecked_mut(self.pos)) = u;
    }
    self.pos += 1;
}

#[inline(always)]
fn write_ascii(&mut self, ascii: u8) {
    debug_assert!(ascii < 0x80);
    self.write_code_unit(ascii);
}

#[inline(always)]
fn write_bmp(&mut self, bmp: u16) {
    if bmp < 0x80u16 {
        self.write_ascii(bmp as u8);
    } else if bmp < 0x800u16 {
        self.write_mid_bmp(bmp);
    } else {
        self.write_upper_bmp(bmp);
    }
}

#[inline(always)]
fn write_mid_bmp(&mut self, mid_bmp: u16) {
    debug_assert!(mid_bmp >= 0x80);
    debug_assert!(mid_bmp < 0x800);
    self.write_code_unit(((mid_bmp as u32 >> 6) | 0xC0u32) as u8);
    self.write_code_unit(((mid_bmp as u32 & 0x3Fu32) | 0x80u32) as u8);
}

#[inline(always)]
fn write_upper_bmp(&mut self, upper_bmp: u16) {
    debug_assert!(upper_bmp >= 0x800);
    self.write_code_unit(((upper_bmp as u32 >> 12) | 0xE0u32) as u8);
    self.write_code_unit((((upper_bmp as u32 & 0xFC0u32) >> 6) | 0x80u32) as u8);
    self.write_code_unit(((upper_bmp as u32 & 0x3Fu32) | 0x80u32) as u8);
}

#[inline(always)]
fn write_bmp_excl_ascii(&mut self, bmp: u16) {
    if bmp < 0x800u16 {
        self.write_mid_bmp(bmp);
    } else {
        self.write_upper_bmp(bmp);
    }
}
So we see that the write after ASCII happens via *(self.slice.get_unchecked_mut(self.pos)) = u, where self.slice is the dst slice from decode_to_utf8_raw.
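As an aside, the shifts and masks in write_mid_bmp and write_upper_bmp are just the standard UTF-8 byte layout for two- and three-byte sequences. Restated as standalone functions with worked examples (illustrative code, not from the library):

// Standalone restatement of the bit math in write_mid_bmp and
// write_upper_bmp; illustrative code, not from the library.
fn utf8_two_byte(bmp: u16) -> [u8; 2] {
    // U+0080..=U+07FF: 110xxxxx 10xxxxxx
    [
        ((bmp as u32 >> 6) | 0xC0) as u8,
        ((bmp as u32 & 0x3F) | 0x80) as u8,
    ]
}

fn utf8_three_byte(bmp: u16) -> [u8; 3] {
    // U+0800..=U+FFFF: 1110xxxx 10xxxxxx 10xxxxxx
    [
        ((bmp as u32 >> 12) | 0xE0) as u8,
        (((bmp as u32 & 0xFC0) >> 6) | 0x80) as u8,
        ((bmp as u32 & 0x3F) | 0x80) as u8,
    ]
}

fn main() {
    assert_eq!(utf8_two_byte(0x00E4), [0xC3, 0xA4]); // U+00E4 ä
    assert_eq!(utf8_three_byte(0x20AC), [0xE2, 0x82, 0xAC]); // U+20AC €
}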
How can the write_code_unit write overlap a usize write, then? Let's look at how the ASCII copying happens.
First, we call into this:
#[inline(always)]
pub fn copy_ascii_from_check_space_bmp<'b>(
    &'b mut self,
    source: &mut ByteSource,
) -> CopyAsciiResult<(DecoderResult, usize, usize), (u8, Utf8BmpHandle<'b, 'a>)> {
    let non_ascii_ret = {
        let dst_len = self.slice.len();
        let src_remaining = &source.slice[source.pos..];
        let dst_remaining = &mut self.slice[self.pos..];
        let (pending, length) = if dst_remaining.len() < src_remaining.len() {
            (DecoderResult::OutputFull, dst_remaining.len())
        } else {
            (DecoderResult::InputEmpty, src_remaining.len())
        };
        match unsafe {
            ascii_to_ascii(src_remaining.as_ptr(), dst_remaining.as_mut_ptr(), length)
        } {
            None => {
                source.pos += length;
                self.pos += length;
                return CopyAsciiResult::Stop((pending, source.pos, self.pos));
            }
            Some((non_ascii, consumed)) => {
                source.pos += consumed;
                self.pos += consumed;
                if self.pos + 2 < dst_len {
                    source.pos += 1; // +1 for non_ascii
                    non_ascii
                } else {
                    return CopyAsciiResult::Stop((
                        DecoderResult::OutputFull,
                        source.pos,
                        self.pos,
                    ));
                }
            }
        }
    };
    CopyAsciiResult::GoOn((non_ascii_ret, Utf8BmpHandle::new(self)))
}
So the code first takes tail subslices of the source and destination, but instead of passing those subslices onward, it passes the start pointers and the minimum of their lengths to ascii_to_ascii, which is generated by the macro invocation ascii_alu!(ascii_to_ascii, u8, u8, ascii_to_ascii_stride);.
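Before following it there, note the two details in copy_ascii_from_check_space_bmp that do the safety work: the length passed to ascii_to_ascii is the minimum of the two remaining lengths (with pending recording which side would run out first), and the self.pos + 2 < dst_len check is exactly the handle's three-byte guarantee. Restated in isolation (illustrative, not library code):

// Illustrative restatements, not library code.

// Which result is pending, and how many bytes ascii_to_ascii may touch:
fn pending_and_length(src_remaining: usize, dst_remaining: usize) -> (&'static str, usize) {
    if dst_remaining < src_remaining {
        ("OutputFull", dst_remaining) // the destination runs out first
    } else {
        ("InputEmpty", src_remaining) // the source runs out first (or a tie)
    }
}

// The handle's guarantee: indices pos, pos + 1 and pos + 2 are all in
// bounds, i.e. room for the longest UTF-8 form of a BMP scalar.
fn has_space_for_bmp(pos: usize, dst_len: usize) -> bool {
    pos + 2 < dst_len
}

fn main() {
    assert_eq!(pending_and_length(10, 4), ("OutputFull", 4));
    assert!(has_space_for_bmp(1, 4));  // bytes 1, 2 and 3 are writable
    assert!(!has_space_for_bmp(2, 4)); // only bytes 2 and 3 remain
}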
The generated ascii_to_ascii looks like this:
#[allow(unused_macros)]
macro_rules! ascii_alu {
    ($name:ident,
     $src_unit:ty,
     $dst_unit:ty,
     $stride_fn:ident) => (
        #[cfg_attr(feature = "cargo-clippy", allow(never_loop))]
        #[inline(always)]
        pub unsafe fn $name(src: *const $src_unit, dst: *mut $dst_unit, len: usize) -> Option<($src_unit, usize)> {
            let mut offset = 0usize;
            // This loop is only broken out of as a `goto` forward
            loop {
                let mut until_alignment = {
                    // Check if the other unit aligns if we move the narrower unit
                    // to alignment.
                    // if ::std::mem::size_of::<$src_unit>() == ::std::mem::size_of::<$dst_unit>() {
                    // ascii_to_ascii
                    let src_alignment = (src as usize) & ALU_ALIGNMENT_MASK;
                    let dst_alignment = (dst as usize) & ALU_ALIGNMENT_MASK;
                    if src_alignment != dst_alignment {
                        break;
                    }
                    (ALU_ALIGNMENT - src_alignment) & ALU_ALIGNMENT_MASK
                    // } else if ::std::mem::size_of::<$src_unit>() < ::std::mem::size_of::<$dst_unit>() {
                    // ascii_to_basic_latin
                    // let src_until_alignment = (ALIGNMENT - ((src as usize) & ALIGNMENT_MASK)) & ALIGNMENT_MASK;
                    // if (dst.offset(src_until_alignment as isize) as usize) & ALIGNMENT_MASK != 0 {
                    //     break;
                    // }
                    // src_until_alignment
                    // } else {
                    // basic_latin_to_ascii
                    // let dst_until_alignment = (ALIGNMENT - ((dst as usize) & ALIGNMENT_MASK)) & ALIGNMENT_MASK;
                    // if (src.offset(dst_until_alignment as isize) as usize) & ALIGNMENT_MASK != 0 {
                    //     break;
                    // }
                    // dst_until_alignment
                    // }
                };
                if until_alignment + ALU_STRIDE_SIZE <= len {
                    // Moving pointers to alignment seems to be a pessimization on
                    // x86_64 for operations that have UTF-16 as the internal
                    // Unicode representation. However, since it seems to be a win
                    // on ARM (tested ARMv7 code running on ARMv8 [rpi3]), except
                    // mixed results when encoding from UTF-16 and since x86 and
                    // x86_64 should be using SSE2 in due course, keeping the move
                    // to alignment here. It would be good to test on more ARM CPUs
                    // and on real MIPS and POWER hardware.
                    while until_alignment != 0 {
                        let code_unit = *(src.offset(offset as isize));
                        if code_unit > 127 {
                            return Some((code_unit, offset));
                        }
                        *(dst.offset(offset as isize)) = code_unit as $dst_unit;
                        offset += 1;
                        until_alignment -= 1;
                    }
                    let len_minus_stride = len - ALU_STRIDE_SIZE;
                    loop {
                        if let Some(num_ascii) = $stride_fn(src.offset(offset as isize) as *const usize,
                                                            dst.offset(offset as isize) as *mut usize) {
                            offset += num_ascii;
                            return Some((*(src.offset(offset as isize)), offset));
                        }
                        offset += ALU_STRIDE_SIZE;
                        if offset > len_minus_stride {
                            break;
                        }
                    }
                }
                break;
            }
            while offset < len {
                let code_unit = *(src.offset(offset as isize));
                if code_unit > 127 {
                    return Some((code_unit, offset));
                }
                *(dst.offset(offset as isize)) = code_unit as $dst_unit;
                offset += 1;
            }
            None
        }
    );
}
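The ALU_* constants aren't quoted in this post. A plausible set of definitions, consistent with how the macro and the two-word stride function use them (an assumption on my part, not the library's actual declarations):

// Assumed definitions; the real ones may be spelled differently.
const ALU_ALIGNMENT: usize = ::std::mem::size_of::<usize>(); // 8 on 64-bit
const ALU_ALIGNMENT_MASK: usize = ALU_ALIGNMENT - 1;
const ALU_STRIDE_SIZE: usize = 2 * ALU_ALIGNMENT; // the stride function handles two words

fn main() {
    // until_alignment: how many bytes until a pointer reaches usize alignment.
    let src = 0x1003usize; // a hypothetical address, misaligned by 3
    let until_alignment = (ALU_ALIGNMENT - (src & ALU_ALIGNMENT_MASK)) & ALU_ALIGNMENT_MASK;
    assert_eq!(until_alignment, 5); // 0x1003 + 5 == 0x1008, 8-byte aligned
}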
We see that when $stride_fn fails, it returns the number of ASCII bytes in the stride. The stride function looks like this:
#[inline(always)]
unsafe fn ascii_to_ascii_stride(src: *const usize, dst: *mut usize) -> Option<usize> {
    let word = *src;
    let second_word = *(src.offset(1));
    *dst = word;
    *(dst.offset(1)) = second_word;
    find_non_ascii(word, second_word)
}
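find_non_ascii isn't quoted here. The classic way to do this check, and a sketch of what such a function can look like (my reconstruction, not the library's actual code), is to test the high bit of every byte in both words at once:

// A sketch of the kind of check find_non_ascii performs; my
// reconstruction, not the library's actual code. The mask has the high
// bit of each byte set (the `as` cast truncates it correctly for a
// 32-bit usize), and the index math assumes little-endian byte order.
const ASCII_MASK: usize = 0x8080_8080_8080_8080u64 as usize;

#[inline(always)]
fn find_non_ascii(word: usize, second_word: usize) -> Option<usize> {
    let first_masked = word & ASCII_MASK;
    let second_masked = second_word & ASCII_MASK;
    if (first_masked | second_masked) == 0 {
        // Every byte in both words was ASCII; the stride succeeded.
        return None;
    }
    if first_masked != 0 {
        // The lowest set bit belongs to the first non-ASCII byte.
        Some(first_masked.trailing_zeros() as usize / 8)
    } else {
        Some(::std::mem::size_of::<usize>() + second_masked.trailing_zeros() as usize / 8)
    }
}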
And there it is: ascii_to_ascii_stride writes the stride unconditionally and then examines it for non-ASCII. So in the case where non-ASCII was found, the usize writes made here overlap the u8 writes made in
fn write_code_unit(&mut self, u: u8) {
    unsafe {
        // OK, because we checked before handing out a handle.
        *(self.slice.get_unchecked_mut(self.pos)) = u;
    }
    self.pos += 1;
}
which we saw earlier.
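To make the overlap concrete, here's a standalone simulation of one round (illustrative only, not library code). Suppose the lookup table maps byte 0xE4 to U+00E4, as windows-1252 does, and the stride starts at position 0:

fn main() {
    let src: [u8; 16] = [
        0x61, 0x62, 0xE4, 0x63, 0x64, 0x65, 0x66, 0x67,
        0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
    ];
    let mut dst = [0u8; 24];
    // ascii_to_ascii_stride stores both usize words unconditionally,
    // copying all 16 bytes, the non-ASCII 0xE4 included:
    dst[..16].copy_from_slice(&src); // stand-in for the two usize stores
    // find_non_ascii then reports 2 leading ASCII bytes, so the
    // positions advance by 2 and the decoder gets non_ascii == 0xE4.
    // The handle writes U+00E4 as UTF-8 at dst[2..4], overlapping two
    // bytes that the usize stores above already wrote:
    dst[2] = 0xC3;
    dst[3] = 0xA4;
    assert_eq!(&dst[..4], &[0x61, 0x62, 0xC3, 0xA4]);
    // dst[4..16] still holds stale stride output, but it lies beyond
    // `pos`, so the next round of ASCII copying overwrites it as needed.
}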