ABI-Change discussion: About Vec<T>

Currently, &Vec<T> is a pointer to a Vec<T>, which is a (ptr, len, cap) triple.

Thus, indexing an element of a Vec dereferences a pointer twice.

fn indexing_Vec(rhs: Vec<i32>, r: usize) -> i32 {
    assert!(rhs.len() > r);
    //      movq    (%rbx), %rdi        // first dereference: load the data pointer
    //      movl    (%rdi,%rax,4), %ebx // second dereference: load rhs[r]
    rhs[r]
    //      movl    %ebx, %eax
}
Full assembly generated by Godbolt with `-C opt-level=3`:

indexing_Vec:
.Lfunc_begin1:
        .file   5 "/app" "example.rs"
        .loc    5 2 0
        .cfi_startproc
        .cfi_personality 155, DW.ref.rust_eh_personality
        .cfi_lsda 27, .Lexception0
        pushq   %r14
        .cfi_def_cfa_offset 16
        pushq   %rbx
        .cfi_def_cfa_offset 24
        pushq   %rax
        .cfi_def_cfa_offset 32
        .cfi_offset %rbx, -24
        .cfi_offset %r14, -16
        movq    %rdi, %rbx
.Ltmp12:
        .loc    5 3 13 prologue_end
        cmpq    %rsi, 16(%rdi)
        .loc    5 3 5 is_stmt 0
        jbe     .LBB1_1
        .loc    5 0 5
        movq    %rsi, %rax
        .loc    5 4 5 is_stmt 1
        movq    (%rbx), %rdi
.Ltmp13:
        .loc    1 487 1
        movq    8(%rbx), %rsi
.Ltmp14:
        .loc    5 4 5
        movl    (%rdi,%rax,4), %ebx
.Ltmp15:
        .loc    2 241 25
        testq   %rsi, %rsi
        .loc    2 241 12 is_stmt 0
        je      .LBB1_5
.Ltmp16:
        .loc    3 435 16 is_stmt 1
        shlq    $2, %rsi
.Ltmp17:
        .loc    4 250 22
        movl    $4, %edx
        callq   *__rust_dealloc@GOTPCREL(%rip)
.Ltmp18:
.LBB1_5:
        .loc    5 5 2
        movl    %ebx, %eax
        addq    $8, %rsp
        .cfi_def_cfa_offset 24
        popq    %rbx
        .cfi_def_cfa_offset 16
        popq    %r14
        .cfi_def_cfa_offset 8
        retq

The same thing happens with &Vec<T>:

fn indexing_ref_Vec(rhs: &Vec<i32>, r: usize) -> i32 {
    assert!(rhs.len() > r);
    //      movq    (%rdi), %rax        // first dereference: load the data pointer
    //      movl    (%rax,%rsi,4), %eax // second dereference: load rhs[r]
    rhs[r]
}
Full assembly generated by Godbolt with `-C opt-level=3`:

indexing_ref_Vec:
.Lfunc_begin2:
        .loc    5 7 0
        .cfi_startproc
        pushq   %rax
        .cfi_def_cfa_offset 16
.Ltmp20:
        .loc    5 8 13 prologue_end
        cmpq    %rsi, 16(%rdi)
        .loc    5 8 5 is_stmt 0
        jbe     .LBB2_2
        .loc    5 9 5 is_stmt 1
        movq    (%rdi), %rax
        movl    (%rax,%rsi,4), %eax
        .loc    5 10 2
        popq    %rcx
        .cfi_def_cfa_offset 8
        retq
.LBB2_2:
        .cfi_def_cfa_offset 16
        .loc    5 8 5
        leaq    .L__unnamed_1(%rip), %rdi
        leaq    .L__unnamed_3(%rip), %rdx
        movl    $31, %esi
        callq   *core::panicking::panic@GOTPCREL(%rip)
        ud2
.Ltmp21:
.Lfunc_end2:
        .size   indexing_ref_Vec, .Lfunc_end2-indexing_ref_Vec
        .cfi_endproc

But for an array, whether owned with a fixed size or behind a reference (as a slice), only one dereference is needed:

#[no_mangle]
fn indexing_array(rhs: [i32; 5], r: usize) -> i32 {
    assert!(rhs.len() > r);
    //      movl    (%rdi,%rsi,4), %eax
    rhs[r]
}

#[no_mangle]
fn indexing_ref_array(rhs: &[i32], r: usize) -> i32 {
    assert!(rhs.len() > r);
    //      movl    (%rdi,%rdx,4), %eax
    rhs[r]
}
Full assembly generated by Godbolt with `-C opt-level=3`:

    .section        .text.indexing_array,"ax",@progbits
    .globl  indexing_array
    .p2align        4, 0x90
    .type   indexing_array,@function

indexing_array:
.Lfunc_begin3:
        .loc    5 12 0
        .cfi_startproc
        pushq   %rax
        .cfi_def_cfa_offset 16
.Ltmp22:
        .loc    5 13 13 prologue_end
        cmpq    $5, %rsi
        .loc    5 13 5 is_stmt 0
        jae     .LBB3_1
        .loc    5 14 5 is_stmt 1
        movl    (%rdi,%rsi,4), %eax
        .loc    5 15 2
        popq    %rcx
        .cfi_def_cfa_offset 8
        retq
.LBB3_1:
        .cfi_def_cfa_offset 16
        .loc    5 13 5
        leaq    .L__unnamed_1(%rip), %rdi
        leaq    .L__unnamed_4(%rip), %rdx
        movl    $31, %esi
        callq   *core::panicking::panic@GOTPCREL(%rip)
        ud2
.Ltmp23:
.Lfunc_end3:
        .size   indexing_array, .Lfunc_end3-indexing_array
        .cfi_endproc

    .section        .text.indexing_ref_array,"ax",@progbits
    .globl  indexing_ref_array
    .p2align        4, 0x90
    .type   indexing_ref_array,@function

indexing_ref_array:
.Lfunc_begin4:
        .loc    5 17 0
        .cfi_startproc
        pushq   %rax
        .cfi_def_cfa_offset 16
.Ltmp24:
        .loc    5 18 13 prologue_end
        cmpq    %rdx, %rsi
        .loc    5 18 5 is_stmt 0
        jbe     .LBB4_1
        .loc    5 19 5 is_stmt 1
        movl    (%rdi,%rdx,4), %eax
        .loc    5 20 2
        popq    %rcx
        .cfi_def_cfa_offset 8
        retq
.LBB4_1:
        .cfi_def_cfa_offset 16
        .loc    5 18 5
        leaq    .L__unnamed_1(%rip), %rdi
        leaq    .L__unnamed_5(%rip), %rdx
        movl    $31, %esi
        callq   *core::panicking::panic@GOTPCREL(%rip)
        ud2
.Ltmp25:
.Lfunc_end4:

The code above shows a potential performance gain from changing the ABI of Vec<T>.


My proposal is to make Vec<T> a single raw pointer (with a PhantomData marker if necessary) and to store the metadata in the allocation itself.

e.g.,

struct Vec<T> {
    ptr: ArrayPtr<T>,
}
#[repr(C)]
struct Array<T, const N: usize> {
    capacity: usize, // must equal N
    len: usize,      // number of elements
    data: [T; N],
}

Finally, the most important part:

#[repr(C)]
struct ArrayPtr<T> {
    ptr: NonNull<T>, // points to Array<T, N>.data
}

Thus, we could use

unsafe { *ptr.as_ptr().cast::<usize>().sub(1) }

and

unsafe { *ptr.as_ptr().cast::<usize>().sub(2) }

(where `ptr` is the NonNull<T> field of ArrayPtr<T>) to obtain the len and capacity respectively.
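To make the layout concrete, here is a minimal, allocation-only sketch. The Header and ThinVecSketch names are invented for this example; it assumes the element alignment is no larger than usize's, and it leaks its allocation since push/Drop are omitted:

use std::alloc::{alloc, Layout};
use std::mem::align_of;
use std::ptr::NonNull;

// Hypothetical two-word header stored immediately before the elements,
// matching the `sub(2)` / `sub(1)` scheme above.
#[repr(C)]
struct Header {
    capacity: usize,
    len: usize,
}

struct ThinVecSketch<T> {
    ptr: NonNull<T>, // points at the data; the Header sits just before it
}

impl<T> ThinVecSketch<T> {
    fn with_capacity(cap: usize) -> Self {
        // Only valid when no extra padding is inserted between header and data.
        assert!(align_of::<T>() <= align_of::<Header>());
        let (layout, data_offset) = Layout::new::<Header>()
            .extend(Layout::array::<T>(cap).unwrap())
            .unwrap();
        unsafe {
            let raw = alloc(layout.pad_to_align());
            assert!(!raw.is_null(), "allocation failed");
            raw.cast::<Header>().write(Header { capacity: cap, len: 0 });
            ThinVecSketch {
                ptr: NonNull::new_unchecked(raw.add(data_offset).cast()),
            }
        }
    }

    fn len(&self) -> usize {
        // `len` is the usize directly before the data pointer.
        unsafe { *self.ptr.as_ptr().cast::<usize>().sub(1) }
    }

    fn capacity(&self) -> usize {
        // `capacity` is two usizes before the data pointer.
        unsafe { *self.ptr.as_ptr().cast::<usize>().sub(2) }
    }
}

fn main() {
    let v: ThinVecSketch<u32> = ThinVecSketch::with_capacity(8);
    assert_eq!((v.len(), v.capacity()), (0, 8));
}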

--

If we made this ABI change, a Vec<T> could be stored directly in a register, so there would be no need to dereference one pointer just to get the real data pointer.

What's more, we could play some tricks with &Vec<T> and make it just a copy of the Vec<T> (without ownership), so we would never dereference twice.


Currently, Rust has promised:

Most fundamentally, Vec is and always will be a (pointer, capacity, length) triplet. No more, no less. The order of these fields is completely unspecified, and you should use the appropriate methods to modify these. The pointer will never be null, so this type is null-pointer-optimized.

But Rust also says:

Due to its incredibly fundamental nature, Vec makes a lot of guarantees about its design. This ensures that it’s as low-overhead as possible in the general case, and can be correctly manipulated in primitive ways by unsafe code.

Thus, making such a change might be fine.

Is it possible to make such ABI-change in Rust 2024?

There is a crate on crates.io that provides this: thin_vec.

A disadvantage of this approach is that it forces Vec::new() to allocate memory immediately, even with 0 elements.

You should pretty much never be handing around &Vec<T>, use &[T] instead.
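For example, a function that takes &[T] accepts a Vec via deref coercion, and only one dereference is needed to reach the elements (the sum function here is just an illustration):

// &Vec<i32> would force the extra indirection; &[i32] lets callers
// pass Vecs, arrays, or sub-slices, and the (pointer, length) pair
// travels in registers.
fn sum(xs: &[i32]) -> i32 {
    xs.iter().sum()
}

fn main() {
    let v = vec![1, 2, 3];
    let a = [4, 5, 6];
    assert_eq!(sum(&v), 6);      // &Vec<i32> coerces to &[i32]
    assert_eq!(sum(&a), 15);     // arrays coerce too
    assert_eq!(sum(&v[1..]), 5); // so do sub-slices
}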

10 Likes

Firstly, Vec::new() won't allocate any memory if we provide a

const EMPTY_VEC: Array<(), 0> = Array { capacity: 0, len: 0, data: [] };

All of the pointers could point to this const, without allocating new memory.

Actually, the double dereference also happens when we use Vec<T> by value (as the first example shows).

What's more, if we have to deal with a nested data structure like Vec<Vec<T>>, we can't convert it directly into &[&[T]].

This is why I want to change the ABI.

First, the guarantee that Vec will always be (ptr, length, capacity) is not "unless there's a more efficient representation". It's always.

Second, it will not necessarily be more performant, as the length and capacity would now require indirect access.

Third, this will not help with the double indirection: unless you want to move the Vec, you'd still need &Vec, which is a double pointer. And if you want to move the Vec, you have single indirection right now.

Fourth, don't use &Vec; use &[T], and then you'll have only a single indirection too.

Finally, if you don't put your post in a monospaced font, it will be easier to read.

6 Likes

This is why I'm talking about Rust 2024.

In the real world, we access elements more often than we access the length, so such an optimization would be worth it.

At least, Vec<T> could be indexed directly.

Sometimes you must deal with a struct like Vec<Vec<T>>, and the inner Vec<T>s cannot be converted to &[T] at no cost.

I have no idea why my post is using a monospaced font.

Found the missing slashes.

Editions still need to interoperate with each other, and so they cannot change the layout of types.

Are you sure? Each time you're iterating over the elements you're accessing the length. Each time you index into the Vec you're accessing the length (for the bounds check).

I don't understand.

If you're passing the inner Vec into a function, it can coerce into a slice.

If you're passing the outer Vec, you only have one indirection for each inner Vec (except the outer).
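For illustration (process here is a made-up helper), each inner Vec<i32> deref-coerces to &[i32] at the call site, with one indirection per row and no conversion cost:

fn process(row: &[i32]) -> i32 {
    row.iter().sum()
}

fn main() {
    let grid: Vec<Vec<i32>> = vec![vec![1, 2], vec![3, 4, 5]];
    // `row` is a &Vec<i32>; it coerces to &[i32] when passed along.
    let total: i32 = grid.iter().map(|row| process(row)).sum();
    assert_eq!(total, 15);
}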

3 Likes

In addition to the (sufficient) arguments above, something else that Vec supports is bidirectional conversion with Box<[T]> (discarding unused capacity). That requires that the on-heap state match the layout of a raw array, since we wouldn’t want Box<[T]> to have extra overhead. Changing this would mean removing long-stable API, not just changing ABI.
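For concreteness, a sketch of that round trip using the stable APIs (into_boxed_slice and into_vec):

fn main() {
    let mut v: Vec<i32> = Vec::with_capacity(10);
    v.extend([1, 2, 3]);
    // Drops the unused capacity; the heap payload is already a plain
    // array of T, which is exactly what Box<[T]> points to.
    let b: Box<[i32]> = v.into_boxed_slice();
    // The reverse direction is O(1): the allocation is reused,
    // with capacity == length.
    let v2: Vec<i32> = b.into_vec();
    assert_eq!(v2, [1, 2, 3]);
}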

10 Likes

Actually, iterating only compares pointers (this is why Vec<ZST>'s capacity is not 0).
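A rough, hand-written sketch of that pointer-comparison loop (the real slice iterator is more careful, and ZSTs take a different path):

fn sum(xs: &[u32]) -> u32 {
    let mut p = xs.as_ptr();
    // The length is read once, up front, to compute the end pointer...
    let end = unsafe { p.add(xs.len()) };
    let mut total = 0;
    // ...and the loop itself only compares pointers.
    while p != end {
        unsafe {
            total += *p;
            p = p.add(1);
        }
    }
    total
}

fn main() {
    assert_eq!(sum(&[1, 2, 3]), 6);
}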

Currently, if we want to obtain an element from a Vec<T>, we have to dereference twice, since a Vec<T> is too big to be stored in a register.

Yes, you're right.

Sure, but how do you compute the end pointer? You add the length to the start pointer.

We would only calculate it once.

And since the length is stored next to the items we're about to iterate over, calculating the end pointer might trigger a prefetch, so the cost might not be too high.

We can also fetch the data pointer once.

And I didn't say it will be less efficient, nor did I say it won't be more efficient. It may be, depending on the access pattern. The current implementation has an important characteristic: it has mostly deterministic performance. Your approach varies a lot more. It could be faster, if the length is on the same cache line as the elements I access, or much slower, if I access far-away elements and the length is maybe even out in main memory. This is why it is not good for the general-purpose Vec in std. It may be well suited to your use case, and in that case, you're welcome to use thin-vec (or write your own).

1 Like

Any kind of change like this is effectively ruled out by the existence of Vec::from_raw_parts. Technically it could be changed to reallocate on the spot and still satisfy its API contract, but people expect converting Vec to its component fields and back to be a free operation.
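For example, users expect this round trip to be free, with no reallocation or copying:

use std::mem::ManuallyDrop;

fn main() {
    let mut v = ManuallyDrop::new(vec![1u32, 2, 3]);
    // Decompose the Vec into its component fields...
    let (ptr, len, cap) = (v.as_mut_ptr(), v.len(), v.capacity());
    // ...and reassemble it: the very same heap block is handed back.
    let v2 = unsafe { Vec::from_raw_parts(ptr, len, cap) };
    assert_eq!(v2, [1, 2, 3]);
}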

9 Likes

Actually, you could implement from_raw_parts to just ignore the cap/len arguments (and offset the data pointer back to the front of the allocation).

But yes, Vec<T> promises that it's (ptr, len, cap), and that's not going to change. I think we guarantee that Box<[T]> -> Vec<T> is O(1). We also have that Vec<ZST> doesn't allocate, which isn't possible if len/cap are outlined.

Plus, in general, the optimization of passing &V as just V is problematic. The most obvious issue is that you can use an `as usize` cast to observe the address of the reference, and we (almost certainly) guarantee address stability, which isn't provided if we optimize by-reference into by-value passing.

In general, this just isn't a necessary problem. When dealing with concrete types, use &[T] instead of &Vec<T>. Methods on Vec are inlinable and inlining trivially removes the temporary reference.

2 Likes

My understanding is that LLVM is still free to convert between by-reference and by-value passing if it thinks it's better and can prove that the optimization is correct. Is that true?

A similar trick was tried with a hash map a while back:

1 Like

Note that with the proposed design, Vec::from_raw_parts would still require no allocations. Instead, it would have a precondition that ptr points inside the Array struct, i.e. not only must the range ptr..ptr + len consist of allocated memory, but ptr.cast::<usize>().sub(2) must also be within the same allocation (so that the (len, capacity) header can live in the two words before ptr).

However, this would be incompatible with the current API and usage of Vec, which is often used to manage arbitrary allocations of `len` objects. It would also noticeably increase the heap usage of short vectors: two extra words take 16 bytes on 64-bit systems, which is enough room for 4-16 small elements.

Similarly, Vec::new() wouldn't need to allocate. It would just use a null pointer to specify an empty Vec. On the other hand, it would mean that every access to the vector would have to branch on the initial pointer, which would inhibit optimizations.
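For illustration, a hypothetical len accessor under that null-sentinel scheme (the function name is invented, and it assumes the two-word header layout proposed earlier in the thread):

// Every call must branch on the pointer before touching the header.
fn len_with_null_sentinel<T>(data: *const T) -> usize {
    if data.is_null() {
        0 // empty Vec: there is no header to read
    } else {
        // `len` is the usize directly before the element data.
        unsafe { *data.cast::<usize>().sub(1) }
    }
}

fn main() {
    assert_eq!(len_with_null_sentinel::<u32>(std::ptr::null()), 0);
}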

This change would also mean that it is no longer possible to separately mutably borrow Vec.len, Vec.capacity and Vec.ptr. This is only an issue for the implementation itself, since those fields are not exposed. However, it may still complicate the implementation, negatively affect performance and increase the probability of unsoundness. Benchmarks would be required here.

Overall, besides being incompatible with the current API and layout of Vec (which unsafe code often relies on), this is a change of very dubious performance and memory-use benefit. There are certainly contexts where it could be worthwhile, but as a general-purpose growable buffer it looks unsuitable.

1 Like

Since thin_vec already exists as a crate, I'd be interested to see benchmarks comparing Vec and ThinVec! That could help dispel the differing assumptions about performance effects of the different layouts.

...not interested enough to do it myself, though :sweat_smile:

3 Likes

This is likely accurate (where "prove it's correct" includes proving the address is never potentially observed), but LLVM typically doesn't like to change the ABI of even internal/nonexposed functions when they haven't been inlined out of being a function call.

The reason for this is unclear, but probably a combination of at least that

  • Properly codegen-unit internal functions are rare in C/C++, thus most functions need to maintain their proper ABI; and
  • Saying one ABI is an optimization over another is very rarely a simple question, since it has knock-on effects for register allocation.

(A like notification got me here and we've discussed similar things in the thread about pass-by-reference and Copy types.)

3 Likes

This topic was automatically closed 90 days after the last reply. New replies are no longer allowed.