Gotos in restricted functions


#1

I seem to remember that Rust doesn’t have gotos because they interact badly with the borrow checker. But to create efficient and easy to write finite state machines and simple interpreters it’s nice to have gotos and computed (indirect) gotos standardized in the language.

So is it a good idea to add a standard function annotation like #[allow_goto] that allows the usage of gotos and computed gotos inside a function? Such annotation should forbid the kind of code that requires the borrow checker (so just copy values). This is a limitation, but I think it’s acceptable because the kind of function where you want to write a goto-based FSM or computed gotos probably looks a lot like C code.


#2

Dear @Leonardo, Could you provide an example of a code that you want to write with goto? The problem isn’t only the borrow checker. Look at the following example:

fn something() {
    goto_label:
    let str = String::from("Some string");
    if is_some_condition {
        goto goto_label;
    }
}

How about destructible types? Should they be allowed? I’d like to see an example where you use only values of types that supports copy.


#3

If a ‘proper tail calls’ feature ever gets implemented, it should provide a decent way to simulate regular gotos (via inlining), though that doesn’t really help with computed gotos…


#4

Gotos are handy to write efficient, compact, and easy to write finite state machines. This is a little example in D language, it generates the “Look and say sequence” (from: http://rosettacode.org/wiki/Look-and-say_sequence#Fast_Imperative_Version ), and it’s quite fast:

import core.stdc.stdio, std.math, std.conv, std.algorithm, std.array;
 
void showLookAndSay(bool showArrays)(in uint n) nothrow {
    if (n == 0) // No sequences to generate and show.
        return;
 
    enum Digit : char { nil = '\0', one = '1', two = '2', thr = '3' }
 
    // Allocate an approximate upper bound size for the array.
    static Digit* allocBuffer(in uint m) nothrow {
        immutable len = cast(size_t)(100 + 1.05 *
                                     exp(0.269 * m + 0.2686)) + 1;
        auto a = len.uninitializedArray!(Digit[]);
        printf("Allocated %d bytes.\n", a.length * Digit.sizeof);
        return a.ptr;
    }
 
    // Can't be expressed in the D type system:
    // a1 and a2 are immutable pointers to mutable data.
    auto a1 = allocBuffer(n % 2 ? n : n - 1);
    auto a2 = allocBuffer(n % 2 ? n - 1 : n);
    printf("\n");
 
    a1[0] = Digit.one;
    size_t len1 = 1;
    a1[len1] = Digit.nil;
 
    foreach (immutable i; 0 .. n - 1) {
        static if (showArrays)
            printf("%2u: %s\n", i + 1, a1);
        else
            printf("%2u: n. digits: %u\n", i + 1, len1);
        auto p1 = a1,
             p2 = a2;
 
        S0: final switch (*p1++) with (Digit) { // Initial state.
                case nil: goto END;
                case one: goto S1;
                case two: goto S2;
                case thr: goto S3;
            }
        S1: final switch (*p1++) with (Digit) {
                case nil: *p2++ = one; *p2++ = one; goto END;
                case one: goto S11;
                case two: *p2++ = one; *p2++ = one; goto S2;
                case thr: *p2++ = one; *p2++ = one; goto S3;
            }
        S2: final switch (*p1++) with (Digit) {
                case nil: *p2++ = one; *p2++ = two; goto END;
                case one: *p2++ = one; *p2++ = two; goto S1;
                case two: goto S22;
                case thr: *p2++ = one; *p2++ = two; goto S3;
            }
        S3: final switch (*p1++) with (Digit) {
                case nil: *p2++ = one; *p2++ = thr; goto END;
                case one: *p2++ = one; *p2++ = thr; goto S1;
                case two: *p2++ = one; *p2++ = thr; goto S2;
                case thr: goto S33;
            }
        S11: final switch (*p1++) with (Digit) {
                case nil: *p2++ = two; *p2++ = one; goto END;
                case one: *p2++ = thr; *p2++ = one; goto S0;
                case two: *p2++ = two; *p2++ = one; goto S2;
                case thr: *p2++ = two; *p2++ = one; goto S3;
            }
        S22: final switch (*p1++) with (Digit) {
                case nil: *p2++ = two; *p2++ = two; goto END;
                case one: *p2++ = two; *p2++ = two; goto S1;
                case two: *p2++ = thr; *p2++ = two; goto S0;
                case thr: *p2++ = two; *p2++ = two; goto S3;
            }
        S33: final switch (*p1++) with (Digit) {
                case nil: *p2++ = two; *p2++ = thr; goto END;
                case one: *p2++ = two; *p2++ = thr; goto S1;
                case two: *p2++ = two; *p2++ = thr; goto S2;
                case thr: *p2++ = thr; *p2++ = thr; goto S0;
            }
        END:
            immutable len2 = p2 - a2;
            a2[len2] = Digit.nil;
            a1.swap(a2);
            len1 = len2;
    }
 
    static if (showArrays)
        printf("%2u: %s\n", n, a1);
    else
        printf("%2u: n. digits: %u\n", n, len1);
}
 
void main(in string[] args) {
    immutable n = (args.length == 2) ? args[1].to!uint : 10;
    n.showLookAndSay!true;
}

If you are using GNU-C you can write a little more efficient code, with computed gotos and the builtin_expect:

void audioactive_loop(int n) {
    unsigned long len1, len2, i;
    unsigned char *s1, *p1, *s2, *p2, *pend, *paux;
    const void *labelsS123[] = {&&S1, &&S2, &&S3}; // Comment this out if you don't use GCC

    // ...
    
    len1 = 1;
    s1[0] = '1';
    s1[len1] = '\0';

    for (i = 0; i < n - 1; i++) {
        if (DOPRINT)
            printf("%d: %s\n", i+1, s1);
        else
            printf("%d: size=%d\n", i+1, len1);

        p1 = s1;
        p2 = s2;
        pend = s1 + len1;

        S0:
            if (__builtin_expect(p1 == pend, 0)) goto END;
            goto *labelsS123[*p1++ - '1'];

        S1:
            if (__builtin_expect(p1 == pend, 0)) { *p2++ = '1'; *p2++ = '1'; goto END; }
            switch (*p1++) {
                case '1': goto S11;
                case '2': *p2++ = '1'; *p2++ = '1'; goto S2;
                case '3': *p2++ = '1'; *p2++ = '1'; goto S3;
            }

    // ...

Regarding computed gotos they are handy to write simple but sufficiently efficient interpreters (and if you don’t want to create a JIT or to use something like PyPy). For a little self-contained example of an interpreter you can see the solutions to the ICFP 2006 contest:

http://fileformats.archiveteam.org/wiki/Universal_Machine_(ICFP_programming_contest_2006)

This is an efficient solution in GNU-C:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

typedef unsigned char uint8;
typedef unsigned short uint16;
typedef unsigned int uint32;
typedef unsigned long long uint64;

#define CompileTime_Assert( x ) extern char __ctassert[(x)];

uint32 const BUF_SIZE = 4*1024*1024;
uint32 const MIN_ALLOC = 8;

struct OPTYPE
{
    void * m_op;
    uint32 *__restrict m_A;
    uint32 *__restrict m_B;
};


uint32 s_buffer[BUF_SIZE*MIN_ALLOC];
uint32 *s_free = 0;

inline uint32 *allocate( uint32 _size )
{
  uint32 *ptr;
  
  if( (_size < MIN_ALLOC) && s_free )
  {
    ptr = s_free;
    s_free = (uint32*)*ptr;
    *ptr = _size;
        CompileTime_Assert( MIN_ALLOC == 8 ); // unroll
        ptr[1] = 0;
        ptr[2] = 0;
        ptr[3] = 0;
        ptr[4] = 0;
        ptr[5] = 0;
        ptr[6] = 0;
        ptr[7] = 0;
  }
  else
  {
    ptr = new uint32[_size+1];
    *ptr = _size;
    memset(ptr+1,0,_size*4);
  }
  return ptr+1;
}

inline void destroy( void * _ptr )
{
  uint32 *ptr = ((uint32*)_ptr)-1;
  if( ((uint32)(ptr-s_buffer))<(BUF_SIZE*MIN_ALLOC) )
  {
    *ptr = (uint)s_free;
    s_free = ptr;
  }
  else
  {
    delete[] ptr;
  }
}

CompileTime_Assert( sizeof(uint32) == sizeof(uint32 *) );

#define A (*ip->m_A)
#define B (*ip->m_B)
#define C (*ip->m_C)
    
class VM
{    
public:
    static void Go( uint32 *_arr0 )
    {        
        void * const labels[] = 
        {
            &&HALT,
            &&HALT,
            &&HALT,
            &&HALT,
            &&HALT,
            &&HALT,
            &&HALT,
            &&HALT,
            &&OP_08,
            &&OP_09,
            &&OP_10,
            &&OP_11,
            &&OP_12
        };

        void * const OP00[] = 
    {
      &&OP_00_0,
      &&OP_00_1,
      &&OP_00_2,
      &&OP_00_3,
      &&OP_00_4,
      &&OP_00_5,
      &&OP_00_6,
      &&OP_00_7,
    };

        void * const OP01[] = 
    {
      &&OP_01_0,
      &&OP_01_1,
      &&OP_01_2,
      &&OP_01_3,
      &&OP_01_4,
      &&OP_01_5,
      &&OP_01_6,
      &&OP_01_7,
    };

        void * const OP02[] = 
    {
      &&OP_02_0,
      &&OP_02_1,
      &&OP_02_2,
      &&OP_02_3,
      &&OP_02_4,
      &&OP_02_5,
      &&OP_02_6,
      &&OP_02_7,
    };

        void * const OP03[] = 
    {
      &&OP_03_0,
      &&OP_03_1,
      &&OP_03_2,
      &&OP_03_3,
      &&OP_03_4,
      &&OP_03_5,
      &&OP_03_6,
      &&OP_03_7,
    };

        void * const OP04[] = 
    {
      &&OP_04_0,
      &&OP_04_1,
      &&OP_04_2,
      &&OP_04_3,
      &&OP_04_4,
      &&OP_04_5,
      &&OP_04_6,
      &&OP_04_7,
    };

        void * const OP05[] = 
    {
      &&OP_05_0,
      &&OP_05_1,
      &&OP_05_2,
      &&OP_05_3,
      &&OP_05_4,
      &&OP_05_5,
      &&OP_05_6,
      &&OP_05_7,
    };

        void * const OP06[] = 
    {
      &&OP_06_0,
      &&OP_06_1,
      &&OP_06_2,
      &&OP_06_3,
      &&OP_06_4,
      &&OP_06_5,
      &&OP_06_6,
      &&OP_06_7,
    };

        void * const OP13[] = 
    {
      &&OP_13_0,
      &&OP_13_1,
      &&OP_13_2,
      &&OP_13_3,
      &&OP_13_4,
      &&OP_13_5,
      &&OP_13_6,
      &&OP_13_7,
    };

        uint32 __restrict reg[8] = { 0, 0, 0, 0, 0, 0, 0, 0 };
        
        OPTYPE * __restrict insns = new OPTYPE[(_arr0[-1]+3)&~3];
        {
            for(uint i = 0; i < _arr0[-1]; i+=4 )
            {
        insns[i+0].m_op = &&XLATE;
        insns[i+1].m_op = &&XLATE;
        insns[i+2].m_op = &&XLATE;
        insns[i+3].m_op = &&XLATE;
            }
        }
        register OPTYPE * __restrict ip = insns;
                        
// JIT compile
XLATE:
    {
      uint32 const op = _arr0[ip-insns];
      uint32 const opcode = (op>>28);
      
      switch( opcode )
      {          
        case 0:
             ip->m_op = OP00[((op>>6)&7)];
          ip->m_A = reg+ ((op>>3)&7);
             ip->m_B = reg+ (op&7);
          goto *ip->m_op;       
             
        case 1:
             ip->m_op = OP01[((op>>3)&7)];
          ip->m_A = reg+ ((op>>6)&7);
             ip->m_B = reg+ (op&7);
          goto *ip->m_op;          

        case 2:
             ip->m_op = OP02[((op>>6)&7)];
          ip->m_A = reg+ ((op>>3)&7);
             ip->m_B = reg+ (op&7);
          goto *ip->m_op;          

        case 3:
             ip->m_op = OP03[((op>>6)&7)];
          ip->m_A = reg+ ((op>>3)&7);
             ip->m_B = reg+ (op&7);
          goto *ip->m_op;       

        case 4:
             ip->m_op = OP04[((op>>6)&7)];
          ip->m_A = reg+ ((op>>3)&7);
             ip->m_B = reg+ (op&7);
          goto *ip->m_op;       

        case 5:
             ip->m_op = OP05[((op>>6)&7)];
          ip->m_A = reg+ ((op>>3)&7);
             ip->m_B = reg+ (op&7);
          goto *ip->m_op;       

        case 6:
             ip->m_op = OP06[((op>>6)&7)];
          ip->m_A = reg+ ((op>>3)&7);
             ip->m_B = reg+ (op&7);
          goto *ip->m_op;       
          
        case 13:
            ip->m_op = OP13[(op>>25)&7];        
          ip->m_A = (uint32*)(_arr0[ip-insns]&((1<<25)-1));
          goto *ip->m_op;
        
        default:
             ip->m_op = labels[opcode];
          ip->m_A = reg+ ((op>>3)&7);
             ip->m_B = reg+ (op&7);
          goto *ip->m_op;          
      }
    }
        
        // ----------------------------------------
OP_00_0:
            if( !B ) { goto *(++ip)->m_op; }
      reg[0] = A;
            goto *(++ip)->m_op;                  
OP_03_0:
      reg[0] = A + B;
            goto *(++ip)->m_op;
OP_04_0:
      reg[0] = A * B;
            goto *(++ip)->m_op;
OP_05_0:
      reg[0] = A / B;
            goto *(++ip)->m_op;
OP_06_0:
            reg[0] = ~(A & B);
            goto *(++ip)->m_op;

OP_00_1:
            if( !B ) { goto *(++ip)->m_op; }
      reg[1] = A;
            goto *(++ip)->m_op;                  
OP_03_1:
      reg[1] = A + B;
            goto *(++ip)->m_op;
OP_04_1:
      reg[1] = A * B;
            goto *(++ip)->m_op;
OP_05_1:
      reg[1] = A / B;
            goto *(++ip)->m_op;
OP_06_1:
            reg[1] = ~(A & B);
            goto *(++ip)->m_op;

OP_00_2:
            if( !B ) { goto *(++ip)->m_op; }
      reg[2] = A;
            goto *(++ip)->m_op;                  
OP_03_2:
      reg[2] = A + B;
            goto *(++ip)->m_op;
OP_04_2:
      reg[2] = A * B;
            goto *(++ip)->m_op;
OP_05_2:
      reg[2] = A / B;
            goto *(++ip)->m_op;
OP_06_2:
            reg[2] = ~(A & B);
            goto *(++ip)->m_op;

OP_00_3:
            if( !B ) { goto *(++ip)->m_op; }
      reg[3] = A;
            goto *(++ip)->m_op;                  
OP_03_3:
      reg[3] = A + B;
            goto *(++ip)->m_op;
OP_04_3:
      reg[3] = A * B;
            goto *(++ip)->m_op;
OP_05_3:
      reg[3] = A / B;
            goto *(++ip)->m_op;
OP_06_3:
            reg[3] = ~(A & B);
            goto *(++ip)->m_op;
      
OP_00_4:
            if( !B ) { goto *(++ip)->m_op; }
      reg[4] = A;
            goto *(++ip)->m_op;                  
OP_03_4:
      reg[4] = A + B;
            goto *(++ip)->m_op;
OP_04_4:
      reg[4] = A * B;
            goto *(++ip)->m_op;
OP_05_4:
      reg[4] = A / B;
            goto *(++ip)->m_op;
OP_06_4:
            reg[4] = ~(A & B);
            goto *(++ip)->m_op;

OP_00_5:
            if( !B ) { goto *(++ip)->m_op; }
      reg[5] = A;
            goto *(++ip)->m_op;                  
OP_03_5:
      reg[5] = A + B;
            goto *(++ip)->m_op;
OP_04_5:
      reg[5] = A * B;
            goto *(++ip)->m_op;
OP_05_5:
      reg[5] = A / B;
            goto *(++ip)->m_op;
OP_06_5:
            reg[5] = ~(A & B);
            goto *(++ip)->m_op;
      
OP_00_6:
            if( !B ) { goto *(++ip)->m_op; }
      reg[6] = A;
            goto *(++ip)->m_op;                  
OP_03_6:
      reg[6] = A + B;
            goto *(++ip)->m_op;
OP_04_6:
      reg[6] = A * B;
            goto *(++ip)->m_op;
OP_05_6:
      reg[6] = A / B;
            goto *(++ip)->m_op;
OP_06_6:
            reg[6] = ~(A & B);
            goto *(++ip)->m_op;

OP_00_7:
            if( !B ) { goto *(++ip)->m_op; }
      reg[7] = A;
            goto *(++ip)->m_op;                  
OP_03_7:
      reg[7] = A + B;
            goto *(++ip)->m_op;
OP_04_7:
      reg[7] = A * B;
            goto *(++ip)->m_op;
OP_05_7:
      reg[7] = A / B;
            goto *(++ip)->m_op;
OP_06_7:
            reg[7] = ~(A & B);
            goto *(++ip)->m_op;
      

OP_08:
      A = (uint32)allocate(B);
            goto *(++ip)->m_op;

OP_09:
      destroy((void*)B);
            goto *(++ip)->m_op;

OP_10:
      fputc( B, stdout );
            goto *(++ip)->m_op;

OP_11:
      B = fgetc( stdin );            
            goto *(++ip)->m_op;

// ---

OP_01_0:
            A = (reg[0] ? ((uint32 *)reg[0]) : _arr0)[B] ;
            goto *(++ip)->m_op;

OP_01_1:
            A = (reg[1] ? ((uint32 *)reg[1]) : _arr0)[B] ;
            goto *(++ip)->m_op;

OP_01_2:
            A = (reg[2] ? ((uint32 *)reg[2]) : _arr0)[B] ;
            goto *(++ip)->m_op;

OP_01_3:
            A = (reg[3] ? ((uint32 *)reg[3]) : _arr0)[B] ;
            goto *(++ip)->m_op;

OP_01_4:
            A = (reg[4] ? ((uint32 *)reg[4]) : _arr0)[B] ;
            goto *(++ip)->m_op;

OP_01_5:
            A = (reg[5] ? ((uint32 *)reg[5]) : _arr0)[B] ;
            goto *(++ip)->m_op;

OP_01_6:
            A = (reg[6] ? ((uint32 *)reg[6]) : _arr0)[B] ;
            goto *(++ip)->m_op;

OP_01_7:
            A = (reg[7] ? ((uint32 *)reg[7]) : _arr0)[B] ;
            goto *(++ip)->m_op;

// ---

OP_02_SMC:
      _arr0[A] = B;
      insns[A].m_op = &&XLATE;
      goto *(++ip)->m_op;

OP_02_0:
      if( !reg[0] ) { goto OP_02_SMC; }
      ((uint32 *__restrict)reg[0])[A] = B;
      goto *(++ip)->m_op;

OP_02_1:
      if( !reg[1] ) { goto OP_02_SMC; }
      ((uint32 *__restrict)reg[1])[A] = B;
      goto *(++ip)->m_op;
      
OP_02_2:
      if( !reg[2] ) { goto OP_02_SMC; }
      ((uint32 *__restrict)reg[2])[A] = B;
      goto *(++ip)->m_op;
      
OP_02_3:
      if( !reg[3] ) { goto OP_02_SMC; }
      ((uint32 *__restrict)reg[3])[A] = B;
      goto *(++ip)->m_op;
      
OP_02_4:
      if( !reg[4] ) { goto OP_02_SMC; }
      ((uint32 *__restrict)reg[4])[A] = B;
      goto *(++ip)->m_op;
      
OP_02_5:
      if( !reg[5] ) { goto OP_02_SMC; }
      ((uint32 *__restrict)reg[5])[A] = B;
      goto *(++ip)->m_op;
      
OP_02_6:
      if( !reg[6] ) { goto OP_02_SMC; }
      ((uint32 *__restrict)reg[6])[A] = B;
      goto *(++ip)->m_op;
      
OP_02_7:
      if( !reg[7] ) { goto OP_02_SMC; }
      ((uint32 *__restrict)reg[7])[A] = B;
      goto *(++ip)->m_op;

// ---

OP_12:
             if( !A ) 
      {
        ip = insns+B;
               goto *ip->m_op;
      }

      {
        uint32 const *source = (uint32 const*)A;
        uint32 const dest = B;
                uint32 const size = source[-1];
                if( size > _arr0[-1] )
                {
                    destroy(_arr0);
                    delete[] insns;
                    _arr0 = allocate(size);
                    insns = new OPTYPE[(size+3)&~3];
                }
                memcpy( _arr0, source, size*4 );
                for(uint i = 0; i < size; i+=4 )
                {
                    insns[i+0].m_op = &&XLATE;
                    insns[i+1].m_op = &&XLATE;
                    insns[i+2].m_op = &&XLATE;
                    insns[i+3].m_op = &&XLATE;
                }
        ip = insns+dest;
        goto XLATE;
      }

// ---

OP_13_0:
            reg[0] = (uint32)ip->m_A;
            goto *(++ip)->m_op;

OP_13_1:
            reg[1] = (uint32)ip->m_A;
            goto *(++ip)->m_op;

OP_13_2:
            reg[2] = (uint32)ip->m_A;
            goto *(++ip)->m_op;

OP_13_3:
            reg[3] = (uint32)ip->m_A;
            goto *(++ip)->m_op;

OP_13_4:
            reg[4] = (uint32)ip->m_A;
            goto *(++ip)->m_op;

OP_13_5:
            reg[5] = (uint32)ip->m_A;
            goto *(++ip)->m_op;

OP_13_6:
            reg[6] = (uint32)ip->m_A;
            goto *(++ip)->m_op;

OP_13_7:
            reg[7] = (uint32)ip->m_A;
            goto *(++ip)->m_op;

HALT:
        return;
    }

    
};

int main( int argc, char const ** _argv )
{
  for(uint32 i = 0; i < (BUF_SIZE-1); i++)
  {
    s_buffer[i*MIN_ALLOC] = (uint)(&s_buffer[((i+1)*MIN_ALLOC)]);
  }
  s_buffer[(BUF_SIZE-1)*MIN_ALLOC] = 0;
  s_free = s_buffer;
  
    if( argc == 2 )
    {
        FILE *file = fopen( _argv[1], "r" );
        if( file )
        {
            fseek( file, 0, SEEK_END );
            uint32 fsize = ftell( file );
            rewind( file );
            
            uint32 size = (fsize+3)/4;
            uint32 *data = new uint32[size+1];
            data[0] = size;
            data++;
            fread( data, 1, fsize, file );

            // endian-reverse big-endian data
            for( uint32 i = 0; i < size; i++ )
            {
                data[i] = ( data[i] >> 24 )
                |            (( data[i] >> 8 ) & 0x0000FF00)
                |            (( data[i] << 8 ) & 0x00FF0000)
                |            ( data[i] << 24 );
            }
            
            fclose( file );
            VM::Go( data );
      
            // Leak data[] and let the OS clean it up after us
        }
        else
        {
            printf( "Couldn't open file %s for reading.\n", _argv[1] );
        }
    }
    else
    {
        printf("Usage: icfp filename.um\n");
    }
    
      return 0;
};

#5

I agree gotos would be really useful in some high performance code.

Does it really interact that bad with the Borrow Checker? I guess it could just assume the worst and let the user deal with it (resorting to unsafe, which is fine).


#6

Is there any particular reason you don’t want to write a state machine as a match inside a loop?

enum State {
    S0,
    S1,
    S2,
}

fn state_machine() {
    let mut state = State::S0;
    loop {
        state = match state {
            State::S0 => { println!("goto S1"); State::S1 },
            State::S1 => { println!("goto S2"); State::S2 },
            State::S2 => { println!("done"); break; },
        }
    }
}

#7

From my limited experience with D language (where you can use switches on enums too) the performance of a switch-based version of that FSM for the Look and say sequence is lower. In theory I think this is just a matter of having a sufficiently smart compiler. But in general I find that implementing complex FSMs with gotos leads to simpler code.

And once you have computed gotos, having regular gotos too is probably reasonable.


#8

That seems too restrictive because all sorts of harmless operations will introduce short-lived borrows, even when working with Copy values. For example, all comparisons will create borrows to pass to the relevant method of PartialOrd or PartialEq.


#9

It would be interesting to see an example of a state machine written using match-in-loop (for example, the one for the look and see sequence that you mention) along with the assembly it outputs and an explanation of where it is inefficient and why it would be infeasible to get LLVM to generate efficient code. If that were feasible then - and I’m no rustc developer, so take this with a hefty grain of salt - it would presumably be preferable to improve code generation than add new (difficult, complex, confusing) language features.


#10

One advantage of regular gotos (not computed ones) is that it’s sufficiently easy to quickly compile them into reasonably efficient code, so if you later want to compile Rust code with a GCC or IntelCompiler back-end you have less risks of missed or incorrect optimizations.


#11

Please Note: don’t take much of what I say seriously, I’m mostly being silly but:

Fun fact: that ICFP C program example with gotos is so fast it completes the sandmark.umz benchmark in record time by segfaulting before it even begins the stress test.

(this is after having to compile with -fpermissive otherwise it won’t compile because the original programmer is too lazy to fix their warnings)

./cee sandmark.umz 
trying to Allocate array of size 0..
trying to Abandon size 0 allocation..
trying to Allocate size 11..
trying Array Index on allocated array..
trying Amendment of allocated array..
checking Amendment of allocated array..
trying Alloc(a,a) and amending it..
comparing multiple allocations..
pointer arithmetic..
check old allocation..
simple tests ok!
about to load program from some allocated array..
success.
verifying that the array and its copy are the same...
success.
testing aliasing..
success.
free after loadprog..
success.
loadprog ok.
Segmentation fault (core dumped)

My guess is after spending hours and hours writing the program, and have it segfault on the most basic benchmark (let alone running the entire vm simulation without OOM), the very thought of delving into that mad morass of goto spaghetti was too much, and they despaired and gave up :stuck_out_tongue:

It’s too bad it didn’t run, I was hoping to perf it against my implementation I wrote some time ago:

It’s written in OCaml if you want to test it. If you have a proper install, typing make bench should start the benchmark, and you should see something like:

 make bench
./m32x sandmark.umz
trying to Allocate array of size 0..
trying to Abandon size 0 allocation..
trying to Allocate size 11..
trying Array Index on allocated array..
trying Amendment of allocated array..
checking Amendment of allocated array..
trying Alloc(a,a) and amending it..
comparing multiple allocations..
pointer arithmetic..
check old allocation..
simple tests ok!
about to load program from some allocated array..
success.
verifying that the array and its copy are the same...
success.
testing aliasing..
success.
free after loadprog..
success.
loadprog ok.
 == SANDmark 19106 beginning stress test / benchmark.. ==
100. 12345678.09abcdef
99.  6d58165c.2948d58d
98.  0f63b9ed.1d9c4076
97.  8dba0fc0.64af8685
...
1.   2c80b43d.5646302f
0.   a8d1619e.5540e6cf
SANDmark complete.

Ideally without a segfault :smiley:

It’s very fast, it beat several C implementations. Oh, and it’s actually readable by a regular human (as far as OCaml is readable) :wink: (but seriously though, if you can fix the goto version I’d like to see how much faster (if it even is) than my OCaml version)

On this note, going to give a plug for how terrifyingly efficient the OCaml compiler has gotten. Recently was benching binary parsing in my rust version versus OCaml version and OCaml was pretty much neck and neck…

Good thing I spent all that time thinking about lifetimes :laughing::thinking:


#12

That third program in C was written by someone else (and I have not benchmarked/debugged it). While the first program (in D) and the second program piece (in C) were written by me.

It’s the “Curse of C”, that’s why I am now using Rust since almost a year, I am fed up by C code fragility and unsafety.

I have found several versions of the code, but I have not tested them, this is another one:

#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <string.h>

//#include <arpa/inet.h>
// for ntohl (network to host long) uint32_t ntohl(uint32_t netlong);
#define ntohl __builtin_bswap32


#define BLKSIZE 8
#define CSIZE (256 * 1048576 / (BLKSIZE * 4))

typedef unsigned int u32;

typedef union {
    u32 block[BLKSIZE];
    struct {
        u32 prev;
        u32 next;
    } pointers;
} chunk_t;

chunk_t chunk[CSIZE];
u32 total=CSIZE-1;

#define INRANGE(x) ( (char*)x >= (char*)chunk && \
        (char*) x < ((char*)chunk) + (CSIZE * BLKSIZE * 4))

u32 *codex = 0;
u32 pc = 0;
u32 *seg = NULL;
u32 regs[8] = {0,0,0,0,0,0,0,0};

void my_init() {
    for (int i = 0; i < CSIZE; i++) {
        chunk[i].pointers.prev = i - 1;
        chunk[i].pointers.next = i + 1;
    }
    chunk[0].pointers.prev = (CSIZE-1);
    chunk[CSIZE - 1].pointers.next = 0;
}

u32 my_alloc(u32 size){
    if (size <= BLKSIZE && total != 0) {
        u32 idx = chunk[0].pointers.next;
        u32 nxt = chunk[idx].pointers.next;
        chunk[0].pointers.next = nxt;
        chunk[nxt].pointers.prev = 0;
        for (int i = 0; i < BLKSIZE; i++)
            chunk[idx].block[i] = 0;
        total--;
        return (u32)((void*)(&chunk[idx]));
    } else {
        u32* p = (u32*)calloc(size+1, 4);
        *p = size;
        return (u32)(p + 1);
    }
}

void my_free(u32 u) {
    if (INRANGE(u)) {
        int idx = (((u32)u)-((u32)(&chunk[0]))) / (BLKSIZE * 4);
        u32 nxt = chunk[0].pointers.next;
        chunk[idx].pointers.next = nxt;
        chunk[idx].pointers.prev = 0;
        chunk[nxt].pointers.prev = idx;
        chunk[0].pointers.next = idx;
        total++;
    } else {
        u32* p = (u32*)(u - 4);
        free(p);
    }
}

u32 my_getsize(u32 u) {
    if (INRANGE(u)) {
        return BLKSIZE;
    } else {
        u32* p = (u32*)(u - 4);
        return p[0];
    }
}

void run() {
    void* ops[] = {&&op_0, &&op_1, &&op_2, &&op_3, &&op_4, &&op_5, &&op_6,
                   &&op_7, &&op_8, &&op_9, &&op_10, &&op_11, &&op_12, &&op_13};

    seg = codex;
    pc = 0;

    while(1) {
        u32 w = seg[pc++];
        u32 opcode = w >> 28;
        u32 a = (w >> 6) & 7;
        u32 b = (w >> 3) & 7;
        u32 c = w & 7;
        u32 sa = (w >> 25) & 7;
        u32 dat = (w << 7) >> 7;

        switch (opcode) {
            case 0:
            op_0:
                if (regs[c])
                    regs[a] = regs[b];

                instruction = *finger++; goto *ops[instruction >> 28];
            case 1:
                if (regs[b] == 0)
                    regs[a] = seg[regs[c]];
                else
                    regs[a] = ((u32 *)(regs[b]))[regs[c]];
                break;
            case 2:
                if (regs[a] == 0)
                    seg[regs[b]] = regs[c];
                else
                    ((u32 *)(regs[a]))[regs[b]] = regs[c];
                break;
            case 3:
                regs[a] = regs[b] + regs[c];
                break;
            case 4:
                regs[a] = regs[b] * regs[c];
                break;
            case 5:
                regs[a] = regs[b] / regs[c];
                break;
            case 6:
                regs[a] = (~(regs[b] & regs[c]));
                break;
            case 7:
                exit(0);
                return;
            case 8:
                regs[b] = my_alloc(regs[c]);
                break;
            case 9:
                my_free(regs[c]);
                break;
            case 10:
                printf("%c", regs[c]);
                break;
            case 11:
                regs[c] = getchar();
                if (regs[c] == EOF)
                    regs[c] = 0xffffffff;
                break;
            case 12:
                if (regs[b]!=0) {
                    if (seg != codex)
                        my_free((u32) seg);
                    int size = my_getsize(regs[b]);
                    seg = (u32*)my_alloc(size);
                    memcpy(seg, (void*)regs[b], size*4);
                }
                pc = regs[c];
                break;
            case 13:
                regs[sa] = dat;
        }
    }
}

int main(int argc, char**argv) {
    FILE* f = fopen(argv[1], "rb");
    assert(f);
    fseek(f, 0, SEEK_END);
    int cdxlen = ftell(f);
    rewind(f);
    codex = (u32 *)malloc(cdxlen + 4);
    fread(codex, 1, cdxlen, f);
    fclose(f);
    for (int i = 0; i < cdxlen / 4; i++)
        codex[i] = ntohl(codex[i]);
    my_init();
    run();

    return 0;
}

#13

Doesn’t compile at all, missing variables, unknown opcodes, etc. :]


#14

Don’t forget goto is really nice for implementing continuations. If you want to arbitrarily leave a method / record the stack then come back and go to where to left off.


#15

I suspect the FSM case here will get handled with become.

There is a generalization of Rust’s current gotos in FCP merge, actually:


#16

The best option is to implement this as an optimization of the loop/match pattern.

Specifically, add a modifier keyword/attribute to match statements that is only valid to use to match on a private C-like enum marked as “repr(jump)”, that results in the enum values being represented as the addresses of the match arms (it’s an error if multiple marked matches for that same enum are present).

This would be implemented using LLVM’s blockaddress and indirectbr.