Optimize vswap()

vswap() is called often enough and shows in profile and it was easy to
hand optimize swapping vtop[-1] and vtop[0] - instead of large (28 bytes
on i386) tmp variable and two memory to memory copies, let's swap areas
by longs through registers with streamlined assembly.

For

    $ ./tcc -B. -bench -DONE_SOURCE -DCONFIG_MULTIARCHDIR=\"i386-linux-gnu\" -c tcc.c

before:

 # Overhead      Command        Shared Object                                          Symbol
 # ........  ...........  ...................  ..............................................
 #
     15.19%          tcc  tcc                  [.] next_nomacro1
      5.19%          tcc  libc-2.13.so         [.] _int_malloc
      4.57%          tcc  tcc                  [.] next
      3.36%          tcc  tcc                  [.] tok_str_add2
      3.03%          tcc  tcc                  [.] macro_subst_tok
      2.93%          tcc  tcc                  [.] macro_subst
      2.53%          tcc  tcc                  [.] next_nomacro_spc
      2.49%          tcc  tcc                  [.] vswap
      2.36%          tcc  libc-2.13.so         [.] _int_free

       │    ST_FUNC void vswap(void)
       │    {
  1,96 │      push   %edi
  2,65 │      push   %esi
  1,08 │      sub    $0x20,%esp
       │        SValue tmp;
       │
       │        /* cannot let cpu flags if other instruction are generated. Also
       │           avoid leaving VT_JMP anywhere except on the top of the stack
       │           because it would complicate the code generator. */
       │        if (vtop >= vstack) {
  0,98 │      mov    0x8078cac,%eax
       │      cmp    $0x8078d3c,%eax
  1,18 │   ┌──jb     24
       │   │        int v = vtop->r & VT_VALMASK;
  1,08 │   │  mov    0x8(%eax),%edx
  0,78 │   │  and    $0x3f,%edx
       │   │        if (v == VT_CMP || (v & ~1) == VT_JMP)
  0,78 │   │  cmp    $0x33,%edx
  0,69 │   │↓ je     54
  0,59 │   │  and    $0xfffffffe,%edx
  0,49 │   │  cmp    $0x34,%edx
  0,29 │   │↓ je     54
       │   │            gv(RC_INT);
       │   │    }
       │   │    tmp = vtop[0];
  1,08 │24:└─→lea    0x4(%esp),%edi
  0,39 │      mov    $0x7,%ecx
       │      mov    %eax,%esi
 14,41 │      rep    movsl %ds:(%esi),%es:(%edi)
       │        vtop[0] = vtop[-1];
  9,51 │      lea    -0x1c(%eax),%esi
  1,96 │      mov    $0x7,%cl
       │      mov    %eax,%edi
 17,06 │      rep    movsl %ds:(%esi),%es:(%edi)
       │        vtop[-1] = tmp;
 10,20 │      mov    0x8078cac,%edi
  2,35 │      sub    $0x1c,%edi
  0,78 │      lea    0x4(%esp),%esi
       │      mov    $0x7,%cl
 15,20 │      rep    movsl %ds:(%esi),%es:(%edi)
       │    }
  9,90 │      add    $0x20,%esp
  2,25 │      pop    %esi
  1,67 │      pop    %edi
  0,69 │      ret

after:

 # Overhead      Command        Shared Object                                          Symbol
 # ........  ...........  ...................  ..............................................
 #
     15.27%          tcc  tcc                  [.] next_nomacro1
      5.08%          tcc  libc-2.13.so         [.] _int_malloc
      4.57%          tcc  tcc                  [.] next
      3.17%          tcc  tcc                  [.] tok_str_add2
      3.12%          tcc  tcc                  [.] macro_subst
      2.99%          tcc  tcc                  [.] macro_subst_tok
      2.43%          tcc  tcc                  [.] next_nomacro_spc
      2.32%          tcc  libc-2.13.so         [.] _int_free

      . . .

      0.71%          tcc  tcc                  [.] vswap

       │    ST_FUNC void vswap(void)
       │    {
  7,22 │      push   %eax
       │        /* cannot let cpu flags if other instruction are generated. Also
       │           avoid leaving VT_JMP anywhere except on the top of the stack
       │           because it would complicate the code generator. */
       │        if (vtop >= vstack) {
 11,34 │      mov    0x8078cac,%eax
  2,75 │      cmp    $0x8078d3c,%eax
  0,34 │   ┌──jb     20
       │   │        int v = vtop->r & VT_VALMASK;
  0,34 │   │  mov    0x8(%eax),%edx
  8,93 │   │  and    $0x3f,%edx
       │   │        if (v == VT_CMP || (v & ~1) == VT_JMP)
  2,06 │   │  cmp    $0x33,%edx
  2,41 │   │↓ je     74
  2,41 │   │  and    $0xfffffffe,%edx
  0,34 │   │  cmp    $0x34,%edx
  2,41 │   │↓ je     74
       │   │        vtopl[-1*VSIZEL + i] = tmpl;    \
       │   │      } do {} while (0)
       │   │
       │   │    VSWAPL(15); VSWAPL(14); VSWAPL(13); VSWAPL(12);
       │   │    VSWAPL(11); VSWAPL(10); VSWAPL( 9); VSWAPL( 8);
       │   │    VSWAPL( 7); VSWAPL( 6); VSWAPL( 5); VSWAPL( 4);
  2,06 │20:└─→mov    0x18(%eax),%edx
  1,37 │      mov    -0x4(%eax),%ecx
  2,06 │      mov    %ecx,0x18(%eax)
  1,37 │      mov    %edx,-0x4(%eax)
  2,06 │      mov    0x14(%eax),%edx
  2,06 │      mov    -0x8(%eax),%ecx
  2,41 │      mov    %ecx,0x14(%eax)
  3,09 │      mov    %edx,-0x8(%eax)
  3,09 │      mov    0x10(%eax),%edx
  1,72 │      mov    -0xc(%eax),%ecx
  2,75 │      mov    %ecx,0x10(%eax)
  1,72 │      mov    %edx,-0xc(%eax)
       │        VSWAPL( 3); VSWAPL( 2); VSWAPL( 1); VSWAPL( 0);
  2,41 │      mov    0xc(%eax),%edx
  2,41 │      mov    -0x10(%eax),%ecx
  2,41 │      mov    %ecx,0xc(%eax)
  0,69 │      mov    %edx,-0x10(%eax)
  1,72 │      mov    0x8(%eax),%edx
  0,69 │      mov    -0x14(%eax),%ecx
  1,03 │      mov    %ecx,0x8(%eax)
  1,37 │      mov    %edx,-0x14(%eax)
  1,37 │      mov    0x4(%eax),%edx
  0,69 │      mov    -0x18(%eax),%ecx
  3,09 │      mov    %ecx,0x4(%eax)
  2,06 │      mov    %edx,-0x18(%eax)
  1,37 │      mov    (%eax),%edx
  2,41 │      mov    -0x1c(%eax),%ecx
  1,37 │      mov    %ecx,(%eax)
  4,12 │      mov    %edx,-0x1c(%eax)
       │        }
       │
       │    #   undef VSWAPL
       │    #   undef VSIZEL
       │    }
  1,03 │      pop    %eax
  3,44 │      ret

Overal speedup:

    # best of 5 runs
    before: 8268 idents, 47203 lines, 1526763 bytes, 0.148 s, 319217 lines/s, 10.3 MB/s
    after:  8273 idents, 47231 lines, 1527685 bytes, 0.146 s, 324092 lines/s, 10.5 MB/s

Static ASSERT macro taken from CCAN's[1] build_assert[2] which is in
public domain.

[1] http://ccodearchive.net/
[2] http://git.ozlabs.org/?p=ccan;a=blob;f=ccan/build_assert/build_assert.h;h=24e59c44cd930173178ac9b6e101b0af64a879e9;hb=HEAD
This commit is contained in:
Kirill Smelkov 2012-12-21 13:55:01 +04:00
parent 8eb92e6052
commit 63193d1794
2 changed files with 33 additions and 5 deletions

4
tcc.h
View File

@ -228,6 +228,10 @@
#define true 1
typedef int BOOL;
#ifndef _STATIC_ASSERT
#define _STATIC_ASSERT(cond) do { (void) sizeof(char [1 - 2*!(cond)]); } while(0)
#endif
#define INCLUDE_STACK_SIZE 32
#define IFDEF_STACK_SIZE 64
#define VSTACK_SIZE 256

View File

@ -458,8 +458,6 @@ static void vseti(int r, int v)
ST_FUNC void vswap(void)
{
SValue tmp;
/* cannot let cpu flags if other instruction are generated. Also
avoid leaving VT_JMP anywhere except on the top of the stack
because it would complicate the code generator. */
@ -468,9 +466,35 @@ ST_FUNC void vswap(void)
if (v == VT_CMP || (v & ~1) == VT_JMP)
gv(RC_INT);
}
tmp = vtop[0];
vtop[0] = vtop[-1];
vtop[-1] = tmp;
/*
* vtop[0], vtop[-1] = vtop[-1], vtop[0]
*
* vswap is called often and exchanging vtop[0] vs vtop[-1] is hot on
* profile, so it is hand optimized
*/
unsigned long *vtopl = (unsigned long *)vtop;
# define VSIZEL (sizeof(*vtop) / sizeof(*vtopl))
_STATIC_ASSERT( VSIZEL*sizeof(*vtopl) == sizeof(*vtop) );
_STATIC_ASSERT( VSIZEL <= 16 ); /* should be enough */
switch(VSIZEL) {
# define VSWAPL(i) \
case i+1: { \
unsigned long tmpl; \
tmpl = vtopl[i]; \
vtopl[i] = vtopl[-1*VSIZEL + i]; \
vtopl[-1*VSIZEL + i] = tmpl; \
} do {} while (0)
VSWAPL(15); VSWAPL(14); VSWAPL(13); VSWAPL(12);
VSWAPL(11); VSWAPL(10); VSWAPL( 9); VSWAPL( 8);
VSWAPL( 7); VSWAPL( 6); VSWAPL( 5); VSWAPL( 4);
VSWAPL( 3); VSWAPL( 2); VSWAPL( 1); VSWAPL( 0);
}
# undef VSWAPL
# undef VSIZEL
}
ST_FUNC void vpushv(SValue *v)