Go to file
Kirill Smelkov 63193d1794 Optimize vswap()
vswap() is called often enough that it shows up in the profile, and it
was easy to hand-optimize the swap of vtop[-1] and vtop[0]: instead of
using a large (28 bytes on i386) tmp variable and two memory-to-memory
copies, swap the areas long by long through registers, which compiles
to streamlined assembly.

For

    $ ./tcc -B. -bench -DONE_SOURCE -DCONFIG_MULTIARCHDIR=\"i386-linux-gnu\" -c tcc.c

before:

 # Overhead      Command        Shared Object                                          Symbol
 # ........  ...........  ...................  ..............................................
 #
     15.19%          tcc  tcc                  [.] next_nomacro1
      5.19%          tcc  libc-2.13.so         [.] _int_malloc
      4.57%          tcc  tcc                  [.] next
      3.36%          tcc  tcc                  [.] tok_str_add2
      3.03%          tcc  tcc                  [.] macro_subst_tok
      2.93%          tcc  tcc                  [.] macro_subst
      2.53%          tcc  tcc                  [.] next_nomacro_spc
      2.49%          tcc  tcc                  [.] vswap
      2.36%          tcc  libc-2.13.so         [.] _int_free

       │    ST_FUNC void vswap(void)
       │    {
  1,96 │      push   %edi
  2,65 │      push   %esi
  1,08 │      sub    $0x20,%esp
       │        SValue tmp;
       │
       │        /* cannot let cpu flags if other instruction are generated. Also
       │           avoid leaving VT_JMP anywhere except on the top of the stack
       │           because it would complicate the code generator. */
       │        if (vtop >= vstack) {
  0,98 │      mov    0x8078cac,%eax
       │      cmp    $0x8078d3c,%eax
  1,18 │   ┌──jb     24
       │   │        int v = vtop->r & VT_VALMASK;
  1,08 │   │  mov    0x8(%eax),%edx
  0,78 │   │  and    $0x3f,%edx
       │   │        if (v == VT_CMP || (v & ~1) == VT_JMP)
  0,78 │   │  cmp    $0x33,%edx
  0,69 │   │↓ je     54
  0,59 │   │  and    $0xfffffffe,%edx
  0,49 │   │  cmp    $0x34,%edx
  0,29 │   │↓ je     54
       │   │            gv(RC_INT);
       │   │    }
       │   │    tmp = vtop[0];
  1,08 │24:└─→lea    0x4(%esp),%edi
  0,39 │      mov    $0x7,%ecx
       │      mov    %eax,%esi
 14,41 │      rep    movsl %ds:(%esi),%es:(%edi)
       │        vtop[0] = vtop[-1];
  9,51 │      lea    -0x1c(%eax),%esi
  1,96 │      mov    $0x7,%cl
       │      mov    %eax,%edi
 17,06 │      rep    movsl %ds:(%esi),%es:(%edi)
       │        vtop[-1] = tmp;
 10,20 │      mov    0x8078cac,%edi
  2,35 │      sub    $0x1c,%edi
  0,78 │      lea    0x4(%esp),%esi
       │      mov    $0x7,%cl
 15,20 │      rep    movsl %ds:(%esi),%es:(%edi)
       │    }
  9,90 │      add    $0x20,%esp
  2,25 │      pop    %esi
  1,67 │      pop    %edi
  0,69 │      ret

after:

 # Overhead      Command        Shared Object                                          Symbol
 # ........  ...........  ...................  ..............................................
 #
     15.27%          tcc  tcc                  [.] next_nomacro1
      5.08%          tcc  libc-2.13.so         [.] _int_malloc
      4.57%          tcc  tcc                  [.] next
      3.17%          tcc  tcc                  [.] tok_str_add2
      3.12%          tcc  tcc                  [.] macro_subst
      2.99%          tcc  tcc                  [.] macro_subst_tok
      2.43%          tcc  tcc                  [.] next_nomacro_spc
      2.32%          tcc  libc-2.13.so         [.] _int_free

      . . .

      0.71%          tcc  tcc                  [.] vswap

       │    ST_FUNC void vswap(void)
       │    {
  7,22 │      push   %eax
       │        /* cannot let cpu flags if other instruction are generated. Also
       │           avoid leaving VT_JMP anywhere except on the top of the stack
       │           because it would complicate the code generator. */
       │        if (vtop >= vstack) {
 11,34 │      mov    0x8078cac,%eax
  2,75 │      cmp    $0x8078d3c,%eax
  0,34 │   ┌──jb     20
       │   │        int v = vtop->r & VT_VALMASK;
  0,34 │   │  mov    0x8(%eax),%edx
  8,93 │   │  and    $0x3f,%edx
       │   │        if (v == VT_CMP || (v & ~1) == VT_JMP)
  2,06 │   │  cmp    $0x33,%edx
  2,41 │   │↓ je     74
  2,41 │   │  and    $0xfffffffe,%edx
  0,34 │   │  cmp    $0x34,%edx
  2,41 │   │↓ je     74
       │   │        vtopl[-1*VSIZEL + i] = tmpl;    \
       │   │      } do {} while (0)
       │   │
       │   │    VSWAPL(15); VSWAPL(14); VSWAPL(13); VSWAPL(12);
       │   │    VSWAPL(11); VSWAPL(10); VSWAPL( 9); VSWAPL( 8);
       │   │    VSWAPL( 7); VSWAPL( 6); VSWAPL( 5); VSWAPL( 4);
  2,06 │20:└─→mov    0x18(%eax),%edx
  1,37 │      mov    -0x4(%eax),%ecx
  2,06 │      mov    %ecx,0x18(%eax)
  1,37 │      mov    %edx,-0x4(%eax)
  2,06 │      mov    0x14(%eax),%edx
  2,06 │      mov    -0x8(%eax),%ecx
  2,41 │      mov    %ecx,0x14(%eax)
  3,09 │      mov    %edx,-0x8(%eax)
  3,09 │      mov    0x10(%eax),%edx
  1,72 │      mov    -0xc(%eax),%ecx
  2,75 │      mov    %ecx,0x10(%eax)
  1,72 │      mov    %edx,-0xc(%eax)
       │        VSWAPL( 3); VSWAPL( 2); VSWAPL( 1); VSWAPL( 0);
  2,41 │      mov    0xc(%eax),%edx
  2,41 │      mov    -0x10(%eax),%ecx
  2,41 │      mov    %ecx,0xc(%eax)
  0,69 │      mov    %edx,-0x10(%eax)
  1,72 │      mov    0x8(%eax),%edx
  0,69 │      mov    -0x14(%eax),%ecx
  1,03 │      mov    %ecx,0x8(%eax)
  1,37 │      mov    %edx,-0x14(%eax)
  1,37 │      mov    0x4(%eax),%edx
  0,69 │      mov    -0x18(%eax),%ecx
  3,09 │      mov    %ecx,0x4(%eax)
  2,06 │      mov    %edx,-0x18(%eax)
  1,37 │      mov    (%eax),%edx
  2,41 │      mov    -0x1c(%eax),%ecx
  1,37 │      mov    %ecx,(%eax)
  4,12 │      mov    %edx,-0x1c(%eax)
       │        }
       │
       │    #   undef VSWAPL
       │    #   undef VSIZEL
       │    }
  1,03 │      pop    %eax
  3,44 │      ret

Overall speedup:

    # best of 5 runs
    before: 8268 idents, 47203 lines, 1526763 bytes, 0.148 s, 319217 lines/s, 10.3 MB/s
    after:  8273 idents, 47231 lines, 1527685 bytes, 0.146 s, 324092 lines/s, 10.5 MB/s

The static ASSERT macro is taken from CCAN's[1] build_assert[2], which is
in the public domain.

[1] http://ccodearchive.net/
[2] http://git.ozlabs.org/?p=ccan;a=blob;f=ccan/build_assert/build_assert.h;h=24e59c44cd930173178ac9b6e101b0af64a879e9;hb=HEAD
2012-12-21 20:46:26 +04:00
examples Revert "Make ex1.c and ex4.c be executable on any systems" 2012-06-12 15:45:13 +02:00
include Remove semicolon in x86-64 va_arg definition. 2011-08-05 20:32:57 +02:00
lib build: fix VPATH builds 2012-12-18 10:06:20 +01:00
tests build: fix VPATH builds 2012-12-18 10:06:20 +01:00
tests2 build: fix VPATH builds 2012-12-18 10:06:20 +01:00
win32 win32: build-tcc.bat: get rid of hardcoded VERSION string 2012-12-20 21:20:54 +01:00
.gitignore tests: Add tests for compile/run tcc.c with tcc -b then compile tcc.c again, then run tcctest.c 2012-12-09 19:43:40 +04:00
arm-gen.c Fix OABI calling convention 2012-11-28 22:26:39 +01:00
c67-gen.c rename error/warning -> tcc_(error/warning) 2011-08-11 17:07:56 +02:00
Changelog Add support for arm hardfloat calling convention 2012-06-05 23:09:55 +02:00
coff.h C67 COFF executable format support (TK) 2004-10-05 22:33:55 +00:00
configure configure: support absolete out-of-tree builds 2012-12-21 13:57:22 +01:00
COPYING changed license to LGPL 2003-05-24 14:18:56 +00:00
elf.h Add support for R_ARM_THM_{JUMP24,CALL} relocs 2012-10-28 19:55:12 +01:00
i386-asm.c rename error/warning -> tcc_(error/warning) 2011-08-11 17:07:56 +02:00
i386-asm.h i386-asm: support "pause" opcode 2011-02-24 09:38:13 -08:00
i386-gen.c i386: We can change 'lea 0(%ebp),r' to 'mov %ebp,r' 2012-11-16 10:22:45 +04:00
i386-tok.h integrate x86_64-asm.c into i386-asm.c 2009-12-19 22:16:20 +01:00
il-gen.c rename error/warning -> tcc_(error/warning) 2011-08-11 17:07:56 +02:00
il-opcodes.h added CIL target 2002-02-10 16:14:03 +00:00
libtcc.c fix #include_next infinite loop bug, see http://savannah.nongnu.org/bugs/?31357 2012-09-20 22:12:05 +03:00
libtcc.h tccrun: another incompatible change to the tcc_relocate API 2012-09-01 11:33:34 +02:00
Makefile build: fix out-of-tree install 2012-12-21 14:23:28 +01:00
README configure: support absolete out-of-tree builds 2012-12-21 13:57:22 +01:00
stab.def added 2002-12-08 14:36:36 +00:00
stab.h added 2002-12-08 14:36:36 +00:00
tcc-doc.texi Inform user that -b only exists on i386. 2012-03-13 19:43:43 +01:00
tcc.c tcc.c: fix argv index for parse_args 2012-06-12 15:32:44 +02:00
tcc.h Optimize vswap() 2012-12-21 20:46:26 +04:00
tccasm.c Compile tccasm.c conditionally (TCC_CONFIG_ASM) 2012-01-06 18:34:21 +01:00
tcccoff.c rename error/warning -> tcc_(error/warning) 2011-08-11 17:07:56 +02:00
tccelf.c Generate PLT thumb stub only when necessary 2012-11-17 10:01:11 +01:00
tccgen.c Optimize vswap() 2012-12-21 20:46:26 +04:00
tccpe.c pe: fix tcc not linking to user32 and gdi32 2012-11-02 16:59:21 +08:00
tccpp.c Optimize cstr_reset() to only reset string to empty, not call free() and later malloc() 2012-12-21 20:46:26 +04:00
tccrun.c tccrun: another incompatible change to the tcc_relocate API 2012-09-01 11:33:34 +02:00
tcctok.h tcctok.h: fix ifdef target/host confusion 2011-04-12 00:11:47 -07:00
texi2pod.pl automatic man page generation from tcc-doc.texi 2003-05-18 18:11:06 +00:00
TODO re-apply VLA by Thomas Preud'homme 2011-04-06 09:17:03 -07:00
VERSION update Changelog, bump version: 0.9.25 2009-05-11 19:01:26 +02:00
x86_64-asm.h x86-64: fix udiv, add cqto instruction 2009-12-19 22:16:19 +01:00
x86_64-gen.c x86-64: Fix call saved register restore 2012-06-10 09:01:26 +02:00

Tiny C Compiler - C Scripting Everywhere - The Smallest ANSI C compiler
-----------------------------------------------------------------------

Features:
--------

- SMALL! You can compile and execute C code everywhere, for example on
  rescue disks.

- FAST! tcc generates optimized x86 code. No byte code
  overhead. Compile, assemble and link about 7 times faster than 'gcc
  -O0'.

- UNLIMITED! Any C dynamic library can be used directly. TCC is
  heading toward full ISO C99 compliance. TCC can of course compile
  itself.

- SAFE! tcc includes an optional memory and bound checker. Bound
  checked code can be mixed freely with standard code.

- Compile and execute C source directly. No linking or assembly
  necessary. Full C preprocessor included. 

- C script supported : just add '#!/usr/local/bin/tcc -run' at the first
  line of your C source, and execute it directly from the command
  line.

Documentation:
-------------

1) Installation on an i386 Linux host (for Windows read tcc-win32.txt)

   ./configure
   make
   make test
   make install

Alternatively, out-of-tree builds are supported: you may use different
directories to hold build objects, kept separate from your source tree:

   mkdir _build
   cd _build
   ../configure
   make
   make test
   make install

By default, tcc is installed in /usr/local/bin.
./configure --help  shows configuration options.


2) Introduction

We assume here that you know ANSI C. Look at the example ex1.c to see
what the programs look like.

The include file <tcclib.h> can be used if you want a small basic libc
include support (especially useful for floppy disks). Of course, you
can also use standard headers, although they are slower to compile.

You can begin your C script with '#!/usr/local/bin/tcc -run' on the first
line and set its execute bits (chmod a+x your_script). Then, you can
launch the C code as a shell or perl script :-) The command line
arguments are passed in 'argc' and 'argv' of the main function, as in
ANSI C.

3) Examples

ex1.c: simplest example (hello world). Can also be launched directly
as a script: './ex1.c'.

ex2.c: more complicated example: find a number with the four
operations given a list of numbers (benchmark).

ex3.c: compute fibonacci numbers (benchmark).

ex4.c: more complicated: X11 program. Very complicated test in fact
because standard headers are being used ! As for ex1.c, can also be launched
directly as a script: './ex4.c'.

ex5.c: 'hello world' with standard glibc headers.

tcc.c: TCC can of course compile itself. Used to check the code
generator.

tcctest.c: auto test for TCC which tests many subtle possible bugs. Used
when doing 'make test'.

4) Full Documentation

Please read tcc-doc.html to learn about all the features of TCC.

Additional information is available for the Windows port in tcc-win32.txt.

License:
-------

TCC is distributed under the GNU Lesser General Public License (see
COPYING file).

Fabrice Bellard.