mirror of
https://github.com/mirror/tinycc.git
synced 2024-12-28 04:00:06 +08:00
63193d1794
vswap() is called often enough and shows in profile and it was easy to hand optimize swapping vtop[-1] and vtop[0] - instead of large (28 bytes on i386) tmp variable and two memory to memory copies, let's swap areas by longs through registers with streamlined assembly. For $ ./tcc -B. -bench -DONE_SOURCE -DCONFIG_MULTIARCHDIR=\"i386-linux-gnu\" -c tcc.c before: # Overhead Command Shared Object Symbol # ........ ........... ................... .............................................. # 15.19% tcc tcc [.] next_nomacro1 5.19% tcc libc-2.13.so [.] _int_malloc 4.57% tcc tcc [.] next 3.36% tcc tcc [.] tok_str_add2 3.03% tcc tcc [.] macro_subst_tok 2.93% tcc tcc [.] macro_subst 2.53% tcc tcc [.] next_nomacro_spc 2.49% tcc tcc [.] vswap 2.36% tcc libc-2.13.so [.] _int_free │ ST_FUNC void vswap(void) │ { 1,96 │ push %edi 2,65 │ push %esi 1,08 │ sub $0x20,%esp │ SValue tmp; │ │ /* cannot let cpu flags if other instruction are generated. Also │ avoid leaving VT_JMP anywhere except on the top of the stack │ because it would complicate the code generator. */ │ if (vtop >= vstack) { 0,98 │ mov 0x8078cac,%eax │ cmp $0x8078d3c,%eax 1,18 │ ┌──jb 24 │ │ int v = vtop->r & VT_VALMASK; 1,08 │ │ mov 0x8(%eax),%edx 0,78 │ │ and $0x3f,%edx │ │ if (v == VT_CMP || (v & ~1) == VT_JMP) 0,78 │ │ cmp $0x33,%edx 0,69 │ │↓ je 54 0,59 │ │ and $0xfffffffe,%edx 0,49 │ │ cmp $0x34,%edx 0,29 │ │↓ je 54 │ │ gv(RC_INT); │ │ } │ │ tmp = vtop[0]; 1,08 │24:└─→lea 0x4(%esp),%edi 0,39 │ mov $0x7,%ecx │ mov %eax,%esi 14,41 │ rep movsl %ds:(%esi),%es:(%edi) │ vtop[0] = vtop[-1]; 9,51 │ lea -0x1c(%eax),%esi 1,96 │ mov $0x7,%cl │ mov %eax,%edi 17,06 │ rep movsl %ds:(%esi),%es:(%edi) │ vtop[-1] = tmp; 10,20 │ mov 0x8078cac,%edi 2,35 │ sub $0x1c,%edi 0,78 │ lea 0x4(%esp),%esi │ mov $0x7,%cl 15,20 │ rep movsl %ds:(%esi),%es:(%edi) │ } 9,90 │ add $0x20,%esp 2,25 │ pop %esi 1,67 │ pop %edi 0,69 │ ret after: # Overhead Command Shared Object Symbol # ........ ........... ................... 
.............................................. # 15.27% tcc tcc [.] next_nomacro1 5.08% tcc libc-2.13.so [.] _int_malloc 4.57% tcc tcc [.] next 3.17% tcc tcc [.] tok_str_add2 3.12% tcc tcc [.] macro_subst 2.99% tcc tcc [.] macro_subst_tok 2.43% tcc tcc [.] next_nomacro_spc 2.32% tcc libc-2.13.so [.] _int_free . . . 0.71% tcc tcc [.] vswap │ ST_FUNC void vswap(void) │ { 7,22 │ push %eax │ /* cannot let cpu flags if other instruction are generated. Also │ avoid leaving VT_JMP anywhere except on the top of the stack │ because it would complicate the code generator. */ │ if (vtop >= vstack) { 11,34 │ mov 0x8078cac,%eax 2,75 │ cmp $0x8078d3c,%eax 0,34 │ ┌──jb 20 │ │ int v = vtop->r & VT_VALMASK; 0,34 │ │ mov 0x8(%eax),%edx 8,93 │ │ and $0x3f,%edx │ │ if (v == VT_CMP || (v & ~1) == VT_JMP) 2,06 │ │ cmp $0x33,%edx 2,41 │ │↓ je 74 2,41 │ │ and $0xfffffffe,%edx 0,34 │ │ cmp $0x34,%edx 2,41 │ │↓ je 74 │ │ vtopl[-1*VSIZEL + i] = tmpl; \ │ │ } do {} while (0) │ │ │ │ VSWAPL(15); VSWAPL(14); VSWAPL(13); VSWAPL(12); │ │ VSWAPL(11); VSWAPL(10); VSWAPL( 9); VSWAPL( 8); │ │ VSWAPL( 7); VSWAPL( 6); VSWAPL( 5); VSWAPL( 4); 2,06 │20:└─→mov 0x18(%eax),%edx 1,37 │ mov -0x4(%eax),%ecx 2,06 │ mov %ecx,0x18(%eax) 1,37 │ mov %edx,-0x4(%eax) 2,06 │ mov 0x14(%eax),%edx 2,06 │ mov -0x8(%eax),%ecx 2,41 │ mov %ecx,0x14(%eax) 3,09 │ mov %edx,-0x8(%eax) 3,09 │ mov 0x10(%eax),%edx 1,72 │ mov -0xc(%eax),%ecx 2,75 │ mov %ecx,0x10(%eax) 1,72 │ mov %edx,-0xc(%eax) │ VSWAPL( 3); VSWAPL( 2); VSWAPL( 1); VSWAPL( 0); 2,41 │ mov 0xc(%eax),%edx 2,41 │ mov -0x10(%eax),%ecx 2,41 │ mov %ecx,0xc(%eax) 0,69 │ mov %edx,-0x10(%eax) 1,72 │ mov 0x8(%eax),%edx 0,69 │ mov -0x14(%eax),%ecx 1,03 │ mov %ecx,0x8(%eax) 1,37 │ mov %edx,-0x14(%eax) 1,37 │ mov 0x4(%eax),%edx 0,69 │ mov -0x18(%eax),%ecx 3,09 │ mov %ecx,0x4(%eax) 2,06 │ mov %edx,-0x18(%eax) 1,37 │ mov (%eax),%edx 2,41 │ mov -0x1c(%eax),%ecx 1,37 │ mov %ecx,(%eax) 4,12 │ mov %edx,-0x1c(%eax) │ } │ │ # undef VSWAPL │ # undef VSIZEL │ } 1,03 │ pop %eax 3,44 │ ret 
Overall speedup: # best of 5 runs before: 8268 idents, 47203 lines, 1526763 bytes, 0.148 s, 319217 lines/s, 10.3 MB/s after: 8273 idents, 47231 lines, 1527685 bytes, 0.146 s, 324092 lines/s, 10.5 MB/s Static ASSERT macro taken from CCAN's[1] build_assert[2] which is in the public domain. [1] http://ccodearchive.net/ [2] http://git.ozlabs.org/?p=ccan;a=blob;f=ccan/build_assert/build_assert.h;h=24e59c44cd930173178ac9b6e101b0af64a879e9;hb=HEAD |
||
---|---|---|
examples | ||
include | ||
lib | ||
tests | ||
tests2 | ||
win32 | ||
.gitignore | ||
arm-gen.c | ||
c67-gen.c | ||
Changelog | ||
coff.h | ||
configure | ||
COPYING | ||
elf.h | ||
i386-asm.c | ||
i386-asm.h | ||
i386-gen.c | ||
i386-tok.h | ||
il-gen.c | ||
il-opcodes.h | ||
libtcc.c | ||
libtcc.h | ||
Makefile | ||
README | ||
stab.def | ||
stab.h | ||
tcc-doc.texi | ||
tcc.c | ||
tcc.h | ||
tccasm.c | ||
tcccoff.c | ||
tccelf.c | ||
tccgen.c | ||
tccpe.c | ||
tccpp.c | ||
tccrun.c | ||
tcctok.h | ||
texi2pod.pl | ||
TODO | ||
VERSION | ||
x86_64-asm.h | ||
x86_64-gen.c |
Tiny C Compiler - C Scripting Everywhere - The Smallest ANSI C compiler ----------------------------------------------------------------------- Features: -------- - SMALL! You can compile and execute C code everywhere, for example on rescue disks. - FAST! tcc generates optimized x86 code. No byte code overhead. Compile, assemble and link about 7 times faster than 'gcc -O0'. - UNLIMITED! Any C dynamic library can be used directly. TCC is heading toward full ISO C99 compliance. TCC can of course compile itself. - SAFE! tcc includes an optional memory and bound checker. Bound checked code can be mixed freely with standard code. - Compile and execute C source directly. No linking or assembly necessary. Full C preprocessor included. - C script supported : just add '#!/usr/local/bin/tcc -run' at the first line of your C source, and execute it directly from the command line. Documentation: ------------- 1) Installation on an i386 Linux host (for Windows read tcc-win32.txt) ./configure make make test make install Alternatively, out-of-tree builds are supported: you may use different directories to hold build objects, kept separate from your source tree: mkdir _build cd _build ../configure make make test make install By default, tcc is installed in /usr/local/bin. ./configure --help shows configuration options. 2) Introduction We assume here that you know ANSI C. Look at the example ex1.c to know what the programs look like. The include file <tcclib.h> can be used if you want a small basic libc include support (especially useful for floppy disks). Of course, you can also use standard headers, although they are slower to compile. You can begin your C script with '#!/usr/local/bin/tcc -run' on the first line and set its execute bits (chmod a+x your_script). Then, you can launch the C code as a shell or perl script :-) The command line arguments are put in 'argc' and 'argv' of the main function, as in ANSI C. 3) Examples ex1.c: simplest example (hello world). 
Can also be launched directly as a script: './ex1.c'. ex2.c: more complicated example: find a number with the four operations given a list of numbers (benchmark). ex3.c: compute fibonacci numbers (benchmark). ex4.c: more complicated: X11 program. Very complicated test in fact because standard headers are being used ! As for ex1.c, can also be launched directly as a script: './ex4.c'. ex5.c: 'hello world' with standard glibc headers. tcc.c: TCC can of course compile itself. Used to check the code generator. tcctest.c: auto test for TCC which tests many subtle possible bugs. Used when doing 'make test'. 4) Full Documentation Please read tcc-doc.html to have all the features of TCC. Additional information is available for the Windows port in tcc-win32.txt. License: ------- TCC is distributed under the GNU Lesser General Public License (see COPYING file). Fabrice Bellard.