Mirror of https://github.com/mirror/tinycc.git, synced 2024-12-30 04:10:08 +08:00
Optimize vswap()
vswap() is called often enough to show up in profiles, and it was easy to
hand-optimize the swap of vtop[-1] and vtop[0]: instead of a large (28 bytes
on i386) tmp variable and two memory-to-memory copies, swap the two areas long
by long through registers, which produces streamlined assembly.

For

    $ ./tcc -B. -bench -DONE_SOURCE -DCONFIG_MULTIARCHDIR=\"i386-linux-gnu\" -c tcc.c

before:

    # Overhead  Command  Shared Object  Symbol
    # ........  .......  .............  ..............................
    #
       15.19%   tcc      tcc            [.] next_nomacro1
        5.19%   tcc      libc-2.13.so   [.] _int_malloc
        4.57%   tcc      tcc            [.] next
        3.36%   tcc      tcc            [.] tok_str_add2
        3.03%   tcc      tcc            [.] macro_subst_tok
        2.93%   tcc      tcc            [.] macro_subst
        2.53%   tcc      tcc            [.] next_nomacro_spc
        2.49%   tcc      tcc            [.] vswap
        2.36%   tcc      libc-2.13.so   [.] _int_free

          │    ST_FUNC void vswap(void)
          │    {
     1,96 │      push   %edi
     2,65 │      push   %esi
     1,08 │      sub    $0x20,%esp
          │        SValue tmp;
          │
          │        /* cannot let cpu flags if other instruction are generated. Also
          │           avoid leaving VT_JMP anywhere except on the top of the stack
          │           because it would complicate the code generator. */
          │        if (vtop >= vstack) {
     0,98 │      mov    0x8078cac,%eax
          │      cmp    $0x8078d3c,%eax
     1,18 │   ┌──jb     24
          │   │        int v = vtop->r & VT_VALMASK;
     1,08 │   │  mov    0x8(%eax),%edx
     0,78 │   │  and    $0x3f,%edx
          │   │        if (v == VT_CMP || (v & ~1) == VT_JMP)
     0,78 │   │  cmp    $0x33,%edx
     0,69 │   │↓ je     54
     0,59 │   │  and    $0xfffffffe,%edx
     0,49 │   │  cmp    $0x34,%edx
     0,29 │   │↓ je     54
          │   │            gv(RC_INT);
          │   │        }
          │   │    tmp = vtop[0];
     1,08 │24:└─→lea    0x4(%esp),%edi
     0,39 │      mov    $0x7,%ecx
          │      mov    %eax,%esi
    14,41 │      rep movsl %ds:(%esi),%es:(%edi)
          │        vtop[0] = vtop[-1];
     9,51 │      lea    -0x1c(%eax),%esi
     1,96 │      mov    $0x7,%cl
          │      mov    %eax,%edi
    17,06 │      rep movsl %ds:(%esi),%es:(%edi)
          │        vtop[-1] = tmp;
    10,20 │      mov    0x8078cac,%edi
     2,35 │      sub    $0x1c,%edi
     0,78 │      lea    0x4(%esp),%esi
          │      mov    $0x7,%cl
    15,20 │      rep movsl %ds:(%esi),%es:(%edi)
          │    }
     9,90 │      add    $0x20,%esp
     2,25 │      pop    %esi
     1,67 │      pop    %edi
     0,69 │      ret

after:

    # Overhead  Command  Shared Object  Symbol
    # ........  .......  .............  ..............................
    #
       15.27%   tcc      tcc            [.] next_nomacro1
        5.08%   tcc      libc-2.13.so   [.] _int_malloc
        4.57%   tcc      tcc            [.] next
        3.17%   tcc      tcc            [.] tok_str_add2
        3.12%   tcc      tcc            [.] macro_subst
        2.99%   tcc      tcc            [.] macro_subst_tok
        2.43%   tcc      tcc            [.] next_nomacro_spc
        2.32%   tcc      libc-2.13.so   [.] _int_free
        .
        .
        .
        0.71%   tcc      tcc            [.] vswap

          │    ST_FUNC void vswap(void)
          │    {
     7,22 │      push   %eax
          │        /* cannot let cpu flags if other instruction are generated. Also
          │           avoid leaving VT_JMP anywhere except on the top of the stack
          │           because it would complicate the code generator.
          │        */
          │        if (vtop >= vstack) {
    11,34 │      mov    0x8078cac,%eax
     2,75 │      cmp    $0x8078d3c,%eax
     0,34 │   ┌──jb     20
          │   │        int v = vtop->r & VT_VALMASK;
     0,34 │   │  mov    0x8(%eax),%edx
     8,93 │   │  and    $0x3f,%edx
          │   │        if (v == VT_CMP || (v & ~1) == VT_JMP)
     2,06 │   │  cmp    $0x33,%edx
     2,41 │   │↓ je     74
     2,41 │   │  and    $0xfffffffe,%edx
     0,34 │   │  cmp    $0x34,%edx
     2,41 │   │↓ je     74
          │   │            vtopl[-1*VSIZEL + i] = tmpl;   \
          │   │        } do {} while (0)
          │   │
          │   │        VSWAPL(15); VSWAPL(14); VSWAPL(13); VSWAPL(12);
          │   │        VSWAPL(11); VSWAPL(10); VSWAPL( 9); VSWAPL( 8);
          │   │        VSWAPL( 7); VSWAPL( 6); VSWAPL( 5); VSWAPL( 4);
     2,06 │20:└─→mov    0x18(%eax),%edx
     1,37 │      mov    -0x4(%eax),%ecx
     2,06 │      mov    %ecx,0x18(%eax)
     1,37 │      mov    %edx,-0x4(%eax)
     2,06 │      mov    0x14(%eax),%edx
     2,06 │      mov    -0x8(%eax),%ecx
     2,41 │      mov    %ecx,0x14(%eax)
     3,09 │      mov    %edx,-0x8(%eax)
     3,09 │      mov    0x10(%eax),%edx
     1,72 │      mov    -0xc(%eax),%ecx
     2,75 │      mov    %ecx,0x10(%eax)
     1,72 │      mov    %edx,-0xc(%eax)
          │        VSWAPL( 3); VSWAPL( 2); VSWAPL( 1); VSWAPL( 0);
     2,41 │      mov    0xc(%eax),%edx
     2,41 │      mov    -0x10(%eax),%ecx
     2,41 │      mov    %ecx,0xc(%eax)
     0,69 │      mov    %edx,-0x10(%eax)
     1,72 │      mov    0x8(%eax),%edx
     0,69 │      mov    -0x14(%eax),%ecx
     1,03 │      mov    %ecx,0x8(%eax)
     1,37 │      mov    %edx,-0x14(%eax)
     1,37 │      mov    0x4(%eax),%edx
     0,69 │      mov    -0x18(%eax),%ecx
     3,09 │      mov    %ecx,0x4(%eax)
     2,06 │      mov    %edx,-0x18(%eax)
     1,37 │      mov    (%eax),%edx
     2,41 │      mov    -0x1c(%eax),%ecx
     1,37 │      mov    %ecx,(%eax)
     4,12 │      mov    %edx,-0x1c(%eax)
          │        }
          │
          │    # undef VSWAPL
          │    # undef VSIZEL
          │    }
     1,03 │      pop    %eax
     3,44 │      ret

Overall speedup (best of 5 runs):

    before: 8268 idents, 47203 lines, 1526763 bytes, 0.148 s, 319217 lines/s, 10.3 MB/s
    after:  8273 idents, 47231 lines, 1527685 bytes, 0.146 s, 324092 lines/s, 10.5 MB/s

The static ASSERT macro is taken from CCAN's[1] build_assert[2], which is in
the public domain.

[1] http://ccodearchive.net/
[2] http://git.ozlabs.org/?p=ccan;a=blob;f=ccan/build_assert/build_assert.h;h=24e59c44cd930173178ac9b6e101b0af64a879e9;hb=HEAD
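To make the idea in the message above concrete, here is a minimal standalone
sketch of "swap two adjacent stack slots long by long through registers"
written as a plain loop. It is illustrative only and assumes a made-up
struct slot with invented field names as a stand-in for tcc's SValue; it also
assumes the struct size is a multiple of sizeof(unsigned long), which is what
the patch's _STATIC_ASSERT enforces for SValue:

    /* sketch.c - illustrative only; "struct slot" is NOT tcc's SValue */
    #include <stdio.h>

    struct slot { int t, r, r2; long c; void *sym; int pad[2]; }; /* 28 bytes on i386 */

    #define NWORDS ((int)(sizeof(struct slot) / sizeof(unsigned long)))

    static void swap_slots_loop(struct slot *top)   /* top plays the role of vtop */
    {
        /* view the two topmost slots as machine words and exchange them
           word by word through a register-sized temporary, instead of
           copying whole structs through a large tmp variable */
        unsigned long *w = (unsigned long *)top;
        int i;
        for (i = 0; i < NWORDS; i++) {
            unsigned long t = w[i];     /* word i of top[0]  */
            w[i] = w[i - NWORDS];       /* word i of top[-1] */
            w[i - NWORDS] = t;
        }
    }

    int main(void)
    {
        struct slot stack[2] = { { 1 }, { 2 } };
        swap_slots_loop(&stack[1]);
        printf("%d %d\n", stack[0].t, stack[1].t);  /* prints "2 1" */
        return 0;
    }

The actual patch below unrolls this loop at compile time with a switch on the
word count, so no loop counter survives into the generated code.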
parent 8eb92e6052
commit 63193d1794
tcc.h (4 lines changed)

@@ -228,6 +228,10 @@
 #define true 1
 typedef int BOOL;
 
+#ifndef _STATIC_ASSERT
+#define _STATIC_ASSERT(cond) do { (void) sizeof(char [1 - 2*!(cond)]); } while(0)
+#endif
+
 #define INCLUDE_STACK_SIZE 32
 #define IFDEF_STACK_SIZE 64
 #define VSTACK_SIZE 256
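The _STATIC_ASSERT added above works by giving a char array a size of 1 when
the condition holds and -1 when it does not, so a false condition is rejected
at compile time. A small hypothetical usage example (not part of the patch):

    #include <stdio.h>

    /* same definition as the one added to tcc.h above */
    #define _STATIC_ASSERT(cond) do { (void) sizeof(char [1 - 2*!(cond)]); } while(0)

    int main(void)
    {
        _STATIC_ASSERT(sizeof(long) >= sizeof(int));      /* true: char[1], compiles    */
        /* _STATIC_ASSERT(sizeof(char) == 2); */          /* false: char[-1], rejected  */
        puts("compile-time checks passed");
        return 0;
    }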
tccgen.c (34 lines changed)

@@ -458,8 +458,6 @@ static void vseti(int r, int v)
 
 ST_FUNC void vswap(void)
 {
-    SValue tmp;
-
     /* cannot let cpu flags if other instruction are generated. Also
        avoid leaving VT_JMP anywhere except on the top of the stack
        because it would complicate the code generator. */
@@ -468,9 +466,35 @@ ST_FUNC void vswap(void)
         if (v == VT_CMP || (v & ~1) == VT_JMP)
             gv(RC_INT);
     }
-    tmp = vtop[0];
-    vtop[0] = vtop[-1];
-    vtop[-1] = tmp;
+    /*
+     * vtop[0], vtop[-1] = vtop[-1], vtop[0]
+     *
+     * vswap is called often and exchanging vtop[0] vs vtop[-1] is hot on
+     * profile, so it is hand optimized
+     */
+    unsigned long *vtopl = (unsigned long *)vtop;
+# define VSIZEL (sizeof(*vtop) / sizeof(*vtopl))
+
+    _STATIC_ASSERT( VSIZEL*sizeof(*vtopl) == sizeof(*vtop) );
+    _STATIC_ASSERT( VSIZEL <= 16 );     /* should be enough */
+    switch (VSIZEL) {
+# define VSWAPL(i)                              \
+        case i+1: {                             \
+            unsigned long tmpl;                 \
+            tmpl = vtopl[i];                    \
+            vtopl[i] = vtopl[-1*VSIZEL + i];    \
+            vtopl[-1*VSIZEL + i] = tmpl;        \
+        } do {} while (0)
+
+        VSWAPL(15); VSWAPL(14); VSWAPL(13); VSWAPL(12);
+        VSWAPL(11); VSWAPL(10); VSWAPL( 9); VSWAPL( 8);
+        VSWAPL( 7); VSWAPL( 6); VSWAPL( 5); VSWAPL( 4);
+        VSWAPL( 3); VSWAPL( 2); VSWAPL( 1); VSWAPL( 0);
+    }
+
+# undef VSWAPL
+# undef VSIZEL
 }
 
 ST_FUNC void vpushv(SValue *v)