From 8545dfb3ea301f5c77626a046d4756ef9f2e4970 Mon Sep 17 00:00:00 2001 From: Alexander Popov Date: Mon, 20 Jun 2022 11:12:58 +0200 Subject: [PATCH] Fix DoNotOptimize() GCC copy overhead (#1340) (#1410) * Fix DoNotOptimize() GCC copy overhead (#1340) The issue is that GCC DoNotOptimize() does a full copy of an argument if it's not a pointer and it slows down a benchmark. If an argument is big enough there is a memcpy() call for copying the argument. An argument object can be a big object so DoNotOptimize() could add sufficient overhead and affects benchmark results. The cause is in GCC behavior with asm volatile constraints. Looks like GCC trying to use r(register) constraint for all cases despite object size. See: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105519 The solution is the split DoNotOptimize() in two cases - value fits in register and value doesn't fit in register. And use case specific asm constraint. std::is_trivially_copyable trait is needed because "+r" constraint doesn't work with non trivial copyable objects. - Fix requires support C++11 feature std::is_trivially_copyable from GCC compiler. The feature has been supported since GCC 5 - Fallback for GCC version < 5 still exists but it uses "m" constraint which means a little bit more overhead in some cases - Add assembly tests for issued cases Fixes #1340 * Add supported compiler versions info for assembly tests - Assembly tests are inherently non-portable. So explicitly add GCC and Clang versions required for reliable tests passed - Write a warning message if the current compiler version isn't supported --- include/benchmark/benchmark.h | 50 +++++++++++++++++++++++++++++ test/AssemblyTests.cmake | 20 ++++++++++++ test/donotoptimize_assembly_test.cc | 39 ++++++++++++++++++++++ 3 files changed, 109 insertions(+) diff --git a/include/benchmark/benchmark.h b/include/benchmark/benchmark.h index 42c04cc3..a4fc52df 100644 --- a/include/benchmark/benchmark.h +++ b/include/benchmark/benchmark.h @@ -446,6 +446,7 @@ inline BENCHMARK_ALWAYS_INLINE void ClobberMemory() { // intended to add little to no overhead. // See: https://youtu.be/nXaxk27zwlk?t=2441 #ifndef BENCHMARK_HAS_NO_INLINE_ASSEMBLY +#if !defined(__GNUC__) || defined(__llvm__) || defined(__INTEL_COMPILER) template inline BENCHMARK_ALWAYS_INLINE void DoNotOptimize(Tp const& value) { asm volatile("" : : "r,m"(value) : "memory"); @@ -459,6 +460,55 @@ inline BENCHMARK_ALWAYS_INLINE void DoNotOptimize(Tp& value) { asm volatile("" : "+m,r"(value) : : "memory"); #endif } +#elif defined(BENCHMARK_HAS_CXX11) && (__GNUC__ >= 5) +// Workaround for a bug with full argument copy overhead with GCC. +// See: #1340 and https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105519 +template +inline BENCHMARK_ALWAYS_INLINE + typename std::enable_if::value && + (sizeof(Tp) <= sizeof(Tp*))>::type + DoNotOptimize(Tp const& value) { + asm volatile("" : : "r"(value) : "memory"); +} + +template +inline BENCHMARK_ALWAYS_INLINE + typename std::enable_if::value || + (sizeof(Tp) > sizeof(Tp*))>::type + DoNotOptimize(Tp const& value) { + asm volatile("" : : "m"(value) : "memory"); +} + +template +inline BENCHMARK_ALWAYS_INLINE + typename std::enable_if::value && + (sizeof(Tp) <= sizeof(Tp*))>::type + DoNotOptimize(Tp& value) { + asm volatile("" : "+r"(value) : : "memory"); +} + +template +inline BENCHMARK_ALWAYS_INLINE + typename std::enable_if::value || + (sizeof(Tp) > sizeof(Tp*))>::type + DoNotOptimize(Tp& value) { + asm volatile("" : "+m"(value) : : "memory"); +} + +#else +// Fallback for GCC < 5. Can add some overhead because the compiler is forced +// to use memory operations instead of operations with registers. +// TODO: Remove if GCC < 5 will be unsupported. +template +inline BENCHMARK_ALWAYS_INLINE void DoNotOptimize(Tp const& value) { + asm volatile("" : : "m"(value) : "memory"); +} + +template +inline BENCHMARK_ALWAYS_INLINE void DoNotOptimize(Tp& value) { + asm volatile("" : "+m"(value) : : "memory"); +} +#endif #ifndef BENCHMARK_HAS_CXX11 inline BENCHMARK_ALWAYS_INLINE void ClobberMemory() { diff --git a/test/AssemblyTests.cmake b/test/AssemblyTests.cmake index 48318bdd..c43c711f 100644 --- a/test/AssemblyTests.cmake +++ b/test/AssemblyTests.cmake @@ -1,3 +1,23 @@ +set(CLANG_SUPPORTED_VERSION "5.0.0") +set(GCC_SUPPORTED_VERSION "5.5.0") + +if (CMAKE_CXX_COMPILER_ID MATCHES "Clang") + if (NOT CMAKE_CXX_COMPILER_VERSION VERSION_EQUAL ${CLANG_SUPPORTED_VERSION}) + message (WARNING + "Unsupported Clang version " ${CMAKE_CXX_COMPILER_VERSION} + ". Expected is " ${CLANG_SUPPORTED_VERSION} + ". Assembly tests may be broken.") + endif() +elseif(CMAKE_CXX_COMPILER_ID MATCHES "GNU") + if (NOT CMAKE_CXX_COMPILER_VERSION VERSION_EQUAL ${GCC_SUPPORTED_VERSION}) + message (WARNING + "Unsupported GCC version " ${CMAKE_CXX_COMPILER_VERSION} + ". Expected is " ${GCC_SUPPORTED_VERSION} + ". Assembly tests may be broken.") + endif() +else() + message (WARNING "Unsupported compiler. Assembly tests may be broken.") +endif() include(split_list) diff --git a/test/donotoptimize_assembly_test.cc b/test/donotoptimize_assembly_test.cc index 2e86a51e..70e780a5 100644 --- a/test/donotoptimize_assembly_test.cc +++ b/test/donotoptimize_assembly_test.cc @@ -9,6 +9,9 @@ extern "C" { extern int ExternInt; extern int ExternInt2; extern int ExternInt3; +extern int BigArray[2049]; + +const int ConstBigArray[2049]{}; inline int Add42(int x) { return x + 42; } @@ -23,7 +26,15 @@ struct Large { int value; int data[2]; }; + +struct ExtraLarge { + int arr[2049]; +}; } + +extern ExtraLarge ExtraLargeObj; +const ExtraLarge ConstExtraLargeObj{}; + // CHECK-LABEL: test_with_rvalue: extern "C" void test_with_rvalue() { benchmark::DoNotOptimize(Add42(0)); @@ -68,6 +79,22 @@ extern "C" void test_with_large_lvalue() { // CHECK: ret } +// CHECK-LABEL: test_with_extra_large_lvalue_with_op: +extern "C" void test_with_extra_large_lvalue_with_op() { + ExtraLargeObj.arr[16] = 42; + benchmark::DoNotOptimize(ExtraLargeObj); + // CHECK: movl $42, ExtraLargeObj+64(%rip) + // CHECK: ret +} + +// CHECK-LABEL: test_with_big_array_with_op +extern "C" void test_with_big_array_with_op() { + BigArray[16] = 42; + benchmark::DoNotOptimize(BigArray); + // CHECK: movl $42, BigArray+64(%rip) + // CHECK: ret +} + // CHECK-LABEL: test_with_non_trivial_lvalue: extern "C" void test_with_non_trivial_lvalue() { NotTriviallyCopyable NTC(ExternInt); @@ -96,6 +123,18 @@ extern "C" void test_with_large_const_lvalue() { // CHECK: ret } +// CHECK-LABEL: test_with_const_extra_large_obj: +extern "C" void test_with_const_extra_large_obj() { + benchmark::DoNotOptimize(ConstExtraLargeObj); + // CHECK: ret +} + +// CHECK-LABEL: test_with_const_big_array +extern "C" void test_with_const_big_array() { + benchmark::DoNotOptimize(ConstBigArray); + // CHECK: ret +} + // CHECK-LABEL: test_with_non_trivial_const_lvalue: extern "C" void test_with_non_trivial_const_lvalue() { const NotTriviallyCopyable Obj(ExternInt);