From 7e40ff9e35699ea14a6addd2ce20cd23be519430 Mon Sep 17 00:00:00 2001 From: Eric Fiselier Date: Mon, 11 Jul 2016 14:58:50 -0600 Subject: [PATCH] Provide a better implementation of DoNotOptimize(...). This implementation is less likely to ICE compilers, and is more correct. It also acts as a memory barrier which will help prevent writes to global memory from being optimized away. --- README.md | 45 ++++++++++++++++++++++++++++++- include/benchmark/benchmark_api.h | 19 +++++++------ 2 files changed, 53 insertions(+), 11 deletions(-) diff --git a/README.md b/README.md index e1ac17a0..a0bcc61e 100644 --- a/README.md +++ b/README.md @@ -279,7 +279,8 @@ BENCHMARK(BM_ManualTiming)->Range(1, 1<<17)->UseManualTime(); ### Preventing optimisation To prevent a value or expression from being optimized away by the compiler -the `benchmark::DoNotOptimize(...)` function can be used. +the `benchmark::DoNotOptimize(...)` and `benchmark::ClobberMemory()` +functions can be used. ```c++ static void BM_test(benchmark::State& state) { @@ -292,6 +293,48 @@ static void BM_test(benchmark::State& state) { } ``` +`DoNotOptimize(<expr>)` forces the *result* of `<expr>` to be stored in either +memory or a register. For GNU based compilers it acts as read/write barrier +for global memory. More specifically it forces the compiler to flush pending +writes to memory and reload any other values as necessary. + +Note that `DoNotOptimize(<expr>)` does not prevent optimizations on `<expr>` +in any way. `<expr>` may even be removed entirely when the result is already +known. For example: + +```c++ + /* Example 1: `<expr>` is removed entirely. */ + int foo(int x) { return x + 42; } + while (...) DoNotOptimize(foo(0)); // Optimized to DoNotOptimize(42); + + /* Example 2: Result of '<expr>' is only reused */ + int bar(int) __attribute__((const)); + while (...) DoNotOptimize(bar(0)); // Optimized to: + // int __result__ = bar(0); + // while (...) DoNotOptimize(__result__); +``` + +The second tool for preventing optimizations is `ClobberMemory()`. 
In essence +`ClobberMemory()` forces the compiler to perform all pending writes to global +memory. Memory managed by block scope objects must be "escaped" using +`DoNotOptimize(...)` before it can be clobbered. In the below example +`ClobberMemory()` prevents the call to `v.push_back(42)` from being optimized +away. + +```c++ +static void BM_vector_push_back(benchmark::State& state) { + while (state.KeepRunning()) { + std::vector<int> v; + v.reserve(1); + benchmark::DoNotOptimize(v.data()); // Allow v.data() to be clobbered. + v.push_back(42); + benchmark::ClobberMemory(); // Force 42 to be written to memory. + } +} +``` + +Note that `ClobberMemory()` is only available for GNU based compilers. + ### Set time unit manually If a benchmark runs a few milliseconds it may be hard to visually compare the measured times, since the output data is given in nanoseconds per default. In diff --git a/include/benchmark/benchmark_api.h b/include/benchmark/benchmark_api.h index f38dc974..664ca2a9 100644 --- a/include/benchmark/benchmark_api.h +++ b/include/benchmark/benchmark_api.h @@ -207,25 +207,24 @@ Benchmark* RegisterBenchmarkInternal(Benchmark*); // The DoNotOptimize(...) function can be used to prevent a value or // expression from being optimized away by the compiler. This function is -// intented to add little to no overhead. -// See: http://stackoverflow.com/questions/28287064 +// intended to add little to no overhead. +// See: https://youtu.be/nXaxk27zwlk?t=2441 #if defined(__GNUC__) -// TODO(ericwf): Clang has a bug where it tries to always use a register -// even if value must be stored in memory. This causes codegen to fail. -// To work around this we remove the "r" modifier so the operand is always -// loaded into memory. -// GCC also has a bug where it complains about inconsistent operand constraints -// when "+rm" is used for a type larger than can fit in a register or two. -// For now force the operand to memory for both GCC and Clang. 
template <class Tp> inline BENCHMARK_ALWAYS_INLINE void DoNotOptimize(Tp const& value) { - asm volatile("" : "+m" (const_cast<Tp&>(value))); + asm volatile("" : : "g"(value) : "memory"); +} +// Force the compiler to flush pending writes to global memory. Acts as an +// effective read/write barrier +inline BENCHMARK_ALWAYS_INLINE void ClobberMemory() { + asm volatile("" : : : "memory"); } #else template <class Tp> inline BENCHMARK_ALWAYS_INLINE void DoNotOptimize(Tp const& value) { internal::UseCharPointer(&reinterpret_cast<char const volatile&>(value)); } +// FIXME Add ClobberMemory() for non-gnu compilers #endif // TimeUnit is passed to a benchmark in order to specify the order of magnitude