2015-03-13 06:03:33 +08:00
|
|
|
|
2017-07-05 06:31:47 +08:00
|
|
|
#include "benchmark/benchmark.h"
|
2015-03-13 06:03:33 +08:00
|
|
|
|
2016-10-08 02:04:50 +08:00
|
|
|
// Registers benchmark `x` through BENCHMARK and chains three Arg() values
// (8, 512 and 8192) onto that registration. Callers may append further
// modifiers with `->` after the macro invocation.
#define BASIC_BENCHMARK_TEST(x) BENCHMARK(x)->Arg(8)->Arg(512)->Arg(8192)
|
2015-03-13 06:03:33 +08:00
|
|
|
|
|
|
|
void BM_empty(benchmark::State& state) {
|
2017-10-18 02:17:02 +08:00
|
|
|
for (auto _ : state) {
|
2015-03-28 04:35:46 +08:00
|
|
|
benchmark::DoNotOptimize(state.iterations());
|
2015-03-13 06:03:33 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
BENCHMARK(BM_empty);
|
|
|
|
BENCHMARK(BM_empty)->ThreadPerCpu();
|
|
|
|
|
|
|
|
void BM_spin_empty(benchmark::State& state) {
|
2017-10-18 02:17:02 +08:00
|
|
|
for (auto _ : state) {
|
2021-11-04 18:26:11 +08:00
|
|
|
for (auto x = 0; x < state.range(0); ++x) {
|
2015-03-28 04:35:46 +08:00
|
|
|
benchmark::DoNotOptimize(x);
|
2015-03-13 06:03:33 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
BASIC_BENCHMARK_TEST(BM_spin_empty);
|
|
|
|
BASIC_BENCHMARK_TEST(BM_spin_empty)->ThreadPerCpu();
|
|
|
|
|
|
|
|
void BM_spin_pause_before(benchmark::State& state) {
|
2021-11-04 18:26:11 +08:00
|
|
|
for (auto i = 0; i < state.range(0); ++i) {
|
2015-03-28 04:35:46 +08:00
|
|
|
benchmark::DoNotOptimize(i);
|
2015-03-13 06:03:33 +08:00
|
|
|
}
|
2017-10-18 02:17:02 +08:00
|
|
|
for (auto _ : state) {
|
2021-11-04 18:26:11 +08:00
|
|
|
for (auto i = 0; i < state.range(0); ++i) {
|
2015-03-28 04:35:46 +08:00
|
|
|
benchmark::DoNotOptimize(i);
|
2015-03-13 06:03:33 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
BASIC_BENCHMARK_TEST(BM_spin_pause_before);
|
|
|
|
BASIC_BENCHMARK_TEST(BM_spin_pause_before)->ThreadPerCpu();
|
|
|
|
|
|
|
|
void BM_spin_pause_during(benchmark::State& state) {
|
2017-10-18 02:17:02 +08:00
|
|
|
for (auto _ : state) {
|
2015-03-13 06:03:33 +08:00
|
|
|
state.PauseTiming();
|
2021-11-04 18:26:11 +08:00
|
|
|
for (auto i = 0; i < state.range(0); ++i) {
|
2015-03-28 04:35:46 +08:00
|
|
|
benchmark::DoNotOptimize(i);
|
2015-03-13 06:03:33 +08:00
|
|
|
}
|
|
|
|
state.ResumeTiming();
|
2021-11-04 18:26:11 +08:00
|
|
|
for (auto i = 0; i < state.range(0); ++i) {
|
2015-03-28 04:35:46 +08:00
|
|
|
benchmark::DoNotOptimize(i);
|
2015-03-13 06:03:33 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
BASIC_BENCHMARK_TEST(BM_spin_pause_during);
|
|
|
|
BASIC_BENCHMARK_TEST(BM_spin_pause_during)->ThreadPerCpu();
|
|
|
|
|
2015-03-27 02:26:07 +08:00
|
|
|
void BM_pause_during(benchmark::State& state) {
|
2017-10-18 02:17:02 +08:00
|
|
|
for (auto _ : state) {
|
2015-03-27 02:26:07 +08:00
|
|
|
state.PauseTiming();
|
|
|
|
state.ResumeTiming();
|
|
|
|
}
|
|
|
|
}
|
2015-03-27 02:56:52 +08:00
|
|
|
BENCHMARK(BM_pause_during);
|
|
|
|
BENCHMARK(BM_pause_during)->ThreadPerCpu();
|
2015-03-28 04:37:53 +08:00
|
|
|
BENCHMARK(BM_pause_during)->UseRealTime();
|
|
|
|
BENCHMARK(BM_pause_during)->UseRealTime()->ThreadPerCpu();
|
2015-03-27 02:26:07 +08:00
|
|
|
|
2015-03-13 06:03:33 +08:00
|
|
|
void BM_spin_pause_after(benchmark::State& state) {
|
2017-10-18 02:17:02 +08:00
|
|
|
for (auto _ : state) {
|
2021-11-04 18:26:11 +08:00
|
|
|
for (auto i = 0; i < state.range(0); ++i) {
|
2015-03-28 04:35:46 +08:00
|
|
|
benchmark::DoNotOptimize(i);
|
2015-03-13 06:03:33 +08:00
|
|
|
}
|
|
|
|
}
|
2021-11-04 18:26:11 +08:00
|
|
|
for (auto i = 0; i < state.range(0); ++i) {
|
2015-03-28 04:35:46 +08:00
|
|
|
benchmark::DoNotOptimize(i);
|
2015-03-13 06:03:33 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
BASIC_BENCHMARK_TEST(BM_spin_pause_after);
|
|
|
|
BASIC_BENCHMARK_TEST(BM_spin_pause_after)->ThreadPerCpu();
|
|
|
|
|
|
|
|
void BM_spin_pause_before_and_after(benchmark::State& state) {
|
2021-11-04 18:26:11 +08:00
|
|
|
for (auto i = 0; i < state.range(0); ++i) {
|
2015-03-28 04:35:46 +08:00
|
|
|
benchmark::DoNotOptimize(i);
|
2015-03-13 06:03:33 +08:00
|
|
|
}
|
2017-10-18 02:17:02 +08:00
|
|
|
for (auto _ : state) {
|
2021-11-04 18:26:11 +08:00
|
|
|
for (auto i = 0; i < state.range(0); ++i) {
|
2015-03-28 04:35:46 +08:00
|
|
|
benchmark::DoNotOptimize(i);
|
2015-03-13 06:03:33 +08:00
|
|
|
}
|
|
|
|
}
|
2021-11-04 18:26:11 +08:00
|
|
|
for (auto i = 0; i < state.range(0); ++i) {
|
2015-03-28 04:35:46 +08:00
|
|
|
benchmark::DoNotOptimize(i);
|
2015-03-13 06:03:33 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
BASIC_BENCHMARK_TEST(BM_spin_pause_before_and_after);
|
|
|
|
BASIC_BENCHMARK_TEST(BM_spin_pause_before_and_after)->ThreadPerCpu();
|
|
|
|
|
|
|
|
void BM_empty_stop_start(benchmark::State& state) {
|
2017-10-18 02:17:02 +08:00
|
|
|
for (auto _ : state) {
|
2016-10-08 02:04:50 +08:00
|
|
|
}
|
2015-03-13 06:03:33 +08:00
|
|
|
}
|
|
|
|
BENCHMARK(BM_empty_stop_start);
|
|
|
|
BENCHMARK(BM_empty_stop_start)->ThreadPerCpu();
|
|
|
|
|
2017-10-10 23:56:42 +08:00
|
|
|
void BM_KeepRunning(benchmark::State& state) {
|
Iteration counts should be `uint64_t` globally. (#817)
This is a shameless rip-off of https://github.com/google/benchmark/pull/646
I did promise to look into why that proposed PR was producing
so much worse assembly, and so i finally did.
The reason is - that diff changes `size_t` (unsigned) to `int64_t` (signed).
There is this nice little `assert`:
https://github.com/google/benchmark/blob/7a1c37028359ca9d386d719a6ad527743cf1b753/include/benchmark/benchmark.h#L744
It ensures that we didn't magically decide to advance our iterator
when we should have finished benchmarking.
When `cached_` was unsigned, the `assert` was `cached_ UGT 0`.
But we only ever get to that `assert` if `cached_ NE 0`,
and naturally if `cached_` is not `0`, then it is bigger than `0`,
so the `assert` is tautological, and gets folded away.
But now that `cached_` became signed, the assert became `cached_ SGT 0`.
And we still only know that `cached_ NE 0`, so the assert can't be
optimized out, or at least it doesn't currently.
Regardless of whether or not that is a bug in itself,
that particular diff would have regressed the normal 64-bit systems,
by halving the maximal iteration space (since we go from unsigned counter
to signed one, of the same bit-width), which seems like a bug.
And just so it happens, fixing *this* bug, fixes the other bug.
This produces fully (bit-by-bit) identical state_assembly_test.s
The filecheck change is actually needed regardless of this patch,
else this test does not pass for me even without this diff.
2019-05-13 17:33:11 +08:00
|
|
|
benchmark::IterationCount iter_count = 0;
|
2018-05-24 17:33:19 +08:00
|
|
|
assert(iter_count == state.iterations());
|
2017-10-10 23:56:42 +08:00
|
|
|
while (state.KeepRunning()) {
|
|
|
|
++iter_count;
|
|
|
|
}
|
2018-02-10 12:57:04 +08:00
|
|
|
assert(iter_count == state.iterations());
|
2017-10-10 23:56:42 +08:00
|
|
|
}
|
|
|
|
BENCHMARK(BM_KeepRunning);
|
|
|
|
|
2018-02-10 12:57:04 +08:00
|
|
|
void BM_KeepRunningBatch(benchmark::State& state) {
|
Use fewer ramp up repetitions when KeepRunningBatch is used (#1113)
Use the benchmark's reported iteration count when estimating
iterations for the next repetition, rather than the requested
iteration count. When the benchmark uses KeepRunningBatch the actual
iteration count can be larger than the one the runner requested.
Prior to this fix the runner was underestimating the next iteration
count, sometimes significantly so. Consider the case of a benchmark
using a batch size of 1024. Prior to this change, the benchmark
runner would attempt iteration counts 1, 10, 100 and 1000, yet the
benchmark itself would do the same amount of work each time: a single
batch of 1024 iterations. The discrepancy could also contribute to
estimation errors once the benchmark time reached 10% of the target.
For example, if the very first batch of 1024 iterations reached 10% of
benchmark_min_min time, the runner would attempt to scale that to 100%
from a basis of one iteration rather than 1024.
This bug was particularly noticeable in benchmarks with large batch
sizes, especially when the benchmark also had slow set up or tear down
phases.
With this fix in place it is possible to use KeepRunningBatch to
achieve a kind of "minimum iteration count" feature by using a larger
fixed batch size. For example, a benchmark may build a map of 500K
elements and test a "find" operation. There is no point in running
"find" just 1, 10, 100, etc., times. The benchmark can now pick a
batch size of something like 10K, and the runner will arrive at the
final max iteration count with in noticeably fewer repetitions.
2021-04-20 14:16:05 +08:00
|
|
|
// Choose a batch size >1000 to skip the typical runs with iteration
|
|
|
|
// targets of 10, 100 and 1000. If these are not actually skipped the
|
|
|
|
// bug would be detectable as consecutive runs with the same iteration
|
|
|
|
// count. Below we assert that this does not happen.
|
|
|
|
const benchmark::IterationCount batch_size = 1009;
|
|
|
|
|
|
|
|
static benchmark::IterationCount prior_iter_count = 0;
|
Iteration counts should be `uint64_t` globally. (#817)
This is a shameless rip-off of https://github.com/google/benchmark/pull/646
I did promise to look into why that proposed PR was producing
so much worse assembly, and so i finally did.
The reason is - that diff changes `size_t` (unsigned) to `int64_t` (signed).
There is this nice little `assert`:
https://github.com/google/benchmark/blob/7a1c37028359ca9d386d719a6ad527743cf1b753/include/benchmark/benchmark.h#L744
It ensures that we didn't magically decide to advance our iterator
when we should have finished benchmarking.
When `cached_` was unsigned, the `assert` was `cached_ UGT 0`.
But we only ever get to that `assert` if `cached_ NE 0`,
and naturally if `cached_` is not `0`, then it is bigger than `0`,
so the `assert` is tautological, and gets folded away.
But now that `cached_` became signed, the assert became `cached_ SGT 0`.
And we still only know that `cached_ NE 0`, so the assert can't be
optimized out, or at least it doesn't currently.
Regardless of whether or not that is a bug in itself,
that particular diff would have regressed the normal 64-bit systems,
by halving the maximal iteration space (since we go from unsigned counter
to signed one, of the same bit-width), which seems like a bug.
And just so it happens, fixing *this* bug, fixes the other bug.
This produces fully (bit-by-bit) identical state_assembly_test.s
The filecheck change is actually needed regardless of this patch,
else this test does not pass for me even without this diff.
2019-05-13 17:33:11 +08:00
|
|
|
benchmark::IterationCount iter_count = 0;
|
2018-02-10 12:57:04 +08:00
|
|
|
while (state.KeepRunningBatch(batch_size)) {
|
|
|
|
iter_count += batch_size;
|
|
|
|
}
|
|
|
|
assert(state.iterations() == iter_count);
|
Use fewer ramp up repetitions when KeepRunningBatch is used (#1113)
Use the benchmark's reported iteration count when estimating
iterations for the next repetition, rather than the requested
iteration count. When the benchmark uses KeepRunningBatch the actual
iteration count can be larger than the one the runner requested.
Prior to this fix the runner was underestimating the next iteration
count, sometimes significantly so. Consider the case of a benchmark
using a batch size of 1024. Prior to this change, the benchmark
runner would attempt iteration counts 1, 10, 100 and 1000, yet the
benchmark itself would do the same amount of work each time: a single
batch of 1024 iterations. The discrepancy could also contribute to
estimation errors once the benchmark time reached 10% of the target.
For example, if the very first batch of 1024 iterations reached 10% of
benchmark_min_min time, the runner would attempt to scale that to 100%
from a basis of one iteration rather than 1024.
This bug was particularly noticeable in benchmarks with large batch
sizes, especially when the benchmark also had slow set up or tear down
phases.
With this fix in place it is possible to use KeepRunningBatch to
achieve a kind of "minimum iteration count" feature by using a larger
fixed batch size. For example, a benchmark may build a map of 500K
elements and test a "find" operation. There is no point in running
"find" just 1, 10, 100, etc., times. The benchmark can now pick a
batch size of something like 10K, and the runner will arrive at the
final max iteration count with in noticeably fewer repetitions.
2021-04-20 14:16:05 +08:00
|
|
|
|
|
|
|
// Verify that the iteration count always increases across runs (see
|
|
|
|
// comment above).
|
|
|
|
assert(iter_count == batch_size // max_iterations == 1
|
|
|
|
|| iter_count > prior_iter_count); // max_iterations > batch_size
|
|
|
|
prior_iter_count = iter_count;
|
2018-02-10 12:57:04 +08:00
|
|
|
}
|
Use fewer ramp up repetitions when KeepRunningBatch is used (#1113)
Use the benchmark's reported iteration count when estimating
iterations for the next repetition, rather than the requested
iteration count. When the benchmark uses KeepRunningBatch the actual
iteration count can be larger than the one the runner requested.
Prior to this fix the runner was underestimating the next iteration
count, sometimes significantly so. Consider the case of a benchmark
using a batch size of 1024. Prior to this change, the benchmark
runner would attempt iteration counts 1, 10, 100 and 1000, yet the
benchmark itself would do the same amount of work each time: a single
batch of 1024 iterations. The discrepancy could also contribute to
estimation errors once the benchmark time reached 10% of the target.
For example, if the very first batch of 1024 iterations reached 10% of
benchmark_min_time, the runner would attempt to scale that to 100%
from a basis of one iteration rather than 1024.
This bug was particularly noticeable in benchmarks with large batch
sizes, especially when the benchmark also had slow set up or tear down
phases.
With this fix in place it is possible to use KeepRunningBatch to
achieve a kind of "minimum iteration count" feature by using a larger
fixed batch size. For example, a benchmark may build a map of 500K
elements and test a "find" operation. There is no point in running
"find" just 1, 10, 100, etc., times. The benchmark can now pick a
batch size of something like 10K, and the runner will arrive at the
final max iteration count in noticeably fewer repetitions.
2021-04-20 14:16:05 +08:00
|
|
|
// Register with a fixed repetition count to establish the invariant that
|
|
|
|
// the iteration count should always change across runs. This overrides
|
|
|
|
// the --benchmark_repetitions command line flag, which would otherwise
|
|
|
|
// cause this test to fail if set > 1.
|
|
|
|
BENCHMARK(BM_KeepRunningBatch)->Repetitions(1);
|
2018-02-10 12:57:04 +08:00
|
|
|
|
2017-10-10 23:56:42 +08:00
|
|
|
void BM_RangedFor(benchmark::State& state) {
|
Iteration counts should be `uint64_t` globally. (#817)
This is a shameless rip-off of https://github.com/google/benchmark/pull/646
I did promise to look into why that proposed PR was producing
so much worse assembly, and so i finally did.
The reason is - that diff changes `size_t` (unsigned) to `int64_t` (signed).
There is this nice little `assert`:
https://github.com/google/benchmark/blob/7a1c37028359ca9d386d719a6ad527743cf1b753/include/benchmark/benchmark.h#L744
It ensures that we didn't magically decide to advance our iterator
when we should have finished benchmarking.
When `cached_` was unsigned, the `assert` was `cached_ UGT 0`.
But we only ever get to that `assert` if `cached_ NE 0`,
and naturally if `cached_` is not `0`, then it is bigger than `0`,
so the `assert` is tautological, and gets folded away.
But now that `cached_` became signed, the assert became `cached_ SGT 0`.
And we still only know that `cached_ NE 0`, so the assert can't be
optimized out, or at least it doesn't currently.
Regardless of whether or not that is a bug in itself,
that particular diff would have regressed the normal 64-bit systems,
by halving the maximal iteration space (since we go from unsigned counter
to signed one, of the same bit-width), which seems like a bug.
And just so it happens, fixing *this* bug, fixes the other bug.
This produces fully (bit-by-bit) identical state_assembly_test.s
The filecheck change is actually needed regardless of this patch,
else this test does not pass for me even without this diff.
2019-05-13 17:33:11 +08:00
|
|
|
benchmark::IterationCount iter_count = 0;
|
2017-10-10 23:56:42 +08:00
|
|
|
for (auto _ : state) {
|
|
|
|
++iter_count;
|
|
|
|
}
|
|
|
|
assert(iter_count == state.max_iterations);
|
|
|
|
}
|
|
|
|
BENCHMARK(BM_RangedFor);
|
|
|
|
|
2021-10-26 22:38:12 +08:00
|
|
|
#ifdef BENCHMARK_HAS_CXX11
|
|
|
|
template <typename T>
|
|
|
|
void BM_OneTemplateFunc(benchmark::State& state) {
|
2021-11-04 18:26:11 +08:00
|
|
|
auto arg = state.range(0);
|
2021-10-26 22:38:12 +08:00
|
|
|
T sum = 0;
|
|
|
|
for (auto _ : state) {
|
|
|
|
sum += arg;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
BENCHMARK(BM_OneTemplateFunc<int>)->Arg(1);
|
|
|
|
BENCHMARK(BM_OneTemplateFunc<double>)->Arg(1);
|
|
|
|
|
|
|
|
template <typename A, typename B>
|
|
|
|
void BM_TwoTemplateFunc(benchmark::State& state) {
|
2021-11-04 18:26:11 +08:00
|
|
|
auto arg = state.range(0);
|
2021-10-26 22:38:12 +08:00
|
|
|
A sum = 0;
|
|
|
|
B prod = 1;
|
|
|
|
for (auto _ : state) {
|
|
|
|
sum += arg;
|
|
|
|
prod *= arg;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
BENCHMARK(BM_TwoTemplateFunc<int, double>)->Arg(1);
|
|
|
|
BENCHMARK(BM_TwoTemplateFunc<double, int>)->Arg(1);
|
|
|
|
|
|
|
|
#endif // BENCHMARK_HAS_CXX11
|
|
|
|
|
2018-02-21 15:54:19 +08:00
|
|
|
// Ensure that StateIterator provides all the necessary typedefs required to
|
|
|
|
// instantiate std::iterator_traits.
|
2021-11-11 00:22:31 +08:00
|
|
|
static_assert(
|
|
|
|
std::is_same<typename std::iterator_traits<
|
|
|
|
benchmark::State::StateIterator>::value_type,
|
|
|
|
typename benchmark::State::StateIterator::value_type>::value,
|
|
|
|
"");
|
2018-02-21 15:54:19 +08:00
|
|
|
|
2017-12-04 09:45:07 +08:00
|
|
|
// Expands the benchmark library's BENCHMARK_MAIN macro.
// NOTE(review): presumably this supplies the program's main() that runs the
// benchmarks registered above — confirm against benchmark/benchmark.h.
BENCHMARK_MAIN();
|