1
0
mirror of https://github.com/google/benchmark.git synced 2025-04-29 14:30:37 +08:00

Add support for other multi-threading APIs

Support the benchmarking of code, which relies on other
multi-threading APIs, e.g. OpenMP.
This commit is contained in:
Olaf Krzikalla 2023-10-09 09:58:55 +02:00
parent b9850909a6
commit b027d02284
11 changed files with 357 additions and 19 deletions

View File

@ -830,6 +830,64 @@ BENCHMARK(BM_test)->Range(8, 8<<10)->UseRealTime();
Without `UseRealTime`, CPU time is used by default.
### Manual Multithreaded Benchmarks
By default, Google Benchmark uses `std::thread` as its multithreading environment.
If you want to use another multithreading environment (e.g. OpenMP), you can
turn off the automatic creation of threads using the `ManualThreading` function.
```c++
static void BM_MultiThreaded(benchmark::State& state) {
// Setup code here.
for (auto _ : state) {
#pragma omp parallel num_threads(state.threads())
// Run the multithreaded test.
}
// Teardown code here.
}
BENCHMARK(BM_MultiThreaded)->ManualThreading()->Threads(1)->Threads(2)->Threads(4);
```
The above example creates a parallel region in each iteration.
This includes the setup and teardown of the parallel region in the time measurement, and it
adds an implicit barrier at the end of each iteration.
You can avoid these effects by running the whole loop in parallel.
In that case you must not use the `state` object directly; instead, create a `ThreadState` object in each thread.
```c++
static void BM_MultiThreaded(benchmark::State& state) {
// Setup code (shared objects) here.
#pragma omp parallel num_threads(state.threads())
{
// Thread-local setup code here.
for (auto _ : benchmark::ThreadState(state)) {
// Run the multithreaded test.
}
}
// Teardown code here.
}
BENCHMARK(BM_MultiThreaded)->ManualThreading()->Threads(1)->Threads(2)->Threads(4);
```
If you use the `ThreadState` object and explicitly specify the number of threads, then you must
use `ManualThreading` and the number of created `ThreadState` objects must match the number of specified threads.
However, if you use `ThreadState` without explicitly specifying the number of threads,
then the number of threads is determined by the number of created `ThreadState` objects.
Specifying `ManualThreading` is optional in this case.
```c++
static void BM_MultiThreaded(benchmark::State& state) {
// Setup code (shared objects) here.
#pragma omp parallel
{
// Thread-local setup code here.
for (auto _ : benchmark::ThreadState(state)) {
// Run the multithreaded test.
}
}
// Teardown code here.
}
BENCHMARK(BM_MultiThreaded); // Measures with omp_get_max_threads() threads.
```
<a name="cpu-timers" />
## CPU Timers

View File

@ -930,6 +930,9 @@ class BENCHMARK_EXPORT State {
return max_iterations - total_iterations_ + batch_leftover_;
}
// Returns the number of ThreadState objects that have merged their results
// into this State so far (the counter is incremented once per merge in
// MergeThreadStateToParent). It is 0 when the benchmark did not use
// ThreadState.
BENCHMARK_ALWAYS_INLINE
int GetNumThreadStates() const { return num_thread_states_; }
BENCHMARK_ALWAYS_INLINE
std::string name() const { return name_; }
@ -976,12 +979,29 @@ class BENCHMARK_EXPORT State {
const std::string name_;
const int thread_index_;
const int threads_;
int num_thread_states_;
internal::ThreadTimer* const timer_;
internal::ThreadManager* const manager_;
internal::PerfCountersMeasurement* const perf_counters_measurement_;
friend class internal::BenchmarkInstance;
protected:
void MergeThreadStateToParent(State& parent) const;
bool started() const { return started_; }
internal::ThreadTimer* timer_;
internal::PerfCountersMeasurement* perf_counters_measurement_;
};
// ThreadState can be used in a manually multithreaded benchmark loop.
class ThreadState : public State {
public:
explicit ThreadState(State& s);
~ThreadState();
private:
State* parent_;
ThreadState(const ThreadState&);
};
inline BENCHMARK_ALWAYS_INLINE bool State::KeepRunning() {
@ -1274,6 +1294,9 @@ class BENCHMARK_EXPORT Benchmark {
// Equivalent to ThreadRange(NumCPUs(), NumCPUs())
Benchmark* ThreadPerCpu();
// Don't create threads. Let the user evaluate state.threads() and/or use
// ThreadState.
// Sets the manual-threading flag of this benchmark and returns `this` so the
// call can be chained with other configuration calls.
Benchmark* ManualThreading() { manual_threading_ = true; return this; }
virtual void Run(State& state) = 0;
TimeUnit GetTimeUnit() const;
@ -1286,6 +1309,7 @@ class BENCHMARK_EXPORT Benchmark {
const char* GetName() const;
int ArgsCnt() const;
const char* GetArgName(int arg) const;
// Returns true if at least one thread count has been registered for this
// benchmark, i.e. thread_counts_ is not empty.
bool GetExplicitThreading() const { return !thread_counts_.empty(); }
private:
friend class BenchmarkFamilies;
@ -1307,6 +1331,7 @@ class BENCHMARK_EXPORT Benchmark {
bool measure_process_cpu_time_;
bool use_real_time_;
bool use_manual_time_;
bool manual_threading_;
BigO complexity_;
BigOFunc* complexity_lambda_;
std::vector<Statistics> statistics_;

View File

@ -172,8 +172,9 @@ State::State(std::string name, IterationCount max_iters,
name_(std::move(name)),
thread_index_(thread_i),
threads_(n_threads),
timer_(timer),
num_thread_states_(0),
manager_(manager),
timer_(timer),
perf_counters_measurement_(perf_counters_measurement) {
BM_CHECK(max_iterations != 0) << "At least one iteration must be run";
BM_CHECK_LT(thread_index_, threads_)
@ -309,6 +310,40 @@ void State::FinishKeepRunning() {
manager_->StartStopBarrier();
}
// Folds the results of this (per-thread) state into `parent`.
//
// Runs entirely under the benchmark mutex: the measurements are first added
// to the manager's shared results, then the iteration bookkeeping and the
// started/finished/skipped flags are propagated to `parent`, and finally the
// parent's count of merged thread states is incremented.
void State::MergeThreadStateToParent(State& parent) const {
  MutexLock l(manager_->GetBenchmarkMutex());
  internal::MergeResults(*this, timer_, manager_);

  // Every thread state is expected to report the same iteration bookkeeping;
  // the parent either has not been written yet (0) or already agrees.
  assert(parent.total_iterations_ == 0 ||
         parent.total_iterations_ == total_iterations_);
  assert(parent.batch_leftover_ == 0 ||
         parent.batch_leftover_ == batch_leftover_);
  parent.total_iterations_ = total_iterations_;
  parent.batch_leftover_ = batch_leftover_;

  parent.started_ = parent.started_ || started_;
  parent.finished_ = parent.finished_ || finished_;

  // The most severe skip status wins: error, then message, then not skipped.
  if (parent.error_occurred() || error_occurred()) {
    parent.skipped_ = internal::SkippedWithError;
  } else if (parent.skipped() || skipped()) {
    parent.skipped_ = internal::SkippedWithMessage;
  } else {
    parent.skipped_ = internal::NotSkipped;
  }

  ++parent.num_thread_states_;
}
// Creates a per-thread state from the parent state `s`.
//
// The object starts as a copy of `s` and then replaces the copied timer and
// perf-counter measurement pointers with freshly allocated objects, so this
// thread does not measure through the parent's instances. Must be called
// before the measurement has started (checked below).
ThreadState::ThreadState(State& s) : State(s), parent_(&s) {
  BM_CHECK(!started())
      << "Don't create a ThreadState object after measurement has started";
  // At this point timer_ still refers to the parent's timer (copied by
  // State(s)); give this thread its own copy of it.
  timer_ = new internal::ThreadTimer(*timer_);
  // NOTE(review): guarded because the parent may presumably run without a
  // perf-counter measurement (null pointer) - confirm against the callers
  // that construct State. In that case the pointer is left null.
  if (perf_counters_measurement_ != nullptr) {
    perf_counters_measurement_ = new internal::PerfCountersMeasurement(
        perf_counters_measurement_->names());
  }
}
// Verifies that the benchmark loop ran to completion (or an error occurred),
// merges this thread's results into the parent state, and releases the
// objects allocated by the constructor.
ThreadState::~ThreadState() {
  BM_CHECK(error_occurred() || iterations() >= max_iterations)
      << "Benchmark returned before ThreadState::KeepRunning() returned false!";
  // Merge first: the merge reads timer_, so it must still be alive here.
  MergeThreadStateToParent(*parent_);
  delete timer_;
  // The constructor allocates the perf-counter measurement with new as well;
  // release it here so that it does not leak. Deleting a null pointer is a
  // no-op.
  delete perf_counters_measurement_;
}
namespace internal {
namespace {

View File

@ -2,7 +2,10 @@
#include <cinttypes>
#include "counter.h"
#include "string_util.h"
#include "thread_manager.h"
#include "thread_timer.h"
namespace benchmark {
namespace internal {
@ -27,7 +30,9 @@ BenchmarkInstance::BenchmarkInstance(Benchmark* benchmark, int family_idx,
min_time_(benchmark_.min_time_),
min_warmup_time_(benchmark_.min_warmup_time_),
iterations_(benchmark_.iterations_),
threads_(thread_count) {
threads_(thread_count),
manual_threading_(benchmark_.manual_threading_),
explicit_threading_(benchmark_.GetExplicitThreading()) {
name_.function_name = benchmark_.name_;
size_t arg_i = 0;
@ -114,5 +119,16 @@ void BenchmarkInstance::Teardown() const {
teardown_(st);
}
}
void MergeResults(const State& st, const ThreadTimer* timer,
ThreadManager* manager) {
ThreadManager::Result& results = manager->results;
results.iterations += st.iterations();
results.cpu_time_used += timer->cpu_time_used();
results.real_time_used += timer->real_time_used();
results.manual_time_used += timer->manual_time_used();
results.complexity_n += st.complexity_length_n();
Increment(&results.counters, st.counters);
}
} // namespace internal
} // namespace benchmark

View File

@ -41,6 +41,8 @@ class BenchmarkInstance {
int threads() const { return threads_; }
void Setup() const;
void Teardown() const;
// Returns explicit_threading_, which the constructor initializes from the
// benchmark's GetExplicitThreading().
bool explicit_threading() const { return explicit_threading_; }
// Returns manual_threading_, which the constructor copies from the
// benchmark's manual_threading_ flag (set by Benchmark::ManualThreading()).
bool manual_threading() const { return manual_threading_; }
State Run(IterationCount iters, int thread_id, internal::ThreadTimer* timer,
internal::ThreadManager* manager,
@ -66,6 +68,9 @@ class BenchmarkInstance {
double min_warmup_time_;
IterationCount iterations_;
int threads_; // Number of concurrent threads to us
bool manual_threading_;
bool explicit_threading_; // true: Number of threads come from a Threads()
// call
typedef void (*callback_function)(const benchmark::State&);
callback_function setup_ = nullptr;
@ -78,6 +83,9 @@ bool FindBenchmarksInternal(const std::string& re,
bool IsZero(double n);
void MergeResults(const State& st, const ThreadTimer* timer,
ThreadManager* manager);
BENCHMARK_EXPORT
ConsoleReporter::OutputOptions GetOutputOptions(bool force_no_color = false);

View File

@ -217,6 +217,7 @@ Benchmark::Benchmark(const std::string& name)
measure_process_cpu_time_(false),
use_real_time_(false),
use_manual_time_(false),
manual_threading_(false),
complexity_(oNone),
complexity_lambda_(nullptr),
setup_(nullptr),

View File

@ -86,7 +86,7 @@ BenchmarkReporter::Run CreateRunReport(
// This is the total iterations across all threads.
report.iterations = results.iterations;
report.time_unit = b.time_unit();
report.threads = b.threads();
report.threads = results.thread_count;
report.repetition_index = repetition_index;
report.repetitions = repeats;
@ -130,17 +130,36 @@ void RunInThread(const BenchmarkInstance* b, IterationCount iters,
State st =
b->Run(iters, thread_id, &timer, manager, perf_counters_measurement);
BM_CHECK(st.skipped() || st.iterations() >= st.max_iterations)
<< "Benchmark returned before State::KeepRunning() returned false!";
assert(b->explicit_threading() || b->threads() == 1);
if (st.GetNumThreadStates() > 0) {
BM_CHECK((!b->explicit_threading()) || b->manual_threading())
<< "Benchmark " << b->name().str()
<< " run with managed threading. It must not create ThreadStates!";
BM_CHECK((!b->explicit_threading()) ||
st.GetNumThreadStates() == b->threads())
<< "The number of ThreadStates created by Benchmark " << b->name().str()
<< " doesn't match the number of threads!";
} else {
BM_CHECK(st.skipped() || st.iterations() >= st.max_iterations)
<< "Benchmark returned before State::KeepRunning() returned false!";
}
{
MutexLock l(manager->GetBenchmarkMutex());
internal::ThreadManager::Result& results = manager->results;
results.iterations += st.iterations();
results.cpu_time_used += timer.cpu_time_used();
results.real_time_used += timer.real_time_used();
results.manual_time_used += timer.manual_time_used();
results.complexity_n += st.complexity_length_n();
internal::Increment(&results.counters, st.counters);
if (st.GetNumThreadStates() > 0) {
// State values as well as thread state values are summed up for
// complexity_n and user counters:
results.complexity_n += st.complexity_length_n();
internal::Increment(&results.counters, st.counters);
results.thread_count =
b->explicit_threading() ? b->threads() : st.GetNumThreadStates();
} else {
internal::MergeResults(st, &timer, manager);
results.thread_count = b->threads();
}
}
manager->NotifyThreadComplete();
}
@ -234,7 +253,8 @@ BenchmarkRunner::BenchmarkRunner(
has_explicit_iteration_count(b.iterations() != 0 ||
parsed_benchtime_flag.tag ==
BenchTimeType::ITERS),
pool(b.threads() - 1),
num_managed_threads(b.manual_threading() ? 1 : b.threads()),
pool(num_managed_threads - 1),
iters(has_explicit_iteration_count
? ComputeIters(b_, parsed_benchtime_flag)
: 1),
@ -260,7 +280,7 @@ BenchmarkRunner::IterationResults BenchmarkRunner::DoNIterations() {
BM_VLOG(2) << "Running " << b.name().str() << " for " << iters << "\n";
std::unique_ptr<internal::ThreadManager> manager;
manager.reset(new internal::ThreadManager(b.threads()));
manager.reset(new internal::ThreadManager(num_managed_threads));
// Run all but one thread in separate threads
for (std::size_t ti = 0; ti < pool.size(); ++ti) {
@ -287,17 +307,18 @@ BenchmarkRunner::IterationResults BenchmarkRunner::DoNIterations() {
manager.reset();
// Adjust real/manual time stats since they were reported per thread.
i.results.real_time_used /= b.threads();
i.results.manual_time_used /= b.threads();
i.results.real_time_used /= i.results.thread_count;
i.results.manual_time_used /= i.results.thread_count;
// If we were measuring whole-process CPU usage, adjust the CPU time too.
if (b.measure_process_cpu_time()) i.results.cpu_time_used /= b.threads();
if (b.measure_process_cpu_time())
i.results.cpu_time_used /= i.results.thread_count;
BM_VLOG(2) << "Ran in " << i.results.cpu_time_used << "/"
<< i.results.real_time_used << "\n";
// By using KeepRunningBatch a benchmark can iterate more times than
// requested, so take the iteration count from i.results.
i.iters = i.results.iterations / b.threads();
i.iters = i.results.iterations / i.results.thread_count;
// Base decisions off of real time if requested by this benchmark.
i.seconds = i.results.cpu_time_used;

View File

@ -93,6 +93,7 @@ class BenchmarkRunner {
bool warmup_done;
const int repeats;
const bool has_explicit_iteration_count;
const int num_managed_threads; // must be before pool
int num_repetitions_done = 0;

View File

@ -45,6 +45,7 @@ class ThreadManager {
std::string report_label_;
std::string skip_message_;
internal::Skipped skipped_ = internal::NotSkipped;
int thread_count = 0;
UserCounters counters;
};
GUARDED_BY(GetBenchmarkMutex()) Result results;

View File

@ -167,6 +167,9 @@ add_test(NAME perf_counters_test COMMAND perf_counters_test --benchmark_min_time
compile_output_test(internal_threading_test)
add_test(NAME internal_threading_test COMMAND internal_threading_test --benchmark_min_time=0.01s)
compile_output_test(manual_threading_test)
add_test(NAME manual_threading_test COMMAND manual_threading_test --benchmark_min_time=0.01s)
compile_output_test(report_aggregates_only_test)
add_test(NAME report_aggregates_only_test COMMAND report_aggregates_only_test --benchmark_min_time=0.01s)

View File

@ -0,0 +1,169 @@
#undef NDEBUG
#include <chrono>
#include <thread>
#include <future>
#include "../src/timers.h"
#include "benchmark/benchmark.h"
#include "output_test.h"
namespace {

// Length of one busy-wait interval: 50 milliseconds.
const std::chrono::duration<double, std::milli> time_frame(50);

// The same interval as a plain number of seconds; this is the value the
// benchmarks below report as their manual iteration time.
const double time_frame_in_sec(
    std::chrono::duration<double>(time_frame).count());

// Spins (without sleeping) until at least `time_frame` has passed since the
// call. The difference of two ChronoClockNow() readings is interpreted as a
// number of seconds.
void MyBusySpinwait() {
  const auto begin = benchmark::ChronoClockNow();
  for (;;) {
    const std::chrono::duration<double> waited(benchmark::ChronoClockNow() -
                                               begin);
    if (waited >= time_frame) return;
  }
}

}  // namespace
// ========================================================================= //
// --------------------------- TEST CASES BEGIN ---------------------------- //
// ========================================================================= //
// ========================================================================= //
// BM_ManualThreadingInLoop
// The worker threads are started and joined inside every iteration, so the
// measurement includes the creation and joining of threads.
// state.threads() - 1 workers are used; the calling thread does the same
// busy wait itself. Each iteration reports time_frame_in_sec as its manual
// time.
void BM_ManualThreadingInLoop(benchmark::State& state) {
  const int worker_count = state.threads() - 1;
  std::vector<std::thread> workers(worker_count);

  for (auto _ : state) {
    for (std::thread& worker : workers) {
      worker = std::thread(MyBusySpinwait);
    }
    MyBusySpinwait();
    for (std::thread& worker : workers) {
      worker.join();
    }
    state.SetIterationTime(time_frame_in_sec);
  }

  state.counters["invtime"] =
      benchmark::Counter{1, benchmark::Counter::kIsRate};
}
// Registrations of BM_ManualThreadingInLoop. Every variant runs exactly one
// iteration with manual threading enabled. The same six timing configurations
// are registered first for 1 thread and then for 2 threads: default timing,
// UseRealTime, UseManualTime, MeasureProcessCPUTime, and
// MeasureProcessCPUTime combined with UseRealTime or UseManualTime.

// --- 1 thread ---
BENCHMARK(BM_ManualThreadingInLoop)->Iterations(1)->ManualThreading()->Threads(1);
BENCHMARK(BM_ManualThreadingInLoop)->Iterations(1)->ManualThreading()->Threads(1)->UseRealTime();
BENCHMARK(BM_ManualThreadingInLoop)->Iterations(1)->ManualThreading()->Threads(1)->UseManualTime();
BENCHMARK(BM_ManualThreadingInLoop)->Iterations(1)->ManualThreading()->Threads(1)->MeasureProcessCPUTime();
BENCHMARK(BM_ManualThreadingInLoop)
->Iterations(1)
->ManualThreading()
->Threads(1)
->MeasureProcessCPUTime()
->UseRealTime();
BENCHMARK(BM_ManualThreadingInLoop)
->Iterations(1)
->ManualThreading()
->Threads(1)
->MeasureProcessCPUTime()
->UseManualTime();
// --- 2 threads ---
BENCHMARK(BM_ManualThreadingInLoop)->Iterations(1)->ManualThreading()->Threads(2);
BENCHMARK(BM_ManualThreadingInLoop)->Iterations(1)->ManualThreading()->Threads(2)->UseRealTime();
BENCHMARK(BM_ManualThreadingInLoop)->Iterations(1)->ManualThreading()->Threads(2)->UseManualTime();
BENCHMARK(BM_ManualThreadingInLoop)->Iterations(1)->ManualThreading()->Threads(2)->MeasureProcessCPUTime();
BENCHMARK(BM_ManualThreadingInLoop)
->Iterations(1)
->ManualThreading()
->Threads(2)
->MeasureProcessCPUTime()
->UseRealTime();
BENCHMARK(BM_ManualThreadingInLoop)
->Iterations(1)
->ManualThreading()
->Threads(2)
->MeasureProcessCPUTime()
->UseManualTime();
// ========================================================================= //
// BM_ManualThreadingBeforeLoop
// Creation of threads is done before the start of the measurement, joining
// after the finish of the measurement.
// state.threads() threads are created; each one blocks until all threads have
// been created, then runs its own benchmark loop on a ThreadState built from
// `state` and reports time_frame_in_sec as the manual time of each iteration.
void BM_ManualThreadingBeforeLoop(benchmark::State& state) {
  std::promise<void> thread_starter;
  // Several threads wait for the start signal, so a std::shared_future is
  // used and captured by value: every thread then waits through its own copy
  // instead of all threads sharing one std::future object.
  const std::shared_future<void> starter_future =
      thread_starter.get_future().share();

  auto threadedLoop = [&state, starter_future]() {
    starter_future.wait();
    benchmark::ThreadState ts(state);
    for (auto _ : ts) {
      MyBusySpinwait();
      ts.SetIterationTime(time_frame_in_sec);
    }
  };

  std::vector<std::thread> pool(state.threads());
  for (int i = 0; i < state.threads(); ++i) {
    // std::thread stores its own copy of the closure, and with it its own
    // copy of the shared_future.
    pool[i] = std::thread(threadedLoop);
  }
  // Release all threads at once.
  thread_starter.set_value();
  for (int i = 0; i < state.threads(); ++i) {
    pool[i].join();
  }

  state.counters["invtime"] =
      benchmark::Counter{1, benchmark::Counter::kIsRate};
}
// Registrations of BM_ManualThreadingBeforeLoop. Every variant runs exactly
// one iteration with manual threading enabled. The same six timing
// configurations are registered first for 1 thread and then for 2 threads:
// default timing, UseRealTime, UseManualTime, MeasureProcessCPUTime, and
// MeasureProcessCPUTime combined with UseRealTime or UseManualTime.

// --- 1 thread ---
BENCHMARK(BM_ManualThreadingBeforeLoop)->Iterations(1)->ManualThreading()->Threads(1);
BENCHMARK(BM_ManualThreadingBeforeLoop)->Iterations(1)->ManualThreading()->Threads(1)->UseRealTime();
BENCHMARK(BM_ManualThreadingBeforeLoop)->Iterations(1)->ManualThreading()->Threads(1)->UseManualTime();
BENCHMARK(BM_ManualThreadingBeforeLoop)->Iterations(1)->ManualThreading()->Threads(1)->MeasureProcessCPUTime();
BENCHMARK(BM_ManualThreadingBeforeLoop)
->Iterations(1)
->ManualThreading()
->Threads(1)
->MeasureProcessCPUTime()
->UseRealTime();
BENCHMARK(BM_ManualThreadingBeforeLoop)
->Iterations(1)
->ManualThreading()
->Threads(1)
->MeasureProcessCPUTime()
->UseManualTime();
// --- 2 threads ---
BENCHMARK(BM_ManualThreadingBeforeLoop)->Iterations(1)->ManualThreading()->Threads(2);
BENCHMARK(BM_ManualThreadingBeforeLoop)->Iterations(1)->ManualThreading()->Threads(2)->UseRealTime();
BENCHMARK(BM_ManualThreadingBeforeLoop)->Iterations(1)->ManualThreading()->Threads(2)->UseManualTime();
BENCHMARK(BM_ManualThreadingBeforeLoop)->Iterations(1)->ManualThreading()->Threads(2)->MeasureProcessCPUTime();
BENCHMARK(BM_ManualThreadingBeforeLoop)
->Iterations(1)
->ManualThreading()
->Threads(2)
->MeasureProcessCPUTime()
->UseRealTime();
BENCHMARK(BM_ManualThreadingBeforeLoop)
->Iterations(1)
->ManualThreading()
->Threads(2)
->MeasureProcessCPUTime()
->UseManualTime();
// ========================================================================= //
// ---------------------------- TEST CASES END ----------------------------- //
// ========================================================================= //
// Test entry point: forwards the command line to RunOutputTests (declared
// outside this file) and exits with status 0 once it returns.
int main(int argc, char* argv[]) {
  RunOutputTests(argc, argv);
  return 0;
}