#include "benchmark/benchmark.h"
#include "benchmark/macros.h"
#include "colorprint.h"
#include "commandlineflags.h"
#include "mutex_lock.h"
#include "sleep.h"
#include "stat.h"
#include "sysinfo.h"
#include "walltime.h"

// NOTE(review): the system header names and all template arguments were
// missing from this copy of the file. They are reconstructed below from the
// names this translation unit uses -- confirm against the project headers.
#include <math.h>
#include <pthread.h>
#include <semaphore.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#if defined OS_FREEBSD
#include <gnuregex.h>
#else
#include <regex.h>
#endif

#include <algorithm>
#include <atomic>
#include <iostream>
#include <limits>
#include <memory>
#include <sstream>
#include <string>
#include <vector>

DEFINE_string(benchmark_filter, ".",
              "A regular expression that specifies the set of benchmarks "
              "to execute. If this flag is empty, no benchmarks are run. "
              "If this flag is the string \"all\", all benchmarks linked "
              "into the process are run.");

DEFINE_int32(benchmark_min_iters, 100,
             "Minimum number of iterations per benchmark");

DEFINE_int32(benchmark_max_iters, 1000000000,
             "Maximum number of iterations per benchmark");

DEFINE_double(benchmark_min_time, 0.5,
              "Minimum number of seconds we should run benchmark before "
              "results are considered significant. For cpu-time based "
              "tests, this is the lower bound on the total cpu time "
              "used by all threads that make up the test. For real-time "
              "based tests, this is the lower bound on the elapsed time "
              "of the benchmark execution, regardless of number of "
              "threads.");

DEFINE_bool(benchmark_memory_usage, false,
            "Report memory usage for all benchmarks");

DEFINE_int32(benchmark_repetitions, 1,
             "The number of runs of each benchmark. If greater than 1, the "
             "mean and standard deviation of the runs will be reported.");

DEFINE_int32(v, 0, "The level of verbose logging to output");
DEFINE_bool(color_print, true, "Enables colorized logging.");

// Will be non-empty if heap checking is turned on, which would
// invalidate any benchmarks.
DECLARE_string(heap_check);

// The ""'s catch people who don't pass in a literal for "str"
#define strliterallen(str) (sizeof("" str "")-1)

// Must use a string literal for prefix.
// Evaluates to a pointer just past `prefix` inside `str` when the first `len`
// bytes of `str` start with `prefix`, and to NULL otherwise.
#define memprefix(str, len, prefix) \
  ( (((len) >= strliterallen(prefix)) \
     && memcmp(str, prefix, strliterallen(prefix)) == 0) \
    ? str + strliterallen(prefix) \
    : NULL )

namespace benchmark {

namespace {

// kilo, Mega, Giga, Tera, Peta, Exa, Zetta, Yotta.
static const char kBigSIUnits[] = "kMGTPEZY";
// Kibi, Mebi, Gibi, Tebi, Pebi, Exbi, Zebi, Yobi.
static const char kBigIECUnits[] = "KMGTPEZY";
// milli, micro, nano, pico, femto, atto, zepto, yocto.
static const char kSmallSIUnits[] = "munpfazy";

// We require that all three arrays have the same size.
STATIC_ASSERT(arraysize(kBigSIUnits) == arraysize(kBigIECUnits),
              SI_and_IEC_unit_arrays_must_be_the_same_size);
STATIC_ASSERT(arraysize(kSmallSIUnits) == arraysize(kBigSIUnits),
              Small_SI_and_Big_SI_unit_arrays_must_be_the_same_size);

// Number of usable unit letters. arraysize() counts the terminating NUL of
// the string literal, which is not a unit, so it is excluded here.
static const int kUnitsSize = arraysize(kBigSIUnits) - 1;

// Splits `val` into a printable mantissa and a power-of-`one_k` exponent.
//
//   val       - value to render; a leading "-" is emitted for negatives.
//   thresh    - mantissas are scaled until they fall at or below
//               thresh * one_k (large values) or reach thresh (small values).
//   precision - only used to keep the threshold from dropping below
//               10^-precision; the stream's own precision is left untouched.
//   one_k     - the scaling base (callers in this file pass 1024).
//   mantissa  - out: textual mantissa.
//   exponent  - out: how many times the value was divided (positive) or
//               multiplied (negative) by one_k; 0 when no unit fits.
void ToExponentAndMantissa(double val, double thresh, int precision,
                           double one_k, std::string* mantissa,
                           int* exponent) {
  std::stringstream mantissa_stream;

  if (val < 0) {
    mantissa_stream << "-";
    val = -val;
  }

  // Adjust threshold so that it never excludes things which can't be rendered
  // in 'precision' digits.
  const double adjusted_threshold =
      std::max(thresh, 1.0 / pow(10.0, precision));
  const double big_threshold = adjusted_threshold * one_k;
  const double small_threshold = adjusted_threshold;

  if (val > big_threshold) {
    // Positive powers
    double scaled = val;
    for (int i = 0; i < kUnitsSize; ++i) {
      scaled /= one_k;
      if (scaled <= big_threshold) {
        mantissa_stream << scaled;
        *exponent = i + 1;
        *mantissa = mantissa_stream.str();
        return;
      }
    }
    // Larger than the biggest unit: print the raw value.
    mantissa_stream << val;
    *exponent = 0;
  } else if (val < small_threshold) {
    // Negative powers
    double scaled = val;
    for (int i = 0; i < kUnitsSize; ++i) {
      scaled *= one_k;
      if (scaled >= small_threshold) {
        mantissa_stream << scaled;
        *exponent = -i - 1;
        *mantissa = mantissa_stream.str();
        return;
      }
    }
    // Smaller than the smallest unit (this includes 0): print the raw value.
    mantissa_stream << val;
    *exponent = 0;
  } else {
    mantissa_stream << val;
    *exponent = 0;
  }
  *mantissa = mantissa_stream.str();
}

// Maps an exponent produced by ToExponentAndMantissa to its unit prefix.
// Returns "" for exponent 0 or an exponent beyond the unit tables. When `iec`
// is true, an "i" is appended to the unit letter.
std::string ExponentToPrefix(int exponent, bool iec) {
  if (exponent == 0) return "";

  const int index = (exponent > 0 ? exponent - 1 : -exponent - 1);
  if (index >= kUnitsSize) return "";

  const char* array =
      (exponent > 0 ? (iec ? kBigIECUnits : kBigSIUnits) : kSmallSIUnits);
  if (iec)
    return array[index] + std::string("i");
  else
    return std::string(1, array[index]);
}

// Renders `value` scaled by powers of 1024, followed by the plain SI unit
// letter (the IEC "i" suffix is not requested).
std::string ToBinaryStringFullySpecified(double value, double threshold,
                                         int precision) {
  std::string mantissa;
  int exponent;
  ToExponentAndMantissa(value, threshold, precision, 1024., &mantissa,
                        &exponent);
  return mantissa + ExponentToPrefix(exponent, false);
}

// Appends "/<n in human-readable form>" to *str.
inline void AppendHumanReadable(int n, std::string* str) {
  std::stringstream ss;
  // Round down to the nearest SI prefix.
  ss << "/" << ToBinaryStringFullySpecified(n, 1.0, 0);
  *str += ss.str();
}

inline std::string HumanReadableNumber(double n) {
  // 1.1 means that figures up to 1.1k should be shown with the next unit down;
  // this softens edge effects.
  // 1 means that we should show one decimal place of precision.
  return ToBinaryStringFullySpecified(n, 1.1, 1);
}

}  // end namespace

namespace internal {

// Byte/item counters accumulated by a benchmark thread.
struct Benchmark::ThreadStats {
  int64_t bytes_processed;
  int64_t items_processed;

  ThreadStats() { Reset(); }

  void Reset() {
    bytes_processed = 0;
    items_processed = 0;
  }

  void Add(const ThreadStats& other) {
    bytes_processed += other.bytes_processed;
    items_processed += other.items_processed;
  }
};

}  // end namespace internal

namespace {

// Per-thread stats
pthread_key_t thread_stats_key;
internal::Benchmark::ThreadStats* thread_stats = nullptr;

// For non-dense Range, intermediate values are powers of kRangeMultiplier.
static const int kRangeMultiplier = 8;

// List of all registered benchmarks.  Note that each registered
// benchmark identifies a family of related benchmarks to run.
// The mutex is statically initialised because Benchmark's constructor locks
// it, while the explicit pthread_mutex_init() call only happens in
// Initialize().
static pthread_mutex_t benchmark_mutex = PTHREAD_MUTEX_INITIALIZER;
static std::vector<internal::Benchmark*>* families = NULL;

bool running_benchmark = false;

// Should this benchmark report memory usage?
bool get_memory_usage;

// Should this benchmark base decisions off of real time rather than
// cpu time?
bool use_real_time;

// Overhead of an empty benchmark.
double overhead = 0.0;

// Destructor callback registered with pthread_key_create().
void DeleteThreadStats(void* p) {
  delete static_cast<internal::Benchmark::ThreadStats*>(p);
}

// Return prefix to print in front of each reported line
const char* Prefix() {
#ifdef NDEBUG
  return "";
#else
  return "DEBUG: ";
#endif
}

// TODO
//static internal::MallocCounter *benchmark_mc;

// Returns true if any CPU whose governor file can be read reports a governor
// that does not start with "performance".
static bool CpuScalingEnabled() {
  // On Linux, the CPUfreq subsystem exposes CPU information as files on the
  // local file system. If reading the exported files fails, then we may not be
  // running on Linux, so we silently ignore all the read errors.
  for (int cpu = 0, num_cpus = NumCPUs(); cpu < num_cpus; ++cpu) {
    std::stringstream ss;
    ss << "/sys/devices/system/cpu/cpu" << cpu << "/cpufreq/scaling_governor";
    std::string governor_file = ss.str();
    FILE* file = fopen(governor_file.c_str(), "r");
    if (!file) break;
    char buff[16];
    size_t bytes_read = fread(buff, 1, sizeof(buff), file);
    fclose(file);
    if (memprefix(buff, bytes_read, "performance") == NULL) return true;
  }
  return false;
}

}  // namespace

namespace internal {

BenchmarkReporter::~BenchmarkReporter() {}

// Aggregates the repeated runs in `reports` (which must be non-empty and all
// carry the same benchmark name) into a "<name>_mean" and a "<name>_stddev"
// record. Each run is weighted by its iteration count.
void ComputeStats(const std::vector<BenchmarkRunData>& reports,
                  BenchmarkRunData* mean_data, BenchmarkRunData* stddev_data) {
  // Accumulators.
  Stat1_d real_accumulated_time_stat;
  Stat1_d cpu_accumulated_time_stat;
  Stat1_d bytes_per_second_stat;
  Stat1_d items_per_second_stat;
  Stat1MinMax_d max_heapbytes_used_stat;
  int total_iters = 0;

  // Populate the accumulators.
  for (const BenchmarkRunData& run : reports) {
    CHECK_EQ(reports[0].benchmark_name, run.benchmark_name);
    total_iters += run.iterations;
    real_accumulated_time_stat +=
        Stat1_d(run.real_accumulated_time / run.iterations, run.iterations);
    cpu_accumulated_time_stat +=
        Stat1_d(run.cpu_accumulated_time / run.iterations, run.iterations);
    items_per_second_stat += Stat1_d(run.items_per_second, run.iterations);
    bytes_per_second_stat += Stat1_d(run.bytes_per_second, run.iterations);
    max_heapbytes_used_stat +=
        Stat1MinMax_d(run.max_heapbytes_used, run.iterations);
  }

  // Get the data from the accumulator to BenchmarkRunData's.
  mean_data->benchmark_name = reports[0].benchmark_name + "_mean";
  mean_data->iterations = total_iters;
  mean_data->real_accumulated_time = real_accumulated_time_stat.Sum();
  mean_data->cpu_accumulated_time = cpu_accumulated_time_stat.Sum();
  mean_data->bytes_per_second = bytes_per_second_stat.Mean();
  mean_data->items_per_second = items_per_second_stat.Mean();
  mean_data->max_heapbytes_used = max_heapbytes_used_stat.Max();

  // Only add label to mean/stddev if it is same for all runs
  mean_data->report_label = reports[0].report_label;
  for (size_t i = 1; i < reports.size(); i++) {
    if (reports[i].report_label != reports[0].report_label) {
      mean_data->report_label = "";
      break;
    }
  }

  stddev_data->benchmark_name = reports[0].benchmark_name + "_stddev";
  stddev_data->report_label = mean_data->report_label;
  stddev_data->iterations = total_iters;
  // We multiply by total_iters since PrintRunData expects a total time.
  stddev_data->real_accumulated_time =
      real_accumulated_time_stat.StdDev() * total_iters;
  stddev_data->cpu_accumulated_time =
      cpu_accumulated_time_stat.StdDev() * total_iters;
  stddev_data->bytes_per_second = bytes_per_second_stat.StdDev();
  stddev_data->items_per_second = items_per_second_stat.StdDev();
  stddev_data->max_heapbytes_used = max_heapbytes_used_stat.StdDev();
}

// Formats the peak-memory column; empty when memory reporting is off or
// `bytes` is negative.
std::string ConsoleReporter::PrintMemoryUsage(double bytes) {
  if (!get_memory_usage || bytes < 0.0) return "";

  std::stringstream ss;
  ss << " " << HumanReadableNumber(bytes) << "B peak-mem";
  return ss.str();
}

// Prints the run header (CPU count/speed, timestamp, column titles) and
// remembers the name column width for later rows. Always returns true.
bool ConsoleReporter::ReportContext(const BenchmarkContextData& context) {
  name_field_width_ = context.name_field_width;

  std::cout << "Benchmarking on " << context.num_cpus << " X "
            << context.mhz_per_cpu << " MHz CPU"
            << ((context.num_cpus > 1) ? "s" : "") << "\n";

  int remainder_ms;
  char time_buf[32];
  std::cout << walltime::Print(walltime::Now(), "%Y/%m/%d-%H:%M:%S",
                               true,  // use local timezone
                               time_buf, &remainder_ms)
            << "\n";

  // Show details of CPU model, caches, TLBs etc.
  //  if (!context.cpu_info.empty())
  //    std::cout << "CPU: " << context.cpu_info.c_str();

  if (context.cpu_scaling_enabled) {
    std::cerr << "CPU scaling is enabled: Benchmark timings may be noisy.\n";
  }

  int output_width = fprintf(stdout, "%s%-*s %10s %10s %10s\n", Prefix(),
                             name_field_width_, "Benchmark", "Time(ns)",
                             "CPU(ns)", "Iterations");
  // fprintf returns a negative value on failure; only draw the rule when a
  // header was actually written, so the std::string length stays sane.
  if (output_width > 1) {
    std::cout << std::string(output_width - 1, '-') << "\n";
  }

  return true;
}

// Prints one row per run and, when there is more than one run, the mean and
// standard-deviation rows as well.
void ConsoleReporter::ReportRuns(const std::vector<BenchmarkRunData>& reports) {
  for (const BenchmarkRunData& run : reports) {
    CHECK_EQ(reports[0].benchmark_name, run.benchmark_name);
    PrintRunData(run);
  }

  // We don't report aggregated data if there was a single run.
  if (reports.size() < 2) return;

  BenchmarkRunData mean_data;
  BenchmarkRunData stddev_data;
  internal::ComputeStats(reports, &mean_data, &stddev_data);

  // Output using PrintRun.
  PrintRunData(mean_data);
  PrintRunData(stddev_data);
  fprintf(stdout, "\n");
}

// Prints a single result row: name, per-iteration real and CPU time in
// nanoseconds, iteration count, optional throughput figures, label and
// memory usage.
void ConsoleReporter::PrintRunData(const BenchmarkRunData& result) {
  // Format bytes per second
  std::string rate;
  if (result.bytes_per_second > 0) {
    std::stringstream ss;
    ss << " " << HumanReadableNumber(result.bytes_per_second) << "B/s";
    rate = ss.str();
  }

  // Format items per second
  std::string items;
  if (result.items_per_second > 0) {
    std::stringstream ss;
    ss << " " << HumanReadableNumber(result.items_per_second) << " items/s";
    items = ss.str();
  }

  ColorPrintf(COLOR_DEFAULT, "%s", Prefix());
  ColorPrintf(COLOR_GREEN, "%-*s ", name_field_width_,
              result.benchmark_name.c_str());
  ColorPrintf(COLOR_YELLOW, "%10.0f %10.0f ",
              (result.real_accumulated_time * 1e9) /
                  (static_cast<double>(result.iterations)),
              (result.cpu_accumulated_time * 1e9) /
                  (static_cast<double>(result.iterations)));
  // Cast explicitly so the argument always matches the %lld conversion.
  ColorPrintf(COLOR_CYAN, "%10lld",
              static_cast<long long>(result.iterations));
  ColorPrintf(COLOR_DEFAULT, "%*s %s %s%s\n", 16, rate.c_str(), items.c_str(),
              result.report_label.c_str(),
              PrintMemoryUsage(result.max_heapbytes_used).c_str());
}

// Requests memory-usage reporting for the running benchmark.
void MemoryUsage() {
  //if (benchmark_mc) {
  //  benchmark_mc->Reset();
  //} else {
  get_memory_usage = true;
  //}
}

// Switches the running benchmark to real-time based measurement.
void UseRealTime() { use_real_time = true; }

// Prints the supported command-line flags and exits with status 0.
void PrintUsageAndExit() {
  fprintf(stdout,
          "benchmark [--benchmark_filter=<regex>]\n"
          "          [--benchmark_min_iters=<min_iters>]\n"
          "          [--benchmark_max_iters=<max_iters>]\n"
          "          [--benchmark_min_time=<min_time>]\n"
          //  "          [--benchmark_memory_usage]\n"
          "          [--benchmark_repetitions=<num_repetitions>]\n"
          "          [--color_print={true|false}]\n"
          "          [--v=<verbosity>]\n");
  exit(0);
}

// Consumes the flags this library understands from argv, compacting argv and
// decrementing *argc for each one, and handles --help.
void ParseCommandLineFlags(int* argc, const char** argv) {
  for (int i = 1; i < *argc; ++i) {
    if (ParseStringFlag(argv[i], "benchmark_filter",
                        &FLAGS_benchmark_filter) ||
        ParseInt32Flag(argv[i], "benchmark_min_iters",
                       &FLAGS_benchmark_min_iters) ||
        ParseInt32Flag(argv[i], "benchmark_max_iters",
                       &FLAGS_benchmark_max_iters) ||
        ParseDoubleFlag(argv[i], "benchmark_min_time",
                        &FLAGS_benchmark_min_time) ||
        // TODO(dominic)
        //        ParseBoolFlag(argv[i], "gbenchmark_memory_usage",
        //                      &FLAGS_gbenchmark_memory_usage) ||
        ParseInt32Flag(argv[i], "benchmark_repetitions",
                       &FLAGS_benchmark_repetitions) ||
        ParseBoolFlag(argv[i], "color_print", &FLAGS_color_print) ||
        ParseInt32Flag(argv[i], "v", &FLAGS_v)) {
      // Shift the remaining arguments (including the element at argv[*argc])
      // down by one to drop the consumed flag.
      for (int j = i; j != *argc; ++j) argv[j] = argv[j + 1];

      --(*argc);
      --i;
    } else if (IsFlag(argv[i], "help")) {
      PrintUsageAndExit();
    }
  }
}

}  // end namespace internal

// A clock that provides a fast mechanism to check if we're nearly done.
//
// A background thread refreshes approx_time_ roughly once per millisecond;
// HasReached() only compares against that cached value. Both fields touched
// by the background thread are atomics so the cross-thread reads and writes
// are well defined.
class State::FastClock {
 public:
  enum Type { REAL_TIME, CPU_TIME };

  explicit FastClock(Type type) : type_(type), approx_time_(NowMicros()) {
    sem_init(&bg_done_, 0, 0);
    pthread_create(&bg_, NULL, &BGThreadWrapper, this);
  }

  ~FastClock() {
    // Signal the background thread to stop, then wait for it.
    sem_post(&bg_done_);
    pthread_join(bg_, NULL);
    sem_destroy(&bg_done_);
  }

  // Returns true if the current time is guaranteed to be past "when_micros".
  // This method is very fast.
  inline bool HasReached(int64_t when_micros) {
    return approx_time_.load(std::memory_order_relaxed) >= when_micros;
  }

  // Returns the current time in microseconds past the epoch.
  int64_t NowMicros() const {
    double t = 0;
    switch (type_.load(std::memory_order_relaxed)) {
      case REAL_TIME:
        t = walltime::Now();
        break;
      case CPU_TIME:
        t = MyCPUUsage() + ChildrenCPUUsage();
        break;
    }
    return static_cast<int64_t>(t * 1e6);
  }

  // Reinitialize if necessary (since clock type may be change once benchmark
  // function starts running - see UseRealTime).
  void InitType(Type type) {
    type_.store(type, std::memory_order_relaxed);
    approx_time_.store(NowMicros(), std::memory_order_release);
  }

 private:
  std::atomic<Type> type_;
  std::atomic<int64_t> approx_time_;  // Last time measurement taken by bg_
  pthread_t bg_;  // Background thread that updates last_time_ once every ms

  sem_t bg_done_;

  static void* BGThreadWrapper(void* that) {
    static_cast<FastClock*>(that)->BGThread();
    return NULL;
  }

  // Refreshes approx_time_ until the destructor posts bg_done_.
  void BGThread() {
    int done = 0;
    do {
      SleepForMicroseconds(1000);
      approx_time_.store(NowMicros(), std::memory_order_release);
      sem_getvalue(&bg_done_, &done);
    } while (done == 0);
  }

  DISALLOW_COPY_AND_ASSIGN(FastClock);
};

namespace internal {

const int Benchmark::kNumCpuMarker;

// Information kept per benchmark we may want to run
struct Benchmark::Instance {
  Instance()
      : bm(nullptr),
        rangeXset(false),
        rangeX(kNoRange),
        rangeYset(false),
        rangeY(kNoRange),
        threads(1) {}

  std::string name;
  Benchmark* bm;
  bool rangeXset;
  int rangeX;
  bool rangeYset;
  int rangeY;
  int threads;  // Number of concurrent threads to use

  bool multithreaded() const { return !bm->thread_counts_.empty(); }
};

}  // end namespace internal

// State shared by all threads running one benchmark instance; `mu` guards the
// mutable members.
struct State::SharedState {
  const internal::Benchmark::Instance* instance;
  pthread_mutex_t mu;
  int starting;  // Number of threads that have entered STARTING state
  int stopping;  // Number of threads that have entered STOPPING state
  int threads;   // Number of total threads that are running concurrently
  internal::Benchmark::ThreadStats stats;
  std::vector<internal::BenchmarkRunData> runs;  // accumulated runs
  std::string label;

  SharedState(const internal::Benchmark::Instance* b, int t)
      : instance(b), starting(0), stopping(0), threads(t) {
    // The mutex must be initialised before any mutex_lock is taken on it.
    pthread_mutex_init(&mu, nullptr);
  }

  ~SharedState() { pthread_mutex_destroy(&mu); }

  DISALLOW_COPY_AND_ASSIGN(SharedState);
};

namespace internal {

// Registers this benchmark family in the global `families` list.
Benchmark::Benchmark(const char* name, BenchmarkFunction f)
    : name_(name), function_(f) {
  mutex_lock l(&benchmark_mutex);
  if (families == nullptr) families = new std::vector<Benchmark*>;
  registration_index_ = families->size();
  families->push_back(this);
}

// Clears this family's slot in the global list.
Benchmark::~Benchmark() {
  mutex_lock l(&benchmark_mutex);
  CHECK((*families)[registration_index_] == this);
  (*families)[registration_index_] = NULL;
  // Shrink the vector if convenient.
  while (!families->empty() && families->back() == NULL) families->pop_back();
}

Benchmark* Benchmark::Arg(int x) {
  mutex_lock l(&benchmark_mutex);
  rangeX_.push_back(x);
  return this;
}

Benchmark* Benchmark::Range(int start, int limit) {
  std::vector<int> arglist;
  AddRange(&arglist, start, limit, kRangeMultiplier);

  mutex_lock l(&benchmark_mutex);
  for (size_t i = 0; i < arglist.size(); ++i) rangeX_.push_back(arglist[i]);
  return this;
}

Benchmark* Benchmark::DenseRange(int start, int limit) {
  CHECK_GE(start, 0);
  CHECK_LE(start, limit);
  mutex_lock l(&benchmark_mutex);
  for (int arg = start; arg <= limit; ++arg) rangeX_.push_back(arg);
  return this;
}

Benchmark* Benchmark::ArgPair(int x, int y) {
  mutex_lock l(&benchmark_mutex);
  rangeX_.push_back(x);
  rangeY_.push_back(y);
  return this;
}

// Note: unlike the other setters, this replaces the contents of rangeX_ and
// rangeY_ rather than appending to them.
Benchmark* Benchmark::RangePair(int lo1, int hi1, int lo2, int hi2) {
  std::vector<int> arglist1, arglist2;
  AddRange(&arglist1, lo1, hi1, kRangeMultiplier);
  AddRange(&arglist2, lo2, hi2, kRangeMultiplier);

  mutex_lock l(&benchmark_mutex);
  rangeX_.resize(arglist1.size());
  std::copy(arglist1.begin(), arglist1.end(), rangeX_.begin());
  rangeY_.resize(arglist2.size());
  std::copy(arglist2.begin(), arglist2.end(), rangeY_.begin());
  return this;
}

Benchmark* Benchmark::Apply(void (*custom_arguments)(Benchmark* benchmark)) {
  custom_arguments(this);
  return this;
}

Benchmark* Benchmark::Threads(int t) {
  CHECK_GT(t, 0);
  mutex_lock l(&benchmark_mutex);
  thread_counts_.push_back(t);
  return this;
}

Benchmark* Benchmark::ThreadRange(int min_threads, int max_threads) {
  CHECK_GT(min_threads, 0);
  CHECK_GE(max_threads, min_threads);

  mutex_lock l(&benchmark_mutex);
  AddRange(&thread_counts_, min_threads, max_threads, 2);
  return this;
}

Benchmark* Benchmark::ThreadPerCpu() {
  mutex_lock l(&benchmark_mutex);
  thread_counts_.push_back(kNumCpuMarker);
  return this;
}

// Appends lo, the powers of `mult` strictly between lo and hi, and hi (when
// different from lo) to *dst. `mult` must be greater than 1, otherwise the
// loop variable would never advance.
void Benchmark::AddRange(std::vector<int>* dst, int lo, int hi, int mult) {
  CHECK_GE(lo, 0);
  CHECK_GE(hi, lo);
  CHECK_GT(mult, 1);

  // Add "lo"
  dst->push_back(lo);

  // Now space out the benchmarks in multiples of "mult"
  for (int32_t i = 1; i < std::numeric_limits<int32_t>::max() / mult;
       i *= mult) {
    if (i >= hi) break;
    if (i > lo) dst->push_back(i);
  }
  // Add "hi" (if different from "lo")
  if (hi != lo) dst->push_back(hi);
}

// Builds one Instance per configured thread count (or a single one-thread
// instance when none were configured) for the given argument indices.
// Passing kNoRange for an index leaves that argument unset.
std::vector<Benchmark::Instance> Benchmark::CreateBenchmarkInstances(
    int rangeXindex, int rangeYindex) {
  // Special list of thread counts to use when none are specified
  std::vector<int> one_thread;
  one_thread.push_back(1);

  std::vector<Benchmark::Instance> instances;

  const bool is_multithreaded = (!thread_counts_.empty());
  const std::vector<int>* thread_counts =
      (is_multithreaded ? &thread_counts_ : &one_thread);
  for (size_t t = 0; t < thread_counts->size(); ++t) {
    int num_threads = (*thread_counts)[t];
    if (num_threads == kNumCpuMarker) num_threads = NumCPUs();

    Instance instance;
    instance.name = name_;
    instance.bm = this;
    instance.threads = num_threads;

    if (rangeXindex != kNoRange) {
      instance.rangeX = rangeX_[rangeXindex];
      instance.rangeXset = true;
      AppendHumanReadable(instance.rangeX, &instance.name);
    }
    if (rangeYindex != kNoRange) {
      instance.rangeY = rangeY_[rangeYindex];
      instance.rangeYset = true;
      AppendHumanReadable(instance.rangeY, &instance.name);
    }

    // Add the number of threads used to the name
    if (is_multithreaded) {
      std::stringstream ss;
      ss << "/threads:" << instance.threads;
      instance.name += ss.str();
    }

    instances.push_back(instance);
  }

  return instances;
}

// Extract the list of benchmark instances that match the specified
// regular expression.
void Benchmark::FindBenchmarks(const std::string& spec, std::vector* benchmarks) { // Make regular expression out of command-line flag regex_t re; int ec = regcomp(&re, spec.c_str(), REG_EXTENDED | REG_NOSUB); if (ec != 0) { size_t needed = regerror(ec, &re, NULL, 0); char* errbuf = new char[needed]; regerror(ec, &re, errbuf, needed); std::cerr << "Could not compile benchmark re: " << errbuf << "\n"; delete[] errbuf; return; } mutex_lock l(&benchmark_mutex); for (Benchmark* family : *families) { if (family == nullptr) continue; // Family was deleted // Match against filter. if (regexec(&re, family->name_.c_str(), 0, NULL, 0) != 0) { #ifdef DEBUG std::cout << "Skipping " << family->name_ << "\n"; #endif continue; } std::vector instances; if (family->rangeX_.empty() && family->rangeY_.empty()) { instances = family->CreateBenchmarkInstances(kNoRange, kNoRange); benchmarks->insert(benchmarks->end(), instances.begin(), instances.end()); } else if (family->rangeY_.empty()) { for (size_t x = 0; x < family->rangeX_.size(); ++x) { instances = family->CreateBenchmarkInstances(x, kNoRange); benchmarks->insert(benchmarks->end(), instances.begin(), instances.end()); } } else { for (size_t x = 0; x < family->rangeX_.size(); ++x) { for (size_t y = 0; y < family->rangeY_.size(); ++y) { instances = family->CreateBenchmarkInstances(x, y); benchmarks->insert(benchmarks->end(), instances.begin(), instances.end()); } } } } } void Benchmark::MeasureOverhead() { State::FastClock clock(State::FastClock::CPU_TIME); State::SharedState state(NULL, 1); State runner(&clock, &state, 0); while (runner.KeepRunning()) {} overhead = state.runs[0].real_accumulated_time / static_cast(state.runs[0].iterations); #ifdef DEBUG std::cout << "Per-iteration overhead for doing nothing: " << overhead << "\n"; #endif } void Benchmark::RunInstance(const Instance& b, BenchmarkReporter* br) { use_real_time = false; running_benchmark = true; // get_memory_usage = FLAGS_gbenchmark_memory_usage; State::FastClock 
clock(State::FastClock::CPU_TIME); // Initialize the test runners. State::SharedState state(&b, b.threads); { std::unique_ptr runners[b.threads]; // TODO: create thread objects for (int i = 0; i < b.threads; ++i) runners[i].reset(new State(&clock, &state, i)); // Run them all. for (int i = 0; i < b.threads; ++i) { State* r = runners[i].release(); if (b.multithreaded()) { // TODO: start pthreads (member of state?) and set up thread local // pointers to stats //pool->Add(base::NewCallback(r, &State::Run)); } else { pthread_setspecific(thread_stats_key, thread_stats); r->Run(); } } if (b.multithreaded()) { // TODO: join all the threads //pool->JoinAll(); } } /* double mem_usage = 0; if (get_memory_usage) { // Measure memory usage Notification mem_done; BenchmarkRun mem_run; BenchmarkRun::SharedState mem_shared(&b, 1); mem_run.Init(&clock, &mem_shared, 0); { testing::MallocCounter mc(testing::MallocCounter::THIS_THREAD_ONLY); benchmark_mc = &mc; mem_run.Run(&mem_done); mem_done.WaitForNotification(); benchmark_mc = NULL; mem_usage = mc.PeakHeapGrowth(); } } */ running_benchmark = false; for (internal::BenchmarkRunData& report : state.runs) { double seconds = (use_real_time ? report.real_accumulated_time : report.cpu_accumulated_time); // TODO: add the thread index here? report.benchmark_name = b.name; report.report_label = state.label; report.bytes_per_second = state.stats.bytes_processed / seconds; report.items_per_second = state.stats.items_processed / seconds; report.max_heapbytes_used = MeasurePeakHeapMemory(b); } br->ReportRuns(state.runs); } // Run the specified benchmark, measure its peak memory usage, and // return the peak memory usage. double Benchmark::MeasurePeakHeapMemory(const Instance& b) { if (!get_memory_usage) return 0.0; double bytes = 0.0; /* TODO(dominich) // Should we do multi-threaded runs? 
const int num_threads = 1; const int num_iters = 1; { // internal::MallocCounter mc(internal::MallocCounter::THIS_THREAD_ONLY); running_benchmark = true; timer_manager = new TimerManager(1, NULL); // benchmark_mc = &mc; timer_manager->StartTimer(); b.Run(num_iters); running_benchmark = false; delete timer_manager; timer_manager = NULL; // benchmark_mc = NULL; // bytes = mc.PeakHeapGrowth(); } */ return bytes; } } // end namespace internal State::State(FastClock* clock, SharedState* s, int t) : thread_index(t), state_(STATE_INITIAL), clock_(clock), shared_(s), iterations_(0), start_cpu_(0.0), start_time_(0.0), stop_time_micros_(0.0), start_pause_(0.0), pause_time_(0.0), total_iterations_(0), interval_micros_( static_cast(1e6 * FLAGS_benchmark_min_time / FLAGS_benchmark_repetitions)) { } bool State::KeepRunning() { // Fast path if (!clock_->HasReached(stop_time_micros_ + pause_time_)) { ++iterations_; return true; } switch(state_) { case STATE_INITIAL: return StartRunning(); case STATE_STARTING: CHECK(false); return true; case STATE_RUNNING: return FinishInterval(); case STATE_STOPPING: return MaybeStop(); case STATE_STOPPED: CHECK(false); return true; } CHECK(false); return false; } void State::PauseTiming() { start_pause_ = walltime::Now(); } void State::ResumeTiming() { pause_time_ += walltime::Now() - start_pause_; } void State::SetBytesProcessed(int64_t bytes) { CHECK_EQ(STATE_STOPPED, state_); mutex_lock l(&shared_->mu); internal::Benchmark::ThreadStats* thread_stats = (internal::Benchmark::ThreadStats*) pthread_getspecific(thread_stats_key); thread_stats->bytes_processed = bytes; } void State::SetItemsProcessed(int64_t items) { CHECK_EQ(STATE_STOPPED, state_); mutex_lock l(&shared_->mu); internal::Benchmark::ThreadStats* thread_stats = (internal::Benchmark::ThreadStats*) pthread_getspecific(thread_stats_key); thread_stats->items_processed = items; } void State::SetLabel(const std::string& label) { CHECK_EQ(STATE_STOPPED, state_); mutex_lock l(&shared_->mu); 
shared_->label = label; } int State::range_x() const { CHECK(shared_->instance->rangeXset); /* << "Failed to get range_x as it was not set. Did you register your " "benchmark with a range parameter?"; */ return shared_->instance->rangeX; } int State::range_y() const { CHECK(shared_->instance->rangeYset); /* << "Failed to get range_y as it was not set. Did you register your " "benchmark with a range parameter?"; */ return shared_->instance->rangeY; } bool State::StartRunning() { { mutex_lock l(&shared_->mu); CHECK_EQ(state_, STATE_INITIAL); state_ = STATE_STARTING; is_continuation_ = false; CHECK_LT(shared_->starting, shared_->threads); ++shared_->starting; if (shared_->starting == shared_->threads) { // Last thread to start. clock_->InitType( use_real_time ? FastClock::REAL_TIME : FastClock::CPU_TIME); } else { // Wait for others. // TODO(dominic): semaphore! // while (pthread_getsemaphore(shared_->starting_sem_) != // shared_->threads) { } //shared_->mu.Await(base::Condition(this, &State::AllStarting)); } CHECK_EQ(state_, STATE_STARTING); state_ = STATE_RUNNING; } NewInterval(); return true; } bool State::AllStarting() { CHECK_LE(shared_->starting, shared_->threads); return shared_->starting == shared_->threads; } void State::NewInterval() { stop_time_micros_ = clock_->NowMicros() + interval_micros_; if (!is_continuation_) { #ifdef DEBUG std::cout << "Starting new interval; stopping in " << interval_micros_ << "\n"; #endif iterations_ = 0; pause_time_ = 0; start_cpu_ = MyCPUUsage() + ChildrenCPUUsage(); start_time_ = walltime::Now(); } else { #ifdef DEBUG std::cout << "Continuing interval; stopping in " << interval_micros_ << "\n"; #endif } } bool State::FinishInterval() { if (iterations_ < FLAGS_benchmark_min_iters / FLAGS_benchmark_repetitions && interval_micros_ < 5000000) { interval_micros_ *= 2; #ifdef DEBUG std::cout << "Interval was too short; trying again for " << interval_micros_ << " useconds.\n"; #endif is_continuation_ = false; NewInterval(); return 
true; } internal::BenchmarkRunData data; data.thread_index = thread_index; data.iterations = iterations_; data.thread_index = thread_index; const double accumulated_time = walltime::Now() - start_time_; const double total_overhead = 0.0; // TODO: overhead * iterations_; CHECK_LT(pause_time_, accumulated_time); CHECK_LT(pause_time_ + total_overhead, accumulated_time); data.real_accumulated_time = accumulated_time - (pause_time_ + total_overhead); data.cpu_accumulated_time = (MyCPUUsage() + ChildrenCPUUsage()) - start_cpu_; total_iterations_ += iterations_; bool keep_going = false; { mutex_lock l(&shared_->mu); if (is_continuation_) shared_->runs.back() = data; else shared_->runs.push_back(data); keep_going = RunAnotherInterval(); if (!keep_going) { ++shared_->stopping; if (shared_->stopping < shared_->threads) { // Other threads are still running, so continue running but without // timing to present an expected background load to the other threads. state_ = STATE_STOPPING; keep_going = true; } else { state_ = STATE_STOPPED; } } } if (state_ == STATE_RUNNING) { is_continuation_ = true; NewInterval(); } return keep_going; } bool State::RunAnotherInterval() const { if (total_iterations_ < FLAGS_benchmark_min_iters) return true; if (total_iterations_ > FLAGS_benchmark_max_iters) return false; if (static_cast(shared_->runs.size()) >= FLAGS_benchmark_repetitions) return false; return true; } bool State::MaybeStop() { mutex_lock l(&shared_->mu); if (shared_->stopping < shared_->threads) { CHECK_EQ(state_, STATE_STOPPING); return true; } state_ = STATE_STOPPED; return false; } void State::Run() { internal::Benchmark::ThreadStats* thread_stats = (internal::Benchmark::ThreadStats*) pthread_getspecific(thread_stats_key); thread_stats->Reset(); shared_->instance->bm->function_(*this); { mutex_lock l(&shared_->mu); shared_->stats.Add(*thread_stats); } } namespace internal { void RunMatchingBenchmarks(const std::string& spec, BenchmarkReporter* reporter) { CHECK(reporter != 
NULL); if (spec.empty()) return; std::vector benchmarks; internal::Benchmark::FindBenchmarks(spec, &benchmarks); // Determine the width of the name field using a minimum width of 10. // Also determine max number of threads needed. int name_field_width = 10; for (const internal::Benchmark::Instance& benchmark : benchmarks) { // Add width for _stddev and threads:XX if (benchmark.threads > 1 && FLAGS_benchmark_repetitions > 1) { name_field_width = std::max(name_field_width, benchmark.name.size() + 17); } else if (benchmark.threads> 1) { name_field_width = std::max(name_field_width, benchmark.name.size() + 10); } else if (FLAGS_benchmark_repetitions > 1) { name_field_width = std::max(name_field_width, benchmark.name.size() + 7); } else { name_field_width = std::max(name_field_width, benchmark.name.size()); } } // Print header here BenchmarkContextData context; context.num_cpus = NumCPUs(); context.mhz_per_cpu = CyclesPerSecond() / 1000000.0f; // context.cpu_info = base::CompactCPUIDInfoString(); context.cpu_scaling_enabled = CpuScalingEnabled(); context.name_field_width = name_field_width; if (reporter->ReportContext(context)) { for (internal::Benchmark::Instance& benchmark : benchmarks) { //std::unique_ptr pool; //if (benchmark.threads > 0) { // pool = new thread::ThreadPool(benchmark.threads); // pool->StartWorkers(); //} Benchmark::RunInstance(/*pool, */benchmark, reporter); } } } void FindMatchingBenchmarkNames(const std::string& spec, std::vector* benchmark_names) { if (spec.empty()) return; std::vector benchmarks; internal::Benchmark::FindBenchmarks(spec, &benchmarks); std::transform(benchmarks.begin(), benchmarks.end(), benchmark_names->begin(), [] (const internal::Benchmark::Instance& b) { return b.name; } ); } } // end namespace internal void RunSpecifiedBenchmarks() { std::string spec = FLAGS_benchmark_filter; if (spec.empty() || spec == "all") spec = "."; // Regexp that matches all benchmarks internal::ConsoleReporter default_reporter; 
internal::RunMatchingBenchmarks(spec, &default_reporter); } void Initialize(int* argc, const char** argv) { //AtomicOps_Internalx86CPUFeaturesInit(); pthread_mutex_init(&benchmark_mutex, nullptr); pthread_key_create(&thread_stats_key, DeleteThreadStats); thread_stats = new internal::Benchmark::ThreadStats(); walltime::Initialize(); internal::Benchmark::MeasureOverhead(); internal::ParseCommandLineFlags(argc, argv); } } // end namespace benchmark