Mirror of https://github.com/google/benchmark.git (synced 2024-12-27 13:00:36 +08:00)

Merge pull request #98 from google/reporter_change
move reporter internals in both headers and source
Commit: b260cf7698

include/benchmark/benchmark.h
@@ -1,541 +1,21 @@
// Support for registering benchmarks for functions.

/* Example usage:
// Define a function that executes the code to be measured a
// specified number of times:
static void BM_StringCreation(benchmark::State& state) {
  while (state.KeepRunning())
    std::string empty_string;
}

// Register the function as a benchmark
BENCHMARK(BM_StringCreation);

// Define another benchmark
static void BM_StringCopy(benchmark::State& state) {
  std::string x = "hello";
  while (state.KeepRunning())
    std::string copy(x);
}
BENCHMARK(BM_StringCopy);

// Augment the main() program to invoke benchmarks if specified
// via the --benchmark_filter command line flag. E.g.,
//   my_unittest --benchmark_filter=all
//   my_unittest --benchmark_filter=BM_StringCreation
//   my_unittest --benchmark_filter=String
//   my_unittest --benchmark_filter='Copy|Creation'
int main(int argc, char** argv) {
  benchmark::Initialize(&argc, argv);
  benchmark::RunSpecifiedBenchmarks();
  return 0;
}

// Sometimes a family of microbenchmarks can be implemented with
// just one routine that takes an extra argument to specify which
// one of the family of benchmarks to run. For example, the following
// code defines a family of microbenchmarks for measuring the speed
// of memcpy() calls of different lengths:

static void BM_memcpy(benchmark::State& state) {
  char* src = new char[state.range_x()]; char* dst = new char[state.range_x()];
  memset(src, 'x', state.range_x());
  while (state.KeepRunning())
    memcpy(dst, src, state.range_x());
  state.SetBytesProcessed(static_cast<int64_t>(state.iterations()) *
                          static_cast<int64_t>(state.range_x()));
  delete[] src; delete[] dst;
}
BENCHMARK(BM_memcpy)->Arg(8)->Arg(64)->Arg(512)->Arg(1<<10)->Arg(8<<10);

// The preceding code is quite repetitive, and can be replaced with the
// following short-hand. The following invocation will pick a few
// appropriate arguments in the specified range and will generate a
// microbenchmark for each such argument.
BENCHMARK(BM_memcpy)->Range(8, 8<<10);

// You might have a microbenchmark that depends on two inputs. For
// example, the following code defines a family of microbenchmarks for
// measuring the speed of set insertion.
static void BM_SetInsert(benchmark::State& state) {
  while (state.KeepRunning()) {
    state.PauseTiming();
    set<int> data = ConstructRandomSet(state.range_x());
    state.ResumeTiming();
    for (int j = 0; j < state.range_y(); ++j)
      data.insert(RandomNumber());
  }
}
BENCHMARK(BM_SetInsert)
    ->ArgPair(1<<10, 1)
    ->ArgPair(1<<10, 8)
    ->ArgPair(1<<10, 64)
    ->ArgPair(1<<10, 512)
    ->ArgPair(8<<10, 1)
    ->ArgPair(8<<10, 8)
    ->ArgPair(8<<10, 64)
    ->ArgPair(8<<10, 512);

// The preceding code is quite repetitive, and can be replaced with
// the following short-hand. The following macro will pick a few
// appropriate arguments in the product of the two specified ranges
// and will generate a microbenchmark for each such pair.
BENCHMARK(BM_SetInsert)->RangePair(1<<10, 8<<10, 1, 512);

// For more complex patterns of inputs, passing a custom function
// to Apply allows programmatic specification of an
// arbitrary set of arguments to run the microbenchmark on.
// The following example enumerates a dense range on
// one parameter, and a sparse range on the second.
static void CustomArguments(benchmark::internal::Benchmark* b) {
  for (int i = 0; i <= 10; ++i)
    for (int j = 32; j <= 1024*1024; j *= 8)
      b->ArgPair(i, j);
}
BENCHMARK(BM_SetInsert)->Apply(CustomArguments);

// Templated microbenchmarks work the same way:
// Produce then consume 'size' messages 'iters' times
// Measures throughput in the absence of multiprogramming.
template <class Q> void BM_Sequential(benchmark::State& state) {
  Q q;
  typename Q::value_type v;
  while (state.KeepRunning()) {
    for (int i = state.range_x(); i--; )
      q.push(v);
    for (int e = state.range_x(); e--; )
      q.Wait(&v);
  }
  // actually messages, not bytes:
  state.SetBytesProcessed(
      static_cast<int64_t>(state.iterations())*state.range_x());
}
BENCHMARK_TEMPLATE(BM_Sequential, WaitQueue<int>)->Range(1<<0, 1<<10);

In a multithreaded test, it is guaranteed that none of the threads will start
until all have called KeepRunning, and all will have finished before KeepRunning
returns false. As such, any global setup or teardown you want to do can be
wrapped in a check against the thread index:

static void BM_MultiThreaded(benchmark::State& state) {
  if (state.thread_index == 0) {
    // Setup code here.
  }
  while (state.KeepRunning()) {
    // Run the test as normal.
  }
  if (state.thread_index == 0) {
    // Teardown code here.
  }
}
BENCHMARK(BM_MultiThreaded)->Threads(4);
*/

// Copyright 2015 Google Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef BENCHMARK_BENCHMARK_H_
#define BENCHMARK_BENCHMARK_H_

#include <cassert>
#include <cstddef>
#include <cstdint>

#include <string>
#include <utility>
#include <vector>

#include "macros.h"
#include "benchmark_api.h"
#include "reporter.h"

namespace benchmark {
class BenchmarkReporter;

void Initialize(int* argc, const char** argv);

// Run all benchmarks specified by the --benchmark_filter flag,
// and exit after running the benchmarks.
void RunSpecifiedBenchmarks();
void RunSpecifiedBenchmarks(const BenchmarkReporter* reporter);

// If this routine is called, peak memory allocation past this point in the
// benchmark is reported at the end of the benchmark report line. (It is
// computed by running the benchmark once with a single iteration and a memory
// tracer.)
// TODO(dominic)
// void MemoryUsage();

namespace internal {
class Benchmark;
class BenchmarkImp;

template <class T> struct Voider {
  typedef void type;
};

template <class T, class = void>
struct EnableIfString {};

template <class T>
struct EnableIfString<T, typename Voider<typename T::basic_string>::type> {
  typedef int type;
};

} // end namespace internal

// State is passed to a running Benchmark and contains state for the
// benchmark to use.
class State {
 public:
  State(size_t max_iters, bool has_x, int x, bool has_y, int y, int thread_i);

  // Returns true iff the benchmark should continue through another iteration.
  // NOTE: A benchmark may not return from the test until KeepRunning() has
  // returned false.
  bool KeepRunning() {
    if (BENCHMARK_BUILTIN_EXPECT(!started_, false)) {
      ResumeTiming();
      started_ = true;
    }
    bool const res = total_iterations_++ < max_iterations;
    if (BENCHMARK_BUILTIN_EXPECT(!res, false)) {
      assert(started_);
      PauseTiming();
      // Total iterations now is one greater than max iterations. Fix this.
      total_iterations_ = max_iterations;
    }
    return res;
  }

  // REQUIRES: timer is running
  // Stop the benchmark timer. If not called, the timer will be
  // automatically stopped after KeepRunning() returns false for the first time.
  //
  // For threaded benchmarks the PauseTiming() function acts
  // like a barrier. I.e., the ith call by a particular thread to this
  // function will block until all threads have made their ith call.
  // The timer will stop when the last thread has called this function.
  //
  // NOTE: PauseTiming()/ResumeTiming() are relatively
  // heavyweight, and so their use should generally be avoided
  // within each benchmark iteration, if possible.
  void PauseTiming();

  // REQUIRES: timer is not running
  // Start the benchmark timer. The timer is NOT running on entrance to the
  // benchmark function. It begins running after the first call to KeepRunning().
  //
  // For threaded benchmarks the ResumeTiming() function acts
  // like a barrier. I.e., the ith call by a particular thread to this
  // function will block until all threads have made their ith call.
  // The timer will start when the last thread has called this function.
  //
  // NOTE: PauseTiming()/ResumeTiming() are relatively
  // heavyweight, and so their use should generally be avoided
  // within each benchmark iteration, if possible.
  void ResumeTiming();

  // If a particular benchmark is I/O bound, or if for some reason CPU
  // timings are not representative, call this method from within the
  // benchmark routine. If called, the elapsed time will be used to
  // control how many iterations are run, and in the printing of
  // items/second or MB/second values. If not called, the cpu time
  // used by the benchmark will be used.
  void UseRealTime();

  // Set the number of bytes processed by the current benchmark
  // execution. This routine is typically called once at the end of a
  // throughput oriented benchmark. If this routine is called with a
  // value > 0, the report is printed in MB/sec instead of nanoseconds
  // per iteration.
  //
  // REQUIRES: a benchmark has exited its KeepRunning loop.
  BENCHMARK_ALWAYS_INLINE
  void SetBytesProcessed(size_t bytes) {
    bytes_processed_ = bytes;
  }

  BENCHMARK_ALWAYS_INLINE
  size_t bytes_processed() const {
    return bytes_processed_;
  }

  // If this routine is called with items > 0, then an items/s
  // label is printed on the benchmark report line for the currently
  // executing benchmark. It is typically called at the end of a processing
  // benchmark where a processing items/second output is desired.
  //
  // REQUIRES: a benchmark has exited its KeepRunning loop.
  BENCHMARK_ALWAYS_INLINE
  void SetItemsProcessed(size_t items) {
    items_processed_ = items;
  }

  BENCHMARK_ALWAYS_INLINE
  size_t items_processed() const {
    return items_processed_;
  }

  // If this routine is called, the specified label is printed at the
  // end of the benchmark report line for the currently executing
  // benchmark. Example:
  //  static void BM_Compress(int iters) {
  //    ...
  //    double compress = input_size / output_size;
  //    benchmark::SetLabel(StringPrintf("compress:%.1f%%", 100.0*compress));
  //  }
  // Produces output that looks like:
  //  BM_Compress   50   50   14115038  compress:27.3%
  //
  // REQUIRES: a benchmark has exited its KeepRunning loop.
  void SetLabel(const char* label);

  // Allow the use of std::string without actually including <string>.
  // This function does not participate in overload resolution unless StringType
  // has the nested typename `basic_string`. This typename should be provided
  // as an injected class name in the case of std::string.
  template <class StringType>
  void SetLabel(StringType const & str,
                typename internal::EnableIfString<StringType>::type = 1) {
    this->SetLabel(str.c_str());
  }

  // Range arguments for this run. CHECKs if the argument has been set.
  BENCHMARK_ALWAYS_INLINE
  int range_x() const {
    assert(has_range_x_);
    ((void)has_range_x_); // Prevent unused warning.
    return range_x_;
  }

  BENCHMARK_ALWAYS_INLINE
  int range_y() const {
    assert(has_range_y_);
    ((void)has_range_y_); // Prevent unused warning.
    return range_y_;
  }

  BENCHMARK_ALWAYS_INLINE
  size_t iterations() const { return total_iterations_; }

 private:
  bool started_;
  size_t total_iterations_;

  bool has_range_x_;
  int range_x_;

  bool has_range_y_;
  int range_y_;

  size_t bytes_processed_;
  size_t items_processed_;

 public:
  const int thread_index;
  const size_t max_iterations;

 private:
  BENCHMARK_DISALLOW_COPY_AND_ASSIGN(State);
};
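As an illustrative aside (not part of this diff), the following sketch shows how the State accessors above might be combined in a throughput-oriented benchmark. The BM_Checksum name and the buffer contents are invented for the example; it assumes the public header is included as "benchmark/benchmark.h".

#include "benchmark/benchmark.h"

#include <cstdint>
#include <numeric>
#include <string>
#include <vector>

// Hypothetical benchmark: sums a buffer of state.range_x() bytes per iteration.
static void BM_Checksum(benchmark::State& state) {
  std::vector<uint8_t> buf(state.range_x(), 1);
  uint64_t sum = 0;
  while (state.KeepRunning())
    sum += std::accumulate(buf.begin(), buf.end(), uint64_t(0));
  // One buffer per iteration: report MB/s and items/s, and attach a label
  // describing the buffer size used for this run.
  state.SetBytesProcessed(state.iterations() * buf.size());
  state.SetItemsProcessed(state.iterations());
  state.SetLabel(std::string("buf:") + std::to_string(buf.size()));
}
BENCHMARK(BM_Checksum)->Range(8, 8 << 10);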
// Interface for custom benchmark result printers.
// By default, benchmark reports are printed to stdout. However an application
// can control the destination of the reports by calling
// RunSpecifiedBenchmarks and passing it a custom reporter object.
// The reporter object must implement the following interface.
class BenchmarkReporter {
 public:
  struct Context {
    int num_cpus;
    double mhz_per_cpu;
    bool cpu_scaling_enabled;

    // The number of chars in the longest benchmark name.
    size_t name_field_width;
  };

  struct Run {
    Run() :
      iterations(1),
      real_accumulated_time(0),
      cpu_accumulated_time(0),
      bytes_per_second(0),
      items_per_second(0),
      max_heapbytes_used(0) {}

    std::string benchmark_name;
    std::string report_label; // Empty if not set by benchmark.
    size_t iterations;
    double real_accumulated_time;
    double cpu_accumulated_time;

    // Zero if not set by benchmark.
    double bytes_per_second;
    double items_per_second;

    // This is set to 0.0 if memory tracing is not enabled.
    double max_heapbytes_used;
  };

  // Called once for every suite of benchmarks run.
  // The parameter "context" contains information that the
  // reporter may wish to use when generating its report, for example the
  // platform under which the benchmarks are running. The benchmark run is
  // never started if this function returns false, allowing the reporter
  // to skip runs based on the context information.
  virtual bool ReportContext(const Context& context) const = 0;

  // Called once for each group of benchmark runs, gives information about
  // cpu-time and heap memory usage during the benchmark run.
  // Note that all the grouped benchmark runs should refer to the same
  // benchmark, thus have the same name.
  virtual void ReportRuns(const std::vector<Run>& report) const = 0;

  virtual ~BenchmarkReporter();
};
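To show how the interface above might be consumed, here is a minimal sketch of a custom reporter handed to RunSpecifiedBenchmarks(). It is not part of the diff: the CsvReporter name and its output format are invented for illustration.

#include "benchmark/benchmark.h"

#include <cstdio>
#include <vector>

// Hypothetical reporter that prints one comma-separated line per run.
class CsvReporter : public benchmark::BenchmarkReporter {
 public:
  virtual bool ReportContext(const Context& context) const {
    std::printf("name,iterations,real_time,cpu_time\n");
    return true;  // returning false would skip the benchmark runs entirely
  }
  virtual void ReportRuns(const std::vector<Run>& report) const {
    for (size_t i = 0; i < report.size(); ++i) {
      const Run& run = report[i];
      std::printf("%s,%zu,%f,%f\n", run.benchmark_name.c_str(), run.iterations,
                  run.real_accumulated_time, run.cpu_accumulated_time);
    }
  }
};

int main(int argc, const char** argv) {
  benchmark::Initialize(&argc, argv);
  CsvReporter reporter;
  benchmark::RunSpecifiedBenchmarks(&reporter);
  return 0;
}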
namespace internal {

typedef void(Function)(State&);

// ------------------------------------------------------
// Benchmark registration object. The BENCHMARK() macro expands
// into an internal::Benchmark* object. Various methods can
// be called on this object to change the properties of the benchmark.
// Each method returns "this" so that multiple method calls can be
// chained into one expression.
class Benchmark {
 public:
  Benchmark(const char* name, Function* f);

  ~Benchmark();

  // Note: the following methods all return "this" so that multiple
  // method calls can be chained together in one expression.

  // Run this benchmark once with "x" as the extra argument passed
  // to the function.
  // REQUIRES: The function passed to the constructor must accept an arg1.
  Benchmark* Arg(int x);

  // Run this benchmark once for a number of values picked from the
  // range [start..limit]. (start and limit are always picked.)
  // REQUIRES: The function passed to the constructor must accept an arg1.
  Benchmark* Range(int start, int limit);

  // Run this benchmark once for every value in the range [start..limit].
  // REQUIRES: The function passed to the constructor must accept an arg1.
  Benchmark* DenseRange(int start, int limit);

  // Run this benchmark once with "x,y" as the extra arguments passed
  // to the function.
  // REQUIRES: The function passed to the constructor must accept arg1,arg2.
  Benchmark* ArgPair(int x, int y);

  // Pick a set of values A from the range [lo1..hi1] and a set
  // of values B from the range [lo2..hi2]. Run the benchmark for
  // every pair of values in the cartesian product of A and B
  // (i.e., for all combinations of the values in A and B).
  // REQUIRES: The function passed to the constructor must accept arg1,arg2.
  Benchmark* RangePair(int lo1, int hi1, int lo2, int hi2);

  // Pass this benchmark object to *func, which can customize
  // the benchmark by calling various methods like Arg, ArgPair,
  // Threads, etc.
  Benchmark* Apply(void (*func)(Benchmark* benchmark));

  // Support for running multiple copies of the same benchmark concurrently
  // in multiple threads. This may be useful when measuring the scaling
  // of some piece of code.

  // Run one instance of this benchmark concurrently in t threads.
  Benchmark* Threads(int t);

  // Pick a set of values T from [min_threads,max_threads].
  // min_threads and max_threads are always included in T. Run this
  // benchmark once for each value in T. The benchmark run for a
  // particular value t consists of t threads running the benchmark
  // function concurrently. For example, consider:
  //    BENCHMARK(Foo)->ThreadRange(1,16);
  // This will run the following benchmarks:
  //    Foo in 1 thread
  //    Foo in 2 threads
  //    Foo in 4 threads
  //    Foo in 8 threads
  //    Foo in 16 threads
  Benchmark* ThreadRange(int min_threads, int max_threads);

  // Equivalent to ThreadRange(NumCPUs(), NumCPUs())
  Benchmark* ThreadPerCpu();

  // Used inside the benchmark implementation
  struct Instance;

 private:
  BenchmarkImp* imp_;
  BENCHMARK_DISALLOW_COPY_AND_ASSIGN(Benchmark);
};

// ------------------------------------------------------
// Internal implementation details follow; please ignore

// Simple reporter that outputs benchmark data to the console. This is the
// default reporter used by RunSpecifiedBenchmarks().
class ConsoleReporter : public BenchmarkReporter {
 public:
  virtual bool ReportContext(const Context& context) const;
  virtual void ReportRuns(const std::vector<Run>& reports) const;
 private:
  virtual void PrintRunData(const Run& report) const;
  // TODO(ericwf): Find a better way to share this information.
  mutable size_t name_field_width_;
};

} // end namespace internal
} // end namespace benchmark


// ------------------------------------------------------
// Macro to register benchmarks

// Helpers for generating unique variable names
#define BENCHMARK_CONCAT(a, b, c) BENCHMARK_CONCAT2(a, b, c)
#define BENCHMARK_CONCAT2(a, b, c) a##b##c

#define BENCHMARK(n) \
    static ::benchmark::internal::Benchmark* BENCHMARK_CONCAT( \
        __benchmark_, n, __LINE__) BENCHMARK_UNUSED = \
            (new ::benchmark::internal::Benchmark(#n, n))

// Old-style macros
#define BENCHMARK_WITH_ARG(n, a) BENCHMARK(n)->Arg((a))
#define BENCHMARK_WITH_ARG2(n, a1, a2) BENCHMARK(n)->ArgPair((a1), (a2))
#define BENCHMARK_RANGE(n, lo, hi) BENCHMARK(n)->Range((lo), (hi))
#define BENCHMARK_RANGE2(n, l1, h1, l2, h2) \
    BENCHMARK(n)->RangePair((l1), (h1), (l2), (h2))

// This will register a benchmark for a templatized function. For example:
//
// template<int arg>
// void BM_Foo(int iters);
//
// BENCHMARK_TEMPLATE(BM_Foo, 1);
//
// will register BM_Foo<1> as a benchmark.
#define BENCHMARK_TEMPLATE(n, a) \
    static ::benchmark::internal::Benchmark* BENCHMARK_CONCAT( \
        __benchmark_, n, __LINE__) BENCHMARK_UNUSED = \
            (new ::benchmark::internal::Benchmark(#n "<" #a ">", n<a>))

#define BENCHMARK_TEMPLATE2(n, a, b) \
    static ::benchmark::internal::Benchmark* BENCHMARK_CONCAT( \
        __benchmark_, n, __LINE__) BENCHMARK_UNUSED = \
            (new ::benchmark::internal::Benchmark(#n "<" #a "," #b ">", n<a, b>))

// Helper macro to create a main routine in a test that runs the benchmarks
#define BENCHMARK_MAIN() \
  int main(int argc, const char** argv) { \
    ::benchmark::Initialize(&argc, argv); \
    ::benchmark::RunSpecifiedBenchmarks(); \
  }

#endif  // BENCHMARK_BENCHMARK_H_
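Putting the registration macros above together, a complete user program might look like the following sketch. It is illustrative only: the BM_VectorPushBack benchmark is invented for the example, and it assumes the headers are included as "benchmark/benchmark.h".

#include "benchmark/benchmark.h"

#include <vector>

// Hypothetical benchmark exercising the chaining API: one run is generated
// for each registered argument and thread count.
static void BM_VectorPushBack(benchmark::State& state) {
  while (state.KeepRunning()) {
    std::vector<int> v;
    for (int i = 0; i < state.range_x(); ++i)
      v.push_back(i);
  }
  state.SetItemsProcessed(state.iterations() * state.range_x());
}
BENCHMARK(BM_VectorPushBack)->Arg(64)->Arg(4 << 10)->ThreadRange(1, 4);

// Expands to a main() that calls Initialize() and RunSpecifiedBenchmarks().
BENCHMARK_MAIN()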
include/benchmark/benchmark_api.h (new file, 465 lines)
@@ -0,0 +1,465 @@
|
||||
// Support for registering benchmarks for functions.
|
||||
|
||||
/* Example usage:
|
||||
// Define a function that executes the code to be measured a
|
||||
// specified number of times:
|
||||
static void BM_StringCreation(benchmark::State& state) {
|
||||
while (state.KeepRunning())
|
||||
std::string empty_string;
|
||||
}
|
||||
|
||||
// Register the function as a benchmark
|
||||
BENCHMARK(BM_StringCreation);
|
||||
|
||||
// Define another benchmark
|
||||
static void BM_StringCopy(benchmark::State& state) {
|
||||
std::string x = "hello";
|
||||
while (state.KeepRunning())
|
||||
std::string copy(x);
|
||||
}
|
||||
BENCHMARK(BM_StringCopy);
|
||||
|
||||
// Augment the main() program to invoke benchmarks if specified
|
||||
// via the --benchmark_filter command line flag. E.g.,
|
||||
// my_unittest --benchmark_filter=all
|
||||
// my_unittest --benchmark_filter=BM_StringCreation
|
||||
// my_unittest --benchmark_filter=String
|
||||
// my_unittest --benchmark_filter='Copy|Creation'
|
||||
int main(int argc, char** argv) {
|
||||
benchmark::Initialize(&argc, argv);
|
||||
benchmark::RunSpecifiedBenchmarks();
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Sometimes a family of microbenchmarks can be implemented with
|
||||
// just one routine that takes an extra argument to specify which
|
||||
// one of the family of benchmarks to run. For example, the following
|
||||
// code defines a family of microbenchmarks for measuring the speed
|
||||
// of memcpy() calls of different lengths:
|
||||
|
||||
static void BM_memcpy(benchmark::State& state) {
|
||||
char* src = new char[state.range_x()]; char* dst = new char[state.range_x()];
|
||||
memset(src, 'x', state.range_x());
|
||||
while (state.KeepRunning())
|
||||
memcpy(dst, src, state.range_x());
|
||||
state.SetBytesProcessed(static_cast<int64_t>(state.iterations()) * state.range_x());
|
||||
delete[] src; delete[] dst;
|
||||
}
|
||||
BENCHMARK(BM_memcpy)->Arg(8)->Arg(64)->Arg(512)->Arg(1<<10)->Arg(8<<10);
|
||||
|
||||
// The preceding code is quite repetitive, and can be replaced with the
|
||||
// following short-hand. The following invocation will pick a few
|
||||
// appropriate arguments in the specified range and will generate a
|
||||
// microbenchmark for each such argument.
|
||||
BENCHMARK(BM_memcpy)->Range(8, 8<<10);
|
||||
|
||||
// You might have a microbenchmark that depends on two inputs. For
|
||||
// example, the following code defines a family of microbenchmarks for
|
||||
// measuring the speed of set insertion.
|
||||
static void BM_SetInsert(benchmark::State& state) {
|
||||
while (state.KeepRunning()) {
|
||||
state.PauseTiming();
|
||||
set<int> data = ConstructRandomSet(state.range_x());
|
||||
state.ResumeTiming();
|
||||
for (int j = 0; j < state.range_y(); ++j)
|
||||
data.insert(RandomNumber());
|
||||
}
|
||||
}
|
||||
BENCHMARK(BM_SetInsert)
|
||||
->ArgPair(1<<10, 1)
|
||||
->ArgPair(1<<10, 8)
|
||||
->ArgPair(1<<10, 64)
|
||||
->ArgPair(1<<10, 512)
|
||||
->ArgPair(8<<10, 1)
|
||||
->ArgPair(8<<10, 8)
|
||||
->ArgPair(8<<10, 64)
|
||||
->ArgPair(8<<10, 512);
|
||||
|
||||
// The preceding code is quite repetitive, and can be replaced with
|
||||
// the following short-hand. The following macro will pick a few
|
||||
// appropriate arguments in the product of the two specified ranges
|
||||
// and will generate a microbenchmark for each such pair.
|
||||
BENCHMARK(BM_SetInsert)->RangePair(1<<10, 8<<10, 1, 512);
|
||||
|
||||
// For more complex patterns of inputs, passing a custom function
|
||||
// to Apply allows programmatic specification of an
|
||||
// arbitrary set of arguments to run the microbenchmark on.
|
||||
// The following example enumerates a dense range on
|
||||
// one parameter, and a sparse range on the second.
|
||||
static benchmark::internal::Benchmark* CustomArguments(
|
||||
benchmark::internal::Benchmark* b) {
|
||||
for (int i = 0; i <= 10; ++i)
|
||||
for (int j = 32; j <= 1024*1024; j *= 8)
|
||||
b = b->ArgPair(i, j);
|
||||
return b;
|
||||
}
|
||||
BENCHMARK(BM_SetInsert)->Apply(CustomArguments);
|
||||
|
||||
// Templated microbenchmarks work the same way:
|
||||
// Produce then consume 'size' messages 'iters' times
|
||||
// Measures throughput in the absence of multiprogramming.
|
||||
template <class Q> void BM_Sequential(benchmark::State& state) {
|
||||
Q q;
|
||||
typename Q::value_type v;
|
||||
while (state.KeepRunning()) {
|
||||
for (int i = state.range_x(); i--; )
|
||||
q.push(v);
|
||||
for (int e = state.range_x(); e--; )
|
||||
q.Wait(&v);
|
||||
}
|
||||
// actually messages, not bytes:
|
||||
state.SetBytesProcessed(
|
||||
static_cast<int64_t>(state.iterations())*state.range_x());
|
||||
}
|
||||
BENCHMARK_TEMPLATE(BM_Sequential, WaitQueue<int>)->Range(1<<0, 1<<10);
|
||||
|
||||
In a multithreaded test, it is guaranteed that none of the threads will start
|
||||
until all have called KeepRunning, and all will have finished before KeepRunning
|
||||
returns false. As such, any global setup or teardown you want to do can be
|
||||
wrapped in a check against the thread index:
|
||||
|
||||
static void BM_MultiThreaded(benchmark::State& state) {
|
||||
if (state.thread_index == 0) {
|
||||
// Setup code here.
|
||||
}
|
||||
while (state.KeepRunning()) {
|
||||
// Run the test as normal.
|
||||
}
|
||||
if (state.thread_index == 0) {
|
||||
// Teardown code here.
|
||||
}
|
||||
}
|
||||
BENCHMARK(BM_MultiThreaded)->Threads(4);
|
||||
*/
|
||||
|
||||
#ifndef BENCHMARK_BENCHMARK_API_H_
|
||||
#define BENCHMARK_BENCHMARK_API_H_
|
||||
|
||||
#include <assert.h>
|
||||
#include <stddef.h>
|
||||
#include <stdint.h>
|
||||
|
||||
#include "macros.h"
|
||||
|
||||
namespace benchmark {
|
||||
class BenchmarkReporter;
|
||||
|
||||
void Initialize(int* argc, const char** argv);
|
||||
|
||||
// Otherwise, run all benchmarks specified by the --benchmark_filter flag,
|
||||
// and exit after running the benchmarks.
|
||||
void RunSpecifiedBenchmarks();
|
||||
void RunSpecifiedBenchmarks(const BenchmarkReporter* reporter);
|
||||
|
||||
// If this routine is called, peak memory allocation past this point in the
|
||||
// benchmark is reported at the end of the benchmark report line. (It is
|
||||
// computed by running the benchmark once with a single iteration and a memory
|
||||
// tracer.)
|
||||
// TODO(dominic)
|
||||
// void MemoryUsage();
|
||||
|
||||
namespace internal {
|
||||
class Benchmark;
|
||||
class BenchmarkImp;
|
||||
|
||||
template <class T> struct Voider {
|
||||
typedef void type;
|
||||
};
|
||||
|
||||
template <class T, class = void>
|
||||
struct EnableIfString {};
|
||||
|
||||
template <class T>
|
||||
struct EnableIfString<T, typename Voider<typename T::basic_string>::type> {
|
||||
typedef int type;
|
||||
};
|
||||
|
||||
} // end namespace internal
|
||||
|
||||
// State is passed to a running Benchmark and contains state for the
|
||||
// benchmark to use.
|
||||
class State {
|
||||
public:
|
||||
State(size_t max_iters, bool has_x, int x, bool has_y, int y, int thread_i);
|
||||
|
||||
// Returns true iff the benchmark should continue through another iteration.
|
||||
// NOTE: A benchmark may not return from the test until KeepRunning() has
|
||||
// returned false.
|
||||
bool KeepRunning() {
|
||||
if (BENCHMARK_BUILTIN_EXPECT(!started_, false)) {
|
||||
ResumeTiming();
|
||||
started_ = true;
|
||||
}
|
||||
bool const res = total_iterations_++ < max_iterations;
|
||||
if (BENCHMARK_BUILTIN_EXPECT(!res, false)) {
|
||||
assert(started_);
|
||||
PauseTiming();
|
||||
// Total iterations now is one greater than max iterations. Fix this.
|
||||
total_iterations_ = max_iterations;
|
||||
}
|
||||
return res;
|
||||
}
|
||||
|
||||
// REQUIRES: timer is running
|
||||
// Stop the benchmark timer. If not called, the timer will be
|
||||
// automatically stopped after KeepRunning() returns false for the first time.
|
||||
//
|
||||
// For threaded benchmarks the PauseTiming() function acts
|
||||
// like a barrier. I.e., the ith call by a particular thread to this
|
||||
// function will block until all threads have made their ith call.
|
||||
// The timer will stop when the last thread has called this function.
|
||||
//
|
||||
// NOTE: PauseTiming()/ResumeTiming() are relatively
|
||||
// heavyweight, and so their use should generally be avoided
|
||||
// within each benchmark iteration, if possible.
|
||||
void PauseTiming();
|
||||
|
||||
// REQUIRES: timer is not running
|
||||
// Start the benchmark timer. The timer is NOT running on entrance to the
|
||||
// benchmark function. It begins running after the first call to KeepRunning()
|
||||
//
|
||||
// For threaded benchmarks the ResumeTiming() function acts
|
||||
// like a barrier. I.e., the ith call by a particular thread to this
|
||||
// function will block until all threads have made their ith call.
|
||||
// The timer will start when the last thread has called this function.
|
||||
//
|
||||
// NOTE: PauseTiming()/ResumeTiming() are relatively
|
||||
// heavyweight, and so their use should generally be avoided
|
||||
// within each benchmark iteration, if possible.
|
||||
void ResumeTiming();
|
||||
|
||||
// If a particular benchmark is I/O bound, or if for some reason CPU
|
||||
// timings are not representative, call this method from within the
|
||||
// benchmark routine. If called, the elapsed time will be used to
|
||||
// control how many iterations are run, and in the printing of
|
||||
// items/second or MB/seconds values. If not called, the cpu time
|
||||
// used by the benchmark will be used.
|
||||
void UseRealTime();
|
||||
|
||||
// Set the number of bytes processed by the current benchmark
|
||||
// execution. This routine is typically called once at the end of a
|
||||
// throughput oriented benchmark. If this routine is called with a
|
||||
// value > 0, the report is printed in MB/sec instead of nanoseconds
|
||||
// per iteration.
|
||||
//
|
||||
// REQUIRES: a benchmark has exited its KeepRunning loop.
|
||||
BENCHMARK_ALWAYS_INLINE
|
||||
void SetBytesProcessed(size_t bytes) {
|
||||
bytes_processed_ = bytes;
|
||||
}
|
||||
|
||||
BENCHMARK_ALWAYS_INLINE
|
||||
size_t bytes_processed() const {
|
||||
return bytes_processed_;
|
||||
}
|
||||
|
||||
// If this routine is called with items > 0, then an items/s
|
||||
// label is printed on the benchmark report line for the currently
|
||||
// executing benchmark. It is typically called at the end of a processing
|
||||
// benchmark where a processing items/second output is desired.
|
||||
//
|
||||
// REQUIRES: a benchmark has exited its KeepRunning loop.
|
||||
BENCHMARK_ALWAYS_INLINE
|
||||
void SetItemsProcessed(size_t items) {
|
||||
items_processed_ = items;
|
||||
}
|
||||
|
||||
BENCHMARK_ALWAYS_INLINE
|
||||
size_t items_processed() const {
|
||||
return items_processed_;
|
||||
}
|
||||
|
||||
// If this routine is called, the specified label is printed at the
|
||||
// end of the benchmark report line for the currently executing
|
||||
// benchmark. Example:
|
||||
// static void BM_Compress(int iters) {
|
||||
// ...
|
||||
// double compress = input_size / output_size;
|
||||
// benchmark::SetLabel(StringPrintf("compress:%.1f%%", 100.0*compression));
|
||||
// }
|
||||
// Produces output that looks like:
|
||||
// BM_Compress 50 50 14115038 compress:27.3%
|
||||
//
|
||||
// REQUIRES: a benchmark has exited its KeepRunning loop.
|
||||
void SetLabel(const char* label);
|
||||
|
||||
// Allow the use of std::string without actually including <string>.
|
||||
// This function does not participate in overload resolution unless StringType
|
||||
// has the nested typename `basic_string`. This typename should be provided
|
||||
// as an injected class name in the case of std::string.
|
||||
template <class StringType>
|
||||
void SetLabel(StringType const & str,
|
||||
typename internal::EnableIfString<StringType>::type = 1) {
|
||||
this->SetLabel(str.c_str());
|
||||
}
|
||||
|
||||
// Range arguments for this run. CHECKs if the argument has been set.
|
||||
BENCHMARK_ALWAYS_INLINE
|
||||
int range_x() const {
|
||||
assert(has_range_x_);
|
||||
((void)has_range_x_); // Prevent unused warning.
|
||||
return range_x_;
|
||||
}
|
||||
|
||||
BENCHMARK_ALWAYS_INLINE
|
||||
int range_y() const {
|
||||
assert(has_range_y_);
|
||||
((void)has_range_y_); // Prevent unused warning.
|
||||
return range_y_;
|
||||
}
|
||||
|
||||
BENCHMARK_ALWAYS_INLINE
|
||||
size_t iterations() const { return total_iterations_; }
|
||||
|
||||
private:
|
||||
bool started_;
|
||||
size_t total_iterations_;
|
||||
|
||||
bool has_range_x_;
|
||||
int range_x_;
|
||||
|
||||
bool has_range_y_;
|
||||
int range_y_;
|
||||
|
||||
size_t bytes_processed_;
|
||||
size_t items_processed_;
|
||||
|
||||
public:
|
||||
const int thread_index;
|
||||
const size_t max_iterations;
|
||||
|
||||
private:
|
||||
BENCHMARK_DISALLOW_COPY_AND_ASSIGN(State);
|
||||
};
|
||||
|
||||
namespace internal {
|
||||
|
||||
typedef void(Function)(State&);
|
||||
|
||||
// ------------------------------------------------------
|
||||
// Benchmark registration object. The BENCHMARK() macro expands
|
||||
// into an internal::Benchmark* object. Various methods can
|
||||
// be called on this object to change the properties of the benchmark.
|
||||
// Each method returns "this" so that multiple method calls can
|
||||
// be chained into one expression.
|
||||
class Benchmark {
|
||||
public:
|
||||
Benchmark(const char* name, Function* f);
|
||||
|
||||
~Benchmark();
|
||||
|
||||
// Note: the following methods all return "this" so that multiple
|
||||
// method calls can be chained together in one expression.
|
||||
|
||||
// Run this benchmark once with "x" as the extra argument passed
|
||||
// to the function.
|
||||
// REQUIRES: The function passed to the constructor must accept an arg1.
|
||||
Benchmark* Arg(int x);
|
||||
|
||||
// Run this benchmark once for a number of values picked from the
|
||||
// range [start..limit]. (start and limit are always picked.)
|
||||
// REQUIRES: The function passed to the constructor must accept an arg1.
|
||||
Benchmark* Range(int start, int limit);
|
||||
|
||||
// Run this benchmark once for every value in the range [start..limit]
|
||||
// REQUIRES: The function passed to the constructor must accept an arg1.
|
||||
Benchmark* DenseRange(int start, int limit);
|
||||
|
||||
// Run this benchmark once with "x,y" as the extra arguments passed
|
||||
// to the function.
|
||||
// REQUIRES: The function passed to the constructor must accept arg1,arg2.
|
||||
Benchmark* ArgPair(int x, int y);
|
||||
|
||||
// Pick a set of values A from the range [lo1..hi1] and a set
|
||||
// of values B from the range [lo2..hi2]. Run the benchmark for
|
||||
// every pair of values in the cartesian product of A and B
|
||||
// (i.e., for all combinations of the values in A and B).
|
||||
// REQUIRES: The function passed to the constructor must accept arg1,arg2.
|
||||
Benchmark* RangePair(int lo1, int hi1, int lo2, int hi2);
|
||||
|
||||
// Pass this benchmark object to *func, which can customize
|
||||
// the benchmark by calling various methods like Arg, ArgPair,
|
||||
// Threads, etc.
|
||||
Benchmark* Apply(void (*func)(Benchmark* benchmark));
|
||||
|
||||
// Support for running multiple copies of the same benchmark concurrently
|
||||
// in multiple threads. This may be useful when measuring the scaling
|
||||
// of some piece of code.
|
||||
|
||||
// Run one instance of this benchmark concurrently in t threads.
|
||||
Benchmark* Threads(int t);
|
||||
|
||||
// Pick a set of values T from [min_threads,max_threads].
|
||||
// min_threads and max_threads are always included in T. Run this
|
||||
// benchmark once for each value in T. The benchmark run for a
|
||||
// particular value t consists of t threads running the benchmark
|
||||
// function concurrently. For example, consider:
|
||||
// BENCHMARK(Foo)->ThreadRange(1,16);
|
||||
// This will run the following benchmarks:
|
||||
// Foo in 1 thread
|
||||
// Foo in 2 threads
|
||||
// Foo in 4 threads
|
||||
// Foo in 8 threads
|
||||
// Foo in 16 threads
|
||||
Benchmark* ThreadRange(int min_threads, int max_threads);
|
||||
|
||||
// Equivalent to ThreadRange(NumCPUs(), NumCPUs())
|
||||
Benchmark* ThreadPerCpu();
|
||||
|
||||
// Used inside the benchmark implementation
|
||||
struct Instance;
|
||||
|
||||
private:
|
||||
BenchmarkImp* imp_;
|
||||
BENCHMARK_DISALLOW_COPY_AND_ASSIGN(Benchmark);
|
||||
};
|
||||
|
||||
} // end namespace internal
|
||||
} // end namespace benchmark
|
||||
|
||||
|
||||
// ------------------------------------------------------
|
||||
// Macro to register benchmarks
|
||||
|
||||
// Helpers for generating unique variable names
|
||||
#define BENCHMARK_CONCAT(a, b, c) BENCHMARK_CONCAT2(a, b, c)
|
||||
#define BENCHMARK_CONCAT2(a, b, c) a##b##c
|
||||
|
||||
#define BENCHMARK(n) \
|
||||
static ::benchmark::internal::Benchmark* BENCHMARK_CONCAT( \
|
||||
__benchmark_, n, __LINE__) BENCHMARK_UNUSED = \
|
||||
(new ::benchmark::internal::Benchmark(#n, n))
|
||||
|
||||
// Old-style macros
|
||||
#define BENCHMARK_WITH_ARG(n, a) BENCHMARK(n)->Arg((a))
|
||||
#define BENCHMARK_WITH_ARG2(n, a1, a2) BENCHMARK(n)->ArgPair((a1), (a2))
|
||||
#define BENCHMARK_RANGE(n, lo, hi) BENCHMARK(n)->Range((lo), (hi))
|
||||
#define BENCHMARK_RANGE2(n, l1, h1, l2, h2) \
|
||||
BENCHMARK(n)->RangePair((l1), (h1), (l2), (h2))
|
||||
|
||||
// This will register a benchmark for a templatized function. For example:
|
||||
//
|
||||
// template<int arg>
|
||||
// void BM_Foo(int iters);
|
||||
//
|
||||
// BENCHMARK_TEMPLATE(BM_Foo, 1);
|
||||
//
|
||||
// will register BM_Foo<1> as a benchmark.
|
||||
#define BENCHMARK_TEMPLATE(n, a) \
|
||||
static ::benchmark::internal::Benchmark* BENCHMARK_CONCAT( \
|
||||
__benchmark_, n, __LINE__) BENCHMARK_UNUSED = \
|
||||
(new ::benchmark::internal::Benchmark(#n "<" #a ">", n<a>))
|
||||
|
||||
#define BENCHMARK_TEMPLATE2(n, a, b) \
|
||||
static ::benchmark::internal::Benchmark* BENCHMARK_CONCAT( \
|
||||
__benchmark_, n, __LINE__) BENCHMARK_UNUSED = \
|
||||
(new ::benchmark::internal::Benchmark(#n "<" #a "," #b ">", n<a, b>))
|
||||
|
||||
// Helper macro to create a main routine in a test that runs the benchmarks
|
||||
#define BENCHMARK_MAIN() \
|
||||
int main(int argc, const char** argv) { \
|
||||
::benchmark::Initialize(&argc, argv); \
|
||||
::benchmark::RunSpecifiedBenchmarks(); \
|
||||
}
|
||||
|
||||
#endif // BENCHMARK_BENCHMARK_API_H_
|
include/benchmark/reporter.h (new file, 94 lines)
@@ -0,0 +1,94 @@
|
||||
// Copyright 2015 Google Inc. All rights reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
#ifndef BENCHMARK_REPORTER_H_
|
||||
#define BENCHMARK_REPORTER_H_
|
||||
|
||||
#include <string>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
#include "benchmark_api.h" // For forward declaration of BenchmarkReporter
|
||||
|
||||
namespace benchmark {
|
||||
|
||||
// Interface for custom benchmark result printers.
|
||||
// By default, benchmark reports are printed to stdout. However an application
|
||||
// can control the destination of the reports by calling
|
||||
// RunSpecifiedBenchmarks and passing it a custom reporter object.
|
||||
// The reporter object must implement the following interface.
|
||||
class BenchmarkReporter {
|
||||
public:
|
||||
struct Context {
|
||||
int num_cpus;
|
||||
double mhz_per_cpu;
|
||||
bool cpu_scaling_enabled;
|
||||
|
||||
// The number of chars in the longest benchmark name.
|
||||
size_t name_field_width;
|
||||
};
|
||||
|
||||
struct Run {
|
||||
Run() :
|
||||
iterations(1),
|
||||
real_accumulated_time(0),
|
||||
cpu_accumulated_time(0),
|
||||
bytes_per_second(0),
|
||||
items_per_second(0),
|
||||
max_heapbytes_used(0) {}
|
||||
|
||||
std::string benchmark_name;
|
||||
std::string report_label; // Empty if not set by benchmark.
|
||||
size_t iterations;
|
||||
double real_accumulated_time;
|
||||
double cpu_accumulated_time;
|
||||
|
||||
// Zero if not set by benchmark.
|
||||
double bytes_per_second;
|
||||
double items_per_second;
|
||||
|
||||
// This is set to 0.0 if memory tracing is not enabled.
|
||||
double max_heapbytes_used;
|
||||
};
|
||||
|
||||
// Called once for every suite of benchmarks run.
|
||||
// The parameter "context" contains information that the
|
||||
// reporter may wish to use when generating its report, for example the
|
||||
// platform under which the benchmarks are running. The benchmark run is
|
||||
// never started if this function returns false, allowing the reporter
|
||||
// to skip runs based on the context information.
|
||||
virtual bool ReportContext(const Context& context) const = 0;
|
||||
|
||||
// Called once for each group of benchmark runs, gives information about
|
||||
// cpu-time and heap memory usage during the benchmark run.
|
||||
// Note that all the grouped benchmark runs should refer to the same
|
||||
// benchmark, thus have the same name.
|
||||
virtual void ReportRuns(const std::vector<Run>& report) const = 0;
|
||||
|
||||
virtual ~BenchmarkReporter();
|
||||
};
|
||||
|
||||
// Simple reporter that outputs benchmark data to the console. This is the
|
||||
// default reporter used by RunSpecifiedBenchmarks().
|
||||
class ConsoleReporter : public BenchmarkReporter {
|
||||
public:
|
||||
virtual bool ReportContext(const Context& context) const;
|
||||
virtual void ReportRuns(const std::vector<Run>& reports) const;
|
||||
private:
|
||||
virtual void PrintRunData(const Run& report) const;
|
||||
// TODO(ericwf): Find a better way to share this information.
|
||||
mutable size_t name_field_width_;
|
||||
};
|
||||
|
||||
} // end namespace benchmark
|
||||
#endif // BENCHMARK_REPORTER_H_
|
src/CMakeLists.txt
@@ -3,7 +3,8 @@ include_directories(${PROJECT_SOURCE_DIR}/src)

# Define the source files
set(SOURCE_FILES "benchmark.cc" "colorprint.cc" "commandlineflags.cc" "log.cc"
                 "sleep.cc" "string_util.cc" "sysinfo.cc" "walltime.cc")
                 "reporter.cc" "sleep.cc" "string_util.cc" "sysinfo.cc"
                 "walltime.cc")
# Determine the correct regular expression engine to use
if(HAVE_STD_REGEX)
  set(RE_FILES "re_std.cc")
src/benchmark.cc
@@ -29,7 +29,6 @@
|
||||
|
||||
#include "check.h"
|
||||
#include "commandlineflags.h"
|
||||
#include "colorprint.h"
|
||||
#include "log.h"
|
||||
#include "mutex.h"
|
||||
#include "re.h"
|
||||
@ -134,61 +133,6 @@ static bool CpuScalingEnabled() {
|
||||
return false;
|
||||
}
|
||||
|
||||
void ComputeStats(const std::vector<BenchmarkReporter::Run>& reports,
|
||||
BenchmarkReporter::Run* mean_data,
|
||||
BenchmarkReporter::Run* stddev_data) {
|
||||
CHECK(reports.size() >= 2) << "Cannot compute stats for less than 2 reports";
|
||||
// Accumulators.
|
||||
Stat1_d real_accumulated_time_stat;
|
||||
Stat1_d cpu_accumulated_time_stat;
|
||||
Stat1_d bytes_per_second_stat;
|
||||
Stat1_d items_per_second_stat;
|
||||
// All repetitions should be run with the same number of iterations so we
|
||||
// can take this information from the first benchmark.
|
||||
std::size_t const run_iterations = reports.front().iterations;
|
||||
|
||||
// Populate the accumulators.
|
||||
for (BenchmarkReporter::Run const& run : reports) {
|
||||
CHECK_EQ(reports[0].benchmark_name, run.benchmark_name);
|
||||
CHECK_EQ(run_iterations, run.iterations);
|
||||
real_accumulated_time_stat +=
|
||||
Stat1_d(run.real_accumulated_time/run.iterations, run.iterations);
|
||||
cpu_accumulated_time_stat +=
|
||||
Stat1_d(run.cpu_accumulated_time/run.iterations, run.iterations);
|
||||
items_per_second_stat += Stat1_d(run.items_per_second, run.iterations);
|
||||
bytes_per_second_stat += Stat1_d(run.bytes_per_second, run.iterations);
|
||||
}
|
||||
|
||||
// Get the data from the accumulator to BenchmarkReporter::Run's.
|
||||
mean_data->benchmark_name = reports[0].benchmark_name + "_mean";
|
||||
mean_data->iterations = run_iterations;
|
||||
mean_data->real_accumulated_time = real_accumulated_time_stat.Mean() *
|
||||
run_iterations;
|
||||
mean_data->cpu_accumulated_time = cpu_accumulated_time_stat.Mean() *
|
||||
run_iterations;
|
||||
mean_data->bytes_per_second = bytes_per_second_stat.Mean();
|
||||
mean_data->items_per_second = items_per_second_stat.Mean();
|
||||
|
||||
// Only add label to mean/stddev if it is same for all runs
|
||||
mean_data->report_label = reports[0].report_label;
|
||||
for (std::size_t i = 1; i < reports.size(); i++) {
|
||||
if (reports[i].report_label != reports[0].report_label) {
|
||||
mean_data->report_label = "";
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
stddev_data->benchmark_name = reports[0].benchmark_name + "_stddev";
|
||||
stddev_data->report_label = mean_data->report_label;
|
||||
stddev_data->iterations = 0;
|
||||
stddev_data->real_accumulated_time =
|
||||
real_accumulated_time_stat.StdDev();
|
||||
stddev_data->cpu_accumulated_time =
|
||||
cpu_accumulated_time_stat.StdDev();
|
||||
stddev_data->bytes_per_second = bytes_per_second_stat.StdDev();
|
||||
stddev_data->items_per_second = items_per_second_stat.StdDev();
|
||||
}
|
||||
|
||||
struct ThreadStats {
|
||||
ThreadStats() : bytes_processed(0), items_processed(0) {}
|
||||
int64_t bytes_processed;
|
||||
@ -816,108 +760,8 @@ void State::SetLabel(const char* label) {
|
||||
*GetReportLabel() = label;
|
||||
}
|
||||
|
||||
BenchmarkReporter::~BenchmarkReporter() {}
|
||||
|
||||
namespace internal {
|
||||
|
||||
bool ConsoleReporter::ReportContext(const Context& context) const {
|
||||
name_field_width_ = context.name_field_width;
|
||||
|
||||
fprintf(stdout,
|
||||
"Run on (%d X %0.0f MHz CPU%s)\n",
|
||||
context.num_cpus,
|
||||
context.mhz_per_cpu,
|
||||
(context.num_cpus > 1) ? "s" : "");
|
||||
|
||||
int remainder_us;
|
||||
std::string walltime_str = walltime::Print(
|
||||
walltime::Now(), "%Y/%m/%d-%H:%M:%S",
|
||||
true, // use local timezone
|
||||
&remainder_us);
|
||||
fprintf(stdout, "%s\n", walltime_str.c_str());
|
||||
|
||||
if (context.cpu_scaling_enabled) {
|
||||
fprintf(stdout, "***WARNING*** CPU scaling is enabled, the benchmark "
|
||||
"timings may be noisy\n");
|
||||
}
|
||||
|
||||
#ifndef NDEBUG
|
||||
fprintf(stdout, "Build Type: DEBUG\n");
|
||||
#endif
|
||||
|
||||
int output_width =
|
||||
fprintf(stdout,
|
||||
"%-*s %10s %10s %10s\n",
|
||||
static_cast<int>(name_field_width_),
|
||||
"Benchmark",
|
||||
"Time(ns)", "CPU(ns)",
|
||||
"Iterations");
|
||||
fprintf(stdout, "%s\n", std::string(output_width - 1, '-').c_str());
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
void ConsoleReporter::ReportRuns(
|
||||
const std::vector<Run>& reports) const {
|
||||
if (reports.empty()) {
|
||||
return;
|
||||
}
|
||||
|
||||
for (Run const& run : reports) {
|
||||
CHECK_EQ(reports[0].benchmark_name, run.benchmark_name);
|
||||
PrintRunData(run);
|
||||
}
|
||||
|
||||
if (reports.size() < 2) {
|
||||
// We don't report aggregated data if there was a single run.
|
||||
return;
|
||||
}
|
||||
|
||||
Run mean_data;
|
||||
Run stddev_data;
|
||||
ComputeStats(reports, &mean_data, &stddev_data);
|
||||
|
||||
// Output using PrintRun.
|
||||
PrintRunData(mean_data);
|
||||
PrintRunData(stddev_data);
|
||||
fprintf(stdout, "\n");
|
||||
}
|
||||
|
||||
void ConsoleReporter::PrintRunData(const Run& result) const {
|
||||
// Format bytes per second
|
||||
std::string rate;
|
||||
if (result.bytes_per_second > 0) {
|
||||
rate = StrCat(" ", HumanReadableNumber(result.bytes_per_second), "B/s");
|
||||
}
|
||||
|
||||
// Format items per second
|
||||
std::string items;
|
||||
if (result.items_per_second > 0) {
|
||||
items = StrCat(" ", HumanReadableNumber(result.items_per_second),
|
||||
" items/s");
|
||||
}
|
||||
|
||||
double const multiplier = 1e9; // nano second multiplier
|
||||
ColorPrintf(COLOR_GREEN, "%-*s ",
|
||||
name_field_width_, result.benchmark_name.c_str());
|
||||
if (result.iterations == 0) {
|
||||
ColorPrintf(COLOR_YELLOW, "%10.0f %10.0f ",
|
||||
result.real_accumulated_time * multiplier,
|
||||
result.cpu_accumulated_time * multiplier);
|
||||
} else {
|
||||
ColorPrintf(COLOR_YELLOW, "%10.0f %10.0f ",
|
||||
(result.real_accumulated_time * multiplier) /
|
||||
(static_cast<double>(result.iterations)),
|
||||
(result.cpu_accumulated_time * multiplier) /
|
||||
(static_cast<double>(result.iterations)));
|
||||
}
|
||||
ColorPrintf(COLOR_CYAN, "%10lld", result.iterations);
|
||||
ColorPrintf(COLOR_DEFAULT, "%*s %*s %s\n",
|
||||
13, rate.c_str(),
|
||||
18, items.c_str(),
|
||||
result.report_label.c_str());
|
||||
}
|
||||
|
||||
void RunMatchingBenchmarks(const std::string& spec,
|
||||
const BenchmarkReporter* reporter) {
|
||||
CHECK(reporter != nullptr);
|
||||
@ -973,7 +817,7 @@ void RunSpecifiedBenchmarks(const BenchmarkReporter* reporter) {
|
||||
std::string spec = FLAGS_benchmark_filter;
|
||||
if (spec.empty() || spec == "all")
|
||||
spec = "."; // Regexp that matches all benchmarks
|
||||
internal::ConsoleReporter default_reporter;
|
||||
ConsoleReporter default_reporter;
|
||||
internal::RunMatchingBenchmarks(spec, reporter ? reporter : &default_reporter);
|
||||
}
|
||||
|
||||
|
src/reporter.cc (new file, 189 lines)
@@ -0,0 +1,189 @@
|
||||
// Copyright 2015 Google Inc. All rights reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "benchmark/reporter.h"
|
||||
|
||||
#include <cstdio>
|
||||
#include <cstdlib>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include "check.h"
|
||||
#include "colorprint.h"
|
||||
#include "stat.h"
|
||||
#include "string_util.h"
|
||||
#include "walltime.h"
|
||||
|
||||
namespace benchmark {
|
||||
namespace {
|
||||
|
||||
void ComputeStats(const std::vector<BenchmarkReporter::Run>& reports,
|
||||
BenchmarkReporter::Run* mean_data,
|
||||
BenchmarkReporter::Run* stddev_data) {
|
||||
CHECK(reports.size() >= 2) << "Cannot compute stats for less than 2 reports";
|
||||
// Accumulators.
|
||||
Stat1_d real_accumulated_time_stat;
|
||||
Stat1_d cpu_accumulated_time_stat;
|
||||
Stat1_d bytes_per_second_stat;
|
||||
Stat1_d items_per_second_stat;
|
||||
// All repetitions should be run with the same number of iterations so we
|
||||
// can take this information from the first benchmark.
|
||||
std::size_t const run_iterations = reports.front().iterations;
|
||||
|
||||
// Populate the accumulators.
|
||||
for (BenchmarkReporter::Run const& run : reports) {
|
||||
CHECK_EQ(reports[0].benchmark_name, run.benchmark_name);
|
||||
CHECK_EQ(run_iterations, run.iterations);
|
||||
real_accumulated_time_stat +=
|
||||
Stat1_d(run.real_accumulated_time/run.iterations, run.iterations);
|
||||
cpu_accumulated_time_stat +=
|
||||
Stat1_d(run.cpu_accumulated_time/run.iterations, run.iterations);
|
||||
items_per_second_stat += Stat1_d(run.items_per_second, run.iterations);
|
||||
bytes_per_second_stat += Stat1_d(run.bytes_per_second, run.iterations);
|
||||
}
|
||||
|
||||
// Get the data from the accumulator to BenchmarkReporter::Run's.
|
||||
mean_data->benchmark_name = reports[0].benchmark_name + "_mean";
|
||||
mean_data->iterations = run_iterations;
|
||||
mean_data->real_accumulated_time = real_accumulated_time_stat.Mean() *
|
||||
run_iterations;
|
||||
mean_data->cpu_accumulated_time = cpu_accumulated_time_stat.Mean() *
|
||||
run_iterations;
|
||||
mean_data->bytes_per_second = bytes_per_second_stat.Mean();
|
||||
mean_data->items_per_second = items_per_second_stat.Mean();
|
||||
|
||||
// Only add label to mean/stddev if it is same for all runs
|
||||
mean_data->report_label = reports[0].report_label;
|
||||
for (std::size_t i = 1; i < reports.size(); i++) {
|
||||
if (reports[i].report_label != reports[0].report_label) {
|
||||
mean_data->report_label = "";
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
stddev_data->benchmark_name = reports[0].benchmark_name + "_stddev";
|
||||
stddev_data->report_label = mean_data->report_label;
|
||||
stddev_data->iterations = 0;
|
||||
stddev_data->real_accumulated_time =
|
||||
real_accumulated_time_stat.StdDev();
|
||||
stddev_data->cpu_accumulated_time =
|
||||
cpu_accumulated_time_stat.StdDev();
|
||||
stddev_data->bytes_per_second = bytes_per_second_stat.StdDev();
|
||||
stddev_data->items_per_second = items_per_second_stat.StdDev();
|
||||
}
|
||||
|
||||
} // end namespace
|
||||
|
||||
|
||||
BenchmarkReporter::~BenchmarkReporter() {}
|
||||
|
||||
bool ConsoleReporter::ReportContext(const Context& context) const {
|
||||
name_field_width_ = context.name_field_width;
|
||||
|
||||
fprintf(stdout,
|
||||
"Run on (%d X %0.0f MHz CPU%s)\n",
|
||||
context.num_cpus,
|
||||
context.mhz_per_cpu,
|
||||
(context.num_cpus > 1) ? "s" : "");
|
||||
|
||||
int remainder_us;
|
||||
std::string walltime_str = walltime::Print(
|
||||
walltime::Now(), "%Y/%m/%d-%H:%M:%S",
|
||||
true, // use local timezone
|
||||
&remainder_us);
|
||||
fprintf(stdout, "%s\n", walltime_str.c_str());
|
||||
|
||||
if (context.cpu_scaling_enabled) {
|
||||
fprintf(stdout, "***WARNING*** CPU scaling is enabled, the benchmark "
|
||||
"timings may be noisy\n");
|
||||
}
|
||||
|
||||
#ifndef NDEBUG
|
||||
fprintf(stdout, "Build Type: DEBUG\n");
|
||||
#endif
|
||||
|
||||
int output_width =
|
||||
fprintf(stdout,
|
||||
"%-*s %10s %10s %10s\n",
|
||||
static_cast<int>(name_field_width_),
|
||||
"Benchmark",
|
||||
"Time(ns)", "CPU(ns)",
|
||||
"Iterations");
|
||||
fprintf(stdout, "%s\n", std::string(output_width - 1, '-').c_str());
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
void ConsoleReporter::ReportRuns(
|
||||
const std::vector<Run>& reports) const {
|
||||
if (reports.empty()) {
|
||||
return;
|
||||
}
|
||||
|
||||
for (Run const& run : reports) {
|
||||
CHECK_EQ(reports[0].benchmark_name, run.benchmark_name);
|
||||
PrintRunData(run);
|
||||
}
|
||||
|
||||
if (reports.size() < 2) {
|
||||
// We don't report aggregated data if there was a single run.
|
||||
return;
|
||||
}
|
||||
|
||||
Run mean_data;
|
||||
Run stddev_data;
|
||||
ComputeStats(reports, &mean_data, &stddev_data);
|
||||
|
||||
// Output using PrintRun.
|
||||
PrintRunData(mean_data);
|
||||
PrintRunData(stddev_data);
|
||||
fprintf(stdout, "\n");
|
||||
}
|
||||
|
||||
void ConsoleReporter::PrintRunData(const Run& result) const {
|
||||
// Format bytes per second
|
||||
std::string rate;
|
||||
if (result.bytes_per_second > 0) {
|
||||
rate = StrCat(" ", HumanReadableNumber(result.bytes_per_second), "B/s");
|
||||
}
|
||||
|
||||
// Format items per second
|
||||
std::string items;
|
||||
if (result.items_per_second > 0) {
|
||||
items = StrCat(" ", HumanReadableNumber(result.items_per_second),
|
||||
" items/s");
|
||||
}
|
||||
|
||||
double const multiplier = 1e9; // nano second multiplier
|
||||
ColorPrintf(COLOR_GREEN, "%-*s ",
|
||||
name_field_width_, result.benchmark_name.c_str());
|
||||
if (result.iterations == 0) {
|
||||
ColorPrintf(COLOR_YELLOW, "%10.0f %10.0f ",
|
||||
result.real_accumulated_time * multiplier,
|
||||
result.cpu_accumulated_time * multiplier);
|
||||
} else {
|
||||
ColorPrintf(COLOR_YELLOW, "%10.0f %10.0f ",
|
||||
(result.real_accumulated_time * multiplier) /
|
||||
(static_cast<double>(result.iterations)),
|
||||
(result.cpu_accumulated_time * multiplier) /
|
||||
(static_cast<double>(result.iterations)));
|
||||
}
|
||||
ColorPrintf(COLOR_CYAN, "%10lld", result.iterations);
|
||||
ColorPrintf(COLOR_DEFAULT, "%*s %*s %s\n",
|
||||
13, rate.c_str(),
|
||||
18, items.c_str(),
|
||||
result.report_label.c_str());
|
||||
}
|
||||
|
||||
} // end namespace benchmark
|
@@ -1,7 +1,7 @@

#include <cstddef>

#include "benchmark/benchmark.h"
#include "benchmark/benchmark_api.h"

#define BASIC_BENCHMARK_TEST(x) \
  BENCHMARK(x)->Arg(8)->Arg(512)->Arg(8192)

@@ -21,7 +21,7 @@ double CalculatePi(int depth) {
  return (pi - 1.0) * 4;
}

class TestReporter : public benchmark::internal::ConsoleReporter {
class TestReporter : public benchmark::ConsoleReporter {
 public:
  virtual bool ReportContext(const Context& context) const {
    return ConsoleReporter::ReportContext(context);
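The TestReporter above only forwards to ConsoleReporter. A test that wants to assert on what was actually run might extend it roughly as sketched below; this is not part of the diff, and the CountingReporter and BM_Noop names are invented for illustration (it assumes the default --benchmark_filter matches the registered benchmark).

#include "benchmark/benchmark.h"

#include <cassert>
#include <vector>

static void BM_Noop(benchmark::State& state) {
  while (state.KeepRunning()) {}
}
BENCHMARK(BM_Noop);

// Hypothetical reporter that records how many runs were reported while still
// printing them through the stock ConsoleReporter.
class CountingReporter : public benchmark::ConsoleReporter {
 public:
  CountingReporter() : run_count_(0) {}
  virtual void ReportRuns(const std::vector<Run>& report) const {
    run_count_ += report.size();
    benchmark::ConsoleReporter::ReportRuns(report);
  }
  size_t run_count() const { return run_count_; }
 private:
  mutable size_t run_count_;  // mutable because ReportRuns() is const
};

int main(int argc, const char** argv) {
  benchmark::Initialize(&argc, argv);
  CountingReporter reporter;
  benchmark::RunSpecifiedBenchmarks(&reporter);
  assert(reporter.run_count() > 0);  // at least one benchmark should have run
  return 0;
}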