From 9d4b719daeda35acf3a3d81b9ac1f38fc13333d1 Mon Sep 17 00:00:00 2001
From: Eric
Date: Tue, 4 Jul 2017 16:31:47 -0600
Subject: [PATCH] Make Benchmark a single header library (but not header-only)
 (#407)

* Make Benchmark a single header library (but not header-only)

This patch refactors benchmark into a single header, to allow for
slightly easier usage.

The initial reason for the header split was to keep C++ library
components from being included by benchmark_api.h, making that part of
the library STL agnostic. However this has since changed and there
seems to be little reason to separate the reporters from the rest of
the library.

* Fix internal_macros.h

* Remove more references to macros.h
---
 include/benchmark/benchmark.h     | 1194 ++++++++++++++++++++++++++++-
 include/benchmark/benchmark_api.h |  938 +---------------------
 include/benchmark/macros.h        |   66 --
 include/benchmark/reporter.h      |  225 +-----
 src/benchmark_api_internal.h      |    2 +-
 src/complexity.cc                 |    2 +-
 src/complexity.h                  |    3 +-
 src/console_reporter.cc           |    2 +-
 src/counter.h                     |    2 +-
 src/csv_reporter.cc               |    2 +-
 src/cycleclock.h                  |    2 +-
 src/internal_macros.h             |    2 +-
 src/json_reporter.cc              |    2 +-
 src/log.h                         |    4 +-
 src/reporter.cc                   |    2 +-
 test/basic_test.cc                |    2 +-
 test/diagnostics_test.cc          |    2 +-
 test/options_test.cc              |    2 +-
 18 files changed, 1234 insertions(+), 1220 deletions(-)
 delete mode 100644 include/benchmark/macros.h

diff --git a/include/benchmark/benchmark.h b/include/benchmark/benchmark.h
index b3b0a8e9..fc448e65 100644
--- a/include/benchmark/benchmark.h
+++ b/include/benchmark/benchmark.h
@@ -11,11 +11,1199 @@
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
+
+// Support for registering benchmarks for functions.
+
+/* Example usage:
+// Define a function that executes the code to be measured a
+// specified number of times:
+static void BM_StringCreation(benchmark::State& state) {
+  while (state.KeepRunning())
+    std::string empty_string;
+}
+
+// Register the function as a benchmark
+BENCHMARK(BM_StringCreation);
+
+// Define another benchmark
+static void BM_StringCopy(benchmark::State& state) {
+  std::string x = "hello";
+  while (state.KeepRunning())
+    std::string copy(x);
+}
+BENCHMARK(BM_StringCopy);
+
+// Augment the main() program to invoke benchmarks if specified
+// via the --benchmarks command line flag.  E.g.,
+//   my_unittest --benchmark_filter=all
+//   my_unittest --benchmark_filter=BM_StringCreation
+//   my_unittest --benchmark_filter=String
+//   my_unittest --benchmark_filter='Copy|Creation'
+int main(int argc, char** argv) {
+  benchmark::Initialize(&argc, argv);
+  benchmark::RunSpecifiedBenchmarks();
+  return 0;
+}
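+
+// Roughly equivalently, the BENCHMARK_MAIN() macro defined near the end of
+// this header expands to a main() like the one above (it additionally
+// returns early on unrecognized arguments). A minimal sketch:
+//   #include <benchmark/benchmark.h>
+//   BENCHMARK(BM_StringCreation);
+//   BENCHMARK_MAIN();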
+
+// Sometimes a family of microbenchmarks can be implemented with
+// just one routine that takes an extra argument to specify which
+// one of the family of benchmarks to run.  For example, the following
+// code defines a family of microbenchmarks for measuring the speed
+// of memcpy() calls of different lengths:
+
+static void BM_memcpy(benchmark::State& state) {
+  char* src = new char[state.range(0)];
+  char* dst = new char[state.range(0)];
+  memset(src, 'x', state.range(0));
+  while (state.KeepRunning())
+    memcpy(dst, src, state.range(0));
+  state.SetBytesProcessed(int64_t(state.iterations()) *
+                          int64_t(state.range(0)));
+  delete[] src;
+  delete[] dst;
+}
+BENCHMARK(BM_memcpy)->Arg(8)->Arg(64)->Arg(512)->Arg(1<<10)->Arg(8<<10);
+
+// The preceding code is quite repetitive, and can be replaced with the
+// following short-hand. The following invocation will pick a few
+// appropriate arguments in the specified range and will generate a
+// microbenchmark for each such argument.
+BENCHMARK(BM_memcpy)->Range(8, 8<<10);
+
+// You might have a microbenchmark that depends on two inputs. For
+// example, the following code defines a family of microbenchmarks for
+// measuring the speed of set insertion.
+static void BM_SetInsert(benchmark::State& state) {
+  while (state.KeepRunning()) {
+    state.PauseTiming();
+    std::set<int> data = ConstructRandomSet(state.range(0));
+    state.ResumeTiming();
+    for (int j = 0; j < state.range(1); ++j)
+      data.insert(RandomNumber());
+  }
+}
+BENCHMARK(BM_SetInsert)
+    ->Args({1<<10, 1})
+    ->Args({1<<10, 8})
+    ->Args({1<<10, 64})
+    ->Args({1<<10, 512})
+    ->Args({8<<10, 1})
+    ->Args({8<<10, 8})
+    ->Args({8<<10, 64})
+    ->Args({8<<10, 512});
+
+// The preceding code is quite repetitive, and can be replaced with
+// the following short-hand. The following macro will pick a few
+// appropriate arguments in the product of the two specified ranges
+// and will generate a microbenchmark for each such pair.
+BENCHMARK(BM_SetInsert)->Ranges({{1<<10, 8<<10}, {1, 512}});
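+
+// The argument names can additionally be shown in the generated benchmark
+// names via ArgNames(), declared below. A minimal sketch (the names
+// "size" and "batch" are illustrative):
+BENCHMARK(BM_SetInsert)
+    ->Ranges({{1<<10, 8<<10}, {1, 512}})
+    ->ArgNames({"size", "batch"});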
+
+// For more complex patterns of inputs, passing a custom function
+// to Apply allows programmatic specification of an
+// arbitrary set of arguments to run the microbenchmark on.
+// The following example enumerates a dense range on
+// one parameter, and a sparse range on the second.
+static void CustomArguments(benchmark::internal::Benchmark* b) {
+  for (int i = 0; i <= 10; ++i)
+    for (int j = 32; j <= 1024*1024; j *= 8)
+      b->Args({i, j});
+}
+BENCHMARK(BM_SetInsert)->Apply(CustomArguments);
+
+// Templated microbenchmarks work the same way:
+// Produce then consume 'size' messages 'iters' times
+// Measures throughput in the absence of multiprogramming.
+template <class Q> int BM_Sequential(benchmark::State& state) {
+  Q q;
+  typename Q::value_type v;
+  while (state.KeepRunning()) {
+    for (int i = state.range(0); i--; )
+      q.push(v);
+    for (int e = state.range(0); e--; )
+      q.Wait(&v);
+  }
+  // actually messages, not bytes:
+  state.SetBytesProcessed(
+      static_cast<int64_t>(state.iterations())*state.range(0));
+}
+BENCHMARK_TEMPLATE(BM_Sequential, WaitQueue<int>)->Range(1<<0, 1<<10);
+
+Use `Benchmark::MinTime(double t)` to set the minimum time used to run the
+benchmark. This option overrides the `benchmark_min_time` flag.
+
+void BM_test(benchmark::State& state) {
+ ... body ...
+}
+BENCHMARK(BM_test)->MinTime(2.0); // Run for at least 2 seconds.
+
+In a multithreaded test, it is guaranteed that none of the threads will start
+until all have called KeepRunning, and all will have finished before KeepRunning
+returns false. As such, any global setup or teardown you want to do can be
+wrapped in a check against the thread index:
+
+static void BM_MultiThreaded(benchmark::State& state) {
+  if (state.thread_index == 0) {
+    // Setup code here.
+  }
+  while (state.KeepRunning()) {
+    // Run the test as normal.
+  }
+  if (state.thread_index == 0) {
+    // Teardown code here.
+  }
+}
+BENCHMARK(BM_MultiThreaded)->Threads(4);
+
+
+If a benchmark runs a few milliseconds it may be hard to visually compare the
+measured times, since the output data is given in nanoseconds by default. In
+order to manually set the time unit, you can specify it manually:
+
+BENCHMARK(BM_test)->Unit(benchmark::kMillisecond);
+*/
+
 #ifndef BENCHMARK_BENCHMARK_H_
 #define BENCHMARK_BENCHMARK_H_
-#include "benchmark_api.h"
-#include "macros.h"
-#include "reporter.h"
+
+#if __cplusplus >= 201103L
+#define BENCHMARK_HAS_CXX11
+#endif
+
+#include <stdint.h>
+
+#include <cassert>
+#include <cstddef>
+#include <iosfwd>
+#include <map>
+#include <set>
+#include <string>
+#include <vector>
+
+#if defined(BENCHMARK_HAS_CXX11)
+#include <initializer_list>
+#include <type_traits>
+#include <utility>
+#endif
+
+#if defined(_MSC_VER)
+#include <intrin.h>  // for _ReadWriteBarrier
+#endif
+
+#ifndef BENCHMARK_HAS_CXX11
+#define BENCHMARK_DISALLOW_COPY_AND_ASSIGN(TypeName) \
+  TypeName(const TypeName&);                         \
+  TypeName& operator=(const TypeName&)
+#else
+#define BENCHMARK_DISALLOW_COPY_AND_ASSIGN(TypeName) \
+  TypeName(const TypeName&) = delete;                \
+  TypeName& operator=(const TypeName&) = delete
+#endif
+
+#if defined(__GNUC__)
+#define BENCHMARK_UNUSED __attribute__((unused))
+#define BENCHMARK_ALWAYS_INLINE __attribute__((always_inline))
+#define BENCHMARK_NOEXCEPT noexcept
+#define BENCHMARK_NOEXCEPT_OP(x) noexcept(x)
+#elif defined(_MSC_VER) && !defined(__clang__)
+#define BENCHMARK_UNUSED
+#define BENCHMARK_ALWAYS_INLINE __forceinline
+#if _MSC_VER >= 1900
+#define BENCHMARK_NOEXCEPT noexcept
+#define BENCHMARK_NOEXCEPT_OP(x) noexcept(x)
+#else
+#define BENCHMARK_NOEXCEPT
+#define BENCHMARK_NOEXCEPT_OP(x)
+#endif
+#define __func__ __FUNCTION__
+#else
+#define BENCHMARK_UNUSED
+#define BENCHMARK_ALWAYS_INLINE
+#define BENCHMARK_NOEXCEPT
+#define BENCHMARK_NOEXCEPT_OP(x)
+#endif
+
+#define BENCHMARK_INTERNAL_TOSTRING2(x) #x
+#define BENCHMARK_INTERNAL_TOSTRING(x) BENCHMARK_INTERNAL_TOSTRING2(x)
+
+#if defined(__GNUC__)
+#define BENCHMARK_BUILTIN_EXPECT(x, y) __builtin_expect(x, y)
+#define BENCHMARK_DEPRECATED_MSG(msg) __attribute__((deprecated(msg)))
+#else
+#define BENCHMARK_BUILTIN_EXPECT(x, y) x
+#define BENCHMARK_DEPRECATED_MSG(msg)
+#define BENCHMARK_WARNING_MSG(msg) __pragma(message(__FILE__ "(" BENCHMARK_INTERNAL_TOSTRING(__LINE__) ") : warning note: " msg))
+#endif
+
+#if defined(__GNUC__) && !defined(__clang__)
+#define BENCHMARK_GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__)
+#endif
+
+
+namespace benchmark {
+class BenchmarkReporter;
+
+void Initialize(int* argc, char** argv);
+
+// Report to stdout all arguments in 'argv' as unrecognized except the first.
+// Returns true if there is at least one unrecognized argument (i.e. 'argc' > 1).
+bool ReportUnrecognizedArguments(int argc, char** argv);
+
+// Generate a list of benchmarks matching the specified --benchmark_filter flag
+// and if --benchmark_list_tests is specified return after printing the name
+// of each matching benchmark. Otherwise run each matching benchmark and
+// report the results.
+//
+// The second and third overload use the specified 'console_reporter' and
+// 'file_reporter' respectively. 'file_reporter' will write to the file
+// specified by '--benchmark_output'. If '--benchmark_output' is not given the
+// 'file_reporter' is ignored.
+//
+// RETURNS: The number of matching benchmarks.
+size_t RunSpecifiedBenchmarks();
+size_t RunSpecifiedBenchmarks(BenchmarkReporter* console_reporter);
+size_t RunSpecifiedBenchmarks(BenchmarkReporter* console_reporter,
+                              BenchmarkReporter* file_reporter);
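+
+// A minimal usage sketch of the reporter overloads (ConsoleReporter is
+// declared later in this header; the setup shown is illustrative):
+//   int main(int argc, char** argv) {
+//     benchmark::Initialize(&argc, argv);
+//     benchmark::ConsoleReporter reporter;
+//     benchmark::RunSpecifiedBenchmarks(&reporter);
+//   }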
+
+// If this routine is called, peak memory allocation past this point in the
+// benchmark is reported at the end of the benchmark report line. (It is
+// computed by running the benchmark once with a single iteration and a memory
+// tracer.)
+// TODO(dominic)
+// void MemoryUsage();
+
+namespace internal {
+class Benchmark;
+class BenchmarkImp;
+class BenchmarkFamilies;
+
+void UseCharPointer(char const volatile*);
+
+// Take ownership of the pointer and register the benchmark. Return the
+// registered benchmark.
+Benchmark* RegisterBenchmarkInternal(Benchmark*);
+
+// Ensure that the standard streams are properly initialized in every TU.
+int InitializeStreams();
+BENCHMARK_UNUSED static int stream_init_anchor = InitializeStreams();
+
+}  // end namespace internal
+
+
+#if !defined(__GNUC__) || defined(__pnacl__) || defined(EMSCRIPTN)
+# define BENCHMARK_HAS_NO_INLINE_ASSEMBLY
+#endif
+
+// The DoNotOptimize(...) function can be used to prevent a value or
+// expression from being optimized away by the compiler. This function is
+// intended to add little to no overhead.
+// See: https://youtu.be/nXaxk27zwlk?t=2441
+#ifndef BENCHMARK_HAS_NO_INLINE_ASSEMBLY
+template <class Tp>
+inline BENCHMARK_ALWAYS_INLINE void DoNotOptimize(Tp const& value) {
+  // Clang doesn't like the 'X' constraint on `value` and certain GCC versions
+  // don't like the 'g' constraint. Attempt to placate them both.
+#if defined(__clang__)
+  asm volatile("" : : "g"(value) : "memory");
+#else
+  asm volatile("" : : "i,r,m"(value) : "memory");
+#endif
+}
+// Force the compiler to flush pending writes to global memory. Acts as an
+// effective read/write barrier
+inline BENCHMARK_ALWAYS_INLINE void ClobberMemory() {
+  asm volatile("" : : : "memory");
+}
+#elif defined(_MSC_VER)
+template <class Tp>
+inline BENCHMARK_ALWAYS_INLINE void DoNotOptimize(Tp const& value) {
+  internal::UseCharPointer(&reinterpret_cast<char const volatile&>(value));
+  _ReadWriteBarrier();
+}
+
+inline BENCHMARK_ALWAYS_INLINE void ClobberMemory() {
+  _ReadWriteBarrier();
+}
+#else
+template <class Tp>
+inline BENCHMARK_ALWAYS_INLINE void DoNotOptimize(Tp const& value) {
+  internal::UseCharPointer(&reinterpret_cast<char const volatile&>(value));
+}
+// FIXME Add ClobberMemory() for non-gnu and non-msvc compilers
+#endif
+
+
+
+// This class is used for user-defined counters.
+class Counter {
+public:
+
+  enum Flags {
+    kDefaults   = 0,
+    // Mark the counter as a rate. It will be presented divided
+    // by the duration of the benchmark.
+    kIsRate     = 1,
+    // Mark the counter as a thread-average quantity. It will be
+    // presented divided by the number of threads.
+    kAvgThreads = 2,
+    // Mark the counter as a thread-average rate. See above.
+    kAvgThreadsRate = kIsRate|kAvgThreads
+  };
+
+  double value;
+  Flags  flags;
+
+  BENCHMARK_ALWAYS_INLINE
+  Counter(double v = 0., Flags f = kDefaults) : value(v), flags(f) {}
+
+  BENCHMARK_ALWAYS_INLINE operator double const& () const { return value; }
+  BENCHMARK_ALWAYS_INLINE operator double      & ()       { return value; }
+
+};
+
+// This is the container for the user-defined counters.
+typedef std::map<std::string, Counter> UserCounters;
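+
+// A minimal usage sketch of user-defined counters (the counter name
+// "bytes" is illustrative; State::counters is declared below):
+//   static void BM_Counted(benchmark::State& state) {
+//     size_t bytes = 0;
+//     while (state.KeepRunning()) {
+//       // ... measured work that produces bytes ...
+//     }
+//     state.counters["bytes"] = benchmark::Counter(
+//         static_cast<double>(bytes), benchmark::Counter::kIsRate);
+//   }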
+
+
+// TimeUnit is passed to a benchmark in order to specify the order of magnitude
+// for the measured time.
+enum TimeUnit { kNanosecond, kMicrosecond, kMillisecond };
+
+// BigO is passed to a benchmark in order to specify the asymptotic
+// computational complexity for the benchmark. In case oAuto is selected,
+// complexity will be calculated automatically to the best fit.
+enum BigO { oNone, o1, oN, oNSquared, oNCubed, oLogN, oNLogN, oAuto, oLambda };
+
+// BigOFunc is passed to a benchmark in order to specify the asymptotic
+// computational complexity for the benchmark.
+typedef double(BigOFunc)(int);
+
+namespace internal {
+class ThreadTimer;
+class ThreadManager;
+
+#if defined(BENCHMARK_HAS_CXX11)
+enum ReportMode : unsigned {
+#else
+enum ReportMode {
+#endif
+  RM_Unspecified,  // The mode has not been manually specified
+  RM_Default,      // The mode is user-specified as default.
+  RM_ReportAggregatesOnly
+};
+}
+
+// State is passed to a running Benchmark and contains state for the
+// benchmark to use.
+class State {
+ public:
+  // Returns true if the benchmark should continue through another iteration.
+  // NOTE: A benchmark may not return from the test until KeepRunning() has
+  // returned false.
+  bool KeepRunning() {
+    if (BENCHMARK_BUILTIN_EXPECT(!started_, false)) {
+      StartKeepRunning();
+    }
+    bool const res = total_iterations_++ < max_iterations;
+    if (BENCHMARK_BUILTIN_EXPECT(!res, false)) {
+      FinishKeepRunning();
+    }
+    return res;
+  }
+
+  // REQUIRES: timer is running and 'SkipWithError(...)' has not been called
+  //           by the current thread.
+  // Stop the benchmark timer. If not called, the timer will be
+  // automatically stopped after KeepRunning() returns false for the first time.
+  //
+  // For threaded benchmarks the PauseTiming() function only pauses the timing
+  // for the current thread.
+  //
+  // NOTE: The "real time" measurement is per-thread. If different threads
+  // report different measurements the largest one is reported.
+  //
+  // NOTE: PauseTiming()/ResumeTiming() are relatively
+  // heavyweight, and so their use should generally be avoided
+  // within each benchmark iteration, if possible.
+  void PauseTiming();
+
+  // REQUIRES: timer is not running and 'SkipWithError(...)' has not been called
+  //           by the current thread.
+  // Start the benchmark timer. The timer is NOT running on entrance to the
+  // benchmark function. It begins running after the first call to KeepRunning().
+  //
+  // NOTE: PauseTiming()/ResumeTiming() are relatively
+  // heavyweight, and so their use should generally be avoided
+  // within each benchmark iteration, if possible.
+  void ResumeTiming();
+
+  // REQUIRES: 'SkipWithError(...)' has not been called previously by the
+  //           current thread.
+  // Skip any future iterations of the 'KeepRunning()' loop in the current
+  // thread and report an error with the specified 'msg'. After this call
+  // the user may explicitly 'return' from the benchmark.
+  //
+  // For threaded benchmarks only the current thread stops executing and future
+  // calls to `KeepRunning()` will block until all threads have completed
+  // the `KeepRunning()` loop. If multiple threads report an error only the
+  // first error message is used.
+  //
+  // NOTE: Calling 'SkipWithError(...)' does not cause the benchmark to exit
+  // the current scope immediately. If the function is called from within
+  // the 'KeepRunning()' loop the current iteration will finish. It is the
+  // user's responsibility to exit the scope as needed.
+  void SkipWithError(const char* msg);
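+
+  // A minimal usage sketch (the failure condition is illustrative):
+  //   static void BM_ReadFile(benchmark::State& state) {
+  //     FILE* f = fopen("data.bin", "rb");
+  //     if (f == nullptr) {
+  //       state.SkipWithError("could not open data.bin");
+  //       return;
+  //     }
+  //     while (state.KeepRunning()) {
+  //       // ... measured reads ...
+  //     }
+  //     fclose(f);
+  //   }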
+
+  // REQUIRES: called exactly once per iteration of the KeepRunning loop.
+  // Set the manually measured time for this benchmark iteration, which
+  // is used instead of automatically measured time if UseManualTime() was
+  // specified.
+  //
+  // For threaded benchmarks the final value will be set to the largest
+  // reported values.
+  void SetIterationTime(double seconds);
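+
+  // A minimal usage sketch (the timed region is illustrative and requires
+  // <chrono>; see also Benchmark::UseManualTime() below):
+  //   static void BM_ManuallyTimed(benchmark::State& state) {
+  //     while (state.KeepRunning()) {
+  //       auto start = std::chrono::high_resolution_clock::now();
+  //       // ... launch asynchronous work and wait for it to finish ...
+  //       auto end = std::chrono::high_resolution_clock::now();
+  //       state.SetIterationTime(
+  //           std::chrono::duration<double>(end - start).count());
+  //     }
+  //   }
+  //   BENCHMARK(BM_ManuallyTimed)->UseManualTime();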
+
+  // Set the number of bytes processed by the current benchmark
+  // execution. This routine is typically called once at the end of a
+  // throughput oriented benchmark. If this routine is called with a
+  // value > 0, the report is printed in MB/sec instead of nanoseconds
+  // per iteration.
+  //
+  // REQUIRES: a benchmark has exited its KeepRunning loop.
+  BENCHMARK_ALWAYS_INLINE
+  void SetBytesProcessed(size_t bytes) { bytes_processed_ = bytes; }
+
+  BENCHMARK_ALWAYS_INLINE
+  size_t bytes_processed() const { return bytes_processed_; }
+
+  // If this routine is called with complexity_n > 0 and a complexity report is
+  // requested for the family benchmark, then the current benchmark will be part
+  // of the computation and complexity_n will represent the length of N.
+  BENCHMARK_ALWAYS_INLINE
+  void SetComplexityN(int complexity_n) { complexity_n_ = complexity_n; }
+
+  BENCHMARK_ALWAYS_INLINE
+  int complexity_length_n() { return complexity_n_; }
+
+  // If this routine is called with items > 0, then an items/s
+  // label is printed on the benchmark report line for the currently
+  // executing benchmark. It is typically called at the end of a processing
+  // benchmark where a processing items/second output is desired.
+  //
+  // REQUIRES: a benchmark has exited its KeepRunning loop.
+  BENCHMARK_ALWAYS_INLINE
+  void SetItemsProcessed(size_t items) { items_processed_ = items; }
+
+  BENCHMARK_ALWAYS_INLINE
+  size_t items_processed() const { return items_processed_; }
+
+  // If this routine is called, the specified label is printed at the
+  // end of the benchmark report line for the currently executing
+  // benchmark.  Example:
+  //   static void BM_Compress(benchmark::State& state) {
+  //     ...
+  //     double compression = input_size / output_size;
+  //     state.SetLabel(StringPrintf("compress:%.1f%%", 100.0*compression));
+  //   }
+  // Produces output that looks like:
+  //   BM_Compress   50   50   14115038   compress:27.3%
+  //
+  // REQUIRES: a benchmark has exited its KeepRunning loop.
+  void SetLabel(const char* label);
+
+  void BENCHMARK_ALWAYS_INLINE SetLabel(const std::string& str) {
+    this->SetLabel(str.c_str());
+  }
+
+  // Range arguments for this run. CHECKs if the argument has been set.
+  BENCHMARK_ALWAYS_INLINE
+  int range(std::size_t pos = 0) const {
+    assert(range_.size() > pos);
+    return range_[pos];
+  }
+
+  BENCHMARK_DEPRECATED_MSG("use 'range(0)' instead")
+  int range_x() const { return range(0); }
+
+  BENCHMARK_DEPRECATED_MSG("use 'range(1)' instead")
+  int range_y() const { return range(1); }
+
+  BENCHMARK_ALWAYS_INLINE
+  size_t iterations() const { return total_iterations_; }
+
+ private:
+  bool started_;
+  bool finished_;
+  size_t total_iterations_;
+
+  std::vector<int> range_;
+
+  size_t bytes_processed_;
+  size_t items_processed_;
+
+  int complexity_n_;
+
+  bool error_occurred_;
+
+ public:
+  // Container for user-defined counters.
+  UserCounters counters;
+  // Index of the executing thread. Values from [0, threads).
+  const int thread_index;
+  // Number of threads concurrently executing the benchmark.
+  const int threads;
+  const size_t max_iterations;
+
+  // TODO make me private
+  State(size_t max_iters, const std::vector<int>& ranges, int thread_i,
+        int n_threads, internal::ThreadTimer* timer,
+        internal::ThreadManager* manager);
+
+ private:
+  void StartKeepRunning();
+  void FinishKeepRunning();
+  internal::ThreadTimer* timer_;
+  internal::ThreadManager* manager_;
+  BENCHMARK_DISALLOW_COPY_AND_ASSIGN(State);
+};
+
+namespace internal {
+
+typedef void(Function)(State&);
+
+// ------------------------------------------------------
+// Benchmark registration object. The BENCHMARK() macro expands
+// into an internal::Benchmark* object. Various methods can
+// be called on this object to change the properties of the benchmark.
+// Each method returns "this" so that multiple method calls can
+// be chained into one expression.
+class Benchmark {
+ public:
+  virtual ~Benchmark();
+
+  // Note: the following methods all return "this" so that multiple
+  // method calls can be chained together in one expression.
+
+  // Run this benchmark once with "x" as the extra argument passed
+  // to the function.
+  // REQUIRES: The function passed to the constructor must accept an arg1.
+  Benchmark* Arg(int x);
+
+  // Run this benchmark with the given time unit for the generated output report.
+  Benchmark* Unit(TimeUnit unit);
+
+  // Run this benchmark once for a number of values picked from the
+  // range [start..limit].  (start and limit are always picked.)
+  // REQUIRES: The function passed to the constructor must accept an arg1.
+  Benchmark* Range(int start, int limit);
+
+  // Run this benchmark once for all values in the range [start..limit] with
+  // a specific step.
+  // REQUIRES: The function passed to the constructor must accept an arg1.
+  Benchmark* DenseRange(int start, int limit, int step = 1);
+
+  // Run this benchmark once with "args" as the extra arguments passed
+  // to the function.
+  // REQUIRES: The function passed to the constructor must accept arg1, arg2 ...
+  Benchmark* Args(const std::vector<int>& args);
+
+  // Equivalent to Args({x, y})
+  // NOTE: This is a legacy C++03 interface provided for compatibility only.
+  //   New code should use 'Args'.
+  Benchmark* ArgPair(int x, int y) {
+    std::vector<int> args;
+    args.push_back(x);
+    args.push_back(y);
+    return Args(args);
+  }
+
+  // Run this benchmark once for a number of values picked from the
+  // ranges [start..limit].  (starts and limits are always picked.)
+  // REQUIRES: The function passed to the constructor must accept arg1, arg2 ...
+  Benchmark* Ranges(const std::vector<std::pair<int, int> >& ranges);
+
+  // Equivalent to ArgNames({name})
+  Benchmark* ArgName(const std::string& name);
+
+  // Set the argument names to display in the benchmark name. If not called,
+  // only argument values will be shown.
+  Benchmark* ArgNames(const std::vector<std::string>& names);
+
+  // Equivalent to Ranges({{lo1, hi1}, {lo2, hi2}}).
+  // NOTE: This is a legacy C++03 interface provided for compatibility only.
+  //   New code should use 'Ranges'.
+  Benchmark* RangePair(int lo1, int hi1, int lo2, int hi2) {
+    std::vector<std::pair<int, int> > ranges;
+    ranges.push_back(std::make_pair(lo1, hi1));
+    ranges.push_back(std::make_pair(lo2, hi2));
+    return Ranges(ranges);
+  }
+
+  // Pass this benchmark object to *func, which can customize
+  // the benchmark by calling various methods like Arg, Args,
+  // Threads, etc.
+  Benchmark* Apply(void (*func)(Benchmark* benchmark));
+
+  // Set the range multiplier for non-dense range. If not called, the range
+  // multiplier kRangeMultiplier will be used.
+  Benchmark* RangeMultiplier(int multiplier);
+
+  // Set the minimum amount of time to use when running this benchmark. This
+  // option overrides the `benchmark_min_time` flag.
+  // REQUIRES: `t > 0` and `Iterations` has not been called on this benchmark.
+  Benchmark* MinTime(double t);
+
+  // Specify the number of iterations that should be run by this benchmark.
+  // REQUIRES: 'n > 0' and `MinTime` has not been called on this benchmark.
+  //
+  // NOTE: This function should only be used when *exact* iteration control is
+  //   needed and never to control or limit how long a benchmark runs, where
+  //   `--benchmark_min_time=N` or `MinTime(...)` should be used instead.
+  Benchmark* Iterations(size_t n);
+
+  // Specify the number of times to repeat this benchmark. This option
+  // overrides the `benchmark_repetitions` flag.
+  // REQUIRES: `n > 0`
+  Benchmark* Repetitions(int n);
+
+  // Specify if each repetition of the benchmark should be reported separately
+  // or if only the final statistics should be reported. If the benchmark
+  // is not repeated then the single result is always reported.
+  Benchmark* ReportAggregatesOnly(bool v = true);
+
+  // If a particular benchmark is I/O bound, runs multiple threads internally,
+  // or if for some reason CPU timings are not representative, call this
+  // method. If called, the elapsed time will be used to control how many
+  // iterations are run, and in the printing of items/second or MB/second
+  // values. If not called, the cpu time used by the benchmark will be used.
+  Benchmark* UseRealTime();
+
+  // If a benchmark must measure time manually (e.g. if GPU execution time is
+  // being measured), call this method. If called, each benchmark iteration
+  // should call SetIterationTime(seconds) to report the measured time, which
+  // will be used to control how many iterations are run, and in the printing
+  // of items/second or MB/second values.
+  Benchmark* UseManualTime();
+
+  // Set the asymptotic computational complexity for the benchmark. If called
+  // the asymptotic computational complexity will be shown on the output.
+  Benchmark* Complexity(BigO complexity = benchmark::oAuto);
+
+  // Set the asymptotic computational complexity for the benchmark. If called
+  // the asymptotic computational complexity will be shown on the output.
+  Benchmark* Complexity(BigOFunc* complexity);
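+
+  // A minimal usage sketch (the workload is illustrative):
+  //   static void BM_StringCompare(benchmark::State& state) {
+  //     std::string s1(state.range(0), '-');
+  //     std::string s2(state.range(0), '-');
+  //     while (state.KeepRunning()) {
+  //       benchmark::DoNotOptimize(s1.compare(s2));
+  //     }
+  //     state.SetComplexityN(state.range(0));
+  //   }
+  //   BENCHMARK(BM_StringCompare)->Range(1<<10, 1<<18)->Complexity(benchmark::oN);
+  //   // Or let the framework fit the best curve automatically:
+  //   BENCHMARK(BM_StringCompare)->Range(1<<10, 1<<18)->Complexity();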
+
+  // Support for running multiple copies of the same benchmark concurrently
+  // in multiple threads. This may be useful when measuring the scaling
+  // of some piece of code.
+
+  // Run one instance of this benchmark concurrently in t threads.
+  Benchmark* Threads(int t);
+
+  // Pick a set of values T from [min_threads,max_threads].
+  // min_threads and max_threads are always included in T. Run this
+  // benchmark once for each value in T. The benchmark run for a
+  // particular value t consists of t threads running the benchmark
+  // function concurrently. For example, consider:
+  //   BENCHMARK(Foo)->ThreadRange(1,16);
+  // This will run the following benchmarks:
+  //   Foo in 1 thread
+  //   Foo in 2 threads
+  //   Foo in 4 threads
+  //   Foo in 8 threads
+  //   Foo in 16 threads
+  Benchmark* ThreadRange(int min_threads, int max_threads);
+
+  // For each value n in the range, run this benchmark once using n threads.
+  // min_threads and max_threads are always included in the range.
+  // stride specifies the increment. E.g. DenseThreadRange(1, 8, 3) starts
+  // a benchmark with 1, 4, 7 and 8 threads.
+  Benchmark* DenseThreadRange(int min_threads, int max_threads, int stride = 1);
+
+  // Equivalent to ThreadRange(NumCPUs(), NumCPUs())
+  Benchmark* ThreadPerCpu();
+
+  virtual void Run(State& state) = 0;
+
+  // Used inside the benchmark implementation
+  struct Instance;
+
+ protected:
+  explicit Benchmark(const char* name);
+  Benchmark(Benchmark const&);
+  void SetName(const char* name);
+
+  int ArgsCnt() const;
+
+  static void AddRange(std::vector<int>* dst, int lo, int hi, int mult);
+
+ private:
+  friend class BenchmarkFamilies;
+
+  std::string name_;
+  ReportMode report_mode_;
+  std::vector<std::string> arg_names_;   // Args for all benchmark runs
+  std::vector<std::vector<int> > args_;  // Args for all benchmark runs
+  TimeUnit time_unit_;
+  int range_multiplier_;
+  double min_time_;
+  size_t iterations_;
+  int repetitions_;
+  bool use_real_time_;
+  bool use_manual_time_;
+  BigO complexity_;
+  BigOFunc* complexity_lambda_;
+  std::vector<int> thread_counts_;
+
+  Benchmark& operator=(Benchmark const&);
+};
+
+}  // namespace internal
+
+// Create and register a benchmark with the specified 'name' that invokes
+// the specified functor 'fn'.
+//
+// RETURNS: A pointer to the registered benchmark.
+internal::Benchmark* RegisterBenchmark(const char* name,
+                                       internal::Function* fn);
+
+#if defined(BENCHMARK_HAS_CXX11)
+template <class Lambda>
+internal::Benchmark* RegisterBenchmark(const char* name, Lambda&& fn);
+#endif
+
+// Remove all registered benchmarks. All pointers to previously registered
+// benchmarks are invalidated.
+void ClearRegisteredBenchmarks();
+
+namespace internal {
+// The class used to hold all Benchmarks created from static functions
+// (i.e. those created using the BENCHMARK(...) macros).
+class FunctionBenchmark : public Benchmark {
+ public:
+  FunctionBenchmark(const char* name, Function* func)
+      : Benchmark(name), func_(func) {}
+
+  virtual void Run(State& st);
+
+ private:
+  Function* func_;
+};
+
+#ifdef BENCHMARK_HAS_CXX11
+template <class Lambda>
+class LambdaBenchmark : public Benchmark {
+ public:
+  virtual void Run(State& st) { lambda_(st); }
+
+ private:
+  template <class OLambda>
+  LambdaBenchmark(const char* name, OLambda&& lam)
+      : Benchmark(name), lambda_(std::forward<OLambda>(lam)) {}
+
+  LambdaBenchmark(LambdaBenchmark const&) = delete;
+
+ private:
+  template <class Lam>
+  friend Benchmark* ::benchmark::RegisterBenchmark(const char*, Lam&&);
+
+  Lambda lambda_;
+};
+#endif
+
+}  // end namespace internal
+
+inline internal::Benchmark* RegisterBenchmark(const char* name,
+                                              internal::Function* fn) {
+  return internal::RegisterBenchmarkInternal(
+      ::new internal::FunctionBenchmark(name, fn));
+}
+
+#ifdef BENCHMARK_HAS_CXX11
+template <class Lambda>
+internal::Benchmark* RegisterBenchmark(const char* name, Lambda&& fn) {
+  using BenchType =
+      internal::LambdaBenchmark<typename std::decay<Lambda>::type>;
+  return internal::RegisterBenchmarkInternal(
+      ::new BenchType(name, std::forward<Lambda>(fn)));
+}
+#endif
+
+#if defined(BENCHMARK_HAS_CXX11) && \
+    (!defined(BENCHMARK_GCC_VERSION) || BENCHMARK_GCC_VERSION >= 409)
+template <class Lambda, class... Args>
+internal::Benchmark* RegisterBenchmark(const char* name, Lambda&& fn,
+                                       Args&&... args) {
+  return benchmark::RegisterBenchmark(
+      name, [=](benchmark::State& st) { fn(st, args...); });
+}
+#else
+#define BENCHMARK_HAS_NO_VARIADIC_REGISTER_BENCHMARK
+#endif
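+
+// A minimal usage sketch of run-time registration (C++11; the benchmark
+// name "StringCopy" is illustrative):
+//   int main(int argc, char** argv) {
+//     benchmark::RegisterBenchmark("StringCopy", [](benchmark::State& st) {
+//       std::string x = "hello";
+//       while (st.KeepRunning()) {
+//         std::string copy(x);
+//         benchmark::DoNotOptimize(copy);
+//       }
+//     });
+//     benchmark::Initialize(&argc, argv);
+//     benchmark::RunSpecifiedBenchmarks();
+//   }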
+
+// The base class for all fixture tests.
+class Fixture : public internal::Benchmark {
+ public:
+  Fixture() : internal::Benchmark("") {}
+
+  virtual void Run(State& st) {
+    this->SetUp(st);
+    this->BenchmarkCase(st);
+    this->TearDown(st);
+  }
+
+  // These will be deprecated ...
+  virtual void SetUp(const State&) {}
+  virtual void TearDown(const State&) {}
+  // ... In favor of these.
+  virtual void SetUp(State& st) { SetUp(const_cast<const State&>(st)); }
+  virtual void TearDown(State& st) { TearDown(const_cast<const State&>(st)); }
+
+ protected:
+  virtual void BenchmarkCase(State&) = 0;
+};
+
+}  // end namespace benchmark
+
+// ------------------------------------------------------
+// Macro to register benchmarks
+
+// Check that __COUNTER__ is defined and that __COUNTER__ increases by 1
+// every time it is expanded. X + 1 == X + 0 is used in case X is defined to be
+// empty. If X is empty the expression becomes (+1 == +0).
+#if defined(__COUNTER__) && (__COUNTER__ + 1 == __COUNTER__ + 0)
+#define BENCHMARK_PRIVATE_UNIQUE_ID __COUNTER__
+#else
+#define BENCHMARK_PRIVATE_UNIQUE_ID __LINE__
+#endif
+
+// Helpers for generating unique variable names
+#define BENCHMARK_PRIVATE_NAME(n) \
+  BENCHMARK_PRIVATE_CONCAT(_benchmark_, BENCHMARK_PRIVATE_UNIQUE_ID, n)
+#define BENCHMARK_PRIVATE_CONCAT(a, b, c) BENCHMARK_PRIVATE_CONCAT2(a, b, c)
+#define BENCHMARK_PRIVATE_CONCAT2(a, b, c) a##b##c
+
+#define BENCHMARK_PRIVATE_DECLARE(n)                                 \
+  static ::benchmark::internal::Benchmark* BENCHMARK_PRIVATE_NAME(n) \
+      BENCHMARK_UNUSED
+
+#define BENCHMARK(n)                                     \
+  BENCHMARK_PRIVATE_DECLARE(n) =                         \
+      (::benchmark::internal::RegisterBenchmarkInternal( \
+          new ::benchmark::internal::FunctionBenchmark(#n, n)))
+
+// Old-style macros
+#define BENCHMARK_WITH_ARG(n, a) BENCHMARK(n)->Arg((a))
+#define BENCHMARK_WITH_ARG2(n, a1, a2) BENCHMARK(n)->Args({(a1), (a2)})
+#define BENCHMARK_WITH_UNIT(n, t) BENCHMARK(n)->Unit((t))
+#define BENCHMARK_RANGE(n, lo, hi) BENCHMARK(n)->Range((lo), (hi))
+#define BENCHMARK_RANGE2(n, l1, h1, l2, h2) \
+  BENCHMARK(n)->RangePair({{(l1), (h1)}, {(l2), (h2)}})
+
+#if __cplusplus >= 201103L
+
+// Register a benchmark which invokes the function specified by `func`
+// with the additional arguments specified by `...`.
+//
+// For example:
+//
+// template <class ...ExtraArgs>
+// void BM_takes_args(benchmark::State& state, ExtraArgs&&... extra_args) {
+//  [...]
+//}
+// /* Registers a benchmark named "BM_takes_args/int_string_test" */
+// BENCHMARK_CAPTURE(BM_takes_args, int_string_test, 42, std::string("abc"));
+#define BENCHMARK_CAPTURE(func, test_case_name, ...)     \
+  BENCHMARK_PRIVATE_DECLARE(func) =                      \
+      (::benchmark::internal::RegisterBenchmarkInternal( \
+          new ::benchmark::internal::FunctionBenchmark(  \
+              #func "/" #test_case_name,                 \
+              [](::benchmark::State& st) { func(st, __VA_ARGS__); })))
+
+#endif  // __cplusplus >= 11
+
+// This will register a benchmark for a templatized function.  For example:
+//
+// template <class T>
+// void BM_Foo(int iters);
+//
+// BENCHMARK_TEMPLATE(BM_Foo, 1);
+//
+// will register BM_Foo<1> as a benchmark.
+#define BENCHMARK_TEMPLATE1(n, a)                        \
+  BENCHMARK_PRIVATE_DECLARE(n) =                         \
+      (::benchmark::internal::RegisterBenchmarkInternal( \
+          new ::benchmark::internal::FunctionBenchmark(#n "<" #a ">", n<a>)))
+
+#define BENCHMARK_TEMPLATE2(n, a, b)                                         \
+  BENCHMARK_PRIVATE_DECLARE(n) =                                             \
+      (::benchmark::internal::RegisterBenchmarkInternal(                     \
+          new ::benchmark::internal::FunctionBenchmark(#n "<" #a "," #b ">", \
+                                                       n<a, b>)))
+
+#if __cplusplus >= 201103L
+#define BENCHMARK_TEMPLATE(n, ...)                       \
+  BENCHMARK_PRIVATE_DECLARE(n) =                         \
+      (::benchmark::internal::RegisterBenchmarkInternal( \
+          new ::benchmark::internal::FunctionBenchmark(  \
+              #n "<" #__VA_ARGS__ ">", n<__VA_ARGS__>)))
+#else
+#define BENCHMARK_TEMPLATE(n, a) BENCHMARK_TEMPLATE1(n, a)
+#endif
+
+#define BENCHMARK_PRIVATE_DECLARE_F(BaseClass, Method)        \
+  class BaseClass##_##Method##_Benchmark : public BaseClass { \
+   public:                                                    \
+    BaseClass##_##Method##_Benchmark() : BaseClass() {        \
+      this->SetName(#BaseClass "/" #Method);                  \
+    }                                                         \
+                                                              \
+   protected:                                                 \
+    virtual void BenchmarkCase(::benchmark::State&);          \
+  };
+
+#define BENCHMARK_DEFINE_F(BaseClass, Method)    \
+  BENCHMARK_PRIVATE_DECLARE_F(BaseClass, Method) \
+  void BaseClass##_##Method##_Benchmark::BenchmarkCase
+
+#define BENCHMARK_REGISTER_F(BaseClass, Method) \
+  BENCHMARK_PRIVATE_REGISTER_F(BaseClass##_##Method##_Benchmark)
+
+#define BENCHMARK_PRIVATE_REGISTER_F(TestName) \
+  BENCHMARK_PRIVATE_DECLARE(TestName) =        \
+      (::benchmark::internal::RegisterBenchmarkInternal(new TestName()))
+
+// This macro will define and register a benchmark within a fixture class.
+#define BENCHMARK_F(BaseClass, Method)           \
+  BENCHMARK_PRIVATE_DECLARE_F(BaseClass, Method) \
+  BENCHMARK_REGISTER_F(BaseClass, Method);       \
+  void BaseClass##_##Method##_Benchmark::BenchmarkCase
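+
+// A minimal usage sketch (MyFixture and the body are illustrative):
+//   class MyFixture : public benchmark::Fixture {};
+//
+//   BENCHMARK_F(MyFixture, FooTest)(benchmark::State& st) {
+//     while (st.KeepRunning()) {
+//       // ... measured work using fixture members ...
+//     }
+//   }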
+
+// Helper macro to create a main routine in a test that runs the benchmarks
+#define BENCHMARK_MAIN()                                                \
+  int main(int argc, char** argv) {                                     \
+    ::benchmark::Initialize(&argc, argv);                               \
+    if (::benchmark::ReportUnrecognizedArguments(argc, argv)) return 1; \
+    ::benchmark::RunSpecifiedBenchmarks();                              \
+  }
+
+
+// ------------------------------------------------------
+// Benchmark Reporters
+
+namespace benchmark {
+
+// Interface for custom benchmark result printers.
+// By default, benchmark reports are printed to stdout. However an application
+// can control the destination of the reports by calling
+// RunSpecifiedBenchmarks and passing it a custom reporter object.
+// The reporter object must implement the following interface.
+class BenchmarkReporter {
+ public:
+  struct Context {
+    int num_cpus;
+    double mhz_per_cpu;
+    bool cpu_scaling_enabled;
+
+    // The number of chars in the longest benchmark name.
+    size_t name_field_width;
+  };
+
+  struct Run {
+    Run()
+        : error_occurred(false),
+          iterations(1),
+          time_unit(kNanosecond),
+          real_accumulated_time(0),
+          cpu_accumulated_time(0),
+          bytes_per_second(0),
+          items_per_second(0),
+          max_heapbytes_used(0),
+          complexity(oNone),
+          complexity_lambda(),
+          complexity_n(0),
+          report_big_o(false),
+          report_rms(false),
+          counters() {}
+
+    std::string benchmark_name;
+    std::string report_label;  // Empty if not set by benchmark.
+    bool error_occurred;
+    std::string error_message;
+
+    int64_t iterations;
+    TimeUnit time_unit;
+    double real_accumulated_time;
+    double cpu_accumulated_time;
+
+    // Return a value representing the real time per iteration in the unit
+    // specified by 'time_unit'.
+    // NOTE: If 'iterations' is zero the returned value represents the
+    // accumulated time.
+    double GetAdjustedRealTime() const;
+
+    // Return a value representing the cpu time per iteration in the unit
+    // specified by 'time_unit'.
+    // NOTE: If 'iterations' is zero the returned value represents the
+    // accumulated time.
+    double GetAdjustedCPUTime() const;
+
+    // Zero if not set by benchmark.
+    double bytes_per_second;
+    double items_per_second;
+
+    // This is set to 0.0 if memory tracing is not enabled.
+    double max_heapbytes_used;
+
+    // Keep track of arguments to compute asymptotic complexity
+    BigO complexity;
+    BigOFunc* complexity_lambda;
+    int complexity_n;
+
+    // Inform print function whether the current run is a complexity report
+    bool report_big_o;
+    bool report_rms;
+
+    UserCounters counters;
+  };
+
+  // Construct a BenchmarkReporter with the output stream set to 'std::cout'
+  // and the error stream set to 'std::cerr'
+  BenchmarkReporter();
+
+  // Called once for every suite of benchmarks run.
+  // The parameter "context" contains information that the
+  // reporter may wish to use when generating its report, for example the
+  // platform under which the benchmarks are running. The benchmark run is
+  // never started if this function returns false, allowing the reporter
+  // to skip runs based on the context information.
+  virtual bool ReportContext(const Context& context) = 0;
+
+  // Called once for each group of benchmark runs, gives information about
+  // cpu-time and heap memory usage during the benchmark run. If the group
+  // of runs contained more than two entries then 'report' contains additional
+  // elements representing the mean and standard deviation of those runs.
+  // Additionally if this group of runs was the last in a family of benchmarks
+  // 'reports' contains additional entries representing the asymptotic
+  // complexity and RMS of that benchmark family.
+  virtual void ReportRuns(const std::vector<Run>& report) = 0;
+
+  // Called once and only once after every group of benchmarks is run and
+  // reported.
+  virtual void Finalize() {}
+
+  // REQUIRES: The object referenced by 'out' is valid for the lifetime
+  // of the reporter.
+  void SetOutputStream(std::ostream* out) {
+    assert(out);
+    output_stream_ = out;
+  }
+
+  // REQUIRES: The object referenced by 'err' is valid for the lifetime
+  // of the reporter.
+  void SetErrorStream(std::ostream* err) {
+    assert(err);
+    error_stream_ = err;
+  }
+
+  std::ostream& GetOutputStream() const { return *output_stream_; }
+
+  std::ostream& GetErrorStream() const { return *error_stream_; }
+
+  virtual ~BenchmarkReporter();
+
+  // Write a human readable string to 'out' representing the specified
+  // 'context'.
+  // REQUIRES: 'out' is non-null.
+  static void PrintBasicContext(std::ostream* out, Context const& context);
+
+ private:
+  std::ostream* output_stream_;
+  std::ostream* error_stream_;
+};
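+
+// A minimal sketch of a custom reporter built on this interface (the
+// class name and output format are illustrative):
+//   class MinimalReporter : public benchmark::BenchmarkReporter {
+//    public:
+//     virtual bool ReportContext(const Context&) { return true; }
+//     virtual void ReportRuns(const std::vector<Run>& reports) {
+//       for (const Run& run : reports)
+//         GetOutputStream() << run.benchmark_name << "\n";
+//     }
+//   };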
+
+// Simple reporter that outputs benchmark data to the console. This is the
+// default reporter used by RunSpecifiedBenchmarks().
+class ConsoleReporter : public BenchmarkReporter {
+public:
+  enum OutputOptions {
+    OO_None = 0,
+    OO_Color = 1,
+    OO_Tabular = 2,
+    OO_ColorTabular = OO_Color|OO_Tabular,
+    OO_Defaults = OO_ColorTabular
+  };
+  explicit ConsoleReporter(OutputOptions opts_ = OO_Defaults)
+      : output_options_(opts_), name_field_width_(0),
+        prev_counters_(), printed_header_(false) {}
+
+  virtual bool ReportContext(const Context& context);
+  virtual void ReportRuns(const std::vector<Run>& reports);
+
+ protected:
+  virtual void PrintRunData(const Run& report);
+  virtual void PrintHeader(const Run& report);
+
+  OutputOptions output_options_;
+  size_t name_field_width_;
+  UserCounters prev_counters_;
+  bool printed_header_;
+};
+
+class JSONReporter : public BenchmarkReporter {
+ public:
+  JSONReporter() : first_report_(true) {}
+  virtual bool ReportContext(const Context& context);
+  virtual void ReportRuns(const std::vector<Run>& reports);
+  virtual void Finalize();
+
+ private:
+  void PrintRunData(const Run& report);
+
+  bool first_report_;
+};
+
+class CSVReporter : public BenchmarkReporter {
+ public:
+  CSVReporter() : printed_header_(false) {}
+  virtual bool ReportContext(const Context& context);
+  virtual void ReportRuns(const std::vector<Run>& reports);
+
+ private:
+  void PrintRunData(const Run& report);
+
+  bool printed_header_;
+  std::set< std::string > user_counter_names_;
+};
+
+inline const char* GetTimeUnitString(TimeUnit unit) {
+  switch (unit) {
+    case kMillisecond:
+      return "ms";
+    case kMicrosecond:
+      return "us";
+    case kNanosecond:
+    default:
+      return "ns";
+  }
+}
+
+inline double GetTimeUnitMultiplier(TimeUnit unit) {
+  switch (unit) {
+    case kMillisecond:
+      return 1e3;
+    case kMicrosecond:
+      return 1e6;
+    case kNanosecond:
+    default:
+      return 1e9;
+  }
+}
+
+}  // end namespace benchmark
 
 #endif  // BENCHMARK_BENCHMARK_H_
diff --git a/include/benchmark/benchmark_api.h b/include/benchmark/benchmark_api.h
index 7f9a58ff..a9ae6714 100644
--- a/include/benchmark/benchmark_api.h
+++ b/include/benchmark/benchmark_api.h
@@ -1,925 +1,27 @@
-// Support for registering benchmarks for functions.
-
-/* Example usage:
-// Define a function that executes the code to be measured a
-// specified number of times:
-static void BM_StringCreation(benchmark::State& state) {
-  while (state.KeepRunning())
-    std::string empty_string;
-}
-
-// Register the function as a benchmark
-BENCHMARK(BM_StringCreation);
-
-// Define another benchmark
-static void BM_StringCopy(benchmark::State& state) {
-  std::string x = "hello";
-  while (state.KeepRunning())
-    std::string copy(x);
-}
-BENCHMARK(BM_StringCopy);
-
-// Augment the main() program to invoke benchmarks if specified
-// via the --benchmarks command line flag.  E.g.,
-//   my_unittest --benchmark_filter=all
-//   my_unittest --benchmark_filter=BM_StringCreation
-//   my_unittest --benchmark_filter=String
-//   my_unittest --benchmark_filter='Copy|Creation'
-int main(int argc, char** argv) {
-  benchmark::Initialize(&argc, argv);
-  benchmark::RunSpecifiedBenchmarks();
-  return 0;
-}
-
-// Sometimes a family of microbenchmarks can be implemented with
-// just one routine that takes an extra argument to specify which
-// one of the family of benchmarks to run.  For example, the following
-// code defines a family of microbenchmarks for measuring the speed
-// of memcpy() calls of different lengths:
-
-static void BM_memcpy(benchmark::State& state) {
-  char* src = new char[state.range(0)];
-  char* dst = new char[state.range(0)];
-  memset(src, 'x', state.range(0));
-  while (state.KeepRunning())
-    memcpy(dst, src, state.range(0));
-  state.SetBytesProcessed(int64_t(state.iterations()) *
-                          int64_t(state.range(0)));
-  delete[] src;
-  delete[] dst;
-}
-BENCHMARK(BM_memcpy)->Arg(8)->Arg(64)->Arg(512)->Arg(1<<10)->Arg(8<<10);
-
-// The preceding code is quite repetitive, and can be replaced with the
-// following short-hand. The following invocation will pick a few
-// appropriate arguments in the specified range and will generate a
-// microbenchmark for each such argument.
-BENCHMARK(BM_memcpy)->Range(8, 8<<10);
-
-// You might have a microbenchmark that depends on two inputs. For
-// example, the following code defines a family of microbenchmarks for
-// measuring the speed of set insertion.
-static void BM_SetInsert(benchmark::State& state) {
-  while (state.KeepRunning()) {
-    state.PauseTiming();
-    std::set<int> data = ConstructRandomSet(state.range(0));
-    state.ResumeTiming();
-    for (int j = 0; j < state.range(1); ++j)
-      data.insert(RandomNumber());
-  }
-}
-BENCHMARK(BM_SetInsert)
-    ->Args({1<<10, 1})
-    ->Args({1<<10, 8})
-    ->Args({1<<10, 64})
-    ->Args({1<<10, 512})
-    ->Args({8<<10, 1})
-    ->Args({8<<10, 8})
-    ->Args({8<<10, 64})
-    ->Args({8<<10, 512});
-
-// The preceding code is quite repetitive, and can be replaced with
-// the following short-hand. The following macro will pick a few
-// appropriate arguments in the product of the two specified ranges
-// and will generate a microbenchmark for each such pair.
-BENCHMARK(BM_SetInsert)->Ranges({{1<<10, 8<<10}, {1, 512}});
-
-// For more complex patterns of inputs, passing a custom function
-// to Apply allows programmatic specification of an
-// arbitrary set of arguments to run the microbenchmark on.
-// The following example enumerates a dense range on
-// one parameter, and a sparse range on the second.
-static void CustomArguments(benchmark::internal::Benchmark* b) {
-  for (int i = 0; i <= 10; ++i)
-    for (int j = 32; j <= 1024*1024; j *= 8)
-      b->Args({i, j});
-}
-BENCHMARK(BM_SetInsert)->Apply(CustomArguments);
-
-// Templated microbenchmarks work the same way:
-// Produce then consume 'size' messages 'iters' times
-// Measures throughput in the absence of multiprogramming.
-template <class Q> int BM_Sequential(benchmark::State& state) {
-  Q q;
-  typename Q::value_type v;
-  while (state.KeepRunning()) {
-    for (int i = state.range(0); i--; )
-      q.push(v);
-    for (int e = state.range(0); e--; )
-      q.Wait(&v);
-  }
-  // actually messages, not bytes:
-  state.SetBytesProcessed(
-      static_cast<int64_t>(state.iterations())*state.range(0));
-}
-BENCHMARK_TEMPLATE(BM_Sequential, WaitQueue<int>)->Range(1<<0, 1<<10);
-
-Use `Benchmark::MinTime(double t)` to set the minimum time used to run the
-benchmark. This option overrides the `benchmark_min_time` flag.
-
-void BM_test(benchmark::State& state) {
- ... body ...
-}
-BENCHMARK(BM_test)->MinTime(2.0); // Run for at least 2 seconds.
-
-In a multithreaded test, it is guaranteed that none of the threads will start
-until all have called KeepRunning, and all will have finished before KeepRunning
-returns false. As such, any global setup or teardown you want to do can be
-wrapped in a check against the thread index:
-
-static void BM_MultiThreaded(benchmark::State& state) {
-  if (state.thread_index == 0) {
-    // Setup code here.
-  }
-  while (state.KeepRunning()) {
-    // Run the test as normal.
-  }
-  if (state.thread_index == 0) {
-    // Teardown code here.
-  }
-}
-BENCHMARK(BM_MultiThreaded)->Threads(4);
-
-
-If a benchmark runs a few milliseconds it may be hard to visually compare the
-measured times, since the output data is given in nanoseconds per default. In
-order to manually set the time unit, you can specify it manually:
-
-BENCHMARK(BM_test)->Unit(benchmark::kMillisecond);
-*/
-
+// Copyright 2015 Google Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
 #ifndef BENCHMARK_BENCHMARK_API_H_
 #define BENCHMARK_BENCHMARK_API_H_
 
-#include <assert.h>
-#include <stddef.h>
-#include <stdint.h>
-
-#include <string>
-#include <vector>
-#include <map>
-
-#include "macros.h"
-
-#if defined(BENCHMARK_HAS_CXX11)
-#include <initializer_list>
-#include <type_traits>
-#include <utility>
+#ifdef __DEPRECATED
+# ifndef BENCHMARK_WARNING_MSG
+#   warning the benchmark_api.h header has been deprecated and will be removed, please include benchmark.h instead
+# else
+    BENCHMARK_WARNING_MSG("the benchmark_api.h header has been deprecated and will be removed, please include benchmark.h instead")
+# endif
 #endif
 
-#if defined(_MSC_VER)
-#include <intrin.h>  // for _ReadWriteBarrier
-#endif
-
-namespace benchmark {
-class BenchmarkReporter;
-
-void Initialize(int* argc, char** argv);
-
-// Report to stdout all arguments in 'argv' as unrecognized except the first.
-// Returns true there is at least on unrecognized argument (i.e. 'argc' > 1).
-bool ReportUnrecognizedArguments(int argc, char** argv);
-
-// Generate a list of benchmarks matching the specified --benchmark_filter flag
-// and if --benchmark_list_tests is specified return after printing the name
-// of each matching benchmark. Otherwise run each matching benchmark and
-// report the results.
-//
-// The second and third overload use the specified 'console_reporter' and
-// 'file_reporter' respectively. 'file_reporter' will write to the file
-// specified
-// by '--benchmark_output'. If '--benchmark_output' is not given the
-// 'file_reporter' is ignored.
-//
-// RETURNS: The number of matching benchmarks.
-size_t RunSpecifiedBenchmarks();
-size_t RunSpecifiedBenchmarks(BenchmarkReporter* console_reporter);
-size_t RunSpecifiedBenchmarks(BenchmarkReporter* console_reporter,
-                              BenchmarkReporter* file_reporter);
-
-// If this routine is called, peak memory allocation past this point in the
-// benchmark is reported at the end of the benchmark report line. (It is
-// computed by running the benchmark once with a single iteration and a memory
-// tracer.)
-// TODO(dominic)
-// void MemoryUsage();
-
-namespace internal {
-class Benchmark;
-class BenchmarkImp;
-class BenchmarkFamilies;
-
-void UseCharPointer(char const volatile*);
-
-// Take ownership of the pointer and register the benchmark. Return the
-// registered benchmark.
-Benchmark* RegisterBenchmarkInternal(Benchmark*);
-
-// Ensure that the standard streams are properly initialized in every TU.
-int InitializeStreams();
-BENCHMARK_UNUSED static int stream_init_anchor = InitializeStreams();
-
-}  // end namespace internal
-
-
-#if !defined(__GNUC__) || defined(__pnacl__) || defined(EMSCRIPTN)
-# define BENCHMARK_HAS_NO_INLINE_ASSEMBLY
-#endif
-
-// The DoNotOptimize(...) function can be used to prevent a value or
-// expression from being optimized away by the compiler. This function is
-// intended to add little to no overhead.
-// See: https://youtu.be/nXaxk27zwlk?t=2441
-#ifndef BENCHMARK_HAS_NO_INLINE_ASSEMBLY
-template <class Tp>
-inline BENCHMARK_ALWAYS_INLINE void DoNotOptimize(Tp const& value) {
-  // Clang doesn't like the 'X' constraint on `value` and certain GCC versions
-  // don't like the 'g' constraint. Attempt to placate them both.
-#if defined(__clang__)
-  asm volatile("" : : "g"(value) : "memory");
-#else
-  asm volatile("" : : "i,r,m"(value) : "memory");
-#endif
-}
-// Force the compiler to flush pending writes to global memory. Acts as an
-// effective read/write barrier
-inline BENCHMARK_ALWAYS_INLINE void ClobberMemory() {
-  asm volatile("" : : : "memory");
-}
-#elif defined(_MSC_VER)
-template <class Tp>
-inline BENCHMARK_ALWAYS_INLINE void DoNotOptimize(Tp const& value) {
-  internal::UseCharPointer(&reinterpret_cast<char const volatile&>(value));
-  _ReadWriteBarrier();
-}
-
-inline BENCHMARK_ALWAYS_INLINE void ClobberMemory() {
-  _ReadWriteBarrier();
-}
-#else
-template <class Tp>
-inline BENCHMARK_ALWAYS_INLINE void DoNotOptimize(Tp const& value) {
-  internal::UseCharPointer(&reinterpret_cast<char const volatile&>(value));
-}
-// FIXME Add ClobberMemory() for non-gnu and non-msvc compilers
-#endif
-
-
-
-// This class is used for user-defined counters.
-class Counter {
-public:
-
-  enum Flags {
-    kDefaults   = 0,
-    // Mark the counter as a rate. It will be presented divided
-    // by the duration of the benchmark.
-    kIsRate     = 1,
-    // Mark the counter as a thread-average quantity. It will be
-    // presented divided by the number of threads.
-    kAvgThreads = 2,
-    // Mark the counter as a thread-average rate. See above.
-    kAvgThreadsRate = kIsRate|kAvgThreads
-  };
-
-  double value;
-  Flags  flags;
-
-  BENCHMARK_ALWAYS_INLINE
-  Counter(double v = 0., Flags f = kDefaults) : value(v), flags(f) {}
-
-  BENCHMARK_ALWAYS_INLINE operator double const& () const { return value; }
-  BENCHMARK_ALWAYS_INLINE operator double      & ()       { return value; }
-
-};
-
-// This is the container for the user-defined counters.
-typedef std::map<std::string, Counter> UserCounters;
-
-
-// TimeUnit is passed to a benchmark in order to specify the order of magnitude
-// for the measured time.
-enum TimeUnit { kNanosecond, kMicrosecond, kMillisecond };
-
-// BigO is passed to a benchmark in order to specify the asymptotic
-// computational
-// complexity for the benchmark. In case oAuto is selected, complexity will be
-// calculated automatically to the best fit.
-enum BigO { oNone, o1, oN, oNSquared, oNCubed, oLogN, oNLogN, oAuto, oLambda };
-
-// BigOFunc is passed to a benchmark in order to specify the asymptotic
-// computational complexity for the benchmark.
-typedef double(BigOFunc)(int);
-
-namespace internal {
-class ThreadTimer;
-class ThreadManager;
-
-#if defined(BENCHMARK_HAS_CXX11)
-enum ReportMode : unsigned {
-#else
-enum ReportMode {
-#endif
-  RM_Unspecified,  // The mode has not been manually specified
-  RM_Default,      // The mode is user-specified as default.
-  RM_ReportAggregatesOnly
-};
-}
-
-// State is passed to a running Benchmark and contains state for the
-// benchmark to use.
-class State {
- public:
-  // Returns true if the benchmark should continue through another iteration.
-  // NOTE: A benchmark may not return from the test until KeepRunning() has
-  // returned false.
-  bool KeepRunning() {
-    if (BENCHMARK_BUILTIN_EXPECT(!started_, false)) {
-      StartKeepRunning();
-    }
-    bool const res = total_iterations_++ < max_iterations;
-    if (BENCHMARK_BUILTIN_EXPECT(!res, false)) {
-      FinishKeepRunning();
-    }
-    return res;
-  }
-
-  // REQUIRES: timer is running and 'SkipWithError(...)' has not been called
-  //           by the current thread.
-  // Stop the benchmark timer. If not called, the timer will be
-  // automatically stopped after KeepRunning() returns false for the first time.
-  //
-  // For threaded benchmarks the PauseTiming() function only pauses the timing
-  // for the current thread.
-  //
-  // NOTE: The "real time" measurement is per-thread. If different threads
-  // report different measurements the largest one is reported.
-  //
-  // NOTE: PauseTiming()/ResumeTiming() are relatively
-  // heavyweight, and so their use should generally be avoided
-  // within each benchmark iteration, if possible.
-  void PauseTiming();
-
-  // REQUIRES: timer is not running and 'SkipWithError(...)' has not been called
-  //           by the current thread.
-  // Start the benchmark timer. The timer is NOT running on entrance to the
-  // benchmark function. It begins running after the first call to KeepRunning()
-  //
-  // NOTE: PauseTiming()/ResumeTiming() are relatively
-  // heavyweight, and so their use should generally be avoided
-  // within each benchmark iteration, if possible.
-  void ResumeTiming();
-
-  // REQUIRES: 'SkipWithError(...)' has not been called previously by the
-  //           current thread.
-  // Skip any future iterations of the 'KeepRunning()' loop in the current
-  // thread and report an error with the specified 'msg'. After this call
-  // the user may explicitly 'return' from the benchmark.
-  //
-  // For threaded benchmarks only the current thread stops executing and future
-  // calls to `KeepRunning()` will block until all threads have completed
-  // the `KeepRunning()` loop. If multiple threads report an error only the
-  // first error message is used.
-  //
-  // NOTE: Calling 'SkipWithError(...)' does not cause the benchmark to exit
-  // the current scope immediately. If the function is called from within
-  // the 'KeepRunning()' loop the current iteration will finish. It is the users
-  // responsibility to exit the scope as needed.
-  void SkipWithError(const char* msg);
-
-  // REQUIRES: called exactly once per iteration of the KeepRunning loop.
-  // Set the manually measured time for this benchmark iteration, which
-  // is used instead of automatically measured time if UseManualTime() was
-  // specified.
-  //
-  // For threaded benchmarks the final value will be set to the largest
-  // reported values.
-  void SetIterationTime(double seconds);
-
-  // Set the number of bytes processed by the current benchmark
-  // execution. This routine is typically called once at the end of a
-  // throughput oriented benchmark. If this routine is called with a
-  // value > 0, the report is printed in MB/sec instead of nanoseconds
-  // per iteration.
-  //
-  // REQUIRES: a benchmark has exited its KeepRunning loop.
-  BENCHMARK_ALWAYS_INLINE
-  void SetBytesProcessed(size_t bytes) { bytes_processed_ = bytes; }
-
-  BENCHMARK_ALWAYS_INLINE
-  size_t bytes_processed() const { return bytes_processed_; }
-
-  // If this routine is called with complexity_n > 0 and complexity report is
-  // requested for the
-  // family benchmark, then current benchmark will be part of the computation
-  // and complexity_n will
-  // represent the length of N.
-  BENCHMARK_ALWAYS_INLINE
-  void SetComplexityN(int complexity_n) { complexity_n_ = complexity_n; }
-
-  BENCHMARK_ALWAYS_INLINE
-  int complexity_length_n() { return complexity_n_; }
-
-  // If this routine is called with items > 0, then an items/s
-  // label is printed on the benchmark report line for the currently
-  // executing benchmark. It is typically called at the end of a processing
-  // benchmark where a processing items/second output is desired.
-  //
-  // REQUIRES: a benchmark has exited its KeepRunning loop.
-  BENCHMARK_ALWAYS_INLINE
-  void SetItemsProcessed(size_t items) { items_processed_ = items; }
-
-  BENCHMARK_ALWAYS_INLINE
-  size_t items_processed() const { return items_processed_; }
-
-  // If this routine is called, the specified label is printed at the
-  // end of the benchmark report line for the currently executing
-  // benchmark.  Example:
-  //   static void BM_Compress(benchmark::State& state) {
-  //     ...
-  //     double compress = input_size / output_size;
-  //     state.SetLabel(StringPrintf("compress:%.1f%%", 100.0*compression));
-  //   }
-  // Produces output that looks like:
-  //   BM_Compress   50   50   14115038   compress:27.3%
-  //
-  // REQUIRES: a benchmark has exited its KeepRunning loop.
-  void SetLabel(const char* label);
-
-  void BENCHMARK_ALWAYS_INLINE SetLabel(const std::string& str) {
-    this->SetLabel(str.c_str());
-  }
-
-  // Range arguments for this run. CHECKs if the argument has been set.
-  BENCHMARK_ALWAYS_INLINE
-  int range(std::size_t pos = 0) const {
-    assert(range_.size() > pos);
-    return range_[pos];
-  }
-
-  BENCHMARK_DEPRECATED_MSG("use 'range(0)' instead")
-  int range_x() const { return range(0); }
-
-  BENCHMARK_DEPRECATED_MSG("use 'range(1)' instead")
-  int range_y() const { return range(1); }
-
-  BENCHMARK_ALWAYS_INLINE
-  size_t iterations() const { return total_iterations_; }
-
- private:
-  bool started_;
-  bool finished_;
-  size_t total_iterations_;
-
-  std::vector<int> range_;
-
-  size_t bytes_processed_;
-  size_t items_processed_;
-
-  int complexity_n_;
-
-  bool error_occurred_;
-
- public:
-  // Container for user-defined counters.
-  UserCounters counters;
-  // Index of the executing thread. Values from [0, threads).
-  const int thread_index;
-  // Number of threads concurrently executing the benchmark.
-  const int threads;
-  const size_t max_iterations;
-
-  // TODO make me private
-  State(size_t max_iters, const std::vector<int>& ranges, int thread_i,
-        int n_threads, internal::ThreadTimer* timer,
-        internal::ThreadManager* manager);
-
- private:
-  void StartKeepRunning();
-  void FinishKeepRunning();
-  internal::ThreadTimer* timer_;
-  internal::ThreadManager* manager_;
-  BENCHMARK_DISALLOW_COPY_AND_ASSIGN(State);
-};
-
-namespace internal {
-
-typedef void(Function)(State&);
-
-// ------------------------------------------------------
-// Benchmark registration object. The BENCHMARK() macro expands
-// into an internal::Benchmark* object. Various methods can
-// be called on this object to change the properties of the benchmark.
-// Each method returns "this" so that multiple method calls can
-// chained into one expression.
-namespace internal {
-
-typedef void(Function)(State&);
-
-// ------------------------------------------------------
-// Benchmark registration object. The BENCHMARK() macro expands
-// into an internal::Benchmark* object. Various methods can
-// be called on this object to change the properties of the benchmark.
-// Each method returns "this" so that multiple method calls can be
-// chained into one expression.
-class Benchmark {
- public:
-  virtual ~Benchmark();
-
-  // Note: the following methods all return "this" so that multiple
-  // method calls can be chained together in one expression.
-
-  // Run this benchmark once with "x" as the extra argument passed
-  // to the function.
-  // REQUIRES: The function passed to the constructor must accept an arg1.
-  Benchmark* Arg(int x);
-
-  // Run this benchmark with the given time unit for the generated output
-  // report.
-  Benchmark* Unit(TimeUnit unit);
-
-  // Run this benchmark once for a number of values picked from the
-  // range [start..limit]. (start and limit are always picked.)
-  // REQUIRES: The function passed to the constructor must accept an arg1.
-  Benchmark* Range(int start, int limit);
-
-  // Run this benchmark once for all values in the range [start..limit] with
-  // a specific step.
-  // REQUIRES: The function passed to the constructor must accept an arg1.
-  Benchmark* DenseRange(int start, int limit, int step = 1);
-
-  // Run this benchmark once with "args" as the extra arguments passed
-  // to the function.
-  // REQUIRES: The function passed to the constructor must accept arg1, arg2 ...
-  Benchmark* Args(const std::vector<int>& args);
-
-  // Equivalent to Args({x, y})
-  // NOTE: This is a legacy C++03 interface provided for compatibility only.
-  // New code should use 'Args'.
-  Benchmark* ArgPair(int x, int y) {
-    std::vector<int> args;
-    args.push_back(x);
-    args.push_back(y);
-    return Args(args);
-  }
-
-  // Run this benchmark once for a number of values picked from the
-  // ranges [start..limit]. (starts and limits are always picked.)
-  // REQUIRES: The function passed to the constructor must accept arg1, arg2 ...
-  Benchmark* Ranges(const std::vector<std::pair<int, int> >& ranges);
-
-  // Equivalent to ArgNames({name})
-  Benchmark* ArgName(const std::string& name);
-
-  // Set the argument names to display in the benchmark name. If not called,
-  // only argument values will be shown.
-  Benchmark* ArgNames(const std::vector<std::string>& names);
-
-  // Equivalent to Ranges({{lo1, hi1}, {lo2, hi2}}).
-  // NOTE: This is a legacy C++03 interface provided for compatibility only.
-  // New code should use 'Ranges'.
-  Benchmark* RangePair(int lo1, int hi1, int lo2, int hi2) {
-    std::vector<std::pair<int, int> > ranges;
-    ranges.push_back(std::make_pair(lo1, hi1));
-    ranges.push_back(std::make_pair(lo2, hi2));
-    return Ranges(ranges);
-  }
-
-  // Pass this benchmark object to *func, which can customize
-  // the benchmark by calling various methods like Arg, Args,
-  // Threads, etc.
-  Benchmark* Apply(void (*func)(Benchmark* benchmark));
-
-  // Set the range multiplier for non-dense ranges. If not called, the range
-  // multiplier kRangeMultiplier will be used.
-  Benchmark* RangeMultiplier(int multiplier);
-
-  // Set the minimum amount of time to use when running this benchmark. This
-  // option overrides the `benchmark_min_time` flag.
-  // REQUIRES: `t > 0` and `Iterations` has not been called on this benchmark.
-  Benchmark* MinTime(double t);
-
-  // Specify the number of iterations that should be run by this benchmark.
-  // REQUIRES: 'n > 0' and `MinTime` has not been called on this benchmark.
-  //
-  // NOTE: This function should only be used when *exact* iteration control is
-  // needed and never to control or limit how long a benchmark runs, where
-  // `--benchmark_min_time=N` or `MinTime(...)` should be used instead.
-  Benchmark* Iterations(size_t n);
-
-  // Specify the number of times to repeat this benchmark. This option
-  // overrides the `benchmark_repetitions` flag.
-  // REQUIRES: `n > 0`
-  Benchmark* Repetitions(int n);
-
-  // Specify if each repetition of the benchmark should be reported separately
-  // or if only the final statistics should be reported. If the benchmark
-  // is not repeated then the single result is always reported.
-  Benchmark* ReportAggregatesOnly(bool v = true);
-
-  // If a particular benchmark is I/O bound, runs multiple threads internally,
-  // or if for some other reason CPU timings are not representative, call this
-  // method. If called, the elapsed time will be used to control how many
-  // iterations are run, and in the printing of items/second or MB/second
-  // values. If not called, the cpu time used by the benchmark will be used.
-  Benchmark* UseRealTime();
-
-  // If a benchmark must measure time manually (e.g. if GPU execution time is
-  // being measured), call this method. If called, each benchmark iteration
-  // should call SetIterationTime(seconds) to report the measured time, which
-  // will be used to control how many iterations are run, and in the printing
-  // of items/second or MB/second values.
-  Benchmark* UseManualTime();
-
-  // Set the asymptotic computational complexity for the benchmark. If called
-  // the asymptotic computational complexity will be shown on the output.
-  Benchmark* Complexity(BigO complexity = benchmark::oAuto);
-
-  // Set the asymptotic computational complexity for the benchmark. If called
-  // the asymptotic computational complexity will be shown on the output.
-  Benchmark* Complexity(BigOFunc* complexity);
-
-  // Support for running multiple copies of the same benchmark concurrently
-  // in multiple threads. This may be useful when measuring the scaling
-  // of some piece of code.
-
-  // Run one instance of this benchmark concurrently in t threads.
-  Benchmark* Threads(int t);
-
-  // Pick a set of values T from [min_threads, max_threads].
-  // min_threads and max_threads are always included in T. Run this
-  // benchmark once for each value in T. The benchmark run for a
-  // particular value t consists of t threads running the benchmark
-  // function concurrently. For example, consider:
-  //    BENCHMARK(Foo)->ThreadRange(1, 16);
-  // This will run the following benchmarks:
-  //    Foo in 1 thread
-  //    Foo in 2 threads
-  //    Foo in 4 threads
-  //    Foo in 8 threads
-  //    Foo in 16 threads
-  Benchmark* ThreadRange(int min_threads, int max_threads);
-
-  // For each value n in the range, run this benchmark once using n threads.
-  // min_threads and max_threads are always included in the range.
-  // stride specifies the increment. E.g. DenseThreadRange(1, 8, 3) starts
-  // a benchmark with 1, 4, 7 and 8 threads.
-  Benchmark* DenseThreadRange(int min_threads, int max_threads,
-                              int stride = 1);
-
-  // Equivalent to ThreadRange(NumCPUs(), NumCPUs())
-  Benchmark* ThreadPerCpu();
-
-  virtual void Run(State& state) = 0;
-
-  // Used inside the benchmark implementation
-  struct Instance;
-
- protected:
-  explicit Benchmark(const char* name);
-  Benchmark(Benchmark const&);
-  void SetName(const char* name);
-
-  int ArgsCnt() const;
-
-  static void AddRange(std::vector<int>* dst, int lo, int hi, int mult);
-
- private:
-  friend class BenchmarkFamilies;
-
-  std::string name_;
-  ReportMode report_mode_;
-  std::vector<std::string> arg_names_;   // Argument names for all benchmark runs
-  std::vector<std::vector<int> > args_;  // Args for all benchmark runs
-  TimeUnit time_unit_;
-  int range_multiplier_;
-  double min_time_;
-  size_t iterations_;
-  int repetitions_;
-  bool use_real_time_;
-  bool use_manual_time_;
-  BigO complexity_;
-  BigOFunc* complexity_lambda_;
-  std::vector<int> thread_counts_;
-
-  Benchmark& operator=(Benchmark const&);
-};
-
-}  // namespace internal
-
-// Create and register a benchmark with the specified 'name' that invokes
-// the specified functor 'fn'.
-//
-// RETURNS: A pointer to the registered benchmark.
-internal::Benchmark* RegisterBenchmark(const char* name,
-                                       internal::Function* fn);
-
-#if defined(BENCHMARK_HAS_CXX11)
-template <class Lambda>
-internal::Benchmark* RegisterBenchmark(const char* name, Lambda&& fn);
-#endif
-
-// Remove all registered benchmarks. All pointers to previously registered
-// benchmarks are invalidated.
-void ClearRegisteredBenchmarks();
-
-namespace internal {
-// The class used to hold all Benchmarks created from static functions
-// (i.e. those created using the BENCHMARK(...) macros).
-class FunctionBenchmark : public Benchmark {
- public:
-  FunctionBenchmark(const char* name, Function* func)
-      : Benchmark(name), func_(func) {}
-
-  virtual void Run(State& st);
-
- private:
-  Function* func_;
-};
-
-#ifdef BENCHMARK_HAS_CXX11
-template <class Lambda>
-class LambdaBenchmark : public Benchmark {
- public:
-  virtual void Run(State& st) { lambda_(st); }
-
- private:
-  template <class OLambda>
-  LambdaBenchmark(const char* name, OLambda&& lam)
-      : Benchmark(name), lambda_(std::forward<OLambda>(lam)) {}
-
-  LambdaBenchmark(LambdaBenchmark const&) = delete;
-
- private:
-  template <class Lam>
-  friend Benchmark* ::benchmark::RegisterBenchmark(const char*, Lam&&);
-
-  Lambda lambda_;
-};
-#endif
-
-}  // end namespace internal
-
-inline internal::Benchmark* RegisterBenchmark(const char* name,
-                                              internal::Function* fn) {
-  return internal::RegisterBenchmarkInternal(
-      ::new internal::FunctionBenchmark(name, fn));
-}
-
-#ifdef BENCHMARK_HAS_CXX11
-template <class Lambda>
-internal::Benchmark* RegisterBenchmark(const char* name, Lambda&& fn) {
-  using BenchType =
-      internal::LambdaBenchmark<typename std::decay<Lambda>::type>;
-  return internal::RegisterBenchmarkInternal(
-      ::new BenchType(name, std::forward<Lambda>(fn)));
-}
-#endif
-
-#if defined(BENCHMARK_HAS_CXX11) && \
-    (!defined(BENCHMARK_GCC_VERSION) || BENCHMARK_GCC_VERSION >= 409)
-template <class Lambda, class... Args>
-internal::Benchmark* RegisterBenchmark(const char* name, Lambda&& fn,
-                                       Args&&... args) {
-  return benchmark::RegisterBenchmark(
-      name, [=](benchmark::State& st) { fn(st, args...); });
-}
-#else
-#define BENCHMARK_HAS_NO_VARIADIC_REGISTER_BENCHMARK
-#endif
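A sketch of how these registration entry points combine at runtime; BM_Memset and BM_Square are hypothetical, and the lambda plus trailing-argument forms assume the C++11 paths declared above are enabled:

void BM_Memset(benchmark::State& state);  // defined elsewhere (hypothetical)

int main(int argc, char** argv) {
  benchmark::Initialize(&argc, argv);
  // Function-pointer form, with the usual chained configuration.
  benchmark::RegisterBenchmark("BM_Memset", &BM_Memset)->Arg(64)->Arg(4096);
  // Lambda form; the trailing 42 is forwarded to 'n' by the variadic overload.
  benchmark::RegisterBenchmark(
      "BM_Square",
      [](benchmark::State& st, int n) {
        while (st.KeepRunning()) benchmark::DoNotOptimize(n * n);
      },
      42);
  benchmark::RunSpecifiedBenchmarks();
  return 0;
}

Runtime registration is what BENCHMARK() cannot do: the set of benchmarks here can depend on argc/argv or any other runtime data.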
-// The base class for all fixture tests.
-class Fixture : public internal::Benchmark {
- public:
-  Fixture() : internal::Benchmark("") {}
-
-  virtual void Run(State& st) {
-    this->SetUp(st);
-    this->BenchmarkCase(st);
-    this->TearDown(st);
-  }
-
-  // These will be deprecated ...
-  virtual void SetUp(const State&) {}
-  virtual void TearDown(const State&) {}
-  // ... In favor of these.
-  virtual void SetUp(State& st) { SetUp(const_cast<const State&>(st)); }
-  virtual void TearDown(State& st) { TearDown(const_cast<const State&>(st)); }
-
- protected:
-  virtual void BenchmarkCase(State&) = 0;
-};
-
-}  // end namespace benchmark
-
-// ------------------------------------------------------
-// Macro to register benchmarks
-
-// Check that __COUNTER__ is defined and that __COUNTER__ increases by 1
-// every time it is expanded. X + 1 == X + 0 is used in case X is defined to
-// be empty. If X is empty the expression becomes (+1 == +0).
-#if defined(__COUNTER__) && (__COUNTER__ + 1 == __COUNTER__ + 0)
-#define BENCHMARK_PRIVATE_UNIQUE_ID __COUNTER__
-#else
-#define BENCHMARK_PRIVATE_UNIQUE_ID __LINE__
-#endif
-
-// Helpers for generating unique variable names
-#define BENCHMARK_PRIVATE_NAME(n) \
-  BENCHMARK_PRIVATE_CONCAT(_benchmark_, BENCHMARK_PRIVATE_UNIQUE_ID, n)
-#define BENCHMARK_PRIVATE_CONCAT(a, b, c) BENCHMARK_PRIVATE_CONCAT2(a, b, c)
-#define BENCHMARK_PRIVATE_CONCAT2(a, b, c) a##b##c
-
-#define BENCHMARK_PRIVATE_DECLARE(n) \
-  static ::benchmark::internal::Benchmark* BENCHMARK_PRIVATE_NAME(n) \
-      BENCHMARK_UNUSED
-
-#define BENCHMARK(n) \
-  BENCHMARK_PRIVATE_DECLARE(n) = \
-      (::benchmark::internal::RegisterBenchmarkInternal( \
-          new ::benchmark::internal::FunctionBenchmark(#n, n)))
-
-// Old-style macros
-#define BENCHMARK_WITH_ARG(n, a) BENCHMARK(n)->Arg((a))
-#define BENCHMARK_WITH_ARG2(n, a1, a2) BENCHMARK(n)->Args({(a1), (a2)})
-#define BENCHMARK_WITH_UNIT(n, t) BENCHMARK(n)->Unit((t))
-#define BENCHMARK_RANGE(n, lo, hi) BENCHMARK(n)->Range((lo), (hi))
-#define BENCHMARK_RANGE2(n, l1, h1, l2, h2) \
-  BENCHMARK(n)->Ranges({{(l1), (h1)}, {(l2), (h2)}})
-
-#if __cplusplus >= 201103L
-
-// Register a benchmark which invokes the function specified by `func`
-// with the additional arguments specified by `...`.
-//
-// For example:
-//
-// template <class ...ExtraArgs>
-// void BM_takes_args(benchmark::State& state, ExtraArgs&&... extra_args) {
-//   [...]
-// }
-// /* Registers a benchmark named "BM_takes_args/int_string_test" */
-// BENCHMARK_CAPTURE(BM_takes_args, int_string_test, 42, std::string("abc"));
-#define BENCHMARK_CAPTURE(func, test_case_name, ...) \
-  BENCHMARK_PRIVATE_DECLARE(func) = \
-      (::benchmark::internal::RegisterBenchmarkInternal( \
-          new ::benchmark::internal::FunctionBenchmark( \
-              #func "/" #test_case_name, \
-              [](::benchmark::State& st) { func(st, __VA_ARGS__); })))
-
-#endif  // __cplusplus >= 201103L
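A completed version of the BM_takes_args example above might look like the following sketch; ProcessInput is a hypothetical stand-in for the code under test:

// Hypothetical stand-in for the code under test.
template <class... Args>
void ProcessInput(const Args&... /*unused*/) {}

template <class... ExtraArgs>
void BM_takes_args(benchmark::State& state, ExtraArgs&&... extra_args) {
  while (state.KeepRunning())
    ProcessInput(extra_args...);  // uses the captured 42 and "abc"
}
// Registers a benchmark named "BM_takes_args/int_string_test".
BENCHMARK_CAPTURE(BM_takes_args, int_string_test, 42, std::string("abc"));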
-// This will register a benchmark for a templatized function. For example:
-//
-// template <int arg>
-// void BM_Foo(int iters);
-//
-// BENCHMARK_TEMPLATE(BM_Foo, 1);
-//
-// will register BM_Foo<1> as a benchmark.
-#define BENCHMARK_TEMPLATE1(n, a) \
-  BENCHMARK_PRIVATE_DECLARE(n) = \
-      (::benchmark::internal::RegisterBenchmarkInternal( \
-          new ::benchmark::internal::FunctionBenchmark(#n "<" #a ">", n<a>)))
-
-#define BENCHMARK_TEMPLATE2(n, a, b) \
-  BENCHMARK_PRIVATE_DECLARE(n) = \
-      (::benchmark::internal::RegisterBenchmarkInternal( \
-          new ::benchmark::internal::FunctionBenchmark(#n "<" #a "," #b ">", \
-                                                       n<a, b>)))
-
-#if __cplusplus >= 201103L
-#define BENCHMARK_TEMPLATE(n, ...) \
-  BENCHMARK_PRIVATE_DECLARE(n) = \
-      (::benchmark::internal::RegisterBenchmarkInternal( \
-          new ::benchmark::internal::FunctionBenchmark( \
-              #n "<" #__VA_ARGS__ ">", n<__VA_ARGS__>)))
-#else
-#define BENCHMARK_TEMPLATE(n, a) BENCHMARK_TEMPLATE1(n, a)
-#endif
-
-#define BENCHMARK_PRIVATE_DECLARE_F(BaseClass, Method) \
-  class BaseClass##_##Method##_Benchmark : public BaseClass { \
-   public: \
-    BaseClass##_##Method##_Benchmark() : BaseClass() { \
-      this->SetName(#BaseClass "/" #Method); \
-    } \
- \
-   protected: \
-    virtual void BenchmarkCase(::benchmark::State&); \
-  };
-
-#define BENCHMARK_DEFINE_F(BaseClass, Method) \
-  BENCHMARK_PRIVATE_DECLARE_F(BaseClass, Method) \
-  void BaseClass##_##Method##_Benchmark::BenchmarkCase
-
-#define BENCHMARK_REGISTER_F(BaseClass, Method) \
-  BENCHMARK_PRIVATE_REGISTER_F(BaseClass##_##Method##_Benchmark)
-
-#define BENCHMARK_PRIVATE_REGISTER_F(TestName) \
-  BENCHMARK_PRIVATE_DECLARE(TestName) = \
-      (::benchmark::internal::RegisterBenchmarkInternal(new TestName()))
-
-// This macro will define and register a benchmark within a fixture class.
-#define BENCHMARK_F(BaseClass, Method) \
-  BENCHMARK_PRIVATE_DECLARE_F(BaseClass, Method) \
-  BENCHMARK_REGISTER_F(BaseClass, Method); \
-  void BaseClass##_##Method##_Benchmark::BenchmarkCase
-
-// Helper macro to create a main routine in a test that runs the benchmarks
-#define BENCHMARK_MAIN() \
-  int main(int argc, char** argv) { \
-    ::benchmark::Initialize(&argc, argv); \
-    if (::benchmark::ReportUnrecognizedArguments(argc, argv)) return 1; \
-    ::benchmark::RunSpecifiedBenchmarks(); \
-  }
+#include "benchmark.h" // For forward declaration of BenchmarkReporter
 
 #endif  // BENCHMARK_BENCHMARK_API_H_
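Putting the fixture and main macros together, a typical use looks like this sketch; MyFixture and its summing loop are illustrative only (plain virtuals are used rather than override, since the header also supports pre-C++11 builds):

class MyFixture : public benchmark::Fixture {
 public:
  virtual void SetUp(benchmark::State& st) { data_.assign(st.range(0), 1); }
  virtual void TearDown(benchmark::State&) { data_.clear(); }
  std::vector<int> data_;  // prepared before, and torn down after, each run
};

BENCHMARK_DEFINE_F(MyFixture, Sum)(benchmark::State& state) {
  while (state.KeepRunning()) {
    long sum = 0;
    for (size_t i = 0; i < data_.size(); ++i) sum += data_[i];
    benchmark::DoNotOptimize(sum);
  }
}
BENCHMARK_REGISTER_F(MyFixture, Sum)->Range(8, 8 << 10);

BENCHMARK_MAIN();

BENCHMARK_F(BaseClass, Method) is shorthand for the DEFINE/REGISTER pair when no extra configuration such as Range() is needed.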
diff --git a/include/benchmark/macros.h b/include/benchmark/macros.h
deleted file mode 100644
index 2466fd3f..00000000
--- a/include/benchmark/macros.h
+++ /dev/null
@@ -1,66 +0,0 @@
-// Copyright 2015 Google Inc. All rights reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-#ifndef BENCHMARK_MACROS_H_
-#define BENCHMARK_MACROS_H_
-
-#if __cplusplus >= 201103L
-#define BENCHMARK_HAS_CXX11
-#endif
-
-#ifndef BENCHMARK_HAS_CXX11
-#define BENCHMARK_DISALLOW_COPY_AND_ASSIGN(TypeName) \
-  TypeName(const TypeName&);                         \
-  TypeName& operator=(const TypeName&)
-#else
-#define BENCHMARK_DISALLOW_COPY_AND_ASSIGN(TypeName) \
-  TypeName(const TypeName&) = delete;                \
-  TypeName& operator=(const TypeName&) = delete
-#endif
-
-#if defined(__GNUC__)
-#define BENCHMARK_UNUSED __attribute__((unused))
-#define BENCHMARK_ALWAYS_INLINE __attribute__((always_inline))
-#define BENCHMARK_NOEXCEPT noexcept
-#define BENCHMARK_NOEXCEPT_OP(x) noexcept(x)
-#elif defined(_MSC_VER) && !defined(__clang__)
-#define BENCHMARK_UNUSED
-#define BENCHMARK_ALWAYS_INLINE __forceinline
-#if _MSC_VER >= 1900
-#define BENCHMARK_NOEXCEPT noexcept
-#define BENCHMARK_NOEXCEPT_OP(x) noexcept(x)
-#else
-#define BENCHMARK_NOEXCEPT
-#define BENCHMARK_NOEXCEPT_OP(x)
-#endif
-#define __func__ __FUNCTION__
-#else
-#define BENCHMARK_UNUSED
-#define BENCHMARK_ALWAYS_INLINE
-#define BENCHMARK_NOEXCEPT
-#define BENCHMARK_NOEXCEPT_OP(x)
-#endif
-
-#if defined(__GNUC__)
-#define BENCHMARK_BUILTIN_EXPECT(x, y) __builtin_expect(x, y)
-#define BENCHMARK_DEPRECATED_MSG(msg) __attribute__((deprecated(msg)))
-#else
-#define BENCHMARK_BUILTIN_EXPECT(x, y) x
-#define BENCHMARK_DEPRECATED_MSG(msg)
-#endif
-
-#if defined(__GNUC__) && !defined(__clang__)
-#define BENCHMARK_GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__)
-#endif
-
-#endif  // BENCHMARK_MACROS_H_
diff --git a/include/benchmark/reporter.h b/include/benchmark/reporter.h
index 8821ebfa..5baca1a7 100644
--- a/include/benchmark/reporter.h
+++ b/include/benchmark/reporter.h
@@ -14,223 +14,14 @@
 #ifndef BENCHMARK_REPORTER_H_
 #define BENCHMARK_REPORTER_H_
 
-#include <cassert>
-#include <iosfwd>
-#include <set>
-#include <string>
-#include <utility>
-#include <vector>
+#ifdef __DEPRECATED
+# ifndef BENCHMARK_WARNING_MSG
+#   warning the reporter.h header has been deprecated and will be removed, please include benchmark.h instead
+# else
+   BENCHMARK_WARNING_MSG("the reporter.h header has been deprecated and will be removed, please include benchmark.h instead")
+# endif
+#endif
 
-#include "benchmark_api.h"  // For forward declaration of BenchmarkReporter
+#include "benchmark.h"  // For forward declaration of BenchmarkReporter
 
-namespace benchmark {
-
-// Interface for custom benchmark result printers.
-// By default, benchmark reports are printed to stdout. However an application
-// can control the destination of the reports by calling
-// RunSpecifiedBenchmarks and passing it a custom reporter object.
-// The reporter object must implement the following interface.
-class BenchmarkReporter {
- public:
-  struct Context {
-    int num_cpus;
-    double mhz_per_cpu;
-    bool cpu_scaling_enabled;
-
-    // The number of chars in the longest benchmark name.
-    size_t name_field_width;
-  };
-
-  struct Run {
-    Run()
-        : error_occurred(false),
-          iterations(1),
-          time_unit(kNanosecond),
-          real_accumulated_time(0),
-          cpu_accumulated_time(0),
-          bytes_per_second(0),
-          items_per_second(0),
-          max_heapbytes_used(0),
-          complexity(oNone),
-          complexity_lambda(),
-          complexity_n(0),
-          report_big_o(false),
-          report_rms(false),
-          counters() {}
-
-    std::string benchmark_name;
-    std::string report_label;  // Empty if not set by benchmark.
-    bool error_occurred;
-    std::string error_message;
-
-    int64_t iterations;
-    TimeUnit time_unit;
-    double real_accumulated_time;
-    double cpu_accumulated_time;
-
-    // Return a value representing the real time per iteration in the unit
-    // specified by 'time_unit'.
-    // NOTE: If 'iterations' is zero the returned value represents the
-    // accumulated time.
-    double GetAdjustedRealTime() const;
-
-    // Return a value representing the cpu time per iteration in the unit
-    // specified by 'time_unit'.
-    // NOTE: If 'iterations' is zero the returned value represents the
-    // accumulated time.
-    double GetAdjustedCPUTime() const;
-
-    // Zero if not set by benchmark.
-    double bytes_per_second;
-    double items_per_second;
-
-    // This is set to 0.0 if memory tracing is not enabled.
-    double max_heapbytes_used;
-
-    // Keep track of arguments to compute asymptotic complexity
-    BigO complexity;
-    BigOFunc* complexity_lambda;
-    int complexity_n;
-
-    // Inform print function whether the current run is a complexity report
-    bool report_big_o;
-    bool report_rms;
-
-    UserCounters counters;
-  };
-
-  // Construct a BenchmarkReporter with the output stream set to 'std::cout'
-  // and the error stream set to 'std::cerr'.
-  BenchmarkReporter();
-
-  // Called once for every suite of benchmarks run.
-  // The parameter "context" contains information that the
-  // reporter may wish to use when generating its report, for example the
-  // platform under which the benchmarks are running. The benchmark run is
-  // never started if this function returns false, allowing the reporter
-  // to skip runs based on the context information.
-  virtual bool ReportContext(const Context& context) = 0;
-
-  // Called once for each group of benchmark runs; gives information about
-  // cpu-time and heap memory usage during the benchmark run. If the group
-  // of runs contained more than two entries then 'report' contains additional
-  // elements representing the mean and standard deviation of those runs.
-  // Additionally if this group of runs was the last in a family of benchmarks
-  // 'report' contains additional entries representing the asymptotic
-  // complexity and RMS of that benchmark family.
-  virtual void ReportRuns(const std::vector<Run>& report) = 0;
-
-  // Called once and only once after every group of benchmarks is run and
-  // reported.
-  virtual void Finalize() {}
-
-  // REQUIRES: The object referenced by 'out' is valid for the lifetime
-  // of the reporter.
-  void SetOutputStream(std::ostream* out) {
-    assert(out);
-    output_stream_ = out;
-  }
-
-  // REQUIRES: The object referenced by 'err' is valid for the lifetime
-  // of the reporter.
-  void SetErrorStream(std::ostream* err) {
-    assert(err);
-    error_stream_ = err;
-  }
-
-  std::ostream& GetOutputStream() const { return *output_stream_; }
-
-  std::ostream& GetErrorStream() const { return *error_stream_; }
-
-  virtual ~BenchmarkReporter();
-
-  // Write a human readable string to 'out' representing the specified
-  // 'context'.
-  // REQUIRES: 'out' is non-null.
-  static void PrintBasicContext(std::ostream* out, Context const& context);
-
- private:
-  std::ostream* output_stream_;
-  std::ostream* error_stream_;
-};
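A minimal sketch of a custom reporter built on the interface above; MinimalReporter is a hypothetical name and the output format is invented, but the field and method names follow the Run struct and BenchmarkReporter class as declared:

class MinimalReporter : public benchmark::BenchmarkReporter {
 public:
  virtual bool ReportContext(const Context& context) {
    PrintBasicContext(&GetErrorStream(), context);
    return true;  // returning false would skip the benchmark run entirely
  }

  virtual void ReportRuns(const std::vector<Run>& reports) {
    for (size_t i = 0; i < reports.size(); ++i) {
      const Run& run = reports[i];
      GetOutputStream() << run.benchmark_name << " "
                        << run.GetAdjustedCPUTime() << " "
                        << benchmark::GetTimeUnitString(run.time_unit) << "\n";
    }
  }
};

It would then be handed to the library in main: MinimalReporter r; benchmark::RunSpecifiedBenchmarks(&r);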
-
-// Simple reporter that outputs benchmark data to the console. This is the
-// default reporter used by RunSpecifiedBenchmarks().
-class ConsoleReporter : public BenchmarkReporter {
- public:
-  enum OutputOptions {
-    OO_None = 0,
-    OO_Color = 1,
-    OO_Tabular = 2,
-    OO_ColorTabular = OO_Color | OO_Tabular,
-    OO_Defaults = OO_ColorTabular
-  };
-  explicit ConsoleReporter(OutputOptions opts_ = OO_Defaults)
-      : output_options_(opts_),
-        name_field_width_(0),
-        prev_counters_(),
-        printed_header_(false) {}
-
-  virtual bool ReportContext(const Context& context);
-  virtual void ReportRuns(const std::vector<Run>& reports);
-
- protected:
-  virtual void PrintRunData(const Run& report);
-  virtual void PrintHeader(const Run& report);
-
-  OutputOptions output_options_;
-  size_t name_field_width_;
-  UserCounters prev_counters_;
-  bool printed_header_;
-};
-
-class JSONReporter : public BenchmarkReporter {
- public:
-  JSONReporter() : first_report_(true) {}
-  virtual bool ReportContext(const Context& context);
-  virtual void ReportRuns(const std::vector<Run>& reports);
-  virtual void Finalize();
-
- private:
-  void PrintRunData(const Run& report);
-
-  bool first_report_;
-};
-
-class CSVReporter : public BenchmarkReporter {
- public:
-  CSVReporter() : printed_header_(false) {}
-  virtual bool ReportContext(const Context& context);
-  virtual void ReportRuns(const std::vector<Run>& reports);
-
- private:
-  void PrintRunData(const Run& report);
-
-  bool printed_header_;
-  std::set< std::string > user_counter_names_;
-};
-
-inline const char* GetTimeUnitString(TimeUnit unit) {
-  switch (unit) {
-    case kMillisecond:
-      return "ms";
-    case kMicrosecond:
-      return "us";
-    case kNanosecond:
-    default:
-      return "ns";
-  }
-}
-
-inline double GetTimeUnitMultiplier(TimeUnit unit) {
-  switch (unit) {
-    case kMillisecond:
-      return 1e3;
-    case kMicrosecond:
-      return 1e6;
-    case kNanosecond:
-    default:
-      return 1e9;
-  }
-}
-
-}  // end namespace benchmark
 #endif  // BENCHMARK_REPORTER_H_
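The two time-unit helpers removed above pair naturally: the multiplier converts seconds into the display unit and the string labels it. A minimal sketch of a formatter (PrintElapsed is a hypothetical free function):

// Print an elapsed time, given in seconds, in the requested unit; e.g.
// 0.000250 s at kMicrosecond prints "250 us" (0.000250 * 1e6 = 250).
void PrintElapsed(std::ostream& os, double seconds, benchmark::TimeUnit unit) {
  os << seconds * benchmark::GetTimeUnitMultiplier(unit) << " "
     << benchmark::GetTimeUnitString(unit) << "\n";
}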
-#include "benchmark/benchmark_api.h" +#include "benchmark/benchmark.h" namespace benchmark { diff --git a/src/csv_reporter.cc b/src/csv_reporter.cc index b8072fd7..261b1514 100644 --- a/src/csv_reporter.cc +++ b/src/csv_reporter.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "benchmark/reporter.h" +#include "benchmark/benchmark.h" #include "complexity.h" #include diff --git a/src/cycleclock.h b/src/cycleclock.h index e0f9b01f..4251fe4c 100644 --- a/src/cycleclock.h +++ b/src/cycleclock.h @@ -23,7 +23,7 @@ #include -#include "benchmark/macros.h" +#include "benchmark/benchmark.h" #include "internal_macros.h" #if defined(BENCHMARK_OS_MACOSX) diff --git a/src/internal_macros.h b/src/internal_macros.h index ab9dd85c..8dd9f87b 100644 --- a/src/internal_macros.h +++ b/src/internal_macros.h @@ -1,7 +1,7 @@ #ifndef BENCHMARK_INTERNAL_MACROS_H_ #define BENCHMARK_INTERNAL_MACROS_H_ -#include "benchmark/macros.h" +#include "benchmark/benchmark.h" #ifndef __has_feature #define __has_feature(x) 0 diff --git a/src/json_reporter.cc b/src/json_reporter.cc index 5a653088..edf6ecc8 100644 --- a/src/json_reporter.cc +++ b/src/json_reporter.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "benchmark/reporter.h" +#include "benchmark/benchmark.h" #include "complexity.h" #include diff --git a/src/log.h b/src/log.h index 978cb0b4..d06e1031 100644 --- a/src/log.h +++ b/src/log.h @@ -4,7 +4,7 @@ #include #include -#include "benchmark/macros.h" +#include "benchmark/benchmark.h" namespace benchmark { namespace internal { @@ -70,4 +70,4 @@ inline LogType& GetLogInstanceForLevel(int level) { (::benchmark::internal::GetLogInstanceForLevel(x) << "-- LOG(" << x << "):" \ " ") -#endif \ No newline at end of file +#endif diff --git a/src/reporter.cc b/src/reporter.cc index 64742426..f0a90338 100644 --- a/src/reporter.cc +++ b/src/reporter.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "benchmark/reporter.h" +#include "benchmark/benchmark.h" #include "timers.h" #include diff --git a/test/basic_test.cc b/test/basic_test.cc index 22de007c..bc1f96d9 100644 --- a/test/basic_test.cc +++ b/test/basic_test.cc @@ -1,5 +1,5 @@ -#include "benchmark/benchmark_api.h" +#include "benchmark/benchmark.h" #define BASIC_BENCHMARK_TEST(x) BENCHMARK(x)->Arg(8)->Arg(512)->Arg(8192) diff --git a/test/diagnostics_test.cc b/test/diagnostics_test.cc index 1046730b..7aac8069 100644 --- a/test/diagnostics_test.cc +++ b/test/diagnostics_test.cc @@ -11,7 +11,7 @@ #include #include "../src/check.h" -#include "benchmark/benchmark_api.h" +#include "benchmark/benchmark.h" #if defined(__GNUC__) && !defined(__EXCEPTIONS) #define TEST_HAS_NO_EXCEPTIONS diff --git a/test/options_test.cc b/test/options_test.cc index bbbed288..8eac068b 100644 --- a/test/options_test.cc +++ b/test/options_test.cc @@ -1,4 +1,4 @@ -#include "benchmark/benchmark_api.h" +#include "benchmark/benchmark.h" #include #include