diff --git a/.travis-libcxx-setup.sh b/.travis-libcxx-setup.sh
index 1b6b5851..a591743c 100644
--- a/.travis-libcxx-setup.sh
+++ b/.travis-libcxx-setup.sh
@@ -10,12 +10,18 @@ git clone --depth=1 https://github.com/llvm-mirror/llvm.git llvm-source
 git clone --depth=1 https://github.com/llvm-mirror/libcxx.git llvm-source/projects/libcxx
 git clone --depth=1 https://github.com/llvm-mirror/libcxxabi.git llvm-source/projects/libcxxabi
 
+# Setup libc++ options
+if [ -z "$BUILD_32_BITS" ]; then
+  export BUILD_32_BITS=OFF && echo disabling 32 bit build
+fi
+
 # Build and install libc++ (Use unstable ABI for better sanitizer coverage)
 mkdir llvm-build && cd llvm-build
 cmake -DCMAKE_C_COMPILER=${C_COMPILER} -DCMAKE_CXX_COMPILER=${COMPILER} \
       -DCMAKE_BUILD_TYPE=RelWithDebInfo -DCMAKE_INSTALL_PREFIX=/usr \
       -DLIBCXX_ABI_UNSTABLE=ON \
       -DLLVM_USE_SANITIZER=${LIBCXX_SANITIZER} \
+      -DLLVM_BUILD_32_BITS=${BUILD_32_BITS} \
       ../llvm-source
 make cxx -j2
 sudo make install-cxxabi install-cxx
diff --git a/.travis.yml b/.travis.yml
index 19c68dda..6bbd13bb 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -20,6 +20,18 @@ matrix:
           env: COMPILER=g++ C_COMPILER=gcc       BUILD_TYPE=Debug
         - compiler: gcc
           env: COMPILER=g++ C_COMPILER=gcc       BUILD_TYPE=Release
+        - compiler: gcc
+          addons:
+            apt:
+              packages:
+                - g++-multilib
+          env: COMPILER=g++ C_COMPILER=gcc       BUILD_TYPE=Debug   BUILD_32_BITS=ON
+        - compiler: gcc
+          addons:
+            apt:
+              packages:
+                - g++-multilib
+          env: COMPILER=g++ C_COMPILER=gcc       BUILD_TYPE=Release BUILD_32_BITS=ON
         - compiler: gcc
           addons:
             apt:
@@ -44,6 +56,39 @@ matrix:
             - COMPILER=clang++-3.8 C_COMPILER=clang-3.8 BUILD_TYPE=Debug
             - LIBCXX_BUILD=1
             - EXTRA_FLAGS="-stdlib=libc++"
+        - compiler: clang
+          addons:
+            apt:
+              packages:
+                - clang-3.8
+          env:
+            - COMPILER=clang++-3.8 C_COMPILER=clang-3.8 BUILD_TYPE=Release
+            - LIBCXX_BUILD=1
+            - EXTRA_FLAGS="-stdlib=libc++"
+        # Clang w/ 32bit libc++
+        - compiler: clang
+          addons:
+            apt:
+              packages:
+                - clang-3.8
+                - g++-multilib
+          env:
+            - COMPILER=clang++-3.8 C_COMPILER=clang-3.8 BUILD_TYPE=Debug
+            - LIBCXX_BUILD=1
+            - BUILD_32_BITS=ON
+            - EXTRA_FLAGS="-stdlib=libc++ -m32"
+        # Clang w/ 32bit libc++
+        - compiler: clang
+          addons:
+            apt:
+              packages:
+                - clang-3.8
+                - g++-multilib
+          env:
+            - COMPILER=clang++-3.8 C_COMPILER=clang-3.8 BUILD_TYPE=Release
+            - LIBCXX_BUILD=1
+            - BUILD_32_BITS=ON
+            - EXTRA_FLAGS="-stdlib=libc++ -m32"
         # Clang w/ libc++, ASAN, UBSAN
         - compiler: clang
           addons:
@@ -77,6 +122,9 @@ matrix:
             - EXTRA_FLAGS="-stdlib=libc++ -g -O2 -fno-omit-frame-pointer -fsanitize=thread -fno-sanitize-recover=all"
 
 before_script:
+    - if [ -z "$BUILD_32_BITS" ]; then
+        export BUILD_32_BITS=OFF && echo disabling 32 bit build;
+      fi
     - if [ -n "${LIBCXX_BUILD}" ]; then
         source .travis-libcxx-setup.sh;
       fi
@@ -90,7 +138,7 @@ install:
     fi
 
 script:
-    - cmake -DCMAKE_C_COMPILER=${C_COMPILER} -DCMAKE_CXX_COMPILER=${COMPILER} -DCMAKE_BUILD_TYPE=${BUILD_TYPE} -DCMAKE_CXX_FLAGS="${EXTRA_FLAGS}" ..
+    - cmake -DCMAKE_C_COMPILER=${C_COMPILER} -DCMAKE_CXX_COMPILER=${COMPILER} -DCMAKE_BUILD_TYPE=${BUILD_TYPE} -DCMAKE_CXX_FLAGS="${EXTRA_FLAGS}" -DBENCHMARK_BUILD_32_BITS=${BUILD_32_BITS} ..
     - make
     - make CTEST_OUTPUT_ON_FAILURE=1 test
 
diff --git a/AUTHORS b/AUTHORS
index 8c15b87f..c4b059df 100644
--- a/AUTHORS
+++ b/AUTHORS
@@ -18,6 +18,7 @@ Eugene Zhuk <eugene.zhuk@gmail.com>
 Evgeny Safronov <division494@gmail.com>
 Felix Homann <linuxaudio@showlabor.de>
 Google Inc.
+International Business Machines Corporation
 Ismael Jimenez Martinez <ismael.jimenez.martinez@gmail.com>
 Joao Paulo Magalhaes <joaoppmagalhaes@gmail.com>
 JianXiong Zhou <zhoujianxiong2@gmail.com>
@@ -25,6 +26,7 @@ Jussi Knuuttila <jussi.knuuttila@gmail.com>
 Kaito Udagawa <umireon@gmail.com>
 Lei Xu <eddyxu@gmail.com>
 Matt Clarkson <mattyclarkson@gmail.com>
+Maxim Vafin <maxvafin@gmail.com>
 Nick Hutchinson <nshutchinson@gmail.com>
 Oleksandr Sochka <sasha.sochka@gmail.com>
 Paul Redmond <paul.redmond@gmail.com>
diff --git a/CMakeLists.txt b/CMakeLists.txt
index e3abf0fc..1ba31331 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -14,6 +14,8 @@ option(BENCHMARK_ENABLE_TESTING "Enable testing of the benchmark library." ON)
 option(BENCHMARK_ENABLE_EXCEPTIONS "Enable the use of exceptions in the benchmark library." ON)
 option(BENCHMARK_ENABLE_LTO "Enable link time optimisation of the benchmark library." OFF)
 option(BENCHMARK_USE_LIBCXX "Build and test using libc++ as the standard library." OFF)
+option(BENCHMARK_BUILD_32_BITS "Build a 32 bit version of the library" OFF)
+
 # Make sure we can import out CMake functions
 list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake")
 
@@ -34,6 +36,10 @@ include(CheckCXXCompilerFlag)
 include(AddCXXCompilerFlag)
 include(CXXFeatureCheck)
 
+if (BENCHMARK_BUILD_32_BITS)
+  add_required_cxx_compiler_flag(-m32)
+endif()
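+# Illustrative only: with this option, a 32-bit build can be requested at
+# configure time, e.g. `cmake -DBENCHMARK_BUILD_32_BITS=ON <path-to-source>`.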
+
 if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "MSVC")
   # Turn compiler warnings up to 11
   string(REGEX REPLACE "[-/]W[1-4]" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
@@ -92,8 +98,13 @@ else()
     add_cxx_compiler_flag(-Wzero-as-null-pointer-constant)
   endif()
   if (HAVE_CXX_FLAG_FSTRICT_ALIASING)
-    add_cxx_compiler_flag(-Wstrict-aliasing)
+    if (NOT CMAKE_CXX_COMPILER_ID STREQUAL "Intel") # ICC17u2: Many false positives for Wstrict-aliasing
+      add_cxx_compiler_flag(-Wstrict-aliasing)
+    endif()
   endif()
+  # ICC17u2: overloaded virtual function "benchmark::Fixture::SetUp" is only partially overridden
+  # (because of deprecated overload)
+  add_cxx_compiler_flag(-wd654)
   add_cxx_compiler_flag(-Wthread-safety)
   if (HAVE_CXX_FLAG_WTHREAD_SAFETY)
     cxx_feature_check(THREAD_SAFETY_ATTRIBUTES)
diff --git a/CONTRIBUTORS b/CONTRIBUTORS
index 96cd15a0..8ca4565a 100644
--- a/CONTRIBUTORS
+++ b/CONTRIBUTORS
@@ -41,12 +41,14 @@ Kaito Udagawa <umireon@gmail.com>
 Kai Wolf <kai.wolf@gmail.com>
 Lei Xu <eddyxu@gmail.com>
 Matt Clarkson <mattyclarkson@gmail.com>
+Maxim Vafin <maxvafin@gmail.com>
 Nick Hutchinson <nshutchinson@gmail.com>
 Oleksandr Sochka <sasha.sochka@gmail.com>
 Pascal Leroy <phl@google.com>
 Paul Redmond <paul.redmond@gmail.com>
 Pierre Phaneuf <pphaneuf@google.com>
 Radoslav Yovchev <radoslav.tm@gmail.com>
+Ray Glover <ray.glover@uk.ibm.com>
 Shuo Chen <chenshuo@chenshuo.com>
 Yusuke Suzuki <utatane.tea@gmail.com>
 Tobias Ulvgård <tobias.ulvgard@dirac.se>
diff --git a/README.md b/README.md
index 33d58155..2430d93b 100644
--- a/README.md
+++ b/README.md
@@ -365,7 +365,7 @@ static void BM_vector_push_back(benchmark::State& state) {
 }
 ```
 
-Note that `ClobberMemory()` is only available for GNU based compilers.
+Note that `ClobberMemory()` is only available for GNU- or MSVC-based compilers.
 
 ### Set time unit manually
 If a benchmark runs a few milliseconds it may be hard to visually compare the
@@ -710,6 +710,7 @@ The following minimum versions are strongly recommended to build the library:
 * GCC 4.8
 * Clang 3.4
 * Visual Studio 2013
+* Intel 2015 Update 1
 
 Anything older *may* work.
 
diff --git a/appveyor.yml b/appveyor.yml
index 204f30de..e084f386 100644
--- a/appveyor.yml
+++ b/appveyor.yml
@@ -1,16 +1,18 @@
 version: '{build}'
 
+image: Visual Studio 2017
+
 configuration:
   - Debug
   - Release
 
 environment:
   matrix:
-    - compiler: msvc-12-seh
-      generator: "Visual Studio 12 2013"
+    - compiler: msvc-15-seh
+      generator: "Visual Studio 15 2017"
 
-    - compiler: msvc-12-seh
-      generator: "Visual Studio 12 2013 Win64"
+    - compiler: msvc-15-seh
+      generator: "Visual Studio 15 2017 Win64"
 
     - compiler: msvc-14-seh
       generator: "Visual Studio 14 2015"
@@ -18,9 +20,16 @@ environment:
     - compiler: msvc-14-seh
       generator: "Visual Studio 14 2015 Win64"
 
+    - compiler: msvc-12-seh
+      generator: "Visual Studio 12 2013"
+
+    - compiler: msvc-12-seh
+      generator: "Visual Studio 12 2013 Win64"
+
     - compiler: gcc-5.3.0-posix
       generator: "MinGW Makefiles"
       cxx_path: 'C:\mingw-w64\i686-5.3.0-posix-dwarf-rt_v4-rev0\mingw32\bin'
+      APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2015
 
 matrix:
   fast_finish: true
@@ -30,12 +39,6 @@ install:
   - if "%generator%"=="MinGW Makefiles" (set "PATH=%PATH:C:\Program Files\Git\usr\bin;=%")
   - if not "%cxx_path%"=="" (set "PATH=%PATH%;%cxx_path%")
 
-# TODO Remove this. This is a hack to work around bogus warning messages
-# See http://goo.gl/euguBI for more information.
-before_build:
-  - del "C:\Program Files (x86)\MSBuild\14.0\Microsoft.Common.targets\ImportAfter\Xamarin.Common.targets"
-  - del "C:\Program Files (x86)\MSBuild\12.0\Microsoft.Common.targets\ImportAfter\Xamarin.Common.targets"
-
 build_script:
   - md _build -Force
   - cd _build
@@ -51,4 +54,3 @@ artifacts:
     name: logs
   - path: '_build/Testing/**/*.xml'
     name: test_results
-
diff --git a/cmake/AddCXXCompilerFlag.cmake b/cmake/AddCXXCompilerFlag.cmake
index 9afde84b..0b176ba2 100644
--- a/cmake/AddCXXCompilerFlag.cmake
+++ b/cmake/AddCXXCompilerFlag.cmake
@@ -19,14 +19,21 @@ set(__add_cxx_compiler_flag INCLUDED)
 
 include(CheckCXXCompilerFlag)
 
-function(add_cxx_compiler_flag FLAG)
+function(mangle_compiler_flag FLAG OUTPUT)
   string(TOUPPER "HAVE_CXX_FLAG_${FLAG}" SANITIZED_FLAG)
   string(REPLACE "+" "X" SANITIZED_FLAG ${SANITIZED_FLAG})
   string(REGEX REPLACE "[^A-Za-z_0-9]" "_" SANITIZED_FLAG ${SANITIZED_FLAG})
   string(REGEX REPLACE "_+" "_" SANITIZED_FLAG ${SANITIZED_FLAG})
-  set(CMAKE_REQUIRED_FLAGS "${FLAG}")
-  check_cxx_compiler_flag("${FLAG}" ${SANITIZED_FLAG})
-  if(${SANITIZED_FLAG})
+  set(${OUTPUT} "${SANITIZED_FLAG}" PARENT_SCOPE)
+endfunction(mangle_compiler_flag)
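+# For example (illustrative), mangle_compiler_flag("-std=c++11" OUT) sets OUT
+# to "HAVE_CXX_FLAG_STD_CXX11", following the sanitization rules above.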
+
+function(add_cxx_compiler_flag FLAG)
+  mangle_compiler_flag("${FLAG}" MANGLED_FLAG)
+  set(OLD_CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS}")
+  set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS} ${FLAG}")
+  check_cxx_compiler_flag("${FLAG}" ${MANGLED_FLAG})
+  set(CMAKE_REQUIRED_FLAGS "${OLD_CMAKE_REQUIRED_FLAGS}")
+  if(${MANGLED_FLAG})
     set(VARIANT ${ARGV1})
     if(ARGV1)
       string(TOUPPER "_${VARIANT}" VARIANT)
@@ -35,3 +42,23 @@ function(add_cxx_compiler_flag FLAG)
   endif()
 endfunction()
 
+function(add_required_cxx_compiler_flag FLAG)
+  mangle_compiler_flag("${FLAG}" MANGLED_FLAG)
+  set(OLD_CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS}")
+  set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS} ${FLAG}")
+  check_cxx_compiler_flag("${FLAG}" ${MANGLED_FLAG})
+  set(CMAKE_REQUIRED_FLAGS "${OLD_CMAKE_REQUIRED_FLAGS}")
+  if(${MANGLED_FLAG})
+    set(VARIANT ${ARGV1})
+    if(ARGV1)
+      string(TOUPPER "_${VARIANT}" VARIANT)
+    endif()
+    set(CMAKE_CXX_FLAGS${VARIANT} "${CMAKE_CXX_FLAGS${VARIANT}} ${FLAG}" PARENT_SCOPE)
+    set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${FLAG}" PARENT_SCOPE)
+    set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} ${FLAG}" PARENT_SCOPE)
+    set(CMAKE_MODULE_LINKER_FLAGS "${CMAKE_MODULE_LINKER_FLAGS} ${FLAG}" PARENT_SCOPE)
+    set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS} ${FLAG}" PARENT_SCOPE)
+  else()
+    message(FATAL_ERROR "Required flag '${FLAG}' is not supported by the compiler")
+  endif()
+endfunction()
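+
+# Usage sketch (the -m32 case mirrors the top-level CMakeLists.txt change):
+#   add_required_cxx_compiler_flag(-m32)
+# Unlike add_cxx_compiler_flag, configuration aborts with FATAL_ERROR when the
+# compiler rejects the flag.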
diff --git a/cmake/Config.cmake.in b/cmake/Config.cmake.in
new file mode 100644
index 00000000..6e9256ee
--- /dev/null
+++ b/cmake/Config.cmake.in
@@ -0,0 +1 @@
+include("${CMAKE_CURRENT_LIST_DIR}/@targets_export_name@.cmake")
diff --git a/include/benchmark/benchmark_api.h b/include/benchmark/benchmark_api.h
index da3e7f9e..1e853e2c 100644
--- a/include/benchmark/benchmark_api.h
+++ b/include/benchmark/benchmark_api.h
@@ -165,6 +165,10 @@ BENCHMARK(BM_test)->Unit(benchmark::kMillisecond);
 #include <utility>
 #endif
 
+#if defined(_MSC_VER)
+#include <intrin.h> // for _ReadWriteBarrier
+#endif
+
 namespace benchmark {
 class BenchmarkReporter;
 
@@ -203,19 +207,6 @@ class Benchmark;
 class BenchmarkImp;
 class BenchmarkFamilies;
 
-template <class T>
-struct Voider {
-  typedef void type;
-};
-
-template <class T, class = void>
-struct EnableIfString {};
-
-template <class T>
-struct EnableIfString<T, typename Voider<typename T::basic_string>::type> {
-  typedef int type;
-};
-
 void UseCharPointer(char const volatile*);
 
 // Take ownership of the pointer and register the benchmark. Return the
@@ -228,11 +219,16 @@ BENCHMARK_UNUSED static int stream_init_anchor = InitializeStreams();
 
 }  // end namespace internal
 
+
+#if !defined(__GNUC__) || defined(__pnacl__) || defined(EMSCRIPTEN)
+# define BENCHMARK_HAS_NO_INLINE_ASSEMBLY
+#endif
+
 // The DoNotOptimize(...) function can be used to prevent a value or
 // expression from being optimized away by the compiler. This function is
 // intended to add little to no overhead.
 // See: https://youtu.be/nXaxk27zwlk?t=2441
-#if defined(__GNUC__) && !defined(__pnacl__) && !defined(EMSCRIPTEN)
+#ifndef BENCHMARK_HAS_NO_INLINE_ASSEMBLY
 template <class Tp>
 inline BENCHMARK_ALWAYS_INLINE void DoNotOptimize(Tp const& value) {
   asm volatile("" : : "g"(value) : "memory");
@@ -242,12 +238,22 @@ inline BENCHMARK_ALWAYS_INLINE void DoNotOptimize(Tp const& value) {
 inline BENCHMARK_ALWAYS_INLINE void ClobberMemory() {
   asm volatile("" : : : "memory");
 }
+#elif defined(_MSC_VER)
+template <class Tp>
+inline BENCHMARK_ALWAYS_INLINE void DoNotOptimize(Tp const& value) {
+  internal::UseCharPointer(&reinterpret_cast<char const volatile&>(value));
+  _ReadWriteBarrier();
+}
+
+inline BENCHMARK_ALWAYS_INLINE void ClobberMemory() {
+  _ReadWriteBarrier();
+}
 #else
 template <class Tp>
 inline BENCHMARK_ALWAYS_INLINE void DoNotOptimize(Tp const& value) {
   internal::UseCharPointer(&reinterpret_cast<char const volatile&>(value));
 }
-// FIXME Add ClobberMemory() for non-gnu compilers
+// FIXME Add ClobberMemory() for non-gnu and non-msvc compilers
 #endif
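+
+// A minimal usage sketch (the benchmark below is illustrative, not part of
+// the library):
+//
+//   static void BM_copy(benchmark::State& state) {
+//     std::vector<int> src(1024, 42);
+//     while (state.KeepRunning()) {
+//       std::vector<int> dst = src;
+//       benchmark::DoNotOptimize(dst.data()); // the copy must be materialized
+//       benchmark::ClobberMemory();           // flush pending writes to memory
+//     }
+//   }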
 
 
@@ -432,13 +438,7 @@ class State {
   // REQUIRES: a benchmark has exited its KeepRunning loop.
   void SetLabel(const char* label);
 
-  // Allow the use of std::string without actually including <string>.
-  // This function does not participate in overload resolution unless StringType
-  // has the nested typename `basic_string`. This typename should be provided
-  // as an injected class name in the case of std::string.
-  template <class StringType>
-  void SetLabel(StringType const& str,
-                typename internal::EnableIfString<StringType>::type = 1) {
+  void BENCHMARK_ALWAYS_INLINE SetLabel(const std::string& str) {
     this->SetLabel(str.c_str());
   }
 
@@ -577,9 +577,17 @@ class Benchmark {
 
   // Set the minimum amount of time to use when running this benchmark. This
   // option overrides the `benchmark_min_time` flag.
-  // REQUIRES: `t > 0`
+  // REQUIRES: `t > 0` and `Iterations` has not been called on this benchmark.
   Benchmark* MinTime(double t);
 
+  // Specify the number of iterations that should be run by this benchmark.
+  // REQUIRES: `n > 0` and `MinTime` has not been called on this benchmark.
+  //
+  // NOTE: This function should only be used when *exact* iteration control is
+  //   needed and never to control or limit how long a benchmark runs, where
+  //   `--benchmark_min_time=N` or `MinTime(...)` should be used instead.
+  Benchmark* Iterations(size_t n);
+
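+  // Usage sketch (BM_foo is a hypothetical benchmark):
+  //   BENCHMARK(BM_foo)->Iterations(42);  // runs exactly 42 iterations
+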
   // Specify the amount of times to repeat this benchmark. This option overrides
   // the `benchmark_repetitions` flag.
   // REQUIRES: `n > 0`
@@ -668,6 +676,7 @@ class Benchmark {
   TimeUnit time_unit_;
   int range_multiplier_;
   double min_time_;
+  size_t iterations_;
   int repetitions_;
   bool use_real_time_;
   bool use_manual_time_;
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index dce98cf8..77077739 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -31,18 +31,45 @@ if(${CMAKE_SYSTEM_NAME} MATCHES "Windows")
   target_link_libraries(benchmark Shlwapi)
 endif()
 
-# Expose public API
-target_include_directories(benchmark PUBLIC ${PROJECT_SOURCE_DIR}/include)
+set(include_install_dir "include")
+set(lib_install_dir "lib/")
+set(bin_install_dir "bin/")
+set(config_install_dir "lib/cmake/${PROJECT_NAME}")
+
+set(generated_dir "${CMAKE_CURRENT_BINARY_DIR}/generated")
+
+set(version_config "${generated_dir}/${PROJECT_NAME}ConfigVersion.cmake")
+set(project_config "${generated_dir}/${PROJECT_NAME}Config.cmake")
+set(targets_export_name "${PROJECT_NAME}Targets")
+
+set(namespace "${PROJECT_NAME}::")
+
+include(CMakePackageConfigHelpers)
+write_basic_package_version_file(
+    "${version_config}" VERSION ${GIT_VERSION} COMPATIBILITY SameMajorVersion
+)
+
+configure_file("${PROJECT_SOURCE_DIR}/cmake/Config.cmake.in" "${project_config}" @ONLY)
 
 # Install target (will install the library to specified CMAKE_INSTALL_PREFIX variable)
 install(
   TARGETS benchmark
-  ARCHIVE DESTINATION lib
-  LIBRARY DESTINATION lib
-  RUNTIME DESTINATION bin
-  COMPONENT library)
+  EXPORT ${targets_export_name}
+  ARCHIVE DESTINATION ${lib_install_dir}
+  LIBRARY DESTINATION ${lib_install_dir}
+  RUNTIME DESTINATION ${bin_install_dir}
+  INCLUDES DESTINATION ${include_install_dir})
 
 install(
   DIRECTORY "${PROJECT_SOURCE_DIR}/include/benchmark"
-  DESTINATION include
+  DESTINATION ${include_install_dir}
   FILES_MATCHING PATTERN "*.*h")
+
+install(
+    FILES "${project_config}" "${version_config}"
+    DESTINATION "${config_install_dir}")
+
+install(
+    EXPORT "${targets_export_name}"
+    NAMESPACE "${namespace}"
+    DESTINATION "${config_install_dir}")
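+
+# Downstream projects should then be able to consume the installed package;
+# a sketch (my_tool is a placeholder target):
+#   find_package(benchmark REQUIRED)
+#   target_link_libraries(my_tool benchmark::benchmark)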
diff --git a/src/benchmark.cc b/src/benchmark.cc
index 712d4a39..dea6756d 100644
--- a/src/benchmark.cc
+++ b/src/benchmark.cc
@@ -257,6 +257,7 @@ BenchmarkReporter::Run CreateRunReport(
     report.complexity = b.complexity;
     report.complexity_lambda = b.complexity_lambda;
     report.counters = results.counters;
+    internal::Finish(&report.counters, seconds, b.threads);
   }
   return report;
 }
@@ -290,7 +291,8 @@ std::vector<BenchmarkReporter::Run> RunBenchmark(
     std::vector<BenchmarkReporter::Run>* complexity_reports) {
   std::vector<BenchmarkReporter::Run> reports;  // return value
 
-  size_t iters = 1;
+  const bool has_explicit_iteration_count = b.iterations != 0;
+  size_t iters = has_explicit_iteration_count ? b.iterations : 1;
   std::unique_ptr<internal::ThreadManager> manager;
   std::vector<std::thread> pool(b.threads - 1);
   const int repeats =
@@ -300,7 +302,7 @@ std::vector<BenchmarkReporter::Run> RunBenchmark(
       (b.report_mode == internal::RM_Unspecified
            ? FLAGS_benchmark_report_aggregates_only
            : b.report_mode == internal::RM_ReportAggregatesOnly);
-  for (int i = 0; i < repeats; i++) {
+  for (int repetition_num = 0; repetition_num < repeats; repetition_num++) {
     for (;;) {
       // Try benchmark
       VLOG(2) << "Running " << b.name << " for " << iters << "\n";
@@ -336,10 +338,20 @@ std::vector<BenchmarkReporter::Run> RunBenchmark(
 
       const double min_time =
           !IsZero(b.min_time) ? b.min_time : FLAGS_benchmark_min_time;
-      // If this was the first run, was elapsed time or cpu time large enough?
-      // If this is not the first run, go with the current value of iter.
-      if ((i > 0) || results.has_error_ || (iters >= kMaxIterations) ||
-          (seconds >= min_time) || (results.real_time_used >= 5 * min_time)) {
+
+      // Determine if this run should be reported; either it has
+      // run for a sufficient amount of time or an error was reported.
+      const bool should_report = repetition_num > 0
+        || has_explicit_iteration_count // An exact iteration count was requested
+        || results.has_error_
+        || iters >= kMaxIterations
+        || seconds >= min_time // the elapsed time is large enough
+        // CPU time is specified but the elapsed real time greatly exceeds the
+        // minimum time. Note that user-provided timers are exempt from this
+        // sanity check.
+        || ((results.real_time_used >= 5 * min_time) && !b.use_manual_time);
+
+      if (should_report) {
         BenchmarkReporter::Run report =
             CreateRunReport(b, results, iters, seconds);
         if (!report.error_occurred && b.complexity != oNone)
diff --git a/src/benchmark_api_internal.h b/src/benchmark_api_internal.h
index 978fc0a0..828ed121 100644
--- a/src/benchmark_api_internal.h
+++ b/src/benchmark_api_internal.h
@@ -28,6 +28,7 @@ struct Benchmark::Instance {
   bool last_benchmark_instance;
   int repetitions;
   double min_time;
+  size_t iterations;
   int threads;  // Number of concurrent threads to use
 };
 
diff --git a/src/benchmark_register.cc b/src/benchmark_register.cc
index cf78bbf4..c95da987 100644
--- a/src/benchmark_register.cc
+++ b/src/benchmark_register.cc
@@ -31,6 +31,7 @@
 #include <fstream>
 #include <iostream>
 #include <memory>
+#include <sstream>
 #include <thread>
 
 #include "check.h"
@@ -143,6 +144,7 @@ bool BenchmarkFamilies::FindBenchmarks(
         instance.time_unit = family->time_unit_;
         instance.range_multiplier = family->range_multiplier_;
         instance.min_time = family->min_time_;
+        instance.iterations = family->iterations_;
         instance.repetitions = family->repetitions_;
         instance.use_real_time = family->use_real_time_;
         instance.use_manual_time = family->use_manual_time_;
@@ -162,17 +164,18 @@ bool BenchmarkFamilies::FindBenchmarks(
                   StringPrintF("%s:", family->arg_names_[arg_i].c_str());
             }
           }
-
-          instance.name += std::to_string(arg);
+
+          instance.name += StringPrintF("%d", arg);
           ++arg_i;
         }
 
-        if (!IsZero(family->min_time_)) {
+        if (!IsZero(family->min_time_))
           instance.name += StringPrintF("/min_time:%0.3f", family->min_time_);
-        }
-        if (family->repetitions_ != 0) {
+        if (family->iterations_ != 0)
+          instance.name += StringPrintF("/iterations:%d", static_cast<int>(family->iterations_));
+        if (family->repetitions_ != 0)
           instance.name += StringPrintF("/repeats:%d", family->repetitions_);
-        }
+
         if (family->use_manual_time_) {
           instance.name += "/manual_time";
         } else if (family->use_real_time_) {
@@ -219,6 +222,7 @@ Benchmark::Benchmark(const char* name)
       time_unit_(kNanosecond),
       range_multiplier_(kRangeMultiplier),
       min_time_(0),
+      iterations_(0),
       repetitions_(0),
       use_real_time_(false),
       use_manual_time_(false),
@@ -344,6 +348,22 @@ Benchmark* Benchmark::RangeMultiplier(int multiplier) {
   return this;
 }
 
+
+Benchmark* Benchmark::MinTime(double t) {
+  CHECK(t > 0.0);
+  CHECK(iterations_ == 0);
+  min_time_ = t;
+  return this;
+}
+
+
+Benchmark* Benchmark::Iterations(size_t n) {
+  CHECK(n > 0);
+  CHECK(IsZero(min_time_));
+  iterations_ = n;
+  return this;
+}
+
 Benchmark* Benchmark::Repetitions(int n) {
   CHECK(n > 0);
   repetitions_ = n;
@@ -355,12 +375,6 @@ Benchmark* Benchmark::ReportAggregatesOnly(bool value) {
   return this;
 }
 
-Benchmark* Benchmark::MinTime(double t) {
-  CHECK(t > 0.0);
-  min_time_ = t;
-  return this;
-}
-
 Benchmark* Benchmark::UseRealTime() {
   CHECK(!use_manual_time_)
       << "Cannot set UseRealTime and UseManualTime simultaneously.";
diff --git a/src/check.h b/src/check.h
index 6f1fe0cf..73bead2f 100644
--- a/src/check.h
+++ b/src/check.h
@@ -3,6 +3,7 @@
 
 #include <cstdlib>
 #include <ostream>
+#include <cmath>
 
 #include "internal_macros.h"
 #include "log.h"
@@ -68,4 +69,11 @@ class CheckHandler {
 #define CHECK_GT(a, b) CHECK((a) > (b))
 #define CHECK_LT(a, b) CHECK((a) < (b))
 
+#define CHECK_FLOAT_EQ(a, b, eps) CHECK(std::fabs((a) - (b)) <  (eps))
+#define CHECK_FLOAT_NE(a, b, eps) CHECK(std::fabs((a) - (b)) >= (eps))
+#define CHECK_FLOAT_GE(a, b, eps) CHECK((a) - (b) > -(eps))
+#define CHECK_FLOAT_LE(a, b, eps) CHECK((b) - (a) > -(eps))
+#define CHECK_FLOAT_GT(a, b, eps) CHECK((a) - (b) >  (eps))
+#define CHECK_FLOAT_LT(a, b, eps) CHECK((b) - (a) >  (eps))
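+// For example, CHECK_FLOAT_EQ(x, 1.0, 0.001) passes iff |x - 1.0| < 0.001.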
+
 #endif  // CHECK_H_
diff --git a/src/colorprint.cc b/src/colorprint.cc
index 513376b1..2dec4a8b 100644
--- a/src/colorprint.cc
+++ b/src/colorprint.cc
@@ -89,7 +89,7 @@ std::string FormatString(const char* msg, va_list args) {
 
   std::size_t size = 256;
   char local_buff[256];
-  auto ret = std::vsnprintf(local_buff, size, msg, args_cp);
+  auto ret = vsnprintf(local_buff, size, msg, args_cp);
 
   va_end(args_cp);
 
@@ -104,7 +104,7 @@ std::string FormatString(const char* msg, va_list args) {
     // we did not provide a long enough buffer on our first attempt.
     size = (size_t)ret + 1;  // + 1 for the null byte
     std::unique_ptr<char[]> buff(new char[size]);
-    ret = std::vsnprintf(buff.get(), size, msg, args);
+    ret = vsnprintf(buff.get(), size, msg, args);
     CHECK(ret > 0 && ((size_t)ret) < size);
     return buff.get();
   }
diff --git a/src/complexity.cc b/src/complexity.cc
index 015db4ce..a4c57c43 100644
--- a/src/complexity.cc
+++ b/src/complexity.cc
@@ -35,9 +35,9 @@ BigOFunc* FittingCurve(BigO complexity) {
     case oNCubed:
       return [](int n) -> double { return std::pow(n, 3); };
     case oLogN:
-      return [](int n) { return std::log2(n); };
+      return [](int n) { return log2(n); };
     case oNLogN:
-      return [](int n) { return n * std::log2(n); };
+      return [](int n) { return n * log2(n); };
     case o1:
     default:
       return [](int) { return 1.0; };
@@ -295,6 +295,11 @@ std::vector<BenchmarkReporter::Run> ComputeBigO(
   big_o.report_big_o = true;
   big_o.complexity = result_cpu.complexity;
 
+  // All the time results are reported after being multiplied by the
+  // time unit multiplier. But since RMS is a relative quantity it
+  // should not be multiplied at all. So, here, we _divide_ it by the
+  // multiplier so that when it is multiplied later the result is the
+  // correct one.
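+  // (E.g. with kMicrosecond the multiplier is 1e6: reporting rms / 1e6 and
+  // multiplying by 1e6 at output time recovers the original, unitless RMS.)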
   double multiplier = GetTimeUnitMultiplier(reports[0].time_unit);
 
   // Only add label to mean/stddev if it is same for all runs
@@ -307,6 +312,9 @@ std::vector<BenchmarkReporter::Run> ComputeBigO(
   rms.cpu_accumulated_time = result_cpu.rms / multiplier;
   rms.report_rms = true;
   rms.complexity = result_cpu.complexity;
+  // don't forget to keep the time unit, or we won't be able to
+  // recover the correct value.
+  rms.time_unit = reports[0].time_unit;
 
   results.push_back(big_o);
   results.push_back(rms);
diff --git a/src/console_reporter.cc b/src/console_reporter.cc
index efa584c8..fe0dda9e 100644
--- a/src/console_reporter.cc
+++ b/src/console_reporter.cc
@@ -53,11 +53,17 @@ bool ConsoleReporter::ReportContext(const Context& context) {
 }
 
 void ConsoleReporter::PrintHeader(const Run& run) {
-  std::string str =
-      FormatString("%-*s %13s %13s %10s\n", static_cast<int>(name_field_width_),
-                   "Benchmark", "Time", "CPU", "Iterations");
+  std::string str = FormatString("%-*s %13s %13s %10s\n", static_cast<int>(name_field_width_),
+                                 "Benchmark", "Time", "CPU", "Iterations");
   if(!run.counters.empty()) {
-    str += " UserCounters...";
+    if(output_options_ & OO_Tabular) {
+      for(auto const& c : run.counters) {
+        str += FormatString(" %10s", c.first);
+      }
+    }
+    else {
+      str += " UserCounters...";
+    }
   }
   std::string line = std::string(str.length(), '-');
   GetOutputStream() << line << "\n" << str << line << "\n";
@@ -143,10 +149,12 @@ void ConsoleReporter::PrintRunData(const Run& result) {
 
   for (auto& c : result.counters) {
     auto const& s = HumanReadableNumber(c.second.value);
+    const char* unit = (c.second.flags & Counter::kIsRate) ? "/s" : "";
     if(output_options_ & OO_Tabular) {
-      printer(Out, COLOR_DEFAULT, " %10s", s.c_str());
+      printer(Out, COLOR_DEFAULT, " %10s%s", s.c_str(), unit);
     } else {
-      printer(Out, COLOR_DEFAULT, " %s=%s", c.first.c_str(), s.c_str());
+      printer(Out, COLOR_DEFAULT, " %s=%s%s", c.first.c_str(), s.c_str(),
+              unit);
     }
   }
 
diff --git a/src/counter.cc b/src/counter.cc
index 8b0a8eeb..ed1aa044 100644
--- a/src/counter.cc
+++ b/src/counter.cc
@@ -30,7 +30,7 @@ double Finish(Counter const& c, double cpu_time, double num_threads) {
 
 void Finish(UserCounters *l, double cpu_time, double num_threads) {
   for (auto &c : *l) {
-    c.second = Finish(c.second, cpu_time, num_threads);
+    c.second.value = Finish(c.second, cpu_time, num_threads);
   }
 }
 
@@ -39,7 +39,7 @@ void Increment(UserCounters *l, UserCounters const& r) {
   for (auto &c : *l) {
     auto it = r.find(c.first);
     if (it != r.end()) {
-      c.second = c.second + it->second;
+      c.second.value = c.second + it->second;
     }
   }
   // add counters present in r, but not in *l
diff --git a/src/sleep.cc b/src/sleep.cc
index 918abc48..54aa04a4 100644
--- a/src/sleep.cc
+++ b/src/sleep.cc
@@ -15,6 +15,7 @@
 #include "sleep.h"
 
 #include <cerrno>
+#include <cstdlib>
 #include <ctime>
 
 #include "internal_macros.h"
@@ -40,7 +41,7 @@ void SleepForMicroseconds(int microseconds) {
 }
 
 void SleepForMilliseconds(int milliseconds) {
-  SleepForMicroseconds(static_cast<int>(milliseconds) * kNumMicrosPerMilli);
+  SleepForMicroseconds(milliseconds * kNumMicrosPerMilli);
 }
 
 void SleepForSeconds(double seconds) {
diff --git a/src/sleep.h b/src/sleep.h
index f1e515ca..f98551af 100644
--- a/src/sleep.h
+++ b/src/sleep.h
@@ -1,14 +1,12 @@
 #ifndef BENCHMARK_SLEEP_H_
 #define BENCHMARK_SLEEP_H_
 
-#include <cstdint>
-
 namespace benchmark {
-const int64_t kNumMillisPerSecond = 1000LL;
-const int64_t kNumMicrosPerMilli = 1000LL;
-const int64_t kNumMicrosPerSecond = kNumMillisPerSecond * 1000LL;
-const int64_t kNumNanosPerMicro = 1000LL;
-const int64_t kNumNanosPerSecond = kNumNanosPerMicro * kNumMicrosPerSecond;
+const int kNumMillisPerSecond = 1000;
+const int kNumMicrosPerMilli = 1000;
+const int kNumMicrosPerSecond = kNumMillisPerSecond * 1000;
+const int kNumNanosPerMicro = 1000;
+const int kNumNanosPerSecond = kNumNanosPerMicro * kNumMicrosPerSecond;
 
 void SleepForMilliseconds(int milliseconds);
 void SleepForSeconds(double seconds);
diff --git a/src/sysinfo.cc b/src/sysinfo.cc
index 34f9871c..7feb79e6 100644
--- a/src/sysinfo.cc
+++ b/src/sysinfo.cc
@@ -75,7 +75,9 @@ bool ReadIntFromFile(const char* file, long* value) {
     char line[1024];
     char* err;
     memset(line, '\0', sizeof(line));
-    CHECK(read(fd, line, sizeof(line) - 1));
+    ssize_t read_err = read(fd, line, sizeof(line) - 1);
+    ((void)read_err); // prevent unused warning
+    CHECK(read_err >= 0);
     const long temp_value = strtol(line, &err, 10);
     if (line[0] != '\0' && (*err == '\n' || *err == '\0')) {
       *value = temp_value;
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index 87245984..40eb8808 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -2,13 +2,32 @@
 
 find_package(Threads REQUIRED)
 
+# NOTE: Some tests use `<cassert>` to perform checks. Therefore we must
+# strip -DNDEBUG from the default CMake flags in non-Debug modes.
+string(TOUPPER "${CMAKE_BUILD_TYPE}" uppercase_CMAKE_BUILD_TYPE)
+if( NOT uppercase_CMAKE_BUILD_TYPE STREQUAL "DEBUG" )
+  add_definitions( -UNDEBUG )
+  add_definitions(-DTEST_BENCHMARK_LIBRARY_HAS_NO_ASSERTIONS)
+  # Also remove /D NDEBUG to avoid MSVC warnings about conflicting defines.
+  foreach (flags_var_to_scrub
+      CMAKE_CXX_FLAGS_RELEASE
+      CMAKE_CXX_FLAGS_RELWITHDEBINFO
+      CMAKE_CXX_FLAGS_MINSIZEREL
+      CMAKE_C_FLAGS_RELEASE
+      CMAKE_C_FLAGS_RELWITHDEBINFO
+      CMAKE_C_FLAGS_MINSIZEREL)
+    string (REGEX REPLACE "(^| )[/-]D *NDEBUG($| )" " "
+      "${flags_var_to_scrub}" "${${flags_var_to_scrub}}")
+  endforeach()
+endif()
+
 # NOTE: These flags must be added after find_package(Threads REQUIRED) otherwise
 # they will break the configuration check.
 if (DEFINED BENCHMARK_CXX_LINKER_FLAGS)
   list(APPEND CMAKE_EXE_LINKER_FLAGS ${BENCHMARK_CXX_LINKER_FLAGS})
 endif()
 
-add_library(output_test_helper STATIC output_test_helper.cc)
+add_library(output_test_helper STATIC output_test_helper.cc output_test.h)
 
 macro(compile_benchmark_test name)
   add_executable(${name} "${name}.cc")
@@ -73,6 +92,9 @@ add_test(multiple_ranges_test multiple_ranges_test --benchmark_min_time=0.01)
 compile_output_test(reporter_output_test)
 add_test(reporter_output_test reporter_output_test --benchmark_min_time=0.01)
 
+compile_output_test(user_counters_test)
+add_test(user_counters_test user_counters_test --benchmark_min_time=0.01)
+
 check_cxx_compiler_flag(-std=c++03 BENCHMARK_HAS_CXX03_FLAG)
 if (BENCHMARK_HAS_CXX03_FLAG)
   set(CXX03_FLAGS "${CMAKE_CXX_FLAGS}")
diff --git a/test/benchmark_test.cc b/test/benchmark_test.cc
index dfcf0920..7a16466e 100644
--- a/test/benchmark_test.cc
+++ b/test/benchmark_test.cc
@@ -150,7 +150,7 @@ static void BM_LongTest(benchmark::State& state) {
 BENCHMARK(BM_LongTest)->Range(1 << 16, 1 << 28);
 
 static void BM_ParallelMemset(benchmark::State& state) {
-  int size = state.range(0) / sizeof(int);
+  int size = state.range(0) / static_cast<int>(sizeof(int));
   int thread_size = size / state.threads;
   int from = thread_size * state.thread_index;
   int to = from + thread_size;
@@ -213,23 +213,6 @@ void BM_non_template_args(benchmark::State& state, int, double) {
 }
 BENCHMARK_CAPTURE(BM_non_template_args, basic_test, 0, 0);
 
-static void BM_UserCounter(benchmark::State& state) {
-  static const int depth = 1024;
-  while (state.KeepRunning()) {
-    benchmark::DoNotOptimize(CalculatePi(depth));
-  }
-  state.counters["Foo"] = 1;
-  state.counters["Bar"] = 2;
-  state.counters["Baz"] = 3;
-  state.counters["Bat"] = 5;
-#ifdef BENCHMARK_HAS_CXX11
-  state.counters.insert({{"Foo", 2}, {"Bar", 3}, {"Baz", 5}, {"Bat", 6}});
-#endif
-}
-BENCHMARK(BM_UserCounter)->Threads(8);
-BENCHMARK(BM_UserCounter)->ThreadRange(1, 32);
-BENCHMARK(BM_UserCounter)->ThreadPerCpu();
-
 #endif  // __cplusplus >= 201103L
 
 static void BM_DenseThreadRanges(benchmark::State& st) {
diff --git a/test/complexity_test.cc b/test/complexity_test.cc
index 14e03b06..62d1154d 100644
--- a/test/complexity_test.cc
+++ b/test/complexity_test.cc
@@ -141,7 +141,7 @@ BENCHMARK(BM_Complexity_O_N_log_N)
 BENCHMARK(BM_Complexity_O_N_log_N)
     ->RangeMultiplier(2)
     ->Range(1 << 10, 1 << 16)
-    ->Complexity([](int n) { return n * std::log2(n); });
+    ->Complexity([](int n) { return n * log2(n); });
 BENCHMARK(BM_Complexity_O_N_log_N)
     ->RangeMultiplier(2)
     ->Range(1 << 10, 1 << 16)
diff --git a/test/diagnostics_test.cc b/test/diagnostics_test.cc
index c6c235d0..1046730b 100644
--- a/test/diagnostics_test.cc
+++ b/test/diagnostics_test.cc
@@ -26,7 +26,7 @@ void TestHandler() {
 }
 
 void try_invalid_pause_resume(benchmark::State& state) {
-#if !defined(NDEBUG) && !defined(TEST_HAS_NO_EXCEPTIONS)
+#if !defined(TEST_BENCHMARK_LIBRARY_HAS_NO_ASSERTIONS) && !defined(TEST_HAS_NO_EXCEPTIONS)
   try {
     state.PauseTiming();
     std::abort();
diff --git a/test/options_test.cc b/test/options_test.cc
index bedb1cc3..bbbed288 100644
--- a/test/options_test.cc
+++ b/test/options_test.cc
@@ -1,8 +1,12 @@
 #include "benchmark/benchmark_api.h"
-
 #include <chrono>
 #include <thread>
 
+#if defined(NDEBUG)
+#undef NDEBUG
+#endif
+#include <cassert>
+
 void BM_basic(benchmark::State& state) {
   while (state.KeepRunning()) {
   }
@@ -40,4 +44,22 @@ void CustomArgs(benchmark::internal::Benchmark* b) {
 
 BENCHMARK(BM_basic)->Apply(CustomArgs);
 
+void BM_explicit_iteration_count(benchmark::State& st) {
+  // Test that benchmarks specified with an explicit iteration count are
+  // only run once.
+  static bool invoked_before = false;
+  assert(!invoked_before);
+  invoked_before = true;
+
+  // Test that the requested iteration count is respected.
+  assert(st.max_iterations == 42);
+  size_t actual_iterations = 0;
+  while (st.KeepRunning())
+    ++actual_iterations;
+  assert(st.iterations() == st.max_iterations);
+  assert(st.iterations() == 42);
+
+}
+BENCHMARK(BM_explicit_iteration_count)->Iterations(42);
+
 BENCHMARK_MAIN()
diff --git a/test/output_test.h b/test/output_test.h
index 57d4397a..897a1386 100644
--- a/test/output_test.h
+++ b/test/output_test.h
@@ -7,6 +7,8 @@
 #include <string>
 #include <utility>
 #include <vector>
+#include <functional>
+#include <sstream>
 
 #include "../src/re.h"
 #include "benchmark/benchmark.h"
@@ -58,6 +60,134 @@ int SetSubstitutions(
 // Run all output tests.
 void RunOutputTests(int argc, char* argv[]);
 
+// ========================================================================= //
+// ------------------------- Results checking ------------------------------ //
+// ========================================================================= //
+
+// Call this macro to register a benchmark for checking its results. This
+// should be all that's needed. It subscribes a function to check the (CSV)
+// results of a benchmark. This is done only after verifying that the output
+// strings are really as expected.
+// bm_name_pattern: a name or a regex pattern which will be matched against
+//                  all the benchmark names. Matching benchmarks
+//                  will be the subject of a call to checker_function
+// checker_function: should be of type ResultsCheckFn (see below)
+#define CHECK_BENCHMARK_RESULTS(bm_name_pattern, checker_function) \
+    size_t CONCAT(dummy, __LINE__) = AddChecker(bm_name_pattern, checker_function)
+
+struct Results;
+typedef std::function< void(Results const&) > ResultsCheckFn;
+
+size_t AddChecker(const char* bm_name_pattern, ResultsCheckFn fn);
+
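+// Usage sketch (mirrors test/user_counters_test.cc in this change):
+//   void CheckFoo(Results const& e) {
+//     CHECK_COUNTER_VALUE(e, int, "foo", EQ, 1);
+//   }
+//   CHECK_BENCHMARK_RESULTS("BM_Foo", &CheckFoo);
+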
+// Class holding the results of a benchmark.
+// It is passed in calls to checker functions.
+struct Results {
+
+  // the benchmark name
+  std::string name;
+  // the benchmark fields
+  std::map< std::string, std::string > values;
+
+  Results(const std::string& n) : name(n) {}
+
+  int NumThreads() const;
+
+  typedef enum { kCpuTime, kRealTime } BenchmarkTime;
+
+  // get cpu_time or real_time in seconds
+  double GetTime(BenchmarkTime which) const;
+
+  // get the real_time duration of the benchmark in seconds.
+  // it is better to use fuzzy float checks for this, as the float
+  // ASCII formatting is lossy.
+  double DurationRealTime() const {
+    return GetAs< double >("iterations") * GetTime(kRealTime);
+  }
+  // get the cpu_time duration of the benchmark in seconds
+  double DurationCPUTime() const {
+    return GetAs< double >("iterations") * GetTime(kCpuTime);
+  }
+
+  // get the string for a result by name, or nullptr if the name
+  // is not found
+  const std::string* Get(const char* entry_name) const {
+    auto it = values.find(entry_name);
+    if(it == values.end()) return nullptr;
+    return &it->second;
+  }
+
+  // get a result by name, parsed as a specific type.
+  // NOTE: for counters, use GetCounterAs instead.
+  template <class T>
+  T GetAs(const char* entry_name) const;
+
+  // counters are written as doubles, so they have to be read first
+  // as a double, and only then converted to the asked type.
+  template <class T>
+  T GetCounterAs(const char* entry_name) const {
+    double dval = GetAs< double >(entry_name);
+    T tval = static_cast< T >(dval);
+    return tval;
+  }
+};
+
+template <class T>
+T Results::GetAs(const char* entry_name) const {
+  auto *sv = Get(entry_name);
+  CHECK(sv != nullptr && !sv->empty());
+  std::stringstream ss;
+  ss << *sv;
+  T out;
+  ss >> out;
+  CHECK(!ss.fail());
+  return out;
+}
+
+//----------------------------------
+// Macros to help in result checking. Do not use them with arguments causing
+// side-effects.
+
+#define _CHECK_RESULT_VALUE(entry, getfn, var_type, var_name, relationship, value) \
+    CONCAT(CHECK_, relationship)                                        \
+    ((entry).getfn< var_type >(var_name), (value)) << "\n"              \
+    << __FILE__ << ":" << __LINE__ << ": " << (entry).name << ":\n"     \
+    << __FILE__ << ":" << __LINE__ << ": "                              \
+    << "expected (" << #var_type << ")" << (var_name)                   \
+    << "=" << (entry).getfn< var_type >(var_name)                       \
+    << " to be " #relationship " to " << (value) << "\n"
+
+// check with tolerance. eps_factor is the tolerance window, which is
+// interpreted relative to value (eg, 0.1 means 10% of value).
+#define _CHECK_FLOAT_RESULT_VALUE(entry, getfn, var_type, var_name, relationship, value, eps_factor) \
+    CONCAT(CHECK_FLOAT_, relationship)                                  \
+    ((entry).getfn< var_type >(var_name), (value), (eps_factor) * (value)) << "\n" \
+    << __FILE__ << ":" << __LINE__ << ": " << (entry).name << ":\n"     \
+    << __FILE__ << ":" << __LINE__ << ": "                              \
+    << "expected (" << #var_type << ")" << (var_name)                   \
+    << "=" << (entry).getfn< var_type >(var_name)                       \
+    << " to be " #relationship " to " << (value) << "\n"                \
+    << __FILE__ << ":" << __LINE__ << ": "                              \
+    << "with tolerance of " << (eps_factor) * (value)                   \
+    << " (" << (eps_factor)*100. << "%), "                              \
+    << "but delta was " << ((entry).getfn< var_type >(var_name) - (value)) \
+    << " (" << (((entry).getfn< var_type >(var_name) - (value))         \
+               /                                                        \
+               ((value) > 1.e-5 || (value) < -1.e-5 ? (value) : 1.e-5)*100.) \
+    << "%)"
+
+#define CHECK_RESULT_VALUE(entry, var_type, var_name, relationship, value) \
+    _CHECK_RESULT_VALUE(entry, GetAs, var_type, var_name, relationship, value)
+
+#define CHECK_COUNTER_VALUE(entry, var_type, var_name, relationship, value) \
+    _CHECK_RESULT_VALUE(entry, GetCounterAs, var_type, var_name, relationship, value)
+
+#define CHECK_FLOAT_RESULT_VALUE(entry, var_name, relationship, value, eps_factor) \
+    _CHECK_FLOAT_RESULT_VALUE(entry, GetAs, double, var_name, relationship, value, eps_factor)
+
+#define CHECK_FLOAT_COUNTER_VALUE(entry, var_name, relationship, value, eps_factor) \
+    _CHECK_FLOAT_RESULT_VALUE(entry, GetCounterAs, double, var_name, relationship, value, eps_factor)
+
 // ========================================================================= //
 // --------------------------- Misc Utilities ------------------------------ //
 // ========================================================================= //
diff --git a/test/output_test_helper.cc b/test/output_test_helper.cc
index 721d39f9..fe750c7e 100644
--- a/test/output_test_helper.cc
+++ b/test/output_test_helper.cc
@@ -2,6 +2,7 @@
 #include <map>
 #include <memory>
 #include <sstream>
+#include <cstring>
 
 #include "../src/check.h"  // NOTE: check.h is for internal use only!
 #include "../src/re.h"     // NOTE: re.h is for internal use only
@@ -31,21 +32,29 @@ TestCaseList& GetTestCaseList(TestCaseID ID) {
 
 SubMap& GetSubstitutions() {
   // Don't use 'dec_re' from header because it may not yet be initialized.
-  static std::string dec_re = "[0-9]*[.]?[0-9]+([eE][-+][0-9]+)?";
+  static std::string safe_dec_re = "[0-9]*[.]?[0-9]+([eE][-+][0-9]+)?";
   static SubMap map = {
       {"%float", "[0-9]*[.]?[0-9]+([eE][-+][0-9]+)?"},
+      // human-readable float
+      {"%hrfloat", "[0-9]*[.]?[0-9]+([eE][-+][0-9]+)?[kMGTPEZYmunpfazy]?"},
       {"%int", "[ ]*[0-9]+"},
       {" %s ", "[ ]+"},
       {"%time", "[ ]*[0-9]{1,5} ns"},
       {"%console_report", "[ ]*[0-9]{1,5} ns [ ]*[0-9]{1,5} ns [ ]*[0-9]+"},
       {"%console_us_report", "[ ]*[0-9] us [ ]*[0-9] us [ ]*[0-9]+"},
-      {"%csv_report", "[0-9]+," + dec_re + "," + dec_re + ",ns,,,,,"},
-      {"%csv_us_report", "[0-9]+," + dec_re + "," + dec_re + ",us,,,,,"},
+      {"%csv_header",
+       "name,iterations,real_time,cpu_time,time_unit,bytes_per_second,"
+       "items_per_second,label,error_occurred,error_message"},
+      {"%csv_report", "[0-9]+," + safe_dec_re + "," + safe_dec_re + ",ns,,,,,"},
+      {"%csv_us_report", "[0-9]+," + safe_dec_re + "," + safe_dec_re + ",us,,,,,"},
       {"%csv_bytes_report",
-       "[0-9]+," + dec_re + "," + dec_re + ",ns," + dec_re + ",,,,"},
+       "[0-9]+," + safe_dec_re + "," + safe_dec_re + ",ns," + safe_dec_re + ",,,,"},
       {"%csv_items_report",
-       "[0-9]+," + dec_re + "," + dec_re + ",ns,," + dec_re + ",,,"},
-      {"%csv_label_report_begin", "[0-9]+," + dec_re + "," + dec_re + ",ns,,,"},
+       "[0-9]+," + safe_dec_re + "," + safe_dec_re + ",ns,," + safe_dec_re + ",,,"},
+      {"%csv_bytes_items_report",
+       "[0-9]+," + safe_dec_re + "," + safe_dec_re + ",ns," + safe_dec_re +
+       "," + safe_dec_re + ",,,"},
+      {"%csv_label_report_begin", "[0-9]+," + safe_dec_re + "," + safe_dec_re + ",ns,,,"},
       {"%csv_label_report_end", ",,"}};
   return map;
 }
@@ -140,8 +149,180 @@ class TestReporter : public benchmark::BenchmarkReporter {
   std::vector<benchmark::BenchmarkReporter *> reporters_;
 };
 }
+
 }  // end namespace internal
 
+// ========================================================================= //
+// -------------------------- Results checking ----------------------------- //
+// ========================================================================= //
+
+namespace internal {
+
+// Utility class to manage subscribers for checking benchmark results.
+// It works by parsing the CSV output to read the results.
+class ResultsChecker {
+ public:
+
+  struct PatternAndFn : public TestCase { // reusing TestCase for its regexes
+    PatternAndFn(const std::string& rx, ResultsCheckFn fn_)
+    : TestCase(rx), fn(fn_) {}
+    ResultsCheckFn fn;
+  };
+
+  std::vector< PatternAndFn > check_patterns;
+  std::vector< Results > results;
+  std::vector< std::string > field_names;
+
+  void Add(const std::string& entry_pattern, ResultsCheckFn fn);
+
+  void CheckResults(std::stringstream& output);
+
+ private:
+
+  void SetHeader_(const std::string& csv_header);
+  void SetValues_(const std::string& entry_csv_line);
+
+  std::vector< std::string > SplitCsv_(const std::string& line);
+
+};
+
+// store the static ResultsChecker in a function to prevent initialization
+// order problems
+ResultsChecker& GetResultsChecker() {
+  static ResultsChecker rc;
+  return rc;
+}
+
+// add a results checker for a benchmark
+void ResultsChecker::Add(const std::string& entry_pattern, ResultsCheckFn fn) {
+  check_patterns.emplace_back(entry_pattern, fn);
+}
+
+// check the results of all subscribed benchmarks
+void ResultsChecker::CheckResults(std::stringstream& output) {
+  // first reset the stream to the start
+  {
+    auto start = std::ios::streampos(0);
+    // clear before calling tellg()
+    output.clear();
+    // seek to zero only when needed
+    if(output.tellg() > start) output.seekg(start);
+    // and just in case
+    output.clear();
+  }
+  // now go over every line and publish it to the ResultsChecker
+  std::string line;
+  bool on_first = true;
+  while (output.eof() == false) {
+    CHECK(output.good());
+    std::getline(output, line);
+    if (on_first) {
+      SetHeader_(line); // this is important
+      on_first = false;
+      continue;
+    }
+    SetValues_(line);
+  }
+  // finally we can call the subscribed check functions
+  for(const auto& p : check_patterns) {
+    VLOG(2) << "--------------------------------\n";
+    VLOG(2) << "checking for benchmarks matching " << p.regex_str << "...\n";
+    for(const auto& r : results) {
+      if(!p.regex->Match(r.name)) {
+        VLOG(2) << p.regex_str << " is not matched by " << r.name << "\n";
+        continue;
+      }
+      else {
+        VLOG(2) << p.regex_str << " is matched by " << r.name << "\n";
+      }
+      VLOG(1) << "Checking results of " << r.name << ": ... \n";
+      p.fn(r);
+      VLOG(1) << "Checking results of " << r.name << ": OK.\n";
+    }
+  }
+}
+
+// prepare for the names in this header
+void ResultsChecker::SetHeader_(const std::string& csv_header) {
+  field_names = SplitCsv_(csv_header);
+}
+
+// set the values for a benchmark
+void ResultsChecker::SetValues_(const std::string& entry_csv_line) {
+  if(entry_csv_line.empty()) return; // some lines are empty
+  CHECK(!field_names.empty());
+  auto vals = SplitCsv_(entry_csv_line);
+  CHECK_EQ(vals.size(), field_names.size());
+  results.emplace_back(vals[0]); // vals[0] is the benchmark name
+  auto &entry = results.back();
+  for (size_t i = 1, e = vals.size(); i < e; ++i) {
+    entry.values[field_names[i]] = vals[i];
+  }
+}
+
+// a quick'n'dirty csv splitter (eliminating quotes)
+std::vector< std::string > ResultsChecker::SplitCsv_(const std::string& line) {
+  std::vector< std::string > out;
+  if(line.empty()) return out;
+  if(!field_names.empty()) out.reserve(field_names.size());
+  size_t prev = 0, pos = line.find_first_of(','), curr = pos;
+  while(pos != line.npos) {
+    CHECK(curr > 0);
+    if(line[prev] == '"') ++prev;
+    if(line[curr-1] == '"') --curr;
+    out.push_back(line.substr(prev, curr-prev));
+    prev = pos + 1;
+    pos = line.find_first_of(',', pos + 1);
+    curr = pos;
+  }
+  curr = line.size();
+  if(line[prev] == '"') ++prev;
+  if(line[curr-1] == '"') --curr;
+  out.push_back(line.substr(prev, curr-prev));
+  return out;
+}
+
+}  // end namespace internal
+
+size_t AddChecker(const char* bm_name, ResultsCheckFn fn)
+{
+  auto &rc = internal::GetResultsChecker();
+  rc.Add(bm_name, fn);
+  return rc.results.size();
+}
+
+int Results::NumThreads() const {
+  auto pos = name.find("/threads:");
+  if(pos == name.npos) return 1;
+  auto end = name.find('/', pos + 9);
+  std::stringstream ss;
+  ss << name.substr(pos + 9, end - (pos + 9));
+  int num = 1;
+  ss >> num;
+  CHECK(!ss.fail());
+  return num;
+}
+
+double Results::GetTime(BenchmarkTime which) const {
+  CHECK(which == kCpuTime || which == kRealTime);
+  const char *which_str = which == kCpuTime ? "cpu_time" : "real_time";
+  double val = GetAs< double >(which_str);
+  auto unit = Get("time_unit");
+  CHECK(unit);
+  if(*unit == "ns") {
+    return val * 1.e-9;
+  } else if(*unit == "us") {
+    return val * 1.e-6;
+  } else if(*unit == "ms") {
+    return val * 1.e-3;
+  } else if(*unit == "s") {
+    return val;
+  } else {
+    CHECK(1 == 0) << "unknown time unit: " << *unit;
+    return 0;
+  }
+}
+
 // ========================================================================= //
 // -------------------------- Public API Definitions------------------------ //
 // ========================================================================= //
@@ -231,4 +412,11 @@ void RunOutputTests(int argc, char* argv[]) {
 
     std::cout << "\n";
   }
+
+  // now that we know the output is as expected, we can dispatch
+  // the checks to subscribees.
+  auto &csv = TestCases[2];
+  // would use == but gcc spits a warning
+  CHECK(std::strcmp(csv.name, "CSVReporter") == 0);
+  internal::GetResultsChecker().CheckResults(csv.out_stream);
 }
diff --git a/test/register_benchmark_test.cc b/test/register_benchmark_test.cc
index e9f8ea53..c0ecad7d 100644
--- a/test/register_benchmark_test.cc
+++ b/test/register_benchmark_test.cc
@@ -114,14 +114,14 @@ void TestRegistrationAtRuntime() {
 #endif
 #ifndef BENCHMARK_HAS_NO_VARIADIC_REGISTER_BENCHMARK
   {
-    int x = 42;
+    const char* x = "42";
     auto capturing_lam = [=](benchmark::State& st) {
       while (st.KeepRunning()) {
       }
-      st.SetLabel(std::to_string(x));
+      st.SetLabel(x);
     };
     benchmark::RegisterBenchmark("lambda_benchmark", capturing_lam);
-    AddCases({{"lambda_benchmark", "42"}});
+    AddCases({{"lambda_benchmark", x}});
   }
 #endif
 }
diff --git a/test/reporter_output_test.cc b/test/reporter_output_test.cc
index cb52aec0..4a481433 100644
--- a/test/reporter_output_test.cc
+++ b/test/reporter_output_test.cc
@@ -13,9 +13,7 @@ ADD_CASES(TC_ConsoleOut,
           {{"^[-]+$", MR_Next},
            {"^Benchmark %s Time %s CPU %s Iterations$", MR_Next},
            {"^[-]+$", MR_Next}});
-ADD_CASES(TC_CSVOut,
-          {{"name,iterations,real_time,cpu_time,time_unit,bytes_per_second,"
-            "items_per_second,label,error_occurred,error_message"}});
+ADD_CASES(TC_CSVOut, {{"%csv_header"}});
 
 // ========================================================================= //
 // ------------------------ Testing Basic Output --------------------------- //
diff --git a/test/user_counters_test.cc b/test/user_counters_test.cc
new file mode 100644
index 00000000..66df48b3
--- /dev/null
+++ b/test/user_counters_test.cc
@@ -0,0 +1,217 @@
+
+#undef NDEBUG
+
+#include "benchmark/benchmark.h"
+#include "output_test.h"
+
+// ========================================================================= //
+// ---------------------- Testing Prologue Output -------------------------- //
+// ========================================================================= //
+
+ADD_CASES(TC_ConsoleOut,
+          {{"^[-]+$", MR_Next},
+           {"^Benchmark %s Time %s CPU %s Iterations UserCounters...$", MR_Next},
+           {"^[-]+$", MR_Next}});
+ADD_CASES(TC_CSVOut, {{"%csv_header,\"bar\",\"foo\""}});
+
+// ========================================================================= //
+// ------------------------- Simple Counters Output ------------------------ //
+// ========================================================================= //
+
+void BM_Counters_Simple(benchmark::State& state) {
+  while (state.KeepRunning()) {
+  }
+  state.counters["foo"] = 1;
+  state.counters["bar"] = 2 * (double)state.iterations();
+}
+BENCHMARK(BM_Counters_Simple);
+ADD_CASES(TC_ConsoleOut, {{"^BM_Counters_Simple %console_report bar=%hrfloat foo=%hrfloat$"}});
+ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_Counters_Simple\",$"},
+                       {"\"iterations\": %int,$", MR_Next},
+                       {"\"real_time\": %int,$", MR_Next},
+                       {"\"cpu_time\": %int,$", MR_Next},
+                       {"\"time_unit\": \"ns\",$", MR_Next},
+                       {"\"bar\": %float,$", MR_Next},
+                       {"\"foo\": %float$", MR_Next},
+                       {"}", MR_Next}});
+ADD_CASES(TC_CSVOut, {{"^\"BM_Counters_Simple\",%csv_report,%float,%float$"}});
+// VS2013 does not allow this function to be passed as a lambda argument
+// to CHECK_BENCHMARK_RESULTS()
+void CheckSimple(Results const& e) {
+  double its = e.GetAs< double >("iterations");
+  CHECK_COUNTER_VALUE(e, int, "foo", EQ, 1);
+  // check that the value of bar is within 0.1% of the expected value
+  CHECK_FLOAT_COUNTER_VALUE(e, "bar", EQ, 2.*its, 0.001);
+}
+CHECK_BENCHMARK_RESULTS("BM_Counters_Simple", &CheckSimple);
+
+// ========================================================================= //
+// --------------------- Counters+Items+Bytes/s Output --------------------- //
+// ========================================================================= //
+
+namespace { int num_calls1 = 0; }
+void BM_Counters_WithBytesAndItemsPSec(benchmark::State& state) {
+  while (state.KeepRunning()) {
+  }
+  state.counters["foo"] = 1;
+  state.counters["bar"] = ++num_calls1;
+  state.SetBytesProcessed(364);
+  state.SetItemsProcessed(150);
+}
+BENCHMARK(BM_Counters_WithBytesAndItemsPSec);
+ADD_CASES(TC_ConsoleOut,
+          {{"^BM_Counters_WithBytesAndItemsPSec %console_report "
+            "bar=%hrfloat foo=%hrfloat +%hrfloatB/s +%hrfloat items/s$"}});
+ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_Counters_WithBytesAndItemsPSec\",$"},
+                       {"\"iterations\": %int,$", MR_Next},
+                       {"\"real_time\": %int,$", MR_Next},
+                       {"\"cpu_time\": %int,$", MR_Next},
+                       {"\"time_unit\": \"ns\",$", MR_Next},
+                       {"\"bytes_per_second\": %int,$", MR_Next},
+                       {"\"items_per_second\": %int,$", MR_Next},
+                       {"\"bar\": %float,$", MR_Next},
+                       {"\"foo\": %float$", MR_Next},
+                       {"}", MR_Next}});
+ADD_CASES(TC_CSVOut, {{"^\"BM_Counters_WithBytesAndItemsPSec\","
+                       "%csv_bytes_items_report,%float,%float$"}});
+// VS2013 does not allow a lambda to be passed as the argument to
+// CHECK_BENCHMARK_RESULTS(), so a named function is used instead.
+void CheckBytesAndItemsPSec(Results const& e) {
+  // the per-second rates are computed over CPU time, not real time
+  double t = e.DurationCPUTime();
+  CHECK_COUNTER_VALUE(e, int, "foo", EQ, 1);
+  CHECK_COUNTER_VALUE(e, int, "bar", EQ, num_calls1);
+  // check that the values are within 0.1% of the expected values
+  CHECK_FLOAT_RESULT_VALUE(e, "bytes_per_second", EQ, 364./t, 0.001);
+  CHECK_FLOAT_RESULT_VALUE(e, "items_per_second", EQ, 150./t, 0.001);
+}
+CHECK_BENCHMARK_RESULTS("BM_Counters_WithBytesAndItemsPSec",
+                        &CheckBytesAndItemsPSec);
+
+// ========================================================================= //
+// ------------------------- Rate Counters Output -------------------------- //
+// ========================================================================= //
+
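+// Counter::kIsRate marks a counter to be reported divided by the elapsed
+// time, which is why CheckRate below compares against 1/t and 2/t.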
+void BM_Counters_Rate(benchmark::State& state) {
+  while (state.KeepRunning()) {
+  }
+  namespace bm = benchmark;
+  state.counters["foo"] = bm::Counter{1, bm::Counter::kIsRate};
+  state.counters["bar"] = bm::Counter{2, bm::Counter::kIsRate};
+}
+BENCHMARK(BM_Counters_Rate);
+ADD_CASES(TC_ConsoleOut, {{"^BM_Counters_Rate %console_report bar=%hrfloat/s foo=%hrfloat/s$"}});
+ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_Counters_Rate\",$"},
+                       {"\"iterations\": %int,$", MR_Next},
+                       {"\"real_time\": %int,$", MR_Next},
+                       {"\"cpu_time\": %int,$", MR_Next},
+                       {"\"time_unit\": \"ns\",$", MR_Next},
+                       {"\"bar\": %float,$", MR_Next},
+                       {"\"foo\": %float$", MR_Next},
+                       {"}", MR_Next}});
+ADD_CASES(TC_CSVOut, {{"^\"BM_Counters_Rate\",%csv_report,%float,%float$"}});
+// VS2013 does not allow a lambda to be passed as the argument to
+// CHECK_BENCHMARK_RESULTS(), so a named function is used instead.
+void CheckRate(Results const& e) {
+  // rate counters are divided by CPU time, not real time
+  double t = e.DurationCPUTime();
+  // check that the values are within 0.1% of the expected values
+  CHECK_FLOAT_COUNTER_VALUE(e, "foo", EQ, 1./t, 0.001);
+  CHECK_FLOAT_COUNTER_VALUE(e, "bar", EQ, 2./t, 0.001);
+}
+CHECK_BENCHMARK_RESULTS("BM_Counters_Rate", &CheckRate);
+
+// ========================================================================= //
+// ------------------------- Thread Counters Output ------------------------ //
+// ========================================================================= //
+
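+// Counters with default flags are summed across threads, hence the checks
+// below expect foo == #threads and bar == 2 * #threads.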
+void BM_Counters_Threads(benchmark::State& state) {
+  while (state.KeepRunning()) {
+  }
+  state.counters["foo"] = 1;
+  state.counters["bar"] = 2;
+}
+BENCHMARK(BM_Counters_Threads)->ThreadRange(1, 8);
+ADD_CASES(TC_ConsoleOut, {{"^BM_Counters_Threads/threads:%int %console_report bar=%hrfloat foo=%hrfloat$"}});
+ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_Counters_Threads/threads:%int\",$"},
+                       {"\"iterations\": %int,$", MR_Next},
+                       {"\"real_time\": %int,$", MR_Next},
+                       {"\"cpu_time\": %int,$", MR_Next},
+                       {"\"time_unit\": \"ns\",$", MR_Next},
+                       {"\"bar\": %float,$", MR_Next},
+                       {"\"foo\": %float$", MR_Next},
+                       {"}", MR_Next}});
+ADD_CASES(TC_CSVOut, {{"^\"BM_Counters_Threads/threads:%int\",%csv_report,%float,%float$"}});
+// VS2013 does not allow a lambda to be passed as the argument to
+// CHECK_BENCHMARK_RESULTS(), so a named function is used instead.
+void CheckThreads(Results const& e) {
+  CHECK_COUNTER_VALUE(e, int, "foo", EQ, e.NumThreads());
+  CHECK_COUNTER_VALUE(e, int, "bar", EQ, 2 * e.NumThreads());
+}
+CHECK_BENCHMARK_RESULTS("BM_Counters_Threads/threads:%int", &CheckThreads);
+
+// ========================================================================= //
+// ---------------------- ThreadAvg Counters Output ------------------------ //
+// ========================================================================= //
+
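+// Counter::kAvgThreads averages the summed counter over the number of
+// threads, so the reported values stay at 1 and 2 for any thread count.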
+void BM_Counters_AvgThreads(benchmark::State& state) {
+  while (state.KeepRunning()) {
+  }
+  namespace bm = benchmark;
+  state.counters["foo"] = bm::Counter{1, bm::Counter::kAvgThreads};
+  state.counters["bar"] = bm::Counter{2, bm::Counter::kAvgThreads};
+}
+BENCHMARK(BM_Counters_AvgThreads)->ThreadRange(1, 8);
+ADD_CASES(TC_ConsoleOut, {{"^BM_Counters_AvgThreads/threads:%int %console_report bar=%hrfloat foo=%hrfloat$"}});
+ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_Counters_AvgThreads/threads:%int\",$"},
+                       {"\"iterations\": %int,$", MR_Next},
+                       {"\"real_time\": %int,$", MR_Next},
+                       {"\"cpu_time\": %int,$", MR_Next},
+                       {"\"time_unit\": \"ns\",$", MR_Next},
+                       {"\"bar\": %float,$", MR_Next},
+                       {"\"foo\": %float$", MR_Next},
+                       {"}", MR_Next}});
+ADD_CASES(TC_CSVOut, {{"^\"BM_Counters_AvgThreads/threads:%int\",%csv_report,%float,%float$"}});
+// VS2013 does not allow a lambda to be passed as the argument to
+// CHECK_BENCHMARK_RESULTS(), so a named function is used instead.
+void CheckAvgThreads(Results const& e) {
+  CHECK_COUNTER_VALUE(e, int, "foo", EQ, 1);
+  CHECK_COUNTER_VALUE(e, int, "bar", EQ, 2);
+}
+CHECK_BENCHMARK_RESULTS("BM_Counters_AvgThreads/threads:%int",
+                        &CheckAvgThreads);
+
+// ========================================================================= //
+// -------------------- ThreadAvgRate Counters Output ---------------------- //
+// ========================================================================= //
+
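+// Counter::kAvgThreadsRate combines kAvgThreads and kIsRate: average over
+// the threads, then divide by the elapsed CPU time.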
+void BM_Counters_AvgThreadsRate(benchmark::State& state) {
+  while (state.KeepRunning()) {
+  }
+  namespace bm = benchmark;
+  state.counters["foo"] = bm::Counter{1, bm::Counter::kAvgThreadsRate};
+  state.counters["bar"] = bm::Counter{2, bm::Counter::kAvgThreadsRate};
+}
+BENCHMARK(BM_Counters_AvgThreadsRate)->ThreadRange(1, 8);
+ADD_CASES(TC_ConsoleOut, {{"^BM_Counters_AvgThreadsRate/threads:%int %console_report bar=%hrfloat/s foo=%hrfloat/s$"}});
+ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_Counters_AvgThreadsRate/threads:%int\",$"},
+                       {"\"iterations\": %int,$", MR_Next},
+                       {"\"real_time\": %int,$", MR_Next},
+                       {"\"cpu_time\": %int,$", MR_Next},
+                       {"\"time_unit\": \"ns\",$", MR_Next},
+                       {"\"bar\": %float,$", MR_Next},
+                       {"\"foo\": %float$", MR_Next},
+                       {"}", MR_Next}});
+ADD_CASES(TC_CSVOut, {{"^\"BM_Counters_AvgThreadsRate/threads:%int\",%csv_report,%float,%float$"}});
+// VS2013 does not allow a lambda to be passed as the argument to
+// CHECK_BENCHMARK_RESULTS(), so a named function is used instead.
+void CheckAvgThreadsRate(Results const& e) {
+  CHECK_FLOAT_COUNTER_VALUE(e, "foo", EQ, 1./e.DurationCPUTime(), 0.001);
+  CHECK_FLOAT_COUNTER_VALUE(e, "bar", EQ, 2./e.DurationCPUTime(), 0.001);
+}
+CHECK_BENCHMARK_RESULTS("BM_Counters_AvgThreadsRate/threads:%int",
+                        &CheckAvgThreadsRate);
+
+// ========================================================================= //
+// --------------------------- TEST CASES END ------------------------------ //
+// ========================================================================= //
+
+int main(int argc, char* argv[]) { RunOutputTests(argc, argv); }
diff --git a/tools/compare_bench.py b/tools/compare_bench.py
index 8a7e7991..d54baaa0 100755
--- a/tools/compare_bench.py
+++ b/tools/compare_bench.py
@@ -59,7 +59,7 @@ def main():
     json1 = gbench.util.run_or_load_benchmark(test1, benchmark_options)
     json2 = gbench.util.run_or_load_benchmark(test2, benchmark_options)
     output_lines = gbench.report.generate_difference_report(json1, json2)
-    print 'Comparing %s to %s' % (test1, test2)
+    print('Comparing %s to %s' % (test1, test2))
     for ln in output_lines:
         print(ln)
 
diff --git a/tools/gbench/report.py b/tools/gbench/report.py
index 6776d037..015d33d9 100644
--- a/tools/gbench/report.py
+++ b/tools/gbench/report.py
@@ -80,7 +80,10 @@ def generate_difference_report(json1, json2, use_color=True):
     first_line = "{:<{}s}     Time           CPU           Old           New".format(
         'Benchmark', first_col_width)
     output_strs = [first_line, '-' * len(first_line)]
-    for bn in json1['benchmarks']:
+
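+    # Only compare entries that actually carry timing data.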
+    gen = (bn for bn in json1['benchmarks'] if 'real_time' in bn and 'cpu_time' in bn)
+    for bn in gen:
         other_bench = find_test(bn['name'])
         if not other_bench:
             continue
@@ -132,7 +135,7 @@ class TestReportDifference(unittest.TestCase):
         json1, json2 = self.load_results()
         output_lines_with_header = generate_difference_report(json1, json2, use_color=False)
         output_lines = output_lines_with_header[2:]
-        print "\n".join(output_lines_with_header)
+        print("\n".join(output_lines_with_header))
         self.assertEqual(len(output_lines), len(expect_lines))
-        for i in xrange(0, len(output_lines)):
+        for i in range(0, len(output_lines)):
             parts = [x for x in output_lines[i].split(' ') if x]
diff --git a/tools/gbench/util.py b/tools/gbench/util.py
index bfce376c..07c23772 100644
--- a/tools/gbench/util.py
+++ b/tools/gbench/util.py
@@ -20,21 +20,22 @@ def is_executable_file(filename):
     """
     if not os.path.isfile(filename):
         return False
-    with open(filename, 'r') as f:
+    with open(filename, mode='rb') as f:
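+        # Binary mode: f.read() returns bytes, matching the literals below.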
         magic_bytes = f.read(_num_magic_bytes)
     if sys.platform == 'darwin':
         return magic_bytes in [
-            '\xfe\xed\xfa\xce',  # MH_MAGIC
-            '\xce\xfa\xed\xfe',  # MH_CIGAM
-            '\xfe\xed\xfa\xcf',  # MH_MAGIC_64
-            '\xcf\xfa\xed\xfe',  # MH_CIGAM_64
-            '\xca\xfe\xba\xbe',  # FAT_MAGIC
-            '\xbe\xba\xfe\xca'   # FAT_CIGAM
+            b'\xfe\xed\xfa\xce',  # MH_MAGIC
+            b'\xce\xfa\xed\xfe',  # MH_CIGAM
+            b'\xfe\xed\xfa\xcf',  # MH_MAGIC_64
+            b'\xcf\xfa\xed\xfe',  # MH_CIGAM_64
+            b'\xca\xfe\xba\xbe',  # FAT_MAGIC
+            b'\xbe\xba\xfe\xca'   # FAT_CIGAM
         ]
     elif sys.platform.startswith('win'):
-        return magic_bytes == 'MZ'
+        return magic_bytes == b'MZ'
     else:
-        return magic_bytes == '\x7FELF'
+        return magic_bytes == b'\x7FELF'
 
 
 def is_json_file(filename):
@@ -68,7 +69,7 @@ def classify_input_file(filename):
     elif is_json_file(filename):
         ftype = IT_JSON
     else:
-        err_msg = "'%s' does not name a valid benchmark executable or JSON file"
+        err_msg = "'%s' does not name a valid benchmark executable or JSON file" % filename
     return ftype, err_msg
 
 
@@ -80,7 +81,7 @@ def check_input_file(filename):
     """
     ftype, msg = classify_input_file(filename)
     if ftype == IT_Invalid:
-        print "Invalid input file: %s" % msg
+        print("Invalid input file: %s" % msg)
         sys.exit(1)
     return ftype