mirror of https://github.com/google/benchmark.git synced 2025-04-29 22:40:33 +08:00

Merge branch 'google:main' into master

Author: krzikalla, 2023-09-15 15:43:08 +02:00, committed by GitHub
Commit: a75f6cfffc (GPG key ID 4AEE18F83AFDEB23; no known key found for this signature in database)
167 changed files with 10698 additions and 3553 deletions
.clang-tidy
.github
.gitignore
.travis-libcxx-setup.sh
.travis.yml
AUTHORS
BUILD.bazel
CMakeLists.txt
CONTRIBUTORS
MODULE.bazel
README.md
WORKSPACE
WORKSPACE.bzlmod
_config.yml
bazel
bindings/python
cmake
conan
conanfile.py
dependencies.md
docs
include/benchmark
pyproject.toml
setup.py
src

.clang-tidy (new file, 7 lines)

@ -0,0 +1,7 @@
---
Checks: 'clang-analyzer-*,readability-redundant-*,performance-*'
WarningsAsErrors: 'clang-analyzer-*,readability-redundant-*,performance-*'
HeaderFilterRegex: '.*'
AnalyzeTemporaryDtors: false
FormatStyle: none
User: user

.github/install_bazel.sh (new file, 13 lines)

@ -0,0 +1,13 @@
if ! bazel version; then
arch=$(uname -m)
if [ "$arch" == "aarch64" ]; then
arch="arm64"
fi
echo "Installing wget and downloading $arch Bazel binary from GitHub releases."
yum install -y wget
wget "https://github.com/bazelbuild/bazel/releases/download/6.3.0/bazel-6.3.0-linux-$arch" -O /usr/local/bin/bazel
chmod +x /usr/local/bin/bazel
else
# bazel is installed for the correct architecture
exit 0
fi

.github/libcxx-setup.sh (new executable file, 26 lines)

@ -0,0 +1,26 @@
#!/usr/bin/env bash
set -e
# Checkout LLVM sources
git clone --depth=1 https://github.com/llvm/llvm-project.git llvm-project
## Setup libc++ options
if [ -z "$BUILD_32_BITS" ]; then
export BUILD_32_BITS=OFF && echo disabling 32 bit build
fi
## Build and install libc++ (Use unstable ABI for better sanitizer coverage)
mkdir llvm-build && cd llvm-build
cmake -DCMAKE_C_COMPILER=${CC} \
-DCMAKE_CXX_COMPILER=${CXX} \
-DCMAKE_BUILD_TYPE=RelWithDebInfo \
-DCMAKE_INSTALL_PREFIX=/usr \
-DLIBCXX_ABI_UNSTABLE=OFF \
-DLLVM_USE_SANITIZER=${LIBCXX_SANITIZER} \
-DLLVM_BUILD_32_BITS=${BUILD_32_BITS} \
-DLLVM_ENABLE_RUNTIMES='libcxx;libcxxabi;libunwind' \
-G "Unix Makefiles" \
../llvm-project/runtimes/
make -j cxx cxxabi unwind
cd ..

.github/workflows/bazel.yml (new file, 35 lines)

@ -0,0 +1,35 @@
name: bazel
on:
push: {}
pull_request: {}
jobs:
build_and_test_default:
name: bazel.${{ matrix.os }}.${{ matrix.bzlmod && 'bzlmod' || 'no_bzlmod' }}
runs-on: ${{ matrix.os }}
strategy:
fail-fast: false
matrix:
os: [ubuntu-latest, macos-latest, windows-latest]
bzlmod: [false, true]
steps:
- uses: actions/checkout@v3
- name: mount bazel cache
uses: actions/cache@v3
env:
cache-name: bazel-cache
with:
path: "~/.cache/bazel"
key: ${{ env.cache-name }}-${{ matrix.os }}-${{ github.ref }}
restore-keys: |
${{ env.cache-name }}-${{ matrix.os }}-main
- name: build
run: |
bazel build ${{ matrix.bzlmod && '--enable_bzlmod' || '--noenable_bzlmod' }} //:benchmark //:benchmark_main //test/...
- name: test
run: |
bazel test ${{ matrix.bzlmod && '--enable_bzlmod' || '--noenable_bzlmod' }} --test_output=all //test/...


@ -0,0 +1,46 @@
name: build-and-test-min-cmake
on:
push:
branches: [ main ]
pull_request:
branches: [ main ]
jobs:
job:
name: ${{ matrix.os }}.min-cmake
runs-on: ${{ matrix.os }}
strategy:
fail-fast: false
matrix:
os: [ubuntu-latest, macos-latest]
steps:
- uses: actions/checkout@v3
- uses: lukka/get-cmake@latest
with:
cmakeVersion: 3.10.0
- name: create build environment
run: cmake -E make_directory ${{ runner.workspace }}/_build
- name: setup cmake initial cache
run: touch compiler-cache.cmake
- name: configure cmake
env:
CXX: ${{ matrix.compiler }}
shell: bash
working-directory: ${{ runner.workspace }}/_build
run: >
cmake -C ${{ github.workspace }}/compiler-cache.cmake
$GITHUB_WORKSPACE
-DBENCHMARK_DOWNLOAD_DEPENDENCIES=ON
-DCMAKE_CXX_VISIBILITY_PRESET=hidden
-DCMAKE_VISIBILITY_INLINES_HIDDEN=ON
- name: build
shell: bash
working-directory: ${{ runner.workspace }}/_build
run: cmake --build .


@ -0,0 +1,51 @@
name: build-and-test-perfcounters
on:
push:
branches: [ main ]
pull_request:
branches: [ main ]
jobs:
job:
# TODO(dominic): Extend this to include compiler and set through env: CC/CXX.
name: ${{ matrix.os }}.${{ matrix.build_type }}
runs-on: ${{ matrix.os }}
strategy:
fail-fast: false
matrix:
os: [ubuntu-22.04, ubuntu-20.04]
build_type: ['Release', 'Debug']
steps:
- uses: actions/checkout@v3
- name: install libpfm
run: |
sudo apt update
sudo apt -y install libpfm4-dev
- name: create build environment
run: cmake -E make_directory ${{ runner.workspace }}/_build
- name: configure cmake
shell: bash
working-directory: ${{ runner.workspace }}/_build
run: >
cmake $GITHUB_WORKSPACE
-DBENCHMARK_ENABLE_LIBPFM=1
-DBENCHMARK_DOWNLOAD_DEPENDENCIES=ON
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
- name: build
shell: bash
working-directory: ${{ runner.workspace }}/_build
run: cmake --build . --config ${{ matrix.build_type }}
# Skip testing, for now. It seems perf_event_open does not succeed on the
# hosting machine, very likely a permissions issue.
# TODO(mtrofin): Enable test.
# - name: test
# shell: bash
# working-directory: ${{ runner.workspace }}/_build
# run: ctest -C ${{ matrix.build_type }} --rerun-failed --output-on-failure


@ -2,37 +2,113 @@ name: build-and-test
on:
push:
branches: [ master ]
branches: [ main ]
pull_request:
branches: [ master ]
branches: [ main ]
jobs:
# TODO: add 32-bit builds (g++ and clang++) for ubuntu
# (requires g++-multilib and libc6:i386)
# TODO: add coverage build (requires lcov)
# TODO: add clang + libc++ builds for ubuntu
job:
# TODO(dominic): Extend this to include compiler and set through env: CC/CXX.
name: ${{ matrix.os }}.${{ matrix.build_type }}
name: ${{ matrix.os }}.${{ matrix.build_type }}.${{ matrix.lib }}.${{ matrix.compiler }}
runs-on: ${{ matrix.os }}
strategy:
fail-fast: false
matrix:
os: [ubuntu-latest, ubuntu-16.04, ubuntu-20.04, macos-latest, windows-latest]
os: [ubuntu-22.04, ubuntu-20.04, macos-latest]
build_type: ['Release', 'Debug']
compiler: ['g++', 'clang++']
lib: ['shared', 'static']
steps:
- uses: actions/checkout@v2
- name: create build environment
run: cmake -E make_directory ${{ runner.workspace }}/_build
- name: configure cmake
shell: bash
working-directory: ${{ runner.workspace }}/_build
run: cmake -DBENCHMARK_DOWNLOAD_DEPENDENCIES=ON $GITHUB_WORKSPACE -DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
- name: build
shell: bash
working-directory: ${{ runner.workspace }}/_build
run: cmake --build . --config ${{ matrix.build_type }}
- name: test
shell: bash
working-directory: ${{ runner.workspace }}/_build
run: ctest -C ${{ matrix.build_type }}
- uses: actions/checkout@v3
- uses: lukka/get-cmake@latest
- name: create build environment
run: cmake -E make_directory ${{ runner.workspace }}/_build
- name: setup cmake initial cache
run: touch compiler-cache.cmake
- name: configure cmake
env:
CXX: ${{ matrix.compiler }}
shell: bash
working-directory: ${{ runner.workspace }}/_build
run: >
cmake -C ${{ github.workspace }}/compiler-cache.cmake
$GITHUB_WORKSPACE
-DBENCHMARK_DOWNLOAD_DEPENDENCIES=ON
-DBUILD_SHARED_LIBS=${{ matrix.lib == 'shared' }}
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
-DCMAKE_CXX_COMPILER=${{ env.CXX }}
-DCMAKE_CXX_VISIBILITY_PRESET=hidden
-DCMAKE_VISIBILITY_INLINES_HIDDEN=ON
- name: build
shell: bash
working-directory: ${{ runner.workspace }}/_build
run: cmake --build . --config ${{ matrix.build_type }}
- name: test
shell: bash
working-directory: ${{ runner.workspace }}/_build
run: ctest -C ${{ matrix.build_type }} -VV
msvc:
name: ${{ matrix.os }}.${{ matrix.build_type }}.${{ matrix.lib }}.${{ matrix.msvc }}
runs-on: ${{ matrix.os }}
defaults:
run:
shell: powershell
strategy:
fail-fast: false
matrix:
msvc:
- VS-16-2019
- VS-17-2022
arch:
- x64
build_type:
- Debug
- Release
lib:
- shared
- static
include:
- msvc: VS-16-2019
os: windows-2019
generator: 'Visual Studio 16 2019'
- msvc: VS-17-2022
os: windows-2022
generator: 'Visual Studio 17 2022'
steps:
- uses: actions/checkout@v2
- uses: lukka/get-cmake@latest
- name: configure cmake
run: >
cmake -S . -B _build/
-A ${{ matrix.arch }}
-G "${{ matrix.generator }}"
-DBENCHMARK_DOWNLOAD_DEPENDENCIES=ON
-DBUILD_SHARED_LIBS=${{ matrix.lib == 'shared' }}
- name: build
run: cmake --build _build/ --config ${{ matrix.build_type }}
- name: setup test environment
# Make sure gmock and benchmark DLLs can be found
run: >
echo "$((Get-Item .).FullName)/_build/bin/${{ matrix.build_type }}" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append;
echo "$((Get-Item .).FullName)/_build/src/${{ matrix.build_type }}" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append;
- name: test
run: ctest --test-dir _build/ -C ${{ matrix.build_type }} -VV

.github/workflows/clang-format-lint.yml (new file, 17 lines)

@ -0,0 +1,17 @@
name: clang-format-lint
on:
push: {}
pull_request: {}
jobs:
build:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
- uses: DoozyX/clang-format-lint-action@v0.13
with:
source: './include/benchmark ./src ./test'
extensions: 'h,cc'
clangFormatVersion: 12
style: Google

.github/workflows/clang-tidy.yml (new file, 38 lines)

@ -0,0 +1,38 @@
name: clang-tidy
on:
push: {}
pull_request: {}
jobs:
job:
name: run-clang-tidy
runs-on: ubuntu-latest
strategy:
fail-fast: false
steps:
- uses: actions/checkout@v3
- name: install clang-tidy
run: sudo apt update && sudo apt -y install clang-tidy
- name: create build environment
run: cmake -E make_directory ${{ runner.workspace }}/_build
- name: configure cmake
shell: bash
working-directory: ${{ runner.workspace }}/_build
run: >
cmake $GITHUB_WORKSPACE
-DBENCHMARK_ENABLE_ASSEMBLY_TESTS=OFF
-DBENCHMARK_ENABLE_LIBPFM=OFF
-DBENCHMARK_DOWNLOAD_DEPENDENCIES=ON
-DCMAKE_C_COMPILER=clang
-DCMAKE_CXX_COMPILER=clang++
-DCMAKE_EXPORT_COMPILE_COMMANDS=ON
-DGTEST_COMPILE_COMMANDS=OFF
- name: run
shell: bash
working-directory: ${{ runner.workspace }}/_build
run: run-clang-tidy

.github/workflows/doxygen.yml (new file, 28 lines)

@ -0,0 +1,28 @@
name: doxygen
on:
push:
branches: [main]
pull_request:
branches: [main]
jobs:
build-and-deploy:
name: Build HTML documentation
runs-on: ubuntu-latest
steps:
- name: Fetching sources
uses: actions/checkout@v3
- name: Installing build dependencies
run: |
sudo apt update
sudo apt install doxygen gcc git
- name: Creating build directory
run: mkdir build
- name: Building HTML documentation with Doxygen
run: |
cmake -S . -B build -DBENCHMARK_ENABLE_TESTING:BOOL=OFF -DBENCHMARK_ENABLE_DOXYGEN:BOOL=ON -DBENCHMARK_INSTALL_DOCS:BOOL=ON
cmake --build build --target benchmark_doxygen

.github/workflows/pylint.yml (new file, 28 lines)

@ -0,0 +1,28 @@
name: pylint
on:
push:
branches: [ main ]
pull_request:
branches: [ main ]
jobs:
pylint:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
- name: Set up Python 3.8
uses: actions/setup-python@v1
with:
python-version: 3.8
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install pylint pylint-exit conan
- name: Run pylint
run: |
pylint `find . -name '*.py'|xargs` || pylint-exit $?

.github/workflows/sanitizer.yml (new file, 96 lines)

@ -0,0 +1,96 @@
name: sanitizer
on:
push: {}
pull_request: {}
env:
UBSAN_OPTIONS: "print_stacktrace=1"
jobs:
job:
name: ${{ matrix.sanitizer }}.${{ matrix.build_type }}
runs-on: ubuntu-latest
strategy:
fail-fast: false
matrix:
build_type: ['Debug', 'RelWithDebInfo']
sanitizer: ['asan', 'ubsan', 'tsan', 'msan']
steps:
- uses: actions/checkout@v3
- name: configure msan env
if: matrix.sanitizer == 'msan'
run: |
echo "EXTRA_FLAGS=-g -O2 -fno-omit-frame-pointer -fsanitize=memory -fsanitize-memory-track-origins" >> $GITHUB_ENV
echo "LIBCXX_SANITIZER=MemoryWithOrigins" >> $GITHUB_ENV
- name: configure ubsan env
if: matrix.sanitizer == 'ubsan'
run: |
echo "EXTRA_FLAGS=-g -O2 -fno-omit-frame-pointer -fsanitize=undefined -fno-sanitize-recover=all" >> $GITHUB_ENV
echo "LIBCXX_SANITIZER=Undefined" >> $GITHUB_ENV
- name: configure asan env
if: matrix.sanitizer == 'asan'
run: |
echo "EXTRA_FLAGS=-g -O2 -fno-omit-frame-pointer -fsanitize=address -fno-sanitize-recover=all" >> $GITHUB_ENV
echo "LIBCXX_SANITIZER=Address" >> $GITHUB_ENV
- name: configure tsan env
if: matrix.sanitizer == 'tsan'
run: |
echo "EXTRA_FLAGS=-g -O2 -fno-omit-frame-pointer -fsanitize=thread -fno-sanitize-recover=all" >> $GITHUB_ENV
echo "LIBCXX_SANITIZER=Thread" >> $GITHUB_ENV
- name: fine-tune asan options
# in asan we get an error from std::regex. ignore it.
if: matrix.sanitizer == 'asan'
run: |
echo "ASAN_OPTIONS=alloc_dealloc_mismatch=0" >> $GITHUB_ENV
- name: setup clang
uses: egor-tensin/setup-clang@v1
with:
version: latest
platform: x64
- name: configure clang
run: |
echo "CC=cc" >> $GITHUB_ENV
echo "CXX=c++" >> $GITHUB_ENV
- name: build libc++ (non-asan)
if: matrix.sanitizer != 'asan'
run: |
"${GITHUB_WORKSPACE}/.github/libcxx-setup.sh"
echo "EXTRA_CXX_FLAGS=-stdlib=libc++ -L ${GITHUB_WORKSPACE}/llvm-build/lib -lc++abi -Isystem${GITHUB_WORKSPACE}/llvm-build/include -Isystem${GITHUB_WORKSPACE}/llvm-build/include/c++/v1 -Wl,-rpath,${GITHUB_WORKSPACE}/llvm-build/lib" >> $GITHUB_ENV
- name: create build environment
run: cmake -E make_directory ${{ runner.workspace }}/_build
- name: configure cmake
shell: bash
working-directory: ${{ runner.workspace }}/_build
run: >
VERBOSE=1
cmake $GITHUB_WORKSPACE
-DBENCHMARK_ENABLE_ASSEMBLY_TESTS=OFF
-DBENCHMARK_ENABLE_LIBPFM=OFF
-DBENCHMARK_DOWNLOAD_DEPENDENCIES=ON
-DCMAKE_C_COMPILER=${{ env.CC }}
-DCMAKE_CXX_COMPILER=${{ env.CXX }}
-DCMAKE_C_FLAGS="${{ env.EXTRA_FLAGS }}"
-DCMAKE_CXX_FLAGS="${{ env.EXTRA_FLAGS }} ${{ env.EXTRA_CXX_FLAGS }}"
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
- name: build
shell: bash
working-directory: ${{ runner.workspace }}/_build
run: cmake --build . --config ${{ matrix.build_type }}
- name: test
shell: bash
working-directory: ${{ runner.workspace }}/_build
run: ctest -C ${{ matrix.build_type }} -VV

.github/workflows/test_bindings.yml (new file, 29 lines)

@ -0,0 +1,29 @@
name: test-bindings
on:
push:
branches: [main]
pull_request:
branches: [main]
jobs:
python_bindings:
name: Test GBM Python bindings on ${{ matrix.os }}
runs-on: ${{ matrix.os }}
strategy:
fail-fast: false
matrix:
os: [ ubuntu-latest, macos-latest, windows-2019 ]
steps:
- uses: actions/checkout@v3
- name: Set up Python
uses: actions/setup-python@v4
with:
python-version: 3.11
- name: Install GBM Python bindings on ${{ matrix.os}}
run:
python -m pip install wheel .
- name: Run bindings example on ${{ matrix.os }}
run:
python bindings/python/google_benchmark/example.py

.github/workflows/wheels.yml (new file, 79 lines)

@ -0,0 +1,79 @@
name: Build and upload Python wheels
on:
workflow_dispatch:
release:
types:
- published
jobs:
build_sdist:
name: Build source distribution
runs-on: ubuntu-latest
steps:
- name: Check out repo
uses: actions/checkout@v3
- name: Install Python 3.11
uses: actions/setup-python@v4
with:
python-version: 3.11
- name: Build and check sdist
run: |
python setup.py sdist
- name: Upload sdist
uses: actions/upload-artifact@v3
with:
name: dist
path: dist/*.tar.gz
build_wheels:
name: Build Google Benchmark wheels on ${{ matrix.os }}
runs-on: ${{ matrix.os }}
strategy:
matrix:
os: [ubuntu-latest, macos-latest, windows-2019]
steps:
- name: Check out Google Benchmark
uses: actions/checkout@v3
- name: Set up QEMU
if: runner.os == 'Linux'
uses: docker/setup-qemu-action@v2
with:
platforms: all
- name: Build wheels on ${{ matrix.os }} using cibuildwheel
uses: pypa/cibuildwheel@v2.14.1
env:
CIBW_BUILD: 'cp38-* cp39-* cp310-* cp311-*'
CIBW_SKIP: "*-musllinux_*"
CIBW_TEST_SKIP: "*-macosx_arm64"
CIBW_ARCHS_LINUX: x86_64 aarch64
CIBW_ARCHS_MACOS: x86_64 arm64
CIBW_ARCHS_WINDOWS: AMD64
CIBW_BEFORE_ALL_LINUX: bash .github/install_bazel.sh
CIBW_TEST_COMMAND: python {project}/bindings/python/google_benchmark/example.py
- name: Upload Google Benchmark ${{ matrix.os }} wheels
uses: actions/upload-artifact@v3
with:
name: dist
path: ./wheelhouse/*.whl
pypi_upload:
name: Publish google-benchmark wheels to PyPI
needs: [build_sdist, build_wheels]
runs-on: ubuntu-latest
steps:
- uses: actions/download-artifact@v3
with:
name: dist
path: dist
- uses: pypa/gh-action-pypi-publish@v1.6.4
with:
user: __token__
password: ${{ secrets.PYPI_PASSWORD }}

.gitignore (1 line changed)

@ -11,6 +11,7 @@
*.swp
*.pyc
__pycache__
.DS_Store
# lcov
*.lcov


@ -1,28 +0,0 @@
#!/usr/bin/env bash
# Install a newer CMake version
curl -sSL https://cmake.org/files/v3.6/cmake-3.6.1-Linux-x86_64.sh -o install-cmake.sh
chmod +x install-cmake.sh
sudo ./install-cmake.sh --prefix=/usr/local --skip-license
# Checkout LLVM sources
git clone --depth=1 https://github.com/llvm-mirror/llvm.git llvm-source
git clone --depth=1 https://github.com/llvm-mirror/libcxx.git llvm-source/projects/libcxx
git clone --depth=1 https://github.com/llvm-mirror/libcxxabi.git llvm-source/projects/libcxxabi
# Setup libc++ options
if [ -z "$BUILD_32_BITS" ]; then
export BUILD_32_BITS=OFF && echo disabling 32 bit build
fi
# Build and install libc++ (Use unstable ABI for better sanitizer coverage)
mkdir llvm-build && cd llvm-build
cmake -DCMAKE_C_COMPILER=${C_COMPILER} -DCMAKE_CXX_COMPILER=${COMPILER} \
-DCMAKE_BUILD_TYPE=RelWithDebInfo -DCMAKE_INSTALL_PREFIX=/usr \
-DLIBCXX_ABI_UNSTABLE=ON \
-DLLVM_USE_SANITIZER=${LIBCXX_SANITIZER} \
-DLLVM_BUILD_32_BITS=${BUILD_32_BITS} \
../llvm-source
make cxx -j2
sudo make install-cxxabi install-cxx
cd ../


@ -10,10 +10,6 @@ matrix:
packages:
- lcov
env: COMPILER=g++ C_COMPILER=gcc BUILD_TYPE=Coverage
- compiler: gcc
env: COMPILER=g++ C_COMPILER=gcc BUILD_TYPE=Debug
- compiler: gcc
env: COMPILER=g++ C_COMPILER=gcc BUILD_TYPE=Release
- compiler: gcc
addons:
apt:
@ -44,10 +40,6 @@ matrix:
- COMPILER=g++-6 C_COMPILER=gcc-6 BUILD_TYPE=Debug
- ENABLE_SANITIZER=1
- EXTRA_FLAGS="-fno-omit-frame-pointer -g -O2 -fsanitize=undefined,address -fuse-ld=gold"
- compiler: clang
env: COMPILER=clang++ C_COMPILER=clang BUILD_TYPE=Debug
- compiler: clang
env: COMPILER=clang++ C_COMPILER=clang BUILD_TYPE=Release
# Clang w/ libc++
- compiler: clang
dist: xenial
@ -146,16 +138,6 @@ matrix:
- ENABLE_SANITIZER=1
- EXTRA_FLAGS="-g -O2 -fno-omit-frame-pointer -fsanitize=thread -fno-sanitize-recover=all"
- EXTRA_CXX_FLAGS="-stdlib=libc++"
- os: osx
osx_image: xcode8.3
compiler: clang
env:
- COMPILER=clang++ BUILD_TYPE=Debug
- os: osx
osx_image: xcode8.3
compiler: clang
env:
- COMPILER=clang++ BUILD_TYPE=Release
- os: osx
osx_image: xcode8.3
compiler: clang
@ -164,15 +146,10 @@ matrix:
- BUILD_TYPE=Release
- BUILD_32_BITS=ON
- EXTRA_FLAGS="-m32"
- os: osx
osx_image: xcode9.4
compiler: gcc
env:
- COMPILER=g++-7 C_COMPILER=gcc-7 BUILD_TYPE=Debug
before_script:
- if [ -n "${LIBCXX_BUILD}" ]; then
source .travis-libcxx-setup.sh;
source .libcxx-setup.sh;
fi
- if [ -n "${ENABLE_SANITIZER}" ]; then
export EXTRA_OPTIONS="-DBENCHMARK_ENABLE_ASSEMBLY_TESTS=OFF";

AUTHORS (15 lines changed)

@ -13,6 +13,8 @@ Alex Steele <steeleal123@gmail.com>
Andriy Berestovskyy <berestovskyy@gmail.com>
Arne Beer <arne@twobeer.de>
Carto
Cezary Skrzyński <czars1988@gmail.com>
Christian Wassermann <christian_wassermann@web.de>
Christopher Seymour <chris.j.seymour@hotmail.com>
Colin Braley <braley.colin@gmail.com>
Daniel Harvey <danielharvey458@gmail.com>
@ -20,14 +22,18 @@ David Coeurjolly <david.coeurjolly@liris.cnrs.fr>
Deniz Evrenci <denizevrenci@gmail.com>
Dirac Research
Dominik Czarnota <dominik.b.czarnota@gmail.com>
Dominik Korman <kormandominik@gmail.com>
Donald Aingworth <donalds_junk_mail@yahoo.com>
Eric Backus <eric_backus@alum.mit.edu>
Eric Fiselier <eric@efcs.ca>
Eugene Zhuk <eugene.zhuk@gmail.com>
Evgeny Safronov <division494@gmail.com>
Fabien Pichot <pichot.fabien@gmail.com>
Federico Ficarelli <federico.ficarelli@gmail.com>
Felix Homann <linuxaudio@showlabor.de>
Gergő Szitár <szitar.gergo@gmail.com>
Google Inc.
Henrique Bucher <hbucher@gmail.com>
International Business Machines Corporation
Ismael Jimenez Martinez <ismael.jimenez.martinez@gmail.com>
Jern-Kuan Leong <jernkuan@gmail.com>
@ -38,19 +44,28 @@ Jussi Knuuttila <jussi.knuuttila@gmail.com>
Kaito Udagawa <umireon@gmail.com>
Kishan Kumar <kumar.kishan@outlook.com>
Lei Xu <eddyxu@gmail.com>
Marcel Jacobse <mjacobse@uni-bremen.de>
Matt Clarkson <mattyclarkson@gmail.com>
Maxim Vafin <maxvafin@gmail.com>
Mike Apodaca <gatorfax@gmail.com>
Min-Yih Hsu <yihshyng223@gmail.com>
MongoDB Inc.
Nick Hutchinson <nshutchinson@gmail.com>
Norman Heino <norman.heino@gmail.com>
Oleksandr Sochka <sasha.sochka@gmail.com>
Ori Livneh <ori.livneh@gmail.com>
Paul Redmond <paul.redmond@gmail.com>
Radoslav Yovchev <radoslav.tm@gmail.com>
Raghu Raja <raghu@enfabrica.net>
Rainer Orth <ro@cebitec.uni-bielefeld.de>
Roman Lebedev <lebedev.ri@gmail.com>
Sayan Bhattacharjee <aero.sayan@gmail.com>
Shapr3D <google-contributors@shapr3d.com>
Shuo Chen <chenshuo@chenshuo.com>
Staffan Tjernstrom <staffantj@gmail.com>
Steinar H. Gunderson <sgunderson@bigfoot.com>
Stripe, Inc.
Tobias Schmidt <tobias.schmidt@in.tum.de>
Yixuan Qiu <yixuanq@gmail.com>
Yusuke Suzuki <utatane.tea@gmail.com>
Zbigniew Skowron <zbychs@gmail.com>


@ -1,14 +1,36 @@
licenses(["notice"])
config_setting(
name = "qnx",
constraint_values = ["@platforms//os:qnx"],
values = {
"cpu": "x64_qnx",
},
visibility = [":__subpackages__"],
)
config_setting(
name = "windows",
constraint_values = ["@platforms//os:windows"],
values = {
"cpu": "x64_windows",
},
visibility = [":__subpackages__"],
)
load("@rules_cc//cc:defs.bzl", "cc_library")
config_setting(
name = "macos",
constraint_values = ["@platforms//os:macos"],
visibility = ["//visibility:public"],
)
config_setting(
name = "perfcounters",
define_values = {
"pfm": "1",
},
visibility = [":__subpackages__"],
)
cc_library(
name = "benchmark",
@ -19,19 +41,40 @@ cc_library(
],
exclude = ["src/benchmark_main.cc"],
),
hdrs = ["include/benchmark/benchmark.h"],
hdrs = [
"include/benchmark/benchmark.h",
"include/benchmark/export.h",
],
linkopts = select({
":windows": ["-DEFAULTLIB:shlwapi.lib"],
"//conditions:default": ["-pthread"],
}),
copts = select({
":windows": [],
"//conditions:default": ["-Werror=old-style-cast"],
}),
strip_include_prefix = "include",
visibility = ["//visibility:public"],
# Only static linking is allowed; no .so will be produced.
# Using `defines` (i.e. not `local_defines`) means that no
# dependent rules need to bother about defining the macro.
linkstatic = True,
defines = [
"BENCHMARK_STATIC_DEFINE",
] + select({
":perfcounters": ["HAVE_LIBPFM"],
"//conditions:default": [],
}),
deps = select({
":perfcounters": ["@libpfm//:libpfm"],
"//conditions:default": [],
}),
)
cc_library(
name = "benchmark_main",
srcs = ["src/benchmark_main.cc"],
hdrs = ["include/benchmark/benchmark.h"],
hdrs = ["include/benchmark/benchmark.h", "include/benchmark/export.h"],
strip_include_prefix = "include",
visibility = ["//visibility:public"],
deps = [":benchmark"],


@ -1,30 +1,34 @@
cmake_minimum_required (VERSION 3.5.1)
# Require CMake 3.10. If available, use the policies up to CMake 3.22.
cmake_minimum_required (VERSION 3.10...3.22)
foreach(p
CMP0048 # OK to clear PROJECT_VERSION on project()
CMP0054 # CMake 3.1
CMP0056 # export EXE_LINKER_FLAGS to try_run
CMP0057 # Support no if() IN_LIST operator
CMP0063 # Honor visibility properties for all targets
CMP0077 # Allow option() overrides in importing projects
)
if(POLICY ${p})
cmake_policy(SET ${p} NEW)
endif()
endforeach()
project (benchmark CXX)
project (benchmark VERSION 1.8.3 LANGUAGES CXX)
option(BENCHMARK_ENABLE_TESTING "Enable testing of the benchmark library." ON)
option(BENCHMARK_ENABLE_EXCEPTIONS "Enable the use of exceptions in the benchmark library." ON)
option(BENCHMARK_ENABLE_LTO "Enable link time optimisation of the benchmark library." OFF)
option(BENCHMARK_USE_LIBCXX "Build and test using libc++ as the standard library." OFF)
option(BENCHMARK_ENABLE_WERROR "Build Release candidates with -Werror." ON)
option(BENCHMARK_FORCE_WERROR "Build Release candidates with -Werror regardless of compiler issues." OFF)
if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "PGI")
# PGC++ maybe reporting false positives.
set(BENCHMARK_ENABLE_WERROR OFF)
endif()
if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "NVHPC")
set(BENCHMARK_ENABLE_WERROR OFF)
endif()
if(BENCHMARK_FORCE_WERROR)
set(BENCHMARK_ENABLE_WERROR ON)
endif(BENCHMARK_FORCE_WERROR)
if(NOT MSVC)
option(BENCHMARK_BUILD_32_BITS "Build a 32 bit version of the library." OFF)
else()
set(BENCHMARK_BUILD_32_BITS OFF CACHE BOOL "Build a 32 bit version of the library - unsupported when using MSVC)" FORCE)
endif()
option(BENCHMARK_ENABLE_INSTALL "Enable installation of benchmark. (Projects embedding benchmark may want to turn this OFF.)" ON)
option(BENCHMARK_ENABLE_DOXYGEN "Build documentation with Doxygen." OFF)
option(BENCHMARK_INSTALL_DOCS "Enable installation of documentation." ON)
# Allow unmet dependencies to be met using CMake's ExternalProject mechanics, which
# may require downloading the source code.
@ -33,8 +37,25 @@ option(BENCHMARK_DOWNLOAD_DEPENDENCIES "Allow the downloading and in-tree buildi
# This option can be used to disable building and running unit tests which depend on gtest
# in cases where it is not possible to build or find a valid version of gtest.
option(BENCHMARK_ENABLE_GTEST_TESTS "Enable building the unit tests which depend on gtest" ON)
option(BENCHMARK_USE_BUNDLED_GTEST "Use bundled GoogleTest. If disabled, the find_package(GTest) will be used." ON)
option(BENCHMARK_ENABLE_LIBPFM "Enable performance counters provided by libpfm" OFF)
# Export only public symbols
set(CMAKE_CXX_VISIBILITY_PRESET hidden)
set(CMAKE_VISIBILITY_INLINES_HIDDEN ON)
if(MSVC)
# As of CMake 3.18, CMAKE_SYSTEM_PROCESSOR is not set properly for MSVC and
# cross-compilation (e.g. Host=x86_64, target=aarch64) requires using the
# undocumented, but working variable.
# See https://gitlab.kitware.com/cmake/cmake/-/issues/15170
set(CMAKE_SYSTEM_PROCESSOR ${MSVC_CXX_ARCHITECTURE_ID})
if(${CMAKE_SYSTEM_PROCESSOR} MATCHES "ARM")
set(CMAKE_CROSSCOMPILING TRUE)
endif()
endif()
set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON)
set(ENABLE_ASSEMBLY_TESTS_DEFAULT OFF)
function(should_enable_assembly_tests)
if(CMAKE_BUILD_TYPE)
@ -81,23 +102,42 @@ list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake")
include(GetGitVersion)
get_git_version(GIT_VERSION)
# If no git version can be determined, use the version
# from the project() command
if ("${GIT_VERSION}" STREQUAL "0.0.0")
set(VERSION "${benchmark_VERSION}")
else()
set(VERSION "${GIT_VERSION}")
endif()
# Tell the user what versions we are using
string(REGEX MATCH "[0-9]+\\.[0-9]+\\.[0-9]+" VERSION ${GIT_VERSION})
message(STATUS "Version: ${VERSION}")
message(STATUS "Google Benchmark version: ${VERSION}")
# The version of the libraries
set(GENERIC_LIB_VERSION ${VERSION})
string(SUBSTRING ${VERSION} 0 1 GENERIC_LIB_SOVERSION)
# Import our CMake modules
include(CheckCXXCompilerFlag)
include(AddCXXCompilerFlag)
include(CheckCXXCompilerFlag)
include(CheckLibraryExists)
include(CXXFeatureCheck)
check_library_exists(rt shm_open "" HAVE_LIB_RT)
if (BENCHMARK_BUILD_32_BITS)
add_required_cxx_compiler_flag(-m32)
endif()
if (MSVC)
set(BENCHMARK_CXX_STANDARD 14)
else()
set(BENCHMARK_CXX_STANDARD 11)
endif()
set(CMAKE_CXX_STANDARD ${BENCHMARK_CXX_STANDARD})
set(CMAKE_CXX_STANDARD_REQUIRED YES)
set(CMAKE_CXX_EXTENSIONS OFF)
if (MSVC)
# Turn compiler warnings up to 11
string(REGEX REPLACE "[-/]W[1-4]" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
@ -130,44 +170,43 @@ if (MSVC)
set(CMAKE_EXE_LINKER_FLAGS_MINSIZEREL "${CMAKE_EXE_LINKER_FLAGS_MINSIZEREL} /LTCG")
endif()
else()
# Try and enable C++11. Don't use C++14 because it doesn't work in some
# configurations.
add_cxx_compiler_flag(-std=c++11)
if (NOT HAVE_CXX_FLAG_STD_CXX11)
add_cxx_compiler_flag(-std=c++0x)
endif()
# Turn compiler warnings up to 11
add_cxx_compiler_flag(-Wall)
add_cxx_compiler_flag(-Wextra)
add_cxx_compiler_flag(-Wshadow)
add_cxx_compiler_flag(-Werror RELEASE)
add_cxx_compiler_flag(-Werror RELWITHDEBINFO)
add_cxx_compiler_flag(-Werror MINSIZEREL)
# Disabled until googletest (gmock) stops emitting variadic macro warnings
#add_cxx_compiler_flag(-pedantic)
#add_cxx_compiler_flag(-pedantic-errors)
add_cxx_compiler_flag(-Wfloat-equal)
add_cxx_compiler_flag(-Wold-style-cast)
if(BENCHMARK_ENABLE_WERROR)
add_cxx_compiler_flag(-Werror)
endif()
if (NOT BENCHMARK_ENABLE_TESTING)
# Disable warning when compiling tests as gtest does not use 'override'.
add_cxx_compiler_flag(-Wsuggest-override)
endif()
add_cxx_compiler_flag(-pedantic)
add_cxx_compiler_flag(-pedantic-errors)
add_cxx_compiler_flag(-Wshorten-64-to-32)
add_cxx_compiler_flag(-fstrict-aliasing)
# Disable warnings regarding deprecated parts of the library while building
# and testing those parts of the library.
add_cxx_compiler_flag(-Wno-deprecated-declarations)
if (CMAKE_CXX_COMPILER_ID STREQUAL "Intel")
if (CMAKE_CXX_COMPILER_ID STREQUAL "Intel" OR CMAKE_CXX_COMPILER_ID STREQUAL "IntelLLVM")
# Intel silently ignores '-Wno-deprecated-declarations',
# warning no. 1786 must be explicitly disabled.
# See #631 for rationale.
add_cxx_compiler_flag(-wd1786)
add_cxx_compiler_flag(-fno-finite-math-only)
endif()
# Disable deprecation warnings for release builds (when -Werror is enabled).
add_cxx_compiler_flag(-Wno-deprecated RELEASE)
add_cxx_compiler_flag(-Wno-deprecated RELWITHDEBINFO)
add_cxx_compiler_flag(-Wno-deprecated MINSIZEREL)
if(BENCHMARK_ENABLE_WERROR)
add_cxx_compiler_flag(-Wno-deprecated)
endif()
if (NOT BENCHMARK_ENABLE_EXCEPTIONS)
add_cxx_compiler_flag(-fno-exceptions)
endif()
if (HAVE_CXX_FLAG_FSTRICT_ALIASING)
if (NOT CMAKE_CXX_COMPILER_ID STREQUAL "Intel") #ICC17u2: Many false positives for Wstrict-aliasing
if (NOT CMAKE_CXX_COMPILER_ID STREQUAL "Intel" AND NOT CMAKE_CXX_COMPILER_ID STREQUAL "IntelLLVM") #ICC17u2: Many false positives for Wstrict-aliasing
add_cxx_compiler_flag(-Wstrict-aliasing)
endif()
endif()
@ -176,12 +215,12 @@ else()
add_cxx_compiler_flag(-wd654)
add_cxx_compiler_flag(-Wthread-safety)
if (HAVE_CXX_FLAG_WTHREAD_SAFETY)
cxx_feature_check(THREAD_SAFETY_ATTRIBUTES)
cxx_feature_check(THREAD_SAFETY_ATTRIBUTES "-DINCLUDE_DIRECTORIES=${PROJECT_SOURCE_DIR}/include")
endif()
# On most UNIX like platforms g++ and clang++ define _GNU_SOURCE as a
# predefined macro, which turns on all of the wonderful libc extensions.
# However g++ doesn't do this in Cygwin so we have to define it ourselfs
# However g++ doesn't do this in Cygwin so we have to define it ourselves
# since we depend on GNU/POSIX/BSD extensions.
if (CYGWIN)
add_definitions(-D_GNU_SOURCE=1)
@ -194,6 +233,7 @@ else()
# Link time optimisation
if (BENCHMARK_ENABLE_LTO)
add_cxx_compiler_flag(-flto)
add_cxx_compiler_flag(-Wno-lto-type-mismatch)
if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU")
find_program(GCC_AR gcc-ar)
if (GCC_AR)
@ -231,7 +271,8 @@ if (BENCHMARK_USE_LIBCXX)
if ("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang")
add_cxx_compiler_flag(-stdlib=libc++)
elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU" OR
"${CMAKE_CXX_COMPILER_ID}" STREQUAL "Intel")
"${CMAKE_CXX_COMPILER_ID}" STREQUAL "Intel" OR
"${CMAKE_CXX_COMPILER_ID}" STREQUAL "IntelLLVM")
add_cxx_compiler_flag(-nostdinc++)
message(WARNING "libc++ header path must be manually specified using CMAKE_CXX_FLAGS")
# Adding -nodefaultlibs directly to CMAKE_<TYPE>_LINKER_FLAGS will break
@ -245,11 +286,17 @@ if (BENCHMARK_USE_LIBCXX)
endif()
endif(BENCHMARK_USE_LIBCXX)
set(EXTRA_CXX_FLAGS "")
if (WIN32 AND "${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang")
# Clang on Windows fails to compile the regex feature check under C++11
set(EXTRA_CXX_FLAGS "-DCMAKE_CXX_STANDARD=14")
endif()
# C++ feature checks
# Determine the correct regular expression engine to use
cxx_feature_check(STD_REGEX)
cxx_feature_check(GNU_POSIX_REGEX)
cxx_feature_check(POSIX_REGEX)
cxx_feature_check(STD_REGEX ${EXTRA_CXX_FLAGS})
cxx_feature_check(GNU_POSIX_REGEX ${EXTRA_CXX_FLAGS})
cxx_feature_check(POSIX_REGEX ${EXTRA_CXX_FLAGS})
if(NOT HAVE_STD_REGEX AND NOT HAVE_GNU_POSIX_REGEX AND NOT HAVE_POSIX_REGEX)
message(FATAL_ERROR "Failed to determine the source files for the regular expression backend")
endif()
@ -257,10 +304,16 @@ if (NOT BENCHMARK_ENABLE_EXCEPTIONS AND HAVE_STD_REGEX
AND NOT HAVE_GNU_POSIX_REGEX AND NOT HAVE_POSIX_REGEX)
message(WARNING "Using std::regex with exceptions disabled is not fully supported")
endif()
cxx_feature_check(STEADY_CLOCK)
# Ensure we have pthreads
set(THREADS_PREFER_PTHREAD_FLAG ON)
find_package(Threads REQUIRED)
cxx_feature_check(PTHREAD_AFFINITY)
if (BENCHMARK_ENABLE_LIBPFM)
find_package(PFM)
endif()
# Set up directories
include_directories(${PROJECT_SOURCE_DIR}/include)
@ -273,7 +326,15 @@ if (BENCHMARK_ENABLE_TESTING)
if (BENCHMARK_ENABLE_GTEST_TESTS AND
NOT (TARGET gtest AND TARGET gtest_main AND
TARGET gmock AND TARGET gmock_main))
include(GoogleTest)
if (BENCHMARK_USE_BUNDLED_GTEST)
include(GoogleTest)
else()
find_package(GTest CONFIG REQUIRED)
add_library(gtest ALIAS GTest::gtest)
add_library(gtest_main ALIAS GTest::gtest_main)
add_library(gmock ALIAS GTest::gmock)
add_library(gmock_main ALIAS GTest::gmock_main)
endif()
endif()
add_subdirectory(test)
endif()


@ -22,12 +22,16 @@
#
# Please keep the list sorted.
Abhina Sreeskantharajan <abhina.sreeskantharajan@ibm.com>
Albert Pretorius <pretoalb@gmail.com>
Alex Steele <steelal123@gmail.com>
Andriy Berestovskyy <berestovskyy@gmail.com>
Arne Beer <arne@twobeer.de>
Bátor Tallér <bator.taller@shapr3d.com>
Billy Robert O'Neal III <billy.oneal@gmail.com> <bion@microsoft.com>
Cezary Skrzyński <czars1988@gmail.com>
Chris Kennelly <ckennelly@google.com> <ckennelly@ckennelly.com>
Christian Wassermann <christian_wassermann@web.de>
Christopher Seymour <chris.j.seymour@hotmail.com>
Colin Braley <braley.colin@gmail.com>
Cyrille Faucheux <cyrille.faucheux@gmail.com>
@ -36,15 +40,20 @@ David Coeurjolly <david.coeurjolly@liris.cnrs.fr>
Deniz Evrenci <denizevrenci@gmail.com>
Dominic Hamon <dma@stripysock.com> <dominic@google.com>
Dominik Czarnota <dominik.b.czarnota@gmail.com>
Dominik Korman <kormandominik@gmail.com>
Donald Aingworth <donalds_junk_mail@yahoo.com>
Eric Backus <eric_backus@alum.mit.edu>
Eric Fiselier <eric@efcs.ca>
Eugene Zhuk <eugene.zhuk@gmail.com>
Evgeny Safronov <division494@gmail.com>
Fabien Pichot <pichot.fabien@gmail.com>
Fanbo Meng <fanbo.meng@ibm.com>
Federico Ficarelli <federico.ficarelli@gmail.com>
Felix Homann <linuxaudio@showlabor.de>
Geoffrey Martin-Noble <gcmn@google.com> <gmngeoffrey@gmail.com>
Gergő Szitár <szitar.gergo@gmail.com>
Hannes Hauswedell <h2@fsfe.org>
Henrique Bucher <hbucher@gmail.com>
Ismael Jimenez Martinez <ismael.jimenez.martinez@gmail.com>
Jern-Kuan Leong <jernkuan@gmail.com>
JianXiong Zhou <zhoujianxiong2@gmail.com>
@ -52,25 +61,33 @@ Joao Paulo Magalhaes <joaoppmagalhaes@gmail.com>
John Millikin <jmillikin@stripe.com>
Jordan Williams <jwillikers@protonmail.com>
Jussi Knuuttila <jussi.knuuttila@gmail.com>
Kai Wolf <kai.wolf@gmail.com>
Kaito Udagawa <umireon@gmail.com>
Kai Wolf <kai.wolf@gmail.com>
Kishan Kumar <kumar.kishan@outlook.com>
Lei Xu <eddyxu@gmail.com>
Marcel Jacobse <mjacobse@uni-bremen.de>
Matt Clarkson <mattyclarkson@gmail.com>
Maxim Vafin <maxvafin@gmail.com>
Mike Apodaca <gatorfax@gmail.com>
Min-Yih Hsu <yihshyng223@gmail.com>
Nick Hutchinson <nshutchinson@gmail.com>
Norman Heino <norman.heino@gmail.com>
Oleksandr Sochka <sasha.sochka@gmail.com>
Ori Livneh <ori.livneh@gmail.com>
Pascal Leroy <phl@google.com>
Paul Redmond <paul.redmond@gmail.com>
Pierre Phaneuf <pphaneuf@google.com>
Radoslav Yovchev <radoslav.tm@gmail.com>
Raghu Raja <raghu@enfabrica.net>
Rainer Orth <ro@cebitec.uni-bielefeld.de>
Raul Marin <rmrodriguez@cartodb.com>
Ray Glover <ray.glover@uk.ibm.com>
Robert Guo <robert.guo@mongodb.com>
Roman Lebedev <lebedev.ri@gmail.com>
Sayan Bhattacharjee <aero.sayan@gmail.com>
Shuo Chen <chenshuo@chenshuo.com>
Steven Wan <wan.yu@ibm.com>
Tobias Schmidt <tobias.schmidt@in.tum.de>
Tobias Ulvgård <tobias.ulvgard@dirac.se>
Tom Madams <tom.ej.madams@gmail.com> <tmadams@google.com>
Yixuan Qiu <yixuanq@gmail.com>

MODULE.bazel (new file, 24 lines)

@ -0,0 +1,24 @@
module(name = "google_benchmark", version="1.8.3")
bazel_dep(name = "bazel_skylib", version = "1.4.1")
bazel_dep(name = "platforms", version = "0.0.6")
bazel_dep(name = "rules_foreign_cc", version = "0.9.0")
bazel_dep(name = "rules_cc", version = "0.0.6")
bazel_dep(name = "rules_python", version = "0.24.0", dev_dependency = True)
bazel_dep(name = "googletest", version = "1.12.1", repo_name = "com_google_googletest", dev_dependency = True)
bazel_dep(name = "libpfm", version = "4.11.0")
# Register a toolchain for Python 3.9 to be able to build numpy. Python
# versions >=3.10 are problematic.
# A second reason for this is to be able to build Python hermetically instead
# of relying on the changing default version from rules_python.
python = use_extension("@rules_python//python/extensions:python.bzl", "python", dev_dependency = True)
python.toolchain(python_version = "3.9")
pip = use_extension("@rules_python//python/extensions:pip.bzl", "pip", dev_dependency = True)
pip.parse(
hub_name="tools_pip_deps",
python_version = "3.9",
requirements_lock="//tools:requirements.txt")
use_repo(pip, "tools_pip_deps")

README.md (1131 lines changed; file diff suppressed because it is too large)


@ -1,37 +1,22 @@
workspace(name = "com_github_google_benchmark")
load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive")
load("//:bazel/benchmark_deps.bzl", "benchmark_deps")
http_archive(
name = "rules_cc",
strip_prefix = "rules_cc-a508235df92e71d537fcbae0c7c952ea6957a912",
urls = ["https://github.com/bazelbuild/rules_cc/archive/a508235df92e71d537fcbae0c7c952ea6957a912.zip"],
)
benchmark_deps()
http_archive(
name = "com_google_absl",
sha256 = "f41868f7a938605c92936230081175d1eae87f6ea2c248f41077c8f88316f111",
strip_prefix = "abseil-cpp-20200225.2",
urls = ["https://github.com/abseil/abseil-cpp/archive/20200225.2.tar.gz"],
)
load("@rules_foreign_cc//foreign_cc:repositories.bzl", "rules_foreign_cc_dependencies")
http_archive(
name = "com_google_googletest",
strip_prefix = "googletest-3f0cf6b62ad1eb50d8736538363d3580dd640c3e",
urls = ["https://github.com/google/googletest/archive/3f0cf6b62ad1eb50d8736538363d3580dd640c3e.zip"],
)
rules_foreign_cc_dependencies()
http_archive(
name = "pybind11",
build_file = "@//bindings/python:pybind11.BUILD",
sha256 = "1eed57bc6863190e35637290f97a20c81cfe4d9090ac0a24f3bbf08f265eb71d",
strip_prefix = "pybind11-2.4.3",
urls = ["https://github.com/pybind/pybind11/archive/v2.4.3.tar.gz"],
load("@rules_python//python:pip.bzl", pip3_install="pip_install")
pip3_install(
name = "tools_pip_deps",
requirements = "//tools:requirements.txt",
)
new_local_repository(
name = "python_headers",
build_file = "@//bindings/python:python_headers.BUILD",
path = "/usr/include/python3.6", # May be overwritten by setup.py.
path = "<PYTHON_INCLUDE_PATH>", # May be overwritten by setup.py.
)

WORKSPACE.bzlmod (new file, 2 lines)

@ -0,0 +1,2 @@
# This file marks the root of the Bazel workspace.
# See MODULE.bazel for dependencies and setup.


@ -1 +1,2 @@
theme: jekyll-theme-midnight
theme: jekyll-theme-midnight
markdown: GFM

bazel/benchmark_deps.bzl (new file, 65 lines)

@ -0,0 +1,65 @@
load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive")
load("@bazel_tools//tools/build_defs/repo:git.bzl", "new_git_repository")
def benchmark_deps():
"""Loads dependencies required to build Google Benchmark."""
if "bazel_skylib" not in native.existing_rules():
http_archive(
name = "bazel_skylib",
sha256 = "f7be3474d42aae265405a592bb7da8e171919d74c16f082a5457840f06054728",
urls = [
"https://mirror.bazel.build/github.com/bazelbuild/bazel-skylib/releases/download/1.2.1/bazel-skylib-1.2.1.tar.gz",
"https://github.com/bazelbuild/bazel-skylib/releases/download/1.2.1/bazel-skylib-1.2.1.tar.gz",
],
)
if "rules_foreign_cc" not in native.existing_rules():
http_archive(
name = "rules_foreign_cc",
sha256 = "bcd0c5f46a49b85b384906daae41d277b3dc0ff27c7c752cc51e43048a58ec83",
strip_prefix = "rules_foreign_cc-0.7.1",
url = "https://github.com/bazelbuild/rules_foreign_cc/archive/0.7.1.tar.gz",
)
if "rules_python" not in native.existing_rules():
http_archive(
name = "rules_python",
url = "https://github.com/bazelbuild/rules_python/releases/download/0.1.0/rules_python-0.1.0.tar.gz",
sha256 = "b6d46438523a3ec0f3cead544190ee13223a52f6a6765a29eae7b7cc24cc83a0",
)
if "com_google_absl" not in native.existing_rules():
http_archive(
name = "com_google_absl",
sha256 = "f41868f7a938605c92936230081175d1eae87f6ea2c248f41077c8f88316f111",
strip_prefix = "abseil-cpp-20200225.2",
urls = ["https://github.com/abseil/abseil-cpp/archive/20200225.2.tar.gz"],
)
if "com_google_googletest" not in native.existing_rules():
new_git_repository(
name = "com_google_googletest",
remote = "https://github.com/google/googletest.git",
tag = "release-1.11.0",
)
if "nanobind" not in native.existing_rules():
new_git_repository(
name = "nanobind",
remote = "https://github.com/wjakob/nanobind.git",
tag = "v1.4.0",
build_file = "@//bindings/python:nanobind.BUILD",
recursive_init_submodules = True,
)
if "libpfm" not in native.existing_rules():
# Downloaded from v4.9.0 tag at https://sourceforge.net/p/perfmon2/libpfm4/ref/master/tags/
http_archive(
name = "libpfm",
build_file = str(Label("//tools:libpfm.BUILD.bazel")),
sha256 = "5da5f8872bde14b3634c9688d980f68bda28b510268723cc12973eedbab9fecc",
type = "tar.gz",
strip_prefix = "libpfm-4.11.0",
urls = ["https://sourceforge.net/projects/perfmon2/files/libpfm4/libpfm-4.11.0.tar.gz/download"],
)


@ -8,8 +8,8 @@ def py_extension(name, srcs, hdrs = [], copts = [], features = [], deps = []):
shared_lib_name = name + shared_lib_suffix
native.cc_binary(
name = shared_lib_name,
linkshared = 1,
linkstatic = 1,
linkshared = True,
linkstatic = True,
srcs = srcs + hdrs,
copts = copts,
features = features,


@ -6,7 +6,6 @@ py_library(
visibility = ["//visibility:public"],
deps = [
":_benchmark",
# pip; absl:app
],
)
@ -17,10 +16,13 @@ py_extension(
"-fexceptions",
"-fno-strict-aliasing",
],
features = ["-use_header_modules"],
features = [
"-use_header_modules",
"-parse_headers",
],
deps = [
"//:benchmark",
"@pybind11",
"@nanobind",
"@python_headers",
],
)


@ -26,42 +26,137 @@ Example usage:
if __name__ == '__main__':
benchmark.main()
"""
import atexit
from absl import app
from google_benchmark import _benchmark
from google_benchmark._benchmark import (
Counter,
kNanosecond,
kMicrosecond,
kMillisecond,
kSecond,
oNone,
o1,
oN,
oNSquared,
oNCubed,
oLogN,
oNLogN,
oAuto,
oLambda,
State,
)
__all__ = [
"register",
"main",
"Counter",
"kNanosecond",
"kMicrosecond",
"kMillisecond",
"kSecond",
"oNone",
"o1",
"oN",
"oNSquared",
"oNCubed",
"oLogN",
"oNLogN",
"oAuto",
"oLambda",
"State",
]
__version__ = "0.1.0"
__version__ = "1.8.3"
def register(f=None, *, name=None):
if f is None:
return lambda f: register(f, name=name)
if name is None:
name = f.__name__
_benchmark.RegisterBenchmark(name, f)
return f
class __OptionMaker:
"""A stateless class to collect benchmark options.
Collect all decorator calls like @option.range(start=0, limit=1<<5).
"""
class Options:
"""Pure data class to store options calls, along with the benchmarked function."""
def __init__(self, func):
self.func = func
self.builder_calls = []
@classmethod
def make(cls, func_or_options):
"""Make Options from Options or the benchmarked function."""
if isinstance(func_or_options, cls.Options):
return func_or_options
return cls.Options(func_or_options)
def __getattr__(self, builder_name):
"""Append option call in the Options."""
# The function that gets returned on @option.range(start=0, limit=1<<5).
def __builder_method(*args, **kwargs):
# The decorator that gets called, either with the benchmarked function
# or the previous Options
def __decorator(func_or_options):
options = self.make(func_or_options)
options.builder_calls.append((builder_name, args, kwargs))
# The decorator returns Options so it is not technically a decorator
# and needs a final call to @register
return options
return __decorator
return __builder_method
# Alias for nicer API.
# We have to instantiate an object, even if stateless, to be able to use __getattr__
# on option.range
option = __OptionMaker()
def register(undefined=None, *, name=None):
"""Register function for benchmarking."""
if undefined is None:
# Decorator is called without parenthesis so we return a decorator
return lambda f: register(f, name=name)
# We have either the function to benchmark (simple case) or an instance of Options
# (@option._ case).
options = __OptionMaker.make(undefined)
if name is None:
name = options.func.__name__
# We register the benchmark and reproduce all the @option._ calls onto the
# benchmark builder pattern
benchmark = _benchmark.RegisterBenchmark(name, options.func)
for name, args, kwargs in options.builder_calls[::-1]:
getattr(benchmark, name)(*args, **kwargs)
# return the benchmarked function because the decorator does not modify it
return options.func
def _flags_parser(argv):
argv = _benchmark.Initialize(argv)
return app.parse_flags_with_usage(argv)
argv = _benchmark.Initialize(argv)
return app.parse_flags_with_usage(argv)
def _run_benchmarks(argv):
if len(argv) > 1:
raise app.UsageError('Too many command-line arguments.')
return _benchmark.RunSpecifiedBenchmarks()
if len(argv) > 1:
raise app.UsageError("Too many command-line arguments.")
return _benchmark.RunSpecifiedBenchmarks()
def main(argv=None):
return app.run(_run_benchmarks, argv=argv, flags_parser=_flags_parser)
return app.run(_run_benchmarks, argv=argv, flags_parser=_flags_parser)
# Methods for use with custom main function.
initialize = _benchmark.Initialize
run_benchmarks = _benchmark.RunSpecifiedBenchmarks
atexit.register(_benchmark.ClearRegisteredBenchmarks)
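
A minimal usage sketch of the decorator API added above, mirroring the bindings/python/google_benchmark/example.py changes later in this commit (the benchmark name sum_range is only illustrative):

import google_benchmark as benchmark

@benchmark.register
@benchmark.option.range_multiplier(2)
@benchmark.option.range(1 << 10, 1 << 18)
@benchmark.option.unit(benchmark.kMicrosecond)
def sum_range(state):
    # Each iteration sums a range whose size comes from the registered args.
    while state:
        sum(range(state.range(0)))

if __name__ == "__main__":
    benchmark.main()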


@ -1,11 +1,17 @@
// Benchmark for Python.
#include "benchmark/benchmark.h"
#include "pybind11/pybind11.h"
#include "pybind11/stl.h"
#include "nanobind/nanobind.h"
#include "nanobind/operators.h"
#include "nanobind/stl/bind_map.h"
#include "nanobind/stl/string.h"
#include "nanobind/stl/vector.h"
NB_MAKE_OPAQUE(benchmark::UserCounters);
namespace {
namespace py = ::pybind11;
namespace nb = nanobind;
std::vector<std::string> Initialize(const std::vector<std::string>& argv) {
// The `argv` pointers here become invalid when this function returns, but
@ -28,21 +34,151 @@ std::vector<std::string> Initialize(const std::vector<std::string>& argv) {
return remaining_argv;
}
void RegisterBenchmark(const char* name, py::function f) {
benchmark::RegisterBenchmark(name, [f](benchmark::State& state) {
f(&state);
});
benchmark::internal::Benchmark* RegisterBenchmark(const std::string& name,
nb::callable f) {
return benchmark::RegisterBenchmark(
name, [f](benchmark::State& state) { f(&state); });
}
PYBIND11_MODULE(_benchmark, m) {
NB_MODULE(_benchmark, m) {
using benchmark::TimeUnit;
nb::enum_<TimeUnit>(m, "TimeUnit")
.value("kNanosecond", TimeUnit::kNanosecond)
.value("kMicrosecond", TimeUnit::kMicrosecond)
.value("kMillisecond", TimeUnit::kMillisecond)
.value("kSecond", TimeUnit::kSecond)
.export_values();
using benchmark::BigO;
nb::enum_<BigO>(m, "BigO")
.value("oNone", BigO::oNone)
.value("o1", BigO::o1)
.value("oN", BigO::oN)
.value("oNSquared", BigO::oNSquared)
.value("oNCubed", BigO::oNCubed)
.value("oLogN", BigO::oLogN)
.value("oNLogN", BigO::oNLogN)
.value("oAuto", BigO::oAuto)
.value("oLambda", BigO::oLambda)
.export_values();
using benchmark::internal::Benchmark;
nb::class_<Benchmark>(m, "Benchmark")
// For methods returning a pointer to the current object, reference
// return policy is used to ask nanobind not to take ownership of the
// returned object and avoid calling delete on it.
// https://pybind11.readthedocs.io/en/stable/advanced/functions.html#return-value-policies
//
// For methods taking a const std::vector<...>&, a copy is created
// because it is bound to a Python list.
// https://pybind11.readthedocs.io/en/stable/advanced/cast/stl.html
.def("unit", &Benchmark::Unit, nb::rv_policy::reference)
.def("arg", &Benchmark::Arg, nb::rv_policy::reference)
.def("args", &Benchmark::Args, nb::rv_policy::reference)
.def("range", &Benchmark::Range, nb::rv_policy::reference,
nb::arg("start"), nb::arg("limit"))
.def("dense_range", &Benchmark::DenseRange,
nb::rv_policy::reference, nb::arg("start"),
nb::arg("limit"), nb::arg("step") = 1)
.def("ranges", &Benchmark::Ranges, nb::rv_policy::reference)
.def("args_product", &Benchmark::ArgsProduct,
nb::rv_policy::reference)
.def("arg_name", &Benchmark::ArgName, nb::rv_policy::reference)
.def("arg_names", &Benchmark::ArgNames,
nb::rv_policy::reference)
.def("range_pair", &Benchmark::RangePair,
nb::rv_policy::reference, nb::arg("lo1"), nb::arg("hi1"),
nb::arg("lo2"), nb::arg("hi2"))
.def("range_multiplier", &Benchmark::RangeMultiplier,
nb::rv_policy::reference)
.def("min_time", &Benchmark::MinTime, nb::rv_policy::reference)
.def("min_warmup_time", &Benchmark::MinWarmUpTime,
nb::rv_policy::reference)
.def("iterations", &Benchmark::Iterations,
nb::rv_policy::reference)
.def("repetitions", &Benchmark::Repetitions,
nb::rv_policy::reference)
.def("report_aggregates_only", &Benchmark::ReportAggregatesOnly,
nb::rv_policy::reference, nb::arg("value") = true)
.def("display_aggregates_only", &Benchmark::DisplayAggregatesOnly,
nb::rv_policy::reference, nb::arg("value") = true)
.def("measure_process_cpu_time", &Benchmark::MeasureProcessCPUTime,
nb::rv_policy::reference)
.def("use_real_time", &Benchmark::UseRealTime,
nb::rv_policy::reference)
.def("use_manual_time", &Benchmark::UseManualTime,
nb::rv_policy::reference)
.def(
"complexity",
(Benchmark * (Benchmark::*)(benchmark::BigO)) & Benchmark::Complexity,
nb::rv_policy::reference,
nb::arg("complexity") = benchmark::oAuto);
using benchmark::Counter;
nb::class_<Counter> py_counter(m, "Counter");
nb::enum_<Counter::Flags>(py_counter, "Flags")
.value("kDefaults", Counter::Flags::kDefaults)
.value("kIsRate", Counter::Flags::kIsRate)
.value("kAvgThreads", Counter::Flags::kAvgThreads)
.value("kAvgThreadsRate", Counter::Flags::kAvgThreadsRate)
.value("kIsIterationInvariant", Counter::Flags::kIsIterationInvariant)
.value("kIsIterationInvariantRate",
Counter::Flags::kIsIterationInvariantRate)
.value("kAvgIterations", Counter::Flags::kAvgIterations)
.value("kAvgIterationsRate", Counter::Flags::kAvgIterationsRate)
.value("kInvert", Counter::Flags::kInvert)
.export_values()
.def(nb::self | nb::self);
nb::enum_<Counter::OneK>(py_counter, "OneK")
.value("kIs1000", Counter::OneK::kIs1000)
.value("kIs1024", Counter::OneK::kIs1024)
.export_values();
py_counter
.def(nb::init<double, Counter::Flags, Counter::OneK>(),
nb::arg("value") = 0., nb::arg("flags") = Counter::kDefaults,
nb::arg("k") = Counter::kIs1000)
.def("__init__", ([](Counter *c, double value) { new (c) Counter(value); }))
.def_rw("value", &Counter::value)
.def_rw("flags", &Counter::flags)
.def_rw("oneK", &Counter::oneK)
.def(nb::init_implicit<double>());
nb::implicitly_convertible<nb::int_, Counter>();
nb::bind_map<benchmark::UserCounters>(m, "UserCounters");
using benchmark::State;
nb::class_<State>(m, "State")
.def("__bool__", &State::KeepRunning)
.def_prop_ro("keep_running", &State::KeepRunning)
.def("pause_timing", &State::PauseTiming)
.def("resume_timing", &State::ResumeTiming)
.def("skip_with_error", &State::SkipWithError)
.def_prop_ro("error_occurred", &State::error_occurred)
.def("set_iteration_time", &State::SetIterationTime)
.def_prop_rw("bytes_processed", &State::bytes_processed,
&State::SetBytesProcessed)
.def_prop_rw("complexity_n", &State::complexity_length_n,
&State::SetComplexityN)
.def_prop_rw("items_processed", &State::items_processed,
&State::SetItemsProcessed)
.def("set_label", &State::SetLabel)
.def("range", &State::range, nb::arg("pos") = 0)
.def_prop_ro("iterations", &State::iterations)
.def_prop_ro("name", &State::name)
.def_rw("counters", &State::counters)
.def_prop_ro("thread_index", &State::thread_index)
.def_prop_ro("threads", &State::threads);
m.def("Initialize", Initialize);
m.def("RegisterBenchmark", RegisterBenchmark);
m.def("RegisterBenchmark", RegisterBenchmark,
nb::rv_policy::reference);
m.def("RunSpecifiedBenchmarks",
[]() { benchmark::RunSpecifiedBenchmarks(); });
py::class_<benchmark::State>(m, "State")
.def("__bool__", &benchmark::State::KeepRunning)
.def_property_readonly("keep_running", &benchmark::State::KeepRunning)
.def("skip_with_error", &benchmark::State::SkipWithError);
m.def("ClearRegisteredBenchmarks", benchmark::ClearRegisteredBenchmarks);
};
} // namespace
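
The reference return policy called out in the comments above is what lets __init__.py replay the @option builder calls on the object returned by _benchmark.RegisterBenchmark. A rough sketch of that lower-level path, using only methods bound in this file (sum_range is again illustrative):

from google_benchmark import _benchmark, kMicrosecond

def sum_range(state):
    while state:
        sum(range(state.range(0)))

# RegisterBenchmark returns the underlying C++ Benchmark object without
# transferring ownership (nb::rv_policy::reference), so builder methods can be
# applied to it directly -- this is what the @option decorators do internally.
b = _benchmark.RegisterBenchmark("sum_range", sum_range)
b.range_multiplier(2)
b.range(1 << 10, 1 << 18)
b.unit(kMicrosecond)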


@ -20,29 +20,117 @@ In the extracted directory, execute:
python setup.py install
"""
import random
import time
import google_benchmark as benchmark
from google_benchmark import Counter
@benchmark.register
def empty(state):
while state:
pass
while state:
pass
@benchmark.register
def sum_million(state):
while state:
sum(range(1_000_000))
while state:
sum(range(1_000_000))
@benchmark.register
def pause_timing(state):
"""Pause timing every iteration."""
while state:
# Construct a list of random ints every iteration without timing it
state.pause_timing()
random_list = [random.randint(0, 100) for _ in range(100)]
state.resume_timing()
# Time the in place sorting algorithm
random_list.sort()
@benchmark.register
def skipped(state):
if True: # Test some predicate here.
state.skip_with_error('some error')
return # NOTE: You must explicitly return, or benchmark will continue.
if True: # Test some predicate here.
state.skip_with_error("some error")
return # NOTE: You must explicitly return, or benchmark will continue.
... # Benchmark code would be here.
... # Benchmark code would be here.
if __name__ == '__main__':
benchmark.main()
@benchmark.register
def manual_timing(state):
while state:
# Manually count Python CPU time
start = time.perf_counter() # perf_counter_ns() in Python 3.7+
# Something to benchmark
time.sleep(0.01)
end = time.perf_counter()
state.set_iteration_time(end - start)
@benchmark.register
def custom_counters(state):
"""Collect custom metric using benchmark.Counter."""
num_foo = 0.0
while state:
# Benchmark some code here
pass
# Collect some custom metric named foo
num_foo += 0.13
# Automatic Counter from numbers.
state.counters["foo"] = num_foo
# Set a counter as a rate.
state.counters["foo_rate"] = Counter(num_foo, Counter.kIsRate)
# Set a counter as an inverse of rate.
state.counters["foo_inv_rate"] = Counter(num_foo, Counter.kIsRate | Counter.kInvert)
# Set a counter as a thread-average quantity.
state.counters["foo_avg"] = Counter(num_foo, Counter.kAvgThreads)
# There's also a combined flag:
state.counters["foo_avg_rate"] = Counter(num_foo, Counter.kAvgThreadsRate)
@benchmark.register
@benchmark.option.measure_process_cpu_time()
@benchmark.option.use_real_time()
def with_options(state):
while state:
sum(range(1_000_000))
@benchmark.register(name="sum_million_microseconds")
@benchmark.option.unit(benchmark.kMicrosecond)
def with_options2(state):
while state:
sum(range(1_000_000))
@benchmark.register
@benchmark.option.arg(100)
@benchmark.option.arg(1000)
def passing_argument(state):
while state:
sum(range(state.range(0)))
@benchmark.register
@benchmark.option.range(8, limit=8 << 10)
def using_range(state):
while state:
sum(range(state.range(0)))
@benchmark.register
@benchmark.option.range_multiplier(2)
@benchmark.option.range(1 << 10, 1 << 18)
@benchmark.option.complexity(benchmark.oN)
def computing_complexity(state):
while state:
sum(range(state.range(0)))
state.complexity_n = state.range(0)
if __name__ == "__main__":
benchmark.main()


@ -0,0 +1,17 @@
cc_library(
name = "nanobind",
srcs = glob([
"src/*.cpp"
]),
copts = ["-fexceptions"],
includes = ["include", "ext/robin_map/include"],
textual_hdrs = glob(
[
"include/**/*.h",
"src/*.h",
"ext/robin_map/include/tsl/*.h",
],
),
deps = ["@python_headers"],
visibility = ["//visibility:public"],
)


@ -1,20 +0,0 @@
cc_library(
name = "pybind11",
hdrs = glob(
include = [
"include/pybind11/*.h",
"include/pybind11/detail/*.h",
],
exclude = [
"include/pybind11/common.h",
"include/pybind11/eigen.h",
],
),
copts = [
"-fexceptions",
"-Wno-undefined-inline",
"-Wno-pragma-once-outside-header",
],
includes = ["include"],
visibility = ["//visibility:public"],
)


@ -1,2 +0,0 @@
absl-py>=0.7.1


@ -34,9 +34,11 @@ function(add_cxx_compiler_flag FLAG)
check_cxx_compiler_flag("${FLAG}" ${MANGLED_FLAG})
set(CMAKE_REQUIRED_FLAGS "${OLD_CMAKE_REQUIRED_FLAGS}")
if(${MANGLED_FLAG})
set(VARIANT ${ARGV1})
if(ARGV1)
if(ARGC GREATER 1)
set(VARIANT ${ARGV1})
string(TOUPPER "_${VARIANT}" VARIANT)
else()
set(VARIANT "")
endif()
set(CMAKE_CXX_FLAGS${VARIANT} "${CMAKE_CXX_FLAGS${VARIANT}} ${BENCHMARK_CXX_FLAGS${VARIANT}} ${FLAG}" PARENT_SCOPE)
endif()
@ -49,9 +51,11 @@ function(add_required_cxx_compiler_flag FLAG)
check_cxx_compiler_flag("${FLAG}" ${MANGLED_FLAG})
set(CMAKE_REQUIRED_FLAGS "${OLD_CMAKE_REQUIRED_FLAGS}")
if(${MANGLED_FLAG})
set(VARIANT ${ARGV1})
if(ARGV1)
if(ARGC GREATER 1)
set(VARIANT ${ARGV1})
string(TOUPPER "_${VARIANT}" VARIANT)
else()
set(VARIANT "")
endif()
set(CMAKE_CXX_FLAGS${VARIANT} "${CMAKE_CXX_FLAGS${VARIANT}} ${FLAG}" PARENT_SCOPE)
set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${FLAG}" PARENT_SCOPE)

View File

@ -17,6 +17,8 @@ if(__cxx_feature_check)
endif()
set(__cxx_feature_check INCLUDED)
option(CXXFEATURECHECK_DEBUG OFF)
function(cxx_feature_check FILE)
string(TOLOWER ${FILE} FILE)
string(TOUPPER ${FILE} VAR)
@ -27,13 +29,22 @@ function(cxx_feature_check FILE)
return()
endif()
set(FEATURE_CHECK_CMAKE_FLAGS ${BENCHMARK_CXX_LINKER_FLAGS})
if (ARGC GREATER 1)
message(STATUS "Enabling additional flags: ${ARGV1}")
list(APPEND FEATURE_CHECK_CMAKE_FLAGS ${ARGV1})
endif()
if (NOT DEFINED COMPILE_${FEATURE})
message(STATUS "Performing Test ${FEATURE}")
if(CMAKE_CROSSCOMPILING)
message(STATUS "Cross-compiling to test ${FEATURE}")
try_compile(COMPILE_${FEATURE}
${CMAKE_BINARY_DIR} ${CMAKE_CURRENT_SOURCE_DIR}/cmake/${FILE}.cpp
CMAKE_FLAGS ${BENCHMARK_CXX_LINKER_FLAGS}
LINK_LIBRARIES ${BENCHMARK_CXX_LIBRARIES})
CXX_STANDARD 11
CXX_STANDARD_REQUIRED ON
CMAKE_FLAGS ${FEATURE_CHECK_CMAKE_FLAGS}
LINK_LIBRARIES ${BENCHMARK_CXX_LIBRARIES}
OUTPUT_VARIABLE COMPILE_OUTPUT_VAR)
if(COMPILE_${FEATURE})
message(WARNING
"If you see build failures due to cross compilation, try setting HAVE_${VAR} to 0")
@ -42,11 +53,14 @@ function(cxx_feature_check FILE)
set(RUN_${FEATURE} 1 CACHE INTERNAL "")
endif()
else()
message(STATUS "Performing Test ${FEATURE}")
message(STATUS "Compiling and running to test ${FEATURE}")
try_run(RUN_${FEATURE} COMPILE_${FEATURE}
${CMAKE_BINARY_DIR} ${CMAKE_CURRENT_SOURCE_DIR}/cmake/${FILE}.cpp
CMAKE_FLAGS ${BENCHMARK_CXX_LINKER_FLAGS}
LINK_LIBRARIES ${BENCHMARK_CXX_LIBRARIES})
CXX_STANDARD 11
CXX_STANDARD_REQUIRED ON
CMAKE_FLAGS ${FEATURE_CHECK_CMAKE_FLAGS}
LINK_LIBRARIES ${BENCHMARK_CXX_LIBRARIES}
COMPILE_OUTPUT_VARIABLE COMPILE_OUTPUT_VAR)
endif()
endif()
@ -56,7 +70,11 @@ function(cxx_feature_check FILE)
add_definitions(-DHAVE_${VAR})
else()
if(NOT COMPILE_${FEATURE})
message(STATUS "Performing Test ${FEATURE} -- failed to compile")
if(CXXFEATURECHECK_DEBUG)
message(STATUS "Performing Test ${FEATURE} -- failed to compile: ${COMPILE_OUTPUT_VAR}")
else()
message(STATUS "Performing Test ${FEATURE} -- failed to compile")
endif()
else()
message(STATUS "Performing Test ${FEATURE} -- compiled but failed to run")
endif()

View File

@ -1 +1,7 @@
@PACKAGE_INIT@
include (CMakeFindDependencyMacro)
find_dependency (Threads)
include("${CMAKE_CURRENT_LIST_DIR}/@targets_export_name@.cmake")

View File

@ -20,16 +20,20 @@ set(__get_git_version INCLUDED)
function(get_git_version var)
if(GIT_EXECUTABLE)
execute_process(COMMAND ${GIT_EXECUTABLE} describe --match "v[0-9]*.[0-9]*.[0-9]*" --abbrev=8
execute_process(COMMAND ${GIT_EXECUTABLE} describe --tags --match "v[0-9]*.[0-9]*.[0-9]*" --abbrev=8
WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}
RESULT_VARIABLE status
OUTPUT_VARIABLE GIT_VERSION
OUTPUT_VARIABLE GIT_DESCRIBE_VERSION
ERROR_QUIET)
if(${status})
set(GIT_VERSION "v0.0.0")
if(status)
set(GIT_DESCRIBE_VERSION "v0.0.0")
endif()
string(STRIP ${GIT_DESCRIBE_VERSION} GIT_DESCRIBE_VERSION)
if(GIT_DESCRIBE_VERSION MATCHES v[^-]*-)
string(REGEX REPLACE "v([^-]*)-([0-9]+)-.*" "\\1.\\2" GIT_VERSION ${GIT_DESCRIBE_VERSION})
else()
string(STRIP ${GIT_VERSION} GIT_VERSION)
string(REGEX REPLACE "-[0-9]+-g" "-" GIT_VERSION ${GIT_VERSION})
string(REGEX REPLACE "v(.*)" "\\1" GIT_VERSION ${GIT_DESCRIBE_VERSION})
endif()
# Work out if the repository is dirty
@ -43,12 +47,12 @@ function(get_git_version var)
ERROR_QUIET)
string(COMPARE NOTEQUAL "${GIT_DIFF_INDEX}" "" GIT_DIRTY)
if (${GIT_DIRTY})
set(GIT_VERSION "${GIT_VERSION}-dirty")
set(GIT_DESCRIBE_VERSION "${GIT_DESCRIBE_VERSION}-dirty")
endif()
message(STATUS "git version: ${GIT_DESCRIBE_VERSION} normalized to ${GIT_VERSION}")
else()
set(GIT_VERSION "v0.0.0")
set(GIT_VERSION "0.0.0")
endif()
message(STATUS "git Version: ${GIT_VERSION}")
set(${var} ${GIT_VERSION} PARENT_SCOPE)
endfunction()

View File

@ -35,7 +35,24 @@ add_subdirectory(${GOOGLETEST_SOURCE_DIR}
${GOOGLETEST_BINARY_DIR}
EXCLUDE_FROM_ALL)
set_target_properties(gtest PROPERTIES INTERFACE_SYSTEM_INCLUDE_DIRECTORIES $<TARGET_PROPERTY:gtest,INTERFACE_INCLUDE_DIRECTORIES>)
set_target_properties(gtest_main PROPERTIES INTERFACE_SYSTEM_INCLUDE_DIRECTORIES $<TARGET_PROPERTY:gtest_main,INTERFACE_INCLUDE_DIRECTORIES>)
set_target_properties(gmock PROPERTIES INTERFACE_SYSTEM_INCLUDE_DIRECTORIES $<TARGET_PROPERTY:gmock,INTERFACE_INCLUDE_DIRECTORIES>)
set_target_properties(gmock_main PROPERTIES INTERFACE_SYSTEM_INCLUDE_DIRECTORIES $<TARGET_PROPERTY:gmock_main,INTERFACE_INCLUDE_DIRECTORIES>)
# googletest doesn't seem to want to stay build warning clean so let's not hurt ourselves.
if (MSVC)
target_compile_options(gtest PRIVATE "/wd4244" "/wd4722")
target_compile_options(gtest_main PRIVATE "/wd4244" "/wd4722")
target_compile_options(gmock PRIVATE "/wd4244" "/wd4722")
target_compile_options(gmock_main PRIVATE "/wd4244" "/wd4722")
else()
target_compile_options(gtest PRIVATE "-w")
target_compile_options(gtest_main PRIVATE "-w")
target_compile_options(gmock PRIVATE "-w")
target_compile_options(gmock_main PRIVATE "-w")
endif()
if(NOT DEFINED GTEST_COMPILE_COMMANDS)
set(GTEST_COMPILE_COMMANDS ON)
endif()
set_target_properties(gtest PROPERTIES INTERFACE_SYSTEM_INCLUDE_DIRECTORIES $<TARGET_PROPERTY:gtest,INTERFACE_INCLUDE_DIRECTORIES> EXPORT_COMPILE_COMMANDS ${GTEST_COMPILE_COMMANDS})
set_target_properties(gtest_main PROPERTIES INTERFACE_SYSTEM_INCLUDE_DIRECTORIES $<TARGET_PROPERTY:gtest_main,INTERFACE_INCLUDE_DIRECTORIES> EXPORT_COMPILE_COMMANDS ${GTEST_COMPILE_COMMANDS})
set_target_properties(gmock PROPERTIES INTERFACE_SYSTEM_INCLUDE_DIRECTORIES $<TARGET_PROPERTY:gmock,INTERFACE_INCLUDE_DIRECTORIES> EXPORT_COMPILE_COMMANDS ${GTEST_COMPILE_COMMANDS})
set_target_properties(gmock_main PROPERTIES INTERFACE_SYSTEM_INCLUDE_DIRECTORIES $<TARGET_PROPERTY:gmock_main,INTERFACE_INCLUDE_DIRECTORIES> EXPORT_COMPILE_COMMANDS ${GTEST_COMPILE_COMMANDS})

View File

@ -31,13 +31,14 @@ if(EXISTS "${GOOGLETEST_PATH}" AND IS_DIRECTORY "${GOOGLETEST_PATH}"
)
else()
if(NOT ALLOW_DOWNLOADING_GOOGLETEST)
message(SEND_ERROR "Did not find Google Test sources! Either pass correct path in GOOGLETEST_PATH, or enable BENCHMARK_DOWNLOAD_DEPENDENCIES, or disable BENCHMARK_ENABLE_GTEST_TESTS / BENCHMARK_ENABLE_TESTING.")
message(SEND_ERROR "Did not find Google Test sources! Either pass correct path in GOOGLETEST_PATH, or enable BENCHMARK_DOWNLOAD_DEPENDENCIES, or disable BENCHMARK_USE_BUNDLED_GTEST, or disable BENCHMARK_ENABLE_GTEST_TESTS / BENCHMARK_ENABLE_TESTING.")
return()
else()
message(WARNING "Did not find Google Test sources! Fetching from web...")
ExternalProject_Add(
googletest
GIT_REPOSITORY https://github.com/google/googletest.git
GIT_TAG master
GIT_TAG "release-1.11.0"
PREFIX "${CMAKE_BINARY_DIR}"
STAMP_DIR "${CMAKE_BINARY_DIR}/stamp"
DOWNLOAD_DIR "${CMAKE_BINARY_DIR}/download"

View File

@ -0,0 +1,28 @@
# If successful, the following variables will be defined:
# PFM_FOUND.
# PFM_LIBRARIES
# PFM_INCLUDE_DIRS
# the following target will be defined:
# PFM::libpfm
include(FeatureSummary)
include(FindPackageHandleStandardArgs)
set_package_properties(PFM PROPERTIES
URL http://perfmon2.sourceforge.net/
DESCRIPTION "A helper library to develop monitoring tools"
PURPOSE "Used to program specific performance monitoring events")
find_library(PFM_LIBRARY NAMES pfm)
find_path(PFM_INCLUDE_DIR NAMES perfmon/pfmlib.h)
find_package_handle_standard_args(PFM REQUIRED_VARS PFM_LIBRARY PFM_INCLUDE_DIR)
if (PFM_FOUND AND NOT TARGET PFM::libpfm)
add_library(PFM::libpfm UNKNOWN IMPORTED)
set_target_properties(PFM::libpfm PROPERTIES
IMPORTED_LOCATION "${PFM_LIBRARY}"
INTERFACE_INCLUDE_DIRECTORIES "${PFM_INCLUDE_DIR}")
endif()
mark_as_advanced(PFM_LIBRARY PFM_INCLUDE_DIR)

View File

@ -1,7 +1,7 @@
prefix=@CMAKE_INSTALL_PREFIX@
exec_prefix=${prefix}
libdir=${prefix}/lib
includedir=${prefix}/include
libdir=@CMAKE_INSTALL_FULL_LIBDIR@
includedir=@CMAKE_INSTALL_FULL_INCLUDEDIR@
Name: @PROJECT_NAME@
Description: Google microbenchmark framework

View File

@ -0,0 +1,16 @@
#include <pthread.h>
int main() {
cpu_set_t set;
CPU_ZERO(&set);
for (int i = 0; i < CPU_SETSIZE; ++i) {
CPU_SET(i, &set);
CPU_CLR(i, &set);
}
pthread_t self = pthread_self();
int ret;
ret = pthread_getaffinity_np(self, sizeof(set), &set);
if (ret != 0) return ret;
ret = pthread_setaffinity_np(self, sizeof(set), &set);
if (ret != 0) return ret;
return 0;
}

View File

@ -1,7 +0,0 @@
cmake_minimum_required(VERSION 2.8.11)
project(cmake_wrapper)
include(conanbuildinfo.cmake)
conan_basic_setup()
include(${CMAKE_SOURCE_DIR}/CMakeListsOriginal.txt)

View File

@ -1,10 +0,0 @@
cmake_minimum_required(VERSION 2.8.11)
project(test_package)
set(CMAKE_VERBOSE_MAKEFILE TRUE)
include(${CMAKE_BINARY_DIR}/conanbuildinfo.cmake)
conan_basic_setup()
add_executable(${PROJECT_NAME} test_package.cpp)
target_link_libraries(${PROJECT_NAME} ${CONAN_LIBS})

View File

@ -1,19 +0,0 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from conans import ConanFile, CMake
import os
class TestPackageConan(ConanFile):
settings = "os", "compiler", "build_type", "arch"
generators = "cmake"
def build(self):
cmake = CMake(self)
cmake.configure()
cmake.build()
def test(self):
bin_path = os.path.join("bin", "test_package")
self.run(bin_path, run_environment=True)

View File

@ -1,18 +0,0 @@
#include "benchmark/benchmark.h"
void BM_StringCreation(benchmark::State& state) {
while (state.KeepRunning())
std::string empty_string;
}
BENCHMARK(BM_StringCreation);
void BM_StringCopy(benchmark::State& state) {
std::string x = "hello";
while (state.KeepRunning())
std::string copy(x);
}
BENCHMARK(BM_StringCopy);
BENCHMARK_MAIN();

View File

@ -1,79 +0,0 @@
from conans import ConanFile, CMake, tools
from conans.errors import ConanInvalidConfiguration
import shutil
import os
class GoogleBenchmarkConan(ConanFile):
name = "benchmark"
description = "A microbenchmark support library."
topics = ("conan", "benchmark", "google", "microbenchmark")
url = "https://github.com/google/benchmark"
homepage = "https://github.com/google/benchmark"
author = "Google Inc."
license = "Apache-2.0"
exports_sources = ["*"]
generators = "cmake"
settings = "arch", "build_type", "compiler", "os"
options = {
"shared": [True, False],
"fPIC": [True, False],
"enable_lto": [True, False],
"enable_exceptions": [True, False]
}
default_options = {"shared": False, "fPIC": True, "enable_lto": False, "enable_exceptions": True}
_build_subfolder = "."
def source(self):
# Wrap the original CMake file to call conan_basic_setup
shutil.move("CMakeLists.txt", "CMakeListsOriginal.txt")
shutil.move(os.path.join("conan", "CMakeLists.txt"), "CMakeLists.txt")
def config_options(self):
if self.settings.os == "Windows":
if self.settings.compiler == "Visual Studio" and float(self.settings.compiler.version.value) <= 12:
raise ConanInvalidConfiguration("{} {} does not support Visual Studio <= 12".format(self.name, self.version))
del self.options.fPIC
def configure(self):
if self.settings.os == "Windows" and self.options.shared:
raise ConanInvalidConfiguration("Windows shared builds are not supported right now, see issue #639")
def _configure_cmake(self):
cmake = CMake(self)
cmake.definitions["BENCHMARK_ENABLE_TESTING"] = "OFF"
cmake.definitions["BENCHMARK_ENABLE_GTEST_TESTS"] = "OFF"
cmake.definitions["BENCHMARK_ENABLE_LTO"] = "ON" if self.options.enable_lto else "OFF"
cmake.definitions["BENCHMARK_ENABLE_EXCEPTIONS"] = "ON" if self.options.enable_exceptions else "OFF"
# See https://github.com/google/benchmark/pull/638 for Windows 32 build explanation
if self.settings.os != "Windows":
cmake.definitions["BENCHMARK_BUILD_32_BITS"] = "ON" if "64" not in str(self.settings.arch) else "OFF"
cmake.definitions["BENCHMARK_USE_LIBCXX"] = "ON" if (str(self.settings.compiler.libcxx) == "libc++") else "OFF"
else:
cmake.definitions["BENCHMARK_USE_LIBCXX"] = "OFF"
cmake.configure(build_folder=self._build_subfolder)
return cmake
def build(self):
cmake = self._configure_cmake()
cmake.build()
def package(self):
cmake = self._configure_cmake()
cmake.install()
self.copy(pattern="LICENSE", dst="licenses")
def package_info(self):
self.cpp_info.libs = tools.collect_libs(self)
if self.settings.os == "Linux":
self.cpp_info.libs.extend(["pthread", "rt"])
elif self.settings.os == "Windows":
self.cpp_info.libs.append("shlwapi")
elif self.settings.os == "SunOS":
self.cpp_info.libs.append("kstat")

View File

@ -1,18 +0,0 @@
# Build tool dependency policy
To ensure the broadest compatibility when building the benchmark library, but
still allow forward progress, we require any build tooling to be available for:
* Debian stable AND
* The last two Ubuntu LTS releases AND
Currently, this means using build tool versions that are available for Ubuntu
16.04 (Xenial), Ubuntu 18.04 (Bionic), and Debian stretch.
_Note, [travis](.travis.yml) runs under Ubuntu 14.04 (Trusty) for linux builds._
## cmake
The current supported version is cmake 3.5.1 as of 2018-06-06.
_Note, this version is also available for Ubuntu 14.04, the previous Ubuntu LTS
release, as `cmake3`._

View File

@ -111,6 +111,7 @@ between compilers or compiler versions. A common example of this
is matching stack frame addresses. In this case regular expressions
can be used to match the differing bits of output. For example:
<!-- {% raw %} -->
```c++
int ExternInt;
struct Point { int x, y, z; };
@ -127,6 +128,7 @@ extern "C" void test_store_point() {
// CHECK: ret
}
```
<!-- {% endraw %} -->
## Current Requirements and Limitations

View File

@ -1 +1,3 @@
theme: jekyll-theme-midnight
theme: jekyll-theme-minimal
logo: /assets/images/icon_black.png
show_downloads: true

BIN
docs/assets/images/icon.png Normal file

Binary file not shown.

BIN
docs/assets/images/icon.xcf Normal file

Binary file not shown.

13
docs/dependencies.md Normal file
View File

@ -0,0 +1,13 @@
# Build tool dependency policy
We follow the [Foundational C++ support policy](https://opensource.google/documentation/policies/cplusplus-support) for our build tools. In
particular the ["Build Systems" section](https://opensource.google/documentation/policies/cplusplus-support#build-systems).
## CMake
The current supported version is CMake 3.10 as of 2023-08-10. Most modern
distributions include newer versions, for example:
* Ubuntu 20.04 provides CMake 3.16.3
* Debian 11.4 provides CMake 3.18.4
* Ubuntu 22.04 provides CMake 3.22.1

12
docs/index.md Normal file
View File

@ -0,0 +1,12 @@
# Benchmark
* [Assembly Tests](AssemblyTests.md)
* [Dependencies](dependencies.md)
* [Perf Counters](perf_counters.md)
* [Platform Specific Build Instructions](platform_specific_build_instructions.md)
* [Python Bindings](python_bindings.md)
* [Random Interleaving](random_interleaving.md)
* [Reducing Variance](reducing_variance.md)
* [Releasing](releasing.md)
* [Tools](tools.md)
* [User Guide](user_guide.md)

35
docs/perf_counters.md Normal file
View File

@ -0,0 +1,35 @@
<a name="perf-counters" />
# User-Requested Performance Counters
When running benchmarks, the user may choose to request collection of
performance counters. This may be useful in investigation scenarios - narrowing
down the cause of a regression; or verifying that the underlying cause of a
performance improvement matches expectations.
This feature is available if:
* The benchmark is run on an architecture featuring a Performance Monitoring
Unit (PMU),
* The benchmark is compiled with support for collecting counters. Currently,
this requires [libpfm](http://perfmon2.sourceforge.net/), which is built as a
dependency via Bazel.
The feature does not require modifying benchmark code. Counter collection is
handled at the boundaries where timer collection is also handled.
To opt-in:
* If using a Bazel build, add `--define pfm=1` to your build flags
* If using CMake:
* Install `libpfm4-dev`, e.g. `apt-get install libpfm4-dev`.
* Enable the CMake flag `BENCHMARK_ENABLE_LIBPFM` in `CMakeLists.txt`.
To use, pass a comma-separated list of counter names through the
`--benchmark_perf_counters` flag. The names are decoded through libpfm, so they
are platform specific; however, some generic names (e.g. `CYCLES` or
`INSTRUCTIONS`) are mapped by libpfm to platform-specific events. See the libpfm
[documentation](http://perfmon2.sourceforge.net/docs.html) for more details.
The counter values are reported back through the [User Counters](../README.md#custom-counters)
mechanism, meaning, they are available in all the formats (e.g. JSON) supported
by User Counters.
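As an illustration (not part of the upstream docs), the sketch below runs a hypothetical `./mybenchmark` binary, built with libpfm support as described above, and reads the collected counters back out of the JSON report; the binary path and the exact counter names are placeholders for your own setup.
```python
import json
import subprocess
# Hypothetical benchmark binary; adjust the path and counter names as needed.
result = subprocess.run(
    [
        "./mybenchmark",
        "--benchmark_perf_counters=CYCLES,INSTRUCTIONS",
        "--benchmark_format=json",
    ],
    check=True,
    capture_output=True,
    text=True,
)
report = json.loads(result.stdout)
for run in report["benchmarks"]:
    # Collected counters are reported alongside the other user counters.
    print(run["name"], run.get("CYCLES"), run.get("INSTRUCTIONS"))
```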

View File

@ -0,0 +1,48 @@
# Platform Specific Build Instructions
## Building with GCC
When the library is built using GCC it is necessary to link with the pthread
library due to how GCC implements `std::thread`. Failing to link to pthread will
lead to runtime exceptions (unless you're using libc++), not linker errors. See
[issue #67](https://github.com/google/benchmark/issues/67) for more details. You
can link to pthread by adding `-pthread` to your linker command. Note, you can
also use `-lpthread`, but there are potential issues with ordering of command
line parameters if you use that.
On QNX, the pthread library is part of libc and usually included automatically
(see
[`pthread_create()`](https://www.qnx.com/developers/docs/7.1/index.html#com.qnx.doc.neutrino.lib_ref/topic/p/pthread_create.html)).
There's no separate pthread library to link.
## Building with Visual Studio 2015 or 2017
The `shlwapi` library (`-lshlwapi`) is required to support a call to `CPUInfo` which reads the registry. Either add `shlwapi.lib` under `[ Configuration Properties > Linker > Input ]`, or use the following:
```
// Alternatively, can add libraries using linker options.
#ifdef _WIN32
#pragma comment ( lib, "Shlwapi.lib" )
#ifdef _DEBUG
#pragma comment ( lib, "benchmarkd.lib" )
#else
#pragma comment ( lib, "benchmark.lib" )
#endif
#endif
```
You can also use the graphical version of CMake:
* Open `CMake GUI`.
* Under `Where to build the binaries`, use the same path as the source plus `build`.
* Under `CMAKE_INSTALL_PREFIX`, use the same path as the source plus `install`.
* Click `Configure`, `Generate`, `Open Project`.
* If the build fails, try deleting the entire directory and starting again, or unticking options to build less.
## Building with Intel 2015 Update 1 or Intel System Studio Update 4
See instructions for building with Visual Studio. Once built, right click on the solution and change the build to Intel.
## Building on Solaris
If you're running benchmarks on Solaris, you'll want the kstat library linked in
too (`-lkstat`).

34
docs/python_bindings.md Normal file
View File

@ -0,0 +1,34 @@
# Building and installing Python bindings
Python bindings are available as wheels on [PyPI](https://pypi.org/project/google-benchmark/) for importing and
using Google Benchmark directly in Python.
Currently, pre-built wheels exist for macOS (both ARM64 and Intel x86), Linux x86-64 and 64-bit Windows.
Supported Python versions are Python 3.7 - 3.10.
To install Google Benchmark's Python bindings, run:
```bash
python -m pip install --upgrade pip # for manylinux2014 support
python -m pip install google-benchmark
```
In order to keep your system Python interpreter clean, it is advisable to run these commands in a virtual
environment. See the [official Python documentation](https://docs.python.org/3/library/venv.html)
on how to create virtual environments.
To build a wheel directly from source, you can follow these steps:
```bash
git clone https://github.com/google/benchmark.git
cd benchmark
# create a virtual environment and activate it
python3 -m venv venv --system-site-packages
source venv/bin/activate # .\venv\Scripts\Activate.ps1 on Windows
# upgrade Python's system-wide packages
python -m pip install --upgrade pip setuptools wheel
# builds the wheel and stores it in the directory "wheelhouse".
python -m pip wheel . -w wheelhouse
```
NB: Building wheels from source requires Bazel. For platform-specific instructions on how to install Bazel,
refer to the [Bazel installation docs](https://bazel.build/install).
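Once installed, the bindings can be exercised with a minimal script along these lines; this is only a sketch modelled on the bundled example, assuming the package imports as `google_benchmark`:
```python
"""Minimal benchmark using the Python bindings; run with: python bm_sum.py"""
import google_benchmark as benchmark
@benchmark.register
def sum_range(state):
    while state:
        sum(range(1_000))
if __name__ == "__main__":
    benchmark.main()
```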

View File

@ -0,0 +1,13 @@
<a name="interleaving" />
# Random Interleaving
[Random Interleaving](https://github.com/google/benchmark/issues/1051) is a
technique to lower run-to-run variance. It randomly interleaves repetitions of a
microbenchmark with repetitions from other microbenchmarks in the same benchmark
test. Data shows it is able to lower run-to-run variance by
[40%](https://github.com/google/benchmark/issues/1051) on average.
To use it, you mainly need to set `--benchmark_enable_random_interleaving=true`;
optionally, specify a non-zero repetition count (`--benchmark_repetitions=9`)
and decrease the per-repetition time (`--benchmark_min_time=0.1`).
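For example, a full invocation combining these flags could look like the sketch below; `./mybenchmark` is a placeholder name, and the Python wrapper is just one convenient way to launch the binary with those flags:
```python
import subprocess
# Placeholder binary name; the flags are the ones described above.
subprocess.run(
    [
        "./mybenchmark",
        "--benchmark_enable_random_interleaving=true",
        "--benchmark_repetitions=9",
        "--benchmark_min_time=0.1",
    ],
    check=True,
)
```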

100
docs/reducing_variance.md Normal file
View File

@ -0,0 +1,100 @@
# Reducing Variance
<a name="disabling-cpu-frequency-scaling" />
## Disabling CPU Frequency Scaling
If you see this error:
```
***WARNING*** CPU scaling is enabled, the benchmark real time measurements may be noisy and will incur extra overhead.
```
you might want to disable the CPU frequency scaling while running the
benchmark, as well as consider other ways to stabilize the performance of
your system while benchmarking.
See [Reducing Variance](reducing_variance.md) for more information.
Exactly how to do this depends on the Linux distribution,
desktop environment, and installed programs. Specific details are a moving
target, so we will not attempt to exhaustively document them here.
One simple option is to use the `cpupower` program to change the
performance governor to "performance". This tool is maintained along with
the Linux kernel and provided by your distribution.
It must be run as root, like this:
```bash
sudo cpupower frequency-set --governor performance
```
After this you can verify that all CPUs are using the performance governor
by running this command:
```bash
cpupower frequency-info -o proc
```
The benchmarks you subsequently run will have less variance.
<a name="reducing-variance" />
## Reducing Variance in Benchmarks
The Linux CPU frequency governor [discussed
above](user_guide#disabling-cpu-frequency-scaling) is not the only source
of noise in benchmarks. Some, but not all, of the sources of variance
include:
1. On multi-core machines not all CPUs/CPU cores/CPU threads run the same
speed, so running a benchmark one time and then again may give a
different result depending on which CPU it ran on.
2. CPU scaling features that run on the CPU, like Intel's Turbo Boost and
AMD Turbo Core and Precision Boost, can temporarily change the CPU
frequency even when using the "performance" governor on Linux.
3. Context switching between CPUs, or scheduling competition on the CPU the
benchmark is running on.
4. Intel Hyperthreading or AMD SMT causing the same issue as above.
5. Cache effects caused by code running on other CPUs.
6. Non-uniform memory architectures (NUMA).
These can cause variance in benchmark results within a single run
(`--benchmark_repetitions=N`) or across multiple runs of the benchmark
program.
Reducing sources of variance is OS and architecture dependent, which is one
reason some companies maintain machines dedicated to performance testing.
Some of the easier and more effective ways of reducing variance on a typical
Linux workstation are:
1. Use the performance governor as [discussed
above](user_guide#disabling-cpu-frequency-scaling).
1. Disable processor boosting by:
```sh
echo 0 | sudo tee /sys/devices/system/cpu/cpufreq/boost
```
See the Linux kernel's
[boost.txt](https://www.kernel.org/doc/Documentation/cpu-freq/boost.txt)
for more information.
2. Set the benchmark program's task affinity to a fixed CPU (a programmatic sketch follows this list). For example:
```sh
taskset -c 0 ./mybenchmark
```
3. Disable Hyperthreading/SMT. This can be done in the BIOS or using the
`/sys` file system (see the LLVM project's [Benchmarking
tips](https://llvm.org/docs/Benchmarking.html)).
4. Close other programs that do non-trivial things based on timers, such as
your web browser, desktop environment, etc.
5. Reduce the working set of your benchmark to fit within the L1 cache, but
do be aware that this may lead you to optimize for an unrealistic
situation.
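As a programmatic variant of tip 2 above (an illustration only, not part of the upstream docs), a Linux-only Python sketch can pin the process, and anything it spawns, to a single CPU:
```python
import os
import subprocess
# Linux-only: pin this process (and its children) to CPU 0, mirroring the
# `taskset -c 0 ./mybenchmark` tip above. The binary name is a placeholder.
os.sched_setaffinity(0, {0})
subprocess.run(["./mybenchmark"], check=True)
```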
Further resources on this topic:
1. The LLVM project's [Benchmarking
tips](https://llvm.org/docs/Benchmarking.html).
1. The Arch Wiki [Cpu frequency
scaling](https://wiki.archlinux.org/title/CPU_frequency_scaling) page.

View File

@ -1,16 +1,41 @@
# How to release
* Make sure you're on master and synced to HEAD
* Ensure the project builds and tests run (sanity check only, obviously)
* Make sure you're on main and synced to HEAD
* Ensure the project builds and tests run
* `parallel -j0 exec ::: test/*_test` can help ensure everything at least
passes
* Prepare release notes
* `git log $(git describe --abbrev=0 --tags)..HEAD` gives you the list of
commits between the last annotated tag and HEAD
* Pick the most interesting.
* Create one last commit that updates the version saved in `CMakeLists.txt`, `MODULE.bazel`
and the `__version__` variable in `bindings/python/google_benchmark/__init__.py` to the
release version you're creating. (This version will be used if benchmark is installed
from the archive you'll be creating in the next step.)
```
project (benchmark VERSION 1.8.0 LANGUAGES CXX)
```
```
module(name = "com_github_google_benchmark", version="1.8.0")
```
```python
# bindings/python/google_benchmark/__init__.py
# ...
__version__ = "1.8.0" # <-- change this to the release version you are creating
# ...
```
* Create a release through GitHub's interface
* Note this will create a lightweight tag.
* Update this to an annotated tag:
* `git pull --tags`
* `git tag -a -f <tag> <tag>`
* `git push --force origin`
* `git push --force --tags origin`
* Confirm that the "Build and upload Python wheels" action runs to completion
* run it manually if it hasn't run

View File

@ -186,6 +186,146 @@ Benchmark Time CPU Time Old
This is a mix of the previous two modes, two (potentially different) benchmark binaries are run, and a different filter is applied to each one.
As you can note, the values in `Time` and `CPU` columns are calculated as `(new - old) / |old|`.
### Note: Interpreting the output
Performance measurements are an art, and performance comparisons are doubly so.
Results are often noisy and don't necessarily have large absolute differences to
them, so just by visual inspection, it is not at all apparent if two
measurements are actually showing a performance change or not. It is even more
confusing with multiple benchmark repetitions.
Thankfully, what we can do is use statistical tests on the results to determine
whether the performance has statistically-significantly changed. `compare.py`
uses [MannWhitney U
test](https://en.wikipedia.org/wiki/Mann%E2%80%93Whitney_U_test), with a null
hypothesis being that there's no difference in performance.
**The below output is a summary of a benchmark comparison with statistics
provided for a multi-threaded process.**
```
Benchmark Time CPU Time Old Time New CPU Old CPU New
-----------------------------------------------------------------------------------------------------------------------------
benchmark/threads:1/process_time/real_time_pvalue 0.0000 0.0000 U Test, Repetitions: 27 vs 27
benchmark/threads:1/process_time/real_time_mean -0.1442 -0.1442 90 77 90 77
benchmark/threads:1/process_time/real_time_median -0.1444 -0.1444 90 77 90 77
benchmark/threads:1/process_time/real_time_stddev +0.3974 +0.3933 0 0 0 0
benchmark/threads:1/process_time/real_time_cv +0.6329 +0.6280 0 0 0 0
OVERALL_GEOMEAN -0.1442 -0.1442 0 0 0 0
```
--------------------------------------------
Here's a breakdown of each row:
**benchmark/threads:1/process_time/real_time_pvalue**: This shows the _p-value_ for
the statistical test comparing the performance of the process running with one
thread. A value of 0.0000 suggests a statistically significant difference in
performance. The comparison was conducted using the U Test (Mann-Whitney
U Test) with 27 repetitions for each case.
**benchmark/threads:1/process_time/real_time_mean**: This shows the relative
difference in mean execution time between two different cases. The negative
value (-0.1442) implies that the new process is faster by about 14.42%. The old
time was 90 units, while the new time is 77 units.
**benchmark/threads:1/process_time/real_time_median**: Similarly, this shows the
relative difference in the median execution time. Again, the new process is
faster by 14.44%.
**benchmark/threads:1/process_time/real_time_stddev**: This is the relative
difference in the standard deviation of the execution time, which is a measure
of how much variation or dispersion there is from the mean. A positive value
(+0.3974) implies there is more variance in the execution time in the new
process.
**benchmark/threads:1/process_time/real_time_cv**: CV stands for Coefficient of
Variation. It is the ratio of the standard deviation to the mean. It provides a
standardized measure of dispersion. An increase (+0.6329) indicates more
relative variability in the new process.
**OVERALL_GEOMEAN**: Geomean stands for geometric mean, a type of average that is
less influenced by outliers. The negative value indicates a general improvement
in the new process. However, given the values are all zero for the old and new
times, this seems to be a mistake or placeholder in the output.
-----------------------------------------
Let's first try to see what the different columns represent in the above
`compare.py` benchmarking output:
1. **Benchmark:** The name of the function being benchmarked, along with the
size of the input (after the slash).
2. **Time:** The average time per operation, across all iterations.
3. **CPU:** The average CPU time per operation, across all iterations.
4. **Iterations:** The number of iterations the benchmark was run to get a
stable estimate.
5. **Time Old and Time New:** These represent the average time it takes for a
function to run in two different scenarios or versions. For example, you
might be comparing how fast a function runs before and after you make some
changes to it.
6. **CPU Old and CPU New:** These show the average amount of CPU time that the
function uses in two different scenarios or versions. This is similar to
Time Old and Time New, but focuses on CPU usage instead of overall time.
In the comparison section, the relative differences in both time and CPU time
are displayed for each input size.
A statistically-significant difference is determined by a **p-value**, which is
a measure of the probability that the observed difference could have occurred
just by random chance. A smaller p-value indicates stronger evidence against the
null hypothesis.
**Therefore:**
1. If the p-value is less than the chosen significance level (alpha), we
reject the null hypothesis and conclude the benchmarks are significantly
different.
2. If the p-value is greater than or equal to alpha, we fail to reject the
null hypothesis and treat the two benchmarks as similar.
The result of said statistical test is additionally communicated through color coding:
```diff
+ Green:
```
The benchmarks are _**statistically different**_. This could mean the
performance has either **significantly improved** or **significantly
deteriorated**. You should look at the actual performance numbers to see which
is the case.
```diff
- Red:
```
The benchmarks are _**statistically similar**_. This means the performance
**hasn't significantly changed**.
In statistical terms, **'green'** means we reject the null hypothesis that
there's no difference in performance, and **'red'** means we fail to reject the
null hypothesis. This might seem counter-intuitive if you're expecting 'green'
to mean 'improved performance' and 'red' to mean 'worsened performance'.
```bash
But remember, in this context:
'Success' means 'successfully finding a difference'.
'Failure' means 'failing to find a difference'.
```
Also, please note that **even if** we determine that there **is** a
statistically-significant difference between the two measurements, it does not
_necessarily_ mean that the actual benchmarks that were measured **are**
different, or vice versa, even if we determine that there is **no**
statistically-significant difference between the two measurements, it does not
necessarily mean that the actual benchmarks that were measured **are not**
different.
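To make the mechanics concrete, the following sketch reproduces the same kind of test on made-up repetition timings, assuming SciPy is available; it is an illustration, not compare.py's actual implementation:
```python
from scipy.stats import mannwhitneyu
# Hypothetical per-repetition timings for the old and new binaries.
time_old = [90, 91, 89, 92, 90, 90, 91, 89, 90]
time_new = [77, 78, 76, 77, 77, 78, 76, 77, 77]
stat, p_value = mannwhitneyu(time_old, time_new)
alpha = 0.05
if p_value < alpha:
    print(f"p={p_value:.4f}: reject the null hypothesis (statistically different)")
else:
    print(f"p={p_value:.4f}: fail to reject (treat as similar)")
```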
### U test
If there is a sufficient repetition count of the benchmarks, the tool can do

1266
docs/user_guide.md Normal file

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@ -0,0 +1,47 @@
#ifndef BENCHMARK_EXPORT_H
#define BENCHMARK_EXPORT_H
#if defined(_WIN32)
#define EXPORT_ATTR __declspec(dllexport)
#define IMPORT_ATTR __declspec(dllimport)
#define NO_EXPORT_ATTR
#define DEPRECATED_ATTR __declspec(deprecated)
#else // _WIN32
#define EXPORT_ATTR __attribute__((visibility("default")))
#define IMPORT_ATTR __attribute__((visibility("default")))
#define NO_EXPORT_ATTR __attribute__((visibility("hidden")))
#define DEPRECATE_ATTR __attribute__((__deprecated__))
#endif // _WIN32
#ifdef BENCHMARK_STATIC_DEFINE
#define BENCHMARK_EXPORT
#define BENCHMARK_NO_EXPORT
#else // BENCHMARK_STATIC_DEFINE
#ifndef BENCHMARK_EXPORT
#ifdef benchmark_EXPORTS
/* We are building this library */
#define BENCHMARK_EXPORT EXPORT_ATTR
#else // benchmark_EXPORTS
/* We are using this library */
#define BENCHMARK_EXPORT IMPORT_ATTR
#endif // benchmark_EXPORTS
#endif // !BENCHMARK_EXPORT
#ifndef BENCHMARK_NO_EXPORT
#define BENCHMARK_NO_EXPORT NO_EXPORT_ATTR
#endif // !BENCHMARK_NO_EXPORT
#endif // BENCHMARK_STATIC_DEFINE
#ifndef BENCHMARK_DEPRECATED
#define BENCHMARK_DEPRECATED DEPRECATE_ATTR
#endif // BENCHMARK_DEPRECATED
#ifndef BENCHMARK_DEPRECATED_EXPORT
#define BENCHMARK_DEPRECATED_EXPORT BENCHMARK_EXPORT BENCHMARK_DEPRECATED
#endif // BENCHMARK_DEPRECATED_EXPORT
#ifndef BENCHMARK_DEPRECATED_NO_EXPORT
#define BENCHMARK_DEPRECATED_NO_EXPORT BENCHMARK_NO_EXPORT BENCHMARK_DEPRECATED
#endif // BENCHMARK_DEPRECATED_EXPORT
#endif /* BENCHMARK_EXPORT_H */

50
pyproject.toml Normal file
View File

@ -0,0 +1,50 @@
[build-system]
requires = ["setuptools", "wheel"]
build-backend = "setuptools.build_meta"
[project]
name = "google_benchmark"
description = "A library to benchmark code snippets."
requires-python = ">=3.8"
license = {file = "LICENSE"}
keywords = ["benchmark"]
authors = [
{name = "Google", email = "benchmark-discuss@googlegroups.com"},
]
classifiers = [
"Development Status :: 4 - Beta",
"Intended Audience :: Developers",
"Intended Audience :: Science/Research",
"License :: OSI Approved :: Apache Software License",
"Programming Language :: Python :: 3.8",
"Programming Language :: Python :: 3.9",
"Programming Language :: Python :: 3.10",
"Programming Language :: Python :: 3.11",
"Topic :: Software Development :: Testing",
"Topic :: System :: Benchmark",
]
dynamic = ["readme", "version"]
dependencies = [
"absl-py>=0.7.1",
]
[project.urls]
Homepage = "https://github.com/google/benchmark"
Documentation = "https://github.com/google/benchmark/tree/main/docs"
Repository = "https://github.com/google/benchmark.git"
Discord = "https://discord.gg/cz7UX7wKC2"
[tool.setuptools]
package-dir = {"" = "bindings/python"}
zip-safe = false
[tool.setuptools.packages.find]
where = ["bindings/python"]
[tool.setuptools.dynamic]
version = { attr = "google_benchmark.__version__" }
readme = { file = "README.md", content-type = "text/markdown" }

178
setup.py
View File

@ -1,125 +1,113 @@
import contextlib
import os
import posixpath
import re
import platform
import shutil
import sys
import sysconfig
from pathlib import Path
from distutils import sysconfig
import setuptools
from setuptools.command import build_ext
here = os.path.dirname(os.path.abspath(__file__))
PYTHON_INCLUDE_PATH_PLACEHOLDER = "<PYTHON_INCLUDE_PATH>"
IS_WINDOWS = platform.system() == "Windows"
IS_MAC = platform.system() == "Darwin"
IS_WINDOWS = sys.platform.startswith('win')
def _get_version():
"""Parse the version string from __init__.py."""
with open(os.path.join(here, 'bindings', 'python', 'google_benchmark', '__init__.py')) as f:
try:
version_line = next(
line for line in f if line.startswith('__version__'))
except StopIteration:
raise ValueError('__version__ not defined in __init__.py')
else:
ns = {}
exec(version_line, ns) # pylint: disable=exec-used
return ns['__version__']
def _parse_requirements(path):
with open(os.path.join(here, path)) as f:
return [
line.rstrip() for line in f
if not (line.isspace() or line.startswith('#'))
]
@contextlib.contextmanager
def temp_fill_include_path(fp: str):
"""Temporarily set the Python include path in a file."""
with open(fp, "r+") as f:
try:
content = f.read()
replaced = content.replace(
PYTHON_INCLUDE_PATH_PLACEHOLDER,
Path(sysconfig.get_paths()['include']).as_posix(),
)
f.seek(0)
f.write(replaced)
f.truncate()
yield
finally:
# revert to the original content after exit
f.seek(0)
f.write(content)
f.truncate()
class BazelExtension(setuptools.Extension):
"""A C/C++ extension that is defined as a Bazel BUILD target."""
"""A C/C++ extension that is defined as a Bazel BUILD target."""
def __init__(self, name, bazel_target):
self.bazel_target = bazel_target
self.relpath, self.target_name = (
posixpath.relpath(bazel_target, '//').split(':'))
setuptools.Extension.__init__(self, name, sources=[])
def __init__(self, name: str, bazel_target: str):
super().__init__(name=name, sources=[])
self.bazel_target = bazel_target
stripped_target = bazel_target.split("//")[-1]
self.relpath, self.target_name = stripped_target.split(":")
class BuildBazelExtension(build_ext.build_ext):
"""A command that runs Bazel to build a C/C++ extension."""
"""A command that runs Bazel to build a C/C++ extension."""
def run(self):
for ext in self.extensions:
self.bazel_build(ext)
build_ext.build_ext.run(self)
def run(self):
for ext in self.extensions:
self.bazel_build(ext)
build_ext.build_ext.run(self)
def bazel_build(self, ext):
with open('WORKSPACE', 'r') as f:
workspace_contents = f.read()
def bazel_build(self, ext: BazelExtension):
"""Runs the bazel build to create the package."""
with temp_fill_include_path("WORKSPACE"):
temp_path = Path(self.build_temp)
with open('WORKSPACE', 'w') as f:
f.write(re.sub(
r'(?<=path = ").*(?=", # May be overwritten by setup\.py\.)',
sysconfig.get_python_inc().replace(os.path.sep, posixpath.sep),
workspace_contents))
bazel_argv = [
"bazel",
"build",
ext.bazel_target,
f"--symlink_prefix={temp_path / 'bazel-'}",
f"--compilation_mode={'dbg' if self.debug else 'opt'}",
# C++17 is required by nanobind
f"--cxxopt={'/std:c++17' if IS_WINDOWS else '-std=c++17'}",
]
if not os.path.exists(self.build_temp):
os.makedirs(self.build_temp)
if IS_WINDOWS:
# Link with python*.lib.
for library_dir in self.library_dirs:
bazel_argv.append("--linkopt=/LIBPATH:" + library_dir)
elif IS_MAC:
if platform.machine() == "x86_64":
# C++17 needs macOS 10.14 at minimum
bazel_argv.append("--macos_minimum_os=10.14")
bazel_argv = [
'bazel',
'build',
ext.bazel_target,
'--symlink_prefix=' + os.path.join(self.build_temp, 'bazel-'),
'--compilation_mode=' + ('dbg' if self.debug else 'opt'),
]
# cross-compilation for Mac ARM64 on GitHub Mac x86 runners.
# ARCHFLAGS is set by cibuildwheel before macOS wheel builds.
archflags = os.getenv("ARCHFLAGS", "")
if "arm64" in archflags:
bazel_argv.append("--cpu=darwin_arm64")
bazel_argv.append("--macos_cpus=arm64")
if IS_WINDOWS:
# Link with python*.lib.
for library_dir in self.library_dirs:
bazel_argv.append('--linkopt=/LIBPATH:' + library_dir)
elif platform.machine() == "arm64":
bazel_argv.append("--macos_minimum_os=11.0")
self.spawn(bazel_argv)
self.spawn(bazel_argv)
shared_lib_suffix = '.dll' if IS_WINDOWS else '.so'
ext_bazel_bin_path = os.path.join(
self.build_temp, 'bazel-bin',
ext.relpath, ext.target_name + shared_lib_suffix)
ext_dest_path = self.get_ext_fullpath(ext.name)
ext_dest_dir = os.path.dirname(ext_dest_path)
if not os.path.exists(ext_dest_dir):
os.makedirs(ext_dest_dir)
shutil.copyfile(ext_bazel_bin_path, ext_dest_path)
shared_lib_suffix = '.dll' if IS_WINDOWS else '.so'
ext_name = ext.target_name + shared_lib_suffix
ext_bazel_bin_path = temp_path / 'bazel-bin' / ext.relpath / ext_name
ext_dest_path = Path(self.get_ext_fullpath(ext.name))
shutil.copyfile(ext_bazel_bin_path, ext_dest_path)
# explicitly call `bazel shutdown` for graceful exit
self.spawn(["bazel", "shutdown"])
setuptools.setup(
name='google_benchmark',
version=_get_version(),
url='https://github.com/google/benchmark',
description='A library to benchmark code snippets.',
author='Google',
author_email='benchmark-py@google.com',
# Contained modules and scripts.
package_dir={'': 'bindings/python'},
packages=setuptools.find_packages('bindings/python'),
install_requires=_parse_requirements('bindings/python/requirements.txt'),
cmdclass=dict(build_ext=BuildBazelExtension),
ext_modules=[BazelExtension('google_benchmark._benchmark', '//bindings/python/google_benchmark:_benchmark')],
zip_safe=False,
# PyPI package information.
classifiers=[
'Development Status :: 4 - Beta',
'Intended Audience :: Developers',
'Intended Audience :: Science/Research',
'License :: OSI Approved :: Apache Software License',
'Programming Language :: Python :: 3.6',
'Programming Language :: Python :: 3.7',
'Programming Language :: Python :: 3.8',
'Topic :: Software Development :: Testing',
'Topic :: System :: Benchmark',
ext_modules=[
BazelExtension(
name="google_benchmark._benchmark",
bazel_target="//bindings/python/google_benchmark:_benchmark",
)
],
license='Apache 2.0',
keywords='benchmark',
)

View File

@ -25,32 +25,42 @@ set_target_properties(benchmark PROPERTIES
SOVERSION ${GENERIC_LIB_SOVERSION}
)
target_include_directories(benchmark PUBLIC
$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/../include>
)
$<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}/include>
)
# libpfm, if available
if (PFM_FOUND)
target_link_libraries(benchmark PRIVATE PFM::libpfm)
target_compile_definitions(benchmark PRIVATE -DHAVE_LIBPFM)
endif()
# pthread affinity, if available
if(HAVE_PTHREAD_AFFINITY)
target_compile_definitions(benchmark PRIVATE -DBENCHMARK_HAS_PTHREAD_AFFINITY)
endif()
# Link threads.
target_link_libraries(benchmark ${BENCHMARK_CXX_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT})
find_library(LIBRT rt)
if(LIBRT)
target_link_libraries(benchmark ${LIBRT})
endif()
target_link_libraries(benchmark PRIVATE Threads::Threads)
target_link_libraries(benchmark PRIVATE ${BENCHMARK_CXX_LIBRARIES})
if(HAVE_LIB_RT)
target_link_libraries(benchmark PRIVATE rt)
endif(HAVE_LIB_RT)
if(CMAKE_BUILD_TYPE)
string(TOUPPER ${CMAKE_BUILD_TYPE} CMAKE_BUILD_TYPE_UPPER)
endif()
if(NOT CMAKE_THREAD_LIBS_INIT AND "${CMAKE_CXX_FLAGS} ${CMAKE_CXX_FLAGS_${CMAKE_BUILD_TYPE_UPPER}}" MATCHES ".*-fsanitize=[^ ]*address.*")
message(WARNING "CMake's FindThreads.cmake did not fail, but CMAKE_THREAD_LIBS_INIT ended up being empty. This was fixed in https://github.com/Kitware/CMake/commit/d53317130e84898c5328c237186dbd995aaf1c12 Let's guess that -pthread is sufficient.")
target_link_libraries(benchmark -pthread)
endif()
# We need extra libraries on Windows
if(${CMAKE_SYSTEM_NAME} MATCHES "Windows")
target_link_libraries(benchmark shlwapi)
target_link_libraries(benchmark PRIVATE shlwapi)
endif()
# We need extra libraries on Solaris
if(${CMAKE_SYSTEM_NAME} MATCHES "SunOS")
target_link_libraries(benchmark kstat)
target_link_libraries(benchmark PRIVATE kstat)
endif()
if (NOT BUILD_SHARED_LIBS)
target_compile_definitions(benchmark PUBLIC -DBENCHMARK_STATIC_DEFINE)
endif()
# Benchmark main library
@ -60,34 +70,45 @@ set_target_properties(benchmark_main PROPERTIES
OUTPUT_NAME "benchmark_main"
VERSION ${GENERIC_LIB_VERSION}
SOVERSION ${GENERIC_LIB_SOVERSION}
DEFINE_SYMBOL benchmark_EXPORTS
)
target_include_directories(benchmark PUBLIC
$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/../include>
)
target_link_libraries(benchmark_main benchmark::benchmark)
target_link_libraries(benchmark_main PUBLIC benchmark::benchmark)
set(generated_dir "${CMAKE_CURRENT_BINARY_DIR}/generated")
set(generated_dir "${PROJECT_BINARY_DIR}")
set(version_config "${generated_dir}/${PROJECT_NAME}ConfigVersion.cmake")
set(project_config "${generated_dir}/${PROJECT_NAME}Config.cmake")
set(pkg_config "${generated_dir}/${PROJECT_NAME}.pc")
set(targets_to_export benchmark benchmark_main)
set(targets_export_name "${PROJECT_NAME}Targets")
set(namespace "${PROJECT_NAME}::")
include(CMakePackageConfigHelpers)
configure_package_config_file (
${PROJECT_SOURCE_DIR}/cmake/Config.cmake.in
${project_config}
INSTALL_DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/${PROJECT_NAME}
NO_SET_AND_CHECK_MACRO
NO_CHECK_REQUIRED_COMPONENTS_MACRO
)
write_basic_package_version_file(
"${version_config}" VERSION ${GENERIC_LIB_VERSION} COMPATIBILITY SameMajorVersion
)
configure_file("${PROJECT_SOURCE_DIR}/cmake/Config.cmake.in" "${project_config}" @ONLY)
configure_file("${PROJECT_SOURCE_DIR}/cmake/benchmark.pc.in" "${pkg_config}" @ONLY)
export (
TARGETS ${targets_to_export}
NAMESPACE "${namespace}"
FILE ${generated_dir}/${targets_export_name}.cmake
)
if (BENCHMARK_ENABLE_INSTALL)
# Install target (will install the library to specified CMAKE_INSTALL_PREFIX variable)
install(
TARGETS benchmark benchmark_main
TARGETS ${targets_to_export}
EXPORT ${targets_export_name}
ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
@ -96,6 +117,7 @@ if (BENCHMARK_ENABLE_INSTALL)
install(
DIRECTORY "${PROJECT_SOURCE_DIR}/include/benchmark"
"${PROJECT_BINARY_DIR}/include/benchmark"
DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}
FILES_MATCHING PATTERN "*.*h")
@ -112,3 +134,37 @@ if (BENCHMARK_ENABLE_INSTALL)
NAMESPACE "${namespace}"
DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake/${PROJECT_NAME}")
endif()
if (BENCHMARK_ENABLE_DOXYGEN)
find_package(Doxygen REQUIRED)
set(DOXYGEN_QUIET YES)
set(DOXYGEN_RECURSIVE YES)
set(DOXYGEN_GENERATE_HTML YES)
set(DOXYGEN_GENERATE_MAN NO)
set(DOXYGEN_MARKDOWN_SUPPORT YES)
set(DOXYGEN_BUILTIN_STL_SUPPORT YES)
set(DOXYGEN_EXTRACT_PACKAGE YES)
set(DOXYGEN_EXTRACT_STATIC YES)
set(DOXYGEN_SHOW_INCLUDE_FILES YES)
set(DOXYGEN_BINARY_TOC YES)
set(DOXYGEN_TOC_EXPAND YES)
set(DOXYGEN_USE_MDFILE_AS_MAINPAGE "index.md")
doxygen_add_docs(benchmark_doxygen
docs
include
src
ALL
WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}
COMMENT "Building documentation with Doxygen.")
if (BENCHMARK_ENABLE_INSTALL AND BENCHMARK_INSTALL_DOCS)
install(
DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/html/"
DESTINATION ${CMAKE_INSTALL_DOCDIR})
endif()
else()
if (BENCHMARK_ENABLE_INSTALL AND BENCHMARK_INSTALL_DOCS)
install(
DIRECTORY "${PROJECT_SOURCE_DIR}/docs/"
DESTINATION ${CMAKE_INSTALL_DOCDIR})
endif()
endif()

View File

@ -13,12 +13,13 @@
// limitations under the License.
#include "benchmark/benchmark.h"
#include "benchmark_api_internal.h"
#include "benchmark_runner.h"
#include "internal_macros.h"
#ifndef BENCHMARK_OS_WINDOWS
#ifndef BENCHMARK_OS_FUCHSIA
#if !defined(BENCHMARK_OS_FUCHSIA) && !defined(BENCHMARK_OS_QURT)
#include <sys/resource.h>
#endif
#include <sys/time.h>
@ -32,7 +33,10 @@
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <limits>
#include <map>
#include <memory>
#include <random>
#include <string>
#include <thread>
#include <utility>
@ -45,94 +49,146 @@
#include "internal_macros.h"
#include "log.h"
#include "mutex.h"
#include "perf_counters.h"
#include "re.h"
#include "statistics.h"
#include "string_util.h"
#include "thread_manager.h"
#include "thread_timer.h"
namespace benchmark {
// Print a list of benchmarks. This option overrides all other options.
DEFINE_bool(benchmark_list_tests, false);
BM_DEFINE_bool(benchmark_list_tests, false);
// A regular expression that specifies the set of benchmarks to execute. If
// this flag is empty, or if this flag is the string \"all\", all benchmarks
// linked into the binary are run.
DEFINE_string(benchmark_filter, ".");
BM_DEFINE_string(benchmark_filter, "");
// Minimum number of seconds we should run benchmark before results are
// considered significant. For cpu-time based tests, this is the lower bound
// Specification of how long to run the benchmark.
//
// It can be either an exact number of iterations (specified as `<integer>x`),
// or a minimum number of seconds (specified as `<float>s`). If the latter
// format (ie., min seconds) is used, the system may run the benchmark longer
// until the results are considered significant.
//
// For backward compatibility, the `s` suffix may be omitted, in which case,
// the specified number is interpreted as the number of seconds.
//
// For cpu-time based tests, this is the lower bound
// on the total cpu time used by all threads that make up the test. For
// real-time based tests, this is the lower bound on the elapsed time of the
// benchmark execution, regardless of number of threads.
DEFINE_double(benchmark_min_time, 0.5);
BM_DEFINE_string(benchmark_min_time, kDefaultMinTimeStr);
// Minimum number of seconds a benchmark should be run before results should be
// taken into account. This e.g can be necessary for benchmarks of code which
// needs to fill some form of cache before performance is of interest.
// Note: results gathered within this period are discarded and not used for
// reported result.
BM_DEFINE_double(benchmark_min_warmup_time, 0.0);
// The number of runs of each benchmark. If greater than 1, the mean and
// standard deviation of the runs will be reported.
DEFINE_int32(benchmark_repetitions, 1);
BM_DEFINE_int32(benchmark_repetitions, 1);
// If set, enable random interleaving of repetitions of all benchmarks.
// See http://github.com/google/benchmark/issues/1051 for details.
BM_DEFINE_bool(benchmark_enable_random_interleaving, false);
// Report the result of each benchmark repetitions. When 'true' is specified
// only the mean, standard deviation, and other statistics are reported for
// repeated benchmarks. Affects all reporters.
DEFINE_bool(benchmark_report_aggregates_only, false);
BM_DEFINE_bool(benchmark_report_aggregates_only, false);
// Display the result of each benchmark repetitions. When 'true' is specified
// only the mean, standard deviation, and other statistics are displayed for
// repeated benchmarks. Unlike benchmark_report_aggregates_only, only affects
// the display reporter, but *NOT* file reporter, which will still contain
// all the output.
DEFINE_bool(benchmark_display_aggregates_only, false);
BM_DEFINE_bool(benchmark_display_aggregates_only, false);
// The format to use for console output.
// Valid values are 'console', 'json', or 'csv'.
DEFINE_string(benchmark_format, "console");
BM_DEFINE_string(benchmark_format, "console");
// The format to use for file output.
// Valid values are 'console', 'json', or 'csv'.
DEFINE_string(benchmark_out_format, "json");
BM_DEFINE_string(benchmark_out_format, "json");
// The file to write additional output to.
DEFINE_string(benchmark_out, "");
BM_DEFINE_string(benchmark_out, "");
// Whether to use colors in the output. Valid values:
// 'true'/'yes'/1, 'false'/'no'/0, and 'auto'. 'auto' means to use colors if
// the output is being sent to a terminal and the TERM environment variable is
// set to a terminal type that supports colors.
DEFINE_string(benchmark_color, "auto");
BM_DEFINE_string(benchmark_color, "auto");
// Whether to use tabular format when printing user counters to the console.
// Valid values: 'true'/'yes'/1, 'false'/'no'/0. Defaults to false.
DEFINE_bool(benchmark_counters_tabular, false);
BM_DEFINE_bool(benchmark_counters_tabular, false);
// List of additional perf counters to collect, in libpfm format. For more
// information about libpfm: https://man7.org/linux/man-pages/man3/libpfm.3.html
BM_DEFINE_string(benchmark_perf_counters, "");
// Extra context to include in the output formatted as comma-separated key-value
// pairs. Kept internal as it's only used for parsing from env/command line.
BM_DEFINE_kvpairs(benchmark_context, {});
// Set the default time unit to use for reports
// Valid values are 'ns', 'us', 'ms' or 's'
BM_DEFINE_string(benchmark_time_unit, "");
// The level of verbose logging to output
DEFINE_int32(v, 0);
namespace benchmark {
BM_DEFINE_int32(v, 0);
namespace internal {
std::map<std::string, std::string>* global_context = nullptr;
BENCHMARK_EXPORT std::map<std::string, std::string>*& GetGlobalContext() {
return global_context;
}
// FIXME: wouldn't LTO mess this up?
void UseCharPointer(char const volatile*) {}
} // namespace internal
State::State(IterationCount max_iters, const std::vector<int64_t>& ranges,
int thread_i, int n_threads, internal::ThreadTimer* timer,
internal::ThreadManager* manager)
State::State(std::string name, IterationCount max_iters,
const std::vector<int64_t>& ranges, int thread_i, int n_threads,
internal::ThreadTimer* timer, internal::ThreadManager* manager,
internal::PerfCountersMeasurement* perf_counters_measurement)
: total_iterations_(0),
batch_leftover_(0),
max_iterations(max_iters),
started_(false),
finished_(false),
error_occurred_(false),
skipped_(internal::NotSkipped),
range_(ranges),
complexity_n_(0),
counters(),
thread_index(thread_i),
threads(n_threads),
name_(std::move(name)),
thread_index_(thread_i),
threads_(n_threads),
timer_(timer),
manager_(manager) {
CHECK(max_iterations != 0) << "At least one iteration must be run";
CHECK_LT(thread_index, threads) << "thread_index must be less than threads";
manager_(manager),
perf_counters_measurement_(perf_counters_measurement) {
BM_CHECK(max_iterations != 0) << "At least one iteration must be run";
BM_CHECK_LT(thread_index_, threads_)
<< "thread_index must be less than threads";
// Add counters with correct flag now. If added with `counters[name]` in
// `PauseTiming`, a new `Counter` will be inserted the first time, which
// won't have the flag. Inserting them now also reduces the allocations
// during the benchmark.
if (perf_counters_measurement_) {
for (const std::string& counter_name :
perf_counters_measurement_->names()) {
counters[counter_name] = Counter(0.0, Counter::kAvgIterations);
}
}
// Note: The use of offsetof below is technically undefined until C++17
// because State is not a standard layout type. However, all compilers
@ -146,38 +202,79 @@ State::State(IterationCount max_iters, const std::vector<int64_t>& ranges,
#elif defined(__GNUC__)
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Winvalid-offsetof"
#endif
#if defined(__NVCC__)
#pragma nv_diagnostic push
#pragma nv_diag_suppress 1427
#endif
#if defined(__NVCOMPILER)
#pragma diagnostic push
#pragma diag_suppress offset_in_non_POD_nonstandard
#endif
// Offset tests to ensure commonly accessed data is on the first cache line.
const int cache_line_size = 64;
static_assert(offsetof(State, error_occurred_) <=
(cache_line_size - sizeof(error_occurred_)),
"");
static_assert(
offsetof(State, skipped_) <= (cache_line_size - sizeof(skipped_)), "");
#if defined(__INTEL_COMPILER)
#pragma warning pop
#elif defined(__GNUC__)
#pragma GCC diagnostic pop
#endif
#if defined(__NVCC__)
#pragma nv_diagnostic pop
#endif
#if defined(__NVCOMPILER)
#pragma diagnostic pop
#endif
}
void State::PauseTiming() {
// Add in time accumulated so far
CHECK(started_ && !finished_ && !error_occurred_);
BM_CHECK(started_ && !finished_ && !skipped());
timer_->StopTimer();
if (perf_counters_measurement_) {
std::vector<std::pair<std::string, double>> measurements;
if (!perf_counters_measurement_->Stop(measurements)) {
BM_CHECK(false) << "Perf counters read the value failed.";
}
for (const auto& name_and_measurement : measurements) {
const std::string& name = name_and_measurement.first;
const double measurement = name_and_measurement.second;
// Counter was inserted with `kAvgIterations` flag by the constructor.
assert(counters.find(name) != counters.end());
counters[name].value += measurement;
}
}
}
void State::ResumeTiming() {
CHECK(started_ && !finished_ && !error_occurred_);
BM_CHECK(started_ && !finished_ && !skipped());
timer_->StartTimer();
if (perf_counters_measurement_) {
perf_counters_measurement_->Start();
}
}
void State::SkipWithError(const char* msg) {
CHECK(msg);
error_occurred_ = true;
void State::SkipWithMessage(const std::string& msg) {
skipped_ = internal::SkippedWithMessage;
{
MutexLock l(manager_->GetBenchmarkMutex());
if (manager_->results.has_error_ == false) {
manager_->results.error_message_ = msg;
manager_->results.has_error_ = true;
if (internal::NotSkipped == manager_->results.skipped_) {
manager_->results.skip_message_ = msg;
manager_->results.skipped_ = skipped_;
}
}
total_iterations_ = 0;
if (timer_->running()) timer_->StopTimer();
}
void State::SkipWithError(const std::string& msg) {
skipped_ = internal::SkippedWithError;
{
MutexLock l(manager_->GetBenchmarkMutex());
if (internal::NotSkipped == manager_->results.skipped_) {
manager_->results.skip_message_ = msg;
manager_->results.skipped_ = skipped_;
}
}
total_iterations_ = 0;
@ -188,22 +285,22 @@ void State::SetIterationTime(double seconds) {
timer_->SetIterationTime(seconds);
}
void State::SetLabel(const char* label) {
void State::SetLabel(const std::string& label) {
MutexLock l(manager_->GetBenchmarkMutex());
manager_->results.report_label_ = label;
}
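For illustration, a minimal sketch of a benchmark using the std::string overloads above; BM_ParseConfig and the input file name are hypothetical:
#include <fstream>
#include "benchmark/benchmark.h"
static void BM_ParseConfig(benchmark::State& state) {
  std::ifstream in("config.json");  // hypothetical input file
  if (!in) {
    state.SkipWithError("could not open config.json");
    return;  // return without entering the measurement loop after skipping
  }
  for (auto _ : state) {
    // ... parsing work under test ...
  }
  state.SetLabel("parsed OK");
}
BENCHMARK(BM_ParseConfig);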
void State::StartKeepRunning() {
CHECK(!started_ && !finished_);
BM_CHECK(!started_ && !finished_);
started_ = true;
total_iterations_ = error_occurred_ ? 0 : max_iterations;
total_iterations_ = skipped() ? 0 : max_iterations;
manager_->StartStopBarrier();
if (!error_occurred_) ResumeTiming();
if (!skipped()) ResumeTiming();
}
void State::FinishKeepRunning() {
CHECK(started_ && (!finished_ || error_occurred_));
if (!error_occurred_) {
BM_CHECK(started_ && (!finished_ || skipped()));
if (!skipped()) {
PauseTiming();
}
// Total iterations has now wrapped around past 0. Fix this.
@ -215,11 +312,42 @@ void State::FinishKeepRunning() {
namespace internal {
namespace {
// Flushes streams after invoking reporter methods that write to them. This
// ensures users get timely updates even when streams are not line-buffered.
void FlushStreams(BenchmarkReporter* reporter) {
if (!reporter) return;
std::flush(reporter->GetOutputStream());
std::flush(reporter->GetErrorStream());
}
// Reports in both display and file reporters.
void Report(BenchmarkReporter* display_reporter,
BenchmarkReporter* file_reporter, const RunResults& run_results) {
auto report_one = [](BenchmarkReporter* reporter, bool aggregates_only,
const RunResults& results) {
assert(reporter);
// If there are no aggregates, do output non-aggregates.
aggregates_only &= !results.aggregates_only.empty();
if (!aggregates_only) reporter->ReportRuns(results.non_aggregates);
if (!results.aggregates_only.empty())
reporter->ReportRuns(results.aggregates_only);
};
report_one(display_reporter, run_results.display_report_aggregates_only,
run_results);
if (file_reporter)
report_one(file_reporter, run_results.file_report_aggregates_only,
run_results);
FlushStreams(display_reporter);
FlushStreams(file_reporter);
}
void RunBenchmarks(const std::vector<BenchmarkInstance>& benchmarks,
BenchmarkReporter* display_reporter,
BenchmarkReporter* file_reporter) {
// Note the file_reporter can be null.
CHECK(display_reporter != nullptr);
BM_CHECK(display_reporter != nullptr);
// Determine the width of the name field using a minimum width of 10.
bool might_have_aggregates = FLAGS_benchmark_repetitions > 1;
@ -227,10 +355,10 @@ void RunBenchmarks(const std::vector<BenchmarkInstance>& benchmarks,
size_t stat_field_width = 0;
for (const BenchmarkInstance& benchmark : benchmarks) {
name_field_width =
std::max<size_t>(name_field_width, benchmark.name.str().size());
might_have_aggregates |= benchmark.repetitions > 1;
std::max<size_t>(name_field_width, benchmark.name().str().size());
might_have_aggregates |= benchmark.repetitions() > 1;
for (const auto& Stat : *benchmark.statistics)
for (const auto& Stat : benchmark.statistics())
stat_field_width = std::max<size_t>(stat_field_width, Stat.name_.size());
}
if (might_have_aggregates) name_field_width += 1 + stat_field_width;
@ -239,75 +367,129 @@ void RunBenchmarks(const std::vector<BenchmarkInstance>& benchmarks,
BenchmarkReporter::Context context;
context.name_field_width = name_field_width;
// Keep track of running times of all instances of current benchmark
std::vector<BenchmarkReporter::Run> complexity_reports;
// We flush streams after invoking reporter methods that write to them. This
// ensures users get timely updates even when streams are not line-buffered.
auto flushStreams = [](BenchmarkReporter* reporter) {
if (!reporter) return;
std::flush(reporter->GetOutputStream());
std::flush(reporter->GetErrorStream());
};
// Keep track of running times of all instances of each benchmark family.
std::map<int /*family_index*/, BenchmarkReporter::PerFamilyRunReports>
per_family_reports;
if (display_reporter->ReportContext(context) &&
(!file_reporter || file_reporter->ReportContext(context))) {
flushStreams(display_reporter);
flushStreams(file_reporter);
FlushStreams(display_reporter);
FlushStreams(file_reporter);
for (const auto& benchmark : benchmarks) {
RunResults run_results = RunBenchmark(benchmark, &complexity_reports);
size_t num_repetitions_total = 0;
auto report = [&run_results](BenchmarkReporter* reporter,
bool report_aggregates_only) {
assert(reporter);
// If there are no aggregates, do output non-aggregates.
report_aggregates_only &= !run_results.aggregates_only.empty();
if (!report_aggregates_only)
reporter->ReportRuns(run_results.non_aggregates);
if (!run_results.aggregates_only.empty())
reporter->ReportRuns(run_results.aggregates_only);
};
// This perfcounters object needs to be created before the runners vector
// below so that it outlives them.
PerfCountersMeasurement perfcounters(
StrSplit(FLAGS_benchmark_perf_counters, ','));
report(display_reporter, run_results.display_report_aggregates_only);
// Vector of benchmarks to run
std::vector<internal::BenchmarkRunner> runners;
runners.reserve(benchmarks.size());
// Count the number of benchmarks with threads to warn the user in case
// performance counters are used.
int benchmarks_with_threads = 0;
// Loop through all benchmarks
for (const BenchmarkInstance& benchmark : benchmarks) {
BenchmarkReporter::PerFamilyRunReports* reports_for_family = nullptr;
if (benchmark.complexity() != oNone)
reports_for_family = &per_family_reports[benchmark.family_index()];
benchmarks_with_threads += (benchmark.threads() > 1);
runners.emplace_back(benchmark, &perfcounters, reports_for_family);
int num_repeats_of_this_instance = runners.back().GetNumRepeats();
num_repetitions_total += num_repeats_of_this_instance;
if (reports_for_family)
reports_for_family->num_runs_total += num_repeats_of_this_instance;
}
assert(runners.size() == benchmarks.size() && "Unexpected runner count.");
// The use of performance counters with threads would be unintuitive for
// the average user so we need to warn them about this case
if ((benchmarks_with_threads > 0) && (perfcounters.num_counters() > 0)) {
GetErrorLogInstance()
<< "***WARNING*** There are " << benchmarks_with_threads
<< " benchmarks with threads and " << perfcounters.num_counters()
<< " performance counters were requested. Beware counters will "
"reflect the combined usage across all "
"threads.\n";
}
std::vector<size_t> repetition_indices;
repetition_indices.reserve(num_repetitions_total);
for (size_t runner_index = 0, num_runners = runners.size();
runner_index != num_runners; ++runner_index) {
const internal::BenchmarkRunner& runner = runners[runner_index];
std::fill_n(std::back_inserter(repetition_indices),
runner.GetNumRepeats(), runner_index);
}
assert(repetition_indices.size() == num_repetitions_total &&
"Unexpected number of repetition indexes.");
if (FLAGS_benchmark_enable_random_interleaving) {
std::random_device rd;
std::mt19937 g(rd());
std::shuffle(repetition_indices.begin(), repetition_indices.end(), g);
}
for (size_t repetition_index : repetition_indices) {
internal::BenchmarkRunner& runner = runners[repetition_index];
runner.DoOneRepetition();
if (runner.HasRepeatsRemaining()) continue;
// FIXME: report each repetition separately, not all of them in bulk.
display_reporter->ReportRunsConfig(
runner.GetMinTime(), runner.HasExplicitIters(), runner.GetIters());
if (file_reporter)
report(file_reporter, run_results.file_report_aggregates_only);
file_reporter->ReportRunsConfig(
runner.GetMinTime(), runner.HasExplicitIters(), runner.GetIters());
flushStreams(display_reporter);
flushStreams(file_reporter);
RunResults run_results = runner.GetResults();
// Maybe calculate complexity report
if (const auto* reports_for_family = runner.GetReportsForFamily()) {
if (reports_for_family->num_runs_done ==
reports_for_family->num_runs_total) {
auto additional_run_stats = ComputeBigO(reports_for_family->Runs);
run_results.aggregates_only.insert(run_results.aggregates_only.end(),
additional_run_stats.begin(),
additional_run_stats.end());
per_family_reports.erase(
static_cast<int>(reports_for_family->Runs.front().family_index));
}
}
Report(display_reporter, file_reporter, run_results);
}
}
display_reporter->Finalize();
if (file_reporter) file_reporter->Finalize();
flushStreams(display_reporter);
flushStreams(file_reporter);
FlushStreams(display_reporter);
FlushStreams(file_reporter);
}
// Disable deprecated warnings temporarily because we need to reference
// CSVReporter but don't want to trigger -Werror=-Wdeprecated-declarations
#ifdef __GNUC__
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
#endif
BENCHMARK_DISABLE_DEPRECATED_WARNING
std::unique_ptr<BenchmarkReporter> CreateReporter(
std::string const& name, ConsoleReporter::OutputOptions output_opts) {
typedef std::unique_ptr<BenchmarkReporter> PtrType;
if (name == "console") {
return PtrType(new ConsoleReporter(output_opts));
} else if (name == "json") {
return PtrType(new JSONReporter);
} else if (name == "csv") {
return PtrType(new CSVReporter);
} else {
std::cerr << "Unexpected format: '" << name << "'\n";
std::exit(1);
}
if (name == "json") {
return PtrType(new JSONReporter());
}
if (name == "csv") {
return PtrType(new CSVReporter());
}
std::cerr << "Unexpected format: '" << name << "'\n";
std::exit(1);
}
#ifdef __GNUC__
#pragma GCC diagnostic pop
#endif
BENCHMARK_RESTORE_DEPRECATED_WARNING
} // end namespace
@ -341,17 +523,41 @@ ConsoleReporter::OutputOptions GetOutputOptions(bool force_no_color) {
} // end namespace internal
BenchmarkReporter* CreateDefaultDisplayReporter() {
static auto default_display_reporter =
internal::CreateReporter(FLAGS_benchmark_format,
internal::GetOutputOptions())
.release();
return default_display_reporter;
}
size_t RunSpecifiedBenchmarks() {
return RunSpecifiedBenchmarks(nullptr, nullptr);
return RunSpecifiedBenchmarks(nullptr, nullptr, FLAGS_benchmark_filter);
}
size_t RunSpecifiedBenchmarks(std::string spec) {
return RunSpecifiedBenchmarks(nullptr, nullptr, std::move(spec));
}
size_t RunSpecifiedBenchmarks(BenchmarkReporter* display_reporter) {
return RunSpecifiedBenchmarks(display_reporter, nullptr);
return RunSpecifiedBenchmarks(display_reporter, nullptr,
FLAGS_benchmark_filter);
}
size_t RunSpecifiedBenchmarks(BenchmarkReporter* display_reporter,
std::string spec) {
return RunSpecifiedBenchmarks(display_reporter, nullptr, std::move(spec));
}
size_t RunSpecifiedBenchmarks(BenchmarkReporter* display_reporter,
BenchmarkReporter* file_reporter) {
std::string spec = FLAGS_benchmark_filter;
return RunSpecifiedBenchmarks(display_reporter, file_reporter,
FLAGS_benchmark_filter);
}
size_t RunSpecifiedBenchmarks(BenchmarkReporter* display_reporter,
BenchmarkReporter* file_reporter,
std::string spec) {
if (spec.empty() || spec == "all")
spec = "."; // Regexp that matches all benchmarks
@ -360,8 +566,7 @@ size_t RunSpecifiedBenchmarks(BenchmarkReporter* display_reporter,
std::unique_ptr<BenchmarkReporter> default_display_reporter;
std::unique_ptr<BenchmarkReporter> default_file_reporter;
if (!display_reporter) {
default_display_reporter = internal::CreateReporter(
FLAGS_benchmark_format, internal::GetOutputOptions());
default_display_reporter.reset(CreateDefaultDisplayReporter());
display_reporter = default_display_reporter.get();
}
auto& Out = display_reporter->GetOutputStream();
@ -377,12 +582,14 @@ size_t RunSpecifiedBenchmarks(BenchmarkReporter* display_reporter,
if (!fname.empty()) {
output_file.open(fname);
if (!output_file.is_open()) {
Err << "invalid file name: '" << fname << std::endl;
Err << "invalid file name: '" << fname << "'" << std::endl;
std::exit(1);
}
if (!file_reporter) {
default_file_reporter = internal::CreateReporter(
FLAGS_benchmark_out_format, ConsoleReporter::OO_None);
FLAGS_benchmark_out_format, FLAGS_benchmark_counters_tabular
? ConsoleReporter::OO_Tabular
: ConsoleReporter::OO_None);
file_reporter = default_file_reporter.get();
}
file_reporter->SetOutputStream(&output_file);
@ -399,7 +606,7 @@ size_t RunSpecifiedBenchmarks(BenchmarkReporter* display_reporter,
if (FLAGS_benchmark_list_tests) {
for (auto const& benchmark : benchmarks)
Out << benchmark.name.str() << "\n";
Out << benchmark.name().str() << "\n";
} else {
internal::RunBenchmarks(benchmarks, display_reporter, file_reporter);
}
@ -407,30 +614,64 @@ size_t RunSpecifiedBenchmarks(BenchmarkReporter* display_reporter,
return benchmarks.size();
}
namespace {
// stores the time unit benchmarks use by default
TimeUnit default_time_unit = kNanosecond;
} // namespace
TimeUnit GetDefaultTimeUnit() { return default_time_unit; }
void SetDefaultTimeUnit(TimeUnit unit) { default_time_unit = unit; }
std::string GetBenchmarkFilter() { return FLAGS_benchmark_filter; }
void SetBenchmarkFilter(std::string value) {
FLAGS_benchmark_filter = std::move(value);
}
int32_t GetBenchmarkVerbosity() { return FLAGS_v; }
void RegisterMemoryManager(MemoryManager* manager) {
internal::memory_manager = manager;
}
void AddCustomContext(const std::string& key, const std::string& value) {
if (internal::global_context == nullptr) {
internal::global_context = new std::map<std::string, std::string>();
}
if (!internal::global_context->emplace(key, value).second) {
std::cerr << "Failed to add custom context \"" << key << "\" as it already "
<< "exists with value \"" << value << "\"\n";
}
}
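A usage sketch (key/value strings are hypothetical): context added this way ends up in the reporter output, e.g. the JSON "context" block, and Shutdown() releases the map again:
#include "benchmark/benchmark.h"
int main(int argc, char** argv) {
  benchmark::Initialize(&argc, argv);
  benchmark::AddCustomContext("compiler", "clang-16");      // hypothetical value
  benchmark::AddCustomContext("dataset", "2023-09-sample");  // hypothetical value
  benchmark::RunSpecifiedBenchmarks();
  benchmark::Shutdown();  // frees the global context map
  return 0;
}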
namespace internal {
void (*HelperPrintf)();
void PrintUsageAndExit() {
fprintf(stdout,
"benchmark"
" [--benchmark_list_tests={true|false}]\n"
" [--benchmark_filter=<regex>]\n"
" [--benchmark_min_time=<min_time>]\n"
" [--benchmark_repetitions=<num_repetitions>]\n"
" [--benchmark_report_aggregates_only={true|false}]\n"
" [--benchmark_display_aggregates_only={true|false}]\n"
" [--benchmark_format=<console|json|csv>]\n"
" [--benchmark_out=<filename>]\n"
" [--benchmark_out_format=<json|console|csv>]\n"
" [--benchmark_color={auto|true|false}]\n"
" [--benchmark_counters_tabular={true|false}]\n"
" [--v=<verbosity>]\n");
HelperPrintf();
exit(0);
}
void SetDefaultTimeUnitFromFlag(const std::string& time_unit_flag) {
if (time_unit_flag == "s") {
return SetDefaultTimeUnit(kSecond);
}
if (time_unit_flag == "ms") {
return SetDefaultTimeUnit(kMillisecond);
}
if (time_unit_flag == "us") {
return SetDefaultTimeUnit(kMicrosecond);
}
if (time_unit_flag == "ns") {
return SetDefaultTimeUnit(kNanosecond);
}
if (!time_unit_flag.empty()) {
PrintUsageAndExit();
}
}
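The same defaults can also be set programmatically; a minimal sketch, roughly equivalent to passing --benchmark_time_unit=ms and --benchmark_filter=BM_Vector.* (the filter pattern is hypothetical):
#include "benchmark/benchmark.h"
int main(int argc, char** argv) {
  benchmark::Initialize(&argc, argv);
  benchmark::SetDefaultTimeUnit(benchmark::kMillisecond);  // like --benchmark_time_unit=ms
  benchmark::SetBenchmarkFilter("BM_Vector.*");            // like --benchmark_filter=BM_Vector.*
  benchmark::RunSpecifiedBenchmarks();
  benchmark::Shutdown();
  return 0;
}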
void ParseCommandLineFlags(int* argc, char** argv) {
using namespace benchmark;
BenchmarkReporter::Context::executable_name =
@ -439,10 +680,14 @@ void ParseCommandLineFlags(int* argc, char** argv) {
if (ParseBoolFlag(argv[i], "benchmark_list_tests",
&FLAGS_benchmark_list_tests) ||
ParseStringFlag(argv[i], "benchmark_filter", &FLAGS_benchmark_filter) ||
ParseDoubleFlag(argv[i], "benchmark_min_time",
ParseStringFlag(argv[i], "benchmark_min_time",
&FLAGS_benchmark_min_time) ||
ParseDoubleFlag(argv[i], "benchmark_min_warmup_time",
&FLAGS_benchmark_min_warmup_time) ||
ParseInt32Flag(argv[i], "benchmark_repetitions",
&FLAGS_benchmark_repetitions) ||
ParseBoolFlag(argv[i], "benchmark_enable_random_interleaving",
&FLAGS_benchmark_enable_random_interleaving) ||
ParseBoolFlag(argv[i], "benchmark_report_aggregates_only",
&FLAGS_benchmark_report_aggregates_only) ||
ParseBoolFlag(argv[i], "benchmark_display_aggregates_only",
@ -452,11 +697,14 @@ void ParseCommandLineFlags(int* argc, char** argv) {
ParseStringFlag(argv[i], "benchmark_out_format",
&FLAGS_benchmark_out_format) ||
ParseStringFlag(argv[i], "benchmark_color", &FLAGS_benchmark_color) ||
// "color_print" is the deprecated name for "benchmark_color".
// TODO: Remove this.
ParseStringFlag(argv[i], "color_print", &FLAGS_benchmark_color) ||
ParseBoolFlag(argv[i], "benchmark_counters_tabular",
&FLAGS_benchmark_counters_tabular) ||
ParseStringFlag(argv[i], "benchmark_perf_counters",
&FLAGS_benchmark_perf_counters) ||
ParseKeyValueFlag(argv[i], "benchmark_context",
&FLAGS_benchmark_context) ||
ParseStringFlag(argv[i], "benchmark_time_unit",
&FLAGS_benchmark_time_unit) ||
ParseInt32Flag(argv[i], "v", &FLAGS_v)) {
for (int j = i; j != *argc - 1; ++j) argv[j] = argv[j + 1];
@ -467,13 +715,18 @@ void ParseCommandLineFlags(int* argc, char** argv) {
}
}
for (auto const* flag :
{&FLAGS_benchmark_format, &FLAGS_benchmark_out_format})
{&FLAGS_benchmark_format, &FLAGS_benchmark_out_format}) {
if (*flag != "console" && *flag != "json" && *flag != "csv") {
PrintUsageAndExit();
}
}
SetDefaultTimeUnitFromFlag(FLAGS_benchmark_time_unit);
if (FLAGS_benchmark_color.empty()) {
PrintUsageAndExit();
}
for (const auto& kv : FLAGS_benchmark_context) {
AddCustomContext(kv.first, kv.second);
}
}
int InitializeStreams() {
@ -483,11 +736,38 @@ int InitializeStreams() {
} // end namespace internal
void Initialize(int* argc, char** argv) {
void PrintDefaultHelp() {
fprintf(stdout,
"benchmark"
" [--benchmark_list_tests={true|false}]\n"
" [--benchmark_filter=<regex>]\n"
" [--benchmark_min_time=`<integer>x` OR `<float>s` ]\n"
" [--benchmark_min_warmup_time=<min_warmup_time>]\n"
" [--benchmark_repetitions=<num_repetitions>]\n"
" [--benchmark_enable_random_interleaving={true|false}]\n"
" [--benchmark_report_aggregates_only={true|false}]\n"
" [--benchmark_display_aggregates_only={true|false}]\n"
" [--benchmark_format=<console|json|csv>]\n"
" [--benchmark_out=<filename>]\n"
" [--benchmark_out_format=<json|console|csv>]\n"
" [--benchmark_color={auto|true|false}]\n"
" [--benchmark_counters_tabular={true|false}]\n"
#if defined HAVE_LIBPFM
" [--benchmark_perf_counters=<counter>,...]\n"
#endif
" [--benchmark_context=<key>=<value>,...]\n"
" [--benchmark_time_unit={ns|us|ms|s}]\n"
" [--v=<verbosity>]\n");
}
void Initialize(int* argc, char** argv, void (*HelperPrintf)()) {
internal::HelperPrintf = HelperPrintf;
internal::ParseCommandLineFlags(argc, argv);
internal::LogLevel() = FLAGS_v;
}
void Shutdown() { delete internal::global_context; }
bool ReportUnrecognizedArguments(int argc, char** argv) {
for (int i = 1; i < argc; ++i) {
fprintf(stderr, "%s: error: unrecognized command-line flag: %s\n", argv[0],
@ -1,15 +1,118 @@
#include "benchmark_api_internal.h"
#include <cinttypes>
#include "string_util.h"
namespace benchmark {
namespace internal {
State BenchmarkInstance::Run(IterationCount iters, int thread_id,
internal::ThreadTimer* timer,
internal::ThreadManager* manager) const {
State st(iters, arg, thread_id, threads, timer, manager);
benchmark->Run(st);
BenchmarkInstance::BenchmarkInstance(Benchmark* benchmark, int family_idx,
int per_family_instance_idx,
const std::vector<int64_t>& args,
int thread_count)
: benchmark_(*benchmark),
family_index_(family_idx),
per_family_instance_index_(per_family_instance_idx),
aggregation_report_mode_(benchmark_.aggregation_report_mode_),
args_(args),
time_unit_(benchmark_.GetTimeUnit()),
measure_process_cpu_time_(benchmark_.measure_process_cpu_time_),
use_real_time_(benchmark_.use_real_time_),
use_manual_time_(benchmark_.use_manual_time_),
complexity_(benchmark_.complexity_),
complexity_lambda_(benchmark_.complexity_lambda_),
statistics_(benchmark_.statistics_),
repetitions_(benchmark_.repetitions_),
min_time_(benchmark_.min_time_),
min_warmup_time_(benchmark_.min_warmup_time_),
iterations_(benchmark_.iterations_),
threads_(thread_count) {
name_.function_name = benchmark_.name_;
size_t arg_i = 0;
for (const auto& arg : args) {
if (!name_.args.empty()) {
name_.args += '/';
}
if (arg_i < benchmark->arg_names_.size()) {
const auto& arg_name = benchmark_.arg_names_[arg_i];
if (!arg_name.empty()) {
name_.args += StrFormat("%s:", arg_name.c_str());
}
}
name_.args += StrFormat("%" PRId64, arg);
++arg_i;
}
if (!IsZero(benchmark->min_time_)) {
name_.min_time = StrFormat("min_time:%0.3f", benchmark_.min_time_);
}
if (!IsZero(benchmark->min_warmup_time_)) {
name_.min_warmup_time =
StrFormat("min_warmup_time:%0.3f", benchmark_.min_warmup_time_);
}
if (benchmark_.iterations_ != 0) {
name_.iterations = StrFormat(
"iterations:%lu", static_cast<unsigned long>(benchmark_.iterations_));
}
if (benchmark_.repetitions_ != 0) {
name_.repetitions = StrFormat("repeats:%d", benchmark_.repetitions_);
}
if (benchmark_.measure_process_cpu_time_) {
name_.time_type = "process_time";
}
if (benchmark_.use_manual_time_) {
if (!name_.time_type.empty()) {
name_.time_type += '/';
}
name_.time_type += "manual_time";
} else if (benchmark_.use_real_time_) {
if (!name_.time_type.empty()) {
name_.time_type += '/';
}
name_.time_type += "real_time";
}
if (!benchmark_.thread_counts_.empty()) {
name_.threads = StrFormat("threads:%d", threads_);
}
setup_ = benchmark_.setup_;
teardown_ = benchmark_.teardown_;
}
State BenchmarkInstance::Run(
IterationCount iters, int thread_id, internal::ThreadTimer* timer,
internal::ThreadManager* manager,
internal::PerfCountersMeasurement* perf_counters_measurement) const {
State st(name_.function_name, iters, args_, thread_id, threads_, timer,
manager, perf_counters_measurement);
benchmark_.Run(st);
return st;
}
} // internal
} // benchmark
void BenchmarkInstance::Setup() const {
if (setup_) {
State st(name_.function_name, /*iters*/ 1, args_, /*thread_id*/ 0, threads_,
nullptr, nullptr, nullptr);
setup_(st);
}
}
void BenchmarkInstance::Teardown() const {
if (teardown_) {
State st(name_.function_name, /*iters*/ 1, args_, /*thread_id*/ 0, threads_,
nullptr, nullptr, nullptr);
teardown_(st);
}
}
} // namespace internal
} // namespace benchmark
@ -1,9 +1,6 @@
#ifndef BENCHMARK_API_INTERNAL_H
#define BENCHMARK_API_INTERNAL_H
#include "benchmark/benchmark.h"
#include "commandlineflags.h"
#include <cmath>
#include <iosfwd>
#include <limits>
@ -11,32 +8,68 @@
#include <string>
#include <vector>
#include "benchmark/benchmark.h"
#include "commandlineflags.h"
namespace benchmark {
namespace internal {
// Information kept per benchmark we may want to run
struct BenchmarkInstance {
BenchmarkName name;
Benchmark* benchmark;
AggregationReportMode aggregation_report_mode;
std::vector<int64_t> arg;
TimeUnit time_unit;
int range_multiplier;
bool measure_process_cpu_time;
bool use_real_time;
bool use_manual_time;
BigO complexity;
BigOFunc* complexity_lambda;
UserCounters counters;
const std::vector<Statistics>* statistics;
bool last_benchmark_instance;
int repetitions;
double min_time;
IterationCount iterations;
int threads; // Number of concurrent threads to use
class BenchmarkInstance {
public:
BenchmarkInstance(Benchmark* benchmark, int family_index,
int per_family_instance_index,
const std::vector<int64_t>& args, int threads);
const BenchmarkName& name() const { return name_; }
int family_index() const { return family_index_; }
int per_family_instance_index() const { return per_family_instance_index_; }
AggregationReportMode aggregation_report_mode() const {
return aggregation_report_mode_;
}
TimeUnit time_unit() const { return time_unit_; }
bool measure_process_cpu_time() const { return measure_process_cpu_time_; }
bool use_real_time() const { return use_real_time_; }
bool use_manual_time() const { return use_manual_time_; }
BigO complexity() const { return complexity_; }
BigOFunc* complexity_lambda() const { return complexity_lambda_; }
const std::vector<Statistics>& statistics() const { return statistics_; }
int repetitions() const { return repetitions_; }
double min_time() const { return min_time_; }
double min_warmup_time() const { return min_warmup_time_; }
IterationCount iterations() const { return iterations_; }
int threads() const { return threads_; }
void Setup() const;
void Teardown() const;
State Run(IterationCount iters, int thread_id, internal::ThreadTimer* timer,
internal::ThreadManager* manager) const;
internal::ThreadManager* manager,
internal::PerfCountersMeasurement* perf_counters_measurement) const;
private:
BenchmarkName name_;
Benchmark& benchmark_;
const int family_index_;
const int per_family_instance_index_;
AggregationReportMode aggregation_report_mode_;
const std::vector<int64_t>& args_;
TimeUnit time_unit_;
bool measure_process_cpu_time_;
bool use_real_time_;
bool use_manual_time_;
BigO complexity_;
BigOFunc* complexity_lambda_;
UserCounters counters_;
const std::vector<Statistics>& statistics_;
int repetitions_;
double min_time_;
double min_warmup_time_;
IterationCount iterations_;
int threads_; // Number of concurrent threads to use
typedef void (*callback_function)(const benchmark::State&);
callback_function setup_ = nullptr;
callback_function teardown_ = nullptr;
};
bool FindBenchmarksInternal(const std::string& re,
@ -45,6 +78,7 @@ bool FindBenchmarksInternal(const std::string& re,
bool IsZero(double n);
BENCHMARK_EXPORT
ConsoleReporter::OutputOptions GetOutputOptions(bool force_no_color = false);
} // end namespace internal
@ -14,4 +14,5 @@
#include "benchmark/benchmark.h"
BENCHMARK_EXPORT int main(int, char**);
BENCHMARK_MAIN();
@ -51,8 +51,9 @@ std::string join(char delimiter, const Ts&... ts) {
}
} // namespace
BENCHMARK_EXPORT
std::string BenchmarkName::str() const {
return join('/', function_name, args, min_time, iterations, repetitions,
time_type, threads);
return join('/', function_name, args, min_time, min_warmup_time, iterations,
repetitions, time_type, threads);
}
} // namespace benchmark
@ -15,7 +15,7 @@
#include "benchmark_register.h"
#ifndef BENCHMARK_OS_WINDOWS
#ifndef BENCHMARK_OS_FUCHSIA
#if !defined(BENCHMARK_OS_FUCHSIA) && !defined(BENCHMARK_OS_QURT)
#include <sys/resource.h>
#endif
#include <sys/time.h>
@ -24,6 +24,7 @@
#include <algorithm>
#include <atomic>
#include <cinttypes>
#include <condition_variable>
#include <cstdio>
#include <cstdlib>
@ -31,14 +32,10 @@
#include <fstream>
#include <iostream>
#include <memory>
#include <numeric>
#include <sstream>
#include <thread>
#ifndef __STDC_FORMAT_MACROS
#define __STDC_FORMAT_MACROS
#endif
#include <inttypes.h>
#include "benchmark/benchmark.h"
#include "benchmark_api_internal.h"
#include "check.h"
@ -56,10 +53,13 @@ namespace benchmark {
namespace {
// For non-dense Range, intermediate values are powers of kRangeMultiplier.
static const int kRangeMultiplier = 8;
static constexpr int kRangeMultiplier = 8;
// The size of a benchmark family is the number of inputs to repeat the
// benchmark on. If this is "large" then warn the user during configuration.
static const size_t kMaxFamilySize = 100;
static constexpr size_t kMaxFamilySize = 100;
static constexpr char kDisabledPrefix[] = "DISABLED_";
} // end namespace
namespace internal {
@ -114,15 +114,15 @@ void BenchmarkFamilies::ClearBenchmarks() {
bool BenchmarkFamilies::FindBenchmarks(
std::string spec, std::vector<BenchmarkInstance>* benchmarks,
std::ostream* ErrStream) {
CHECK(ErrStream);
BM_CHECK(ErrStream);
auto& Err = *ErrStream;
// Make regular expression out of command-line flag
std::string error_msg;
Regex re;
bool isNegativeFilter = false;
bool is_negative_filter = false;
if (spec[0] == '-') {
spec.replace(0, 1, "");
isNegativeFilter = true;
is_negative_filter = true;
}
if (!re.Init(spec, &error_msg)) {
Err << "Could not compile benchmark re: " << error_msg << std::endl;
@ -132,8 +132,13 @@ bool BenchmarkFamilies::FindBenchmarks(
// Special list of thread counts to use when none are specified
const std::vector<int> one_thread = {1};
int next_family_index = 0;
MutexLock l(mutex_);
for (std::unique_ptr<Benchmark>& family : families_) {
int family_index = next_family_index;
int per_family_instance_index = 0;
// Family was deleted or benchmark doesn't match
if (!family) continue;
@ -152,85 +157,27 @@ bool BenchmarkFamilies::FindBenchmarks(
<< " will be repeated at least " << family_size << " times.\n";
}
// reserve in the special case the regex ".", since we know the final
// family size.
if (spec == ".") benchmarks->reserve(family_size);
// family size. this doesn't take into account any disabled benchmarks
// so worst case we reserve more than we need.
if (spec == ".") benchmarks->reserve(benchmarks->size() + family_size);
for (auto const& args : family->args_) {
for (int num_threads : *thread_counts) {
BenchmarkInstance instance;
instance.name.function_name = family->name_;
instance.benchmark = family.get();
instance.aggregation_report_mode = family->aggregation_report_mode_;
instance.arg = args;
instance.time_unit = family->time_unit_;
instance.range_multiplier = family->range_multiplier_;
instance.min_time = family->min_time_;
instance.iterations = family->iterations_;
instance.repetitions = family->repetitions_;
instance.measure_process_cpu_time = family->measure_process_cpu_time_;
instance.use_real_time = family->use_real_time_;
instance.use_manual_time = family->use_manual_time_;
instance.complexity = family->complexity_;
instance.complexity_lambda = family->complexity_lambda_;
instance.statistics = &family->statistics_;
instance.threads = num_threads;
BenchmarkInstance instance(family.get(), family_index,
per_family_instance_index, args,
num_threads);
// Add arguments to instance name
size_t arg_i = 0;
for (auto const& arg : args) {
if (!instance.name.args.empty()) {
instance.name.args += '/';
}
if (arg_i < family->arg_names_.size()) {
const auto& arg_name = family->arg_names_[arg_i];
if (!arg_name.empty()) {
instance.name.args += StrFormat("%s:", arg_name.c_str());
}
}
instance.name.args += StrFormat("%" PRId64, arg);
++arg_i;
}
if (!IsZero(family->min_time_))
instance.name.min_time =
StrFormat("min_time:%0.3f", family->min_time_);
if (family->iterations_ != 0) {
instance.name.iterations =
StrFormat("iterations:%lu",
static_cast<unsigned long>(family->iterations_));
}
if (family->repetitions_ != 0)
instance.name.repetitions =
StrFormat("repeats:%d", family->repetitions_);
if (family->measure_process_cpu_time_) {
instance.name.time_type = "process_time";
}
if (family->use_manual_time_) {
if (!instance.name.time_type.empty()) {
instance.name.time_type += '/';
}
instance.name.time_type += "manual_time";
} else if (family->use_real_time_) {
if (!instance.name.time_type.empty()) {
instance.name.time_type += '/';
}
instance.name.time_type += "real_time";
}
// Add the number of threads used to the name
if (!family->thread_counts_.empty()) {
instance.name.threads = StrFormat("threads:%d", instance.threads);
}
const auto full_name = instance.name.str();
if ((re.Match(full_name) && !isNegativeFilter) ||
(!re.Match(full_name) && isNegativeFilter)) {
instance.last_benchmark_instance = (&args == &family->args_.back());
const auto full_name = instance.name().str();
if (full_name.rfind(kDisabledPrefix, 0) != 0 &&
((re.Match(full_name) && !is_negative_filter) ||
(!re.Match(full_name) && is_negative_filter))) {
benchmarks->push_back(std::move(instance));
++per_family_instance_index;
// Only bump the next family index once we've established that
// at least one instance of this family will be run.
if (next_family_index == family_index) ++next_family_index;
}
}
}
@ -257,39 +204,50 @@ bool FindBenchmarksInternal(const std::string& re,
// Benchmark
//=============================================================================//
Benchmark::Benchmark(const char* name)
Benchmark::Benchmark(const std::string& name)
: name_(name),
aggregation_report_mode_(ARM_Unspecified),
time_unit_(kNanosecond),
time_unit_(GetDefaultTimeUnit()),
use_default_time_unit_(true),
range_multiplier_(kRangeMultiplier),
min_time_(0),
min_warmup_time_(0),
iterations_(0),
repetitions_(0),
measure_process_cpu_time_(false),
use_real_time_(false),
use_manual_time_(false),
complexity_(oNone),
complexity_lambda_(nullptr) {
complexity_lambda_(nullptr),
setup_(nullptr),
teardown_(nullptr) {
ComputeStatistics("mean", StatisticsMean);
ComputeStatistics("median", StatisticsMedian);
ComputeStatistics("stddev", StatisticsStdDev);
ComputeStatistics("cv", StatisticsCV, kPercentage);
}
Benchmark::~Benchmark() {}
Benchmark* Benchmark::Name(const std::string& name) {
SetName(name);
return this;
}
Benchmark* Benchmark::Arg(int64_t x) {
CHECK(ArgsCnt() == -1 || ArgsCnt() == 1);
BM_CHECK(ArgsCnt() == -1 || ArgsCnt() == 1);
args_.push_back({x});
return this;
}
Benchmark* Benchmark::Unit(TimeUnit unit) {
time_unit_ = unit;
use_default_time_unit_ = false;
return this;
}
Benchmark* Benchmark::Range(int64_t start, int64_t limit) {
CHECK(ArgsCnt() == -1 || ArgsCnt() == 1);
BM_CHECK(ArgsCnt() == -1 || ArgsCnt() == 1);
std::vector<int64_t> arglist;
AddRange(&arglist, start, limit, range_multiplier_);
@ -301,53 +259,61 @@ Benchmark* Benchmark::Range(int64_t start, int64_t limit) {
Benchmark* Benchmark::Ranges(
const std::vector<std::pair<int64_t, int64_t>>& ranges) {
CHECK(ArgsCnt() == -1 || ArgsCnt() == static_cast<int>(ranges.size()));
BM_CHECK(ArgsCnt() == -1 || ArgsCnt() == static_cast<int>(ranges.size()));
std::vector<std::vector<int64_t>> arglists(ranges.size());
std::size_t total = 1;
for (std::size_t i = 0; i < ranges.size(); i++) {
AddRange(&arglists[i], ranges[i].first, ranges[i].second,
range_multiplier_);
total *= arglists[i].size();
}
std::vector<std::size_t> ctr(arglists.size(), 0);
ArgsProduct(arglists);
return this;
}
Benchmark* Benchmark::ArgsProduct(
const std::vector<std::vector<int64_t>>& arglists) {
BM_CHECK(ArgsCnt() == -1 || ArgsCnt() == static_cast<int>(arglists.size()));
std::vector<std::size_t> indices(arglists.size());
const std::size_t total = std::accumulate(
std::begin(arglists), std::end(arglists), std::size_t{1},
[](const std::size_t res, const std::vector<int64_t>& arglist) {
return res * arglist.size();
});
std::vector<int64_t> args;
args.reserve(arglists.size());
for (std::size_t i = 0; i < total; i++) {
std::vector<int64_t> tmp;
tmp.reserve(arglists.size());
for (std::size_t j = 0; j < arglists.size(); j++) {
tmp.push_back(arglists[j].at(ctr[j]));
for (std::size_t arg = 0; arg < arglists.size(); arg++) {
args.push_back(arglists[arg][indices[arg]]);
}
args_.push_back(args);
args.clear();
args_.push_back(std::move(tmp));
for (std::size_t j = 0; j < arglists.size(); j++) {
if (ctr[j] + 1 < arglists[j].size()) {
++ctr[j];
break;
}
ctr[j] = 0;
}
std::size_t arg = 0;
do {
indices[arg] = (indices[arg] + 1) % arglists[arg].size();
} while (indices[arg++] == 0 && arg < arglists.size());
}
return this;
}
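A registration sketch for the cartesian-product logic above; BM_Fill and the argument lists are hypothetical:
#include <algorithm>
#include <vector>
#include "benchmark/benchmark.h"
static void BM_Fill(benchmark::State& state) {
  std::vector<char> buf(static_cast<size_t>(state.range(0)));
  const char value = static_cast<char>(state.range(1));
  for (auto _ : state) {
    std::fill(buf.begin(), buf.end(), value);
    benchmark::DoNotOptimize(buf.data());
  }
}
// 3 sizes x 2 fill values -> 6 registered argument combinations.
BENCHMARK(BM_Fill)->ArgsProduct({{64, 512, 4096}, {0, 1}});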
Benchmark* Benchmark::ArgName(const std::string& name) {
CHECK(ArgsCnt() == -1 || ArgsCnt() == 1);
BM_CHECK(ArgsCnt() == -1 || ArgsCnt() == 1);
arg_names_ = {name};
return this;
}
Benchmark* Benchmark::ArgNames(const std::vector<std::string>& names) {
CHECK(ArgsCnt() == -1 || ArgsCnt() == static_cast<int>(names.size()));
BM_CHECK(ArgsCnt() == -1 || ArgsCnt() == static_cast<int>(names.size()));
arg_names_ = names;
return this;
}
Benchmark* Benchmark::DenseRange(int64_t start, int64_t limit, int step) {
CHECK(ArgsCnt() == -1 || ArgsCnt() == 1);
CHECK_LE(start, limit);
BM_CHECK(ArgsCnt() == -1 || ArgsCnt() == 1);
BM_CHECK_LE(start, limit);
for (int64_t arg = start; arg <= limit; arg += step) {
args_.push_back({arg});
}
@ -355,7 +321,7 @@ Benchmark* Benchmark::DenseRange(int64_t start, int64_t limit, int step) {
}
Benchmark* Benchmark::Args(const std::vector<int64_t>& args) {
CHECK(ArgsCnt() == -1 || ArgsCnt() == static_cast<int>(args.size()));
BM_CHECK(ArgsCnt() == -1 || ArgsCnt() == static_cast<int>(args.size()));
args_.push_back(args);
return this;
}
@ -365,28 +331,48 @@ Benchmark* Benchmark::Apply(void (*custom_arguments)(Benchmark* benchmark)) {
return this;
}
Benchmark* Benchmark::Setup(void (*setup)(const benchmark::State&)) {
BM_CHECK(setup != nullptr);
setup_ = setup;
return this;
}
Benchmark* Benchmark::Teardown(void (*teardown)(const benchmark::State&)) {
BM_CHECK(teardown != nullptr);
teardown_ = teardown;
return this;
}
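A sketch of wiring the Setup()/Teardown() hooks into a registration (all names hypothetical); the callbacks run before and after each run of a benchmark, not per iteration:
#include "benchmark/benchmark.h"
static void DoSetup(const benchmark::State&) {
  // e.g. warm a cache or start a helper service needed by the benchmark
}
static void DoTeardown(const benchmark::State&) {
  // e.g. release whatever DoSetup acquired
}
static void BM_WithHooks(benchmark::State& state) {
  for (auto _ : state) {
    benchmark::DoNotOptimize(state.range(0));
  }
}
BENCHMARK(BM_WithHooks)->Arg(42)->Setup(DoSetup)->Teardown(DoTeardown);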
Benchmark* Benchmark::RangeMultiplier(int multiplier) {
CHECK(multiplier > 1);
BM_CHECK(multiplier > 1);
range_multiplier_ = multiplier;
return this;
}
Benchmark* Benchmark::MinTime(double t) {
CHECK(t > 0.0);
CHECK(iterations_ == 0);
BM_CHECK(t > 0.0);
BM_CHECK(iterations_ == 0);
min_time_ = t;
return this;
}
Benchmark* Benchmark::MinWarmUpTime(double t) {
BM_CHECK(t >= 0.0);
BM_CHECK(iterations_ == 0);
min_warmup_time_ = t;
return this;
}
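For example (BM_Hash is hypothetical), the warmup budget is spent before any timed measurement:
#include <cstdint>
#include "benchmark/benchmark.h"
static void BM_Hash(benchmark::State& state) {
  uint64_t h = 1469598103934665603ull;  // FNV-1a offset basis
  for (auto _ : state) {
    h = (h ^ 0xabu) * 1099511628211ull;  // one FNV-1a step, kept live below
    benchmark::DoNotOptimize(h);
  }
}
// Warm up untimed for at least 0.5 s, then measure for at least 2 s.
BENCHMARK(BM_Hash)->MinWarmUpTime(0.5)->MinTime(2.0);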
Benchmark* Benchmark::Iterations(IterationCount n) {
CHECK(n > 0);
CHECK(IsZero(min_time_));
BM_CHECK(n > 0);
BM_CHECK(IsZero(min_time_));
BM_CHECK(IsZero(min_warmup_time_));
iterations_ = n;
return this;
}
Benchmark* Benchmark::Repetitions(int n) {
CHECK(n > 0);
BM_CHECK(n > 0);
repetitions_ = n;
return this;
}
@ -419,14 +405,14 @@ Benchmark* Benchmark::MeasureProcessCPUTime() {
}
Benchmark* Benchmark::UseRealTime() {
CHECK(!use_manual_time_)
BM_CHECK(!use_manual_time_)
<< "Cannot set UseRealTime and UseManualTime simultaneously.";
use_real_time_ = true;
return this;
}
Benchmark* Benchmark::UseManualTime() {
CHECK(!use_real_time_)
BM_CHECK(!use_real_time_)
<< "Cannot set UseRealTime and UseManualTime simultaneously.";
use_manual_time_ = true;
return this;
@ -443,21 +429,22 @@ Benchmark* Benchmark::Complexity(BigOFunc* complexity) {
return this;
}
Benchmark* Benchmark::ComputeStatistics(std::string name,
StatisticsFunc* statistics) {
statistics_.emplace_back(name, statistics);
Benchmark* Benchmark::ComputeStatistics(const std::string& name,
StatisticsFunc* statistics,
StatisticUnit unit) {
statistics_.emplace_back(name, statistics, unit);
return this;
}
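A sketch of the three-argument overload above: one user-defined statistic reported in time units, another as a percentage (BM_Sort and the statistic names are hypothetical):
#include <algorithm>
#include <numeric>
#include <vector>
#include "benchmark/benchmark.h"
static void BM_Sort(benchmark::State& state) {
  std::vector<int> data(1024);
  std::iota(data.rbegin(), data.rend(), 0);  // descending input
  for (auto _ : state) {
    auto copy = data;
    std::sort(copy.begin(), copy.end());
    benchmark::DoNotOptimize(copy.data());
  }
}
BENCHMARK(BM_Sort)
    ->Repetitions(10)
    ->ComputeStatistics("max",
                        [](const std::vector<double>& v) -> double {
                          return *std::max_element(v.begin(), v.end());
                        })
    ->ComputeStatistics("spread_pct",
                        [](const std::vector<double>& v) -> double {
                          const auto mm = std::minmax_element(v.begin(), v.end());
                          return 100.0 * (*mm.second - *mm.first) / *mm.second;
                        },
                        benchmark::kPercentage);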
Benchmark* Benchmark::Threads(int t) {
CHECK_GT(t, 0);
BM_CHECK_GT(t, 0);
thread_counts_.push_back(t);
return this;
}
Benchmark* Benchmark::ThreadRange(int min_threads, int max_threads) {
CHECK_GT(min_threads, 0);
CHECK_GE(max_threads, min_threads);
BM_CHECK_GT(min_threads, 0);
BM_CHECK_GE(max_threads, min_threads);
AddRange(&thread_counts_, min_threads, max_threads, 2);
return this;
@ -465,9 +452,9 @@ Benchmark* Benchmark::ThreadRange(int min_threads, int max_threads) {
Benchmark* Benchmark::DenseThreadRange(int min_threads, int max_threads,
int stride) {
CHECK_GT(min_threads, 0);
CHECK_GE(max_threads, min_threads);
CHECK_GE(stride, 1);
BM_CHECK_GT(min_threads, 0);
BM_CHECK_GE(max_threads, min_threads);
BM_CHECK_GE(stride, 1);
for (auto i = min_threads; i < max_threads; i += stride) {
thread_counts_.push_back(i);
@ -481,7 +468,9 @@ Benchmark* Benchmark::ThreadPerCpu() {
return this;
}
void Benchmark::SetName(const char* name) { name_ = name; }
void Benchmark::SetName(const std::string& name) { name_ = name; }
const char* Benchmark::GetName() const { return name_.c_str(); }
int Benchmark::ArgsCnt() const {
if (args_.empty()) {
@ -491,6 +480,16 @@ int Benchmark::ArgsCnt() const {
return static_cast<int>(args_.front().size());
}
const char* Benchmark::GetArgName(int arg) const {
BM_CHECK_GE(arg, 0);
BM_CHECK_LT(arg, static_cast<int>(arg_names_.size()));
return arg_names_[arg].c_str();
}
TimeUnit Benchmark::GetTimeUnit() const {
return use_default_time_unit_ ? GetDefaultTimeUnit() : time_unit_;
}
//=============================================================================//
// FunctionBenchmark
//=============================================================================//
@ -503,4 +502,19 @@ void ClearRegisteredBenchmarks() {
internal::BenchmarkFamilies::GetInstance()->ClearBenchmarks();
}
std::vector<int64_t> CreateRange(int64_t lo, int64_t hi, int multi) {
std::vector<int64_t> args;
internal::AddRange(&args, lo, hi, multi);
return args;
}
std::vector<int64_t> CreateDenseRange(int64_t start, int64_t limit, int step) {
BM_CHECK_LE(start, limit);
std::vector<int64_t> args;
for (int64_t arg = start; arg <= limit; arg += step) {
args.push_back(arg);
}
return args;
}
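These helpers pair naturally with ArgsProduct(); a sketch with a hypothetical BM_Lookup:
#include <cstdint>
#include <map>
#include "benchmark/benchmark.h"
static void BM_Lookup(benchmark::State& state) {
  std::map<int64_t, int64_t> m;
  for (int64_t i = 0; i < state.range(0); ++i) m[i] = i;
  for (auto _ : state) {
    benchmark::DoNotOptimize(m.find(state.range(1)));
  }
}
// {8, 64, 512, 4096} x {1, 2, 3, 4} -> 16 argument combinations.
BENCHMARK(BM_Lookup)
    ->ArgsProduct({benchmark::CreateRange(8, 4096, /*multi=*/8),
                   benchmark::CreateDenseRange(1, 4, /*step=*/1)});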
} // end namespace benchmark
@ -1,6 +1,8 @@
#ifndef BENCHMARK_REGISTER_H
#define BENCHMARK_REGISTER_H
#include <algorithm>
#include <limits>
#include <vector>
#include "check.h"
@ -11,18 +13,18 @@ namespace internal {
// Append the powers of 'mult' in the closed interval [lo, hi].
// Returns iterator to the start of the inserted range.
template <typename T>
typename std::vector<T>::iterator
AddPowers(std::vector<T>* dst, T lo, T hi, int mult) {
CHECK_GE(lo, 0);
CHECK_GE(hi, lo);
CHECK_GE(mult, 2);
typename std::vector<T>::iterator AddPowers(std::vector<T>* dst, T lo, T hi,
int mult) {
BM_CHECK_GE(lo, 0);
BM_CHECK_GE(hi, lo);
BM_CHECK_GE(mult, 2);
const size_t start_offset = dst->size();
static const T kmax = std::numeric_limits<T>::max();
// Space out the values in multiples of "mult"
for (T i = 1; i <= hi; i *= mult) {
for (T i = static_cast<T>(1); i <= hi; i *= static_cast<T>(mult)) {
if (i >= lo) {
dst->push_back(i);
}
@ -31,16 +33,16 @@ AddPowers(std::vector<T>* dst, T lo, T hi, int mult) {
if (i > kmax / mult) break;
}
return dst->begin() + start_offset;
return dst->begin() + static_cast<int>(start_offset);
}
template <typename T>
void AddNegatedPowers(std::vector<T>* dst, T lo, T hi, int mult) {
// We negate lo and hi so we require that they cannot be equal to 'min'.
CHECK_GT(lo, std::numeric_limits<T>::min());
CHECK_GT(hi, std::numeric_limits<T>::min());
CHECK_GE(hi, lo);
CHECK_LE(hi, 0);
BM_CHECK_GT(lo, std::numeric_limits<T>::min());
BM_CHECK_GT(hi, std::numeric_limits<T>::min());
BM_CHECK_GE(hi, lo);
BM_CHECK_LE(hi, 0);
// Add positive powers, then negate and reverse.
// Casts necessary since small integers get promoted
@ -59,8 +61,8 @@ void AddRange(std::vector<T>* dst, T lo, T hi, int mult) {
static_assert(std::is_integral<T>::value && std::is_signed<T>::value,
"Args type must be a signed integer");
CHECK_GE(hi, lo);
CHECK_GE(mult, 2);
BM_CHECK_GE(hi, lo);
BM_CHECK_GE(mult, 2);
// Add "lo"
dst->push_back(lo);
@ -86,7 +88,7 @@ void AddRange(std::vector<T>* dst, T lo, T hi, int mult) {
}
// Treat 0 as a special case (see discussion on #762).
if (lo <= 0 && hi >= 0) {
if (lo < 0 && hi >= 0) {
dst->push_back(0);
}
@ -13,12 +13,13 @@
// limitations under the License.
#include "benchmark_runner.h"
#include "benchmark/benchmark.h"
#include "benchmark_api_internal.h"
#include "internal_macros.h"
#ifndef BENCHMARK_OS_WINDOWS
#ifndef BENCHMARK_OS_FUCHSIA
#if !defined(BENCHMARK_OS_FUCHSIA) && !defined(BENCHMARK_OS_QURT)
#include <sys/resource.h>
#endif
#include <sys/time.h>
@ -27,11 +28,14 @@
#include <algorithm>
#include <atomic>
#include <climits>
#include <cmath>
#include <condition_variable>
#include <cstdio>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <limits>
#include <memory>
#include <string>
#include <thread>
@ -45,6 +49,7 @@
#include "internal_macros.h"
#include "log.h"
#include "mutex.h"
#include "perf_counters.h"
#include "re.h"
#include "statistics.h"
#include "string_util.h"
@ -60,64 +65,72 @@ MemoryManager* memory_manager = nullptr;
namespace {
static constexpr IterationCount kMaxIterations = 1000000000;
const double kDefaultMinTime =
std::strtod(::benchmark::kDefaultMinTimeStr, /*p_end*/ nullptr);
BenchmarkReporter::Run CreateRunReport(
const benchmark::internal::BenchmarkInstance& b,
const internal::ThreadManager::Result& results,
IterationCount memory_iterations,
const MemoryManager::Result& memory_result, double seconds,
int64_t repetition_index) {
const MemoryManager::Result* memory_result, double seconds,
int64_t repetition_index, int64_t repeats) {
// Create report about this benchmark run.
BenchmarkReporter::Run report;
report.run_name = b.name;
report.error_occurred = results.has_error_;
report.error_message = results.error_message_;
report.run_name = b.name();
report.family_index = b.family_index();
report.per_family_instance_index = b.per_family_instance_index();
report.skipped = results.skipped_;
report.skip_message = results.skip_message_;
report.report_label = results.report_label_;
// This is the total iterations across all threads.
report.iterations = results.iterations;
report.time_unit = b.time_unit;
report.threads = b.threads;
report.time_unit = b.time_unit();
report.threads = b.threads();
report.repetition_index = repetition_index;
report.repetitions = b.repetitions;
report.repetitions = repeats;
if (!report.error_occurred) {
if (b.use_manual_time) {
if (!report.skipped) {
if (b.use_manual_time()) {
report.real_accumulated_time = results.manual_time_used;
} else {
report.real_accumulated_time = results.real_time_used;
}
report.cpu_accumulated_time = results.cpu_time_used;
report.complexity_n = results.complexity_n;
report.complexity = b.complexity;
report.complexity_lambda = b.complexity_lambda;
report.statistics = b.statistics;
report.complexity = b.complexity();
report.complexity_lambda = b.complexity_lambda();
report.statistics = &b.statistics();
report.counters = results.counters;
if (memory_iterations > 0) {
report.has_memory_result = true;
assert(memory_result != nullptr);
report.memory_result = memory_result;
report.allocs_per_iter =
memory_iterations ? static_cast<double>(memory_result.num_allocs) /
memory_iterations ? static_cast<double>(memory_result->num_allocs) /
memory_iterations
: 0;
report.max_bytes_used = memory_result.max_bytes_used;
}
internal::Finish(&report.counters, results.iterations, seconds, b.threads);
internal::Finish(&report.counters, results.iterations, seconds,
b.threads());
}
return report;
}
// Execute one thread of benchmark b for the specified number of iterations.
// Adds the stats collected for the thread into *total.
// Adds the stats collected for the thread into manager->results.
void RunInThread(const BenchmarkInstance* b, IterationCount iters,
int thread_id, ThreadManager* manager) {
int thread_id, ThreadManager* manager,
PerfCountersMeasurement* perf_counters_measurement) {
internal::ThreadTimer timer(
b->measure_process_cpu_time
b->measure_process_cpu_time()
? internal::ThreadTimer::CreateProcessCpuTime()
: internal::ThreadTimer::Create());
State st = b->Run(iters, thread_id, &timer, manager);
CHECK(st.error_occurred() || st.iterations() >= st.max_iterations)
State st =
b->Run(iters, thread_id, &timer, manager, perf_counters_measurement);
BM_CHECK(st.skipped() || st.iterations() >= st.max_iterations)
<< "Benchmark returned before State::KeepRunning() returned false!";
{
MutexLock l(manager->GetBenchmarkMutex());
@ -132,229 +145,351 @@ void RunInThread(const BenchmarkInstance* b, IterationCount iters,
manager->NotifyThreadComplete();
}
class BenchmarkRunner {
public:
BenchmarkRunner(const benchmark::internal::BenchmarkInstance& b_,
std::vector<BenchmarkReporter::Run>* complexity_reports_)
: b(b_),
complexity_reports(*complexity_reports_),
min_time(!IsZero(b.min_time) ? b.min_time : FLAGS_benchmark_min_time),
repeats(b.repetitions != 0 ? b.repetitions
: FLAGS_benchmark_repetitions),
has_explicit_iteration_count(b.iterations != 0),
pool(b.threads - 1),
iters(has_explicit_iteration_count ? b.iterations : 1) {
run_results.display_report_aggregates_only =
(FLAGS_benchmark_report_aggregates_only ||
FLAGS_benchmark_display_aggregates_only);
run_results.file_report_aggregates_only =
FLAGS_benchmark_report_aggregates_only;
if (b.aggregation_report_mode != internal::ARM_Unspecified) {
run_results.display_report_aggregates_only =
(b.aggregation_report_mode &
internal::ARM_DisplayReportAggregatesOnly);
run_results.file_report_aggregates_only =
(b.aggregation_report_mode & internal::ARM_FileReportAggregatesOnly);
}
double ComputeMinTime(const benchmark::internal::BenchmarkInstance& b,
const BenchTimeType& iters_or_time) {
if (!IsZero(b.min_time())) return b.min_time();
// If the flag was used to specify number of iters, then return the default
// min_time.
if (iters_or_time.tag == BenchTimeType::ITERS) return kDefaultMinTime;
for (int repetition_num = 0; repetition_num < repeats; repetition_num++) {
DoOneRepetition(repetition_num);
}
return iters_or_time.time;
}
// Calculate additional statistics
run_results.aggregates_only = ComputeStats(run_results.non_aggregates);
IterationCount ComputeIters(const benchmark::internal::BenchmarkInstance& b,
const BenchTimeType& iters_or_time) {
if (b.iterations() != 0) return b.iterations();
// Maybe calculate complexity report
if ((b.complexity != oNone) && b.last_benchmark_instance) {
auto additional_run_stats = ComputeBigO(complexity_reports);
run_results.aggregates_only.insert(run_results.aggregates_only.end(),
additional_run_stats.begin(),
additional_run_stats.end());
complexity_reports.clear();
}
}
RunResults&& get_results() { return std::move(run_results); }
private:
RunResults run_results;
const benchmark::internal::BenchmarkInstance& b;
std::vector<BenchmarkReporter::Run>& complexity_reports;
const double min_time;
const int repeats;
const bool has_explicit_iteration_count;
std::vector<std::thread> pool;
IterationCount iters; // preserved between repetitions!
// So only the first repetition has to find/calculate it,
// the other repetitions will just use that precomputed iteration count.
struct IterationResults {
internal::ThreadManager::Result results;
IterationCount iters;
double seconds;
};
IterationResults DoNIterations() {
VLOG(2) << "Running " << b.name.str() << " for " << iters << "\n";
std::unique_ptr<internal::ThreadManager> manager;
manager.reset(new internal::ThreadManager(b.threads));
// Run all but one thread in separate threads
for (std::size_t ti = 0; ti < pool.size(); ++ti) {
pool[ti] = std::thread(&RunInThread, &b, iters, static_cast<int>(ti + 1),
manager.get());
}
// And run one thread here directly.
// (If we were asked to run just one thread, we don't create new threads.)
// Yes, we need to do this here *after* we start the separate threads.
RunInThread(&b, iters, 0, manager.get());
// The main thread has finished. Now let's wait for the other threads.
manager->WaitForAllThreads();
for (std::thread& thread : pool) thread.join();
IterationResults i;
// Acquire the measurements/counters from the manager, UNDER THE LOCK!
{
MutexLock l(manager->GetBenchmarkMutex());
i.results = manager->results;
}
// And get rid of the manager.
manager.reset();
// Adjust real/manual time stats since they were reported per thread.
i.results.real_time_used /= b.threads;
i.results.manual_time_used /= b.threads;
// If we were measuring whole-process CPU usage, adjust the CPU time too.
if (b.measure_process_cpu_time) i.results.cpu_time_used /= b.threads;
VLOG(2) << "Ran in " << i.results.cpu_time_used << "/"
<< i.results.real_time_used << "\n";
// So for how long were we running?
i.iters = iters;
// Base decisions off of real time if requested by this benchmark.
i.seconds = i.results.cpu_time_used;
if (b.use_manual_time) {
i.seconds = i.results.manual_time_used;
} else if (b.use_real_time) {
i.seconds = i.results.real_time_used;
}
return i;
}
IterationCount PredictNumItersNeeded(const IterationResults& i) const {
// See by how much the iteration count should be increased.
// Note: Avoid division by zero with max(seconds, 1ns).
double multiplier = min_time * 1.4 / std::max(i.seconds, 1e-9);
// If our last run was at least 10% of FLAGS_benchmark_min_time then we
// use the multiplier directly.
// Otherwise we use at most 10 times expansion.
// NOTE: When the last run was at least 10% of the min time the max
// expansion should be 14x.
bool is_significant = (i.seconds / min_time) > 0.1;
multiplier = is_significant ? multiplier : std::min(10.0, multiplier);
if (multiplier <= 1.0) multiplier = 2.0;
// So what seems to be the sufficiently-large iteration count? Round up.
const IterationCount max_next_iters = static_cast<IterationCount>(
std::lround(std::max(multiplier * static_cast<double>(i.iters),
static_cast<double>(i.iters) + 1.0)));
// But we do have *some* sanity limits though..
const IterationCount next_iters = std::min(max_next_iters, kMaxIterations);
VLOG(3) << "Next iters: " << next_iters << ", " << multiplier << "\n";
return next_iters; // round up before conversion to integer.
}
bool ShouldReportIterationResults(const IterationResults& i) const {
// Determine if this run should be reported:
// either it has run for a sufficient amount of time,
// or an error was reported.
return i.results.has_error_ ||
i.iters >= kMaxIterations || // Too many iterations already.
i.seconds >= min_time || // The elapsed time is large enough.
// CPU time is specified but the elapsed real time greatly exceeds
// the minimum time.
// Note that user-provided timers are exempt from this sanity check.
((i.results.real_time_used >= 5 * min_time) && !b.use_manual_time);
}
void DoOneRepetition(int64_t repetition_index) {
const bool is_the_first_repetition = repetition_index == 0;
IterationResults i;
// We *may* be gradually increasing the length (iteration count)
// of the benchmark until we decide the results are significant.
// And once we do, we report those last results and exit.
// Please do note that if there are repetitions, the iteration count
// is *only* calculated for the *first* repetition, and other repetitions
// simply use that precomputed iteration count.
for (;;) {
i = DoNIterations();
// Do we consider the results to be significant?
// If we are doing repetitions, and the first repetition was already done,
// it has calculated the correct iteration time, so we have run that very
// iteration count just now. No need to calculate anything. Just report.
// Else, the normal rules apply.
const bool results_are_significant = !is_the_first_repetition ||
has_explicit_iteration_count ||
ShouldReportIterationResults(i);
if (results_are_significant) break; // Good, let's report them!
// Nope, bad iteration. Let's re-estimate the hopefully-sufficient
// iteration count, and run the benchmark again...
iters = PredictNumItersNeeded(i);
assert(iters > i.iters &&
"if we did more iterations than we want to do the next time, "
"then we should have accepted the current iteration run.");
}
// Oh, one last thing, we need to also produce the 'memory measurements'..
MemoryManager::Result memory_result;
IterationCount memory_iterations = 0;
if (memory_manager != nullptr) {
// Only run a few iterations to reduce the impact of one-time
// allocations in benchmarks that are not properly managed.
memory_iterations = std::min<IterationCount>(16, iters);
memory_manager->Start();
std::unique_ptr<internal::ThreadManager> manager;
manager.reset(new internal::ThreadManager(1));
RunInThread(&b, memory_iterations, 0, manager.get());
manager->WaitForAllThreads();
manager.reset();
memory_manager->Stop(&memory_result);
}
// Ok, now actually report.
BenchmarkReporter::Run report =
CreateRunReport(b, i.results, memory_iterations, memory_result,
i.seconds, repetition_index);
if (!report.error_occurred && b.complexity != oNone)
complexity_reports.push_back(report);
run_results.non_aggregates.push_back(report);
}
};
// We've already concluded that this flag is currently used to pass
// iters but do a check here again anyway.
BM_CHECK(iters_or_time.tag == BenchTimeType::ITERS);
return iters_or_time.iters;
}
} // end namespace
RunResults RunBenchmark(
const benchmark::internal::BenchmarkInstance& b,
std::vector<BenchmarkReporter::Run>* complexity_reports) {
internal::BenchmarkRunner r(b, complexity_reports);
return r.get_results();
BenchTimeType ParseBenchMinTime(const std::string& value) {
BenchTimeType ret;
if (value.empty()) {
ret.tag = BenchTimeType::TIME;
ret.time = 0.0;
return ret;
}
if (value.back() == 'x') {
char* p_end;
// Reset errno before it's changed by strtol.
errno = 0;
IterationCount num_iters = std::strtol(value.c_str(), &p_end, 10);
// After a valid parse, p_end should have been set to
// point to the 'x' suffix.
BM_CHECK(errno == 0 && p_end != nullptr && *p_end == 'x')
<< "Malformed iters value passed to --benchmark_min_time: `" << value
<< "`. Expected --benchmark_min_time=<integer>x.";
ret.tag = BenchTimeType::ITERS;
ret.iters = num_iters;
return ret;
}
bool has_suffix = value.back() == 's';
if (!has_suffix) {
BM_VLOG(0) << "Value passed to --benchmark_min_time should have a suffix. "
"Eg., `30s` for 30-seconds.";
}
char* p_end;
// Reset errno before it's changed by strtod.
errno = 0;
double min_time = std::strtod(value.c_str(), &p_end);
// After a successful parse, p_end should point to the suffix 's',
// or the end of the string if the suffix was omitted.
BM_CHECK(errno == 0 && p_end != nullptr &&
((has_suffix && *p_end == 's') || *p_end == '\0'))
<< "Malformed seconds value passed to --benchmark_min_time: `" << value
<< "`. Expected --benchmark_min_time=<float>x.";
ret.tag = BenchTimeType::TIME;
ret.time = min_time;
return ret;
}
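// Accepted forms, by way of example (binary name hypothetical):
//   --benchmark_min_time=500x   -> run exactly 500 iterations
//   --benchmark_min_time=2.5s   -> run for at least 2.5 seconds
//   --benchmark_min_time=2.5    -> accepted, but the trailing 's' is advised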
BenchmarkRunner::BenchmarkRunner(
const benchmark::internal::BenchmarkInstance& b_,
PerfCountersMeasurement* pcm_,
BenchmarkReporter::PerFamilyRunReports* reports_for_family_)
: b(b_),
reports_for_family(reports_for_family_),
parsed_benchtime_flag(ParseBenchMinTime(FLAGS_benchmark_min_time)),
min_time(ComputeMinTime(b_, parsed_benchtime_flag)),
min_warmup_time((!IsZero(b.min_time()) && b.min_warmup_time() > 0.0)
? b.min_warmup_time()
: FLAGS_benchmark_min_warmup_time),
warmup_done(!(min_warmup_time > 0.0)),
repeats(b.repetitions() != 0 ? b.repetitions()
: FLAGS_benchmark_repetitions),
has_explicit_iteration_count(b.iterations() != 0 ||
parsed_benchtime_flag.tag ==
BenchTimeType::ITERS),
pool(b.threads() - 1),
iters(has_explicit_iteration_count
? ComputeIters(b_, parsed_benchtime_flag)
: 1),
perf_counters_measurement_ptr(pcm_) {
run_results.display_report_aggregates_only =
(FLAGS_benchmark_report_aggregates_only ||
FLAGS_benchmark_display_aggregates_only);
run_results.file_report_aggregates_only =
FLAGS_benchmark_report_aggregates_only;
if (b.aggregation_report_mode() != internal::ARM_Unspecified) {
run_results.display_report_aggregates_only =
(b.aggregation_report_mode() &
internal::ARM_DisplayReportAggregatesOnly);
run_results.file_report_aggregates_only =
(b.aggregation_report_mode() & internal::ARM_FileReportAggregatesOnly);
BM_CHECK(FLAGS_benchmark_perf_counters.empty() ||
(perf_counters_measurement_ptr->num_counters() == 0))
<< "Perf counters were requested but could not be set up.";
}
}
BenchmarkRunner::IterationResults BenchmarkRunner::DoNIterations() {
BM_VLOG(2) << "Running " << b.name().str() << " for " << iters << "\n";
std::unique_ptr<internal::ThreadManager> manager;
manager.reset(new internal::ThreadManager(b.threads()));
// Run all but one thread in separate threads
for (std::size_t ti = 0; ti < pool.size(); ++ti) {
pool[ti] = std::thread(&RunInThread, &b, iters, static_cast<int>(ti + 1),
manager.get(), perf_counters_measurement_ptr);
}
// And run one thread here directly.
// (If we were asked to run just one thread, we don't create new threads.)
// Yes, we need to do this here *after* we start the separate threads.
RunInThread(&b, iters, 0, manager.get(), perf_counters_measurement_ptr);
// The main thread has finished. Now let's wait for the other threads.
manager->WaitForAllThreads();
for (std::thread& thread : pool) thread.join();
IterationResults i;
// Acquire the measurements/counters from the manager, UNDER THE LOCK!
{
MutexLock l(manager->GetBenchmarkMutex());
i.results = manager->results;
}
// And get rid of the manager.
manager.reset();
// Adjust real/manual time stats since they were reported per thread.
i.results.real_time_used /= b.threads();
i.results.manual_time_used /= b.threads();
// If we were measuring whole-process CPU usage, adjust the CPU time too.
if (b.measure_process_cpu_time()) i.results.cpu_time_used /= b.threads();
BM_VLOG(2) << "Ran in " << i.results.cpu_time_used << "/"
<< i.results.real_time_used << "\n";
// By using KeepRunningBatch a benchmark can iterate more times than
// requested, so take the iteration count from i.results.
i.iters = i.results.iterations / b.threads();
// Base decisions off of real time if requested by this benchmark.
i.seconds = i.results.cpu_time_used;
if (b.use_manual_time()) {
i.seconds = i.results.manual_time_used;
} else if (b.use_real_time()) {
i.seconds = i.results.real_time_used;
}
return i;
}
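// Worked example of the per-thread adjustment above (numbers are made up):
// with b.threads() == 4 and iters == 1000, each of the four threads runs 1000
// iterations and reports its own elapsed time, so real_time_used arrives as
// roughly 4x the wall-clock time and is divided back down by 4. Likewise,
// because KeepRunningBatch may overshoot, i.results.iterations holds the total
// across all threads (say 4096), giving i.iters == 4096 / 4 == 1024 per thread.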
IterationCount BenchmarkRunner::PredictNumItersNeeded(
const IterationResults& i) const {
// See by how much the iteration count should be increased.
// Note: Avoid division by zero with max(seconds, 1ns).
double multiplier = GetMinTimeToApply() * 1.4 / std::max(i.seconds, 1e-9);
// If our last run was at least 10% of FLAGS_benchmark_min_time then we
// use the multiplier directly.
// Otherwise we use at most 10 times expansion.
// NOTE: When the last run was at least 10% of the min time the max
// expansion should be 14x.
const bool is_significant = (i.seconds / GetMinTimeToApply()) > 0.1;
multiplier = is_significant ? multiplier : 10.0;
// So what seems to be the sufficiently-large iteration count? Round up.
const IterationCount max_next_iters = static_cast<IterationCount>(
std::lround(std::max(multiplier * static_cast<double>(i.iters),
static_cast<double>(i.iters) + 1.0)));
// But we do have *some* limits though..
const IterationCount next_iters = std::min(max_next_iters, kMaxIterations);
BM_VLOG(3) << "Next iters: " << next_iters << ", " << multiplier << "\n";
return next_iters; // round up before conversion to integer.
}
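// Worked example of the prediction above (illustrative numbers only): with
// GetMinTimeToApply() == 0.5s, a run of i.iters == 1000 that took i.seconds ==
// 0.01s is not significant (0.01 / 0.5 == 0.02 < 0.1), so the multiplier is
// capped at 10 and the next run uses 10'000 iterations. Once a run takes,
// say, 0.1s (>= 10% of the min time), the multiplier 0.5 * 1.4 / 0.1 == 7 is
// used directly, with the result still bounded above by kMaxIterations.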
bool BenchmarkRunner::ShouldReportIterationResults(
const IterationResults& i) const {
// Determine if this run should be reported:
// either it has run for a sufficient amount of time,
// or an error/skip was reported.
return i.results.skipped_ ||
i.iters >= kMaxIterations || // Too many iterations already.
i.seconds >=
GetMinTimeToApply() || // The elapsed time is large enough.
// CPU time is specified but the elapsed real time greatly exceeds
// the minimum time.
// Note that user-provided timers are exempt from this test.
((i.results.real_time_used >= 5 * GetMinTimeToApply()) &&
!b.use_manual_time());
}
double BenchmarkRunner::GetMinTimeToApply() const {
// In order to reuse the functionality that runs and measures benchmarks for
// the warmup phase as well, we need a way of telling whether to apply
// min_time or min_warmup_time. This function figures out whether we are in
// the warmup phase, and therefore need to apply min_warmup_time, or whether
// we are already in the benchmarking phase and min_time needs to be applied.
return warmup_done ? min_time : min_warmup_time;
}
void BenchmarkRunner::FinishWarmUp(const IterationCount& i) {
warmup_done = true;
iters = i;
}
void BenchmarkRunner::RunWarmUp() {
// Use the same mechanisms for warming up the benchmark as used for actually
// running and measuring the benchmark.
IterationResults i_warmup;
// Don't use the iterations determined in the warmup phase for the actual
// measured benchmark phase. While this may be a good starting point for the
// benchmark, and it would therefore remove the need to figure out how many
// iterations are needed if min_time is set again, it may also be a completely
// wrong guess since the warmup loops might be considerably slower (e.g.
// because of caching effects).
const IterationCount i_backup = iters;
for (;;) {
b.Setup();
i_warmup = DoNIterations();
b.Teardown();
const bool finish = ShouldReportIterationResults(i_warmup);
if (finish) {
FinishWarmUp(i_backup);
break;
}
// Although we are running "only" a warmup phase, where running enough
// iterations at once without measuring time isn't as important as it is for
// the benchmarking phase, we still do it the same way; otherwise it would be
// very confusing for the user to choose a proper value for min_warmup_time if
// a different approach were used here.
iters = PredictNumItersNeeded(i_warmup);
assert(iters > i_warmup.iters &&
"if we did more iterations than we want to do the next time, "
"then we should have accepted the current iteration run.");
}
}
void BenchmarkRunner::DoOneRepetition() {
assert(HasRepeatsRemaining() && "Already done all repetitions?");
const bool is_the_first_repetition = num_repetitions_done == 0;
// In case a warmup phase is requested by the benchmark, run it now.
// After running the warmup phase the BenchmarkRunner should be in a state as
// if this warmup never happened, except that warmup_done is set. Any other
// manipulation of the BenchmarkRunner instance would be a bug! Please fix it.
if (!warmup_done) RunWarmUp();
IterationResults i;
// We *may* be gradually increasing the length (iteration count)
// of the benchmark until we decide the results are significant.
// And once we do, we report those last results and exit.
// Please do note that if there are repetitions, the iteration count
// is *only* calculated for the *first* repetition, and other repetitions
// simply use that precomputed iteration count.
for (;;) {
b.Setup();
i = DoNIterations();
b.Teardown();
// Do we consider the results to be significant?
// If we are doing repetitions, and the first repetition was already done,
// it has calculated the correct iteration time, so we have run that very
// iteration count just now. No need to calculate anything. Just report.
// Else, the normal rules apply.
const bool results_are_significant = !is_the_first_repetition ||
has_explicit_iteration_count ||
ShouldReportIterationResults(i);
if (results_are_significant) break; // Good, let's report them!
// Nope, bad iteration. Let's re-estimate the hopefully-sufficient
// iteration count, and run the benchmark again...
iters = PredictNumItersNeeded(i);
assert(iters > i.iters &&
"if we did more iterations than we want to do the next time, "
"then we should have accepted the current iteration run.");
}
// Oh, one last thing, we need to also produce the 'memory measurements'..
MemoryManager::Result* memory_result = nullptr;
IterationCount memory_iterations = 0;
if (memory_manager != nullptr) {
// TODO(vyng): Consider making BenchmarkReporter::Run::memory_result an
// optional so we don't have to own the Result here.
// Can't do it now due to cxx03.
memory_results.push_back(MemoryManager::Result());
memory_result = &memory_results.back();
// Only run a few iterations to reduce the impact of one-time
// allocations in benchmarks that are not properly managed.
memory_iterations = std::min<IterationCount>(16, iters);
memory_manager->Start();
std::unique_ptr<internal::ThreadManager> manager;
manager.reset(new internal::ThreadManager(1));
b.Setup();
RunInThread(&b, memory_iterations, 0, manager.get(),
perf_counters_measurement_ptr);
manager->WaitForAllThreads();
manager.reset();
b.Teardown();
memory_manager->Stop(*memory_result);
}
// Ok, now actually report.
BenchmarkReporter::Run report =
CreateRunReport(b, i.results, memory_iterations, memory_result, i.seconds,
num_repetitions_done, repeats);
if (reports_for_family) {
++reports_for_family->num_runs_done;
if (!report.skipped) reports_for_family->Runs.push_back(report);
}
run_results.non_aggregates.push_back(report);
++num_repetitions_done;
}
RunResults&& BenchmarkRunner::GetResults() {
assert(!HasRepeatsRemaining() && "Did not run all repetitions yet?");
// Calculate additional statistics over the repetitions of this instance.
run_results.aggregates_only = ComputeStats(run_results.non_aggregates);
return std::move(run_results);
}
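// A minimal sketch of how a caller can drive BenchmarkRunner (the library's
// own driver lives elsewhere; this only shows the intended call sequence of
// the public methods above, with `instance` standing in for a hypothetical
// BenchmarkInstance):
//
//   BenchmarkRunner runner(instance, /*pcm=*/nullptr,
//                          /*reports_for_family=*/nullptr);
//   while (runner.HasRepeatsRemaining()) runner.DoOneRepetition();
//   RunResults results = runner.GetResults();  // aggregates are computed here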
} // end namespace internal

View File

@ -15,19 +15,23 @@
#ifndef BENCHMARK_RUNNER_H_
#define BENCHMARK_RUNNER_H_
#include <thread>
#include <vector>
#include "benchmark_api_internal.h"
#include "internal_macros.h"
DECLARE_double(benchmark_min_time);
DECLARE_int32(benchmark_repetitions);
DECLARE_bool(benchmark_report_aggregates_only);
DECLARE_bool(benchmark_display_aggregates_only);
#include "perf_counters.h"
#include "thread_manager.h"
namespace benchmark {
BM_DECLARE_string(benchmark_min_time);
BM_DECLARE_double(benchmark_min_warmup_time);
BM_DECLARE_int32(benchmark_repetitions);
BM_DECLARE_bool(benchmark_report_aggregates_only);
BM_DECLARE_bool(benchmark_display_aggregates_only);
BM_DECLARE_string(benchmark_perf_counters);
namespace internal {
extern MemoryManager* memory_manager;
@ -40,9 +44,85 @@ struct RunResults {
bool file_report_aggregates_only = false;
};
RunResults RunBenchmark(
const benchmark::internal::BenchmarkInstance& b,
std::vector<BenchmarkReporter::Run>* complexity_reports);
struct BENCHMARK_EXPORT BenchTimeType {
enum { ITERS, TIME } tag;
union {
IterationCount iters;
double time;
};
};
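// BenchTimeType is a simple tagged union: exactly one of `iters` or `time` is
// meaningful, depending on `tag`. Illustrative use:
//
//   BenchTimeType t = ParseBenchMinTime("10x");
//   if (t.tag == BenchTimeType::ITERS) {
//     IterationCount n = t.iters;  // valid: an explicit iteration count
//   } else {
//     double seconds = t.time;     // valid: a minimum running time
//   }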
BENCHMARK_EXPORT
BenchTimeType ParseBenchMinTime(const std::string& value);
class BenchmarkRunner {
public:
BenchmarkRunner(const benchmark::internal::BenchmarkInstance& b_,
benchmark::internal::PerfCountersMeasurement* pmc_,
BenchmarkReporter::PerFamilyRunReports* reports_for_family);
int GetNumRepeats() const { return repeats; }
bool HasRepeatsRemaining() const {
return GetNumRepeats() != num_repetitions_done;
}
void DoOneRepetition();
RunResults&& GetResults();
BenchmarkReporter::PerFamilyRunReports* GetReportsForFamily() const {
return reports_for_family;
}
double GetMinTime() const { return min_time; }
bool HasExplicitIters() const { return has_explicit_iteration_count; }
IterationCount GetIters() const { return iters; }
private:
RunResults run_results;
const benchmark::internal::BenchmarkInstance& b;
BenchmarkReporter::PerFamilyRunReports* reports_for_family;
BenchTimeType parsed_benchtime_flag;
const double min_time;
const double min_warmup_time;
bool warmup_done;
const int repeats;
const bool has_explicit_iteration_count;
int num_repetitions_done = 0;
std::vector<std::thread> pool;
std::vector<MemoryManager::Result> memory_results;
IterationCount iters; // preserved between repetitions!
// So only the first repetition has to find/calculate it,
// the other repetitions will just use that precomputed iteration count.
PerfCountersMeasurement* const perf_counters_measurement_ptr = nullptr;
struct IterationResults {
internal::ThreadManager::Result results;
IterationCount iters;
double seconds;
};
IterationResults DoNIterations();
IterationCount PredictNumItersNeeded(const IterationResults& i) const;
bool ShouldReportIterationResults(const IterationResults& i) const;
double GetMinTimeToApply() const;
void FinishWarmUp(const IterationCount& i);
void RunWarmUp();
};
} // namespace internal

11
src/check.cc Normal file
View File

@ -0,0 +1,11 @@
#include "check.h"
namespace benchmark {
namespace internal {
static AbortHandlerT* handler = &std::abort;
BENCHMARK_EXPORT AbortHandlerT*& GetAbortHandler() { return handler; }
} // namespace internal
} // namespace benchmark

View File

@ -5,26 +5,43 @@
#include <cstdlib>
#include <ostream>
#include "benchmark/export.h"
#include "internal_macros.h"
#include "log.h"
#if defined(__GNUC__) || defined(__clang__)
#define BENCHMARK_NOEXCEPT noexcept
#define BENCHMARK_NOEXCEPT_OP(x) noexcept(x)
#elif defined(_MSC_VER) && !defined(__clang__)
#if _MSC_VER >= 1900
#define BENCHMARK_NOEXCEPT noexcept
#define BENCHMARK_NOEXCEPT_OP(x) noexcept(x)
#else
#define BENCHMARK_NOEXCEPT
#define BENCHMARK_NOEXCEPT_OP(x)
#endif
#define __func__ __FUNCTION__
#else
#define BENCHMARK_NOEXCEPT
#define BENCHMARK_NOEXCEPT_OP(x)
#endif
namespace benchmark {
namespace internal {
typedef void(AbortHandlerT)();
inline AbortHandlerT*& GetAbortHandler() {
static AbortHandlerT* handler = &std::abort;
return handler;
}
BENCHMARK_EXPORT
AbortHandlerT*& GetAbortHandler();
BENCHMARK_NORETURN inline void CallAbortHandler() {
GetAbortHandler()();
std::abort(); // fallback to enforce noreturn
}
// CheckHandler is the class constructed by failing CHECK macros. CheckHandler
// will log information about the failures and abort when it is destructed.
// CheckHandler is the class constructed by failing BM_CHECK macros.
// CheckHandler will log information about the failures and abort when it is
// destructed.
class CheckHandler {
public:
CheckHandler(const char* check, const char* file, const char* func, int line)
@ -35,10 +52,17 @@ class CheckHandler {
LogType& GetLog() { return log_; }
#if defined(COMPILER_MSVC)
#pragma warning(push)
#pragma warning(disable : 4722)
#endif
BENCHMARK_NORETURN ~CheckHandler() BENCHMARK_NOEXCEPT_OP(false) {
log_ << std::endl;
CallAbortHandler();
}
#if defined(COMPILER_MSVC)
#pragma warning(pop)
#endif
CheckHandler& operator=(const CheckHandler&) = delete;
CheckHandler(const CheckHandler&) = delete;
@ -51,32 +75,32 @@ class CheckHandler {
} // end namespace internal
} // end namespace benchmark
// The CHECK macro returns a std::ostream object that can have extra information
// written to it.
// The BM_CHECK macro returns a std::ostream object that can have extra
// information written to it.
#ifndef NDEBUG
#define CHECK(b) \
#define BM_CHECK(b) \
(b ? ::benchmark::internal::GetNullLogInstance() \
: ::benchmark::internal::CheckHandler(#b, __FILE__, __func__, __LINE__) \
.GetLog())
#else
#define CHECK(b) ::benchmark::internal::GetNullLogInstance()
#define BM_CHECK(b) ::benchmark::internal::GetNullLogInstance()
#endif
// clang-format off
// preserve whitespacing between operators for alignment
#define CHECK_EQ(a, b) CHECK((a) == (b))
#define CHECK_NE(a, b) CHECK((a) != (b))
#define CHECK_GE(a, b) CHECK((a) >= (b))
#define CHECK_LE(a, b) CHECK((a) <= (b))
#define CHECK_GT(a, b) CHECK((a) > (b))
#define CHECK_LT(a, b) CHECK((a) < (b))
#define BM_CHECK_EQ(a, b) BM_CHECK((a) == (b))
#define BM_CHECK_NE(a, b) BM_CHECK((a) != (b))
#define BM_CHECK_GE(a, b) BM_CHECK((a) >= (b))
#define BM_CHECK_LE(a, b) BM_CHECK((a) <= (b))
#define BM_CHECK_GT(a, b) BM_CHECK((a) > (b))
#define BM_CHECK_LT(a, b) BM_CHECK((a) < (b))
#define CHECK_FLOAT_EQ(a, b, eps) CHECK(std::fabs((a) - (b)) < (eps))
#define CHECK_FLOAT_NE(a, b, eps) CHECK(std::fabs((a) - (b)) >= (eps))
#define CHECK_FLOAT_GE(a, b, eps) CHECK((a) - (b) > -(eps))
#define CHECK_FLOAT_LE(a, b, eps) CHECK((b) - (a) > -(eps))
#define CHECK_FLOAT_GT(a, b, eps) CHECK((a) - (b) > (eps))
#define CHECK_FLOAT_LT(a, b, eps) CHECK((b) - (a) > (eps))
#define BM_CHECK_FLOAT_EQ(a, b, eps) BM_CHECK(std::fabs((a) - (b)) < (eps))
#define BM_CHECK_FLOAT_NE(a, b, eps) BM_CHECK(std::fabs((a) - (b)) >= (eps))
#define BM_CHECK_FLOAT_GE(a, b, eps) BM_CHECK((a) - (b) > -(eps))
#define BM_CHECK_FLOAT_LE(a, b, eps) BM_CHECK((b) - (a) > -(eps))
#define BM_CHECK_FLOAT_GT(a, b, eps) BM_CHECK((a) - (b) > (eps))
#define BM_CHECK_FLOAT_LT(a, b, eps) BM_CHECK((b) - (a) > (eps))
// clang-format on
#endif // CHECK_H_
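// Illustrative use of the BM_CHECK family defined above (these macros are
// internal to the library, not part of the public benchmark API):
//
//   BM_CHECK(ptr != nullptr) << "ptr must be set before calling Foo";
//   BM_CHECK_GE(n.size(), 2) << "need at least two data points";
//   BM_CHECK_FLOAT_EQ(measured, expected, 1e-9);
//
// In NDEBUG builds BM_CHECK(b) expands to the null log stream, so the
// condition itself is not evaluated; streamed message operands are still
// evaluated but their output is discarded.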

View File

@ -25,8 +25,8 @@
#include "internal_macros.h"
#ifdef BENCHMARK_OS_WINDOWS
#include <windows.h>
#include <io.h>
#include <windows.h>
#else
#include <unistd.h>
#endif // BENCHMARK_OS_WINDOWS
@ -94,20 +94,20 @@ std::string FormatString(const char* msg, va_list args) {
va_end(args_cp);
// currently there is no error handling for failure, so this is a hack.
CHECK(ret >= 0);
BM_CHECK(ret >= 0);
if (ret == 0) // handle empty expansion
if (ret == 0) { // handle empty expansion
return {};
else if (static_cast<size_t>(ret) < size)
return local_buff;
else {
// we did not provide a long enough buffer on our first attempt.
size = (size_t)ret + 1; // + 1 for the null byte
std::unique_ptr<char[]> buff(new char[size]);
ret = vsnprintf(buff.get(), size, msg, args);
CHECK(ret > 0 && ((size_t)ret) < size);
return buff.get();
}
if (static_cast<size_t>(ret) < size) {
return local_buff;
}
// we did not provide a long enough buffer on our first attempt.
size = static_cast<size_t>(ret) + 1; // + 1 for the null byte
std::unique_ptr<char[]> buff(new char[size]);
ret = vsnprintf(buff.get(), size, msg, args);
BM_CHECK(ret > 0 && (static_cast<size_t>(ret)) < size);
return buff.get();
}
std::string FormatString(const char* msg, ...) {
@ -163,12 +163,24 @@ bool IsColorTerminal() {
#else
// On non-Windows platforms, we rely on the TERM variable. This list of
// supported TERM values is copied from Google Test:
// <https://github.com/google/googletest/blob/master/googletest/src/gtest.cc#L2925>.
// <https://github.com/google/googletest/blob/v1.13.0/googletest/src/gtest.cc#L3225-L3259>.
const char* const SUPPORTED_TERM_VALUES[] = {
"xterm", "xterm-color", "xterm-256color",
"screen", "screen-256color", "tmux",
"tmux-256color", "rxvt-unicode", "rxvt-unicode-256color",
"linux", "cygwin",
"xterm",
"xterm-color",
"xterm-256color",
"screen",
"screen-256color",
"tmux",
"tmux-256color",
"rxvt-unicode",
"rxvt-unicode-256color",
"linux",
"cygwin",
"xterm-kitty",
"alacritty",
"foot",
"foot-extra",
"wezterm",
};
const char* const term = getenv("TERM");

View File

@ -20,6 +20,10 @@
#include <cstring>
#include <iostream>
#include <limits>
#include <map>
#include <utility>
#include "../src/string_util.h"
namespace benchmark {
namespace {
@ -78,6 +82,30 @@ bool ParseDouble(const std::string& src_text, const char* str, double* value) {
return true;
}
// Parses 'str' into KV pairs. If successful, writes the result to *value and
// returns true; otherwise leaves *value unchanged and returns false.
bool ParseKvPairs(const std::string& src_text, const char* str,
std::map<std::string, std::string>* value) {
std::map<std::string, std::string> kvs;
for (const auto& kvpair : StrSplit(str, ',')) {
const auto kv = StrSplit(kvpair, '=');
if (kv.size() != 2) {
std::cerr << src_text << " is expected to be a comma-separated list of "
<< "<key>=<value> strings, but actually has value \"" << str
<< "\".\n";
return false;
}
if (!kvs.emplace(kv[0], kv[1]).second) {
std::cerr << src_text << " is expected to contain unique keys but key \""
<< kv[0] << "\" was repeated.\n";
return false;
}
}
*value = kvs;
return true;
}
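// Illustrative behaviour of ParseKvPairs() above (the values are examples
// only): given str == "llvm=9,gcc=spack", *value becomes
// {{"llvm", "9"}, {"gcc", "spack"}} and true is returned. "llvm=9,llvm=10"
// (repeated key) and "llvm" (no '=') both leave *value unchanged and return
// false, after printing a diagnostic to std::cerr.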
// Returns the name of the environment variable corresponding to the
// given flag. For example, FlagToEnvVar("foo") will return
// "BENCHMARK_FOO" in the open-source version.
@ -93,12 +121,14 @@ static std::string FlagToEnvVar(const char* flag) {
} // namespace
BENCHMARK_EXPORT
bool BoolFromEnv(const char* flag, bool default_val) {
const std::string env_var = FlagToEnvVar(flag);
const char* const value_str = getenv(env_var.c_str());
return value_str == nullptr ? default_val : IsTruthyFlagValue(value_str);
}
BENCHMARK_EXPORT
int32_t Int32FromEnv(const char* flag, int32_t default_val) {
const std::string env_var = FlagToEnvVar(flag);
const char* const value_str = getenv(env_var.c_str());
@ -111,6 +141,7 @@ int32_t Int32FromEnv(const char* flag, int32_t default_val) {
return value;
}
BENCHMARK_EXPORT
double DoubleFromEnv(const char* flag, double default_val) {
const std::string env_var = FlagToEnvVar(flag);
const char* const value_str = getenv(env_var.c_str());
@ -123,12 +154,28 @@ double DoubleFromEnv(const char* flag, double default_val) {
return value;
}
BENCHMARK_EXPORT
const char* StringFromEnv(const char* flag, const char* default_val) {
const std::string env_var = FlagToEnvVar(flag);
const char* const value = getenv(env_var.c_str());
return value == nullptr ? default_val : value;
}
BENCHMARK_EXPORT
std::map<std::string, std::string> KvPairsFromEnv(
const char* flag, std::map<std::string, std::string> default_val) {
const std::string env_var = FlagToEnvVar(flag);
const char* const value_str = getenv(env_var.c_str());
if (value_str == nullptr) return default_val;
std::map<std::string, std::string> value;
if (!ParseKvPairs("Environment variable " + env_var, value_str, &value)) {
return default_val;
}
return value;
}
// Parses a string as a command line flag. The string should have
// the format "--flag=value". When def_optional is true, the "=value"
// part can be omitted.
@ -159,6 +206,7 @@ const char* ParseFlagValue(const char* str, const char* flag,
return flag_end + 1;
}
BENCHMARK_EXPORT
bool ParseBoolFlag(const char* str, const char* flag, bool* value) {
// Gets the value of the flag as a string.
const char* const value_str = ParseFlagValue(str, flag, true);
@ -171,6 +219,7 @@ bool ParseBoolFlag(const char* str, const char* flag, bool* value) {
return true;
}
BENCHMARK_EXPORT
bool ParseInt32Flag(const char* str, const char* flag, int32_t* value) {
// Gets the value of the flag as a string.
const char* const value_str = ParseFlagValue(str, flag, false);
@ -183,6 +232,7 @@ bool ParseInt32Flag(const char* str, const char* flag, int32_t* value) {
value);
}
BENCHMARK_EXPORT
bool ParseDoubleFlag(const char* str, const char* flag, double* value) {
// Gets the value of the flag as a string.
const char* const value_str = ParseFlagValue(str, flag, false);
@ -195,6 +245,7 @@ bool ParseDoubleFlag(const char* str, const char* flag, double* value) {
value);
}
BENCHMARK_EXPORT
bool ParseStringFlag(const char* str, const char* flag, std::string* value) {
// Gets the value of the flag as a string.
const char* const value_str = ParseFlagValue(str, flag, false);
@ -206,23 +257,42 @@ bool ParseStringFlag(const char* str, const char* flag, std::string* value) {
return true;
}
BENCHMARK_EXPORT
bool ParseKeyValueFlag(const char* str, const char* flag,
std::map<std::string, std::string>* value) {
const char* const value_str = ParseFlagValue(str, flag, false);
if (value_str == nullptr) return false;
for (const auto& kvpair : StrSplit(value_str, ',')) {
const auto kv = StrSplit(kvpair, '=');
if (kv.size() != 2) return false;
value->emplace(kv[0], kv[1]);
}
return true;
}
BENCHMARK_EXPORT
bool IsFlag(const char* str, const char* flag) {
return (ParseFlagValue(str, flag, true) != nullptr);
}
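// Illustrative behaviour of the parsers above (flag names are examples only):
//
//   std::string filter;
//   ParseStringFlag("--benchmark_filter=BM_Foo.*", "benchmark_filter",
//                   &filter);
//   // returns true, filter == "BM_Foo.*"
//
//   bool list = false;
//   ParseBoolFlag("--benchmark_list_tests", "benchmark_list_tests", &list);
//   // "=value" may be omitted for bools, so this returns true, list == true
//
//   IsFlag("--benchmark_repetitions=3", "benchmark_repetitions");  // true
//   IsFlag("--benchmark_repetitions=3", "benchmark_filter");       // false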
BENCHMARK_EXPORT
bool IsTruthyFlagValue(const std::string& value) {
if (value.size() == 1) {
char v = value[0];
return isalnum(v) &&
!(v == '0' || v == 'f' || v == 'F' || v == 'n' || v == 'N');
} else if (!value.empty()) {
}
if (!value.empty()) {
std::string value_lower(value);
std::transform(value_lower.begin(), value_lower.end(), value_lower.begin(),
[](char c) { return static_cast<char>(::tolower(c)); });
return !(value_lower == "false" || value_lower == "no" ||
value_lower == "off");
} else
return true;
}
return true;
}
} // end namespace benchmark

View File

@ -2,61 +2,80 @@
#define BENCHMARK_COMMANDLINEFLAGS_H_
#include <cstdint>
#include <map>
#include <string>
#include "benchmark/export.h"
// Macro for referencing flags.
#define FLAG(name) FLAGS_##name
// Macros for declaring flags.
#define DECLARE_bool(name) extern bool FLAG(name)
#define DECLARE_int32(name) extern int32_t FLAG(name)
#define DECLARE_double(name) extern double FLAG(name)
#define DECLARE_string(name) extern std::string FLAG(name)
#define BM_DECLARE_bool(name) BENCHMARK_EXPORT extern bool FLAG(name)
#define BM_DECLARE_int32(name) BENCHMARK_EXPORT extern int32_t FLAG(name)
#define BM_DECLARE_double(name) BENCHMARK_EXPORT extern double FLAG(name)
#define BM_DECLARE_string(name) BENCHMARK_EXPORT extern std::string FLAG(name)
#define BM_DECLARE_kvpairs(name) \
BENCHMARK_EXPORT extern std::map<std::string, std::string> FLAG(name)
// Macros for defining flags.
#define DEFINE_bool(name, default_val) \
bool FLAG(name) = \
benchmark::BoolFromEnv(#name, default_val)
#define DEFINE_int32(name, default_val) \
int32_t FLAG(name) = \
benchmark::Int32FromEnv(#name, default_val)
#define DEFINE_double(name, default_val) \
double FLAG(name) = \
benchmark::DoubleFromEnv(#name, default_val)
#define DEFINE_string(name, default_val) \
std::string FLAG(name) = \
benchmark::StringFromEnv(#name, default_val)
#define BM_DEFINE_bool(name, default_val) \
BENCHMARK_EXPORT bool FLAG(name) = benchmark::BoolFromEnv(#name, default_val)
#define BM_DEFINE_int32(name, default_val) \
BENCHMARK_EXPORT int32_t FLAG(name) = \
benchmark::Int32FromEnv(#name, default_val)
#define BM_DEFINE_double(name, default_val) \
BENCHMARK_EXPORT double FLAG(name) = \
benchmark::DoubleFromEnv(#name, default_val)
#define BM_DEFINE_string(name, default_val) \
BENCHMARK_EXPORT std::string FLAG(name) = \
benchmark::StringFromEnv(#name, default_val)
#define BM_DEFINE_kvpairs(name, default_val) \
BENCHMARK_EXPORT std::map<std::string, std::string> FLAG(name) = \
benchmark::KvPairsFromEnv(#name, default_val)
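// For illustration, a definition such as
//
//   BM_DEFINE_string(benchmark_filter, "");
//
// expands (via FLAG(name) == FLAGS_##name) to roughly
//
//   BENCHMARK_EXPORT std::string FLAGS_benchmark_filter =
//       benchmark::StringFromEnv("benchmark_filter", "");
//
// i.e. every flag is an exported global whose default can be overridden by the
// corresponding BENCHMARK_<NAME> environment variable before command-line
// parsing runs.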
namespace benchmark {
// Parses a bool from the environment variable
// corresponding to the given flag.
// Parses a bool from the environment variable corresponding to the given flag.
//
// If the variable exists, returns IsTruthyFlagValue() value; if not,
// returns the given default value.
BENCHMARK_EXPORT
bool BoolFromEnv(const char* flag, bool default_val);
// Parses an Int32 from the environment variable
// corresponding to the given flag.
// Parses an Int32 from the environment variable corresponding to the given
// flag.
//
// If the variable exists, returns ParseInt32() value; if not, returns
// the given default value.
BENCHMARK_EXPORT
int32_t Int32FromEnv(const char* flag, int32_t default_val);
// Parses an Double from the environment variable
// corresponding to the given flag.
// Parses a Double from the environment variable corresponding to the given
// flag.
//
// If the variable exists, returns ParseDouble(); if not, returns
// the given default value.
BENCHMARK_EXPORT
double DoubleFromEnv(const char* flag, double default_val);
// Parses a string from the environment variable
// corresponding to the given flag.
// Parses a string from the environment variable corresponding to the given
// flag.
//
// If variable exists, returns its value; if not, returns
// the given default value.
BENCHMARK_EXPORT
const char* StringFromEnv(const char* flag, const char* default_val);
// Parses a set of kvpairs from the environment variable corresponding to the
// given flag.
//
// If variable exists, returns its value; if not, returns
// the given default value.
BENCHMARK_EXPORT
std::map<std::string, std::string> KvPairsFromEnv(
const char* flag, std::map<std::string, std::string> default_val);
// Parses a string for a bool flag, in the form of either
// "--flag=value" or "--flag".
//
@ -66,36 +85,47 @@ const char* StringFromEnv(const char* flag, const char* default_val);
//
// On success, stores the value of the flag in *value, and returns
// true. On failure, returns false without changing *value.
BENCHMARK_EXPORT
bool ParseBoolFlag(const char* str, const char* flag, bool* value);
// Parses a string for an Int32 flag, in the form of
// "--flag=value".
// Parses a string for an Int32 flag, in the form of "--flag=value".
//
// On success, stores the value of the flag in *value, and returns
// true. On failure, returns false without changing *value.
BENCHMARK_EXPORT
bool ParseInt32Flag(const char* str, const char* flag, int32_t* value);
// Parses a string for a Double flag, in the form of
// "--flag=value".
// Parses a string for a Double flag, in the form of "--flag=value".
//
// On success, stores the value of the flag in *value, and returns
// true. On failure, returns false without changing *value.
BENCHMARK_EXPORT
bool ParseDoubleFlag(const char* str, const char* flag, double* value);
// Parses a string for a string flag, in the form of
// "--flag=value".
// Parses a string for a string flag, in the form of "--flag=value".
//
// On success, stores the value of the flag in *value, and returns
// true. On failure, returns false without changing *value.
BENCHMARK_EXPORT
bool ParseStringFlag(const char* str, const char* flag, std::string* value);
// Parses a string for a kvpairs flag in the form "--flag=key=value,key=value"
//
// On success, stores the value of the flag in *value and returns true. On
// failure returns false, though *value may have been mutated.
BENCHMARK_EXPORT
bool ParseKeyValueFlag(const char* str, const char* flag,
std::map<std::string, std::string>* value);
// Returns true if the string matches the flag.
BENCHMARK_EXPORT
bool IsFlag(const char* str, const char* flag);
// Returns true unless value starts with one of: '0', 'f', 'F', 'n' or 'N', or
// some non-alphanumeric character. Also returns false if the value matches
// one of 'no', 'false', 'off' (case-insensitive). As a special case, also
// returns true if value is the empty string.
BENCHMARK_EXPORT
bool IsTruthyFlagValue(const std::string& value);
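// For illustration: "1", "true", "on", "yes" and "" are all truthy, while
// "0", "false", "OFF", "no" and "N" are not.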
} // end namespace benchmark

View File

@ -15,12 +15,13 @@
// Source project : https://github.com/ismaelJimenez/cpp.leastsq
// Adapted to be used with google benchmark
#include "benchmark/benchmark.h"
#include "complexity.h"
#include <algorithm>
#include <cmath>
#include "benchmark/benchmark.h"
#include "check.h"
#include "complexity.h"
namespace benchmark {
@ -82,7 +83,6 @@ std::string GetBigOString(BigO complexity) {
LeastSq MinimalLeastSq(const std::vector<int64_t>& n,
const std::vector<double>& time,
BigOFunc* fitting_curve) {
double sigma_gn = 0.0;
double sigma_gn_squared = 0.0;
double sigma_time = 0.0;
double sigma_time_gn = 0.0;
@ -90,7 +90,6 @@ LeastSq MinimalLeastSq(const std::vector<int64_t>& n,
// Calculate least square fitting parameter
for (size_t i = 0; i < n.size(); ++i) {
double gn_i = fitting_curve(n[i]);
sigma_gn += gn_i;
sigma_gn_squared += gn_i * gn_i;
sigma_time += time[i];
sigma_time_gn += time[i] * gn_i;
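// With the accumulators above, the least-squares fit of
// time ~= coef * fitting_curve(n) (a regression through the origin) has the
// closed-form solution coef = sigma_time_gn / sigma_gn_squared; the rms value
// reported alongside it is the root mean squared residual of that fit,
// normalized by the mean measured time.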
@ -125,10 +124,10 @@ LeastSq MinimalLeastSq(const std::vector<int64_t>& n,
// fitting curve.
LeastSq MinimalLeastSq(const std::vector<int64_t>& n,
const std::vector<double>& time, const BigO complexity) {
CHECK_EQ(n.size(), time.size());
CHECK_GE(n.size(), 2); // Do not compute fitting curve is less than two
// benchmark runs are given
CHECK_NE(complexity, oNone);
BM_CHECK_EQ(n.size(), time.size());
BM_CHECK_GE(n.size(), 2);  // Do not compute fitting curve if less than two
// benchmark runs are given
BM_CHECK_NE(complexity, oNone);
LeastSq best_fit;
@ -169,7 +168,8 @@ std::vector<BenchmarkReporter::Run> ComputeBigO(
// Populate the accumulators.
for (const Run& run : reports) {
CHECK_GT(run.complexity_n, 0) << "Did you forget to call SetComplexityN?";
BM_CHECK_GT(run.complexity_n, 0)
<< "Did you forget to call SetComplexityN?";
n.push_back(run.complexity_n);
real_time.push_back(run.real_accumulated_time / run.iterations);
cpu_time.push_back(run.cpu_accumulated_time / run.iterations);
@ -193,11 +193,14 @@ std::vector<BenchmarkReporter::Run> ComputeBigO(
// Get the data from the accumulator to BenchmarkReporter::Run's.
Run big_o;
big_o.run_name = run_name;
big_o.family_index = reports[0].family_index;
big_o.per_family_instance_index = reports[0].per_family_instance_index;
big_o.run_type = BenchmarkReporter::Run::RT_Aggregate;
big_o.repetitions = reports[0].repetitions;
big_o.repetition_index = Run::no_repetition_index;
big_o.threads = reports[0].threads;
big_o.aggregate_name = "BigO";
big_o.aggregate_unit = StatisticUnit::kTime;
big_o.report_label = reports[0].report_label;
big_o.iterations = 0;
big_o.real_accumulated_time = result_real.coef;
@ -215,8 +218,11 @@ std::vector<BenchmarkReporter::Run> ComputeBigO(
// Only add label to mean/stddev if it is same for all runs
Run rms;
rms.run_name = run_name;
rms.family_index = reports[0].family_index;
rms.per_family_instance_index = reports[0].per_family_instance_index;
rms.run_type = BenchmarkReporter::Run::RT_Aggregate;
rms.aggregate_name = "RMS";
rms.aggregate_unit = StatisticUnit::kPercentage;
rms.report_label = big_o.report_label;
rms.iterations = 0;
rms.repetition_index = Run::no_repetition_index;

View File

@ -31,7 +31,7 @@ std::vector<BenchmarkReporter::Run> ComputeBigO(
const std::vector<BenchmarkReporter::Run>& reports);
// This data structure will contain the result returned by MinimalLeastSq
// - coef : Estimated coeficient for the high-order term as
// - coef : Estimated coefficient for the high-order term as
// interpolated from data.
// - rms : Normalized Root Mean Squared Error.
// - complexity : Scalability form (e.g. oN, oNLogN). In case a scalability

View File

@ -33,6 +33,7 @@
namespace benchmark {
BENCHMARK_EXPORT
bool ConsoleReporter::ReportContext(const Context& context) {
name_field_width_ = context.name_field_width;
printed_header_ = false;
@ -45,19 +46,21 @@ bool ConsoleReporter::ReportContext(const Context& context) {
GetErrorStream()
<< "Color printing is only supported for stdout on windows."
" Disabling color printing\n";
output_options_ = static_cast< OutputOptions >(output_options_ & ~OO_Color);
output_options_ = static_cast<OutputOptions>(output_options_ & ~OO_Color);
}
#endif
return true;
}
BENCHMARK_EXPORT
void ConsoleReporter::PrintHeader(const Run& run) {
std::string str = FormatString("%-*s %13s %15s %12s", static_cast<int>(name_field_width_),
"Benchmark", "Time", "CPU", "Iterations");
if(!run.counters.empty()) {
if(output_options_ & OO_Tabular) {
for(auto const& c : run.counters) {
std::string str =
FormatString("%-*s %13s %15s %12s", static_cast<int>(name_field_width_),
"Benchmark", "Time", "CPU", "Iterations");
if (!run.counters.empty()) {
if (output_options_ & OO_Tabular) {
for (auto const& c : run.counters) {
str += FormatString(" %10s", c.first.c_str());
}
} else {
@ -68,6 +71,7 @@ void ConsoleReporter::PrintHeader(const Run& run) {
GetOutputStream() << line << "\n" << str << "\n" << line << "\n";
}
BENCHMARK_EXPORT
void ConsoleReporter::ReportRuns(const std::vector<Run>& reports) {
for (const auto& run : reports) {
// print the header:
@ -97,8 +101,10 @@ static void IgnoreColorPrint(std::ostream& out, LogColor, const char* fmt,
va_end(args);
}
static std::string FormatTime(double time) {
// For the time columns of the console printer 13 digits are reserved. One of
// them is a space and max two of them are the time unit (e.g. ns). That puts
// us at 10 digits usable for the number.
// Align decimal places...
if (time < 1.0) {
return FormatString("%10.3f", time);
@ -109,22 +115,33 @@ static std::string FormatTime(double time) {
if (time < 100.0) {
return FormatString("%10.1f", time);
}
// Assuming the time is at max 9.9999e+99 and we have 10 digits for the
// number, we get 10-1(.)-1(e)-1(sign)-2(exponent) = 5 digits to print.
if (time > 9999999999 /*max 10 digit number*/) {
return FormatString("%1.4e", time);
}
return FormatString("%10.0f", time);
}
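// Illustrative outputs of FormatTime() above (all padded to 10 characters):
//   FormatTime(0.123)   -> "     0.123"
//   FormatTime(3.14159) -> "      3.14"
//   FormatTime(42.0)    -> "      42.0"
//   FormatTime(1234.0)  -> "      1234"
//   FormatTime(1.5e12)  -> "1.5000e+12"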
BENCHMARK_EXPORT
void ConsoleReporter::PrintRunData(const Run& result) {
typedef void(PrinterFn)(std::ostream&, LogColor, const char*, ...);
auto& Out = GetOutputStream();
PrinterFn* printer = (output_options_ & OO_Color) ?
(PrinterFn*)ColorPrintf : IgnoreColorPrint;
PrinterFn* printer = (output_options_ & OO_Color)
? static_cast<PrinterFn*>(ColorPrintf)
: IgnoreColorPrint;
auto name_color =
(result.report_big_o || result.report_rms) ? COLOR_BLUE : COLOR_GREEN;
printer(Out, name_color, "%-*s ", name_field_width_,
result.benchmark_name().c_str());
if (result.error_occurred) {
if (internal::SkippedWithError == result.skipped) {
printer(Out, COLOR_RED, "ERROR OCCURRED: \'%s\'",
result.error_message.c_str());
result.skip_message.c_str());
printer(Out, COLOR_DEFAULT, "\n");
return;
} else if (internal::SkippedWithMessage == result.skipped) {
printer(Out, COLOR_WHITE, "SKIPPED: \'%s\'", result.skip_message.c_str());
printer(Out, COLOR_DEFAULT, "\n");
return;
}
@ -134,18 +151,23 @@ void ConsoleReporter::PrintRunData(const Run& result) {
const std::string real_time_str = FormatTime(real_time);
const std::string cpu_time_str = FormatTime(cpu_time);
if (result.report_big_o) {
std::string big_o = GetBigOString(result.complexity);
printer(Out, COLOR_YELLOW, "%10.2f %-4s %10.2f %-4s ", real_time, big_o.c_str(),
cpu_time, big_o.c_str());
printer(Out, COLOR_YELLOW, "%10.2f %-4s %10.2f %-4s ", real_time,
big_o.c_str(), cpu_time, big_o.c_str());
} else if (result.report_rms) {
printer(Out, COLOR_YELLOW, "%10.0f %-4s %10.0f %-4s ", real_time * 100, "%",
cpu_time * 100, "%");
} else {
} else if (result.run_type != Run::RT_Aggregate ||
result.aggregate_unit == StatisticUnit::kTime) {
const char* timeLabel = GetTimeUnitString(result.time_unit);
printer(Out, COLOR_YELLOW, "%s %-4s %s %-4s ", real_time_str.c_str(), timeLabel,
cpu_time_str.c_str(), timeLabel);
printer(Out, COLOR_YELLOW, "%s %-4s %s %-4s ", real_time_str.c_str(),
timeLabel, cpu_time_str.c_str(), timeLabel);
} else {
assert(result.aggregate_unit == StatisticUnit::kPercentage);
printer(Out, COLOR_YELLOW, "%10.2f %-4s %10.2f %-4s ",
(100. * result.real_accumulated_time), "%",
(100. * result.cpu_accumulated_time), "%");
}
if (!result.report_big_o && !result.report_rms) {
@ -153,12 +175,19 @@ void ConsoleReporter::PrintRunData(const Run& result) {
}
for (auto& c : result.counters) {
const std::size_t cNameLen = std::max(std::string::size_type(10),
c.first.length());
auto const& s = HumanReadableNumber(c.second.value, c.second.oneK);
const std::size_t cNameLen =
std::max(std::string::size_type(10), c.first.length());
std::string s;
const char* unit = "";
if (c.second.flags & Counter::kIsRate)
unit = (c.second.flags & Counter::kInvert) ? "s" : "/s";
if (result.run_type == Run::RT_Aggregate &&
result.aggregate_unit == StatisticUnit::kPercentage) {
s = StrFormat("%.2f", 100. * c.second.value);
unit = "%";
} else {
s = HumanReadableNumber(c.second.value, c.second.oneK);
if (c.second.flags & Counter::kIsRate)
unit = (c.second.flags & Counter::kInvert) ? "s" : "/s";
}
if (output_options_ & OO_Tabular) {
printer(Out, COLOR_DEFAULT, " %*s%s", cNameLen - strlen(unit), s.c_str(),
unit);

View File

@ -12,9 +12,6 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "benchmark/benchmark.h"
#include "complexity.h"
#include <algorithm>
#include <cstdint>
#include <iostream>
@ -22,7 +19,9 @@
#include <tuple>
#include <vector>
#include "benchmark/benchmark.h"
#include "check.h"
#include "complexity.h"
#include "string_util.h"
#include "timers.h"
@ -37,23 +36,29 @@ std::vector<std::string> elements = {
"error_occurred", "error_message"};
} // namespace
std::string CsvEscape(const std::string & s) {
std::string CsvEscape(const std::string& s) {
std::string tmp;
tmp.reserve(s.size() + 2);
for (char c : s) {
switch (c) {
case '"' : tmp += "\"\""; break;
default : tmp += c; break;
case '"':
tmp += "\"\"";
break;
default:
tmp += c;
break;
}
}
return '"' + tmp + '"';
}
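// Illustrative behaviour of CsvEscape() above: the whole string is wrapped in
// double quotes and embedded quotes are doubled, so
//   CsvEscape("plain")      -> "\"plain\""
//   CsvEscape("say \"hi\"") -> "\"say \"\"hi\"\"\""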
BENCHMARK_EXPORT
bool CSVReporter::ReportContext(const Context& context) {
PrintBasicContext(&GetErrorStream(), context);
return true;
}
BENCHMARK_EXPORT
void CSVReporter::ReportRuns(const std::vector<Run>& reports) {
std::ostream& Out = GetOutputStream();
@ -85,7 +90,8 @@ void CSVReporter::ReportRuns(const std::vector<Run>& reports) {
for (const auto& cnt : run.counters) {
if (cnt.first == "bytes_per_second" || cnt.first == "items_per_second")
continue;
CHECK(user_counter_names_.find(cnt.first) != user_counter_names_.end())
BM_CHECK(user_counter_names_.find(cnt.first) !=
user_counter_names_.end())
<< "All counters must be present in each run. "
<< "Counter named \"" << cnt.first
<< "\" was not in a run after being added to the header";
@ -99,13 +105,14 @@ void CSVReporter::ReportRuns(const std::vector<Run>& reports) {
}
}
BENCHMARK_EXPORT
void CSVReporter::PrintRunData(const Run& run) {
std::ostream& Out = GetOutputStream();
Out << CsvEscape(run.benchmark_name()) << ",";
if (run.error_occurred) {
if (run.skipped) {
Out << std::string(elements.size() - 3, ',');
Out << "true,";
Out << CsvEscape(run.error_message) << "\n";
Out << std::boolalpha << (internal::SkippedWithError == run.skipped) << ",";
Out << CsvEscape(run.skip_message) << "\n";
return;
}

View File

@ -36,7 +36,8 @@
// declarations of some other intrinsics, breaking compilation.
// Therefore, we simply declare __rdtsc ourselves. See also
// http://connect.microsoft.com/VisualStudio/feedback/details/262047
#if defined(COMPILER_MSVC) && !defined(_M_IX86)
#if defined(COMPILER_MSVC) && !defined(_M_IX86) && !defined(_M_ARM64) && \
!defined(_M_ARM64EC)
extern "C" uint64_t __rdtsc();
#pragma intrinsic(__rdtsc)
#endif
@ -92,7 +93,7 @@ inline BENCHMARK_ALWAYS_INLINE int64_t Now() {
uint32_t tbl, tbu0, tbu1;
asm volatile(
"mftbu %0\n"
"mftbl %1\n"
"mftb %1\n"
"mftbu %2"
: "=r"(tbu0), "=r"(tbl), "=r"(tbu1));
tbl &= -static_cast<int32_t>(tbu0 == tbu1);
@ -114,6 +115,12 @@ inline BENCHMARK_ALWAYS_INLINE int64_t Now() {
// when I know it will work. Otherwise, I'll use __rdtsc and hope
// the code is being compiled with a non-ancient compiler.
_asm rdtsc
#elif defined(COMPILER_MSVC) && (defined(_M_ARM64) || defined(_M_ARM64EC))
// See https://docs.microsoft.com/en-us/cpp/intrinsics/arm64-intrinsics
// and https://reviews.llvm.org/D53115
int64_t virtual_timer_value;
virtual_timer_value = _ReadStatusReg(ARM64_CNTVCT);
return virtual_timer_value;
#elif defined(COMPILER_MSVC)
return __rdtsc();
#elif defined(BENCHMARK_OS_NACL)
@ -126,7 +133,7 @@ inline BENCHMARK_ALWAYS_INLINE int64_t Now() {
// Native Client does not provide any API to access cycle counter.
// Use clock_gettime(CLOCK_MONOTONIC, ...) instead of gettimeofday
// because is provides nanosecond resolution (which is noticable at
// because it provides nanosecond resolution (which is noticeable at
// least for PNaCl modules running on x86 Mac & Linux).
// Initialize to always return 0 if clock_gettime fails.
struct timespec ts = {0, 0};
@ -161,18 +168,27 @@ inline BENCHMARK_ALWAYS_INLINE int64_t Now() {
struct timeval tv;
gettimeofday(&tv, nullptr);
return static_cast<int64_t>(tv.tv_sec) * 1000000 + tv.tv_usec;
#elif defined(__mips__)
#elif defined(__mips__) || defined(__m68k__)
// mips apparently only allows rdtsc for superusers, so we fall
// back to gettimeofday. It's possible clock_gettime would be better.
struct timeval tv;
gettimeofday(&tv, nullptr);
return static_cast<int64_t>(tv.tv_sec) * 1000000 + tv.tv_usec;
#elif defined(__loongarch__) || defined(__csky__)
struct timeval tv;
gettimeofday(&tv, nullptr);
return static_cast<int64_t>(tv.tv_sec) * 1000000 + tv.tv_usec;
#elif defined(__s390__) // Covers both s390 and s390x.
// Return the CPU clock.
uint64_t tsc;
#if defined(BENCHMARK_OS_ZOS) && defined(COMPILER_IBMXL)
// z/OS XL compiler HLASM syntax.
asm(" stck %0" : "=m"(tsc) : : "cc");
#else
asm("stck %0" : "=Q"(tsc) : : "cc");
#endif
return tsc;
#elif defined(__riscv) // RISC-V
#elif defined(__riscv) // RISC-V
// Use RDCYCLE (and RDCYCLEH on riscv32)
#if __riscv_xlen == 32
uint32_t cycles_lo, cycles_hi0, cycles_hi1;
@ -193,6 +209,14 @@ inline BENCHMARK_ALWAYS_INLINE int64_t Now() {
asm volatile("rdcycle %0" : "=r"(cycles));
return cycles;
#endif
#elif defined(__e2k__) || defined(__elbrus__)
struct timeval tv;
gettimeofday(&tv, nullptr);
return static_cast<int64_t>(tv.tv_sec) * 1000000 + tv.tv_usec;
#elif defined(__hexagon__)
uint64_t pcycle;
asm volatile("%0 = C15:14" : "=r"(pcycle));
return static_cast<double>(pcycle);
#else
// The soft failover to a generic implementation is automatic only for ARM.
// For other platforms the developer is expected to make an attempt to create

View File

@ -1,8 +1,6 @@
#ifndef BENCHMARK_INTERNAL_MACROS_H_
#define BENCHMARK_INTERNAL_MACROS_H_
#include "benchmark/benchmark.h"
/* Needed to detect STL */
#include <cstdlib>
@ -13,7 +11,11 @@
#endif
#if defined(__clang__)
#if !defined(COMPILER_CLANG)
#if defined(__ibmxl__)
#if !defined(COMPILER_IBMXL)
#define COMPILER_IBMXL
#endif
#elif !defined(COMPILER_CLANG)
#define COMPILER_CLANG
#endif
#elif defined(_MSC_VER)
@ -40,6 +42,19 @@
#define BENCHMARK_OS_CYGWIN 1
#elif defined(_WIN32)
#define BENCHMARK_OS_WINDOWS 1
// WINAPI_FAMILY_PARTITION is defined in winapifamily.h.
// We include windows.h which implicitly includes winapifamily.h for compatibility.
#ifndef NOMINMAX
#define NOMINMAX
#endif
#include <windows.h>
#if defined(WINAPI_FAMILY_PARTITION)
#if WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP)
#define BENCHMARK_OS_WINDOWS_WIN32 1
#elif WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_APP)
#define BENCHMARK_OS_WINDOWS_RT 1
#endif
#endif
#if defined(__MINGW32__)
#define BENCHMARK_OS_MINGW 1
#endif
@ -58,6 +73,8 @@
#define BENCHMARK_OS_NETBSD 1
#elif defined(__OpenBSD__)
#define BENCHMARK_OS_OPENBSD 1
#elif defined(__DragonFly__)
#define BENCHMARK_OS_DRAGONFLY 1
#elif defined(__linux__)
#define BENCHMARK_OS_LINUX 1
#elif defined(__native_client__)
@ -72,6 +89,10 @@
#define BENCHMARK_OS_SOLARIS 1
#elif defined(__QNX__)
#define BENCHMARK_OS_QNX 1
#elif defined(__MVS__)
#define BENCHMARK_OS_ZOS 1
#elif defined(__hexagon__)
#define BENCHMARK_OS_QURT 1
#endif
#if defined(__ANDROID__) && defined(__GLIBCXX__)

View File

@ -12,9 +12,6 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "benchmark/benchmark.h"
#include "complexity.h"
#include <algorithm>
#include <cmath>
#include <cstdint>
@ -25,41 +22,61 @@
#include <tuple>
#include <vector>
#include "benchmark/benchmark.h"
#include "complexity.h"
#include "string_util.h"
#include "timers.h"
namespace benchmark {
namespace {
std::string StrEscape(const std::string & s) {
std::string StrEscape(const std::string& s) {
std::string tmp;
tmp.reserve(s.size());
for (char c : s) {
switch (c) {
case '\b': tmp += "\\b"; break;
case '\f': tmp += "\\f"; break;
case '\n': tmp += "\\n"; break;
case '\r': tmp += "\\r"; break;
case '\t': tmp += "\\t"; break;
case '\\': tmp += "\\\\"; break;
case '"' : tmp += "\\\""; break;
default : tmp += c; break;
case '\b':
tmp += "\\b";
break;
case '\f':
tmp += "\\f";
break;
case '\n':
tmp += "\\n";
break;
case '\r':
tmp += "\\r";
break;
case '\t':
tmp += "\\t";
break;
case '\\':
tmp += "\\\\";
break;
case '"':
tmp += "\\\"";
break;
default:
tmp += c;
break;
}
}
return tmp;
}
std::string FormatKV(std::string const& key, std::string const& value) {
return StrFormat("\"%s\": \"%s\"", StrEscape(key).c_str(), StrEscape(value).c_str());
return StrFormat("\"%s\": \"%s\"", StrEscape(key).c_str(),
StrEscape(value).c_str());
}
std::string FormatKV(std::string const& key, const char* value) {
return StrFormat("\"%s\": \"%s\"", StrEscape(key).c_str(), StrEscape(value).c_str());
return StrFormat("\"%s\": \"%s\"", StrEscape(key).c_str(),
StrEscape(value).c_str());
}
std::string FormatKV(std::string const& key, bool value) {
return StrFormat("\"%s\": %s", StrEscape(key).c_str(), value ? "true" : "false");
return StrFormat("\"%s\": %s", StrEscape(key).c_str(),
value ? "true" : "false");
}
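// Illustrative JSON fragments produced by the FormatKV overloads above
// (string keys and values pass through StrEscape first):
//   FormatKV("name", std::string("BM_Foo/8"))  returns  "name": "BM_Foo/8"
//   FormatKV("has_result", true)               returns  "has_result": true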
std::string FormatKV(std::string const& key, int64_t value) {
@ -68,12 +85,6 @@ std::string FormatKV(std::string const& key, int64_t value) {
return ss.str();
}
std::string FormatKV(std::string const& key, IterationCount value) {
std::stringstream ss;
ss << '"' << StrEscape(key) << "\": " << value;
return ss.str();
}
std::string FormatKV(std::string const& key, double value) {
std::stringstream ss;
ss << '"' << StrEscape(key) << "\": ";
@ -123,7 +134,9 @@ bool JSONReporter::ReportContext(const Context& context) {
RoundDouble(info.cycles_per_second / 1000000.0))
<< ",\n";
if (CPUInfo::Scaling::UNKNOWN != info.scaling) {
out << indent << FormatKV("cpu_scaling_enabled", info.scaling == CPUInfo::Scaling::ENABLED ? true : false)
out << indent
<< FormatKV("cpu_scaling_enabled",
info.scaling == CPUInfo::Scaling::ENABLED ? true : false)
<< ",\n";
}
@ -136,8 +149,8 @@ bool JSONReporter::ReportContext(const Context& context) {
out << cache_indent << FormatKV("type", CI.type) << ",\n";
out << cache_indent << FormatKV("level", static_cast<int64_t>(CI.level))
<< ",\n";
out << cache_indent
<< FormatKV("size", static_cast<int64_t>(CI.size)) << ",\n";
out << cache_indent << FormatKV("size", static_cast<int64_t>(CI.size))
<< ",\n";
out << cache_indent
<< FormatKV("num_sharing", static_cast<int64_t>(CI.num_sharing))
<< "\n";
@ -159,7 +172,19 @@ bool JSONReporter::ReportContext(const Context& context) {
#else
const char build_type[] = "debug";
#endif
out << indent << FormatKV("library_build_type", build_type) << "\n";
out << indent << FormatKV("library_build_type", build_type);
std::map<std::string, std::string>* global_context =
internal::GetGlobalContext();
if (global_context != nullptr) {
for (const auto& kv : *global_context) {
out << ",\n";
out << indent << FormatKV(kv.first, kv.second);
}
}
out << "\n";
// Close context block and open the list of benchmarks.
out << inner_indent << "},\n";
out << inner_indent << "\"benchmarks\": [\n";
@ -197,6 +222,10 @@ void JSONReporter::PrintRunData(Run const& run) {
std::string indent(6, ' ');
std::ostream& out = GetOutputStream();
out << indent << FormatKV("name", run.benchmark_name()) << ",\n";
out << indent << FormatKV("family_index", run.family_index) << ",\n";
out << indent
<< FormatKV("per_family_instance_index", run.per_family_instance_index)
<< ",\n";
out << indent << FormatKV("run_name", run.run_name.str()) << ",\n";
out << indent << FormatKV("run_type", [&run]() -> const char* {
switch (run.run_type) {
@ -215,15 +244,36 @@ void JSONReporter::PrintRunData(Run const& run) {
out << indent << FormatKV("threads", run.threads) << ",\n";
if (run.run_type == BenchmarkReporter::Run::RT_Aggregate) {
out << indent << FormatKV("aggregate_name", run.aggregate_name) << ",\n";
out << indent << FormatKV("aggregate_unit", [&run]() -> const char* {
switch (run.aggregate_unit) {
case StatisticUnit::kTime:
return "time";
case StatisticUnit::kPercentage:
return "percentage";
}
BENCHMARK_UNREACHABLE();
}()) << ",\n";
}
if (run.error_occurred) {
out << indent << FormatKV("error_occurred", run.error_occurred) << ",\n";
out << indent << FormatKV("error_message", run.error_message) << ",\n";
if (internal::SkippedWithError == run.skipped) {
out << indent << FormatKV("error_occurred", true) << ",\n";
out << indent << FormatKV("error_message", run.skip_message) << ",\n";
} else if (internal::SkippedWithMessage == run.skipped) {
out << indent << FormatKV("skipped", true) << ",\n";
out << indent << FormatKV("skip_message", run.skip_message) << ",\n";
}
if (!run.report_big_o && !run.report_rms) {
out << indent << FormatKV("iterations", run.iterations) << ",\n";
out << indent << FormatKV("real_time", run.GetAdjustedRealTime()) << ",\n";
out << indent << FormatKV("cpu_time", run.GetAdjustedCPUTime());
if (run.run_type != Run::RT_Aggregate ||
run.aggregate_unit == StatisticUnit::kTime) {
out << indent << FormatKV("real_time", run.GetAdjustedRealTime())
<< ",\n";
out << indent << FormatKV("cpu_time", run.GetAdjustedCPUTime());
} else {
assert(run.aggregate_unit == StatisticUnit::kPercentage);
out << indent << FormatKV("real_time", run.real_accumulated_time)
<< ",\n";
out << indent << FormatKV("cpu_time", run.cpu_accumulated_time);
}
out << ",\n"
<< indent << FormatKV("time_unit", GetTimeUnitString(run.time_unit));
} else if (run.report_big_o) {
@ -241,9 +291,21 @@ void JSONReporter::PrintRunData(Run const& run) {
out << ",\n" << indent << FormatKV(c.first, c.second);
}
if (run.has_memory_result) {
if (run.memory_result) {
const MemoryManager::Result memory_result = *run.memory_result;
out << ",\n" << indent << FormatKV("allocs_per_iter", run.allocs_per_iter);
out << ",\n" << indent << FormatKV("max_bytes_used", run.max_bytes_used);
out << ",\n"
<< indent << FormatKV("max_bytes_used", memory_result.max_bytes_used);
auto report_if_present = [&out, &indent](const std::string& label,
int64_t val) {
if (val != MemoryManager::TombstoneValue)
out << ",\n" << indent << FormatKV(label, val);
};
report_if_present("total_allocated_bytes",
memory_result.total_allocated_bytes);
report_if_present("net_heap_growth", memory_result.net_heap_growth);
}
if (!run.report_label.empty()) {
@ -252,4 +314,7 @@ void JSONReporter::PrintRunData(Run const& run) {
out << '\n';
}
const int64_t MemoryManager::TombstoneValue =
std::numeric_limits<int64_t>::max();
} // end namespace benchmark

View File

@ -4,7 +4,12 @@
#include <iostream>
#include <ostream>
#include "benchmark/benchmark.h"
// NOTE: this is also defined in benchmark.h but we're trying to avoid a
// dependency.
// The _MSVC_LANG check should detect Visual Studio 2015 Update 3 and newer.
#if __cplusplus >= 201103L || (defined(_MSVC_LANG) && _MSVC_LANG >= 201103L)
#define BENCHMARK_HAS_CXX11
#endif
namespace benchmark {
namespace internal {
@ -23,7 +28,16 @@ class LogType {
private:
LogType(std::ostream* out) : out_(out) {}
std::ostream* out_;
BENCHMARK_DISALLOW_COPY_AND_ASSIGN(LogType);
// NOTE: we could use BENCHMARK_DISALLOW_COPY_AND_ASSIGN but we shouldn't have
// a dependency on benchmark.h from here.
#ifndef BENCHMARK_HAS_CXX11
LogType(const LogType&);
LogType& operator=(const LogType&);
#else
LogType(const LogType&) = delete;
LogType& operator=(const LogType&) = delete;
#endif
};
template <class Tp>
@ -47,13 +61,13 @@ inline int& LogLevel() {
}
inline LogType& GetNullLogInstance() {
static LogType log(nullptr);
return log;
static LogType null_log(static_cast<std::ostream*>(nullptr));
return null_log;
}
inline LogType& GetErrorLogInstance() {
static LogType log(&std::clog);
return log;
static LogType error_log(&std::clog);
return error_log;
}
inline LogType& GetLogInstanceForLevel(int level) {
@ -67,7 +81,7 @@ inline LogType& GetLogInstanceForLevel(int level) {
} // end namespace benchmark
// clang-format off
#define VLOG(x) \
#define BM_VLOG(x) \
(::benchmark::internal::GetLogInstanceForLevel(x) << "-- LOG(" << x << "):" \
" ")
// clang-format on

View File

@ -9,60 +9,60 @@
// Enable thread safety attributes only with clang.
// The attributes can be safely erased when compiling with other compilers.
#if defined(HAVE_THREAD_SAFETY_ATTRIBUTES)
#define THREAD_ANNOTATION_ATTRIBUTE__(x) __attribute__((x))
#define THREAD_ANNOTATION_ATTRIBUTE_(x) __attribute__((x))
#else
#define THREAD_ANNOTATION_ATTRIBUTE__(x) // no-op
#define THREAD_ANNOTATION_ATTRIBUTE_(x) // no-op
#endif
#define CAPABILITY(x) THREAD_ANNOTATION_ATTRIBUTE__(capability(x))
#define CAPABILITY(x) THREAD_ANNOTATION_ATTRIBUTE_(capability(x))
#define SCOPED_CAPABILITY THREAD_ANNOTATION_ATTRIBUTE__(scoped_lockable)
#define SCOPED_CAPABILITY THREAD_ANNOTATION_ATTRIBUTE_(scoped_lockable)
#define GUARDED_BY(x) THREAD_ANNOTATION_ATTRIBUTE__(guarded_by(x))
#define GUARDED_BY(x) THREAD_ANNOTATION_ATTRIBUTE_(guarded_by(x))
#define PT_GUARDED_BY(x) THREAD_ANNOTATION_ATTRIBUTE__(pt_guarded_by(x))
#define PT_GUARDED_BY(x) THREAD_ANNOTATION_ATTRIBUTE_(pt_guarded_by(x))
#define ACQUIRED_BEFORE(...) \
THREAD_ANNOTATION_ATTRIBUTE__(acquired_before(__VA_ARGS__))
THREAD_ANNOTATION_ATTRIBUTE_(acquired_before(__VA_ARGS__))
#define ACQUIRED_AFTER(...) \
THREAD_ANNOTATION_ATTRIBUTE__(acquired_after(__VA_ARGS__))
THREAD_ANNOTATION_ATTRIBUTE_(acquired_after(__VA_ARGS__))
#define REQUIRES(...) \
THREAD_ANNOTATION_ATTRIBUTE__(requires_capability(__VA_ARGS__))
THREAD_ANNOTATION_ATTRIBUTE_(requires_capability(__VA_ARGS__))
#define REQUIRES_SHARED(...) \
THREAD_ANNOTATION_ATTRIBUTE__(requires_shared_capability(__VA_ARGS__))
THREAD_ANNOTATION_ATTRIBUTE_(requires_shared_capability(__VA_ARGS__))
#define ACQUIRE(...) \
THREAD_ANNOTATION_ATTRIBUTE__(acquire_capability(__VA_ARGS__))
THREAD_ANNOTATION_ATTRIBUTE_(acquire_capability(__VA_ARGS__))
#define ACQUIRE_SHARED(...) \
THREAD_ANNOTATION_ATTRIBUTE__(acquire_shared_capability(__VA_ARGS__))
THREAD_ANNOTATION_ATTRIBUTE_(acquire_shared_capability(__VA_ARGS__))
#define RELEASE(...) \
THREAD_ANNOTATION_ATTRIBUTE__(release_capability(__VA_ARGS__))
THREAD_ANNOTATION_ATTRIBUTE_(release_capability(__VA_ARGS__))
#define RELEASE_SHARED(...) \
THREAD_ANNOTATION_ATTRIBUTE__(release_shared_capability(__VA_ARGS__))
THREAD_ANNOTATION_ATTRIBUTE_(release_shared_capability(__VA_ARGS__))
#define TRY_ACQUIRE(...) \
THREAD_ANNOTATION_ATTRIBUTE__(try_acquire_capability(__VA_ARGS__))
THREAD_ANNOTATION_ATTRIBUTE_(try_acquire_capability(__VA_ARGS__))
#define TRY_ACQUIRE_SHARED(...) \
THREAD_ANNOTATION_ATTRIBUTE__(try_acquire_shared_capability(__VA_ARGS__))
THREAD_ANNOTATION_ATTRIBUTE_(try_acquire_shared_capability(__VA_ARGS__))
#define EXCLUDES(...) THREAD_ANNOTATION_ATTRIBUTE__(locks_excluded(__VA_ARGS__))
#define EXCLUDES(...) THREAD_ANNOTATION_ATTRIBUTE_(locks_excluded(__VA_ARGS__))
#define ASSERT_CAPABILITY(x) THREAD_ANNOTATION_ATTRIBUTE__(assert_capability(x))
#define ASSERT_CAPABILITY(x) THREAD_ANNOTATION_ATTRIBUTE_(assert_capability(x))
#define ASSERT_SHARED_CAPABILITY(x) \
THREAD_ANNOTATION_ATTRIBUTE__(assert_shared_capability(x))
THREAD_ANNOTATION_ATTRIBUTE_(assert_shared_capability(x))
#define RETURN_CAPABILITY(x) THREAD_ANNOTATION_ATTRIBUTE__(lock_returned(x))
#define RETURN_CAPABILITY(x) THREAD_ANNOTATION_ATTRIBUTE_(lock_returned(x))
#define NO_THREAD_SAFETY_ANALYSIS \
THREAD_ANNOTATION_ATTRIBUTE__(no_thread_safety_analysis)
THREAD_ANNOTATION_ATTRIBUTE_(no_thread_safety_analysis)
namespace benchmark {
@ -130,7 +130,7 @@ class Barrier {
// entered the barrier. Returns true iff this is the last thread to
// enter the barrier.
bool createBarrier(MutexLock& ml) REQUIRES(lock_) {
CHECK_LT(entered_, running_threads_);
BM_CHECK_LT(entered_, running_threads_);
entered_++;
if (entered_ < running_threads_) {
// Wait for all threads to enter
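
To illustrate the annotation macros defined above (only the internal THREAD_ANNOTATION_ATTRIBUTE_ helper is renamed in this hunk), here is a hedged sketch of how they are typically applied; Mutex and MutexLock are assumed to be the wrappers declared elsewhere in this header:

class Counter {
 public:
  void Increment() EXCLUDES(mutex_) {
    MutexLock lock(mutex_);  // scoped acquisition of mutex_
    IncrementLocked();
  }

 private:
  void IncrementLocked() REQUIRES(mutex_) { ++value_; }

  Mutex mutex_;
  int value_ GUARDED_BY(mutex_) = 0;
};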

src/perf_counters.cc (new file, 282 lines)

@ -0,0 +1,282 @@
// Copyright 2021 Google Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "perf_counters.h"
#include <cstring>
#include <memory>
#include <vector>
#if defined HAVE_LIBPFM
#include "perfmon/pfmlib.h"
#include "perfmon/pfmlib_perf_event.h"
#endif
namespace benchmark {
namespace internal {
constexpr size_t PerfCounterValues::kMaxCounters;
#if defined HAVE_LIBPFM
size_t PerfCounterValues::Read(const std::vector<int>& leaders) {
// Create a pointer for multiple reads
const size_t bufsize = values_.size() * sizeof(values_[0]);
char* ptr = reinterpret_cast<char*>(values_.data());
size_t size = bufsize;
for (int lead : leaders) {
auto read_bytes = ::read(lead, ptr, size);
if (read_bytes >= ssize_t(sizeof(uint64_t))) {
// Actual data bytes are all bytes minus initial padding
std::size_t data_bytes = read_bytes - sizeof(uint64_t);
// This should be very cheap since it's in hot cache
std::memmove(ptr, ptr + sizeof(uint64_t), data_bytes);
// Increment our counters
ptr += data_bytes;
size -= data_bytes;
} else {
int err = errno;
GetErrorLogInstance() << "Error reading lead " << lead << " errno:" << err
<< " " << ::strerror(err) << "\n";
return 0;
}
}
return (bufsize - size) / sizeof(uint64_t);
}
const bool PerfCounters::kSupported = true;
// Initializes libpfm only on the first call. Returns whether that single
// initialization was successful.
bool PerfCounters::Initialize() {
// Function-scope static gets initialized only once on first call.
static const bool success = []() {
return pfm_initialize() == PFM_SUCCESS;
}();
return success;
}
bool PerfCounters::IsCounterSupported(const std::string& name) {
Initialize();
perf_event_attr_t attr;
std::memset(&attr, 0, sizeof(attr));
pfm_perf_encode_arg_t arg;
std::memset(&arg, 0, sizeof(arg));
arg.attr = &attr;
const int mode = PFM_PLM3; // user mode only
int ret = pfm_get_os_event_encoding(name.c_str(), mode, PFM_OS_PERF_EVENT_EXT,
&arg);
return (ret == PFM_SUCCESS);
}
PerfCounters PerfCounters::Create(
const std::vector<std::string>& counter_names) {
if (!counter_names.empty()) {
Initialize();
}
// Valid counters will populate these arrays but we start empty
std::vector<std::string> valid_names;
std::vector<int> counter_ids;
std::vector<int> leader_ids;
// Reserve space for the maximum possible
valid_names.reserve(counter_names.size());
counter_ids.reserve(counter_names.size());
const int kCounterMode = PFM_PLM3; // user mode only
// Group leaders will be assigned on demand. The idea is that once we cannot
// create a counter descriptor, the reason is that this group has maxed out,
// so we set group_id back to -1 and retry, giving the algorithm a
// chance to create a new group leader to hold the next set of counters.
int group_id = -1;
// Loop through all performance counters
for (size_t i = 0; i < counter_names.size(); ++i) {
// we are about to push into the valid names vector
// check if we did not reach the maximum
if (valid_names.size() == PerfCounterValues::kMaxCounters) {
// Log a message if we maxed out and stop adding
GetErrorLogInstance()
<< counter_names.size() << " counters were requested. The maximum is "
<< PerfCounterValues::kMaxCounters << " and " << valid_names.size()
<< " were already added. All remaining counters will be ignored\n";
// stop the loop and return what we have already
break;
}
// Check if this name is empty
const auto& name = counter_names[i];
if (name.empty()) {
GetErrorLogInstance()
<< "A performance counter name was the empty string\n";
continue;
}
// Here first means first in group, ie the group leader
const bool is_first = (group_id < 0);
// This struct will be populated by libpfm from the counter string
// and then fed into the syscall perf_event_open
struct perf_event_attr attr {};
attr.size = sizeof(attr);
// This is the input struct to libpfm.
pfm_perf_encode_arg_t arg{};
arg.attr = &attr;
const int pfm_get = pfm_get_os_event_encoding(name.c_str(), kCounterMode,
PFM_OS_PERF_EVENT, &arg);
if (pfm_get != PFM_SUCCESS) {
GetErrorLogInstance()
<< "Unknown performance counter name: " << name << "\n";
continue;
}
// We then proceed to populate the remaining fields in our attribute struct
// Note: the man page for perf_event_open suggests inherit = true and
// read_format = PERF_FORMAT_GROUP don't work together, but that's not the
// case.
attr.disabled = is_first;
attr.inherit = true;
attr.pinned = is_first;
attr.exclude_kernel = true;
attr.exclude_user = false;
attr.exclude_hv = true;
// Read all counters in a group in one read.
attr.read_format = PERF_FORMAT_GROUP;
int id = -1;
while (id < 0) {
static constexpr size_t kNrOfSyscallRetries = 5;
// Retry syscall as it was interrupted often (b/64774091).
for (size_t num_retries = 0; num_retries < kNrOfSyscallRetries;
++num_retries) {
id = perf_event_open(&attr, 0, -1, group_id, 0);
if (id >= 0 || errno != EINTR) {
break;
}
}
if (id < 0) {
// If the file descriptor is negative we might have reached a limit
// in the current group. Set the group_id to -1 and retry
if (group_id >= 0) {
// Create a new group
group_id = -1;
} else {
// At this point we have already retried to set a new group id and
// failed. We then give up.
break;
}
}
}
// We failed to get a new file descriptor. We might have reached a hard
// hardware limit that cannot be resolved even with group multiplexing
if (id < 0) {
GetErrorLogInstance() << "***WARNING** Failed to get a file descriptor "
"for performance counter "
<< name << ". Ignoring\n";
// We give up on this counter but try to keep going
// as the others would be fine
continue;
}
if (group_id < 0) {
// This is a leader, store and assign it to the current file descriptor
leader_ids.push_back(id);
group_id = id;
}
// This is a valid counter, add it to our descriptor's list
counter_ids.push_back(id);
valid_names.push_back(name);
}
// Loop through all group leaders activating them
// There is another option of starting ALL counters in a process but
// that would be too far-reaching an intrusion. If the user is using PMCs
// by themselves then this would have a side effect on them. It is
// friendlier to loop through all groups individually.
for (int lead : leader_ids) {
if (ioctl(lead, PERF_EVENT_IOC_ENABLE) != 0) {
// This should never happen but if it does, we give up on the
// entire batch as recovery would be a mess.
GetErrorLogInstance() << "***WARNING*** Failed to start counters. "
"Claring out all counters.\n";
// Close all peformance counters
for (int id : counter_ids) {
::close(id);
}
// Return an empty object so our internal state is still good and
// the process can continue normally without impact
return NoCounters();
}
}
return PerfCounters(std::move(valid_names), std::move(counter_ids),
std::move(leader_ids));
}
void PerfCounters::CloseCounters() const {
if (counter_ids_.empty()) {
return;
}
for (int lead : leader_ids_) {
ioctl(lead, PERF_EVENT_IOC_DISABLE);
}
for (int fd : counter_ids_) {
close(fd);
}
}
#else // defined HAVE_LIBPFM
size_t PerfCounterValues::Read(const std::vector<int>&) { return 0; }
const bool PerfCounters::kSupported = false;
bool PerfCounters::Initialize() { return false; }
bool PerfCounters::IsCounterSupported(const std::string&) { return false; }
PerfCounters PerfCounters::Create(
const std::vector<std::string>& counter_names) {
if (!counter_names.empty()) {
GetErrorLogInstance() << "Performance counters not supported.";
}
return NoCounters();
}
void PerfCounters::CloseCounters() const {}
#endif // defined HAVE_LIBPFM
PerfCountersMeasurement::PerfCountersMeasurement(
const std::vector<std::string>& counter_names)
: start_values_(counter_names.size()), end_values_(counter_names.size()) {
counters_ = PerfCounters::Create(counter_names);
}
PerfCounters& PerfCounters::operator=(PerfCounters&& other) noexcept {
if (this != &other) {
CloseCounters();
counter_ids_ = std::move(other.counter_ids_);
leader_ids_ = std::move(other.leader_ids_);
counter_names_ = std::move(other.counter_names_);
}
return *this;
}
} // namespace internal
} // namespace benchmark
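
The compaction in PerfCounterValues::Read() above assumes the layout the kernel returns for a group read. A sketch of that layout, per perf_event_open(2), assuming read_format is PERF_FORMAT_GROUP with no other format bits set:

#include <cstdint>

// One ::read() on a group leader yields `nr` followed by the raw values.
// The leading `nr` word is the per-group "initial padding" that Read()
// discards before packing all values into one contiguous array.
struct GroupReadFormat {
  uint64_t nr;         // number of counters in this group
  uint64_t values[8];  // conceptually `nr` entries follow back-to-back
};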

src/perf_counters.h (new file, 200 lines)

@ -0,0 +1,200 @@
// Copyright 2021 Google Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef BENCHMARK_PERF_COUNTERS_H
#define BENCHMARK_PERF_COUNTERS_H
#include <array>
#include <cstdint>
#include <cstring>
#include <memory>
#include <vector>
#include "benchmark/benchmark.h"
#include "check.h"
#include "log.h"
#include "mutex.h"
#ifndef BENCHMARK_OS_WINDOWS
#include <unistd.h>
#endif
#if defined(_MSC_VER)
#pragma warning(push)
// C4251: <symbol> needs to have dll-interface to be used by clients of class
#pragma warning(disable : 4251)
#endif
namespace benchmark {
namespace internal {
// Typically, we can only read a small number of counters. There is also a
// padding preceding counter values, when reading multiple counters with one
// syscall (which is desirable). PerfCounterValues abstracts these details.
// The implementation ensures the storage is inlined, and allows 0-based
// indexing into the counter values.
// The object is used in conjunction with a PerfCounters object, by passing it
// to Snapshot(). The Read() method relocates individual reads, discarding
// the initial padding from each group leader in the values buffer such that
// all user accesses through the [] operator are correct.
class BENCHMARK_EXPORT PerfCounterValues {
public:
explicit PerfCounterValues(size_t nr_counters) : nr_counters_(nr_counters) {
BM_CHECK_LE(nr_counters_, kMaxCounters);
}
// We are reading correctly now so the values don't need to skip padding
uint64_t operator[](size_t pos) const { return values_[pos]; }
// The maximum was increased to 32 only because the buffer
// is std::array<>-backed
static constexpr size_t kMaxCounters = 32;
private:
friend class PerfCounters;
// Get the byte buffer in which perf counters can be captured.
// This is used by PerfCounters::Read
std::pair<char*, size_t> get_data_buffer() {
return {reinterpret_cast<char*>(values_.data()),
sizeof(uint64_t) * (kPadding + nr_counters_)};
}
// This read is complex, and since the goal of this class is to
// abstract away the intricacies of the reading process, this is
// a better place for it
size_t Read(const std::vector<int>& leaders);
// The padding is 2 because of the reading algorithm (the initial padding
// plus the padding of the current read)
static constexpr size_t kPadding = 2;
std::array<uint64_t, kPadding + kMaxCounters> values_;
const size_t nr_counters_;
};
// Collect PMU counters. The object, once constructed, is ready to be used by
// calling Snapshot(). PMU counter collection is enabled from the time Create()
// is called to obtain the object until the object's destructor is called.
class BENCHMARK_EXPORT PerfCounters final {
public:
// True iff this platform supports performance counters.
static const bool kSupported;
// Returns an empty object
static PerfCounters NoCounters() { return PerfCounters(); }
~PerfCounters() { CloseCounters(); }
PerfCounters() = default;
PerfCounters(PerfCounters&&) = default;
PerfCounters(const PerfCounters&) = delete;
PerfCounters& operator=(PerfCounters&&) noexcept;
PerfCounters& operator=(const PerfCounters&) = delete;
// Platform-specific implementations may choose to do some library
// initialization here.
static bool Initialize();
// Check if the given counter is supported, if the app wants to
// check before passing it to Create().
static bool IsCounterSupported(const std::string& name);
// Return a PerfCounters object ready to read the counters with the names
// specified. The values are user-mode only. The counter name format is
// implementation and OS specific.
// In case of failure, this method will in the worst case return an
// empty object whose state will still be valid.
static PerfCounters Create(const std::vector<std::string>& counter_names);
// Take a snapshot of the current value of the counters into the provided
// valid PerfCounterValues storage. The values are populated such that:
// names()[i]'s value is (*values)[i]
BENCHMARK_ALWAYS_INLINE bool Snapshot(PerfCounterValues* values) const {
#ifndef BENCHMARK_OS_WINDOWS
assert(values != nullptr);
return values->Read(leader_ids_) == counter_ids_.size();
#else
(void)values;
return false;
#endif
}
const std::vector<std::string>& names() const { return counter_names_; }
size_t num_counters() const { return counter_names_.size(); }
private:
PerfCounters(const std::vector<std::string>& counter_names,
std::vector<int>&& counter_ids, std::vector<int>&& leader_ids)
: counter_ids_(std::move(counter_ids)),
leader_ids_(std::move(leader_ids)),
counter_names_(counter_names) {}
void CloseCounters() const;
std::vector<int> counter_ids_;
std::vector<int> leader_ids_;
std::vector<std::string> counter_names_;
};
// Typical usage of the above primitives.
class BENCHMARK_EXPORT PerfCountersMeasurement final {
public:
PerfCountersMeasurement(const std::vector<std::string>& counter_names);
size_t num_counters() const { return counters_.num_counters(); }
std::vector<std::string> names() const { return counters_.names(); }
BENCHMARK_ALWAYS_INLINE bool Start() {
if (num_counters() == 0) return true;
// Tell the compiler to not move instructions above/below where we take
// the snapshot.
ClobberMemory();
valid_read_ &= counters_.Snapshot(&start_values_);
ClobberMemory();
return valid_read_;
}
BENCHMARK_ALWAYS_INLINE bool Stop(
std::vector<std::pair<std::string, double>>& measurements) {
if (num_counters() == 0) return true;
// Tell the compiler to not move instructions above/below where we take
// the snapshot.
ClobberMemory();
valid_read_ &= counters_.Snapshot(&end_values_);
ClobberMemory();
for (size_t i = 0; i < counters_.names().size(); ++i) {
double measurement = static_cast<double>(end_values_[i]) -
static_cast<double>(start_values_[i]);
measurements.push_back({counters_.names()[i], measurement});
}
return valid_read_;
}
private:
PerfCounters counters_;
bool valid_read_ = true;
PerfCounterValues start_values_;
PerfCounterValues end_values_;
};
} // namespace internal
} // namespace benchmark
#if defined(_MSC_VER)
#pragma warning(pop)
#endif
#endif // BENCHMARK_PERF_COUNTERS_H
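
Putting the declarations above together, a hedged usage sketch of PerfCountersMeasurement driven directly; the counter names are platform- and libpfm-specific and are used here purely as examples:

#include <string>
#include <utility>
#include <vector>

#include "perf_counters.h"  // the internal header above

void MeasureRegion() {
  benchmark::internal::PerfCountersMeasurement pcm(
      {"CYCLES", "INSTRUCTIONS"});  // example names, may not exist everywhere

  std::vector<std::pair<std::string, double>> deltas;
  if (pcm.Start()) {
    // ... code under measurement ...
    if (pcm.Stop(deltas)) {
      // deltas[i] is {counter name, end value - start value}.
    }
  }
}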


@ -33,7 +33,7 @@
// Prefer C regex libraries when compiling w/o exceptions so that we can
// correctly report errors.
#if defined(BENCHMARK_HAS_NO_EXCEPTIONS) && \
defined(BENCHMARK_HAVE_STD_REGEX) && \
defined(HAVE_STD_REGEX) && \
(defined(HAVE_GNU_POSIX_REGEX) || defined(HAVE_POSIX_REGEX))
#undef HAVE_STD_REGEX
#endif
@ -126,7 +126,7 @@ inline bool Regex::Init(const std::string& spec, std::string* error) {
// regerror returns the number of bytes necessary to null terminate
// the string, so we drop that terminator byte when assigning to error.
CHECK_NE(needed, 0);
BM_CHECK_NE(needed, 0);
error->assign(errbuf, needed - 1);
delete[] errbuf;


@ -12,17 +12,17 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "benchmark/benchmark.h"
#include "timers.h"
#include <cstdlib>
#include <iostream>
#include <map>
#include <string>
#include <tuple>
#include <vector>
#include "benchmark/benchmark.h"
#include "check.h"
#include "string_util.h"
#include "timers.h"
namespace benchmark {
@ -33,10 +33,14 @@ BenchmarkReporter::~BenchmarkReporter() {}
void BenchmarkReporter::PrintBasicContext(std::ostream *out,
Context const &context) {
CHECK(out) << "cannot be null";
BM_CHECK(out) << "cannot be null";
auto &Out = *out;
#ifndef BENCHMARK_OS_QURT
// Date/time information is not available on QuRT.
// Attempting to get it via this call causes the binary to crash.
Out << LocalDateTimeString() << "\n";
#endif
if (context.executable_name)
Out << "Running " << context.executable_name << "\n";
@ -64,6 +68,15 @@ void BenchmarkReporter::PrintBasicContext(std::ostream *out,
Out << "\n";
}
std::map<std::string, std::string> *global_context =
internal::GetGlobalContext();
if (global_context != nullptr) {
for (const auto &kv : *global_context) {
Out << kv.first << ": " << kv.second << "\n";
}
}
if (CPUInfo::Scaling::ENABLED == info.scaling) {
Out << "***WARNING*** CPU scaling is enabled, the benchmark "
"real time measurements may be noisy and will incur extra "


@ -1,51 +0,0 @@
// Copyright 2015 Google Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "sleep.h"
#include <cerrno>
#include <cstdlib>
#include <ctime>
#include "internal_macros.h"
#ifdef BENCHMARK_OS_WINDOWS
#include <windows.h>
#endif
namespace benchmark {
#ifdef BENCHMARK_OS_WINDOWS
// Windows' Sleep takes a milliseconds argument.
void SleepForMilliseconds(int milliseconds) { Sleep(milliseconds); }
void SleepForSeconds(double seconds) {
SleepForMilliseconds(static_cast<int>(kNumMillisPerSecond * seconds));
}
#else // BENCHMARK_OS_WINDOWS
void SleepForMicroseconds(int microseconds) {
struct timespec sleep_time;
sleep_time.tv_sec = microseconds / kNumMicrosPerSecond;
sleep_time.tv_nsec = (microseconds % kNumMicrosPerSecond) * kNumNanosPerMicro;
while (nanosleep(&sleep_time, &sleep_time) != 0 && errno == EINTR)
; // Ignore signals and wait for the full interval to elapse.
}
void SleepForMilliseconds(int milliseconds) {
SleepForMicroseconds(milliseconds * kNumMicrosPerMilli);
}
void SleepForSeconds(double seconds) {
SleepForMicroseconds(static_cast<int>(seconds * kNumMicrosPerSecond));
}
#endif // BENCHMARK_OS_WINDOWS
} // end namespace benchmark


@ -1,15 +0,0 @@
#ifndef BENCHMARK_SLEEP_H_
#define BENCHMARK_SLEEP_H_
namespace benchmark {
const int kNumMillisPerSecond = 1000;
const int kNumMicrosPerMilli = 1000;
const int kNumMicrosPerSecond = kNumMillisPerSecond * 1000;
const int kNumNanosPerMicro = 1000;
const int kNumNanosPerSecond = kNumNanosPerMicro * kNumMicrosPerSecond;
void SleepForMilliseconds(int milliseconds);
void SleepForSeconds(double seconds);
} // end namespace benchmark
#endif // BENCHMARK_SLEEP_H_

Some files were not shown because too many files have changed in this diff.