Mirror of https://github.com/google/benchmark.git (synced 2025-04-21 17:00:28 +08:00)
Compare commits

298 commits (48f5cc21ba … 9714eb8d11)
File tree:

.clang-tidy
.clang-tidy.ignore
.github/
.gitignore
.pre-commit-config.yaml
.travis.yml
.ycm_extra_conf.py
AUTHORS
BUILD.bazel
CMakeLists.txt
CONTRIBUTORS
MODULE.bazel
README.md
WORKSPACE
WORKSPACE.bzlmod
bazel/
bindings/python/
cmake/
    CXXFeatureCheck.cmake
    Config.cmake.in
    GetGitVersion.cmake
    GoogleTest.cmake
    GoogleTest.cmake.in
    Modules/
    benchmark.pc.in
    benchmark_main.pc.in
    pthread_affinity.cpp
docs/
    AssemblyTests.md
    _config.yml
    assets/images/
    dependencies.md
    index.md
    perf_counters.md
    python_bindings.md
    reducing_variance.md
    releasing.md
    tools.md
    user_guide.md
include/benchmark/
pyproject.toml
requirements.txt
setup.py
src/
    CMakeLists.txt
    benchmark.cc
    benchmark_api_internal.cc
    benchmark_api_internal.h
    benchmark_main.cc
    benchmark_name.cc
    benchmark_register.cc
    benchmark_register.h
    benchmark_runner.cc
    benchmark_runner.h
    check.cc
    check.h
    colorprint.cc
    commandlineflags.cc
    commandlineflags.h
    complexity.cc
    complexity.h
    console_reporter.cc
    counter.cc
    csv_reporter.cc
    cycleclock.h
    internal_macros.h
    json_reporter.cc
    log.h
    perf_counters.cc
    perf_counters.h
    re.h
    reporter.cc
    sleep.cc
    sleep.h
    statistics.cc
    statistics.h
    string_util.cc
    string_util.h
.clang-tidy (40 changes)

@@ -1,7 +1,37 @@
 ---
-Checks: 'clang-analyzer-*,readability-redundant-*,performance-*'
-WarningsAsErrors: 'clang-analyzer-*,readability-redundant-*,performance-*'
-HeaderFilterRegex: '.*'
-AnalyzeTemporaryDtors: false
+Checks: >
+  abseil-*,
+  bugprone-*,
+  clang-analyzer-*,
+  cppcoreguidelines-*,
+  google-*,
+  misc-*,
+  performance-*,
+  readability-*,
+  -clang-analyzer-deadcode*,
+  -clang-analyzer-optin*,
+  -readability-identifier-length
+WarningsAsErrors: ''
+HeaderFilterRegex: ''
+FormatStyle: none
+User: user
+CheckOptions:
+  llvm-else-after-return.WarnOnConditionVariables: 'false'
+  modernize-loop-convert.MinConfidence: reasonable
+  modernize-replace-auto-ptr.IncludeStyle: llvm
+  cert-str34-c.DiagnoseSignedUnsignedCharComparisons: 'false'
+  google-readability-namespace-comments.ShortNamespaceLines: '10'
+  cert-err33-c.CheckedFunctions: '::aligned_alloc;::asctime_s;::at_quick_exit;::atexit;::bsearch;::bsearch_s;::btowc;::c16rtomb;::c32rtomb;::calloc;::clock;::cnd_broadcast;::cnd_init;::cnd_signal;::cnd_timedwait;::cnd_wait;::ctime_s;::fclose;::fflush;::fgetc;::fgetpos;::fgets;::fgetwc;::fopen;::fopen_s;::fprintf;::fprintf_s;::fputc;::fputs;::fputwc;::fputws;::fread;::freopen;::freopen_s;::fscanf;::fscanf_s;::fseek;::fsetpos;::ftell;::fwprintf;::fwprintf_s;::fwrite;::fwscanf;::fwscanf_s;::getc;::getchar;::getenv;::getenv_s;::gets_s;::getwc;::getwchar;::gmtime;::gmtime_s;::localtime;::localtime_s;::malloc;::mbrtoc16;::mbrtoc32;::mbsrtowcs;::mbsrtowcs_s;::mbstowcs;::mbstowcs_s;::memchr;::mktime;::mtx_init;::mtx_lock;::mtx_timedlock;::mtx_trylock;::mtx_unlock;::printf_s;::putc;::putwc;::raise;::realloc;::remove;::rename;::scanf;::scanf_s;::setlocale;::setvbuf;::signal;::snprintf;::snprintf_s;::sprintf;::sprintf_s;::sscanf;::sscanf_s;::strchr;::strerror_s;::strftime;::strpbrk;::strrchr;::strstr;::strtod;::strtof;::strtoimax;::strtok;::strtok_s;::strtol;::strtold;::strtoll;::strtoul;::strtoull;::strtoumax;::strxfrm;::swprintf;::swprintf_s;::swscanf;::swscanf_s;::thrd_create;::thrd_detach;::thrd_join;::thrd_sleep;::time;::timespec_get;::tmpfile;::tmpfile_s;::tmpnam;::tmpnam_s;::tss_create;::tss_get;::tss_set;::ungetc;::ungetwc;::vfprintf;::vfprintf_s;::vfscanf;::vfscanf_s;::vfwprintf;::vfwprintf_s;::vfwscanf;::vfwscanf_s;::vprintf_s;::vscanf;::vscanf_s;::vsnprintf;::vsnprintf_s;::vsprintf;::vsprintf_s;::vsscanf;::vsscanf_s;::vswprintf;::vswprintf_s;::vswscanf;::vswscanf_s;::vwprintf_s;::vwscanf;::vwscanf_s;::wcrtomb;::wcschr;::wcsftime;::wcspbrk;::wcsrchr;::wcsrtombs;::wcsrtombs_s;::wcsstr;::wcstod;::wcstof;::wcstoimax;::wcstok;::wcstok_s;::wcstol;::wcstold;::wcstoll;::wcstombs;::wcstombs_s;::wcstoul;::wcstoull;::wcstoumax;::wcsxfrm;::wctob;::wctrans;::wctype;::wmemchr;::wprintf_s;::wscanf;::wscanf_s;'
+  cert-oop54-cpp.WarnOnlyIfThisHasSuspiciousField: 'false'
+  cert-dcl16-c.NewSuffixes: 'L;LL;LU;LLU'
+  google-readability-braces-around-statements.ShortStatementLines: '1'
+  cppcoreguidelines-non-private-member-variables-in-classes.IgnoreClassesWithAllMemberVariablesBeingPublic: 'true'
+  google-readability-namespace-comments.SpacesBeforeComments: '2'
+  modernize-loop-convert.MaxCopySize: '16'
+  modernize-pass-by-value.IncludeStyle: llvm
+  modernize-use-nullptr.NullMacros: 'NULL'
+  llvm-qualified-auto.AddConstToQualified: 'false'
+  modernize-loop-convert.NamingStyle: CamelCase
+  llvm-else-after-return.WarnOnUnfixable: 'false'
+  google-readability-function-size.StatementThreshold: '800'
 ...
.clang-tidy.ignore (new file, 1 line)

@@ -0,0 +1 @@
+.*third_party/.*
.github/.libcxx-setup.sh (deleted, 24 lines)

@@ -1,24 +0,0 @@
-#!/usr/bin/env bash
-
-# Checkout LLVM sources
-git clone --depth=1 https://github.com/llvm/llvm-project.git llvm-project
-
-# Setup libc++ options
-if [ -z "$BUILD_32_BITS" ]; then
-  export BUILD_32_BITS=OFF && echo disabling 32 bit build
-fi
-
-# Build and install libc++ (Use unstable ABI for better sanitizer coverage)
-cd ./llvm-project
-cmake -DCMAKE_C_COMPILER=${CC} \
-      -DCMAKE_CXX_COMPILER=${CXX} \
-      -DCMAKE_BUILD_TYPE=RelWithDebInfo \
-      -DCMAKE_INSTALL_PREFIX=/usr \
-      -DLIBCXX_ABI_UNSTABLE=OFF \
-      -DLLVM_USE_SANITIZER=${LIBCXX_SANITIZER} \
-      -DLLVM_BUILD_32_BITS=${BUILD_32_BITS} \
-      -DLLVM_ENABLE_RUNTIMES='libcxx;libcxxabi' \
-      -S llvm -B llvm-build -G "Unix Makefiles"
-make -C llvm-build -j3 cxx cxxabi
-sudo make -C llvm-build install-cxx install-cxxabi
-cd ..
.github/install_bazel.sh (9 changes)

@@ -3,11 +3,10 @@ if ! bazel version; then
   if [ "$arch" == "aarch64" ]; then
     arch="arm64"
   fi
-  echo "Installing wget and downloading $arch Bazel binary from GitHub releases."
-  yum install -y wget
-  wget "https://github.com/bazelbuild/bazel/releases/download/5.2.0/bazel-5.2.0-linux-$arch" -O /usr/local/bin/bazel
-  chmod +x /usr/local/bin/bazel
+  echo "Downloading $arch Bazel binary from GitHub releases."
+  curl -L -o $HOME/bin/bazel --create-dirs "https://github.com/bazelbuild/bazel/releases/download/7.1.1/bazel-7.1.1-linux-$arch"
+  chmod +x $HOME/bin/bazel
 else
-  # bazel is installed for the correct architecture
+  # Bazel is installed for the correct architecture
   exit 0
 fi
.github/libcxx-setup.sh (new executable file, 35 lines)

@@ -0,0 +1,35 @@
+#!/usr/bin/env bash
+
+set -e
+
+# Checkout LLVM sources
+git clone --filter=blob:none --depth=1 --branch llvmorg-19.1.6 --no-checkout https://github.com/llvm/llvm-project.git llvm-project
+cd llvm-project
+git sparse-checkout set --cone
+git checkout llvmorg-19.1.6
+git sparse-checkout set cmake llvm/cmake runtimes libcxx libcxxabi
+cd ..
+
+## Setup libc++ options
+if [ -z "$BUILD_32_BITS" ]; then
+  export BUILD_32_BITS=OFF && echo disabling 32 bit build
+fi
+
+## Build and install libc++ (Use unstable ABI for better sanitizer coverage)
+mkdir llvm-build && cd llvm-build
+cmake -GNinja \
+      -DCMAKE_C_COMPILER=${CC} \
+      -DCMAKE_CXX_COMPILER=${CXX} \
+      -DCMAKE_BUILD_TYPE=RelWithDebInfo \
+      -DCMAKE_INSTALL_PREFIX=/usr \
+      -DLIBCXX_ABI_UNSTABLE=OFF \
+      -DLLVM_USE_SANITIZER=${LIBCXX_SANITIZER} \
+      -DLLVM_BUILD_32_BITS=${BUILD_32_BITS} \
+      -DLIBCXXABI_USE_LLVM_UNWINDER=OFF \
+      -DLLVM_INCLUDE_TESTS=OFF \
+      -DLIBCXX_INCLUDE_TESTS=OFF \
+      -DLIBCXX_INCLUDE_BENCHMARKS=OFF \
+      -DLLVM_ENABLE_RUNTIMES='libcxx;libcxxabi' \
+      ../llvm-project/runtimes/
+cmake --build . -- cxx cxxabi
+cd ..
.github/workflows/bazel.yml (12 changes)

@@ -4,20 +4,22 @@ on:
   push: {}
   pull_request: {}
 
+env:
+  CMAKE_GENERATOR: Ninja
+
 jobs:
-  job:
+  build_and_test_default:
     name: bazel.${{ matrix.os }}
     runs-on: ${{ matrix.os }}
     strategy:
       fail-fast: false
       matrix:
-        os: [ubuntu-latest, macos-latest, windows-2022]
+        os: [ubuntu-latest, macos-latest, windows-latest]
     steps:
-      - uses: actions/checkout@v1
+      - uses: actions/checkout@v4
 
       - name: mount bazel cache
-        uses: actions/cache@v2.0.0
+        uses: actions/cache@v4
         env:
           cache-name: bazel-cache
         with:
.github/workflows/build-and-test-min-cmake.yml (new file, 49 lines)

@@ -0,0 +1,49 @@
+name: build-and-test-min-cmake
+
+on:
+  push:
+    branches: [ main ]
+  pull_request:
+    branches: [ main ]
+
+env:
+  CMAKE_GENERATOR: Ninja
+
+jobs:
+  job:
+    name: ${{ matrix.os }}.min-cmake
+    runs-on: ${{ matrix.os }}
+    strategy:
+      fail-fast: false
+      matrix:
+        os: [ubuntu-latest, macos-latest]
+
+    steps:
+      - uses: actions/checkout@v4
+
+      - uses: lukka/get-cmake@latest
+        with:
+          cmakeVersion: 3.13.0
+
+      - name: create build environment
+        run: cmake -E make_directory ${{ runner.workspace }}/_build
+
+      - name: setup cmake initial cache
+        run: touch compiler-cache.cmake
+
+      - name: configure cmake
+        env:
+          CXX: ${{ matrix.compiler }}
+        shell: bash
+        working-directory: ${{ runner.workspace }}/_build
+        run: >
+          cmake -C ${{ github.workspace }}/compiler-cache.cmake
+          $GITHUB_WORKSPACE
+          -DBENCHMARK_DOWNLOAD_DEPENDENCIES=ON
+          -DCMAKE_CXX_VISIBILITY_PRESET=hidden
+          -DCMAKE_VISIBILITY_INLINES_HIDDEN=ON
+
+      - name: build
+        shell: bash
+        working-directory: ${{ runner.workspace }}/_build
+        run: cmake --build .

(unnamed .github/workflows file; the header row is missing from the capture, the hunks below change the job that installs libpfm)

@@ -6,6 +6,9 @@ on:
   pull_request:
     branches: [ main ]
 
+env:
+  CMAKE_GENERATOR: Ninja
+
 jobs:
   job:
     # TODO(dominic): Extend this to include compiler and set through env: CC/CXX.
@@ -14,20 +17,15 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        # ubuntu-18.04 is deprecated but included for best-effort
-        os: [ubuntu-22.04, ubuntu-20.04, ubuntu-18.04]
+        os: [ubuntu-latest]
        build_type: ['Release', 'Debug']
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v4
 
       - name: install libpfm
-        run: sudo apt -y install libpfm4-dev
-
-      - name: setup cmake
-        if: matrix.os == 'ubuntu-18.04'
-        uses: jwlawson/actions-setup-cmake@v1.9
-        with:
-          cmake-version: '3.16.3'
+        run: |
+          sudo apt update
+          sudo apt -y install libpfm4-dev
 
       - name: create build environment
         run: cmake -E make_directory ${{ runner.workspace }}/_build
.github/workflows/build-and-test.yml (127 changes)

@@ -6,6 +6,9 @@ on:
   pull_request:
     branches: [ main ]
 
+env:
+  CMAKE_GENERATOR: Ninja
+
 jobs:
   # TODO: add 32-bit builds (g++ and clang++) for ubuntu
   #   (requires g++-multilib and libc6:i386)
@@ -17,45 +20,30 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        # ubuntu-18.04 is deprecated but included for best-effort support
-        os: [ubuntu-22.04, ubuntu-20.04, ubuntu-18.04, macos-latest]
+        os: [ubuntu-24.04, ubuntu-22.04, ubuntu-24.04-arm, macos-latest]
         build_type: ['Release', 'Debug']
-        compiler: [g++, clang++]
+        compiler: ['g++', 'clang++']
         lib: ['shared', 'static']
 
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v4
 
-      - name: setup cmake
-        if: matrix.os == 'ubuntu-18.04'
-        uses: jwlawson/actions-setup-cmake@v1.9
-        with:
-          cmake-version: '3.16.3'
-
-      - name: create build environment
-        run: cmake -E make_directory ${{ runner.workspace }}/_build
-
-      - name: setup cmake initial cache
-        run: touch compiler-cache.cmake
-
-      - name: configure cmake
-        env:
-          CXX: ${{ matrix.compiler }}
-        shell: bash
-        working-directory: ${{ runner.workspace }}/_build
-        run: >
-          cmake -C ${{ github.workspace }}/compiler-cache.cmake
-          $GITHUB_WORKSPACE
-          -DBENCHMARK_DOWNLOAD_DEPENDENCIES=ON
-          -DBUILD_SHARED_LIBS=${{ matrix.lib == 'shared' }}
-          -DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
-          -DCMAKE_CXX_VISIBILITY_PRESET=hidden
-          -DCMAKE_VISIBILITY_INLINES_HIDDEN=ON
+      - name: Install dependencies (macos)
+        if: runner.os == 'macOS'
+        run: brew install ninja
 
       - name: build
-        shell: bash
-        working-directory: ${{ runner.workspace }}/_build
-        run: cmake --build . --config ${{ matrix.build_type }}
+        uses: threeal/cmake-action@v2.1.0
+        with:
+          build-dir: ${{ runner.workspace }}/_build
+          cxx-compiler: ${{ matrix.compiler }}
+          options: |
+            BENCHMARK_DOWNLOAD_DEPENDENCIES=ON
+            BUILD_SHARED_LIBS=${{ matrix.lib == 'shared' }}
+            CMAKE_BUILD_TYPE=${{ matrix.build_type }}
+            CMAKE_CXX_COMPILER=${{ matrix.compiler }}
+            CMAKE_CXX_VISIBILITY_PRESET=hidden
+            CMAKE_VISIBILITY_INLINES_HIDDEN=ON
 
       - name: test
         shell: bash
@@ -74,8 +62,6 @@ jobs:
         msvc:
           - VS-16-2019
           - VS-17-2022
-        arch:
-          - x64
         build_type:
           - Debug
          - Release
@@ -91,26 +77,75 @@ jobs:
             generator: 'Visual Studio 17 2022'
 
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v4
+
+      - uses: lukka/get-cmake@latest
 
       - name: configure cmake
         run: >
-          cmake -S . -B _build/
-          -A ${{ matrix.arch }}
+          cmake -S . -B ${{ runner.workspace }}/_build/
           -G "${{ matrix.generator }}"
           -DBENCHMARK_DOWNLOAD_DEPENDENCIES=ON
           -DBUILD_SHARED_LIBS=${{ matrix.lib == 'shared' }}
 
       - name: build
-        run: cmake --build _build/ --config ${{ matrix.build_type }}
+        run: cmake --build ${{ runner.workspace }}/_build/ --config ${{ matrix.build_type }}
 
       - name: test
-        working-directory: _build
-        run: ctest -C ${{ matrix.build_type }} -VV
+        run: ctest --test-dir ${{ runner.workspace }}/_build/ -C ${{ matrix.build_type }} -VV
+
+  msys2:
+    name: ${{ matrix.os }}.${{ matrix.build_type }}.${{ matrix.lib }}.${{ matrix.msys2.msystem }}
+    runs-on: ${{ matrix.os }}
+    defaults:
+      run:
+        shell: msys2 {0}
+    strategy:
+      fail-fast: false
+      matrix:
+        os: [ windows-latest ]
+        msys2:
+          - { msystem: MINGW64, arch: x86_64, family: GNU, compiler: g++ }
+          - { msystem: CLANG64, arch: x86_64, family: LLVM, compiler: clang++ }
+          - { msystem: UCRT64, arch: x86_64, family: GNU, compiler: g++ }
+        build_type:
+          - Debug
+          - Release
+        lib:
+          - shared
+          - static
+
+    steps:
+      - name: setup msys2
+        uses: msys2/setup-msys2@v2
+        with:
+          cache: false
+          msystem: ${{ matrix.msys2.msystem }}
+          update: true
+          install: >-
+            git
+            base-devel
+          pacboy: >-
+            gcc:p
+            clang:p
+            cmake:p
+            ninja:p
+
+      - uses: actions/checkout@v4
+
+      # NOTE: we can't use cmake actions here as we need to do everything in msys2 shell.
+      - name: configure cmake
+        env:
+          CXX: ${{ matrix.msys2.compiler }}
+        run: >
+          cmake -S . -B _build/
+          -GNinja
+          -DBENCHMARK_DOWNLOAD_DEPENDENCIES=ON
+          -DBUILD_SHARED_LIBS=${{ matrix.lib == 'shared' }}
+
+      - name: build
+        run: cmake --build _build/ --config ${{ matrix.build_type }}
+
+      - name: setup test environment
+        # Make sure gmock and benchmark DLLs can be found
+        run: >
+          echo "$((Get-Item .).FullName)/_build/bin/${{ matrix.build_type }}" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append;
+          echo "$((Get-Item .).FullName)/_build/src/${{ matrix.build_type }}" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append;
+
+      - name: test
+        run: ctest --test-dir _build/ -C ${{ matrix.build_type }} -VV
.github/workflows/clang-format-lint.yml (14 changes)

@@ -3,15 +3,17 @@ on:
   push: {}
   pull_request: {}
 
+env:
+  CMAKE_GENERATOR: Ninja
+
 jobs:
-  build:
+  job:
+    name: check-clang-format
     runs-on: ubuntu-latest
 
     steps:
-    - uses: actions/checkout@v2
-    - uses: DoozyX/clang-format-lint-action@v0.13
+    - uses: actions/checkout@v4
+    - uses: DoozyX/clang-format-lint-action@v0.18.2
       with:
         source: './include/benchmark ./src ./test'
         extensions: 'h,cc'
-        clangFormatVersion: 12
         style: Google
+        clangFormatVersion: 18

(unnamed .github/workflows file; the header row is missing from the capture, the hunks below change the run-clang-tidy job)

@@ -4,6 +4,9 @@ on:
   push: {}
   pull_request: {}
 
+env:
+  CMAKE_GENERATOR: Ninja
+
 jobs:
   job:
     name: run-clang-tidy
@@ -11,17 +14,17 @@ jobs:
     strategy:
       fail-fast: false
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v4
 
       - name: install clang-tidy
         run: sudo apt update && sudo apt -y install clang-tidy
 
       - name: create build environment
-        run: cmake -E make_directory ${{ runner.workspace }}/_build
+        run: cmake -E make_directory ${{ github.workspace }}/_build
 
       - name: configure cmake
         shell: bash
-        working-directory: ${{ runner.workspace }}/_build
+        working-directory: ${{ github.workspace }}/_build
         run: >
           cmake $GITHUB_WORKSPACE
           -DBENCHMARK_ENABLE_ASSEMBLY_TESTS=OFF
@@ -34,5 +37,5 @@ jobs:
 
       - name: run
         shell: bash
-        working-directory: ${{ runner.workspace }}/_build
-        run: run-clang-tidy
+        working-directory: ${{ github.workspace }}/_build
+        run: run-clang-tidy -config-file=$GITHUB_WORKSPACE/.clang-tidy
.github/workflows/doxygen.yml (5 changes)

@@ -6,13 +6,16 @@ on:
   pull_request:
     branches: [main]
 
+env:
+  CMAKE_GENERATOR: Ninja
+
 jobs:
   build-and-deploy:
     name: Build HTML documentation
     runs-on: ubuntu-latest
     steps:
       - name: Fetching sources
-        uses: actions/checkout@v2
+        uses: actions/checkout@v4
 
       - name: Installing build dependencies
         run: |
.github/workflows/pre-commit.yml (new file, 41 lines)

@@ -0,0 +1,41 @@
+name: python + Bazel pre-commit checks
+
+on:
+  push:
+    branches: [ main ]
+  pull_request:
+    branches: [ main ]
+
+env:
+  CMAKE_GENERATOR: Ninja
+
+jobs:
+  pre-commit:
+    runs-on: ubuntu-latest
+    env:
+      MYPY_CACHE_DIR: "${{ github.workspace }}/.cache/mypy"
+      RUFF_CACHE_DIR: "${{ github.workspace }}/.cache/ruff"
+      PRE_COMMIT_HOME: "${{ github.workspace }}/.cache/pre-commit"
+
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: 3.11
+          cache: pip
+          cache-dependency-path: pyproject.toml
+      - name: Install dependencies
+        run: python -m pip install ".[dev]"
+      - name: Cache pre-commit tools
+        uses: actions/cache@v4
+        with:
+          path: |
+            ${{ env.MYPY_CACHE_DIR }}
+            ${{ env.RUFF_CACHE_DIR }}
+            ${{ env.PRE_COMMIT_HOME }}
+          key: ${{ runner.os }}-${{ hashFiles('.pre-commit-config.yaml') }}-linter-cache
+      - name: Run pre-commit checks
+        run: pre-commit run --all-files --verbose --show-diff-on-failure
.github/workflows/pylint.yml (deleted, 28 lines)

@@ -1,28 +0,0 @@
-name: pylint
-
-on:
-  push:
-    branches: [ main ]
-  pull_request:
-    branches: [ main ]
-
-jobs:
-  pylint:
-
-    runs-on: ubuntu-latest
-
-    steps:
-    - uses: actions/checkout@v2
-    - name: Set up Python 3.8
-      uses: actions/setup-python@v1
-      with:
-        python-version: 3.8
-
-    - name: Install dependencies
-      run: |
-        python -m pip install --upgrade pip
-        pip install pylint pylint-exit conan
-
-    - name: Run pylint
-      run: |
-        pylint `find . -name '*.py'|xargs` || pylint-exit $?
.github/workflows/sanitizer.yml (35 changes)

@@ -5,21 +5,21 @@ on:
   pull_request: {}
 
 env:
+  CMAKE_GENERATOR: Ninja
   UBSAN_OPTIONS: "print_stacktrace=1"
 
 jobs:
   job:
-    name: ${{ matrix.sanitizer }}.${{ matrix.build_type }}.${{ matrix.compiler }}
+    name: ${{ matrix.sanitizer }}.${{ matrix.build_type }}
     runs-on: ubuntu-latest
     strategy:
       fail-fast: false
       matrix:
         build_type: ['Debug', 'RelWithDebInfo']
-        sanitizer: ['asan', 'ubsan', 'tsan']
-        compiler: ['clang', 'gcc']
-        # TODO: add 'msan' above. currently failing and needs investigation.
+        sanitizer: ['asan', 'ubsan', 'tsan', 'msan']
 
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v4
 
       - name: configure msan env
         if: matrix.sanitizer == 'msan'
@@ -45,31 +45,28 @@ jobs:
           echo "EXTRA_FLAGS=-g -O2 -fno-omit-frame-pointer -fsanitize=thread -fno-sanitize-recover=all" >> $GITHUB_ENV
           echo "LIBCXX_SANITIZER=Thread" >> $GITHUB_ENV
 
       - name: fine-tune asan options
         # in asan we get an error from std::regex. ignore it.
         if: matrix.sanitizer == 'asan'
         run: |
           echo "ASAN_OPTIONS=alloc_dealloc_mismatch=0" >> $GITHUB_ENV
 
       - name: setup clang
-        if: matrix.compiler == 'clang'
         uses: egor-tensin/setup-clang@v1
         with:
           version: latest
           platform: x64
 
       - name: configure clang
-        if: matrix.compiler == 'clang'
         run: |
           echo "CC=cc" >> $GITHUB_ENV
           echo "CXX=c++" >> $GITHUB_ENV
 
-      - name: configure gcc
-        if: matrix.compiler == 'gcc'
-        run: |
-          sudo apt update && sudo apt -y install gcc-10 g++-10
-          echo "CC=gcc-10" >> $GITHUB_ENV
-          echo "CXX=g++-10" >> $GITHUB_ENV
-
-      - name: install llvm stuff
-        if: matrix.compiler == 'clang'
+      - name: build libc++ (non-asan)
+        if: matrix.sanitizer != 'asan'
         run: |
-          "${GITHUB_WORKSPACE}/.github/.libcxx-setup.sh"
-          echo "EXTRA_CXX_FLAGS=\"-stdlib=libc++\"" >> $GITHUB_ENV
+          "${GITHUB_WORKSPACE}/.github/libcxx-setup.sh"
+          echo "EXTRA_CXX_FLAGS=-stdlib=libc++ -L${GITHUB_WORKSPACE}/llvm-build/lib -lc++abi -I${GITHUB_WORKSPACE}/llvm-build/include/c++/v1 -Isystem${GITHUB_WORKSPACE}/llvm-build/include/c++/v1 -Wl,-rpath,${GITHUB_WORKSPACE}/llvm-build/lib" >> $GITHUB_ENV
 
       - name: create build environment
         run: cmake -E make_directory ${{ runner.workspace }}/_build
@@ -79,7 +76,7 @@ jobs:
         working-directory: ${{ runner.workspace }}/_build
         run: >
           VERBOSE=1
-          cmake $GITHUB_WORKSPACE
+          cmake -GNinja $GITHUB_WORKSPACE
           -DBENCHMARK_ENABLE_ASSEMBLY_TESTS=OFF
           -DBENCHMARK_ENABLE_LIBPFM=OFF
           -DBENCHMARK_DOWNLOAD_DEPENDENCIES=ON
.github/workflows/test_bindings.yml (31 changes)

@@ -6,19 +6,28 @@ on:
   pull_request:
     branches: [main]
 
+env:
+  CMAKE_GENERATOR: Ninja
+
 jobs:
   python_bindings:
-    runs-on: ubuntu-latest
+    name: Test GBM Python ${{ matrix.python-version }} bindings on ${{ matrix.os }}
+    runs-on: ${{ matrix.os }}
+    strategy:
+      fail-fast: false
+      matrix:
+        os: [ ubuntu-latest, macos-latest, windows-latest ]
+        python-version: [ "3.10", "3.11", "3.12", "3.13" ]
 
     steps:
-      - uses: actions/checkout@v2
-      - name: Set up Python
-        uses: actions/setup-python@v1
+      - uses: actions/checkout@v4
         with:
-          python-version: 3.8
-      - name: Install benchmark
-        run:
-          python setup.py install
-      - name: Run example bindings
-        run:
-          python bindings/python/google_benchmark/example.py
+          fetch-depth: 0
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v5
+        with:
+          python-version: ${{ matrix.python-version }}
+      - name: Install GBM Python bindings on ${{ matrix.os }}
+        run: python -m pip install .
+      - name: Run example on ${{ matrix.os }} under Python ${{ matrix.python-version }}
+        run: python bindings/python/google_benchmark/example.py
.github/workflows/wheels.yml (80 changes)

@@ -6,26 +6,28 @@ on:
     types:
       - published
 
+env:
+  CMAKE_GENERATOR: Ninja
+
 jobs:
   build_sdist:
     name: Build source distribution
     runs-on: ubuntu-latest
     steps:
       - name: Check out repo
-        uses: actions/checkout@v3
-
-      - name: Install Python 3.9
-        uses: actions/setup-python@v3
+        uses: actions/checkout@v4
         with:
-          python-version: 3.9
-
-      - name: Build and check sdist
-        run: |
-          python setup.py sdist
-      - name: Upload sdist
-        uses: actions/upload-artifact@v3
+          fetch-depth: 0
+      - name: Install Python 3.12
+        uses: actions/setup-python@v5
         with:
-          name: dist
+          python-version: "3.12"
+      - run: python -m pip install build
+      - name: Build sdist
+        run: python -m build --sdist
+      - uses: actions/upload-artifact@v4
+        with:
+          name: dist-sdist
           path: dist/*.tar.gz
 
   build_wheels:
@@ -33,47 +35,57 @@ jobs:
     runs-on: ${{ matrix.os }}
     strategy:
       matrix:
-        os: [ubuntu-latest, macos-latest, windows-latest]
+        os: [ubuntu-latest, macos-13, macos-14, windows-latest]
 
     steps:
      - name: Check out Google Benchmark
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4
        with:
          fetch-depth: 0
 
+      - uses: actions/setup-python@v5
+        name: Install Python 3.12
+        with:
+          python-version: "3.12"
+      - run: pip install --upgrade pip uv
+
       - name: Set up QEMU
         if: runner.os == 'Linux'
-        uses: docker/setup-qemu-action@v2
+        uses: docker/setup-qemu-action@v3
         with:
           platforms: all
 
       - name: Build wheels on ${{ matrix.os }} using cibuildwheel
-        uses: pypa/cibuildwheel@v2.9.0
+        uses: pypa/cibuildwheel@v2.22.0
         env:
-          CIBW_BUILD: 'cp37-* cp38-* cp39-* cp310-* cp311-*'
-          CIBW_SKIP: "cp37-*-arm64 *-musllinux_*"
-          # TODO: Build ppc64le using some other trick
-          CIBW_ARCHS_LINUX: x86_64 aarch64
-          CIBW_ARCHS_MACOS: x86_64 arm64
-          CIBW_ARCHS_WINDOWS: AMD64
+          CIBW_BUILD: "cp310-* cp311-* cp312-*"
+          CIBW_BUILD_FRONTEND: "build[uv]"
+          CIBW_SKIP: "*-musllinux_*"
+          CIBW_ARCHS_LINUX: auto64 aarch64
+          CIBW_ARCHS_WINDOWS: auto64
           CIBW_BEFORE_ALL_LINUX: bash .github/install_bazel.sh
+          # Grab the rootless Bazel installation inside the container.
+          CIBW_ENVIRONMENT_LINUX: PATH=$PATH:$HOME/bin
           CIBW_TEST_COMMAND: python {project}/bindings/python/google_benchmark/example.py
+          # unused by Bazel, but needed explicitly by delocate on MacOS.
+          MACOSX_DEPLOYMENT_TARGET: "10.14"
 
       - name: Upload Google Benchmark ${{ matrix.os }} wheels
-        uses: actions/upload-artifact@v3
+        uses: actions/upload-artifact@v4
         with:
-          name: dist
-          path: ./wheelhouse/*.whl
+          name: dist-${{ matrix.os }}
+          path: wheelhouse/*.whl
 
   pypi_upload:
     name: Publish google-benchmark wheels to PyPI
     needs: [build_sdist, build_wheels]
     runs-on: ubuntu-latest
+    permissions:
+      id-token: write
     steps:
-      - uses: actions/download-artifact@v3
+      - uses: actions/download-artifact@v4
         with:
-          name: dist
           path: dist
-
-      - uses: pypa/gh-action-pypi-publish@v1.5.0
-        with:
-          user: __token__
-          password: ${{ secrets.PYPI_PASSWORD }}
+          pattern: dist-*
+          merge-multiple: true
+      - uses: pypa/gh-action-pypi-publish@release/v1
.gitignore (1 change)

@@ -46,6 +46,7 @@ rules.ninja
 
 # bazel output symlinks.
 bazel-*
+MODULE.bazel.lock
 
 # out-of-source build top-level folders.
 build/
.pre-commit-config.yaml (new file, 18 lines)

@@ -0,0 +1,18 @@
+repos:
+  - repo: https://github.com/keith/pre-commit-buildifier
+    rev: 8.0.1
+    hooks:
+      - id: buildifier
+      - id: buildifier-lint
+  - repo: https://github.com/pre-commit/mirrors-mypy
+    rev: v1.15.0
+    hooks:
+      - id: mypy
+        types_or: [ python, pyi ]
+        args: [ "--ignore-missing-imports", "--scripts-are-modules" ]
+  - repo: https://github.com/astral-sh/ruff-pre-commit
+    rev: v0.9.6
+    hooks:
+      - id: ruff
+        args: [ --fix, --exit-non-zero-on-fix ]
+      - id: ruff-format
.travis.yml (deleted, 208 lines)

@@ -1,208 +0,0 @@
-sudo: required
-dist: trusty
-language: cpp
-
-matrix:
-  include:
-    - compiler: gcc
-      addons:
-        apt:
-          packages:
-            - lcov
-      env: COMPILER=g++ C_COMPILER=gcc BUILD_TYPE=Coverage
-    - compiler: gcc
-      addons:
-        apt:
-          packages:
-            - g++-multilib
-            - libc6:i386
-      env:
-        - COMPILER=g++
-        - C_COMPILER=gcc
-        - BUILD_TYPE=Debug
-        - BUILD_32_BITS=ON
-        - EXTRA_FLAGS="-m32"
-    - compiler: gcc
-      addons:
-        apt:
-          packages:
-            - g++-multilib
-            - libc6:i386
-      env:
-        - COMPILER=g++
-        - C_COMPILER=gcc
-        - BUILD_TYPE=Release
-        - BUILD_32_BITS=ON
-        - EXTRA_FLAGS="-m32"
-    - compiler: gcc
-      env:
-        - INSTALL_GCC6_FROM_PPA=1
-        - COMPILER=g++-6 C_COMPILER=gcc-6 BUILD_TYPE=Debug
-        - ENABLE_SANITIZER=1
-        - EXTRA_FLAGS="-fno-omit-frame-pointer -g -O2 -fsanitize=undefined,address -fuse-ld=gold"
-    # Clang w/ libc++
-    - compiler: clang
-      dist: xenial
-      addons:
-        apt:
-          packages:
-            clang-3.8
-      env:
-        - INSTALL_GCC6_FROM_PPA=1
-        - COMPILER=clang++-3.8 C_COMPILER=clang-3.8 BUILD_TYPE=Debug
-        - LIBCXX_BUILD=1
-        - EXTRA_CXX_FLAGS="-stdlib=libc++"
-    - compiler: clang
-      dist: xenial
-      addons:
-        apt:
-          packages:
-            clang-3.8
-      env:
-        - INSTALL_GCC6_FROM_PPA=1
-        - COMPILER=clang++-3.8 C_COMPILER=clang-3.8 BUILD_TYPE=Release
-        - LIBCXX_BUILD=1
-        - EXTRA_CXX_FLAGS="-stdlib=libc++"
-    # Clang w/ 32bit libc++
-    - compiler: clang
-      dist: xenial
-      addons:
-        apt:
-          packages:
-            - clang-3.8
-            - g++-multilib
-            - libc6:i386
-      env:
-        - INSTALL_GCC6_FROM_PPA=1
-        - COMPILER=clang++-3.8 C_COMPILER=clang-3.8 BUILD_TYPE=Debug
-        - LIBCXX_BUILD=1
-        - BUILD_32_BITS=ON
-        - EXTRA_FLAGS="-m32"
-        - EXTRA_CXX_FLAGS="-stdlib=libc++"
-    # Clang w/ 32bit libc++
-    - compiler: clang
-      dist: xenial
-      addons:
-        apt:
-          packages:
-            - clang-3.8
-            - g++-multilib
-            - libc6:i386
-      env:
-        - INSTALL_GCC6_FROM_PPA=1
-        - COMPILER=clang++-3.8 C_COMPILER=clang-3.8 BUILD_TYPE=Release
-        - LIBCXX_BUILD=1
-        - BUILD_32_BITS=ON
-        - EXTRA_FLAGS="-m32"
-        - EXTRA_CXX_FLAGS="-stdlib=libc++"
-    # Clang w/ libc++, ASAN, UBSAN
-    - compiler: clang
-      dist: xenial
-      addons:
-        apt:
-          packages:
-            clang-3.8
-      env:
-        - INSTALL_GCC6_FROM_PPA=1
-        - COMPILER=clang++-3.8 C_COMPILER=clang-3.8 BUILD_TYPE=Debug
-        - LIBCXX_BUILD=1 LIBCXX_SANITIZER="Undefined;Address"
-        - ENABLE_SANITIZER=1
-        - EXTRA_FLAGS="-g -O2 -fno-omit-frame-pointer -fsanitize=undefined,address -fno-sanitize-recover=all"
-        - EXTRA_CXX_FLAGS="-stdlib=libc++"
-        - UBSAN_OPTIONS=print_stacktrace=1
-    # Clang w/ libc++ and MSAN
-    - compiler: clang
-      dist: xenial
-      addons:
-        apt:
-          packages:
-            clang-3.8
-      env:
-        - INSTALL_GCC6_FROM_PPA=1
-        - COMPILER=clang++-3.8 C_COMPILER=clang-3.8 BUILD_TYPE=Debug
-        - LIBCXX_BUILD=1 LIBCXX_SANITIZER=MemoryWithOrigins
-        - ENABLE_SANITIZER=1
-        - EXTRA_FLAGS="-g -O2 -fno-omit-frame-pointer -fsanitize=memory -fsanitize-memory-track-origins"
-        - EXTRA_CXX_FLAGS="-stdlib=libc++"
-    # Clang w/ libc++ and MSAN
-    - compiler: clang
-      dist: xenial
-      addons:
-        apt:
-          packages:
-            clang-3.8
-      env:
-        - INSTALL_GCC6_FROM_PPA=1
-        - COMPILER=clang++-3.8 C_COMPILER=clang-3.8 BUILD_TYPE=RelWithDebInfo
-        - LIBCXX_BUILD=1 LIBCXX_SANITIZER=Thread
-        - ENABLE_SANITIZER=1
-        - EXTRA_FLAGS="-g -O2 -fno-omit-frame-pointer -fsanitize=thread -fno-sanitize-recover=all"
-        - EXTRA_CXX_FLAGS="-stdlib=libc++"
-    - os: osx
-      osx_image: xcode8.3
-      compiler: clang
-      env:
-        - COMPILER=clang++
-        - BUILD_TYPE=Release
-        - BUILD_32_BITS=ON
-        - EXTRA_FLAGS="-m32"
-
-before_script:
-  - if [ -n "${LIBCXX_BUILD}" ]; then
-      source .libcxx-setup.sh;
-    fi
-  - if [ -n "${ENABLE_SANITIZER}" ]; then
-      export EXTRA_OPTIONS="-DBENCHMARK_ENABLE_ASSEMBLY_TESTS=OFF";
-    else
-      export EXTRA_OPTIONS="";
-    fi
-  - mkdir -p build && cd build
-
-before_install:
-  - if [ -z "$BUILD_32_BITS" ]; then
-      export BUILD_32_BITS=OFF && echo disabling 32 bit build;
-    fi
-  - if [ -n "${INSTALL_GCC6_FROM_PPA}" ]; then
-      sudo add-apt-repository -y "ppa:ubuntu-toolchain-r/test";
-      sudo apt-get update --option Acquire::Retries=100 --option Acquire::http::Timeout="60";
-    fi
-
-install:
-  - if [ -n "${INSTALL_GCC6_FROM_PPA}" ]; then
-      travis_wait sudo -E apt-get -yq --no-install-suggests --no-install-recommends install g++-6;
-    fi
-  - if [ "${TRAVIS_OS_NAME}" == "linux" -a "${BUILD_32_BITS}" == "OFF" ]; then
-      travis_wait sudo -E apt-get -y --no-install-suggests --no-install-recommends install llvm-3.9-tools;
-      sudo cp /usr/lib/llvm-3.9/bin/FileCheck /usr/local/bin/;
-    fi
-  - if [ "${BUILD_TYPE}" == "Coverage" -a "${TRAVIS_OS_NAME}" == "linux" ]; then
-      PATH=~/.local/bin:${PATH};
-      pip install --user --upgrade pip;
-      travis_wait pip install --user cpp-coveralls;
-    fi
-  - if [ "${C_COMPILER}" == "gcc-7" -a "${TRAVIS_OS_NAME}" == "osx" ]; then
-      rm -f /usr/local/include/c++;
-      brew update;
-      travis_wait brew install gcc@7;
-    fi
-  - if [ "${TRAVIS_OS_NAME}" == "linux" ]; then
-      sudo apt-get update -qq;
-      sudo apt-get install -qq unzip cmake3;
-      wget https://github.com/bazelbuild/bazel/releases/download/3.2.0/bazel-3.2.0-installer-linux-x86_64.sh --output-document bazel-installer.sh;
-      travis_wait sudo bash bazel-installer.sh;
-    fi
-  - if [ "${TRAVIS_OS_NAME}" == "osx" ]; then
-      curl -L -o bazel-installer.sh https://github.com/bazelbuild/bazel/releases/download/3.2.0/bazel-3.2.0-installer-darwin-x86_64.sh;
-      travis_wait sudo bash bazel-installer.sh;
-    fi
-
-script:
-  - cmake -DCMAKE_C_COMPILER=${C_COMPILER} -DCMAKE_CXX_COMPILER=${COMPILER} -DCMAKE_BUILD_TYPE=${BUILD_TYPE} -DCMAKE_C_FLAGS="${EXTRA_FLAGS}" -DCMAKE_CXX_FLAGS="${EXTRA_FLAGS} ${EXTRA_CXX_FLAGS}" -DBENCHMARK_DOWNLOAD_DEPENDENCIES=ON -DBENCHMARK_BUILD_32_BITS=${BUILD_32_BITS} ${EXTRA_OPTIONS} ..
-  - make
-  - ctest -C ${BUILD_TYPE} --output-on-failure
-  - bazel test -c dbg --define google_benchmark.have_regex=posix --announce_rc --verbose_failures --test_output=errors --keep_going //test/...
-
-after_success:
-  - if [ "${BUILD_TYPE}" == "Coverage" -a "${TRAVIS_OS_NAME}" == "linux" ]; then
-      coveralls --include src --include include --gcov-options '\-lp' --root .. --build-root .;
-    fi

.ycm_extra_conf.py (header row missing from the capture)

@@ -1,25 +1,30 @@
 import os
 
 import ycm_core
 
 # These are the compilation flags that will be used in case there's no
 # compilation database set (by default, one is not set).
 # CHANGE THIS LIST OF FLAGS. YES, THIS IS THE DROID YOU HAVE BEEN LOOKING FOR.
 flags = [
-'-Wall',
-'-Werror',
-'-pedantic-errors',
-'-std=c++0x',
-'-fno-strict-aliasing',
-'-O3',
-'-DNDEBUG',
-# ...and the same thing goes for the magic -x option which specifies the
-# language that the files to be compiled are written in. This is mostly
-# relevant for c++ headers.
-# For a C project, you would set this to 'c' instead of 'c++'.
-'-x', 'c++',
-'-I', 'include',
-'-isystem', '/usr/include',
-'-isystem', '/usr/local/include',
+    "-Wall",
+    "-Werror",
+    "-pedantic-errors",
+    "-std=c++0x",
+    "-fno-strict-aliasing",
+    "-O3",
+    "-DNDEBUG",
+    # ...and the same thing goes for the magic -x option which specifies the
+    # language that the files to be compiled are written in. This is mostly
+    # relevant for c++ headers.
+    # For a C project, you would set this to 'c' instead of 'c++'.
+    "-x",
+    "c++",
+    "-I",
+    "include",
+    "-isystem",
+    "/usr/include",
+    "-isystem",
+    "/usr/local/include",
 ]
@@ -29,87 +34,87 @@
 #
 # Most projects will NOT need to set this to anything; you can just change the
 # 'flags' list of compilation flags. Notice that YCM itself uses that approach.
-compilation_database_folder = ''
+compilation_database_folder = ""
 
-if os.path.exists( compilation_database_folder ):
-  database = ycm_core.CompilationDatabase( compilation_database_folder )
+if os.path.exists(compilation_database_folder):
+    database = ycm_core.CompilationDatabase(compilation_database_folder)
 else:
-  database = None
+    database = None
 
-SOURCE_EXTENSIONS = [ '.cc' ]
+SOURCE_EXTENSIONS = [".cc"]
+
 
 def DirectoryOfThisScript():
-  return os.path.dirname( os.path.abspath( __file__ ) )
+    return os.path.dirname(os.path.abspath(__file__))
+
 
-def MakeRelativePathsInFlagsAbsolute( flags, working_directory ):
-  if not working_directory:
-    return list( flags )
-  new_flags = []
-  make_next_absolute = False
-  path_flags = [ '-isystem', '-I', '-iquote', '--sysroot=' ]
-  for flag in flags:
-    new_flag = flag
+def MakeRelativePathsInFlagsAbsolute(flags, working_directory):
+    if not working_directory:
+        return list(flags)
+    new_flags = []
+    make_next_absolute = False
+    path_flags = ["-isystem", "-I", "-iquote", "--sysroot="]
+    for flag in flags:
+        new_flag = flag
 
-    if make_next_absolute:
-      make_next_absolute = False
-      if not flag.startswith( '/' ):
-        new_flag = os.path.join( working_directory, flag )
+        if make_next_absolute:
+            make_next_absolute = False
+            if not flag.startswith("/"):
+                new_flag = os.path.join(working_directory, flag)
 
-    for path_flag in path_flags:
-      if flag == path_flag:
-        make_next_absolute = True
-        break
+        for path_flag in path_flags:
+            if flag == path_flag:
+                make_next_absolute = True
+                break
 
-      if flag.startswith( path_flag ):
-        path = flag[ len( path_flag ): ]
-        new_flag = path_flag + os.path.join( working_directory, path )
-        break
+            if flag.startswith(path_flag):
+                path = flag[len(path_flag) :]
+                new_flag = path_flag + os.path.join(working_directory, path)
+                break
 
-    if new_flag:
-      new_flags.append( new_flag )
-  return new_flags
+        if new_flag:
+            new_flags.append(new_flag)
+    return new_flags
 
 
-def IsHeaderFile( filename ):
-  extension = os.path.splitext( filename )[ 1 ]
-  return extension in [ '.h', '.hxx', '.hpp', '.hh' ]
+def IsHeaderFile(filename):
+    extension = os.path.splitext(filename)[1]
+    return extension in [".h", ".hxx", ".hpp", ".hh"]
 
 
-def GetCompilationInfoForFile( filename ):
-  # The compilation_commands.json file generated by CMake does not have entries
-  # for header files. So we do our best by asking the db for flags for a
-  # corresponding source file, if any. If one exists, the flags for that file
-  # should be good enough.
-  if IsHeaderFile( filename ):
-    basename = os.path.splitext( filename )[ 0 ]
-    for extension in SOURCE_EXTENSIONS:
-      replacement_file = basename + extension
-      if os.path.exists( replacement_file ):
-        compilation_info = database.GetCompilationInfoForFile(
-          replacement_file )
-        if compilation_info.compiler_flags_:
-          return compilation_info
-    return None
-  return database.GetCompilationInfoForFile( filename )
+def GetCompilationInfoForFile(filename):
+    # The compilation_commands.json file generated by CMake does not have
+    # entries for header files. So we do our best by asking the db for flags for
+    # a corresponding source file, if any. If one exists, the flags for that
+    # file should be good enough.
+    if IsHeaderFile(filename):
+        basename = os.path.splitext(filename)[0]
+        for extension in SOURCE_EXTENSIONS:
+            replacement_file = basename + extension
+            if os.path.exists(replacement_file):
+                compilation_info = database.GetCompilationInfoForFile(
+                    replacement_file
+                )
+                if compilation_info.compiler_flags_:
+                    return compilation_info
+        return None
+    return database.GetCompilationInfoForFile(filename)
 
 
-def FlagsForFile( filename, **kwargs ):
-  if database:
-    # Bear in mind that compilation_info.compiler_flags_ does NOT return a
-    # python list, but a "list-like" StringVec object
-    compilation_info = GetCompilationInfoForFile( filename )
-    if not compilation_info:
-      return None
+def FlagsForFile(filename, **kwargs):
+    if database:
+        # Bear in mind that compilation_info.compiler_flags_ does NOT return a
+        # python list, but a "list-like" StringVec object
+        compilation_info = GetCompilationInfoForFile(filename)
+        if not compilation_info:
+            return None
 
-    final_flags = MakeRelativePathsInFlagsAbsolute(
-      compilation_info.compiler_flags_,
-      compilation_info.compiler_working_dir_ )
-  else:
-    relative_to = DirectoryOfThisScript()
-    final_flags = MakeRelativePathsInFlagsAbsolute( flags, relative_to )
+        final_flags = MakeRelativePathsInFlagsAbsolute(
+            compilation_info.compiler_flags_,
+            compilation_info.compiler_working_dir_,
+        )
+    else:
+        relative_to = DirectoryOfThisScript()
+        final_flags = MakeRelativePathsInFlagsAbsolute(flags, relative_to)
 
-  return {
-    'flags': final_flags,
-    'do_cache': True
-  }
+    return {"flags": final_flags, "do_cache": True}
AUTHORS (9 changes)

@@ -28,10 +28,13 @@ Eric Backus <eric_backus@alum.mit.edu>
 Eric Fiselier <eric@efcs.ca>
 Eugene Zhuk <eugene.zhuk@gmail.com>
 Evgeny Safronov <division494@gmail.com>
+Fabien Pichot <pichot.fabien@gmail.com>
 Federico Ficarelli <federico.ficarelli@gmail.com>
 Felix Homann <linuxaudio@showlabor.de>
+Gergely Meszaros <maetveis@gmail.com>
 Gergő Szitár <szitar.gergo@gmail.com>
 Google Inc.
+Henrique Bucher <hbucher@gmail.com>
 International Business Machines Corporation
 Ismael Jimenez Martinez <ismael.jimenez.martinez@gmail.com>
 Jern-Kuan Leong <jernkuan@gmail.com>
@@ -42,16 +45,19 @@ Jussi Knuuttila <jussi.knuuttila@gmail.com>
 Kaito Udagawa <umireon@gmail.com>
 Kishan Kumar <kumar.kishan@outlook.com>
 Lei Xu <eddyxu@gmail.com>
+Marcel Jacobse <mjacobse@uni-bremen.de>
 Matt Clarkson <mattyclarkson@gmail.com>
 Maxim Vafin <maxvafin@gmail.com>
+Mike Apodaca <gatorfax@gmail.com>
+Min-Yih Hsu <yihshyng223@gmail.com>
 MongoDB Inc.
 Nick Hutchinson <nshutchinson@gmail.com>
 Norman Heino <norman.heino@gmail.com>
 Oleksandr Sochka <sasha.sochka@gmail.com>
 Ori Livneh <ori.livneh@gmail.com>
 Paul Redmond <paul.redmond@gmail.com>
-Raghu Raja <raghu@enfabrica.net>
 Radoslav Yovchev <radoslav.tm@gmail.com>
+Raghu Raja <raghu@enfabrica.net>
 Rainer Orth <ro@cebitec.uni-bielefeld.de>
 Roman Lebedev <lebedev.ri@gmail.com>
 Sayan Bhattacharjee <aero.sayan@gmail.com>
@@ -64,4 +70,3 @@ Tobias Schmidt <tobias.schmidt@in.tum.de>
 Yixuan Qiu <yixuanq@gmail.com>
 Yusuke Suzuki <utatane.tea@gmail.com>
 Zbigniew Skowron <zbychs@gmail.com>
-Min-Yih Hsu <yihshyng223@gmail.com>
BUILD.bazel (71 changes)

@@ -1,20 +1,31 @@
 load("@rules_cc//cc:defs.bzl", "cc_library")
 
 licenses(["notice"])
 
-config_setting(
-    name = "qnx",
-    constraint_values = ["@platforms//os:qnx"],
-    values = {
-        "cpu": "x64_qnx",
-    },
-    visibility = [":__subpackages__"],
-)
+COPTS = [
+    "-pedantic",
+    "-pedantic-errors",
+    "-std=c++17",
+    "-Wall",
+    "-Wconversion",
+    "-Wextra",
+    "-Wshadow",
+    # "-Wshorten-64-to-32",
+    "-Wfloat-equal",
+    "-fstrict-aliasing",
+    ## assert() are used a lot in tests upstream, which may be optimised out leading to
+    ## unused-variable warning.
+    "-Wno-unused-variable",
+    "-Werror=old-style-cast",
+]
+
+MSVC_COPTS = [
+    "/std:c++17",
+]
 
 config_setting(
     name = "windows",
     constraint_values = ["@platforms//os:windows"],
     values = {
         "cpu": "x64_windows",
     },
     visibility = [":__subpackages__"],
 )
@@ -39,24 +50,35 @@ cc_library(
         "include/benchmark/benchmark.h",
         "include/benchmark/export.h",
     ],
-    linkopts = select({
-        ":windows": ["-DEFAULTLIB:shlwapi.lib"],
-        "//conditions:default": ["-pthread"],
+    copts = select({
+        ":windows": MSVC_COPTS,
+        "//conditions:default": COPTS,
     }),
-    strip_include_prefix = "include",
-    visibility = ["//visibility:public"],
-    # Only static linking is allowed; no .so will be produced.
-    # Using `defines` (i.e. not `local_defines`) means that no
-    # dependent rules need to bother about defining the macro.
-    linkstatic = True,
     defines = [
         "BENCHMARK_STATIC_DEFINE",
+        "BENCHMARK_VERSION=\\\"" + (module_version() if module_version() != None else "") + "\\\"",
     ] + select({
         ":perfcounters": ["HAVE_LIBPFM"],
         "//conditions:default": [],
     }),
+    includes = ["include"],
+    linkopts = select({
+        ":windows": ["-DEFAULTLIB:shlwapi.lib"],
+        "//conditions:default": ["-pthread"],
+    }),
+    # Only static linking is allowed; no .so will be produced.
+    # Using `defines` (i.e. not `local_defines`) means that no
+    # dependent rules need to bother about defining the macro.
+    linkstatic = True,
+    local_defines = [
+        # Turn on Large-file Support
+        "_FILE_OFFSET_BITS=64",
+        "_LARGEFILE64_SOURCE",
+        "_LARGEFILE_SOURCE",
+    ],
+    visibility = ["//visibility:public"],
     deps = select({
-        ":perfcounters": ["@libpfm//:libpfm"],
+        ":perfcounters": ["@libpfm"],
        "//conditions:default": [],
     }),
 )
@@ -64,8 +86,11 @@ cc_library(
 cc_library(
     name = "benchmark_main",
     srcs = ["src/benchmark_main.cc"],
-    hdrs = ["include/benchmark/benchmark.h", "include/benchmark/export.h"],
-    strip_include_prefix = "include",
+    hdrs = [
+        "include/benchmark/benchmark.h",
+        "include/benchmark/export.h",
+    ],
+    includes = ["include"],
     visibility = ["//visibility:public"],
     deps = [":benchmark"],
 )
102
CMakeLists.txt
102
CMakeLists.txt
@ -1,20 +1,7 @@
|
||||
cmake_minimum_required (VERSION 3.16.3)
|
||||
# Require CMake 3.10. If available, use the policies up to CMake 3.22.
|
||||
cmake_minimum_required (VERSION 3.13...3.22)
|
||||
|
||||
foreach(p
|
||||
CMP0048 # OK to clear PROJECT_VERSION on project()
|
||||
CMP0054 # CMake 3.1
|
||||
CMP0056 # export EXE_LINKER_FLAGS to try_run
|
||||
CMP0057 # Support no if() IN_LIST operator
|
||||
CMP0063 # Honor visibility properties for all targets
|
||||
CMP0067 # Honor language standard in try_compile() source file signature
|
||||
CMP0077 # Allow option() overrides in importing projects
|
||||
)
|
||||
if(POLICY ${p})
|
||||
cmake_policy(SET ${p} NEW)
|
||||
endif()
|
||||
endforeach()
|
||||
|
||||
project (benchmark VERSION 1.7.1 LANGUAGES CXX)
|
||||
project (benchmark VERSION 1.9.2 LANGUAGES CXX)
|
||||
|
||||
option(BENCHMARK_ENABLE_TESTING "Enable testing of the benchmark library." ON)
|
||||
option(BENCHMARK_ENABLE_EXCEPTIONS "Enable the use of exceptions in the benchmark library." ON)
|
||||
@ -27,11 +14,14 @@ if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "PGI")
|
||||
# PGC++ maybe reporting false positives.
|
||||
set(BENCHMARK_ENABLE_WERROR OFF)
|
||||
endif()
|
||||
if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "NVHPC")
|
||||
set(BENCHMARK_ENABLE_WERROR OFF)
|
||||
endif()
|
||||
if(BENCHMARK_FORCE_WERROR)
|
||||
set(BENCHMARK_ENABLE_WERROR ON)
|
||||
endif(BENCHMARK_FORCE_WERROR)
|
||||
|
||||
if(NOT MSVC)
|
||||
if(NOT (MSVC OR CMAKE_CXX_SIMULATE_ID STREQUAL "MSVC"))
|
||||
option(BENCHMARK_BUILD_32_BITS "Build a 32 bit version of the library." OFF)
|
||||
else()
|
||||
set(BENCHMARK_BUILD_32_BITS OFF CACHE BOOL "Build a 32 bit version of the library - unsupported when using MSVC)" FORCE)
|
||||
@ -55,7 +45,7 @@ option(BENCHMARK_ENABLE_LIBPFM "Enable performance counters provided by libpfm"
|
||||
set(CMAKE_CXX_VISIBILITY_PRESET hidden)
|
||||
set(CMAKE_VISIBILITY_INLINES_HIDDEN ON)
|
||||
|
||||
if(MSVC)
|
||||
if(CMAKE_CXX_COMPILER_ID STREQUAL "MSVC")
|
||||
# As of CMake 3.18, CMAKE_SYSTEM_PROCESSOR is not set properly for MSVC and
|
||||
# cross-compilation (e.g. Host=x86_64, target=aarch64) requires using the
|
||||
# undocumented, but working variable.
|
||||
@ -76,7 +66,7 @@ function(should_enable_assembly_tests)
|
||||
return()
|
||||
endif()
|
||||
endif()
|
||||
if (MSVC)
|
||||
if (MSVC OR CMAKE_CXX_SIMULATE_ID STREQUAL "MSVC")
|
||||
return()
|
||||
elseif(NOT CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64")
|
||||
return()
|
||||
@ -114,17 +104,27 @@ get_git_version(GIT_VERSION)
|
||||
|
||||
# If no git version can be determined, use the version
|
||||
# from the project() command
|
||||
if ("${GIT_VERSION}" STREQUAL "0.0.0")
|
||||
set(VERSION "${benchmark_VERSION}")
|
||||
if ("${GIT_VERSION}" STREQUAL "v0.0.0")
|
||||
set(VERSION "v${benchmark_VERSION}")
|
||||
else()
|
||||
set(VERSION "${GIT_VERSION}")
|
||||
endif()
|
||||
|
||||
# Normalize version: drop "v" prefix, replace first "-" with ".",
|
||||
# drop everything after second "-" (including said "-").
|
||||
string(STRIP ${VERSION} VERSION)
|
||||
if(VERSION MATCHES v[^-]*-)
|
||||
string(REGEX REPLACE "v([^-]*)-([0-9]+)-.*" "\\1.\\2" NORMALIZED_VERSION ${VERSION})
|
||||
else()
|
||||
string(REGEX REPLACE "v(.*)" "\\1" NORMALIZED_VERSION ${VERSION})
|
||||
endif()
|
||||
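As a concrete illustration of the normalization above (the `git describe` outputs are hypothetical examples, not taken from this repository):

```sh
# git describe output              -> NORMALIZED_VERSION
#   v1.9.2        (exactly on tag) -> 1.9.2
#   v1.9.2-12-g0123abcd            -> 1.9.2.12   (12 commits past the tag)
git describe --tags --match "v[0-9]*.[0-9]*.[0-9]*" --abbrev=8 --dirty
```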
|
||||
# Tell the user what versions we are using
|
||||
message(STATUS "Version: ${VERSION}")
|
||||
message(STATUS "Google Benchmark version: ${VERSION}, normalized to ${NORMALIZED_VERSION}")
|
||||
|
||||
# The version of the libraries
|
||||
set(GENERIC_LIB_VERSION ${VERSION})
|
||||
string(SUBSTRING ${VERSION} 0 1 GENERIC_LIB_SOVERSION)
|
||||
set(GENERIC_LIB_VERSION ${NORMALIZED_VERSION})
|
||||
string(SUBSTRING ${NORMALIZED_VERSION} 0 1 GENERIC_LIB_SOVERSION)
|
||||
|
||||
# Import our CMake modules
|
||||
include(AddCXXCompilerFlag)
|
||||
@ -138,11 +138,7 @@ if (BENCHMARK_BUILD_32_BITS)
|
||||
add_required_cxx_compiler_flag(-m32)
|
||||
endif()
|
||||
|
||||
if (MSVC)
|
||||
set(BENCHMARK_CXX_STANDARD 14)
|
||||
else()
|
||||
set(BENCHMARK_CXX_STANDARD 11)
|
||||
endif()
|
||||
set(BENCHMARK_CXX_STANDARD 17)
|
||||
|
||||
set(CMAKE_CXX_STANDARD ${BENCHMARK_CXX_STANDARD})
|
||||
set(CMAKE_CXX_STANDARD_REQUIRED YES)
|
||||
@ -152,8 +148,17 @@ if (MSVC)
|
||||
# Turn compiler warnings up to 11
|
||||
string(REGEX REPLACE "[-/]W[1-4]" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /W4")
|
||||
|
||||
# MP flag only applies to cl, not cl frontends to other compilers (e.g. clang-cl, icx-cl etc)
|
||||
if(CMAKE_CXX_COMPILER_ID MATCHES MSVC)
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /MP")
|
||||
endif()
|
||||
add_definitions(-D_CRT_SECURE_NO_WARNINGS)
|
||||
|
||||
if(BENCHMARK_ENABLE_WERROR)
|
||||
add_cxx_compiler_flag(-WX)
|
||||
endif()
|
||||
|
||||
if (NOT BENCHMARK_ENABLE_EXCEPTIONS)
|
||||
add_cxx_compiler_flag(-EHs-)
|
||||
add_cxx_compiler_flag(-EHa-)
|
||||
@ -180,15 +185,19 @@ if (MSVC)
|
||||
set(CMAKE_EXE_LINKER_FLAGS_MINSIZEREL "${CMAKE_EXE_LINKER_FLAGS_MINSIZEREL} /LTCG")
|
||||
endif()
|
||||
else()
|
||||
# Turn on Large-file Support
|
||||
add_definitions(-D_FILE_OFFSET_BITS=64)
|
||||
add_definitions(-D_LARGEFILE64_SOURCE)
|
||||
add_definitions(-D_LARGEFILE_SOURCE)
|
||||
# Turn compiler warnings up to 11
|
||||
add_cxx_compiler_flag(-Wall)
|
||||
add_cxx_compiler_flag(-Wextra)
|
||||
add_cxx_compiler_flag(-Wshadow)
|
||||
add_cxx_compiler_flag(-Wfloat-equal)
|
||||
add_cxx_compiler_flag(-Wold-style-cast)
|
||||
add_cxx_compiler_flag(-Wconversion)
|
||||
if(BENCHMARK_ENABLE_WERROR)
|
||||
add_cxx_compiler_flag(-Werror RELEASE)
|
||||
add_cxx_compiler_flag(-Werror RELWITHDEBINFO)
|
||||
add_cxx_compiler_flag(-Werror MINSIZEREL)
|
||||
add_cxx_compiler_flag(-Werror)
|
||||
endif()
|
||||
if (NOT BENCHMARK_ENABLE_TESTING)
|
||||
# Disable warning when compiling tests as gtest does not use 'override'.
|
||||
@ -201,24 +210,23 @@ else()
|
||||
# Disable warnings regarding deprecated parts of the library while building
|
||||
# and testing those parts of the library.
|
||||
add_cxx_compiler_flag(-Wno-deprecated-declarations)
|
||||
if (CMAKE_CXX_COMPILER_ID STREQUAL "Intel")
|
||||
if (CMAKE_CXX_COMPILER_ID STREQUAL "Intel" OR CMAKE_CXX_COMPILER_ID STREQUAL "IntelLLVM")
|
||||
# Intel silently ignores '-Wno-deprecated-declarations',
|
||||
# warning no. 1786 must be explicitly disabled.
|
||||
# See #631 for rationale.
|
||||
add_cxx_compiler_flag(-wd1786)
|
||||
add_cxx_compiler_flag(-fno-finite-math-only)
|
||||
endif()
|
||||
# Disable deprecation warnings for release builds (when -Werror is enabled).
|
||||
if(BENCHMARK_ENABLE_WERROR)
|
||||
add_cxx_compiler_flag(-Wno-deprecated RELEASE)
|
||||
add_cxx_compiler_flag(-Wno-deprecated RELWITHDEBINFO)
|
||||
add_cxx_compiler_flag(-Wno-deprecated MINSIZEREL)
|
||||
add_cxx_compiler_flag(-Wno-deprecated)
|
||||
endif()
|
||||
if (NOT BENCHMARK_ENABLE_EXCEPTIONS)
|
||||
add_cxx_compiler_flag(-fno-exceptions)
|
||||
endif()
|
||||
|
||||
if (HAVE_CXX_FLAG_FSTRICT_ALIASING)
|
||||
if (NOT CMAKE_CXX_COMPILER_ID STREQUAL "Intel") #ICC17u2: Many false positives for Wstrict-aliasing
|
||||
if (NOT CMAKE_CXX_COMPILER_ID STREQUAL "Intel" AND NOT CMAKE_CXX_COMPILER_ID STREQUAL "IntelLLVM") #ICC17u2: Many false positives for Wstrict-aliasing
|
||||
add_cxx_compiler_flag(-Wstrict-aliasing)
|
||||
endif()
|
||||
endif()
|
||||
@ -232,7 +240,7 @@ else()
|
||||
|
||||
# On most UNIX like platforms g++ and clang++ define _GNU_SOURCE as a
|
||||
# predefined macro, which turns on all of the wonderful libc extensions.
|
||||
# However g++ doesn't do this in Cygwin so we have to define it ourselfs
|
||||
# However g++ doesn't do this in Cygwin so we have to define it ourselves
|
||||
# since we depend on GNU/POSIX/BSD extensions.
|
||||
if (CYGWIN)
|
||||
add_definitions(-D_GNU_SOURCE=1)
|
||||
@ -283,7 +291,8 @@ if (BENCHMARK_USE_LIBCXX)
|
||||
if ("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang")
|
||||
add_cxx_compiler_flag(-stdlib=libc++)
|
||||
elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU" OR
|
||||
"${CMAKE_CXX_COMPILER_ID}" STREQUAL "Intel")
|
||||
"${CMAKE_CXX_COMPILER_ID}" STREQUAL "Intel" OR
|
||||
"${CMAKE_CXX_COMPILER_ID}" STREQUAL "IntelLLVM")
|
||||
add_cxx_compiler_flag(-nostdinc++)
|
||||
message(WARNING "libc++ header path must be manually specified using CMAKE_CXX_FLAGS")
|
||||
# Adding -nodefaultlibs directly to CMAKE_<TYPE>_LINKER_FLAGS will break
|
||||
@ -297,17 +306,11 @@ if (BENCHMARK_USE_LIBCXX)
|
||||
endif()
|
||||
endif(BENCHMARK_USE_LIBCXX)
|
||||
|
||||
set(EXTRA_CXX_FLAGS "")
|
||||
if (WIN32 AND "${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang")
|
||||
# Clang on Windows fails to compile the regex feature check under C++11
|
||||
set(EXTRA_CXX_FLAGS "-DCMAKE_CXX_STANDARD=14")
|
||||
endif()
|
||||
|
||||
# C++ feature checks
|
||||
# Determine the correct regular expression engine to use
|
||||
cxx_feature_check(STD_REGEX ${EXTRA_CXX_FLAGS})
|
||||
cxx_feature_check(GNU_POSIX_REGEX ${EXTRA_CXX_FLAGS})
|
||||
cxx_feature_check(POSIX_REGEX ${EXTRA_CXX_FLAGS})
|
||||
cxx_feature_check(STD_REGEX)
|
||||
cxx_feature_check(GNU_POSIX_REGEX)
|
||||
cxx_feature_check(POSIX_REGEX)
|
||||
if(NOT HAVE_STD_REGEX AND NOT HAVE_GNU_POSIX_REGEX AND NOT HAVE_POSIX_REGEX)
|
||||
message(FATAL_ERROR "Failed to determine the source files for the regular expression backend")
|
||||
endif()
|
||||
@ -320,9 +323,10 @@ cxx_feature_check(STEADY_CLOCK)
|
||||
# Ensure we have pthreads
|
||||
set(THREADS_PREFER_PTHREAD_FLAG ON)
|
||||
find_package(Threads REQUIRED)
|
||||
cxx_feature_check(PTHREAD_AFFINITY)
|
||||
|
||||
if (BENCHMARK_ENABLE_LIBPFM)
|
||||
find_package(PFM)
|
||||
find_package(PFM REQUIRED)
|
||||
endif()
|
||||
|
||||
# Set up directories
|
||||
|
13
CONTRIBUTORS
@ -42,29 +42,37 @@ Dominic Hamon <dma@stripysock.com> <dominic@google.com>
|
||||
Dominik Czarnota <dominik.b.czarnota@gmail.com>
|
||||
Dominik Korman <kormandominik@gmail.com>
|
||||
Donald Aingworth <donalds_junk_mail@yahoo.com>
|
||||
Doug Evans <xdje42@gmail.com>
|
||||
Eric Backus <eric_backus@alum.mit.edu>
|
||||
Eric Fiselier <eric@efcs.ca>
|
||||
Eugene Zhuk <eugene.zhuk@gmail.com>
|
||||
Evgeny Safronov <division494@gmail.com>
|
||||
Fabien Pichot <pichot.fabien@gmail.com>
|
||||
Fanbo Meng <fanbo.meng@ibm.com>
|
||||
Federico Ficarelli <federico.ficarelli@gmail.com>
|
||||
Felix Homann <linuxaudio@showlabor.de>
|
||||
Geoffrey Martin-Noble <gcmn@google.com> <gmngeoffrey@gmail.com>
|
||||
Gergely Meszaros <maetveis@gmail.com>
|
||||
Gergő Szitár <szitar.gergo@gmail.com>
|
||||
Hannes Hauswedell <h2@fsfe.org>
|
||||
Henrique Bucher <hbucher@gmail.com>
|
||||
Ismael Jimenez Martinez <ismael.jimenez.martinez@gmail.com>
|
||||
Iakov Sergeev <yahontu@gmail.com>
|
||||
Jern-Kuan Leong <jernkuan@gmail.com>
|
||||
JianXiong Zhou <zhoujianxiong2@gmail.com>
|
||||
Joao Paulo Magalhaes <joaoppmagalhaes@gmail.com>
|
||||
John Millikin <jmillikin@stripe.com>
|
||||
Jordan Williams <jwillikers@protonmail.com>
|
||||
Jussi Knuuttila <jussi.knuuttila@gmail.com>
|
||||
Kai Wolf <kai.wolf@gmail.com>
|
||||
Kaito Udagawa <umireon@gmail.com>
|
||||
Kai Wolf <kai.wolf@gmail.com>
|
||||
Kishan Kumar <kumar.kishan@outlook.com>
|
||||
Lei Xu <eddyxu@gmail.com>
|
||||
Marcel Jacobse <mjacobse@uni-bremen.de>
|
||||
Matt Clarkson <mattyclarkson@gmail.com>
|
||||
Maxim Vafin <maxvafin@gmail.com>
|
||||
Mike Apodaca <gatorfax@gmail.com>
|
||||
Min-Yih Hsu <yihshyng223@gmail.com>
|
||||
Nick Hutchinson <nshutchinson@gmail.com>
|
||||
Norman Heino <norman.heino@gmail.com>
|
||||
Oleksandr Sochka <sasha.sochka@gmail.com>
|
||||
@ -73,8 +81,8 @@ Pascal Leroy <phl@google.com>
|
||||
Paul Redmond <paul.redmond@gmail.com>
|
||||
Pierre Phaneuf <pphaneuf@google.com>
|
||||
Radoslav Yovchev <radoslav.tm@gmail.com>
|
||||
Rainer Orth <ro@cebitec.uni-bielefeld.de>
|
||||
Raghu Raja <raghu@enfabrica.net>
|
||||
Rainer Orth <ro@cebitec.uni-bielefeld.de>
|
||||
Raul Marin <rmrodriguez@cartodb.com>
|
||||
Ray Glover <ray.glover@uk.ibm.com>
|
||||
Robert Guo <robert.guo@mongodb.com>
|
||||
@ -88,4 +96,3 @@ Tom Madams <tom.ej.madams@gmail.com> <tmadams@google.com>
|
||||
Yixuan Qiu <yixuanq@gmail.com>
|
||||
Yusuke Suzuki <utatane.tea@gmail.com>
|
||||
Zbigniew Skowron <zbychs@gmail.com>
|
||||
Min-Yih Hsu <yihshyng223@gmail.com>
|
||||
|
41
MODULE.bazel
Normal file
@ -0,0 +1,41 @@
|
||||
module(
|
||||
name = "google_benchmark",
|
||||
version = "1.9.2",
|
||||
)
|
||||
|
||||
bazel_dep(name = "bazel_skylib", version = "1.7.1")
|
||||
bazel_dep(name = "platforms", version = "0.0.10")
|
||||
bazel_dep(name = "rules_cc", version = "0.0.9")
|
||||
|
||||
bazel_dep(name = "rules_python", version = "1.0.0", dev_dependency = True)
|
||||
bazel_dep(name = "googletest", version = "1.14.0", dev_dependency = True, repo_name = "com_google_googletest")
|
||||
|
||||
bazel_dep(name = "libpfm", version = "4.11.0.bcr.1")
|
||||
|
||||
# Register a toolchain for Python 3.9 to be able to build numpy. Python
|
||||
# versions >=3.10 are problematic.
|
||||
# A second reason for this is to be able to build Python hermetically instead
|
||||
# of relying on the changing default version from rules_python.
|
||||
|
||||
python = use_extension("@rules_python//python/extensions:python.bzl", "python", dev_dependency = True)
|
||||
python.toolchain(python_version = "3.8")
|
||||
python.toolchain(python_version = "3.9")
|
||||
python.toolchain(python_version = "3.10")
|
||||
python.toolchain(python_version = "3.11")
|
||||
python.toolchain(
|
||||
is_default = True,
|
||||
python_version = "3.12",
|
||||
)
|
||||
python.toolchain(python_version = "3.13")
|
||||
|
||||
pip = use_extension("@rules_python//python/extensions:pip.bzl", "pip", dev_dependency = True)
|
||||
pip.parse(
|
||||
hub_name = "tools_pip_deps",
|
||||
python_version = "3.9",
|
||||
requirements_lock = "//tools:requirements.txt",
|
||||
)
|
||||
use_repo(pip, "tools_pip_deps")
|
||||
|
||||
# -- bazel_dep definitions -- #
|
||||
|
||||
bazel_dep(name = "nanobind_bazel", version = "2.5.0", dev_dependency = True)
|
23
README.md
@ -4,10 +4,9 @@
|
||||
[](https://github.com/google/benchmark/actions/workflows/bazel.yml)
|
||||
[](https://github.com/google/benchmark/actions?query=workflow%3Apylint)
|
||||
[](https://github.com/google/benchmark/actions?query=workflow%3Atest-bindings)
|
||||
|
||||
[](https://travis-ci.org/google/benchmark)
|
||||
[](https://coveralls.io/r/google/benchmark)
|
||||
|
||||
[](https://discord.gg/cz7UX7wKC2)
|
||||
|
||||
A library to benchmark code snippets, similar to unit tests. Example:
|
||||
|
||||
@ -33,7 +32,7 @@ To get started, see [Requirements](#requirements) and
|
||||
[Installation](#installation). See [Usage](#usage) for a full example and the
|
||||
[User Guide](docs/user_guide.md) for a more comprehensive feature overview.
|
||||
|
||||
It may also help to read the [Google Test documentation](https://github.com/google/googletest/blob/master/docs/primer.md)
|
||||
It may also help to read the [Google Test documentation](https://github.com/google/googletest/blob/main/docs/primer.md)
|
||||
as some of the structural aspects of the APIs are similar.
|
||||
|
||||
## Resources
|
||||
@ -51,15 +50,13 @@ IRC channels:
|
||||
|
||||
## Requirements
|
||||
|
||||
The library can be used with C++03. However, it requires C++11 to build,
|
||||
The library can be used with C++11. However, it requires C++17 to build,
|
||||
including compiler and standard library support.
|
||||
|
||||
The following minimum versions are required to build the library:
|
||||
_See [dependencies.md](docs/dependencies.md) for more details regarding supported
|
||||
compilers and standards._
|
||||
|
||||
* GCC 4.8
|
||||
* Clang 3.4
|
||||
* Visual Studio 14 2015
|
||||
* Intel 2015 Update 1
|
||||
If you have need for a particular compiler to be supported, patches are very welcome.
|
||||
|
||||
See [Platform-Specific Build Instructions](docs/platform_specific_build_instructions.md).
|
||||
|
||||
@ -81,7 +78,7 @@ $ cmake -E make_directory "build"
|
||||
# Generate build system files with cmake, and download any dependencies.
|
||||
$ cmake -E chdir "build" cmake -DBENCHMARK_DOWNLOAD_DEPENDENCIES=on -DCMAKE_BUILD_TYPE=Release ../
|
||||
# or, starting with CMake 3.13, use a simpler form:
|
||||
# cmake -DCMAKE_BUILD_TYPE=Release -S . -B "build"
|
||||
# cmake -DBENCHMARK_DOWNLOAD_DEPENDENCIES=on -DCMAKE_BUILD_TYPE=Release -S . -B "build"
|
||||
# Build the library.
|
||||
$ cmake --build "build" --config Release
|
||||
```
|
||||
@ -139,6 +136,12 @@ cache variables, if autodetection fails.
|
||||
If you are using clang, you may need to set `LLVMAR_EXECUTABLE`,
|
||||
`LLVMNM_EXECUTABLE` and `LLVMRANLIB_EXECUTABLE` cmake cache variables.
|
||||
|
||||
To enable sanitizer checks (eg., `asan` and `tsan`), add:
|
||||
```
|
||||
-DCMAKE_C_FLAGS="-g -O2 -fno-omit-frame-pointer -fsanitize=address -fsanitize=thread -fno-sanitize-recover=all"
|
||||
-DCMAKE_CXX_FLAGS="-g -O2 -fno-omit-frame-pointer -fsanitize=address -fsanitize=thread -fno-sanitize-recover=all "
|
||||
```
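For instance, a full configure step for an AddressSanitizer build might look like the following (a sketch mirroring the earlier installation example; in practice pick one sanitizer per build rather than combining `asan` and `tsan`):

```sh
cmake -E chdir "build" cmake -DCMAKE_BUILD_TYPE=Debug \
  -DCMAKE_C_FLAGS="-g -O2 -fno-omit-frame-pointer -fsanitize=address -fno-sanitize-recover=all" \
  -DCMAKE_CXX_FLAGS="-g -O2 -fno-omit-frame-pointer -fsanitize=address -fno-sanitize-recover=all" ../
cmake --build "build" --config Debug
```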
|
||||
|
||||
### Stable and Experimental Library Versions
|
||||
|
||||
The main branch contains the latest stable version of the benchmarking library;
|
||||
|
81
WORKSPACE
@ -1,75 +1,20 @@
|
||||
workspace(name = "com_github_google_benchmark")
|
||||
|
||||
load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive")
|
||||
load("@bazel_tools//tools/build_defs/repo:git.bzl", "git_repository")
|
||||
load("//:bazel/benchmark_deps.bzl", "benchmark_deps")
|
||||
|
||||
benchmark_deps()
|
||||
|
||||
http_archive(
|
||||
name = "bazel_skylib",
|
||||
urls = [
|
||||
"https://mirror.bazel.build/github.com/bazelbuild/bazel-skylib/releases/download/1.2.1/bazel-skylib-1.2.1.tar.gz",
|
||||
"https://github.com/bazelbuild/bazel-skylib/releases/download/1.2.1/bazel-skylib-1.2.1.tar.gz",
|
||||
],
|
||||
sha256 = "f7be3474d42aae265405a592bb7da8e171919d74c16f082a5457840f06054728",
|
||||
load("@rules_python//python:repositories.bzl", "py_repositories")
|
||||
|
||||
py_repositories()
|
||||
|
||||
load("@rules_python//python:pip.bzl", "pip_parse")
|
||||
|
||||
pip_parse(
|
||||
name = "tools_pip_deps",
|
||||
requirements_lock = "//tools:requirements.txt",
|
||||
)
|
||||
|
||||
# https://github.com/bazelbuild/rules_foreign_cc/
|
||||
http_archive(
|
||||
name = "rules_foreign_cc",
|
||||
sha256 = "bcd0c5f46a49b85b384906daae41d277b3dc0ff27c7c752cc51e43048a58ec83",
|
||||
strip_prefix = "rules_foreign_cc-0.7.1",
|
||||
url = "https://github.com/bazelbuild/rules_foreign_cc/archive/0.7.1.tar.gz",
|
||||
)
|
||||
load("@tools_pip_deps//:requirements.bzl", "install_deps")
|
||||
|
||||
load("@rules_foreign_cc//foreign_cc:repositories.bzl", "rules_foreign_cc_dependencies")
|
||||
rules_foreign_cc_dependencies()
|
||||
|
||||
http_archive(
|
||||
name = "com_google_absl",
|
||||
sha256 = "f41868f7a938605c92936230081175d1eae87f6ea2c248f41077c8f88316f111",
|
||||
strip_prefix = "abseil-cpp-20200225.2",
|
||||
urls = ["https://github.com/abseil/abseil-cpp/archive/20200225.2.tar.gz"],
|
||||
)
|
||||
|
||||
git_repository(
|
||||
name = "com_google_googletest",
|
||||
remote = "https://github.com/google/googletest.git",
|
||||
tag = "release-1.11.0",
|
||||
)
|
||||
|
||||
# Downloaded from v4.9.0 tag at https://sourceforge.net/p/perfmon2/libpfm4/ref/master/tags/
|
||||
http_archive(
|
||||
name = "libpfm",
|
||||
build_file = "//tools:libpfm.BUILD.bazel",
|
||||
sha256 = "5da5f8872bde14b3634c9688d980f68bda28b510268723cc12973eedbab9fecc",
|
||||
type = "tar.gz",
|
||||
strip_prefix = "libpfm-4.11.0",
|
||||
urls = ["https://sourceforge.net/projects/perfmon2/files/libpfm4/libpfm-4.11.0.tar.gz/download"],
|
||||
)
|
||||
|
||||
http_archive(
|
||||
name = "pybind11",
|
||||
build_file = "@//bindings/python:pybind11.BUILD",
|
||||
sha256 = "eacf582fa8f696227988d08cfc46121770823839fe9e301a20fbce67e7cd70ec",
|
||||
strip_prefix = "pybind11-2.10.0",
|
||||
urls = ["https://github.com/pybind/pybind11/archive/v2.10.0.tar.gz"],
|
||||
)
|
||||
|
||||
new_local_repository(
|
||||
name = "python_headers",
|
||||
build_file = "@//bindings/python:python_headers.BUILD",
|
||||
path = "/usr/include/python3.6", # May be overwritten by setup.py.
|
||||
)
|
||||
|
||||
http_archive(
|
||||
name = "rules_python",
|
||||
url = "https://github.com/bazelbuild/rules_python/releases/download/0.1.0/rules_python-0.1.0.tar.gz",
|
||||
sha256 = "b6d46438523a3ec0f3cead544190ee13223a52f6a6765a29eae7b7cc24cc83a0",
|
||||
)
|
||||
|
||||
load("@rules_python//python:pip.bzl", pip3_install="pip_install")
|
||||
|
||||
pip3_install(
|
||||
name = "py_deps",
|
||||
requirements = "//:requirements.txt",
|
||||
)
|
||||
install_deps()
|
||||
|
2
WORKSPACE.bzlmod
Normal file
@ -0,0 +1,2 @@
|
||||
# This file marks the root of the Bazel workspace.
|
||||
# See MODULE.bazel for dependencies and setup.
|
54
bazel/benchmark_deps.bzl
Normal file
@ -0,0 +1,54 @@
|
||||
"""
|
||||
This file contains the Bazel build dependencies for Google Benchmark (both C++ source and Python bindings).
|
||||
"""
|
||||
|
||||
load("@bazel_tools//tools/build_defs/repo:git.bzl", "new_git_repository")
|
||||
load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive")
|
||||
|
||||
def benchmark_deps():
|
||||
"""Loads dependencies required to build Google Benchmark."""
|
||||
|
||||
if "bazel_skylib" not in native.existing_rules():
|
||||
http_archive(
|
||||
name = "bazel_skylib",
|
||||
sha256 = "cd55a062e763b9349921f0f5db8c3933288dc8ba4f76dd9416aac68acee3cb94",
|
||||
urls = [
|
||||
"https://mirror.bazel.build/github.com/bazelbuild/bazel-skylib/releases/download/1.5.0/bazel-skylib-1.5.0.tar.gz",
|
||||
"https://github.com/bazelbuild/bazel-skylib/releases/download/1.5.0/bazel-skylib-1.5.0.tar.gz",
|
||||
],
|
||||
)
|
||||
|
||||
if "rules_python" not in native.existing_rules():
|
||||
http_archive(
|
||||
name = "rules_python",
|
||||
sha256 = "e85ae30de33625a63eca7fc40a94fea845e641888e52f32b6beea91e8b1b2793",
|
||||
strip_prefix = "rules_python-0.27.1",
|
||||
url = "https://github.com/bazelbuild/rules_python/releases/download/0.27.1/rules_python-0.27.1.tar.gz",
|
||||
)
|
||||
|
||||
if "com_google_googletest" not in native.existing_rules():
|
||||
new_git_repository(
|
||||
name = "com_google_googletest",
|
||||
remote = "https://github.com/google/googletest.git",
|
||||
tag = "release-1.12.1",
|
||||
)
|
||||
|
||||
if "nanobind" not in native.existing_rules():
|
||||
new_git_repository(
|
||||
name = "nanobind",
|
||||
remote = "https://github.com/wjakob/nanobind.git",
|
||||
tag = "v1.9.2",
|
||||
build_file = "@//bindings/python:nanobind.BUILD",
|
||||
recursive_init_submodules = True,
|
||||
)
|
||||
|
||||
if "libpfm" not in native.existing_rules():
|
||||
# Downloaded from v4.9.0 tag at https://sourceforge.net/p/perfmon2/libpfm4/ref/master/tags/
|
||||
http_archive(
|
||||
name = "libpfm",
|
||||
build_file = str(Label("//tools:libpfm.BUILD.bazel")),
|
||||
sha256 = "5da5f8872bde14b3634c9688d980f68bda28b510268723cc12973eedbab9fecc",
|
||||
type = "tar.gz",
|
||||
strip_prefix = "libpfm-4.11.0",
|
||||
urls = ["https://sourceforge.net/projects/perfmon2/files/libpfm4/libpfm-4.11.0.tar.gz/download"],
|
||||
)
|
@ -1,3 +0,0 @@
|
||||
exports_files(glob(["*.BUILD"]))
|
||||
exports_files(["build_defs.bzl"])
|
||||
|
@ -1,25 +0,0 @@
|
||||
_SHARED_LIB_SUFFIX = {
|
||||
"//conditions:default": ".so",
|
||||
"//:windows": ".dll",
|
||||
}
|
||||
|
||||
def py_extension(name, srcs, hdrs = [], copts = [], features = [], deps = []):
|
||||
for shared_lib_suffix in _SHARED_LIB_SUFFIX.values():
|
||||
shared_lib_name = name + shared_lib_suffix
|
||||
native.cc_binary(
|
||||
name = shared_lib_name,
|
||||
linkshared = True,
|
||||
linkstatic = True,
|
||||
srcs = srcs + hdrs,
|
||||
copts = copts,
|
||||
features = features,
|
||||
deps = deps,
|
||||
)
|
||||
|
||||
return native.py_library(
|
||||
name = name,
|
||||
data = select({
|
||||
platform: [name + shared_lib_suffix]
|
||||
for platform, shared_lib_suffix in _SHARED_LIB_SUFFIX.items()
|
||||
}),
|
||||
)
|
@ -1,4 +1,5 @@
|
||||
load("//bindings/python:build_defs.bzl", "py_extension")
|
||||
load("@nanobind_bazel//:build_defs.bzl", "nanobind_extension", "nanobind_stubgen")
|
||||
load("@rules_python//python:defs.bzl", "py_library", "py_test")
|
||||
|
||||
py_library(
|
||||
name = "google_benchmark",
|
||||
@ -6,23 +7,19 @@ py_library(
|
||||
visibility = ["//visibility:public"],
|
||||
deps = [
|
||||
":_benchmark",
|
||||
# pip; absl:app
|
||||
],
|
||||
)
|
||||
|
||||
py_extension(
|
||||
nanobind_extension(
|
||||
name = "_benchmark",
|
||||
srcs = ["benchmark.cc"],
|
||||
copts = [
|
||||
"-fexceptions",
|
||||
"-fno-strict-aliasing",
|
||||
],
|
||||
features = ["-use_header_modules"],
|
||||
deps = [
|
||||
"//:benchmark",
|
||||
"@pybind11",
|
||||
"@python_headers",
|
||||
],
|
||||
deps = ["//:benchmark"],
|
||||
)
|
||||
|
||||
nanobind_stubgen(
|
||||
name = "benchmark_stubgen",
|
||||
marker_file = "bindings/python/google_benchmark/py.typed",
|
||||
module = ":_benchmark",
|
||||
)
|
||||
|
||||
py_test(
|
||||
@ -35,4 +32,3 @@ py_test(
|
||||
":google_benchmark",
|
||||
],
|
||||
)
|
||||
|
||||
|
@ -26,50 +26,31 @@ Example usage:
|
||||
if __name__ == '__main__':
|
||||
benchmark.main()
|
||||
"""
|
||||
|
||||
import atexit
|
||||
|
||||
from absl import app
|
||||
|
||||
from google_benchmark import _benchmark
|
||||
from google_benchmark._benchmark import (
|
||||
Counter,
|
||||
kNanosecond,
|
||||
kMicrosecond,
|
||||
kMillisecond,
|
||||
kSecond,
|
||||
oNone,
|
||||
o1,
|
||||
oN,
|
||||
oNSquared,
|
||||
oNCubed,
|
||||
oLogN,
|
||||
oNLogN,
|
||||
oAuto,
|
||||
oLambda,
|
||||
State,
|
||||
Counter as Counter,
|
||||
State as State,
|
||||
kMicrosecond as kMicrosecond,
|
||||
kMillisecond as kMillisecond,
|
||||
kNanosecond as kNanosecond,
|
||||
kSecond as kSecond,
|
||||
o1 as o1,
|
||||
oAuto as oAuto,
|
||||
oLambda as oLambda,
|
||||
oLogN as oLogN,
|
||||
oN as oN,
|
||||
oNCubed as oNCubed,
|
||||
oNLogN as oNLogN,
|
||||
oNone as oNone,
|
||||
oNSquared as oNSquared,
|
||||
)
|
||||
|
||||
|
||||
__all__ = [
|
||||
"register",
|
||||
"main",
|
||||
"Counter",
|
||||
"kNanosecond",
|
||||
"kMicrosecond",
|
||||
"kMillisecond",
|
||||
"kSecond",
|
||||
"oNone",
|
||||
"o1",
|
||||
"oN",
|
||||
"oNSquared",
|
||||
"oNCubed",
|
||||
"oLogN",
|
||||
"oNLogN",
|
||||
"oAuto",
|
||||
"oLambda",
|
||||
"State",
|
||||
]
|
||||
|
||||
__version__ = "1.7.1"
|
||||
__version__ = "1.9.2"
|
||||
|
||||
|
||||
class __OptionMaker:
|
||||
@ -79,7 +60,8 @@ class __OptionMaker:
|
||||
"""
|
||||
|
||||
class Options:
|
||||
"""Pure data class to store options calls, along with the benchmarked function."""
|
||||
"""Pure data class to store options calls, along with the benchmarked
|
||||
function."""
|
||||
|
||||
def __init__(self, func):
|
||||
self.func = func
|
||||
@ -97,14 +79,13 @@ class __OptionMaker:
|
||||
|
||||
# The function that get returned on @option.range(start=0, limit=1<<5).
|
||||
def __builder_method(*args, **kwargs):
|
||||
|
||||
# The decorator that get called, either with the benchmared function
|
||||
# or the previous Options
|
||||
def __decorator(func_or_options):
|
||||
options = self.make(func_or_options)
|
||||
options.builder_calls.append((builder_name, args, kwargs))
|
||||
# The decorator returns Options so it is not technically a decorator
|
||||
# and needs a final call to @regiser
|
||||
# The decorator returns Options so it is not technically a
|
||||
# decorator and needs a final call to @register
|
||||
return options
|
||||
|
||||
return __decorator
|
||||
@ -113,8 +94,8 @@ class __OptionMaker:
|
||||
|
||||
|
||||
# Alias for nicer API.
|
||||
# We have to instantiate an object, even if stateless, to be able to use __getattr__
|
||||
# on option.range
|
||||
# We have to instantiate an object, even if stateless, to be able to use
|
||||
# __getattr__ on option.range
|
||||
option = __OptionMaker()
|
||||
|
||||
|
||||
@ -124,8 +105,8 @@ def register(undefined=None, *, name=None):
|
||||
# Decorator is called without parenthesis so we return a decorator
|
||||
return lambda f: register(f, name=name)
|
||||
|
||||
# We have either the function to benchmark (simple case) or an instance of Options
|
||||
# (@option._ case).
|
||||
# We have either the function to benchmark (simple case) or an instance of
|
||||
# Options (@option._ case).
|
||||
options = __OptionMaker.make(undefined)
|
||||
|
||||
if name is None:
|
||||
|
@ -2,19 +2,16 @@
|
||||
|
||||
#include "benchmark/benchmark.h"
|
||||
|
||||
#include <map>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include "nanobind/nanobind.h"
|
||||
#include "nanobind/operators.h"
|
||||
#include "nanobind/stl/bind_map.h"
|
||||
#include "nanobind/stl/string.h"
|
||||
#include "nanobind/stl/vector.h"
|
||||
|
||||
#include "pybind11/operators.h"
|
||||
#include "pybind11/pybind11.h"
|
||||
#include "pybind11/stl.h"
|
||||
#include "pybind11/stl_bind.h"
|
||||
|
||||
PYBIND11_MAKE_OPAQUE(benchmark::UserCounters);
|
||||
NB_MAKE_OPAQUE(benchmark::UserCounters);
|
||||
|
||||
namespace {
|
||||
namespace py = ::pybind11;
|
||||
namespace nb = nanobind;
|
||||
|
||||
std::vector<std::string> Initialize(const std::vector<std::string>& argv) {
|
||||
// The `argv` pointers here become invalid when this function returns, but
|
||||
@ -37,15 +34,16 @@ std::vector<std::string> Initialize(const std::vector<std::string>& argv) {
|
||||
return remaining_argv;
|
||||
}
|
||||
|
||||
benchmark::internal::Benchmark* RegisterBenchmark(const char* name,
|
||||
py::function f) {
|
||||
benchmark::internal::Benchmark* RegisterBenchmark(const std::string& name,
|
||||
nb::callable f) {
|
||||
return benchmark::RegisterBenchmark(
|
||||
name, [f](benchmark::State& state) { f(&state); });
|
||||
}
|
||||
|
||||
PYBIND11_MODULE(_benchmark, m) {
|
||||
NB_MODULE(_benchmark, m) {
|
||||
|
||||
using benchmark::TimeUnit;
|
||||
py::enum_<TimeUnit>(m, "TimeUnit")
|
||||
nb::enum_<TimeUnit>(m, "TimeUnit")
|
||||
.value("kNanosecond", TimeUnit::kNanosecond)
|
||||
.value("kMicrosecond", TimeUnit::kMicrosecond)
|
||||
.value("kMillisecond", TimeUnit::kMillisecond)
|
||||
@ -53,74 +51,74 @@ PYBIND11_MODULE(_benchmark, m) {
|
||||
.export_values();
|
||||
|
||||
using benchmark::BigO;
|
||||
py::enum_<BigO>(m, "BigO")
|
||||
nb::enum_<BigO>(m, "BigO")
|
||||
.value("oNone", BigO::oNone)
|
||||
.value("o1", BigO::o1)
|
||||
.value("oN", BigO::oN)
|
||||
.value("oNSquared", BigO::oNSquared)
|
||||
.value("oNCubed", BigO::oNCubed)
|
||||
.value("oLogN", BigO::oLogN)
|
||||
.value("oNLogN", BigO::oLogN)
|
||||
.value("oNLogN", BigO::oNLogN)
|
||||
.value("oAuto", BigO::oAuto)
|
||||
.value("oLambda", BigO::oLambda)
|
||||
.export_values();
|
||||
|
||||
using benchmark::internal::Benchmark;
|
||||
py::class_<Benchmark>(m, "Benchmark")
|
||||
// For methods returning a pointer tor the current object, reference
|
||||
// return policy is used to ask pybind not to take ownership oof the
|
||||
nb::class_<Benchmark>(m, "Benchmark")
|
||||
// For methods returning a pointer to the current object, reference
|
||||
// return policy is used to ask nanobind not to take ownership of the
|
||||
// returned object and avoid calling delete on it.
|
||||
// https://pybind11.readthedocs.io/en/stable/advanced/functions.html#return-value-policies
|
||||
//
|
||||
// For methods taking a const std::vector<...>&, a copy is created
|
||||
// because a it is bound to a Python list.
|
||||
// https://pybind11.readthedocs.io/en/stable/advanced/cast/stl.html
|
||||
.def("unit", &Benchmark::Unit, py::return_value_policy::reference)
|
||||
.def("arg", &Benchmark::Arg, py::return_value_policy::reference)
|
||||
.def("args", &Benchmark::Args, py::return_value_policy::reference)
|
||||
.def("range", &Benchmark::Range, py::return_value_policy::reference,
|
||||
py::arg("start"), py::arg("limit"))
|
||||
.def("unit", &Benchmark::Unit, nb::rv_policy::reference)
|
||||
.def("arg", &Benchmark::Arg, nb::rv_policy::reference)
|
||||
.def("args", &Benchmark::Args, nb::rv_policy::reference)
|
||||
.def("range", &Benchmark::Range, nb::rv_policy::reference,
|
||||
nb::arg("start"), nb::arg("limit"))
|
||||
.def("dense_range", &Benchmark::DenseRange,
|
||||
py::return_value_policy::reference, py::arg("start"),
|
||||
py::arg("limit"), py::arg("step") = 1)
|
||||
.def("ranges", &Benchmark::Ranges, py::return_value_policy::reference)
|
||||
nb::rv_policy::reference, nb::arg("start"),
|
||||
nb::arg("limit"), nb::arg("step") = 1)
|
||||
.def("ranges", &Benchmark::Ranges, nb::rv_policy::reference)
|
||||
.def("args_product", &Benchmark::ArgsProduct,
|
||||
py::return_value_policy::reference)
|
||||
.def("arg_name", &Benchmark::ArgName, py::return_value_policy::reference)
|
||||
nb::rv_policy::reference)
|
||||
.def("arg_name", &Benchmark::ArgName, nb::rv_policy::reference)
|
||||
.def("arg_names", &Benchmark::ArgNames,
|
||||
py::return_value_policy::reference)
|
||||
nb::rv_policy::reference)
|
||||
.def("range_pair", &Benchmark::RangePair,
|
||||
py::return_value_policy::reference, py::arg("lo1"), py::arg("hi1"),
|
||||
py::arg("lo2"), py::arg("hi2"))
|
||||
nb::rv_policy::reference, nb::arg("lo1"), nb::arg("hi1"),
|
||||
nb::arg("lo2"), nb::arg("hi2"))
|
||||
.def("range_multiplier", &Benchmark::RangeMultiplier,
|
||||
py::return_value_policy::reference)
|
||||
.def("min_time", &Benchmark::MinTime, py::return_value_policy::reference)
|
||||
nb::rv_policy::reference)
|
||||
.def("min_time", &Benchmark::MinTime, nb::rv_policy::reference)
|
||||
.def("min_warmup_time", &Benchmark::MinWarmUpTime,
|
||||
py::return_value_policy::reference)
|
||||
nb::rv_policy::reference)
|
||||
.def("iterations", &Benchmark::Iterations,
|
||||
py::return_value_policy::reference)
|
||||
nb::rv_policy::reference)
|
||||
.def("repetitions", &Benchmark::Repetitions,
|
||||
py::return_value_policy::reference)
|
||||
nb::rv_policy::reference)
|
||||
.def("report_aggregates_only", &Benchmark::ReportAggregatesOnly,
|
||||
py::return_value_policy::reference, py::arg("value") = true)
|
||||
nb::rv_policy::reference, nb::arg("value") = true)
|
||||
.def("display_aggregates_only", &Benchmark::DisplayAggregatesOnly,
|
||||
py::return_value_policy::reference, py::arg("value") = true)
|
||||
nb::rv_policy::reference, nb::arg("value") = true)
|
||||
.def("measure_process_cpu_time", &Benchmark::MeasureProcessCPUTime,
|
||||
py::return_value_policy::reference)
|
||||
nb::rv_policy::reference)
|
||||
.def("use_real_time", &Benchmark::UseRealTime,
|
||||
py::return_value_policy::reference)
|
||||
nb::rv_policy::reference)
|
||||
.def("use_manual_time", &Benchmark::UseManualTime,
|
||||
py::return_value_policy::reference)
|
||||
nb::rv_policy::reference)
|
||||
.def(
|
||||
"complexity",
|
||||
(Benchmark * (Benchmark::*)(benchmark::BigO)) & Benchmark::Complexity,
|
||||
py::return_value_policy::reference,
|
||||
py::arg("complexity") = benchmark::oAuto);
|
||||
nb::rv_policy::reference,
|
||||
nb::arg("complexity") = benchmark::oAuto);
|
||||
|
||||
using benchmark::Counter;
|
||||
py::class_<Counter> py_counter(m, "Counter");
|
||||
nb::class_<Counter> py_counter(m, "Counter");
|
||||
|
||||
py::enum_<Counter::Flags>(py_counter, "Flags")
|
||||
nb::enum_<Counter::Flags>(py_counter, "Flags", nb::is_arithmetic(), nb::is_flag())
|
||||
.value("kDefaults", Counter::Flags::kDefaults)
|
||||
.value("kIsRate", Counter::Flags::kIsRate)
|
||||
.value("kAvgThreads", Counter::Flags::kAvgThreads)
|
||||
@ -131,52 +129,54 @@ PYBIND11_MODULE(_benchmark, m) {
|
||||
.value("kAvgIterations", Counter::Flags::kAvgIterations)
|
||||
.value("kAvgIterationsRate", Counter::Flags::kAvgIterationsRate)
|
||||
.value("kInvert", Counter::Flags::kInvert)
|
||||
.export_values()
|
||||
.def(py::self | py::self);
|
||||
.export_values();
|
||||
|
||||
py::enum_<Counter::OneK>(py_counter, "OneK")
|
||||
nb::enum_<Counter::OneK>(py_counter, "OneK")
|
||||
.value("kIs1000", Counter::OneK::kIs1000)
|
||||
.value("kIs1024", Counter::OneK::kIs1024)
|
||||
.export_values();
|
||||
|
||||
py_counter
|
||||
.def(py::init<double, Counter::Flags, Counter::OneK>(),
|
||||
py::arg("value") = 0., py::arg("flags") = Counter::kDefaults,
|
||||
py::arg("k") = Counter::kIs1000)
|
||||
.def(py::init([](double value) { return Counter(value); }))
|
||||
.def_readwrite("value", &Counter::value)
|
||||
.def_readwrite("flags", &Counter::flags)
|
||||
.def_readwrite("oneK", &Counter::oneK);
|
||||
py::implicitly_convertible<py::float_, Counter>();
|
||||
py::implicitly_convertible<py::int_, Counter>();
|
||||
.def(nb::init<double, Counter::Flags, Counter::OneK>(),
|
||||
nb::arg("value") = 0., nb::arg("flags") = Counter::kDefaults,
|
||||
nb::arg("k") = Counter::kIs1000)
|
||||
.def("__init__",
|
||||
([](Counter* c, double value) { new (c) Counter(value); }))
|
||||
.def_rw("value", &Counter::value)
|
||||
.def_rw("flags", &Counter::flags)
|
||||
.def_rw("oneK", &Counter::oneK)
|
||||
.def(nb::init_implicit<double>());
|
||||
|
||||
py::bind_map<benchmark::UserCounters>(m, "UserCounters");
|
||||
nb::implicitly_convertible<nb::int_, Counter>();
|
||||
|
||||
nb::bind_map<benchmark::UserCounters>(m, "UserCounters");
|
||||
|
||||
using benchmark::State;
|
||||
py::class_<State>(m, "State")
|
||||
nb::class_<State>(m, "State")
|
||||
.def("__bool__", &State::KeepRunning)
|
||||
.def_property_readonly("keep_running", &State::KeepRunning)
|
||||
.def_prop_ro("keep_running", &State::KeepRunning)
|
||||
.def("pause_timing", &State::PauseTiming)
|
||||
.def("resume_timing", &State::ResumeTiming)
|
||||
.def("skip_with_error", &State::SkipWithError)
|
||||
.def_property_readonly("error_occurred", &State::error_occurred)
|
||||
.def_prop_ro("error_occurred", &State::error_occurred)
|
||||
.def("set_iteration_time", &State::SetIterationTime)
|
||||
.def_property("bytes_processed", &State::bytes_processed,
|
||||
.def_prop_rw("bytes_processed", &State::bytes_processed,
|
||||
&State::SetBytesProcessed)
|
||||
.def_property("complexity_n", &State::complexity_length_n,
|
||||
.def_prop_rw("complexity_n", &State::complexity_length_n,
|
||||
&State::SetComplexityN)
|
||||
.def_property("items_processed", &State::items_processed,
|
||||
&State::SetItemsProcessed)
|
||||
.def("set_label", (void (State::*)(const char*)) & State::SetLabel)
|
||||
.def("range", &State::range, py::arg("pos") = 0)
|
||||
.def_property_readonly("iterations", &State::iterations)
|
||||
.def_readwrite("counters", &State::counters)
|
||||
.def_property_readonly("thread_index", &State::thread_index)
|
||||
.def_property_readonly("threads", &State::threads);
|
||||
.def_prop_rw("items_processed", &State::items_processed,
|
||||
&State::SetItemsProcessed)
|
||||
.def("set_label", &State::SetLabel)
|
||||
.def("range", &State::range, nb::arg("pos") = 0)
|
||||
.def_prop_ro("iterations", &State::iterations)
|
||||
.def_prop_ro("name", &State::name)
|
||||
.def_rw("counters", &State::counters)
|
||||
.def_prop_ro("thread_index", &State::thread_index)
|
||||
.def_prop_ro("threads", &State::threads);
|
||||
|
||||
m.def("Initialize", Initialize);
|
||||
m.def("RegisterBenchmark", RegisterBenchmark,
|
||||
py::return_value_policy::reference);
|
||||
nb::rv_policy::reference);
|
||||
m.def("RunSpecifiedBenchmarks",
|
||||
[]() { benchmark::RunSpecifiedBenchmarks(); });
|
||||
m.def("ClearRegisteredBenchmarks", benchmark::ClearRegisteredBenchmarks);
|
||||
|
@ -13,7 +13,8 @@
|
||||
# limitations under the License.
|
||||
"""Example of Python using C++ benchmark framework.
|
||||
|
||||
To run this example, you must first install the `google_benchmark` Python package.
|
||||
To run this example, you must first install the `google_benchmark` Python
|
||||
package.
|
||||
|
||||
To install using `setup.py`, download and extract the `google_benchmark` source.
|
||||
In the extracted directory, execute:
|
||||
@ -38,6 +39,7 @@ def sum_million(state):
|
||||
while state:
|
||||
sum(range(1_000_000))
|
||||
|
||||
|
||||
@benchmark.register
|
||||
def pause_timing(state):
|
||||
"""Pause timing every iteration."""
|
||||
@ -56,10 +58,11 @@ def skipped(state):
|
||||
state.skip_with_error("some error")
|
||||
return # NOTE: You must explicitly return, or benchmark will continue.
|
||||
|
||||
... # Benchmark code would be here.
|
||||
# Benchmark code would be here.
|
||||
|
||||
|
||||
@benchmark.register
|
||||
@benchmark.option.use_manual_time()
|
||||
def manual_timing(state):
|
||||
while state:
|
||||
# Manually count Python CPU time
|
||||
@ -72,11 +75,10 @@ def manual_timing(state):
|
||||
|
||||
@benchmark.register
|
||||
def custom_counters(state):
|
||||
"""Collect cutom metric using benchmark.Counter."""
|
||||
"""Collect custom metric using benchmark.Counter."""
|
||||
num_foo = 0.0
|
||||
while state:
|
||||
# Benchmark some code here
|
||||
pass
|
||||
# Collect some custom metric named foo
|
||||
num_foo += 0.13
|
||||
|
||||
@ -85,7 +87,9 @@ def custom_counters(state):
|
||||
# Set a counter as a rate.
|
||||
state.counters["foo_rate"] = Counter(num_foo, Counter.kIsRate)
|
||||
# Set a counter as an inverse of rate.
|
||||
state.counters["foo_inv_rate"] = Counter(num_foo, Counter.kIsRate | Counter.kInvert)
|
||||
state.counters["foo_inv_rate"] = Counter(
|
||||
num_foo, Counter.kIsRate | Counter.kInvert
|
||||
)
|
||||
# Set a counter as a thread-average quantity.
|
||||
state.counters["foo_avg"] = Counter(num_foo, Counter.kAvgThreads)
|
||||
# There's also a combined flag:
|
||||
|
@ -1,20 +0,0 @@
|
||||
cc_library(
|
||||
name = "pybind11",
|
||||
hdrs = glob(
|
||||
include = [
|
||||
"include/pybind11/*.h",
|
||||
"include/pybind11/detail/*.h",
|
||||
],
|
||||
exclude = [
|
||||
"include/pybind11/common.h",
|
||||
"include/pybind11/eigen.h",
|
||||
],
|
||||
),
|
||||
copts = [
|
||||
"-fexceptions",
|
||||
"-Wno-undefined-inline",
|
||||
"-Wno-pragma-once-outside-header",
|
||||
],
|
||||
includes = ["include"],
|
||||
visibility = ["//visibility:public"],
|
||||
)
|
@ -1,6 +0,0 @@
|
||||
cc_library(
|
||||
name = "python_headers",
|
||||
hdrs = glob(["**/*.h"]),
|
||||
includes = ["."],
|
||||
visibility = ["//visibility:public"],
|
||||
)
|
@ -1,2 +0,0 @@
|
||||
absl-py>=0.7.1
|
||||
|
@ -40,6 +40,8 @@ function(cxx_feature_check FILE)
|
||||
message(STATUS "Cross-compiling to test ${FEATURE}")
|
||||
try_compile(COMPILE_${FEATURE}
|
||||
${CMAKE_BINARY_DIR} ${CMAKE_CURRENT_SOURCE_DIR}/cmake/${FILE}.cpp
|
||||
CXX_STANDARD 17
|
||||
CXX_STANDARD_REQUIRED ON
|
||||
CMAKE_FLAGS ${FEATURE_CHECK_CMAKE_FLAGS}
|
||||
LINK_LIBRARIES ${BENCHMARK_CXX_LIBRARIES}
|
||||
OUTPUT_VARIABLE COMPILE_OUTPUT_VAR)
|
||||
@ -54,6 +56,8 @@ function(cxx_feature_check FILE)
|
||||
message(STATUS "Compiling and running to test ${FEATURE}")
|
||||
try_run(RUN_${FEATURE} COMPILE_${FEATURE}
|
||||
${CMAKE_BINARY_DIR} ${CMAKE_CURRENT_SOURCE_DIR}/cmake/${FILE}.cpp
|
||||
CXX_STANDARD 17
|
||||
CXX_STANDARD_REQUIRED ON
|
||||
CMAKE_FLAGS ${FEATURE_CHECK_CMAKE_FLAGS}
|
||||
LINK_LIBRARIES ${BENCHMARK_CXX_LIBRARIES}
|
||||
COMPILE_OUTPUT_VARIABLE COMPILE_OUTPUT_VAR)
|
||||
|
@ -4,4 +4,9 @@ include (CMakeFindDependencyMacro)
|
||||
|
||||
find_dependency (Threads)
|
||||
|
||||
if (@BENCHMARK_ENABLE_LIBPFM@)
|
||||
list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_LIST_DIR}")
|
||||
find_dependency (PFM)
|
||||
endif()
|
||||
|
||||
include("${CMAKE_CURRENT_LIST_DIR}/@targets_export_name@.cmake")
|
||||
|
@ -20,38 +20,16 @@ set(__get_git_version INCLUDED)
|
||||
|
||||
function(get_git_version var)
|
||||
if(GIT_EXECUTABLE)
|
||||
execute_process(COMMAND ${GIT_EXECUTABLE} describe --tags --match "v[0-9]*.[0-9]*.[0-9]*" --abbrev=8
|
||||
execute_process(COMMAND ${GIT_EXECUTABLE} describe --tags --match "v[0-9]*.[0-9]*.[0-9]*" --abbrev=8 --dirty
|
||||
WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}
|
||||
RESULT_VARIABLE status
|
||||
OUTPUT_VARIABLE GIT_DESCRIBE_VERSION
|
||||
OUTPUT_VARIABLE GIT_VERSION
|
||||
ERROR_QUIET)
|
||||
if(status)
|
||||
set(GIT_DESCRIBE_VERSION "v0.0.0")
|
||||
set(GIT_VERSION "v0.0.0")
|
||||
endif()
|
||||
|
||||
string(STRIP ${GIT_DESCRIBE_VERSION} GIT_DESCRIBE_VERSION)
|
||||
if(GIT_DESCRIBE_VERSION MATCHES v[^-]*-)
|
||||
string(REGEX REPLACE "v([^-]*)-([0-9]+)-.*" "\\1.\\2" GIT_VERSION ${GIT_DESCRIBE_VERSION})
|
||||
else()
|
||||
string(REGEX REPLACE "v(.*)" "\\1" GIT_VERSION ${GIT_DESCRIBE_VERSION})
|
||||
endif()
|
||||
|
||||
# Work out if the repository is dirty
|
||||
execute_process(COMMAND ${GIT_EXECUTABLE} update-index -q --refresh
|
||||
WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}
|
||||
OUTPUT_QUIET
|
||||
ERROR_QUIET)
|
||||
execute_process(COMMAND ${GIT_EXECUTABLE} diff-index --name-only HEAD --
|
||||
WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}
|
||||
OUTPUT_VARIABLE GIT_DIFF_INDEX
|
||||
ERROR_QUIET)
|
||||
string(COMPARE NOTEQUAL "${GIT_DIFF_INDEX}" "" GIT_DIRTY)
|
||||
if (${GIT_DIRTY})
|
||||
set(GIT_DESCRIBE_VERSION "${GIT_DESCRIBE_VERSION}-dirty")
|
||||
endif()
|
||||
message(STATUS "git version: ${GIT_DESCRIBE_VERSION} normalized to ${GIT_VERSION}")
|
||||
else()
|
||||
set(GIT_VERSION "0.0.0")
|
||||
set(GIT_VERSION "v0.0.0")
|
||||
endif()
|
||||
|
||||
set(${var} ${GIT_VERSION} PARENT_SCOPE)
|
||||
|
@ -29,19 +29,25 @@ set(gtest_force_shared_crt ON CACHE BOOL "" FORCE)
|
||||
|
||||
include(${GOOGLETEST_PREFIX}/googletest-paths.cmake)
|
||||
|
||||
# googletest doesn't seem to want to stay build warning clean so let's not hurt ourselves.
|
||||
if (MSVC)
|
||||
add_compile_options(/wd4244 /wd4722)
|
||||
else()
|
||||
add_compile_options(-w)
|
||||
endif()
|
||||
|
||||
# Add googletest directly to our build. This defines
|
||||
# the gtest and gtest_main targets.
|
||||
add_subdirectory(${GOOGLETEST_SOURCE_DIR}
|
||||
${GOOGLETEST_BINARY_DIR}
|
||||
EXCLUDE_FROM_ALL)
|
||||
|
||||
# googletest doesn't seem to want to stay build warning clean so let's not hurt ourselves.
|
||||
if (MSVC)
|
||||
target_compile_options(gtest PRIVATE "/wd4244" "/wd4722")
|
||||
target_compile_options(gtest_main PRIVATE "/wd4244" "/wd4722")
|
||||
target_compile_options(gmock PRIVATE "/wd4244" "/wd4722")
|
||||
target_compile_options(gmock_main PRIVATE "/wd4244" "/wd4722")
|
||||
else()
|
||||
target_compile_options(gtest PRIVATE "-w")
|
||||
target_compile_options(gtest_main PRIVATE "-w")
|
||||
target_compile_options(gmock PRIVATE "-w")
|
||||
target_compile_options(gmock_main PRIVATE "-w")
|
||||
endif()
|
||||
|
||||
if(NOT DEFINED GTEST_COMPILE_COMMANDS)
|
||||
set(GTEST_COMPILE_COMMANDS ON)
|
||||
endif()
|
||||
|
@ -1,4 +1,4 @@
|
||||
cmake_minimum_required(VERSION 2.8.12)
|
||||
cmake_minimum_required (VERSION 3.13...3.22)
|
||||
|
||||
project(googletest-download NONE)
|
||||
|
||||
@ -34,11 +34,12 @@ else()
|
||||
message(SEND_ERROR "Did not find Google Test sources! Either pass correct path in GOOGLETEST_PATH, or enable BENCHMARK_DOWNLOAD_DEPENDENCIES, or disable BENCHMARK_USE_BUNDLED_GTEST, or disable BENCHMARK_ENABLE_GTEST_TESTS / BENCHMARK_ENABLE_TESTING.")
|
||||
return()
|
||||
else()
|
||||
message(WARNING "Did not find Google Test sources! Fetching from web...")
|
||||
message(STATUS "Did not find Google Test sources! Fetching from web...")
|
||||
ExternalProject_Add(
|
||||
googletest
|
||||
GIT_REPOSITORY https://github.com/google/googletest.git
|
||||
GIT_TAG "release-1.11.0"
|
||||
GIT_TAG "v1.15.2"
|
||||
GIT_SHALLOW "ON"
|
||||
PREFIX "${CMAKE_BINARY_DIR}"
|
||||
STAMP_DIR "${CMAKE_BINARY_DIR}/stamp"
|
||||
DOWNLOAD_DIR "${CMAKE_BINARY_DIR}/download"
|
||||
|
@ -1,26 +1,28 @@
|
||||
# If successful, the following variables will be defined:
|
||||
# HAVE_LIBPFM.
|
||||
# Set BENCHMARK_ENABLE_LIBPFM to 0 to disable, regardless of libpfm presence.
|
||||
include(CheckIncludeFile)
|
||||
include(CheckLibraryExists)
|
||||
# PFM_FOUND.
|
||||
# PFM_LIBRARIES
|
||||
# PFM_INCLUDE_DIRS
|
||||
# the following target will be defined:
|
||||
# PFM::libpfm
|
||||
|
||||
include(FeatureSummary)
|
||||
enable_language(C)
|
||||
include(FindPackageHandleStandardArgs)
|
||||
|
||||
set_package_properties(PFM PROPERTIES
|
||||
URL http://perfmon2.sourceforge.net/
|
||||
DESCRIPTION "a helper library to develop monitoring tools"
|
||||
DESCRIPTION "A helper library to develop monitoring tools"
|
||||
PURPOSE "Used to program specific performance monitoring events")
|
||||
|
||||
check_library_exists(libpfm.a pfm_initialize "" HAVE_LIBPFM_INITIALIZE)
|
||||
if(HAVE_LIBPFM_INITIALIZE)
|
||||
check_include_file(perfmon/perf_event.h HAVE_PERFMON_PERF_EVENT_H)
|
||||
check_include_file(perfmon/pfmlib.h HAVE_PERFMON_PFMLIB_H)
|
||||
check_include_file(perfmon/pfmlib_perf_event.h HAVE_PERFMON_PFMLIB_PERF_EVENT_H)
|
||||
if(HAVE_PERFMON_PERF_EVENT_H AND HAVE_PERFMON_PFMLIB_H AND HAVE_PERFMON_PFMLIB_PERF_EVENT_H)
|
||||
message("Using Perf Counters.")
|
||||
set(HAVE_LIBPFM 1)
|
||||
set(PFM_FOUND 1)
|
||||
endif()
|
||||
else()
|
||||
message("Perf Counters support requested, but was unable to find libpfm.")
|
||||
find_library(PFM_LIBRARY NAMES pfm)
|
||||
find_path(PFM_INCLUDE_DIR NAMES perfmon/pfmlib.h)
|
||||
|
||||
find_package_handle_standard_args(PFM REQUIRED_VARS PFM_LIBRARY PFM_INCLUDE_DIR)
|
||||
|
||||
if (PFM_FOUND AND NOT TARGET PFM::libpfm)
|
||||
add_library(PFM::libpfm UNKNOWN IMPORTED)
|
||||
set_target_properties(PFM::libpfm PROPERTIES
|
||||
IMPORTED_LOCATION "${PFM_LIBRARY}"
|
||||
INTERFACE_INCLUDE_DIRECTORIES "${PFM_INCLUDE_DIR}")
|
||||
endif()
|
||||
|
||||
mark_as_advanced(PFM_LIBRARY PFM_INCLUDE_DIR)
|
||||
|
@ -5,8 +5,8 @@ includedir=@CMAKE_INSTALL_FULL_INCLUDEDIR@
|
||||
|
||||
Name: @PROJECT_NAME@
|
||||
Description: Google microbenchmark framework
|
||||
Version: @VERSION@
|
||||
Version: @NORMALIZED_VERSION@
|
||||
|
||||
Libs: -L${libdir} -lbenchmark
|
||||
Libs.private: -lpthread
|
||||
Libs.private: -lpthread @BENCHMARK_PRIVATE_LINK_LIBRARIES@
|
||||
Cflags: -I${includedir}
|
||||
|
7
cmake/benchmark_main.pc.in
Normal file
@ -0,0 +1,7 @@
|
||||
libdir=@CMAKE_INSTALL_FULL_LIBDIR@
|
||||
|
||||
Name: @PROJECT_NAME@
|
||||
Description: Google microbenchmark framework (with main() function)
|
||||
Version: @NORMALIZED_VERSION@
|
||||
Requires: benchmark
|
||||
Libs: -L${libdir} -lbenchmark_main
|
16
cmake/pthread_affinity.cpp
Normal file
@ -0,0 +1,16 @@
|
||||
#include <pthread.h>
|
||||
int main() {
|
||||
cpu_set_t set;
|
||||
CPU_ZERO(&set);
|
||||
for (int i = 0; i < CPU_SETSIZE; ++i) {
|
||||
CPU_SET(i, &set);
|
||||
CPU_CLR(i, &set);
|
||||
}
|
||||
pthread_t self = pthread_self();
|
||||
int ret;
|
||||
ret = pthread_getaffinity_np(self, sizeof(set), &set);
|
||||
if (ret != 0) return ret;
|
||||
ret = pthread_setaffinity_np(self, sizeof(set), &set);
|
||||
if (ret != 0) return ret;
|
||||
return 0;
|
||||
}
|
@ -111,6 +111,7 @@ between compilers or compiler versions. A common example of this
|
||||
is matching stack frame addresses. In this case regular expressions
|
||||
can be used to match the differing bits of output. For example:
|
||||
|
||||
<!-- {% raw %} -->
|
||||
```c++
|
||||
int ExternInt;
|
||||
struct Point { int x, y, z; };
|
||||
@ -127,6 +128,7 @@ extern "C" void test_store_point() {
|
||||
// CHECK: ret
|
||||
}
|
||||
```
|
||||
<!-- {% endraw %} -->
|
||||
|
||||
## Current Requirements and Limitations
|
||||
|
||||
|
@ -1 +1,3 @@
theme: jekyll-theme-minimal
theme: jekyll-theme-minimal
logo: /assets/images/icon_black.png
show_downloads: true
BIN docs/assets/images/icon.png (new binary file, 11 KiB)
BIN docs/assets/images/icon.xcf (new binary file)
BIN docs/assets/images/icon_black.png (new binary file, 11 KiB)
BIN docs/assets/images/icon_black.xcf (new binary file)
@ -1,21 +1,19 @@
# Build tool dependency policy

To ensure the broadest compatibility when building the benchmark library, but
still allow forward progress, we require any build tooling to be available for:
We follow the [Foundational C++ support policy](https://opensource.google/documentation/policies/cplusplus-support) for our build tools. In
particular the ["Build Systems" section](https://opensource.google/documentation/policies/cplusplus-support#build-systems).

* Debian stable _and_
* The last two Ubuntu LTS releases
## CMake

Currently, this means using build tool versions that are available for Ubuntu
Ubuntu 20.04 (Focal Fossa), Ubuntu 22.04 (Jammy Jellyfish) and Debian 11.4 (bullseye).
The current supported version is CMake 3.10 as of 2023-08-10. Most modern
distributions include newer versions, for example:

_Note, CI also runs ubuntu-18.04 to attempt best effort support for older versions._
* Ubuntu 20.04 provides CMake 3.16.3
* Debian 11.4 provides CMake 3.18.4
* Ubuntu 22.04 provides CMake 3.22.1

## cmake
The current supported version is cmake 3.16.3 as of 2022-08-10.

* _3.10.2 (ubuntu 18.04)_
* 3.16.3 (ubuntu 20.04)
* 3.18.4 (debian 11.4)
* 3.22.1 (ubuntu 22.04)
## Python

The Python bindings require Python 3.10+ as of v1.9.0 (2024-08-16) for installation from PyPI.
Building from source for older versions probably still works, though. See the [user guide](python_bindings.md) for details on how to build from source.
The minimum theoretically supported version is Python 3.8, since the used bindings generator (nanobind) only supports Python 3.8+.
@ -4,7 +4,9 @@
* [Dependencies](dependencies.md)
* [Perf Counters](perf_counters.md)
* [Platform Specific Build Instructions](platform_specific_build_instructions.md)
* [Python Bindings](python_bindings.md)
* [Random Interleaving](random_interleaving.md)
* [Reducing Variance](reducing_variance.md)
* [Releasing](releasing.md)
* [Tools](tools.md)
* [User Guide](user_guide.md)
* [User Guide](user_guide.md)
@ -19,7 +19,7 @@ The feature does not require modifying benchmark code. Counter collection is
handled at the boundaries where timer collection is also handled.

To opt-in:
* If using a Bazel build, add `--define pfm=1` to your buid flags
* If using a Bazel build, add `--define pfm=1` to your build flags
* If using CMake:
  * Install `libpfm4-dev`, e.g. `apt-get install libpfm4-dev`.
  * Enable the CMake flag `BENCHMARK_ENABLE_LIBPFM` in `CMakeLists.txt`.
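For reference, a typical opt-in might look like this (the build directory and target label are illustrative):

```sh
# CMake (after installing libpfm4-dev):
cmake -S . -B build -DBENCHMARK_ENABLE_LIBPFM=ON -DCMAKE_BUILD_TYPE=Release
cmake --build build --config Release

# Bazel:
bazel build --define pfm=1 //:benchmark
```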
@ -3,7 +3,7 @@
Python bindings are available as wheels on [PyPI](https://pypi.org/project/google-benchmark/) for importing and
using Google Benchmark directly in Python.
Currently, pre-built wheels exist for macOS (both ARM64 and Intel x86), Linux x86-64 and 64-bit Windows.
Supported Python versions are Python 3.7 - 3.10.
Supported Python versions are Python 3.8 - 3.12.

To install Google Benchmark's Python bindings, run:

@ -25,9 +25,9 @@ python3 -m venv venv --system-site-packages
source venv/bin/activate # .\venv\Scripts\Activate.ps1 on Windows

# upgrade Python's system-wide packages
python -m pip install --upgrade pip setuptools wheel
# builds the wheel and stores it in the directory "wheelhouse".
python -m pip wheel . -w wheelhouse
python -m pip install --upgrade pip build
# builds the wheel and stores it in the directory "dist".
python -m build
```

NB: Building wheels from source requires Bazel. For platform-specific instructions on how to install Bazel,
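Assuming the build above succeeds, the resulting wheel can be installed and smoke-tested from the `dist` directory (the exact wheel filename depends on platform and Python version):

```sh
python -m pip install dist/*.whl
python -c "import google_benchmark as bm; print(bm.__version__)"
```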
@ -14,8 +14,6 @@ you might want to disable the CPU frequency scaling while running the
benchmark, as well as consider other ways to stabilize the performance of
your system while benchmarking.

See [Reducing Variance](reducing_variance.md) for more information.

Exactly how to do this depends on the Linux distribution,
desktop environment, and installed programs. Specific details are a moving
target, so we will not attempt to exhaustively document them here.
@ -67,10 +65,10 @@ program.
Reducing sources of variance is OS and architecture dependent, which is one
reason some companies maintain machines dedicated to performance testing.

Some of the easier and and effective ways of reducing variance on a typical
Some of the easier and effective ways of reducing variance on a typical
Linux workstation are:

1. Use the performance governer as [discussed
1. Use the performance governor as [discussed
above](user_guide#disabling-cpu-frequency-scaling).
1. Disable processor boosting by:
```sh
@ -89,7 +87,7 @@ above](user_guide#disabling-cpu-frequency-scaling).
4. Close other programs that do non-trivial things based on timers, such as
your web browser, desktop environment, etc.
5. Reduce the working set of your benchmark to fit within the L1 cache, but
do be aware that this may lead you to optimize for an unrelistic
do be aware that this may lead you to optimize for an unrealistic
situation.

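For the first item, one common way to switch the governor on Linux is `cpupower` (a sketch; the exact tool and available governors vary by distribution):

```sh
# Switch all CPUs to the "performance" governor; switch back to the previous
# governor (often "powersave" or "schedutil") once benchmarking is done.
sudo cpupower frequency-set --governor performance
```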
Further resources on this topic:
@ -8,23 +8,24 @@
* `git log $(git describe --abbrev=0 --tags)..HEAD` gives you the list of
commits between the last annotated tag and HEAD
* Pick the most interesting.
* Create one last commit that updates the version saved in `CMakeLists.txt` and the
`__version__` variable in `bindings/python/google_benchmark/__init__.py`to the release
version you're creating. (This version will be used if benchmark is installed from the
archive you'll be creating in the next step.)
* Create one last commit that updates the version saved in `CMakeLists.txt`, `MODULE.bazel`,
and `bindings/python/google_benchmark/__init__.py` to the release version you're creating.
(This version will be used if benchmark is installed from the archive you'll be creating
in the next step.)

```
project (benchmark VERSION 1.6.0 LANGUAGES CXX)
# CMakeLists.txt
project (benchmark VERSION 1.9.0 LANGUAGES CXX)
```

```python
# bindings/python/google_benchmark/__init__.py
```
# MODULE.bazel
module(name = "com_github_google_benchmark", version="1.9.0")
```

# ...

__version__ = "1.6.0" # <-- change this to the release version you are creating

# ...
```
# google_benchmark/__init__.py
__version__ = "1.9.0"
```

* Create a release through github's interface
@ -34,4 +35,4 @@ __version__ = "1.6.0" # <-- change this to the release version you are creating
* `git tag -a -f <tag> <tag>`
* `git push --force --tags origin`
* Confirm that the "Build and upload Python wheels" action runs to completion
* run it manually if it hasn't run
* Run it manually if it hasn't run.
140
docs/tools.md
@ -186,6 +186,146 @@ Benchmark Time CPU Time Old
|
||||
This is a mix of the previous two modes, two (potentially different) benchmark binaries are run, and a different filter is applied to each one.
|
||||
As you can note, the values in `Time` and `CPU` columns are calculated as `(new - old) / |old|`.
|
||||
|
||||
### Note: Interpreting the output
|
||||
|
||||
Performance measurements are an art, and performance comparisons are doubly so.
|
||||
Results are often noisy and don't necessarily have large absolute differences to
|
||||
them, so just by visual inspection, it is not at all apparent if two
|
||||
measurements are actually showing a performance change or not. It is even more
|
||||
confusing with multiple benchmark repetitions.
|
||||
|
||||
Thankfully, what we can do, is use statistical tests on the results to determine
|
||||
whether the performance has statistically-significantly changed. `compare.py`
|
||||
uses [Mann–Whitney U
|
||||
test](https://en.wikipedia.org/wiki/Mann%E2%80%93Whitney_U_test), with a null
|
||||
hypothesis being that there's no difference in performance.
|
||||
|
||||
**The below output is a summary of a benchmark comparison with statistics
|
||||
provided for a multi-threaded process.**
|
||||
```
|
||||
Benchmark Time CPU Time Old Time New CPU Old CPU New
|
||||
-----------------------------------------------------------------------------------------------------------------------------
|
||||
benchmark/threads:1/process_time/real_time_pvalue 0.0000 0.0000 U Test, Repetitions: 27 vs 27
|
||||
benchmark/threads:1/process_time/real_time_mean -0.1442 -0.1442 90 77 90 77
|
||||
benchmark/threads:1/process_time/real_time_median -0.1444 -0.1444 90 77 90 77
|
||||
benchmark/threads:1/process_time/real_time_stddev +0.3974 +0.3933 0 0 0 0
|
||||
benchmark/threads:1/process_time/real_time_cv +0.6329 +0.6280 0 0 0 0
|
||||
OVERALL_GEOMEAN -0.1442 -0.1442 0 0 0 0
|
||||
```
|
||||
--------------------------------------------
|
||||
Here's a breakdown of each row:
|
||||
|
||||
**benchmark/threads:1/process_time/real_time_pvalue**: This shows the _p-value_ for
|
||||
the statistical test comparing the performance of the process running with one
|
||||
thread. A value of 0.0000 suggests a statistically significant difference in
|
||||
performance. The comparison was conducted using the U Test (Mann-Whitney
|
||||
U Test) with 27 repetitions for each case.
|
||||
|
||||
**benchmark/threads:1/process_time/real_time_mean**: This shows the relative
|
||||
difference in mean execution time between two different cases. The negative
|
||||
value (-0.1442) implies that the new process is faster by about 14.42%. The old
|
||||
time was 90 units, while the new time is 77 units.
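As a quick sanity check with the rounded values shown: `(77 - 90) / |90| ≈ -0.144`, in line with the reported relative change; the last decimal differs slightly from -0.1442 presumably because the tool computes it from the unrounded per-repetition times.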
|
||||
|
||||
**benchmark/threads:1/process_time/real_time_median**: Similarly, this shows the
|
||||
relative difference in the median execution time. Again, the new process is
|
||||
faster by 14.44%.
|
||||
|
||||
**benchmark/threads:1/process_time/real_time_stddev**: This is the relative
|
||||
difference in the standard deviation of the execution time, which is a measure
|
||||
of how much variation or dispersion there is from the mean. A positive value
|
||||
(+0.3974) implies there is more variance in the execution time in the new
|
||||
process.
|
||||
|
||||
**benchmark/threads:1/process_time/real_time_cv**: CV stands for Coefficient of
|
||||
Variation. It is the ratio of the standard deviation to the mean. It provides a
|
||||
standardized measure of dispersion. An increase (+0.6329) indicates more
|
||||
relative variability in the new process.
|
||||
|
||||
**OVERALL_GEOMEAN**: Geomean stands for geometric mean, a type of average that is
|
||||
less influenced by outliers. The negative value indicates a general improvement
|
||||
in the new process. However, given the values are all zero for the old and new
|
||||
times, this seems to be a mistake or placeholder in the output.
|
||||
|
||||
-----------------------------------------
|
||||
|
||||
|
||||
|
||||
Let's first try to see what the different columns represent in the above
|
||||
`compare.py` benchmarking output:
|
||||
|
||||
1. **Benchmark:** The name of the function being benchmarked, along with the
|
||||
size of the input (after the slash).
|
||||
|
||||
2. **Time:** The average time per operation, across all iterations.
|
||||
|
||||
3. **CPU:** The average CPU time per operation, across all iterations.
|
||||
|
||||
4. **Iterations:** The number of iterations the benchmark was run to get a
|
||||
stable estimate.
|
||||
|
||||
5. **Time Old and Time New:** These represent the average time it takes for a
|
||||
function to run in two different scenarios or versions. For example, you
|
||||
might be comparing how fast a function runs before and after you make some
|
||||
changes to it.
|
||||
|
||||
6. **CPU Old and CPU New:** These show the average amount of CPU time that the
|
||||
function uses in two different scenarios or versions. This is similar to
|
||||
Time Old and Time New, but focuses on CPU usage instead of overall time.
|
||||
|
||||
In the comparison section, the relative differences in both time and CPU time
|
||||
are displayed for each input size.
|
||||
|
||||
|
||||
A statistically-significant difference is determined by a **p-value**, which is
|
||||
a measure of the probability that the observed difference could have occurred
|
||||
just by random chance. A smaller p-value indicates stronger evidence against the
|
||||
null hypothesis.
|
||||
|
||||
**Therefore:**
|
||||
1. If the p-value is less than the chosen significance level (alpha), we
|
||||
reject the null hypothesis and conclude the benchmarks are significantly
|
||||
different.
|
||||
2. If the p-value is greater than or equal to alpha, we fail to reject the
|
||||
null hypothesis and treat the two benchmarks as similar.
|
||||
|
||||
|
||||
|
||||
The result of said statistical test is additionally communicated through color coding:
|
||||
```diff
|
||||
+ Green:
|
||||
```
|
||||
The benchmarks are _**statistically different**_. This could mean the
|
||||
performance has either **significantly improved** or **significantly
|
||||
deteriorated**. You should look at the actual performance numbers to see which
|
||||
is the case.
|
||||
```diff
|
||||
- Red:
|
||||
```
|
||||
The benchmarks are _**statistically similar**_. This means the performance
|
||||
**hasn't significantly changed**.
|
||||
|
||||
In statistical terms, **'green'** means we reject the null hypothesis that
|
||||
there's no difference in performance, and **'red'** means we fail to reject the
|
||||
null hypothesis. This might seem counter-intuitive if you're expecting 'green'
|
||||
to mean 'improved performance' and 'red' to mean 'worsened performance'.
|
||||
```bash
|
||||
But remember, in this context:
|
||||
|
||||
'Success' means 'successfully finding a difference'.
|
||||
'Failure' means 'failing to find a difference'.
|
||||
```
|
||||
|
||||
|
||||
Also, please note that **even if** we determine that there **is** a
|
||||
statistically-significant difference between the two measurements, it does not
|
||||
_necessarily_ mean that the actual benchmarks that were measured **are**
|
||||
different, or vice versa, even if we determine that there is **no**
|
||||
statistically-significant difference between the two measurements, it does not
|
||||
necessarily mean that the actual benchmarks that were measured **are not**
|
||||
different.
|
||||
|
||||
|
||||
|
||||
### U test
|
||||
|
||||
If there is a sufficient repetition count of the benchmarks, the tool can do
|
||||
|
@ -28,6 +28,8 @@
|
||||
|
||||
[Templated Benchmarks](#templated-benchmarks)
|
||||
|
||||
[Templated Benchmarks that take arguments](#templated-benchmarks-with-arguments)
|
||||
|
||||
[Fixtures](#fixtures)
|
||||
|
||||
[Custom Counters](#custom-counters)
|
||||
@ -56,7 +58,7 @@
|
||||
|
||||
[Exiting with an Error](#exiting-with-an-error)
|
||||
|
||||
[A Faster KeepRunning Loop](#a-faster-keep-running-loop)
|
||||
[A Faster `KeepRunning` Loop](#a-faster-keep-running-loop)
|
||||
|
||||
## Benchmarking Tips
|
||||
|
||||
@ -80,9 +82,9 @@ tabular data on stdout. Example tabular output looks like:
|
||||
```
|
||||
Benchmark Time(ns) CPU(ns) Iterations
|
||||
----------------------------------------------------------------------
|
||||
BM_SetInsert/1024/1 28928 29349 23853 133.097kB/s 33.2742k items/s
|
||||
BM_SetInsert/1024/8 32065 32913 21375 949.487kB/s 237.372k items/s
|
||||
BM_SetInsert/1024/10 33157 33648 21431 1.13369MB/s 290.225k items/s
|
||||
BM_SetInsert/1024/1 28928 29349 23853 133.097kiB/s 33.2742k items/s
|
||||
BM_SetInsert/1024/8 32065 32913 21375 949.487kiB/s 237.372k items/s
|
||||
BM_SetInsert/1024/10 33157 33648 21431 1.13369MiB/s 290.225k items/s
|
||||
```
|
||||
|
||||
The JSON format outputs human readable json split into two top level attributes.
|
||||
@ -165,6 +167,13 @@ line interface or by setting environment variables before execution. For every
|
||||
prevails). A complete list of CLI options is available running benchmarks
|
||||
with the `--help` switch.
|
||||
|
||||
### Dry runs
|
||||
|
||||
To confirm that benchmarks can run successfully without needing to wait for
|
||||
multiple repetitions and iterations, the `--benchmark_dry_run` flag can be
|
||||
used. This will run the benchmarks as normal, but for 1 iteration and 1
|
||||
repetition only.
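For example, running a benchmark binary with `--benchmark_dry_run` is a quick way to confirm that every registered benchmark at least executes, before committing to a full, repeated measurement run.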
|
||||
|
||||
<a name="running-a-subset-of-benchmarks" />
|
||||
|
||||
## Running a Subset of Benchmarks
|
||||
@ -271,10 +280,12 @@ information about the machine on which the benchmarks are run.
|
||||
Global setup/teardown specific to each benchmark can be done by
|
||||
passing a callback to Setup/Teardown:
|
||||
|
||||
The setup/teardown callbacks will be invoked once for each benchmark.
|
||||
If the benchmark is multi-threaded (will run in k threads), they will be invoked exactly once before
|
||||
each run with k threads.
|
||||
If the benchmark uses different size groups of threads, the above will be true for each size group.
|
||||
The setup/teardown callbacks will be invoked once for each benchmark. If the
|
||||
benchmark is multi-threaded (will run in k threads), they will be invoked
|
||||
exactly once before each run with k threads.
|
||||
|
||||
If the benchmark uses different size groups of threads, the above will be true
|
||||
for each size group.
|
||||
|
||||
Eg.,
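A minimal sketch of such a callback pair, assuming `Setup`/`Teardown` accept callbacks taking `const benchmark::State&` (the names `DoSetup`, `DoTeardown` and `BM_func` are illustrative):

```c++
#include <benchmark/benchmark.h>

static void DoSetup(const benchmark::State& state) {
  // Runs once before each benchmark run (i.e. once per thread-count group).
}

static void DoTeardown(const benchmark::State& state) {
  // Runs once after each benchmark run (i.e. once per thread-count group).
}

static void BM_func(benchmark::State& state) {
  for (auto _ : state) {
    // ... code under measurement ...
  }
}

// With two thread-count groups, DoSetup/DoTeardown each run twice in total.
BENCHMARK(BM_func)->Threads(16)->Threads(32)->Setup(DoSetup)->Teardown(DoTeardown);
```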
|
||||
|
||||
@ -346,7 +357,8 @@ the performance of `std::vector` initialization for uniformly increasing sizes.
|
||||
static void BM_DenseRange(benchmark::State& state) {
|
||||
for(auto _ : state) {
|
||||
std::vector<int> v(state.range(0), state.range(0));
|
||||
benchmark::DoNotOptimize(v.data());
|
||||
auto data = v.data();
|
||||
benchmark::DoNotOptimize(data);
|
||||
benchmark::ClobberMemory();
|
||||
}
|
||||
}
|
||||
@ -386,14 +398,17 @@ short-hand. The following macro will pick a few appropriate arguments in the
|
||||
product of the two specified ranges and will generate a benchmark for each such
|
||||
pair.
|
||||
|
||||
<!-- {% raw %} -->
|
||||
```c++
|
||||
BENCHMARK(BM_SetInsert)->Ranges({{1<<10, 8<<10}, {128, 512}});
|
||||
```
|
||||
<!-- {% endraw %} -->
|
||||
|
||||
Some benchmarks may require specific argument values that cannot be expressed
|
||||
with `Ranges`. In this case, `ArgsProduct` offers the ability to generate a
|
||||
benchmark input for each combination in the product of the supplied vectors.
|
||||
|
||||
<!-- {% raw %} -->
|
||||
```c++
|
||||
BENCHMARK(BM_SetInsert)
|
||||
->ArgsProduct({{1<<10, 3<<10, 8<<10}, {20, 40, 60, 80}})
|
||||
@ -412,6 +427,7 @@ BENCHMARK(BM_SetInsert)
|
||||
->Args({3<<10, 80})
|
||||
->Args({8<<10, 80});
|
||||
```
|
||||
<!-- {% endraw %} -->
|
||||
|
||||
For the most common scenarios, helper methods for creating a list of
|
||||
integers for a given sparse or dense range are provided.
|
||||
@ -446,7 +462,7 @@ BENCHMARK(BM_SetInsert)->Apply(CustomArguments);
|
||||
|
||||
### Passing Arbitrary Arguments to a Benchmark
|
||||
|
||||
In C++11 it is possible to define a benchmark that takes an arbitrary number
|
||||
It is possible to define a benchmark that takes an arbitrary number
|
||||
of extra arguments. The `BENCHMARK_CAPTURE(func, test_case_name, ...args)`
|
||||
macro creates a benchmark that invokes `func` with the `benchmark::State` as
|
||||
the first argument followed by the specified `args...`.
|
||||
@ -488,7 +504,8 @@ static void BM_StringCompare(benchmark::State& state) {
|
||||
std::string s1(state.range(0), '-');
|
||||
std::string s2(state.range(0), '-');
|
||||
for (auto _ : state) {
|
||||
benchmark::DoNotOptimize(s1.compare(s2));
|
||||
auto comparison_result = s1.compare(s2);
|
||||
benchmark::DoNotOptimize(comparison_result);
|
||||
}
|
||||
state.SetComplexityN(state.range(0));
|
||||
}
|
||||
@ -546,26 +563,47 @@ template <class Q> void BM_Sequential(benchmark::State& state) {
|
||||
state.SetBytesProcessed(
|
||||
static_cast<int64_t>(state.iterations())*state.range(0));
|
||||
}
|
||||
// C++03
|
||||
BENCHMARK_TEMPLATE(BM_Sequential, WaitQueue<int>)->Range(1<<0, 1<<10);
|
||||
|
||||
// C++11 or newer, you can use the BENCHMARK macro with template parameters:
|
||||
// You can use the BENCHMARK macro with template parameters:
|
||||
BENCHMARK(BM_Sequential<WaitQueue<int>>)->Range(1<<0, 1<<10);
|
||||
|
||||
// Old, legacy verbose C++03 syntax:
|
||||
BENCHMARK_TEMPLATE(BM_Sequential, WaitQueue<int>)->Range(1<<0, 1<<10);
|
||||
|
||||
```
|
||||
|
||||
Three macros are provided for adding benchmark templates.
|
||||
|
||||
```c++
|
||||
#ifdef BENCHMARK_HAS_CXX11
|
||||
#define BENCHMARK(func<...>) // Takes any number of parameters.
|
||||
#else // C++ < C++11
|
||||
#define BENCHMARK_TEMPLATE(func, arg1)
|
||||
#endif
|
||||
#define BENCHMARK_TEMPLATE1(func, arg1)
|
||||
#define BENCHMARK_TEMPLATE2(func, arg1, arg2)
|
||||
```
|
||||
|
||||
<a name="templated-benchmarks-with-arguments" />
|
||||
|
||||
## Templated Benchmarks that take arguments
|
||||
|
||||
Sometimes there is a need to template benchmarks, and provide arguments to them.
|
||||
|
||||
```c++
|
||||
template <class Q> void BM_Sequential_With_Step(benchmark::State& state, int step) {
|
||||
Q q;
|
||||
typename Q::value_type v;
|
||||
for (auto _ : state) {
|
||||
for (int i = state.range(0); i-=step; )
|
||||
q.push(v);
|
||||
for (int e = state.range(0); e-=step; )
|
||||
q.Wait(&v);
|
||||
}
|
||||
// actually messages, not bytes:
|
||||
state.SetBytesProcessed(
|
||||
static_cast<int64_t>(state.iterations())*state.range(0));
|
||||
}
|
||||
|
||||
BENCHMARK_TEMPLATE1_CAPTURE(BM_Sequential_With_Step, WaitQueue<int>, Step1, 1)->Range(1<<0, 1<<10);
|
||||
```
|
||||
|
||||
<a name="fixtures" />
|
||||
|
||||
## Fixtures
|
||||
@ -583,27 +621,29 @@ For Example:
|
||||
```c++
|
||||
class MyFixture : public benchmark::Fixture {
|
||||
public:
|
||||
void SetUp(const ::benchmark::State& state) {
|
||||
void SetUp(::benchmark::State& state) {
|
||||
}
|
||||
|
||||
void TearDown(const ::benchmark::State& state) {
|
||||
void TearDown(::benchmark::State& state) {
|
||||
}
|
||||
};
|
||||
|
||||
// Defines and registers `FooTest` using the class `MyFixture`.
|
||||
BENCHMARK_F(MyFixture, FooTest)(benchmark::State& st) {
|
||||
for (auto _ : st) {
|
||||
...
|
||||
}
|
||||
}
|
||||
|
||||
// Only defines `BarTest` using the class `MyFixture`.
|
||||
BENCHMARK_DEFINE_F(MyFixture, BarTest)(benchmark::State& st) {
|
||||
for (auto _ : st) {
|
||||
...
|
||||
}
|
||||
}
|
||||
/* BarTest is NOT registered */
|
||||
// `BarTest` is NOT registered.
|
||||
BENCHMARK_REGISTER_F(MyFixture, BarTest)->Threads(2);
|
||||
/* BarTest is now registered */
|
||||
// `BarTest` is now registered.
|
||||
```
|
||||
|
||||
### Templated Fixtures
|
||||
@ -619,19 +659,70 @@ For example:
|
||||
template<typename T>
|
||||
class MyFixture : public benchmark::Fixture {};
|
||||
|
||||
// Defines and registers `IntTest` using the class template `MyFixture<int>`.
|
||||
BENCHMARK_TEMPLATE_F(MyFixture, IntTest, int)(benchmark::State& st) {
|
||||
for (auto _ : st) {
|
||||
...
|
||||
}
|
||||
}
|
||||
|
||||
// Only defines `DoubleTest` using the class template `MyFixture<double>`.
|
||||
BENCHMARK_TEMPLATE_DEFINE_F(MyFixture, DoubleTest, double)(benchmark::State& st) {
|
||||
for (auto _ : st) {
|
||||
...
|
||||
}
|
||||
}
|
||||
|
||||
// `DoubleTest` is NOT registered.
|
||||
BENCHMARK_REGISTER_F(MyFixture, DoubleTest)->Threads(2);
|
||||
// `DoubleTest` is now registered.
|
||||
```
|
||||
|
||||
If you want to use a method template for your fixtures,
|
||||
which you instantiate afterward, use the following macros:
|
||||
|
||||
* `BENCHMARK_TEMPLATE_METHOD_F(ClassName, Method)`
|
||||
* `BENCHMARK_TEMPLATE_INSTANTIATE_F(ClassName, Method, ...)`
|
||||
|
||||
With these macros you can define one method for several instantiations.
|
||||
Example (using `MyFixture` from above):
|
||||
|
||||
```c++
|
||||
// Defines `Test` using the class template `MyFixture`.
|
||||
BENCHMARK_TEMPLATE_METHOD_F(MyFixture, Test)(benchmark::State& st) {
|
||||
for (auto _ : st) {
|
||||
...
|
||||
}
|
||||
}
|
||||
|
||||
// Instantiates and registers the benchmark `MyFixture<int>::Test`.
|
||||
BENCHMARK_TEMPLATE_INSTANTIATE_F(MyFixture, Test, int)->Threads(2);
|
||||
// Instantiates and registers the benchmark `MyFixture<double>::Test`.
|
||||
BENCHMARK_TEMPLATE_INSTANTIATE_F(MyFixture, Test, double)->Threads(4);
|
||||
```
|
||||
|
||||
Inside the method definition of `BENCHMARK_TEMPLATE_METHOD_F` the type `Base` refers
|
||||
to the type of the instantiated fixture.
|
||||
Accesses to members of the fixture must be prefixed by `this->`.
|
||||
|
||||
`BENCHMARK_TEMPLATE_METHOD_F` and `BENCHMARK_TEMPLATE_INSTANTIATE_F` can only be used
if the fixture does not use non-type template parameters.
|
||||
If you want to pass values as template parameters, use e.g. `std::integral_constant`.
|
||||
For example:
|
||||
|
||||
```c++
|
||||
template<typename Sz>
|
||||
class SizedFixture : public benchmark::Fixture {
|
||||
static constexpr auto Size = Sz::value;
|
||||
int myValue;
|
||||
};
|
||||
|
||||
BENCHMARK_TEMPLATE_METHOD_F(SizedFixture, Test)(benchmark::State& st) {
|
||||
for (auto _ : st) {
|
||||
this->myValue = Base::Size;
|
||||
}
|
||||
}
|
||||
|
||||
BENCHMARK_TEMPLATE_INSTANTIATE_F(SizedFixture, Test, std::integral_constant<int, 5>)->Threads(2);
|
||||
```
|
||||
|
||||
<a name="custom-counters" />
|
||||
@ -694,17 +785,17 @@ is 1k a 1000 (default, `benchmark::Counter::OneK::kIs1000`), or 1024
|
||||
state.counters["BytesProcessed"] = Counter(state.range(0), benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::OneK::kIs1024);
|
||||
```
|
||||
|
||||
When you're compiling in C++11 mode or later you can use `insert()` with
|
||||
`std::initializer_list`:
|
||||
You can use `insert()` with `std::initializer_list`:
|
||||
|
||||
<!-- {% raw %} -->
|
||||
```c++
|
||||
// With C++11, this can be done:
|
||||
state.counters.insert({{"Foo", numFoos}, {"Bar", numBars}, {"Baz", numBazs}});
|
||||
// ... instead of:
|
||||
state.counters["Foo"] = numFoos;
|
||||
state.counters["Bar"] = numBars;
|
||||
state.counters["Baz"] = numBazs;
|
||||
```
|
||||
<!-- {% endraw %} -->
|
||||
|
||||
### Counter Reporting
|
||||
|
||||
@ -820,6 +911,46 @@ BENCHMARK(BM_test)->Range(8, 8<<10)->UseRealTime();
|
||||
|
||||
Without `UseRealTime`, CPU time is used by default.
|
||||
|
||||
### Manual Multithreaded Benchmarks
|
||||
|
||||
Google/benchmark uses `std::thread` as its multithreading environment by default.
|
||||
If you want to use another multithreading environment (e.g. OpenMP), you can provide
|
||||
a factory function to your benchmark using the `ThreadRunner` function.
|
||||
The factory function takes the number of threads as argument and creates a custom class
|
||||
derived from `benchmark::ThreadRunnerBase`.
|
||||
This custom class must override the function
|
||||
`void RunThreads(const std::function<void(int)>& fn)`.
|
||||
`RunThreads` is called by the main thread and spawns the requested number of threads.
|
||||
Each spawned thread must call `fn(thread_index)`, where `thread_index` is its own
|
||||
thread index. Before `RunThreads` returns, all spawned threads must be joined.
|
||||
```c++
|
||||
class OpenMPThreadRunner : public benchmark::ThreadRunnerBase
|
||||
{
|
||||
OpenMPThreadRunner(int num_threads)
|
||||
: num_threads_(num_threads)
|
||||
{}
|
||||
|
||||
void RunThreads(const std::function<void(int)>& fn) final
|
||||
{
|
||||
#pragma omp parallel num_threads(num_threads_)
|
||||
fn(omp_get_thread_num());
|
||||
}
|
||||
|
||||
private:
|
||||
int num_threads_;
|
||||
};
|
||||
|
||||
BENCHMARK(BM_MultiThreaded)
|
||||
->ThreadRunner([](int num_threads) {
|
||||
return std::make_unique<OpenMPThreadRunner>(num_threads);
|
||||
})
|
||||
->Threads(1)->Threads(2)->Threads(4);
|
||||
```
|
||||
The above example creates a parallel OpenMP region before it enters `BM_MultiThreaded`.
|
||||
The actual benchmark code can remain the same and is therefore not tied to a specific
|
||||
thread runner. The measurement does not include the time for creating and joining the
|
||||
threads.
|
||||
|
||||
<a name="cpu-timers" />
|
||||
|
||||
## CPU Timers
|
||||
@ -851,7 +982,7 @@ BENCHMARK(BM_OpenMP)->Range(8, 8<<10);
|
||||
|
||||
// Measure the user-visible time, the wall clock (literally, the time that
|
||||
// has passed on the clock on the wall), use it to decide for how long to
|
||||
// run the benchmark loop. This will always be meaningful, an will match the
|
||||
// run the benchmark loop. This will always be meaningful, and will match the
|
||||
// time spent by the main thread in single-threaded case, in general decreasing
|
||||
// with the number of internal threads doing the work.
|
||||
BENCHMARK(BM_OpenMP)->Range(8, 8<<10)->UseRealTime();
|
||||
@ -873,6 +1004,7 @@ is measured. But sometimes, it is necessary to do some work inside of
|
||||
that loop, every iteration, but without counting that time to the benchmark time.
|
||||
That is possible, although it is not recommended, since it has high overhead.
|
||||
|
||||
<!-- {% raw %} -->
|
||||
```c++
|
||||
static void BM_SetInsert_With_Timer_Control(benchmark::State& state) {
|
||||
std::set<int> data;
|
||||
@ -887,6 +1019,7 @@ static void BM_SetInsert_With_Timer_Control(benchmark::State& state) {
|
||||
}
|
||||
BENCHMARK(BM_SetInsert_With_Timer_Control)->Ranges({{1<<10, 8<<10}, {128, 512}});
|
||||
```
|
||||
<!-- {% endraw %} -->
|
||||
|
||||
<a name="manual-timing" />
|
||||
|
||||
@ -974,11 +1107,11 @@ in any way. `<expr>` may even be removed entirely when the result is already
|
||||
known. For example:
|
||||
|
||||
```c++
|
||||
/* Example 1: `<expr>` is removed entirely. */
|
||||
// Example 1: `<expr>` is removed entirely.
|
||||
int foo(int x) { return x + 42; }
|
||||
while (...) DoNotOptimize(foo(0)); // Optimized to DoNotOptimize(42);
|
||||
|
||||
/* Example 2: Result of '<expr>' is only reused */
|
||||
// Example 2: Result of '<expr>' is only reused.
|
||||
int bar(int) __attribute__((const));
|
||||
while (...) DoNotOptimize(bar(0)); // Optimized to:
|
||||
// int __result__ = bar(0);
|
||||
@ -997,7 +1130,8 @@ static void BM_vector_push_back(benchmark::State& state) {
|
||||
for (auto _ : state) {
|
||||
std::vector<int> v;
|
||||
v.reserve(1);
|
||||
benchmark::DoNotOptimize(v.data()); // Allow v.data() to be clobbered.
|
||||
auto data = v.data(); // Allow v.data() to be clobbered. Pass as non-const
|
||||
benchmark::DoNotOptimize(data); // lvalue to avoid undesired compiler optimizations
|
||||
v.push_back(42);
|
||||
benchmark::ClobberMemory(); // Force 42 to be written to memory.
|
||||
}
|
||||
@ -1055,6 +1189,7 @@ void BM_spin_empty(benchmark::State& state) {
|
||||
}
|
||||
|
||||
BENCHMARK(BM_spin_empty)
|
||||
->Repetitions(3) // or add option --benchmark_repetitions=3
|
||||
->ComputeStatistics("max", [](const std::vector<double>& v) -> double {
|
||||
return *(std::max_element(std::begin(v), std::end(v)));
|
||||
})
|
||||
@ -1074,8 +1209,9 @@ void BM_spin_empty(benchmark::State& state) {
|
||||
}
|
||||
|
||||
BENCHMARK(BM_spin_empty)
|
||||
->Repetitions(3) // or add option --benchmark_repetitions=3
|
||||
->ComputeStatistics("ratio", [](const std::vector<double>& v) -> double {
|
||||
return std::begin(v) / std::end(v);
|
||||
return v.front() / v.back();
|
||||
}, benchmark::StatisticUnit::kPercentage)
|
||||
->Arg(512);
|
||||
```
|
||||
@ -1095,6 +1231,21 @@ a report on the number of allocations, bytes used, etc.
|
||||
This data will then be reported alongside other performance data, currently
|
||||
only when using JSON output.
|
||||
|
||||
<a name="profiling" />
|
||||
|
||||
## Profiling
|
||||
|
||||
It's often useful to also profile benchmarks in particular ways, in addition to
|
||||
CPU performance. For this reason, benchmark offers the `RegisterProfilerManager`
|
||||
method that allows a custom `ProfilerManager` to be injected.
|
||||
|
||||
If set, the `ProfilerManager::AfterSetupStart` and
|
||||
`ProfilerManager::BeforeTeardownStop` methods will be called at the start and
|
||||
end of a separate benchmark run to allow user code to collect and report
|
||||
user-provided profile metrics.
|
||||
|
||||
Output collected from this profiling run must be reported separately.
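A minimal sketch of wiring this up, assuming `benchmark::ProfilerManager` exposes `AfterSetupStart()`/`BeforeTeardownStop()` as virtual hooks and `RegisterProfilerManager` takes a pointer to a long-lived instance (the class name `MyProfilerManager` is illustrative):

```c++
#include <benchmark/benchmark.h>

#include <iostream>

// Illustrative profiler hook: a real implementation would start/stop an
// external profiler or collect custom metrics and report them elsewhere.
class MyProfilerManager : public benchmark::ProfilerManager {
  void AfterSetupStart() override { std::cerr << "profile run: start\n"; }
  void BeforeTeardownStop() override { std::cerr << "profile run: stop\n"; }
};

int main(int argc, char** argv) {
  static MyProfilerManager profiler;  // must outlive the benchmark run
  benchmark::RegisterProfilerManager(&profiler);
  benchmark::Initialize(&argc, argv);
  benchmark::RunSpecifiedBenchmarks();
  benchmark::Shutdown();
  return 0;
}
```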
|
||||
|
||||
<a name="using-register-benchmark" />
|
||||
|
||||
## Using RegisterBenchmark(name, fn, args...)
|
||||
@ -1131,7 +1282,7 @@ int main(int argc, char** argv) {
|
||||
|
||||
When errors caused by external influences, such as file I/O and network
|
||||
communication, occur within a benchmark the
|
||||
`State::SkipWithError(const char* msg)` function can be used to skip that run
|
||||
`State::SkipWithError(const std::string& msg)` function can be used to skip that run
|
||||
of benchmark and report the error. Note that only future iterations of the
|
||||
`KeepRunning()` are skipped. For the ranged-for version of the benchmark loop
|
||||
Users must explicitly exit the loop, otherwise all iterations will be performed.
|
||||
@ -1181,7 +1332,7 @@ static void BM_test_ranged_fo(benchmark::State & state) {
|
||||
|
||||
## A Faster KeepRunning Loop
|
||||
|
||||
In C++11 mode, a ranged-based for loop should be used in preference to
|
||||
A ranged-based for loop should be used in preference to
|
||||
the `KeepRunning` loop for running the benchmarks. For example:
|
||||
|
||||
```c++
|
||||
@ -1242,7 +1393,8 @@ the benchmark loop should be preferred.
|
||||
If you see this error:
|
||||
|
||||
```
|
||||
***WARNING*** CPU scaling is enabled, the benchmark real time measurements may be noisy and will incur extra overhead.
|
||||
***WARNING*** CPU scaling is enabled, the benchmark real time measurements may
|
||||
be noisy and will incur extra overhead.
|
||||
```
|
||||
|
||||
you might want to disable the CPU frequency scaling while running the
|
||||
|
File diff suppressed because it is too large
78
pyproject.toml
Normal file
@ -0,0 +1,78 @@
|
||||
[build-system]
|
||||
requires = ["setuptools"]
|
||||
build-backend = "setuptools.build_meta"
|
||||
|
||||
[project]
|
||||
name = "google_benchmark"
|
||||
description = "A library to benchmark code snippets."
|
||||
requires-python = ">=3.10"
|
||||
license = { file = "LICENSE" }
|
||||
keywords = ["benchmark"]
|
||||
|
||||
authors = [{ name = "Google", email = "benchmark-discuss@googlegroups.com" }]
|
||||
|
||||
classifiers = [
|
||||
"Development Status :: 4 - Beta",
|
||||
"Intended Audience :: Developers",
|
||||
"Intended Audience :: Science/Research",
|
||||
"License :: OSI Approved :: Apache Software License",
|
||||
"Programming Language :: Python :: 3.10",
|
||||
"Programming Language :: Python :: 3.11",
|
||||
"Programming Language :: Python :: 3.12",
|
||||
"Topic :: Software Development :: Testing",
|
||||
"Topic :: System :: Benchmark",
|
||||
]
|
||||
|
||||
dynamic = ["readme", "version"]
|
||||
|
||||
dependencies = ["absl-py>=0.7.1"]
|
||||
|
||||
[project.optional-dependencies]
|
||||
dev = ["pre-commit>=3.3.3"]
|
||||
|
||||
[project.urls]
|
||||
Homepage = "https://github.com/google/benchmark"
|
||||
Documentation = "https://github.com/google/benchmark/tree/main/docs"
|
||||
Repository = "https://github.com/google/benchmark.git"
|
||||
Discord = "https://discord.gg/cz7UX7wKC2"
|
||||
|
||||
[tool.setuptools]
|
||||
package-dir = { "" = "bindings/python" }
|
||||
zip-safe = false
|
||||
|
||||
[tool.setuptools.packages.find]
|
||||
where = ["bindings/python"]
|
||||
|
||||
[tool.setuptools.dynamic]
|
||||
readme = { file = "README.md", content-type = "text/markdown" }
|
||||
version = { attr = "google_benchmark.__version__" }
|
||||
|
||||
[tool.mypy]
|
||||
check_untyped_defs = true
|
||||
disallow_incomplete_defs = true
|
||||
pretty = true
|
||||
python_version = "3.11"
|
||||
strict_optional = false
|
||||
warn_unreachable = true
|
||||
|
||||
[[tool.mypy.overrides]]
|
||||
module = ["yaml"]
|
||||
ignore_missing_imports = true
|
||||
|
||||
[tool.ruff]
|
||||
# explicitly tell ruff the source directory to correctly identify first-party package.
|
||||
src = ["bindings/python"]
|
||||
|
||||
line-length = 80
|
||||
target-version = "py311"
|
||||
|
||||
[tool.ruff.lint]
|
||||
# Enable pycodestyle (`E`, `W`), Pyflakes (`F`), and isort (`I`) codes by default.
|
||||
select = ["ASYNC", "B", "C4", "C90", "E", "F", "I", "PERF", "PIE", "PT018", "RUF", "SIM", "UP", "W"]
|
||||
ignore = [
|
||||
"PLW2901", # redefined-loop-name
|
||||
"UP031", # printf-string-formatting
|
||||
]
|
||||
|
||||
[tool.ruff.lint.isort]
|
||||
combine-as-imports = true
|
@ -1,2 +0,0 @@
|
||||
numpy == 1.22
|
||||
scipy == 1.5.4
|
212
setup.py
@ -1,60 +1,75 @@
|
||||
import contextlib
|
||||
import os
|
||||
import posixpath
|
||||
import platform
|
||||
import re
|
||||
import shutil
|
||||
import sys
|
||||
from collections.abc import Generator
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
from distutils import sysconfig
|
||||
import setuptools
|
||||
from setuptools.command import build_ext
|
||||
|
||||
IS_WINDOWS = platform.system() == "Windows"
|
||||
IS_MAC = platform.system() == "Darwin"
|
||||
IS_LINUX = platform.system() == "Linux"
|
||||
|
||||
HERE = os.path.dirname(os.path.abspath(__file__))
|
||||
# hardcoded SABI-related options. Requires that each Python interpreter
|
||||
# (hermetic or not) participating is of the same major-minor version.
|
||||
py_limited_api = sys.version_info >= (3, 12)
|
||||
options = {"bdist_wheel": {"py_limited_api": "cp312"}} if py_limited_api else {}
|
||||
|
||||
|
||||
IS_WINDOWS = sys.platform.startswith("win")
|
||||
def is_cibuildwheel() -> bool:
|
||||
return os.getenv("CIBUILDWHEEL") is not None
|
||||
|
||||
|
||||
with open("README.md", "r", encoding="utf-8") as fp:
|
||||
long_description = fp.read()
|
||||
@contextlib.contextmanager
|
||||
def _maybe_patch_toolchains() -> Generator[None, None, None]:
|
||||
"""
|
||||
Patch rules_python toolchains to ignore root user error
|
||||
when run in a Docker container on Linux in cibuildwheel.
|
||||
"""
|
||||
|
||||
|
||||
def _get_version():
|
||||
"""Parse the version string from __init__.py."""
|
||||
with open(
|
||||
os.path.join(HERE, "bindings", "python", "google_benchmark", "__init__.py")
|
||||
) as init_file:
|
||||
try:
|
||||
version_line = next(
|
||||
line for line in init_file if line.startswith("__version__")
|
||||
)
|
||||
except StopIteration:
|
||||
raise ValueError("__version__ not defined in __init__.py")
|
||||
def fmt_toolchain_args(matchobj):
|
||||
suffix = "ignore_root_user_error = True"
|
||||
callargs = matchobj.group(1)
|
||||
# toolchain def is broken over multiple lines
|
||||
if callargs.endswith("\n"):
|
||||
callargs = callargs + " " + suffix + ",\n"
|
||||
# toolchain def is on one line.
|
||||
else:
|
||||
namespace = {}
|
||||
exec(version_line, namespace) # pylint: disable=exec-used
|
||||
return namespace["__version__"]
|
||||
callargs = callargs + ", " + suffix
|
||||
return "python.toolchain(" + callargs + ")"
|
||||
|
||||
|
||||
def _parse_requirements(path):
|
||||
with open(os.path.join(HERE, path)) as requirements:
|
||||
return [
|
||||
line.rstrip()
|
||||
for line in requirements
|
||||
if not (line.isspace() or line.startswith("#"))
|
||||
]
|
||||
CIBW_LINUX = is_cibuildwheel() and IS_LINUX
|
||||
module_bazel = Path("MODULE.bazel")
|
||||
content: str = module_bazel.read_text()
|
||||
try:
|
||||
if CIBW_LINUX:
|
||||
module_bazel.write_text(
|
||||
re.sub(
|
||||
r"python.toolchain\(([\w\"\s,.=]*)\)",
|
||||
fmt_toolchain_args,
|
||||
content,
|
||||
)
|
||||
)
|
||||
yield
|
||||
finally:
|
||||
if CIBW_LINUX:
|
||||
module_bazel.write_text(content)
|
||||
|
||||
|
||||
class BazelExtension(setuptools.Extension):
|
||||
"""A C/C++ extension that is defined as a Bazel BUILD target."""
|
||||
|
||||
def __init__(self, name, bazel_target):
|
||||
def __init__(self, name: str, bazel_target: str, **kwargs: Any):
|
||||
super().__init__(name=name, sources=[], **kwargs)
|
||||
|
||||
self.bazel_target = bazel_target
|
||||
self.relpath, self.target_name = posixpath.relpath(bazel_target, "//").split(
|
||||
":"
|
||||
)
|
||||
setuptools.Extension.__init__(self, name, sources=[])
|
||||
stripped_target = bazel_target.split("//")[-1]
|
||||
self.relpath, self.target_name = stripped_target.split(":")
|
||||
|
||||
|
||||
class BuildBazelExtension(build_ext.build_ext):
|
||||
@ -63,98 +78,89 @@ class BuildBazelExtension(build_ext.build_ext):
|
||||
def run(self):
|
||||
for ext in self.extensions:
|
||||
self.bazel_build(ext)
|
||||
build_ext.build_ext.run(self)
|
||||
# explicitly call `bazel shutdown` for graceful exit
|
||||
self.spawn(["bazel", "shutdown"])
|
||||
|
||||
def bazel_build(self, ext):
|
||||
def copy_extensions_to_source(self):
|
||||
"""
|
||||
Copy generated extensions into the source tree.
|
||||
This is done in the ``bazel_build`` method, so it's not necessary to
|
||||
do again in the `build_ext` base class.
|
||||
"""
|
||||
|
||||
def bazel_build(self, ext: BazelExtension) -> None: # noqa: C901
|
||||
"""Runs the bazel build to create the package."""
|
||||
with open("WORKSPACE", "r") as workspace:
|
||||
workspace_contents = workspace.read()
|
||||
temp_path = Path(self.build_temp)
|
||||
|
||||
with open("WORKSPACE", "w") as workspace:
|
||||
workspace.write(
|
||||
re.sub(
|
||||
r'(?<=path = ").*(?=", # May be overwritten by setup\.py\.)',
|
||||
sysconfig.get_python_inc().replace(os.path.sep, posixpath.sep),
|
||||
workspace_contents,
|
||||
)
|
||||
)
|
||||
|
||||
if not os.path.exists(self.build_temp):
|
||||
os.makedirs(self.build_temp)
|
||||
# We round to the minor version, which makes rules_python
|
||||
# look up the latest available patch version internally.
|
||||
python_version = "{}.{}".format(*sys.version_info[:2])
|
||||
|
||||
bazel_argv = [
|
||||
"bazel",
|
||||
"build",
|
||||
"run",
|
||||
ext.bazel_target,
|
||||
"--symlink_prefix=" + os.path.join(self.build_temp, "bazel-"),
|
||||
"--compilation_mode=" + ("dbg" if self.debug else "opt"),
|
||||
f"--symlink_prefix={temp_path / 'bazel-'}",
|
||||
f"--compilation_mode={'dbg' if self.debug else 'opt'}",
|
||||
# C++17 is required by nanobind
|
||||
f"--cxxopt={'/std:c++17' if IS_WINDOWS else '-std=c++17'}",
|
||||
f"--@rules_python//python/config_settings:python_version={python_version}",
|
||||
]
|
||||
|
||||
if ext.py_limited_api:
|
||||
bazel_argv += ["--@nanobind_bazel//:py-limited-api=cp312"]
|
||||
|
||||
if IS_WINDOWS:
|
||||
# Link with python*.lib.
|
||||
for library_dir in self.library_dirs:
|
||||
bazel_argv.append("--linkopt=/LIBPATH:" + library_dir)
|
||||
elif sys.platform == "darwin" and platform.machine() == "x86_64":
|
||||
bazel_argv.append("--macos_minimum_os=10.9")
|
||||
elif IS_MAC:
|
||||
# C++17 needs macOS 10.14 at minimum
|
||||
bazel_argv.append("--macos_minimum_os=10.14")
|
||||
|
||||
# ARCHFLAGS is always set by cibuildwheel before macOS wheel builds.
|
||||
archflags = os.getenv("ARCHFLAGS", "")
|
||||
if "arm64" in archflags:
|
||||
bazel_argv.append("--cpu=darwin_arm64")
|
||||
bazel_argv.append("--macos_cpus=arm64")
|
||||
with _maybe_patch_toolchains():
|
||||
self.spawn(bazel_argv)
|
||||
|
||||
self.spawn(bazel_argv)
|
||||
if IS_WINDOWS:
|
||||
suffix = ".pyd"
|
||||
else:
|
||||
suffix = ".abi3.so" if ext.py_limited_api else ".so"
|
||||
|
||||
shared_lib_suffix = '.dll' if IS_WINDOWS else '.so'
|
||||
ext_bazel_bin_path = os.path.join(
|
||||
self.build_temp, 'bazel-bin',
|
||||
ext.relpath, ext.target_name + shared_lib_suffix)
|
||||
# copy the Bazel build artifacts into setuptools' libdir,
|
||||
# from where the wheel is built.
|
||||
pkgname = "google_benchmark"
|
||||
pythonroot = Path("bindings") / "python" / "google_benchmark"
|
||||
srcdir = temp_path / "bazel-bin" / pythonroot
|
||||
libdir = Path(self.build_lib) / pkgname
|
||||
for root, dirs, files in os.walk(srcdir, topdown=True):
|
||||
# exclude runfiles directories and children.
|
||||
dirs[:] = [d for d in dirs if "runfiles" not in d]
|
||||
|
||||
ext_dest_path = self.get_ext_fullpath(ext.name)
|
||||
ext_dest_dir = os.path.dirname(ext_dest_path)
|
||||
if not os.path.exists(ext_dest_dir):
|
||||
os.makedirs(ext_dest_dir)
|
||||
shutil.copyfile(ext_bazel_bin_path, ext_dest_path)
|
||||
for f in files:
|
||||
fp = Path(f)
|
||||
should_copy = False
|
||||
# we do not want the bare .so file included
|
||||
# when building for ABI3, so we require a
|
||||
# full and exact match on the file extension.
|
||||
if "".join(fp.suffixes) == suffix or fp.suffix == ".pyi":
|
||||
should_copy = True
|
||||
elif Path(root) == srcdir and f == "py.typed":
|
||||
# copy py.typed, but only at the package root.
|
||||
should_copy = True
|
||||
|
||||
# explicitly call `bazel shutdown` for graceful exit
|
||||
self.spawn(["bazel", "shutdown"])
|
||||
if should_copy:
|
||||
shutil.copyfile(root / fp, libdir / fp)
|
||||
|
||||
|
||||
setuptools.setup(
|
||||
name="google_benchmark",
|
||||
version=_get_version(),
|
||||
url="https://github.com/google/benchmark",
|
||||
description="A library to benchmark code snippets.",
|
||||
long_description=long_description,
|
||||
long_description_content_type="text/markdown",
|
||||
author="Google",
|
||||
author_email="benchmark-py@google.com",
|
||||
# Contained modules and scripts.
|
||||
package_dir={"": "bindings/python"},
|
||||
packages=setuptools.find_packages("bindings/python"),
|
||||
install_requires=_parse_requirements("bindings/python/requirements.txt"),
|
||||
cmdclass=dict(build_ext=BuildBazelExtension),
|
||||
cmdclass={"build_ext": BuildBazelExtension},
|
||||
package_data={"google_benchmark": ["py.typed", "*.pyi"]},
|
||||
ext_modules=[
|
||||
BazelExtension(
|
||||
"google_benchmark._benchmark",
|
||||
"//bindings/python/google_benchmark:_benchmark",
|
||||
name="google_benchmark._benchmark",
|
||||
bazel_target="//bindings/python/google_benchmark:benchmark_stubgen",
|
||||
py_limited_api=py_limited_api,
|
||||
)
|
||||
],
|
||||
zip_safe=False,
|
||||
# PyPI package information.
|
||||
classifiers=[
|
||||
"Development Status :: 4 - Beta",
|
||||
"Intended Audience :: Developers",
|
||||
"Intended Audience :: Science/Research",
|
||||
"License :: OSI Approved :: Apache Software License",
|
||||
"Programming Language :: Python :: 3.7",
|
||||
"Programming Language :: Python :: 3.8",
|
||||
"Programming Language :: Python :: 3.9",
|
||||
"Programming Language :: Python :: 3.10",
|
||||
"Programming Language :: Python :: 3.11",
|
||||
"Topic :: Software Development :: Testing",
|
||||
"Topic :: System :: Benchmark",
|
||||
],
|
||||
license="Apache 2.0",
|
||||
keywords="benchmark",
|
||||
options=options,
|
||||
)
|
||||
|
@ -1,4 +1,4 @@
|
||||
# Allow the source files to find headers in src/
|
||||
#Allow the source files to find headers in src /
|
||||
include(GNUInstallDirs)
|
||||
include_directories(${PROJECT_SOURCE_DIR}/src)
|
||||
|
||||
@ -28,10 +28,25 @@ target_include_directories(benchmark PUBLIC
|
||||
$<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}/include>
|
||||
)
|
||||
|
||||
set_property(
|
||||
SOURCE benchmark.cc
|
||||
APPEND
|
||||
PROPERTY COMPILE_DEFINITIONS
|
||||
BENCHMARK_VERSION="${VERSION}"
|
||||
)
|
||||
|
||||
# libpfm, if available
|
||||
if (HAVE_LIBPFM)
|
||||
target_link_libraries(benchmark PRIVATE pfm)
|
||||
if (PFM_FOUND)
|
||||
target_link_libraries(benchmark PRIVATE PFM::libpfm)
|
||||
target_compile_definitions(benchmark PRIVATE -DHAVE_LIBPFM)
|
||||
install(
|
||||
FILES "${PROJECT_SOURCE_DIR}/cmake/Modules/FindPFM.cmake"
|
||||
DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake/${PROJECT_NAME}")
|
||||
endif()
|
||||
|
||||
# pthread affinity, if available
|
||||
if(HAVE_PTHREAD_AFFINITY)
|
||||
target_compile_definitions(benchmark PRIVATE -DBENCHMARK_HAS_PTHREAD_AFFINITY)
|
||||
endif()
|
||||
|
||||
# Link threads.
|
||||
@ -52,6 +67,7 @@ endif()
|
||||
# We need extra libraries on Solaris
|
||||
if(${CMAKE_SYSTEM_NAME} MATCHES "SunOS")
|
||||
target_link_libraries(benchmark PRIVATE kstat)
|
||||
set(BENCHMARK_PRIVATE_LINK_LIBRARIES -lkstat)
|
||||
endif()
|
||||
|
||||
if (NOT BUILD_SHARED_LIBS)
|
||||
@ -74,6 +90,7 @@ set(generated_dir "${PROJECT_BINARY_DIR}")
|
||||
set(version_config "${generated_dir}/${PROJECT_NAME}ConfigVersion.cmake")
|
||||
set(project_config "${generated_dir}/${PROJECT_NAME}Config.cmake")
|
||||
set(pkg_config "${generated_dir}/${PROJECT_NAME}.pc")
|
||||
set(pkg_config_main "${generated_dir}/${PROJECT_NAME}_main.pc")
|
||||
set(targets_to_export benchmark benchmark_main)
|
||||
set(targets_export_name "${PROJECT_NAME}Targets")
|
||||
|
||||
@ -93,6 +110,7 @@ write_basic_package_version_file(
|
||||
)
|
||||
|
||||
configure_file("${PROJECT_SOURCE_DIR}/cmake/benchmark.pc.in" "${pkg_config}" @ONLY)
|
||||
configure_file("${PROJECT_SOURCE_DIR}/cmake/benchmark_main.pc.in" "${pkg_config_main}" @ONLY)
|
||||
|
||||
export (
|
||||
TARGETS ${targets_to_export}
|
||||
@ -121,7 +139,7 @@ if (BENCHMARK_ENABLE_INSTALL)
|
||||
DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake/${PROJECT_NAME}")
|
||||
|
||||
install(
|
||||
FILES "${pkg_config}"
|
||||
FILES "${pkg_config}" "${pkg_config_main}"
|
||||
DESTINATION "${CMAKE_INSTALL_LIBDIR}/pkgconfig")
|
||||
|
||||
install(
|
||||
|
327
src/benchmark.cc
@ -46,7 +46,6 @@
|
||||
#include "commandlineflags.h"
|
||||
#include "complexity.h"
|
||||
#include "counter.h"
|
||||
#include "internal_macros.h"
|
||||
#include "log.h"
|
||||
#include "mutex.h"
|
||||
#include "perf_counters.h"
|
||||
@ -65,16 +64,25 @@ BM_DEFINE_bool(benchmark_list_tests, false);
|
||||
// linked into the binary are run.
|
||||
BM_DEFINE_string(benchmark_filter, "");
|
||||
|
||||
// Minimum number of seconds we should run benchmark before results are
|
||||
// considered significant. For cpu-time based tests, this is the lower bound
|
||||
// Specification of how long to run the benchmark.
|
||||
//
|
||||
// It can be either an exact number of iterations (specified as `<integer>x`),
|
||||
// or a minimum number of seconds (specified as `<float>s`). If the latter
|
||||
// format (ie., min seconds) is used, the system may run the benchmark longer
|
||||
// until the results are considered significant.
|
||||
//
|
||||
// For backward compatibility, the `s` suffix may be omitted, in which case,
|
||||
// the specified number is interpreted as the number of seconds.
|
||||
//
|
||||
// For cpu-time based tests, this is the lower bound
|
||||
// on the total cpu time used by all threads that make up the test. For
|
||||
// real-time based tests, this is the lower bound on the elapsed time of the
|
||||
// benchmark execution, regardless of number of threads.
|
||||
BM_DEFINE_double(benchmark_min_time, 0.5);
|
||||
BM_DEFINE_string(benchmark_min_time, kDefaultMinTimeStr);
|
||||
|
||||
// Minimum number of seconds a benchmark should be run before results should be
|
||||
// taken into account. This e.g can be neccessary for benchmarks of code which
|
||||
// needs to fill some form of cache before performance is of interrest.
|
||||
// taken into account. This e.g can be necessary for benchmarks of code which
|
||||
// needs to fill some form of cache before performance is of interest.
|
||||
// Note: results gathered within this period are discarded and not used for
|
||||
// reported result.
|
||||
BM_DEFINE_double(benchmark_min_warmup_time, 0.0);
|
||||
@ -83,6 +91,11 @@ BM_DEFINE_double(benchmark_min_warmup_time, 0.0);
|
||||
// standard deviation of the runs will be reported.
|
||||
BM_DEFINE_int32(benchmark_repetitions, 1);
|
||||
|
||||
// If enabled, forces each benchmark to execute exactly one iteration and one
|
||||
// repetition, bypassing any configured
|
||||
// MinTime()/MinWarmUpTime()/Iterations()/Repetitions()
|
||||
BM_DEFINE_bool(benchmark_dry_run, false);
|
||||
|
||||
// If set, enable random interleaving of repetitions of all benchmarks.
|
||||
// See http://github.com/google/benchmark/issues/1051 for details.
|
||||
BM_DEFINE_bool(benchmark_enable_random_interleaving, false);
|
||||
@ -137,38 +150,64 @@ BM_DEFINE_int32(v, 0);
|
||||
|
||||
namespace internal {
|
||||
|
||||
// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
|
||||
std::map<std::string, std::string>* global_context = nullptr;
|
||||
|
||||
BENCHMARK_EXPORT std::map<std::string, std::string>*& GetGlobalContext() {
|
||||
return global_context;
|
||||
}
|
||||
|
||||
// FIXME: wouldn't LTO mess this up?
|
||||
void UseCharPointer(char const volatile*) {}
|
||||
namespace {
|
||||
// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
|
||||
void const volatile* volatile global_force_escape_pointer;
|
||||
} // namespace
|
||||
|
||||
// FIXME: Verify if LTO still messes this up?
|
||||
void UseCharPointer(char const volatile* const v) {
|
||||
// We want to escape the pointer `v` so that the compiler can not eliminate
|
||||
// computations that produced it. To do that, we escape the pointer by storing
|
||||
// it into a volatile variable, since generally, volatile store, is not
|
||||
// something the compiler is allowed to elide.
|
||||
global_force_escape_pointer = reinterpret_cast<void const volatile*>(v);
|
||||
}
|
||||
|
||||
} // namespace internal
|
||||
|
||||
State::State(IterationCount max_iters, const std::vector<int64_t>& ranges,
|
||||
int thread_i, int n_threads, internal::ThreadTimer* timer,
|
||||
internal::ThreadManager* manager,
|
||||
internal::PerfCountersMeasurement* perf_counters_measurement)
|
||||
State::State(std::string name, IterationCount max_iters,
|
||||
const std::vector<int64_t>& ranges, int thread_i, int n_threads,
|
||||
internal::ThreadTimer* timer, internal::ThreadManager* manager,
|
||||
internal::PerfCountersMeasurement* perf_counters_measurement,
|
||||
ProfilerManager* profiler_manager)
|
||||
: total_iterations_(0),
|
||||
batch_leftover_(0),
|
||||
max_iterations(max_iters),
|
||||
started_(false),
|
||||
finished_(false),
|
||||
error_occurred_(false),
|
||||
skipped_(internal::NotSkipped),
|
||||
range_(ranges),
|
||||
complexity_n_(0),
|
||||
name_(std::move(name)),
|
||||
thread_index_(thread_i),
|
||||
threads_(n_threads),
|
||||
timer_(timer),
|
||||
manager_(manager),
|
||||
perf_counters_measurement_(perf_counters_measurement) {
|
||||
perf_counters_measurement_(perf_counters_measurement),
|
||||
profiler_manager_(profiler_manager) {
|
||||
BM_CHECK(max_iterations != 0) << "At least one iteration must be run";
|
||||
BM_CHECK_LT(thread_index_, threads_)
|
||||
<< "thread_index must be less than threads";
|
||||
|
||||
// Add counters with correct flag now. If added with `counters[name]` in
|
||||
// `PauseTiming`, a new `Counter` will be inserted the first time, which
|
||||
// won't have the flag. Inserting them now also reduces the allocations
|
||||
// during the benchmark.
|
||||
if (perf_counters_measurement_ != nullptr) {
|
||||
for (const std::string& counter_name :
|
||||
perf_counters_measurement_->names()) {
|
||||
counters[counter_name] = Counter(0.0, Counter::kAvgIterations);
|
||||
}
|
||||
}
|
||||
|
||||
// Note: The use of offsetof below is technically undefined until C++17
|
||||
// because State is not a standard layout type. However, all compilers
|
||||
// currently provide well-defined behavior as an extension (which is
|
||||
@ -178,74 +217,97 @@ State::State(IterationCount max_iters, const std::vector<int64_t>& ranges,
|
||||
#if defined(__INTEL_COMPILER)
|
||||
#pragma warning push
|
||||
#pragma warning(disable : 1875)
|
||||
#elif defined(__GNUC__)
|
||||
#elif defined(__GNUC__) || defined(__clang__)
|
||||
#pragma GCC diagnostic push
|
||||
#pragma GCC diagnostic ignored "-Winvalid-offsetof"
|
||||
#endif
|
||||
#if defined(__NVCC__)
|
||||
#pragma nv_diagnostic push
|
||||
#pragma nv_diag_suppress 1427
|
||||
#endif
|
||||
#if defined(__NVCOMPILER)
|
||||
#pragma diagnostic push
|
||||
#pragma diag_suppress offset_in_non_POD_nonstandard
|
||||
#endif
|
||||
// Offset tests to ensure commonly accessed data is on the first cache line.
|
||||
const int cache_line_size = 64;
|
||||
static_assert(offsetof(State, error_occurred_) <=
|
||||
(cache_line_size - sizeof(error_occurred_)),
|
||||
"");
|
||||
static_assert(
|
||||
offsetof(State, skipped_) <= (cache_line_size - sizeof(skipped_)), "");
|
||||
#if defined(__INTEL_COMPILER)
|
||||
#pragma warning pop
|
||||
#elif defined(__GNUC__)
|
||||
#elif defined(__GNUC__) || defined(__clang__)
|
||||
#pragma GCC diagnostic pop
|
||||
#endif
|
||||
#if defined(__NVCC__)
|
||||
#pragma nv_diagnostic pop
|
||||
#endif
|
||||
#if defined(__NVCOMPILER)
|
||||
#pragma diagnostic pop
|
||||
#endif
|
||||
}
|
||||
|
||||
void State::PauseTiming() {
|
||||
// Add in time accumulated so far
|
||||
BM_CHECK(started_ && !finished_ && !error_occurred_);
|
||||
BM_CHECK(started_ && !finished_ && !skipped());
|
||||
timer_->StopTimer();
|
||||
if (perf_counters_measurement_) {
|
||||
if (perf_counters_measurement_ != nullptr) {
|
||||
std::vector<std::pair<std::string, double>> measurements;
|
||||
if (!perf_counters_measurement_->Stop(measurements)) {
|
||||
BM_CHECK(false) << "Perf counters read the value failed.";
|
||||
}
|
||||
for (const auto& name_and_measurement : measurements) {
|
||||
auto name = name_and_measurement.first;
|
||||
auto measurement = name_and_measurement.second;
|
||||
BM_CHECK_EQ(std::fpclassify((double)counters[name]), FP_ZERO);
|
||||
counters[name] = Counter(measurement, Counter::kAvgIterations);
|
||||
const std::string& name = name_and_measurement.first;
|
||||
const double measurement = name_and_measurement.second;
|
||||
// Counter was inserted with `kAvgIterations` flag by the constructor.
|
||||
assert(counters.find(name) != counters.end());
|
||||
counters[name].value += measurement;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void State::ResumeTiming() {
|
||||
BM_CHECK(started_ && !finished_ && !error_occurred_);
|
||||
BM_CHECK(started_ && !finished_ && !skipped());
|
||||
timer_->StartTimer();
|
||||
if (perf_counters_measurement_) {
|
||||
if (perf_counters_measurement_ != nullptr) {
|
||||
perf_counters_measurement_->Start();
|
||||
}
|
||||
}
|
||||
|
||||
void State::SkipWithError(const char* msg) {
|
||||
BM_CHECK(msg);
|
||||
error_occurred_ = true;
|
||||
void State::SkipWithMessage(const std::string& msg) {
|
||||
skipped_ = internal::SkippedWithMessage;
|
||||
{
|
||||
MutexLock l(manager_->GetBenchmarkMutex());
|
||||
if (manager_->results.has_error_ == false) {
|
||||
manager_->results.error_message_ = msg;
|
||||
manager_->results.has_error_ = true;
|
||||
if (internal::NotSkipped == manager_->results.skipped_) {
|
||||
manager_->results.skip_message_ = msg;
|
||||
manager_->results.skipped_ = skipped_;
|
||||
}
|
||||
}
|
||||
total_iterations_ = 0;
|
||||
if (timer_->running()) timer_->StopTimer();
|
||||
if (timer_->running()) {
|
||||
timer_->StopTimer();
|
||||
}
|
||||
}
|
||||
|
||||
void State::SkipWithError(const std::string& msg) {
|
||||
skipped_ = internal::SkippedWithError;
|
||||
{
|
||||
MutexLock l(manager_->GetBenchmarkMutex());
|
||||
if (internal::NotSkipped == manager_->results.skipped_) {
|
||||
manager_->results.skip_message_ = msg;
|
||||
manager_->results.skipped_ = skipped_;
|
||||
}
|
||||
}
|
||||
total_iterations_ = 0;
|
||||
if (timer_->running()) {
|
||||
timer_->StopTimer();
|
||||
}
|
||||
}
|
||||
|
||||
void State::SetIterationTime(double seconds) {
|
||||
timer_->SetIterationTime(seconds);
|
||||
}
|
||||
|
||||
void State::SetLabel(const char* label) {
|
||||
void State::SetLabel(const std::string& label) {
|
||||
MutexLock l(manager_->GetBenchmarkMutex());
|
||||
manager_->results.report_label_ = label;
|
||||
}
|
||||
@ -253,20 +315,28 @@ void State::SetLabel(const char* label) {
|
||||
void State::StartKeepRunning() {
|
||||
BM_CHECK(!started_ && !finished_);
|
||||
started_ = true;
|
||||
total_iterations_ = error_occurred_ ? 0 : max_iterations;
|
||||
total_iterations_ = skipped() ? 0 : max_iterations;
|
||||
if (BENCHMARK_BUILTIN_EXPECT(profiler_manager_ != nullptr, false)) {
|
||||
profiler_manager_->AfterSetupStart();
|
||||
}
|
||||
manager_->StartStopBarrier();
|
||||
if (!error_occurred_) ResumeTiming();
|
||||
if (!skipped()) {
|
||||
ResumeTiming();
|
||||
}
|
||||
}
|
||||
|
||||
void State::FinishKeepRunning() {
|
||||
BM_CHECK(started_ && (!finished_ || error_occurred_));
|
||||
if (!error_occurred_) {
|
||||
BM_CHECK(started_ && (!finished_ || skipped()));
|
||||
if (!skipped()) {
|
||||
PauseTiming();
|
||||
}
|
||||
// Total iterations has now wrapped around past 0. Fix this.
|
||||
total_iterations_ = 0;
|
||||
finished_ = true;
|
||||
manager_->StartStopBarrier();
|
||||
if (BENCHMARK_BUILTIN_EXPECT(profiler_manager_ != nullptr, false)) {
|
||||
profiler_manager_->BeforeTeardownStop();
|
||||
}
|
||||
}
|
||||
|
||||
namespace internal {
|
||||
@ -275,7 +345,9 @@ namespace {
|
||||
// Flushes streams after invoking reporter methods that write to them. This
|
||||
// ensures users get timely updates even when streams are not line-buffered.
|
||||
void FlushStreams(BenchmarkReporter* reporter) {
|
||||
if (!reporter) return;
|
||||
if (reporter == nullptr) {
|
||||
return;
|
||||
}
|
||||
std::flush(reporter->GetOutputStream());
|
||||
std::flush(reporter->GetErrorStream());
|
||||
}
|
||||
@ -288,16 +360,20 @@ void Report(BenchmarkReporter* display_reporter,
|
||||
assert(reporter);
|
||||
// If there are no aggregates, do output non-aggregates.
|
||||
aggregates_only &= !results.aggregates_only.empty();
|
||||
if (!aggregates_only) reporter->ReportRuns(results.non_aggregates);
|
||||
if (!results.aggregates_only.empty())
|
||||
if (!aggregates_only) {
|
||||
reporter->ReportRuns(results.non_aggregates);
|
||||
}
|
||||
if (!results.aggregates_only.empty()) {
|
||||
reporter->ReportRuns(results.aggregates_only);
|
||||
}
|
||||
};
|
||||
|
||||
report_one(display_reporter, run_results.display_report_aggregates_only,
|
||||
run_results);
|
||||
if (file_reporter)
|
||||
if (file_reporter != nullptr) {
|
||||
report_one(file_reporter, run_results.file_report_aggregates_only,
|
||||
run_results);
|
||||
}
|
||||
|
||||
FlushStreams(display_reporter);
|
||||
FlushStreams(file_reporter);
|
||||
@@ -318,10 +394,13 @@ void RunBenchmarks(const std::vector<BenchmarkInstance>& benchmarks,
        std::max<size_t>(name_field_width, benchmark.name().str().size());
    might_have_aggregates |= benchmark.repetitions() > 1;

    for (const auto& Stat : benchmark.statistics())
    for (const auto& Stat : benchmark.statistics()) {
      stat_field_width = std::max<size_t>(stat_field_width, Stat.name_.size());
    }
  }
  if (might_have_aggregates) {
    name_field_width += 1 + stat_field_width;
  }
  if (might_have_aggregates) name_field_width += 1 + stat_field_width;

  // Print header here
  BenchmarkReporter::Context context;
@@ -332,27 +411,53 @@ void RunBenchmarks(const std::vector<BenchmarkInstance>& benchmarks,
      per_family_reports;

  if (display_reporter->ReportContext(context) &&
      (!file_reporter || file_reporter->ReportContext(context))) {
      ((file_reporter == nullptr) || file_reporter->ReportContext(context))) {
    FlushStreams(display_reporter);
    FlushStreams(file_reporter);

    size_t num_repetitions_total = 0;

    // This perfcounters object needs to be created before the runners vector
    // below so it outlasts their lifetime.
    PerfCountersMeasurement perfcounters(
        StrSplit(FLAGS_benchmark_perf_counters, ','));

    // Vector of benchmarks to run
    std::vector<internal::BenchmarkRunner> runners;
    runners.reserve(benchmarks.size());

    // Count the number of benchmarks with threads to warn the user in case
    // performance counters are used.
    int benchmarks_with_threads = 0;

    // Loop through all benchmarks
    for (const BenchmarkInstance& benchmark : benchmarks) {
      BenchmarkReporter::PerFamilyRunReports* reports_for_family = nullptr;
      if (benchmark.complexity() != oNone)
      if (benchmark.complexity() != oNone) {
        reports_for_family = &per_family_reports[benchmark.family_index()];

      runners.emplace_back(benchmark, reports_for_family);
      }
      benchmarks_with_threads += static_cast<int>(benchmark.threads() > 1);
      runners.emplace_back(benchmark, &perfcounters, reports_for_family);
      int num_repeats_of_this_instance = runners.back().GetNumRepeats();
      num_repetitions_total += num_repeats_of_this_instance;
      if (reports_for_family)
      num_repetitions_total +=
          static_cast<size_t>(num_repeats_of_this_instance);
      if (reports_for_family != nullptr) {
        reports_for_family->num_runs_total += num_repeats_of_this_instance;
      }
    }
    assert(runners.size() == benchmarks.size() && "Unexpected runner count.");

    // The use of performance counters with threads would be unintuitive for
    // the average user so we need to warn them about this case
    if ((benchmarks_with_threads > 0) && (perfcounters.num_counters() > 0)) {
      GetErrorLogInstance()
          << "***WARNING*** There are " << benchmarks_with_threads
          << " benchmarks with threads and " << perfcounters.num_counters()
          << " performance counters were requested. Beware counters will "
             "reflect the combined usage across all "
             "threads.\n";
    }

    std::vector<size_t> repetition_indices;
    repetition_indices.reserve(num_repetitions_total);
    for (size_t runner_index = 0, num_runners = runners.size();
@@ -373,9 +478,18 @@ void RunBenchmarks(const std::vector<BenchmarkInstance>& benchmarks,
    for (size_t repetition_index : repetition_indices) {
      internal::BenchmarkRunner& runner = runners[repetition_index];
      runner.DoOneRepetition();
      if (runner.HasRepeatsRemaining()) continue;
      if (runner.HasRepeatsRemaining()) {
        continue;
      }
      // FIXME: report each repetition separately, not all of them in bulk.

      display_reporter->ReportRunsConfig(
          runner.GetMinTime(), runner.HasExplicitIters(), runner.GetIters());
      if (file_reporter != nullptr) {
        file_reporter->ReportRunsConfig(
            runner.GetMinTime(), runner.HasExplicitIters(), runner.GetIters());
      }

      RunResults run_results = runner.GetResults();

      // Maybe calculate complexity report
@@ -395,7 +509,9 @@ void RunBenchmarks(const std::vector<BenchmarkInstance>& benchmarks,
      }
    }
    display_reporter->Finalize();
    if (file_reporter) file_reporter->Finalize();
    if (file_reporter != nullptr) {
      file_reporter->Finalize();
    }
    FlushStreams(display_reporter);
    FlushStreams(file_reporter);
  }
@@ -409,14 +525,16 @@ std::unique_ptr<BenchmarkReporter> CreateReporter(
  typedef std::unique_ptr<BenchmarkReporter> PtrType;
  if (name == "console") {
    return PtrType(new ConsoleReporter(output_opts));
  } else if (name == "json") {
    return PtrType(new JSONReporter());
  } else if (name == "csv") {
    return PtrType(new CSVReporter());
  } else {
    std::cerr << "Unexpected format: '" << name << "'\n";
    std::exit(1);
  }
  if (name == "json") {
    return PtrType(new JSONReporter());
  }
  if (name == "csv") {
    return PtrType(new CSVReporter());
  }
  std::cerr << "Unexpected format: '" << name << "'\n";
  std::flush(std::cerr);
  std::exit(1);
}
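// The names matched above are the accepted values of --benchmark_format and
// --benchmark_out_format ("console", "json", "csv"); any other value prints an
// error, flushes std::cerr and exits.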

BENCHMARK_RESTORE_DEPRECATED_WARNING

@@ -454,7 +572,7 @@ ConsoleReporter::OutputOptions GetOutputOptions(bool force_no_color) {
} // end namespace internal

BenchmarkReporter* CreateDefaultDisplayReporter() {
  static auto default_display_reporter =
  static auto* default_display_reporter =
      internal::CreateReporter(FLAGS_benchmark_format,
                               internal::GetOutputOptions())
          .release();

@@ -488,14 +606,15 @@ size_t RunSpecifiedBenchmarks(BenchmarkReporter* display_reporter,
size_t RunSpecifiedBenchmarks(BenchmarkReporter* display_reporter,
                              BenchmarkReporter* file_reporter,
                              std::string spec) {
  if (spec.empty() || spec == "all")
  if (spec.empty() || spec == "all") {
    spec = "."; // Regexp that matches all benchmarks
  }

  // Setup the reporters
  std::ofstream output_file;
  std::unique_ptr<BenchmarkReporter> default_display_reporter;
  std::unique_ptr<BenchmarkReporter> default_file_reporter;
  if (!display_reporter) {
  if (display_reporter == nullptr) {
    default_display_reporter.reset(CreateDefaultDisplayReporter());
    display_reporter = default_display_reporter.get();
  }
@@ -503,21 +622,26 @@ size_t RunSpecifiedBenchmarks(BenchmarkReporter* display_reporter,
  auto& Err = display_reporter->GetErrorStream();

  std::string const& fname = FLAGS_benchmark_out;
  if (fname.empty() && file_reporter) {
  if (fname.empty() && (file_reporter != nullptr)) {
    Err << "A custom file reporter was provided but "
           "--benchmark_out=<file> was not specified."
        << std::endl;
           "--benchmark_out=<file> was not specified.\n";
    Out.flush();
    Err.flush();
    std::exit(1);
  }
  if (!fname.empty()) {
    output_file.open(fname);
    if (!output_file.is_open()) {
      Err << "invalid file name: '" << fname << "'" << std::endl;
      Err << "invalid file name: '" << fname << "'\n";
      Out.flush();
      Err.flush();
      std::exit(1);
    }
    if (!file_reporter) {
    if (file_reporter == nullptr) {
      default_file_reporter = internal::CreateReporter(
          FLAGS_benchmark_out_format, ConsoleReporter::OO_None);
          FLAGS_benchmark_out_format, FLAGS_benchmark_counters_tabular
                                          ? ConsoleReporter::OO_Tabular
                                          : ConsoleReporter::OO_None);
      file_reporter = default_file_reporter.get();
    }
    file_reporter->SetOutputStream(&output_file);
@@ -525,20 +649,29 @@ size_t RunSpecifiedBenchmarks(BenchmarkReporter* display_reporter,
  }

  std::vector<internal::BenchmarkInstance> benchmarks;
  if (!FindBenchmarksInternal(spec, &benchmarks, &Err)) return 0;
  if (!FindBenchmarksInternal(spec, &benchmarks, &Err)) {
    Out.flush();
    Err.flush();
    return 0;
  }

  if (benchmarks.empty()) {
    Err << "Failed to match any benchmarks against regex: " << spec << "\n";
    Out.flush();
    Err.flush();
    return 0;
  }

  if (FLAGS_benchmark_list_tests) {
    for (auto const& benchmark : benchmarks)
    for (auto const& benchmark : benchmarks) {
      Out << benchmark.name().str() << "\n";
    }
  } else {
    internal::RunBenchmarks(benchmarks, display_reporter, file_reporter);
  }

  Out.flush();
  Err.flush();
  return benchmarks.size();
}
@@ -563,6 +696,14 @@ void RegisterMemoryManager(MemoryManager* manager) {
  internal::memory_manager = manager;
}

void RegisterProfilerManager(ProfilerManager* manager) {
  // Don't allow overwriting an existing manager.
  if (manager != nullptr) {
    BM_CHECK_EQ(internal::profiler_manager, nullptr);
  }
  internal::profiler_manager = manager;
}

void AddCustomContext(const std::string& key, const std::string& value) {
  if (internal::global_context == nullptr) {
    internal::global_context = new std::map<std::string, std::string>();
@@ -579,19 +720,25 @@ void (*HelperPrintf)();

void PrintUsageAndExit() {
  HelperPrintf();
  exit(0);
  std::flush(std::cout);
  std::flush(std::cerr);
  std::exit(0);
}

void SetDefaultTimeUnitFromFlag(const std::string& time_unit_flag) {
  if (time_unit_flag == "s") {
    return SetDefaultTimeUnit(kSecond);
  } else if (time_unit_flag == "ms") {
  }
  if (time_unit_flag == "ms") {
    return SetDefaultTimeUnit(kMillisecond);
  } else if (time_unit_flag == "us") {
  }
  if (time_unit_flag == "us") {
    return SetDefaultTimeUnit(kMicrosecond);
  } else if (time_unit_flag == "ns") {
  }
  if (time_unit_flag == "ns") {
    return SetDefaultTimeUnit(kNanosecond);
  } else if (!time_unit_flag.empty()) {
  }
  if (!time_unit_flag.empty()) {
    PrintUsageAndExit();
  }
}
@@ -599,17 +746,18 @@ void SetDefaultTimeUnitFromFlag(const std::string& time_unit_flag) {
void ParseCommandLineFlags(int* argc, char** argv) {
  using namespace benchmark;
  BenchmarkReporter::Context::executable_name =
      (argc && *argc > 0) ? argv[0] : "unknown";
  for (int i = 1; argc && i < *argc; ++i) {
      ((argc != nullptr) && *argc > 0) ? argv[0] : "unknown";
  for (int i = 1; (argc != nullptr) && i < *argc; ++i) {
    if (ParseBoolFlag(argv[i], "benchmark_list_tests",
                      &FLAGS_benchmark_list_tests) ||
        ParseStringFlag(argv[i], "benchmark_filter", &FLAGS_benchmark_filter) ||
        ParseDoubleFlag(argv[i], "benchmark_min_time",
        ParseStringFlag(argv[i], "benchmark_min_time",
                        &FLAGS_benchmark_min_time) ||
        ParseDoubleFlag(argv[i], "benchmark_min_warmup_time",
                        &FLAGS_benchmark_min_warmup_time) ||
        ParseInt32Flag(argv[i], "benchmark_repetitions",
                       &FLAGS_benchmark_repetitions) ||
        ParseBoolFlag(argv[i], "benchmark_dry_run", &FLAGS_benchmark_dry_run) ||
        ParseBoolFlag(argv[i], "benchmark_enable_random_interleaving",
                      &FLAGS_benchmark_enable_random_interleaving) ||
        ParseBoolFlag(argv[i], "benchmark_report_aggregates_only",
@@ -630,7 +778,9 @@ void ParseCommandLineFlags(int* argc, char** argv) {
        ParseStringFlag(argv[i], "benchmark_time_unit",
                        &FLAGS_benchmark_time_unit) ||
        ParseInt32Flag(argv[i], "v", &FLAGS_v)) {
      for (int j = i; j != *argc - 1; ++j) argv[j] = argv[j + 1];
      for (int j = i; j != *argc - 1; ++j) {
        argv[j] = argv[j + 1];
      }

      --(*argc);
      --i;
@@ -648,6 +798,9 @@ void ParseCommandLineFlags(int* argc, char** argv) {
  if (FLAGS_benchmark_color.empty()) {
    PrintUsageAndExit();
  }
  if (FLAGS_benchmark_dry_run) {
    AddCustomContext("dry_run", "true");
  }
  for (const auto& kv : FLAGS_benchmark_context) {
    AddCustomContext(kv.first, kv.second);
  }
@@ -660,14 +813,23 @@ int InitializeStreams() {

} // end namespace internal

std::string GetBenchmarkVersion() {
#ifdef BENCHMARK_VERSION
  return {BENCHMARK_VERSION};
#else
  return {""};
#endif
}

void PrintDefaultHelp() {
  fprintf(stdout,
          "benchmark"
          " [--benchmark_list_tests={true|false}]\n"
          " [--benchmark_filter=<regex>]\n"
          " [--benchmark_min_time=<min_time>]\n"
          " [--benchmark_min_time=`<integer>x` OR `<float>s` ]\n"
          " [--benchmark_min_warmup_time=<min_warmup_time>]\n"
          " [--benchmark_repetitions=<num_repetitions>]\n"
          " [--benchmark_dry_run={true|false}]\n"
          " [--benchmark_enable_random_interleaving={true|false}]\n"
          " [--benchmark_report_aggregates_only={true|false}]\n"
          " [--benchmark_display_aggregates_only={true|false}]\n"
@@ -676,6 +838,9 @@ void PrintDefaultHelp() {
          " [--benchmark_out_format=<json|console|csv>]\n"
          " [--benchmark_color={auto|true|false}]\n"
          " [--benchmark_counters_tabular={true|false}]\n"
#if defined HAVE_LIBPFM
          " [--benchmark_perf_counters=<counter>,...]\n"
#endif
          " [--benchmark_context=<key>=<value>,...]\n"
          " [--benchmark_time_unit={ns|us|ms|s}]\n"
          " [--v=<verbosity>]\n");
@@ -27,7 +27,9 @@ BenchmarkInstance::BenchmarkInstance(Benchmark* benchmark, int family_idx,
      min_time_(benchmark_.min_time_),
      min_warmup_time_(benchmark_.min_warmup_time_),
      iterations_(benchmark_.iterations_),
      threads_(thread_count) {
      threads_(thread_count),
      setup_(benchmark_.setup_),
      teardown_(benchmark_.teardown_) {
  name_.function_name = benchmark_.name_;

  size_t arg_i = 0;
@@ -84,33 +86,31 @@ BenchmarkInstance::BenchmarkInstance(Benchmark* benchmark, int family_idx,
  if (!benchmark_.thread_counts_.empty()) {
    name_.threads = StrFormat("threads:%d", threads_);
  }

  setup_ = benchmark_.setup_;
  teardown_ = benchmark_.teardown_;
}

State BenchmarkInstance::Run(
    IterationCount iters, int thread_id, internal::ThreadTimer* timer,
    internal::ThreadManager* manager,
    internal::PerfCountersMeasurement* perf_counters_measurement) const {
  State st(iters, args_, thread_id, threads_, timer, manager,
           perf_counters_measurement);
    internal::PerfCountersMeasurement* perf_counters_measurement,
    ProfilerManager* profiler_manager) const {
  State st(name_.function_name, iters, args_, thread_id, threads_, timer,
           manager, perf_counters_measurement, profiler_manager);
  benchmark_.Run(st);
  return st;
}

void BenchmarkInstance::Setup() const {
  if (setup_) {
    State st(/*iters*/ 1, args_, /*thread_id*/ 0, threads_, nullptr, nullptr,
             nullptr);
  if (setup_ != nullptr) {
    State st(name_.function_name, /*iters*/ 1, args_, /*thread_id*/ 0, threads_,
             nullptr, nullptr, nullptr, nullptr);
    setup_(st);
  }
}

void BenchmarkInstance::Teardown() const {
  if (teardown_) {
    State st(/*iters*/ 1, args_, /*thread_id*/ 0, threads_, nullptr, nullptr,
             nullptr);
  if (teardown_ != nullptr) {
    State st(name_.function_name, /*iters*/ 1, args_, /*thread_id*/ 0, threads_,
             nullptr, nullptr, nullptr, nullptr);
    teardown_(st);
  }
}
@@ -17,9 +17,9 @@ namespace internal {
// Information kept per benchmark we may want to run
class BenchmarkInstance {
 public:
  BenchmarkInstance(Benchmark* benchmark, int family_index,
                    int per_family_instance_index,
                    const std::vector<int64_t>& args, int threads);
  BenchmarkInstance(Benchmark* benchmark, int family_idx,
                    int per_family_instance_idx,
                    const std::vector<int64_t>& args, int thread_count);

  const BenchmarkName& name() const { return name_; }
  int family_index() const { return family_index_; }
@@ -41,10 +41,14 @@ class BenchmarkInstance {
  int threads() const { return threads_; }
  void Setup() const;
  void Teardown() const;
  const auto& GetUserThreadRunnerFactory() const {
    return benchmark_.threadrunner_;
  }

  State Run(IterationCount iters, int thread_id, internal::ThreadTimer* timer,
            internal::ThreadManager* manager,
            internal::PerfCountersMeasurement* perf_counters_measurement) const;
            internal::PerfCountersMeasurement* perf_counters_measurement,
            ProfilerManager* profiler_manager) const;

 private:
  BenchmarkName name_;
@@ -67,9 +71,8 @@ class BenchmarkInstance {
  IterationCount iterations_;
  int threads_; // Number of concurrent threads to us

  typedef void (*callback_function)(const benchmark::State&);
  callback_function setup_ = nullptr;
  callback_function teardown_ = nullptr;
  callback_function setup_;
  callback_function teardown_;
};

bool FindBenchmarksInternal(const std::string& re,
@@ -14,5 +14,5 @@

#include "benchmark/benchmark.h"

BENCHMARK_EXPORT int main(int, char**);
BENCHMARK_EXPORT int main(int /*argc*/, char** /*argv*/);
BENCHMARK_MAIN();

@@ -27,8 +27,8 @@ size_t size_impl(const Head& head, const Tail&... tail) {
}

// Join a pack of std::strings using a delimiter
// TODO: use absl::StrJoin
void join_impl(std::string&, char) {}
// TODO(dominic): use absl::StrJoin
void join_impl(std::string& /*unused*/, char /*unused*/) {}

template <typename Head, typename... Tail>
void join_impl(std::string& s, const char delimiter, const Head& head,
@@ -53,13 +53,13 @@ namespace benchmark {

namespace {
// For non-dense Range, intermediate values are powers of kRangeMultiplier.
static constexpr int kRangeMultiplier = 8;
constexpr int kRangeMultiplier = 8;

// The size of a benchmark family determines is the number of inputs to repeat
// the benchmark on. If this is "large" then warn the user during configuration.
static constexpr size_t kMaxFamilySize = 100;
constexpr size_t kMaxFamilySize = 100;

static constexpr char kDisabledPrefix[] = "DISABLED_";
constexpr char kDisabledPrefix[] = "DISABLED_";
} // end namespace

namespace internal {

@@ -82,7 +82,7 @@ class BenchmarkFamilies {

  // Extract the list of benchmark instances that match the specified
  // regular expression.
  bool FindBenchmarks(std::string re,
  bool FindBenchmarks(std::string spec,
                      std::vector<BenchmarkInstance>* benchmarks,
                      std::ostream* Err);

@@ -125,7 +125,7 @@ bool BenchmarkFamilies::FindBenchmarks(
    is_negative_filter = true;
  }
  if (!re.Init(spec, &error_msg)) {
    Err << "Could not compile benchmark re: " << error_msg << std::endl;
    Err << "Could not compile benchmark re: " << error_msg << '\n';
    return false;
  }

@@ -140,7 +140,9 @@ bool BenchmarkFamilies::FindBenchmarks(
    int per_family_instance_index = 0;

    // Family was deleted or benchmark doesn't match
    if (!family) continue;
    if (!family) {
      continue;
    }

    if (family->ArgsCnt() == -1) {
      family->Args({});
@@ -159,7 +161,9 @@ bool BenchmarkFamilies::FindBenchmarks(
    // reserve in the special case the regex ".", since we know the final
    // family size. this doesn't take into account any disabled benchmarks
    // so worst case we reserve more than we need.
    if (spec == ".") benchmarks->reserve(benchmarks->size() + family_size);
    if (spec == ".") {
      benchmarks->reserve(benchmarks->size() + family_size);
    }

    for (auto const& args : family->args_) {
      for (int num_threads : *thread_counts) {
@@ -177,7 +181,9 @@ bool BenchmarkFamilies::FindBenchmarks(

        // Only bump the next family index once we've estabilished that
        // at least one instance of this family will be run.
        if (next_family_index == family_index) ++next_family_index;
        if (next_family_index == family_index) {
          ++next_family_index;
        }
      }
    }
  }
@@ -185,11 +191,11 @@ bool BenchmarkFamilies::FindBenchmarks(
  return true;
}

Benchmark* RegisterBenchmarkInternal(Benchmark* bench) {
  std::unique_ptr<Benchmark> bench_ptr(bench);
Benchmark* RegisterBenchmarkInternal(std::unique_ptr<Benchmark> bench) {
  Benchmark* bench_ptr = bench.get();
  BenchmarkFamilies* families = BenchmarkFamilies::GetInstance();
  families->AddBenchmark(std::move(bench_ptr));
  return bench;
  families->AddBenchmark(std::move(bench));
  return bench_ptr;
}
// FIXME: This function is a hack so that benchmark.cc can access
@@ -204,7 +210,7 @@ bool FindBenchmarksInternal(const std::string& re,
// Benchmark
//=============================================================================//

Benchmark::Benchmark(const char* name)
Benchmark::Benchmark(const std::string& name)
    : name_(name),
      aggregation_report_mode_(ARM_Unspecified),
      time_unit_(GetDefaultTimeUnit()),
@@ -218,9 +224,7 @@ Benchmark::Benchmark(const char* name)
      use_real_time_(false),
      use_manual_time_(false),
      complexity_(oNone),
      complexity_lambda_(nullptr),
      setup_(nullptr),
      teardown_(nullptr) {
      complexity_lambda_(nullptr) {
  ComputeStatistics("mean", StatisticsMean);
  ComputeStatistics("median", StatisticsMedian);
  ComputeStatistics("stddev", StatisticsStdDev);
@@ -230,7 +234,7 @@ Benchmark::Benchmark(const char* name)
Benchmark::~Benchmark() {}

Benchmark* Benchmark::Name(const std::string& name) {
  SetName(name.c_str());
  SetName(name);
  return this;
}

@@ -331,13 +335,25 @@ Benchmark* Benchmark::Apply(void (*custom_arguments)(Benchmark* benchmark)) {
  return this;
}

Benchmark* Benchmark::Setup(void (*setup)(const benchmark::State&)) {
Benchmark* Benchmark::Setup(callback_function&& setup) {
  BM_CHECK(setup != nullptr);
  setup_ = std::forward<callback_function>(setup);
  return this;
}

Benchmark* Benchmark::Setup(const callback_function& setup) {
  BM_CHECK(setup != nullptr);
  setup_ = setup;
  return this;
}

Benchmark* Benchmark::Teardown(void (*teardown)(const benchmark::State&)) {
Benchmark* Benchmark::Teardown(callback_function&& teardown) {
  BM_CHECK(teardown != nullptr);
  teardown_ = std::forward<callback_function>(teardown);
  return this;
}

Benchmark* Benchmark::Teardown(const callback_function& teardown) {
  BM_CHECK(teardown != nullptr);
  teardown_ = teardown;
  return this;
@@ -468,16 +484,32 @@ Benchmark* Benchmark::ThreadPerCpu() {
  return this;
}

void Benchmark::SetName(const char* name) { name_ = name; }
Benchmark* Benchmark::ThreadRunner(threadrunner_factory&& factory) {
  threadrunner_ = std::move(factory);
  return this;
}

void Benchmark::SetName(const std::string& name) { name_ = name; }

const char* Benchmark::GetName() const { return name_.c_str(); }

int Benchmark::ArgsCnt() const {
  if (args_.empty()) {
    if (arg_names_.empty()) return -1;
    if (arg_names_.empty()) {
      return -1;
    }
    return static_cast<int>(arg_names_.size());
  }
  return static_cast<int>(args_.front().size());
}

const char* Benchmark::GetArgName(int arg) const {
  BM_CHECK_GE(arg, 0);
  size_t uarg = static_cast<size_t>(arg);
  BM_CHECK_LT(uarg, arg_names_.size());
  return arg_names_[uarg].c_str();
}

TimeUnit Benchmark::GetTimeUnit() const {
  return use_default_time_unit_ ? GetDefaultTimeUnit() : time_unit_;
}
@@ -24,7 +24,7 @@ typename std::vector<T>::iterator AddPowers(std::vector<T>* dst, T lo, T hi,
  static const T kmax = std::numeric_limits<T>::max();

  // Space out the values in multiples of "mult"
  for (T i = static_cast<T>(1); i <= hi; i *= static_cast<T>(mult)) {
  for (T i = static_cast<T>(1); i <= hi; i = static_cast<T>(i * mult)) {
    if (i >= lo) {
      dst->push_back(i);
    }
@@ -52,7 +52,7 @@ void AddNegatedPowers(std::vector<T>* dst, T lo, T hi, int mult) {

  const auto it = AddPowers(dst, hi_complement, lo_complement, mult);

  std::for_each(it, dst->end(), [](T& t) { t *= -1; });
  std::for_each(it, dst->end(), [](T& t) { t = static_cast<T>(t * -1); });
  std::reverse(it, dst->end());
}
@@ -28,11 +28,15 @@

#include <algorithm>
#include <atomic>
#include <climits>
#include <cmath>
#include <condition_variable>
#include <cstdio>
#include <cstdlib>
#include <fstream>
#include <functional>
#include <iostream>
#include <limits>
#include <memory>
#include <string>
#include <thread>
@@ -43,7 +47,6 @@
#include "commandlineflags.h"
#include "complexity.h"
#include "counter.h"
#include "internal_macros.h"
#include "log.h"
#include "mutex.h"
#include "perf_counters.h"
@@ -55,19 +58,31 @@

namespace benchmark {

BM_DECLARE_bool(benchmark_dry_run);
BM_DECLARE_string(benchmark_min_time);
BM_DECLARE_double(benchmark_min_warmup_time);
BM_DECLARE_int32(benchmark_repetitions);
BM_DECLARE_bool(benchmark_report_aggregates_only);
BM_DECLARE_bool(benchmark_display_aggregates_only);
BM_DECLARE_string(benchmark_perf_counters);

namespace internal {

MemoryManager* memory_manager = nullptr;

ProfilerManager* profiler_manager = nullptr;

namespace {

static constexpr IterationCount kMaxIterations = 1000000000;
constexpr IterationCount kMaxIterations = 1000000000000;
const double kDefaultMinTime =
    std::strtod(::benchmark::kDefaultMinTimeStr, /*p_end*/ nullptr);
BenchmarkReporter::Run CreateRunReport(
    const benchmark::internal::BenchmarkInstance& b,
    const internal::ThreadManager::Result& results,
    IterationCount memory_iterations,
    const MemoryManager::Result* memory_result, double seconds,
    const MemoryManager::Result& memory_result, double seconds,
    int64_t repetition_index, int64_t repeats) {
  // Create report about this benchmark run.
  BenchmarkReporter::Run report;
@@ -75,8 +90,8 @@ BenchmarkReporter::Run CreateRunReport(
  report.run_name = b.name();
  report.family_index = b.family_index();
  report.per_family_instance_index = b.per_family_instance_index();
  report.error_occurred = results.has_error_;
  report.error_message = results.error_message_;
  report.skipped = results.skipped_;
  report.skip_message = results.skip_message_;
  report.report_label = results.report_label_;
  // This is the total iterations across all threads.
  report.iterations = results.iterations;
@@ -85,12 +100,13 @@ BenchmarkReporter::Run CreateRunReport(
  report.repetition_index = repetition_index;
  report.repetitions = repeats;

  if (!report.error_occurred) {
  if (report.skipped == 0u) {
    if (b.use_manual_time()) {
      report.real_accumulated_time = results.manual_time_used;
    } else {
      report.real_accumulated_time = results.real_time_used;
    }
    report.use_real_time_for_initial_big_o = b.use_manual_time();
    report.cpu_accumulated_time = results.cpu_time_used;
    report.complexity_n = results.complexity_n;
    report.complexity = b.complexity();
@@ -99,12 +115,12 @@ BenchmarkReporter::Run CreateRunReport(
    report.counters = results.counters;

    if (memory_iterations > 0) {
      assert(memory_result != nullptr);
      report.memory_result = memory_result;
      report.allocs_per_iter =
          memory_iterations ? static_cast<double>(memory_result->num_allocs) /
                                  memory_iterations
                            : 0;
          memory_iterations != 0
              ? static_cast<double>(memory_result.num_allocs) /
                    static_cast<double>(memory_iterations)
              : 0;
    }

    internal::Finish(&report.counters, results.iterations, seconds,
@@ -117,15 +133,16 @@ BenchmarkReporter::Run CreateRunReport(
// Adds the stats collected for the thread into manager->results.
void RunInThread(const BenchmarkInstance* b, IterationCount iters,
                 int thread_id, ThreadManager* manager,
                 PerfCountersMeasurement* perf_counters_measurement) {
                 PerfCountersMeasurement* perf_counters_measurement,
                 ProfilerManager* profiler_manager_) {
  internal::ThreadTimer timer(
      b->measure_process_cpu_time()
          ? internal::ThreadTimer::CreateProcessCpuTime()
          : internal::ThreadTimer::Create());

  State st =
      b->Run(iters, thread_id, &timer, manager, perf_counters_measurement);
  BM_CHECK(st.error_occurred() || st.iterations() >= st.max_iterations)
  State st = b->Run(iters, thread_id, &timer, manager,
                    perf_counters_measurement, profiler_manager_);
  BM_CHECK(st.skipped() || st.iterations() >= st.max_iterations)
      << "Benchmark returned before State::KeepRunning() returned false!";
  {
    MutexLock l(manager->GetBenchmarkMutex());
@@ -140,27 +157,148 @@ void RunInThread(const BenchmarkInstance* b, IterationCount iters,
  manager->NotifyThreadComplete();
}
double ComputeMinTime(const benchmark::internal::BenchmarkInstance& b,
                      const BenchTimeType& iters_or_time) {
  if (!IsZero(b.min_time())) {
    return b.min_time();
  }
  // If the flag was used to specify number of iters, then return the default
  // min_time.
  if (iters_or_time.tag == BenchTimeType::ITERS) {
    return kDefaultMinTime;
  }

  return iters_or_time.time;
}

IterationCount ComputeIters(const benchmark::internal::BenchmarkInstance& b,
                            const BenchTimeType& iters_or_time) {
  if (b.iterations() != 0) {
    return b.iterations();
  }

  // We've already concluded that this flag is currently used to pass
  // iters but do a check here again anyway.
  BM_CHECK(iters_or_time.tag == BenchTimeType::ITERS);
  return iters_or_time.iters;
}

class ThreadRunnerDefault : public ThreadRunnerBase {
 public:
  explicit ThreadRunnerDefault(int num_threads)
      : pool(static_cast<size_t>(num_threads - 1)) {}

  void RunThreads(const std::function<void(int)>& fn) final {
    // Run all but one thread in separate threads
    for (std::size_t ti = 0; ti < pool.size(); ++ti) {
      pool[ti] = std::thread(fn, static_cast<int>(ti + 1));
    }
    // And run one thread here directly.
    // (If we were asked to run just one thread, we don't create new threads.)
    // Yes, we need to do this here *after* we start the separate threads.
    fn(0);

    // The main thread has finished. Now let's wait for the other threads.
    for (std::thread& thread : pool) {
      thread.join();
    }
  }

 private:
  std::vector<std::thread> pool;
};

std::unique_ptr<ThreadRunnerBase> GetThreadRunner(
    const threadrunner_factory& userThreadRunnerFactory, int num_threads) {
  return userThreadRunnerFactory
             ? userThreadRunnerFactory(num_threads)
             : std::make_unique<ThreadRunnerDefault>(num_threads);
}
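// GetThreadRunner() prefers a user-installed threadrunner_factory (set via
// Benchmark::ThreadRunner) and only falls back to the std::thread-based
// ThreadRunnerDefault when no factory was registered for the benchmark.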

} // end namespace

BenchTimeType ParseBenchMinTime(const std::string& value) {
  BenchTimeType ret = {};

  if (value.empty()) {
    ret.tag = BenchTimeType::TIME;
    ret.time = 0.0;
    return ret;
  }

  if (value.back() == 'x') {
    char* p_end = nullptr;
    // Reset errno before it's changed by strtol.
    errno = 0;
    IterationCount num_iters = std::strtol(value.c_str(), &p_end, 10);

    // After a valid parse, p_end should have been set to
    // point to the 'x' suffix.
    BM_CHECK(errno == 0 && p_end != nullptr && *p_end == 'x')
        << "Malformed iters value passed to --benchmark_min_time: `" << value
        << "`. Expected --benchmark_min_time=<integer>x.";

    ret.tag = BenchTimeType::ITERS;
    ret.iters = num_iters;
    return ret;
  }

  bool has_suffix = value.back() == 's';
  if (!has_suffix) {
    BM_VLOG(0) << "Value passed to --benchmark_min_time should have a suffix. "
                  "Eg., `30s` for 30-seconds.";
  }

  char* p_end = nullptr;
  // Reset errno before it's changed by strtod.
  errno = 0;
  double min_time = std::strtod(value.c_str(), &p_end);

  // After a successful parse, p_end should point to the suffix 's',
  // or the end of the string if the suffix was omitted.
  BM_CHECK(errno == 0 && p_end != nullptr &&
           ((has_suffix && *p_end == 's') || *p_end == '\0'))
      << "Malformed seconds value passed to --benchmark_min_time: `" << value
      << "`. Expected --benchmark_min_time=<float>x.";

  ret.tag = BenchTimeType::TIME;
  ret.time = min_time;

  return ret;
}
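// Usage sketch for the parser above: --benchmark_min_time=500x requests
// exactly 500 iterations, while --benchmark_min_time=2.5s (or a bare "2.5",
// which only triggers a warning about the missing suffix) requests at least
// 2.5 seconds of benchmark time.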

BenchmarkRunner::BenchmarkRunner(
    const benchmark::internal::BenchmarkInstance& b_,
    PerfCountersMeasurement* pcm_,
    BenchmarkReporter::PerFamilyRunReports* reports_for_family_)
    : b(b_),
      reports_for_family(reports_for_family_),
      min_time(!IsZero(b.min_time()) ? b.min_time() : FLAGS_benchmark_min_time),
      min_warmup_time((!IsZero(b.min_time()) && b.min_warmup_time() > 0.0)
                          ? b.min_warmup_time()
                          : FLAGS_benchmark_min_warmup_time),
      warmup_done(!(min_warmup_time > 0.0)),
      repeats(b.repetitions() != 0 ? b.repetitions()
                                   : FLAGS_benchmark_repetitions),
      has_explicit_iteration_count(b.iterations() != 0),
      pool(b.threads() - 1),
      iters(has_explicit_iteration_count ? b.iterations() : 1),
      perf_counters_measurement(StrSplit(FLAGS_benchmark_perf_counters, ',')),
      perf_counters_measurement_ptr(perf_counters_measurement.IsValid()
                                        ? &perf_counters_measurement
                                        : nullptr) {
      parsed_benchtime_flag(ParseBenchMinTime(FLAGS_benchmark_min_time)),
      min_time(FLAGS_benchmark_dry_run
                   ? 0
                   : ComputeMinTime(b_, parsed_benchtime_flag)),
      min_warmup_time(
          FLAGS_benchmark_dry_run
              ? 0
              : ((!IsZero(b.min_time()) && b.min_warmup_time() > 0.0)
                     ? b.min_warmup_time()
                     : FLAGS_benchmark_min_warmup_time)),
      warmup_done(FLAGS_benchmark_dry_run ? true : !(min_warmup_time > 0.0)),
      repeats(FLAGS_benchmark_dry_run
                  ? 1
                  : (b.repetitions() != 0 ? b.repetitions()
                                          : FLAGS_benchmark_repetitions)),
      has_explicit_iteration_count(b.iterations() != 0 ||
                                   parsed_benchtime_flag.tag ==
                                       BenchTimeType::ITERS),
      thread_runner(
          GetThreadRunner(b.GetUserThreadRunnerFactory(), b.threads())),
      iters(FLAGS_benchmark_dry_run
                ? 1
                : (has_explicit_iteration_count
                       ? ComputeIters(b_, parsed_benchtime_flag)
                       : 1)),
      perf_counters_measurement_ptr(pcm_) {
  run_results.display_report_aggregates_only =
      (FLAGS_benchmark_report_aggregates_only ||
       FLAGS_benchmark_display_aggregates_only);
@@ -168,12 +306,13 @@ BenchmarkRunner::BenchmarkRunner(
      FLAGS_benchmark_report_aggregates_only;
  if (b.aggregation_report_mode() != internal::ARM_Unspecified) {
    run_results.display_report_aggregates_only =
        (b.aggregation_report_mode() &
         internal::ARM_DisplayReportAggregatesOnly);
        ((b.aggregation_report_mode() &
          internal::ARM_DisplayReportAggregatesOnly) != 0u);
    run_results.file_report_aggregates_only =
        (b.aggregation_report_mode() & internal::ARM_FileReportAggregatesOnly);
        ((b.aggregation_report_mode() &
          internal::ARM_FileReportAggregatesOnly) != 0u);
    BM_CHECK(FLAGS_benchmark_perf_counters.empty() ||
             perf_counters_measurement.IsValid())
             (perf_counters_measurement_ptr->num_counters() == 0))
        << "Perf counters were requested but could not be set up.";
  }
}
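// When --benchmark_dry_run is set, the initializers above collapse to a single
// repetition of a single iteration with no warmup and a zero minimum time, so
// every benchmark body is exercised exactly once.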
@@ -184,19 +323,10 @@ BenchmarkRunner::IterationResults BenchmarkRunner::DoNIterations() {
  std::unique_ptr<internal::ThreadManager> manager;
  manager.reset(new internal::ThreadManager(b.threads()));

  // Run all but one thread in separate threads
  for (std::size_t ti = 0; ti < pool.size(); ++ti) {
    pool[ti] = std::thread(&RunInThread, &b, iters, static_cast<int>(ti + 1),
                           manager.get(), perf_counters_measurement_ptr);
  }
  // And run one thread here directly.
  // (If we were asked to run just one thread, we don't create new threads.)
  // Yes, we need to do this here *after* we start the separate threads.
  RunInThread(&b, iters, 0, manager.get(), perf_counters_measurement_ptr);

  // The main thread has finished. Now let's wait for the other threads.
  manager->WaitForAllThreads();
  for (std::thread& thread : pool) thread.join();
  thread_runner->RunThreads([&](int thread_idx) {
    RunInThread(&b, iters, thread_idx, manager.get(),
                perf_counters_measurement_ptr, /*profiler_manager=*/nullptr);
  });

  IterationResults i;
  // Acquire the measurements/counters from the manager, UNDER THE LOCK!
@@ -208,12 +338,6 @@ BenchmarkRunner::IterationResults BenchmarkRunner::DoNIterations() {
  // And get rid of the manager.
  manager.reset();

  // Adjust real/manual time stats since they were reported per thread.
  i.results.real_time_used /= b.threads();
  i.results.manual_time_used /= b.threads();
  // If we were measuring whole-process CPU usage, adjust the CPU time too.
  if (b.measure_process_cpu_time()) i.results.cpu_time_used /= b.threads();

  BM_VLOG(2) << "Ran in " << i.results.cpu_time_used << "/"
             << i.results.real_time_used << "\n";
@@ -247,8 +371,8 @@ IterationCount BenchmarkRunner::PredictNumItersNeeded(

  // So what seems to be the sufficiently-large iteration count? Round up.
  const IterationCount max_next_iters = static_cast<IterationCount>(
      std::lround(std::max(multiplier * static_cast<double>(i.iters),
                           static_cast<double>(i.iters) + 1.0)));
      std::llround(std::max(multiplier * static_cast<double>(i.iters),
                            static_cast<double>(i.iters) + 1.0)));
  // But we do have *some* limits though..
  const IterationCount next_iters = std::min(max_next_iters, kMaxIterations);

@@ -261,7 +385,7 @@ bool BenchmarkRunner::ShouldReportIterationResults(
  // Determine if this run should be reported;
  // Either it has run for a sufficient amount of time
  // or because an error was reported.
  return i.results.has_error_ ||
  return (i.results.skipped_ != 0u) || FLAGS_benchmark_dry_run ||
         i.iters >= kMaxIterations || // Too many iterations already.
         i.seconds >=
             GetMinTimeToApply() || // The elapsed time is large enough.
@@ -322,6 +446,34 @@ void BenchmarkRunner::RunWarmUp() {
  }
}
MemoryManager::Result BenchmarkRunner::RunMemoryManager(
    IterationCount memory_iterations) {
  memory_manager->Start();
  std::unique_ptr<internal::ThreadManager> manager;
  manager.reset(new internal::ThreadManager(1));
  b.Setup();
  RunInThread(&b, memory_iterations, 0, manager.get(),
              perf_counters_measurement_ptr,
              /*profiler_manager=*/nullptr);
  manager.reset();
  b.Teardown();
  MemoryManager::Result memory_result;
  memory_manager->Stop(memory_result);
  memory_result.memory_iterations = memory_iterations;
  return memory_result;
}

void BenchmarkRunner::RunProfilerManager(IterationCount profile_iterations) {
  std::unique_ptr<internal::ThreadManager> manager;
  manager.reset(new internal::ThreadManager(1));
  b.Setup();
  RunInThread(&b, profile_iterations, 0, manager.get(),
              /*perf_counters_measurement_ptr=*/nullptr,
              /*profiler_manager=*/profiler_manager);
  manager.reset();
  b.Teardown();
}
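// Both helpers above re-run the benchmark on a single thread outside the timed
// measurement: RunMemoryManager() brackets the extra pass with the registered
// MemoryManager's Start()/Stop() to collect allocation counts, and
// RunProfilerManager() runs it under the registered ProfilerManager so that
// externally collected profiles line up with the measured iteration count.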
void BenchmarkRunner::DoOneRepetition() {
  assert(HasRepeatsRemaining() && "Already done all repetitions?");

@@ -332,7 +484,9 @@ void BenchmarkRunner::DoOneRepetition() {
  // this warmup never happened except the fact that warmup_done is set. Every
  // other manipulation of the BenchmarkRunner instance would be a bug! Please
  // fix it.
  if (!warmup_done) RunWarmUp();
  if (!warmup_done) {
    RunWarmUp();
  }

  IterationResults i;
  // We *may* be gradually increasing the length (iteration count)
@@ -354,8 +508,10 @@ void BenchmarkRunner::DoOneRepetition() {
    const bool results_are_significant = !is_the_first_repetition ||
                                         has_explicit_iteration_count ||
                                         ShouldReportIterationResults(i);

    if (results_are_significant) break; // Good, let's report them!
    // Good, let's report them!
    if (results_are_significant) {
      break;
    }

    // Nope, bad iteration. Let's re-estimate the hopefully-sufficient
    // iteration count, and run the benchmark again...
@@ -366,31 +522,21 @@ void BenchmarkRunner::DoOneRepetition() {
           "then we should have accepted the current iteration run.");
  }

  // Oh, one last thing, we need to also produce the 'memory measurements'..
  MemoryManager::Result* memory_result = nullptr;
  // Produce memory measurements if requested.
  MemoryManager::Result memory_result;
  IterationCount memory_iterations = 0;
  if (memory_manager != nullptr) {
    // TODO(vyng): Consider making BenchmarkReporter::Run::memory_result an
    // optional so we don't have to own the Result here.
    // Can't do it now due to cxx03.
    memory_results.push_back(MemoryManager::Result());
    memory_result = &memory_results.back();
    // Only run a few iterations to reduce the impact of one-time
    // allocations in benchmarks that are not properly managed.
    memory_iterations = std::min<IterationCount>(16, iters);
    memory_manager->Start();
    std::unique_ptr<internal::ThreadManager> manager;
    manager.reset(new internal::ThreadManager(1));
    b.Setup();
    RunInThread(&b, memory_iterations, 0, manager.get(),
                perf_counters_measurement_ptr);
    manager->WaitForAllThreads();
    manager.reset();
    b.Teardown();
    memory_result = RunMemoryManager(memory_iterations);
  }

  BENCHMARK_DISABLE_DEPRECATED_WARNING
    memory_manager->Stop(memory_result);
  BENCHMARK_RESTORE_DEPRECATED_WARNING
  if (profiler_manager != nullptr) {
    // We want to externally profile the benchmark for the same number of
    // iterations because, for example, if we're tracing the benchmark then we
    // want trace data to reasonably match PMU data.
    RunProfilerManager(iters);
  }

  // Ok, now actually report.
@@ -398,9 +544,11 @@ void BenchmarkRunner::DoOneRepetition() {
      CreateRunReport(b, i.results, memory_iterations, memory_result, i.seconds,
                      num_repetitions_done, repeats);

  if (reports_for_family) {
  if (reports_for_family != nullptr) {
    ++reports_for_family->num_runs_done;
    if (!report.error_occurred) reports_for_family->Runs.push_back(report);
    if (report.skipped == 0u) {
      reports_for_family->Runs.push_back(report);
    }
  }

  run_results.non_aggregates.push_back(report);
@@ -15,26 +15,20 @@
#ifndef BENCHMARK_RUNNER_H_
#define BENCHMARK_RUNNER_H_

#include <memory>
#include <thread>
#include <vector>

#include "benchmark_api_internal.h"
#include "internal_macros.h"
#include "perf_counters.h"
#include "thread_manager.h"

namespace benchmark {

BM_DECLARE_double(benchmark_min_time);
BM_DECLARE_double(benchmark_min_warmup_time);
BM_DECLARE_int32(benchmark_repetitions);
BM_DECLARE_bool(benchmark_report_aggregates_only);
BM_DECLARE_bool(benchmark_display_aggregates_only);
BM_DECLARE_string(benchmark_perf_counters);

namespace internal {

extern MemoryManager* memory_manager;
extern ProfilerManager* profiler_manager;

struct RunResults {
  std::vector<BenchmarkReporter::Run> non_aggregates;
@@ -44,9 +38,21 @@ struct RunResults {
  bool file_report_aggregates_only = false;
};

struct BENCHMARK_EXPORT BenchTimeType {
  enum { UNSPECIFIED, ITERS, TIME } tag;
  union {
    IterationCount iters;
    double time;
  };
};
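// BenchTimeType is the parsed form of --benchmark_min_time: tag == ITERS
// carries an explicit iteration count (the "<N>x" spelling), while tag == TIME
// carries seconds (the "<F>s" spelling, or a bare float).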

BENCHMARK_EXPORT
BenchTimeType ParseBenchMinTime(const std::string& value);

class BenchmarkRunner {
 public:
  BenchmarkRunner(const benchmark::internal::BenchmarkInstance& b_,
                  benchmark::internal::PerfCountersMeasurement* pcm_,
                  BenchmarkReporter::PerFamilyRunReports* reports_for_family);

  int GetNumRepeats() const { return repeats; }
@@ -63,12 +69,19 @@ class BenchmarkRunner {
    return reports_for_family;
  }

  double GetMinTime() const { return min_time; }

  bool HasExplicitIters() const { return has_explicit_iteration_count; }

  IterationCount GetIters() const { return iters; }

 private:
  RunResults run_results;

  const benchmark::internal::BenchmarkInstance& b;
  BenchmarkReporter::PerFamilyRunReports* reports_for_family;

  BenchTimeType parsed_benchtime_flag;
  const double min_time;
  const double min_warmup_time;
  bool warmup_done;
@@ -77,16 +90,13 @@ class BenchmarkRunner {

  int num_repetitions_done = 0;

  std::vector<std::thread> pool;

  std::vector<MemoryManager::Result> memory_results;
  std::unique_ptr<ThreadRunnerBase> thread_runner;

  IterationCount iters; // preserved between repetitions!
  // So only the first repetition has to find/calculate it,
  // the other repetitions will just use that precomputed iteration count.

  PerfCountersMeasurement perf_counters_measurement;
  PerfCountersMeasurement* const perf_counters_measurement_ptr;
  PerfCountersMeasurement* const perf_counters_measurement_ptr = nullptr;

  struct IterationResults {
    internal::ThreadManager::Result results;
@@ -95,6 +105,10 @@ class BenchmarkRunner {
  };
  IterationResults DoNIterations();

  MemoryManager::Result RunMemoryManager(IterationCount memory_iterations);

  void RunProfilerManager(IterationCount profile_iterations);

  IterationCount PredictNumItersNeeded(const IterationResults& i) const;

  bool ShouldReportIterationResults(const IterationResults& i) const;
@@ -3,7 +3,10 @@
namespace benchmark {
namespace internal {

static AbortHandlerT* handler = &std::abort;
namespace {
// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
AbortHandlerT* handler = &std::abort;
} // namespace

BENCHMARK_EXPORT AbortHandlerT*& GetAbortHandler() { return handler; }

src/check.h
@@ -4,6 +4,7 @@
#include <cmath>
#include <cstdlib>
#include <ostream>
#include <string_view>

#include "benchmark/export.h"
#include "internal_macros.h"
@@ -36,6 +37,8 @@ AbortHandlerT*& GetAbortHandler();

BENCHMARK_NORETURN inline void CallAbortHandler() {
  GetAbortHandler()();
  std::flush(std::cout);
  std::flush(std::cerr);
  std::abort(); // fallback to enforce noreturn
}

@@ -44,7 +47,8 @@ BENCHMARK_NORETURN inline void CallAbortHandler() {
// destructed.
class CheckHandler {
 public:
  CheckHandler(const char* check, const char* file, const char* func, int line)
  CheckHandler(std::string_view check, std::string_view file,
               std::string_view func, int line)
      : log_(GetErrorLogInstance()) {
    log_ << file << ":" << line << ": " << func << ": Check `" << check
         << "' failed. ";
@@ -57,7 +61,7 @@ class CheckHandler {
#pragma warning(disable : 4722)
#endif
  BENCHMARK_NORETURN ~CheckHandler() BENCHMARK_NOEXCEPT_OP(false) {
    log_ << std::endl;
    log_ << '\n';
    CallAbortHandler();
  }
#if defined(COMPILER_MSVC)
@@ -78,9 +82,11 @@ class CheckHandler {
// The BM_CHECK macro returns a std::ostream object that can have extra
// information written to it.
#ifndef NDEBUG
#define BM_CHECK(b) \
  (b ? ::benchmark::internal::GetNullLogInstance() \
     : ::benchmark::internal::CheckHandler(#b, __FILE__, __func__, __LINE__) \
#define BM_CHECK(b) \
  (b ? ::benchmark::internal::GetNullLogInstance() \
     : ::benchmark::internal::CheckHandler( \
           std::string_view(#b), std::string_view(__FILE__), \
           std::string_view(__func__), __LINE__) \
         .GetLog())
#else
#define BM_CHECK(b) ::benchmark::internal::GetNullLogInstance()
@@ -96,18 +96,18 @@ std::string FormatString(const char* msg, va_list args) {
  // currently there is no error handling for failure, so this is hack.
  BM_CHECK(ret >= 0);

  if (ret == 0) // handle empty expansion
  if (ret == 0) { // handle empty expansion
    return {};
  else if (static_cast<size_t>(ret) < size)
    return local_buff;
  else {
    // we did not provide a long enough buffer on our first attempt.
    size = static_cast<size_t>(ret) + 1; // + 1 for the null byte
    std::unique_ptr<char[]> buff(new char[size]);
    ret = vsnprintf(buff.get(), size, msg, args);
    BM_CHECK(ret > 0 && (static_cast<size_t>(ret)) < size);
    return buff.get();
  }
  if (static_cast<size_t>(ret) < size) {
    return local_buff;
  }
  // we did not provide a long enough buffer on our first attempt.
  size = static_cast<size_t>(ret) + 1; // + 1 for the null byte
  std::unique_ptr<char[]> buff(new char[size]);
  ret = vsnprintf(buff.get(), size, msg, args);
  BM_CHECK(ret > 0 && (static_cast<size_t>(ret)) < size);
  return buff.get();
}

std::string FormatString(const char* msg, ...) {
@@ -135,22 +135,30 @@ void ColorPrintf(std::ostream& out, LogColor color, const char* fmt,
  // Gets the current text color.
  CONSOLE_SCREEN_BUFFER_INFO buffer_info;
  GetConsoleScreenBufferInfo(stdout_handle, &buffer_info);
  const WORD old_color_attrs = buffer_info.wAttributes;
  const WORD original_color_attrs = buffer_info.wAttributes;

  // We need to flush the stream buffers into the console before each
  // SetConsoleTextAttribute call lest it affect the text that is already
  // printed but has not yet reached the console.
  fflush(stdout);
  SetConsoleTextAttribute(stdout_handle,
                          GetPlatformColorCode(color) | FOREGROUND_INTENSITY);
  vprintf(fmt, args);
  out.flush();

  fflush(stdout);
  // Restores the text color.
  SetConsoleTextAttribute(stdout_handle, old_color_attrs);
  const WORD original_background_attrs =
      original_color_attrs & (BACKGROUND_RED | BACKGROUND_GREEN |
                              BACKGROUND_BLUE | BACKGROUND_INTENSITY);

  SetConsoleTextAttribute(stdout_handle, GetPlatformColorCode(color) |
                                             FOREGROUND_INTENSITY |
                                             original_background_attrs);
  out << FormatString(fmt, args);

  out.flush();
  // Restores the text and background color.
  SetConsoleTextAttribute(stdout_handle, original_color_attrs);
#else
  const char* color_code = GetPlatformColorCode(color);
  if (color_code) out << FormatString("\033[0;3%sm", color_code);
  if (color_code != nullptr) {
    out << FormatString("\033[0;3%sm", color_code);
  }
  out << FormatString(fmt, args) << "\033[m";
#endif
}
@@ -163,19 +171,31 @@ bool IsColorTerminal() {
#else
  // On non-Windows platforms, we rely on the TERM variable. This list of
  // supported TERM values is copied from Google Test:
  // <https://github.com/google/googletest/blob/master/googletest/src/gtest.cc#L2925>.
  // <https://github.com/google/googletest/blob/v1.13.0/googletest/src/gtest.cc#L3225-L3259>.
  const char* const SUPPORTED_TERM_VALUES[] = {
      "xterm", "xterm-color", "xterm-256color",
      "screen", "screen-256color", "tmux",
      "tmux-256color", "rxvt-unicode", "rxvt-unicode-256color",
      "linux", "cygwin",
      "xterm",
      "xterm-color",
      "xterm-256color",
      "screen",
      "screen-256color",
      "tmux",
      "tmux-256color",
      "rxvt-unicode",
      "rxvt-unicode-256color",
      "linux",
      "cygwin",
      "xterm-kitty",
      "alacritty",
      "foot",
      "foot-extra",
      "wezterm",
  };

  const char* const term = getenv("TERM");

  bool term_supports_color = false;
  for (const char* candidate : SUPPORTED_TERM_VALUES) {
    if (term && 0 == strcmp(term, candidate)) {
    if ((term != nullptr) && 0 == strcmp(term, candidate)) {
      term_supports_color = true;
      break;
    }
@@ -109,12 +109,13 @@ bool ParseKvPairs(const std::string& src_text, const char* str,
// Returns the name of the environment variable corresponding to the
// given flag. For example, FlagToEnvVar("foo") will return
// "BENCHMARK_FOO" in the open-source version.
static std::string FlagToEnvVar(const char* flag) {
std::string FlagToEnvVar(const char* flag) {
  const std::string flag_str(flag);

  std::string env_var;
  for (size_t i = 0; i != flag_str.length(); ++i)
  for (size_t i = 0; i != flag_str.length(); ++i) {
    env_var += static_cast<char>(::toupper(flag_str.c_str()[i]));
  }

  return env_var;
}
@@ -167,7 +168,9 @@ std::map<std::string, std::string> KvPairsFromEnv(
  const std::string env_var = FlagToEnvVar(flag);
  const char* const value_str = getenv(env_var.c_str());

  if (value_str == nullptr) return default_val;
  if (value_str == nullptr) {
    return default_val;
  }

  std::map<std::string, std::string> value;
  if (!ParseKvPairs("Environment variable " + env_var, value_str, &value)) {
@@ -184,23 +187,31 @@ std::map<std::string, std::string> KvPairsFromEnv(
const char* ParseFlagValue(const char* str, const char* flag,
                           bool def_optional) {
  // str and flag must not be nullptr.
  if (str == nullptr || flag == nullptr) return nullptr;
  if (str == nullptr || flag == nullptr) {
    return nullptr;
  }

  // The flag must start with "--".
  const std::string flag_str = std::string("--") + std::string(flag);
  const size_t flag_len = flag_str.length();
  if (strncmp(str, flag_str.c_str(), flag_len) != 0) return nullptr;
  if (strncmp(str, flag_str.c_str(), flag_len) != 0) {
    return nullptr;
  }

  // Skips the flag name.
  const char* flag_end = str + flag_len;

  // When def_optional is true, it's OK to not have a "=value" part.
  if (def_optional && (flag_end[0] == '\0')) return flag_end;
  if (def_optional && (flag_end[0] == '\0')) {
    return flag_end;
  }

  // If def_optional is true and there are more characters after the
  // flag name, or if def_optional is false, there must be a '=' after
  // the flag name.
  if (flag_end[0] != '=') return nullptr;
  if (flag_end[0] != '=') {
    return nullptr;
  }

  // Returns the string after "=".
  return flag_end + 1;
@@ -212,7 +223,9 @@ bool ParseBoolFlag(const char* str, const char* flag, bool* value) {
  const char* const value_str = ParseFlagValue(str, flag, true);

  // Aborts if the parsing failed.
  if (value_str == nullptr) return false;
  if (value_str == nullptr) {
    return false;
  }

  // Converts the string value to a bool.
  *value = IsTruthyFlagValue(value_str);
@@ -225,7 +238,9 @@ bool ParseInt32Flag(const char* str, const char* flag, int32_t* value) {
  const char* const value_str = ParseFlagValue(str, flag, false);

  // Aborts if the parsing failed.
  if (value_str == nullptr) return false;
  if (value_str == nullptr) {
    return false;
  }

  // Sets *value to the value of the flag.
  return ParseInt32(std::string("The value of flag --") + flag, value_str,
@@ -238,7 +253,9 @@ bool ParseDoubleFlag(const char* str, const char* flag, double* value) {
  const char* const value_str = ParseFlagValue(str, flag, false);

  // Aborts if the parsing failed.
  if (value_str == nullptr) return false;
  if (value_str == nullptr) {
    return false;
  }

  // Sets *value to the value of the flag.
  return ParseDouble(std::string("The value of flag --") + flag, value_str,
@@ -251,7 +268,9 @@ bool ParseStringFlag(const char* str, const char* flag, std::string* value) {
  const char* const value_str = ParseFlagValue(str, flag, false);

  // Aborts if the parsing failed.
  if (value_str == nullptr) return false;
  if (value_str == nullptr) {
    return false;
  }

  *value = value_str;
  return true;
@@ -262,11 +281,15 @@ bool ParseKeyValueFlag(const char* str, const char* flag,
                       std::map<std::string, std::string>* value) {
  const char* const value_str = ParseFlagValue(str, flag, false);

  if (value_str == nullptr) return false;
  if (value_str == nullptr) {
    return false;
  }

  for (const auto& kvpair : StrSplit(value_str, ',')) {
    const auto kv = StrSplit(kvpair, '=');
    if (kv.size() != 2) return false;
    if (kv.size() != 2) {
      return false;
    }
    value->emplace(kv[0], kv[1]);
  }

@@ -284,14 +307,15 @@ bool IsTruthyFlagValue(const std::string& value) {
    char v = value[0];
    return isalnum(v) &&
           !(v == '0' || v == 'f' || v == 'F' || v == 'n' || v == 'N');
  } else if (!value.empty()) {
  }
  if (!value.empty()) {
    std::string value_lower(value);
std::transform(value_lower.begin(), value_lower.end(), value_lower.begin(),
|
||||
[](char c) { return static_cast<char>(::tolower(c)); });
|
||||
return !(value_lower == "false" || value_lower == "no" ||
|
||||
value_lower == "off");
|
||||
} else
|
||||
return true;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
} // end namespace benchmark
|
||||
|
@ -11,14 +11,17 @@
|
||||
#define FLAG(name) FLAGS_##name
|
||||
|
||||
// Macros for declaring flags.
|
||||
// NOLINTBEGIN(cppcoreguidelines-avoid-non-const-global-variables)
|
||||
#define BM_DECLARE_bool(name) BENCHMARK_EXPORT extern bool FLAG(name)
|
||||
#define BM_DECLARE_int32(name) BENCHMARK_EXPORT extern int32_t FLAG(name)
|
||||
#define BM_DECLARE_double(name) BENCHMARK_EXPORT extern double FLAG(name)
|
||||
#define BM_DECLARE_string(name) BENCHMARK_EXPORT extern std::string FLAG(name)
|
||||
#define BM_DECLARE_kvpairs(name) \
|
||||
BENCHMARK_EXPORT extern std::map<std::string, std::string> FLAG(name)
|
||||
// NOLINTEND(cppcoreguidelines-avoid-non-const-global-variables)
|
||||
|
||||
// Macros for defining flags.
|
||||
// NOLINTBEGIN(cppcoreguidelines-avoid-non-const-global-variables)
|
||||
#define BM_DEFINE_bool(name, default_val) \
|
||||
BENCHMARK_EXPORT bool FLAG(name) = benchmark::BoolFromEnv(#name, default_val)
|
||||
#define BM_DEFINE_int32(name, default_val) \
|
||||
@ -33,6 +36,7 @@
|
||||
#define BM_DEFINE_kvpairs(name, default_val) \
|
||||
BENCHMARK_EXPORT std::map<std::string, std::string> FLAG(name) = \
|
||||
benchmark::KvPairsFromEnv(#name, default_val)
|
||||
// NOLINTEND(cppcoreguidelines-avoid-non-const-global-variables)
|
||||
|
||||
namespace benchmark {
|
||||
|
||||
|
@ -27,7 +27,6 @@ namespace benchmark {
|
||||
|
||||
// Internal function to calculate the different scalability forms
|
||||
BigOFunc* FittingCurve(BigO complexity) {
|
||||
static const double kLog2E = 1.44269504088896340736;
|
||||
switch (complexity) {
|
||||
case oN:
|
||||
return [](IterationCount n) -> double { return static_cast<double>(n); };
|
||||
@ -36,13 +35,12 @@ BigOFunc* FittingCurve(BigO complexity) {
|
||||
case oNCubed:
|
||||
return [](IterationCount n) -> double { return std::pow(n, 3); };
|
||||
case oLogN:
|
||||
/* Note: can't use log2 because Android's GNU STL lacks it */
|
||||
return
|
||||
[](IterationCount n) { return kLog2E * log(static_cast<double>(n)); };
|
||||
return [](IterationCount n) -> double {
|
||||
return std::log2(static_cast<double>(n));
|
||||
};
|
||||
case oNLogN:
|
||||
/* Note: can't use log2 because Android's GNU STL lacks it */
|
||||
return [](IterationCount n) {
|
||||
return kLog2E * n * log(static_cast<double>(n));
|
||||
return [](IterationCount n) -> double {
|
||||
return static_cast<double>(n) * std::log2(static_cast<double>(n));
|
||||
};
|
||||
case o1:
|
||||
default:
|
||||
@ -75,12 +73,12 @@ std::string GetBigOString(BigO complexity) {
|
||||
// given by the lambda expression.
|
||||
// - n : Vector containing the size of the benchmark tests.
|
||||
// - time : Vector containing the times for the benchmark tests.
|
||||
// - fitting_curve : lambda expression (e.g. [](int64_t n) {return n; };).
|
||||
// - fitting_curve : lambda expression (e.g. [](ComplexityN n) {return n; };).
|
||||
|
||||
// For a deeper explanation on the algorithm logic, please refer to
|
||||
// https://en.wikipedia.org/wiki/Least_squares#Least_squares,_regression_analysis_and_statistics
|
||||
|
||||
LeastSq MinimalLeastSq(const std::vector<int64_t>& n,
|
||||
LeastSq MinimalLeastSq(const std::vector<ComplexityN>& n,
|
||||
const std::vector<double>& time,
|
||||
BigOFunc* fitting_curve) {
|
||||
double sigma_gn_squared = 0.0;
|
||||
@ -105,12 +103,12 @@ LeastSq MinimalLeastSq(const std::vector<int64_t>& n,
|
||||
double rms = 0.0;
|
||||
for (size_t i = 0; i < n.size(); ++i) {
|
||||
double fit = result.coef * fitting_curve(n[i]);
|
||||
rms += pow((time[i] - fit), 2);
|
||||
rms += std::pow((time[i] - fit), 2);
|
||||
}
|
||||
|
||||
// Normalized RMS by the mean of the observed values
|
||||
double mean = sigma_time / n.size();
|
||||
result.rms = sqrt(rms / n.size()) / mean;
|
||||
double mean = sigma_time / static_cast<double>(n.size());
|
||||
result.rms = std::sqrt(rms / static_cast<double>(n.size())) / mean;
|
||||
|
||||
return result;
|
||||
}
|
||||
@ -122,7 +120,7 @@ LeastSq MinimalLeastSq(const std::vector<int64_t>& n,
|
||||
// - complexity : If different than oAuto, the fitting curve will stick to
|
||||
// this one. If it is oAuto, it will be calculated the best
|
||||
// fitting curve.
|
||||
LeastSq MinimalLeastSq(const std::vector<int64_t>& n,
|
||||
LeastSq MinimalLeastSq(const std::vector<ComplexityN>& n,
|
||||
const std::vector<double>& time, const BigO complexity) {
|
||||
BM_CHECK_EQ(n.size(), time.size());
|
||||
BM_CHECK_GE(n.size(), 2); // Do not compute fitting curve is less than two
|
||||
@ -159,10 +157,12 @@ std::vector<BenchmarkReporter::Run> ComputeBigO(
|
||||
typedef BenchmarkReporter::Run Run;
|
||||
std::vector<Run> results;
|
||||
|
||||
if (reports.size() < 2) return results;
|
||||
if (reports.size() < 2) {
|
||||
return results;
|
||||
}
|
||||
|
||||
// Accumulators.
|
||||
std::vector<int64_t> n;
|
||||
std::vector<ComplexityN> n;
|
||||
std::vector<double> real_time;
|
||||
std::vector<double> cpu_time;
|
||||
|
||||
@ -171,8 +171,10 @@ std::vector<BenchmarkReporter::Run> ComputeBigO(
|
||||
BM_CHECK_GT(run.complexity_n, 0)
|
||||
<< "Did you forget to call SetComplexityN?";
|
||||
n.push_back(run.complexity_n);
|
||||
real_time.push_back(run.real_accumulated_time / run.iterations);
|
||||
cpu_time.push_back(run.cpu_accumulated_time / run.iterations);
|
||||
real_time.push_back(run.real_accumulated_time /
|
||||
static_cast<double>(run.iterations));
|
||||
cpu_time.push_back(run.cpu_accumulated_time /
|
||||
static_cast<double>(run.iterations));
|
||||
}
|
||||
|
||||
LeastSq result_cpu;
|
||||
@ -182,8 +184,19 @@ std::vector<BenchmarkReporter::Run> ComputeBigO(
|
||||
result_cpu = MinimalLeastSq(n, cpu_time, reports[0].complexity_lambda);
|
||||
result_real = MinimalLeastSq(n, real_time, reports[0].complexity_lambda);
|
||||
} else {
|
||||
result_cpu = MinimalLeastSq(n, cpu_time, reports[0].complexity);
|
||||
result_real = MinimalLeastSq(n, real_time, result_cpu.complexity);
|
||||
const BigO* InitialBigO = &reports[0].complexity;
|
||||
const bool use_real_time_for_initial_big_o =
|
||||
reports[0].use_real_time_for_initial_big_o;
|
||||
if (use_real_time_for_initial_big_o) {
|
||||
result_real = MinimalLeastSq(n, real_time, *InitialBigO);
|
||||
InitialBigO = &result_real.complexity;
|
||||
// The Big-O complexity for CPU time must have the same Big-O function!
|
||||
}
|
||||
result_cpu = MinimalLeastSq(n, cpu_time, *InitialBigO);
|
||||
InitialBigO = &result_cpu.complexity;
|
||||
if (!use_real_time_for_initial_big_o) {
|
||||
result_real = MinimalLeastSq(n, real_time, *InitialBigO);
|
||||
}
|
||||
}
|
||||
|
||||
// Drop the 'args' when reporting complexity.
|
||||
|
@ -31,7 +31,7 @@ std::vector<BenchmarkReporter::Run> ComputeBigO(
|
||||
const std::vector<BenchmarkReporter::Run>& reports);
|
||||
|
||||
// This data structure will contain the result returned by MinimalLeastSq
|
||||
// - coef : Estimated coeficient for the high-order term as
|
||||
// - coef : Estimated coefficient for the high-order term as
|
||||
// interpolated from data.
|
||||
// - rms : Normalized Root Mean Squared Error.
|
||||
// - complexity : Scalability form (e.g. oN, oNLogN). In case a scalability
|
||||
|
@ -42,11 +42,15 @@ bool ConsoleReporter::ReportContext(const Context& context) {
|
||||
PrintBasicContext(&GetErrorStream(), context);
|
||||
|
||||
#ifdef BENCHMARK_OS_WINDOWS
|
||||
if ((output_options_ & OO_Color) && &std::cout != &GetOutputStream()) {
|
||||
GetErrorStream()
|
||||
<< "Color printing is only supported for stdout on windows."
|
||||
" Disabling color printing\n";
|
||||
output_options_ = static_cast<OutputOptions>(output_options_ & ~OO_Color);
|
||||
if ((output_options_ & OO_Color)) {
|
||||
auto stdOutBuf = std::cout.rdbuf();
|
||||
auto outStreamBuf = GetOutputStream().rdbuf();
|
||||
if (stdOutBuf != outStreamBuf) {
|
||||
GetErrorStream()
|
||||
<< "Color printing is only supported for stdout on windows."
|
||||
" Disabling color printing\n";
|
||||
output_options_ = static_cast<OutputOptions>(output_options_ & ~OO_Color);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
@ -59,7 +63,7 @@ void ConsoleReporter::PrintHeader(const Run& run) {
|
||||
FormatString("%-*s %13s %15s %12s", static_cast<int>(name_field_width_),
|
||||
"Benchmark", "Time", "CPU", "Iterations");
|
||||
if (!run.counters.empty()) {
|
||||
if (output_options_ & OO_Tabular) {
|
||||
if ((output_options_ & OO_Tabular) != 0) {
|
||||
for (auto const& c : run.counters) {
|
||||
str += FormatString(" %10s", c.first.c_str());
|
||||
}
|
||||
@ -79,7 +83,7 @@ void ConsoleReporter::ReportRuns(const std::vector<Run>& reports) {
|
||||
bool print_header = !printed_header_;
|
||||
// --- or if the format is tabular and this run
|
||||
// has different fields from the prev header
|
||||
print_header |= (output_options_ & OO_Tabular) &&
|
||||
print_header |= ((output_options_ & OO_Tabular) != 0) &&
|
||||
(!internal::SameNames(run.counters, prev_counters_));
|
||||
if (print_header) {
|
||||
printed_header_ = true;
|
||||
@ -93,8 +97,8 @@ void ConsoleReporter::ReportRuns(const std::vector<Run>& reports) {
|
||||
}
|
||||
}
|
||||
|
||||
static void IgnoreColorPrint(std::ostream& out, LogColor, const char* fmt,
|
||||
...) {
|
||||
static void IgnoreColorPrint(std::ostream& out, LogColor /*unused*/,
|
||||
const char* fmt, ...) {
|
||||
va_list args;
|
||||
va_start(args, fmt);
|
||||
out << FormatString(fmt, args);
|
||||
@ -115,7 +119,7 @@ static std::string FormatTime(double time) {
|
||||
if (time < 100.0) {
|
||||
return FormatString("%10.1f", time);
|
||||
}
|
||||
// Assuming the time ist at max 9.9999e+99 and we have 10 digits for the
|
||||
// Assuming the time is at max 9.9999e+99 and we have 10 digits for the
|
||||
// number, we get 10-1(.)-1(e)-1(sign)-2(exponent) = 5 digits to print.
|
||||
if (time > 9999999999 /*max 10 digit number*/) {
|
||||
return FormatString("%1.4e", time);
|
||||
@ -127,7 +131,7 @@ BENCHMARK_EXPORT
|
||||
void ConsoleReporter::PrintRunData(const Run& result) {
|
||||
typedef void(PrinterFn)(std::ostream&, LogColor, const char*, ...);
|
||||
auto& Out = GetOutputStream();
|
||||
PrinterFn* printer = (output_options_ & OO_Color)
|
||||
PrinterFn* printer = (output_options_ & OO_Color) != 0
|
||||
? static_cast<PrinterFn*>(ColorPrintf)
|
||||
: IgnoreColorPrint;
|
||||
auto name_color =
|
||||
@ -135,9 +139,14 @@ void ConsoleReporter::PrintRunData(const Run& result) {
|
||||
printer(Out, name_color, "%-*s ", name_field_width_,
|
||||
result.benchmark_name().c_str());
|
||||
|
||||
if (result.error_occurred) {
|
||||
if (internal::SkippedWithError == result.skipped) {
|
||||
printer(Out, COLOR_RED, "ERROR OCCURRED: \'%s\'",
|
||||
result.error_message.c_str());
|
||||
result.skip_message.c_str());
|
||||
printer(Out, COLOR_DEFAULT, "\n");
|
||||
return;
|
||||
}
|
||||
if (internal::SkippedWithMessage == result.skipped) {
|
||||
printer(Out, COLOR_WHITE, "SKIPPED: \'%s\'", result.skip_message.c_str());
|
||||
printer(Out, COLOR_DEFAULT, "\n");
|
||||
return;
|
||||
}
|
||||
@ -170,9 +179,9 @@ void ConsoleReporter::PrintRunData(const Run& result) {
|
||||
printer(Out, COLOR_CYAN, "%10lld", result.iterations);
|
||||
}
|
||||
|
||||
for (auto& c : result.counters) {
|
||||
for (const auto& c : result.counters) {
|
||||
const std::size_t cNameLen =
|
||||
std::max(std::string::size_type(10), c.first.length());
|
||||
std::max(static_cast<std::size_t>(10), c.first.length());
|
||||
std::string s;
|
||||
const char* unit = "";
|
||||
if (result.run_type == Run::RT_Aggregate &&
|
||||
@ -181,10 +190,11 @@ void ConsoleReporter::PrintRunData(const Run& result) {
|
||||
unit = "%";
|
||||
} else {
|
||||
s = HumanReadableNumber(c.second.value, c.second.oneK);
|
||||
if (c.second.flags & Counter::kIsRate)
|
||||
unit = (c.second.flags & Counter::kInvert) ? "s" : "/s";
|
||||
if ((c.second.flags & Counter::kIsRate) != 0) {
|
||||
unit = (c.second.flags & Counter::kInvert) != 0 ? "s" : "/s";
|
||||
}
|
||||
}
|
||||
if (output_options_ & OO_Tabular) {
|
||||
if ((output_options_ & OO_Tabular) != 0) {
|
||||
printer(Out, COLOR_DEFAULT, " %*s%s", cNameLen - strlen(unit), s.c_str(),
|
||||
unit);
|
||||
} else {
|
||||
|
@ -20,20 +20,20 @@ namespace internal {
|
||||
double Finish(Counter const& c, IterationCount iterations, double cpu_time,
|
||||
double num_threads) {
|
||||
double v = c.value;
|
||||
if (c.flags & Counter::kIsRate) {
|
||||
if ((c.flags & Counter::kIsRate) != 0) {
|
||||
v /= cpu_time;
|
||||
}
|
||||
if (c.flags & Counter::kAvgThreads) {
|
||||
if ((c.flags & Counter::kAvgThreads) != 0) {
|
||||
v /= num_threads;
|
||||
}
|
||||
if (c.flags & Counter::kIsIterationInvariant) {
|
||||
v *= iterations;
|
||||
if ((c.flags & Counter::kIsIterationInvariant) != 0) {
|
||||
v *= static_cast<double>(iterations);
|
||||
}
|
||||
if (c.flags & Counter::kAvgIterations) {
|
||||
v /= iterations;
|
||||
if ((c.flags & Counter::kAvgIterations) != 0) {
|
||||
v /= static_cast<double>(iterations);
|
||||
}
|
||||
|
||||
if (c.flags & Counter::kInvert) { // Invert is *always* last.
|
||||
if ((c.flags & Counter::kInvert) != 0) { // Invert is *always* last.
|
||||
v = 1.0 / v;
|
||||
}
|
||||
return v;
|
||||
@ -64,7 +64,9 @@ void Increment(UserCounters* l, UserCounters const& r) {
|
||||
}
|
||||
|
||||
bool SameNames(UserCounters const& l, UserCounters const& r) {
|
||||
if (&l == &r) return true;
|
||||
if (&l == &r) {
|
||||
return true;
|
||||
}
|
||||
if (l.size() != r.size()) {
|
||||
return false;
|
||||
}
|
||||
|
@ -66,8 +66,10 @@ void CSVReporter::ReportRuns(const std::vector<Run>& reports) {
|
||||
// save the names of all the user counters
|
||||
for (const auto& run : reports) {
|
||||
for (const auto& cnt : run.counters) {
|
||||
if (cnt.first == "bytes_per_second" || cnt.first == "items_per_second")
|
||||
if (cnt.first == "bytes_per_second" ||
|
||||
cnt.first == "items_per_second") {
|
||||
continue;
|
||||
}
|
||||
user_counter_names_.insert(cnt.first);
|
||||
}
|
||||
}
|
||||
@ -75,7 +77,9 @@ void CSVReporter::ReportRuns(const std::vector<Run>& reports) {
|
||||
// print the header
|
||||
for (auto B = elements.begin(); B != elements.end();) {
|
||||
Out << *B++;
|
||||
if (B != elements.end()) Out << ",";
|
||||
if (B != elements.end()) {
|
||||
Out << ",";
|
||||
}
|
||||
}
|
||||
for (auto B = user_counter_names_.begin();
|
||||
B != user_counter_names_.end();) {
|
||||
@ -88,8 +92,10 @@ void CSVReporter::ReportRuns(const std::vector<Run>& reports) {
|
||||
// check that all the current counters are saved in the name set
|
||||
for (const auto& run : reports) {
|
||||
for (const auto& cnt : run.counters) {
|
||||
if (cnt.first == "bytes_per_second" || cnt.first == "items_per_second")
|
||||
if (cnt.first == "bytes_per_second" ||
|
||||
cnt.first == "items_per_second") {
|
||||
continue;
|
||||
}
|
||||
BM_CHECK(user_counter_names_.find(cnt.first) !=
|
||||
user_counter_names_.end())
|
||||
<< "All counters must be present in each run. "
|
||||
@ -109,10 +115,10 @@ BENCHMARK_EXPORT
|
||||
void CSVReporter::PrintRunData(const Run& run) {
|
||||
std::ostream& Out = GetOutputStream();
|
||||
Out << CsvEscape(run.benchmark_name()) << ",";
|
||||
if (run.error_occurred) {
|
||||
if (run.skipped != 0u) {
|
||||
Out << std::string(elements.size() - 3, ',');
|
||||
Out << "true,";
|
||||
Out << CsvEscape(run.error_message) << "\n";
|
||||
Out << std::boolalpha << (internal::SkippedWithError == run.skipped) << ",";
|
||||
Out << CsvEscape(run.skip_message) << "\n";
|
||||
return;
|
||||
}
|
||||
|
||||
@ -122,13 +128,21 @@ void CSVReporter::PrintRunData(const Run& run) {
|
||||
}
|
||||
Out << ",";
|
||||
|
||||
Out << run.GetAdjustedRealTime() << ",";
|
||||
Out << run.GetAdjustedCPUTime() << ",";
|
||||
if (run.run_type != Run::RT_Aggregate ||
|
||||
run.aggregate_unit == StatisticUnit::kTime) {
|
||||
Out << run.GetAdjustedRealTime() << ",";
|
||||
Out << run.GetAdjustedCPUTime() << ",";
|
||||
} else {
|
||||
assert(run.aggregate_unit == StatisticUnit::kPercentage);
|
||||
Out << run.real_accumulated_time << ",";
|
||||
Out << run.cpu_accumulated_time << ",";
|
||||
}
|
||||
|
||||
// Do not print timeLabel on bigO and RMS report
|
||||
if (run.report_big_o) {
|
||||
Out << GetBigOString(run.complexity);
|
||||
} else if (!run.report_rms) {
|
||||
} else if (!run.report_rms &&
|
||||
run.aggregate_unit != StatisticUnit::kPercentage) {
|
||||
Out << GetTimeUnitString(run.time_unit);
|
||||
}
|
||||
Out << ",";
|
||||
|
@ -36,7 +36,8 @@
|
||||
// declarations of some other intrinsics, breaking compilation.
|
||||
// Therefore, we simply declare __rdtsc ourselves. See also
|
||||
// http://connect.microsoft.com/VisualStudio/feedback/details/262047
|
||||
#if defined(COMPILER_MSVC) && !defined(_M_IX86) && !defined(_M_ARM64)
|
||||
#if defined(COMPILER_MSVC) && !defined(_M_IX86) && !defined(_M_ARM64) && \
|
||||
!defined(_M_ARM64EC)
|
||||
extern "C" uint64_t __rdtsc();
|
||||
#pragma intrinsic(__rdtsc)
|
||||
#endif
|
||||
@ -69,7 +70,7 @@ inline BENCHMARK_ALWAYS_INLINE int64_t Now() {
|
||||
// frequency scaling). Also note that when the Mac sleeps, this
|
||||
// counter pauses; it does not continue counting, nor does it
|
||||
// reset to zero.
|
||||
return mach_absolute_time();
|
||||
return static_cast<int64_t>(mach_absolute_time());
|
||||
#elif defined(BENCHMARK_OS_EMSCRIPTEN)
|
||||
// this goes above x86-specific code because old versions of Emscripten
|
||||
// define __x86_64__, although they have nothing to do with it.
|
||||
@ -81,7 +82,7 @@ inline BENCHMARK_ALWAYS_INLINE int64_t Now() {
|
||||
#elif defined(__x86_64__) || defined(__amd64__)
|
||||
uint64_t low, high;
|
||||
__asm__ volatile("rdtsc" : "=a"(low), "=d"(high));
|
||||
return (high << 32) | low;
|
||||
return static_cast<int64_t>((high << 32) | low);
|
||||
#elif defined(__powerpc__) || defined(__ppc__)
|
||||
// This returns a time-base, which is not always precisely a cycle-count.
|
||||
#if defined(__powerpc64__) || defined(__ppc64__)
|
||||
@ -114,7 +115,7 @@ inline BENCHMARK_ALWAYS_INLINE int64_t Now() {
|
||||
// when I know it will work. Otherwise, I'll use __rdtsc and hope
|
||||
// the code is being compiled with a non-ancient compiler.
|
||||
_asm rdtsc
|
||||
#elif defined(COMPILER_MSVC) && defined(_M_ARM64)
|
||||
#elif defined(COMPILER_MSVC) && (defined(_M_ARM64) || defined(_M_ARM64EC))
|
||||
// See // https://docs.microsoft.com/en-us/cpp/intrinsics/arm64-intrinsics
|
||||
// and https://reviews.llvm.org/D53115
|
||||
int64_t virtual_timer_value;
|
||||
@ -180,33 +181,36 @@ inline BENCHMARK_ALWAYS_INLINE int64_t Now() {
|
||||
#elif defined(__s390__) // Covers both s390 and s390x.
|
||||
// Return the CPU clock.
|
||||
uint64_t tsc;
|
||||
#if defined(BENCHMARK_OS_ZOS) && defined(COMPILER_IBMXL)
|
||||
// z/OS XL compiler HLASM syntax.
|
||||
#if defined(BENCHMARK_OS_ZOS)
|
||||
// z/OS HLASM syntax.
|
||||
asm(" stck %0" : "=m"(tsc) : : "cc");
|
||||
#else
|
||||
// Linux on Z syntax.
|
||||
asm("stck %0" : "=Q"(tsc) : : "cc");
|
||||
#endif
|
||||
return tsc;
|
||||
#elif defined(__riscv) // RISC-V
|
||||
// Use RDCYCLE (and RDCYCLEH on riscv32)
|
||||
// Use RDTIME (and RDTIMEH on riscv32).
|
||||
// RDCYCLE is a privileged instruction since Linux 6.6.
|
||||
#if __riscv_xlen == 32
|
||||
uint32_t cycles_lo, cycles_hi0, cycles_hi1;
|
||||
// This asm also includes the PowerPC overflow handling strategy, as above.
|
||||
// Implemented in assembly because Clang insisted on branching.
|
||||
asm volatile(
|
||||
"rdcycleh %0\n"
|
||||
"rdcycle %1\n"
|
||||
"rdcycleh %2\n"
|
||||
"rdtimeh %0\n"
|
||||
"rdtime %1\n"
|
||||
"rdtimeh %2\n"
|
||||
"sub %0, %0, %2\n"
|
||||
"seqz %0, %0\n"
|
||||
"sub %0, zero, %0\n"
|
||||
"and %1, %1, %0\n"
|
||||
: "=r"(cycles_hi0), "=r"(cycles_lo), "=r"(cycles_hi1));
|
||||
return (static_cast<uint64_t>(cycles_hi1) << 32) | cycles_lo;
|
||||
return static_cast<int64_t>((static_cast<uint64_t>(cycles_hi1) << 32) |
|
||||
cycles_lo);
|
||||
#else
|
||||
uint64_t cycles;
|
||||
asm volatile("rdcycle %0" : "=r"(cycles));
|
||||
return cycles;
|
||||
asm volatile("rdtime %0" : "=r"(cycles));
|
||||
return static_cast<int64_t>(cycles);
|
||||
#endif
|
||||
#elif defined(__e2k__) || defined(__elbrus__)
|
||||
struct timeval tv;
|
||||
@ -215,11 +219,33 @@ inline BENCHMARK_ALWAYS_INLINE int64_t Now() {
|
||||
#elif defined(__hexagon__)
|
||||
uint64_t pcycle;
|
||||
asm volatile("%0 = C15:14" : "=r"(pcycle));
|
||||
return static_cast<double>(pcycle);
|
||||
return static_cast<int64_t>(pcycle);
|
||||
#elif defined(__alpha__)
|
||||
// Alpha has a cycle counter, the PCC register, but it is an unsigned 32-bit
|
||||
// integer and thus wraps every ~4s, making using it for tick counts
|
||||
// unreliable beyond this time range. The real-time clock is low-precision,
|
||||
// roughtly ~1ms, but it is the only option that can reasonable count
|
||||
// indefinitely.
|
||||
struct timeval tv;
|
||||
gettimeofday(&tv, nullptr);
|
||||
return static_cast<int64_t>(tv.tv_sec) * 1000000 + tv.tv_usec;
|
||||
#elif defined(__hppa__) || defined(__linux__)
|
||||
// Fallback for all other architectures with a recent Linux kernel, e.g.:
|
||||
// HP PA-RISC provides a user-readable clock counter (cr16), but
|
||||
// it's not syncronized across CPUs and only 32-bit wide when programs
|
||||
// are built as 32-bit binaries.
|
||||
// Same for SH-4 and possibly others.
|
||||
// Use clock_gettime(CLOCK_MONOTONIC, ...) instead of gettimeofday
|
||||
// because is provides nanosecond resolution.
|
||||
// Initialize to always return 0 if clock_gettime fails.
|
||||
struct timespec ts = {0, 0};
|
||||
clock_gettime(CLOCK_MONOTONIC, &ts);
|
||||
return static_cast<int64_t>(ts.tv_sec) * 1000000000 + ts.tv_nsec;
|
||||
#else
|
||||
// The soft failover to a generic implementation is automatic only for ARM.
|
||||
// For other platforms the developer is expected to make an attempt to create
|
||||
// a fast implementation and use generic version if nothing better is available.
|
||||
// The soft failover to a generic implementation is automatic only for ARM.
|
||||
// For other platforms the developer is expected to make an attempt to create
|
||||
// a fast implementation and use generic version if nothing better is
|
||||
// available.
|
||||
#error You need to define CycleTimer for your OS and CPU
|
||||
#endif
|
||||
}
|
||||
|
@ -11,11 +11,7 @@
|
||||
#endif
|
||||
|
||||
#if defined(__clang__)
|
||||
#if defined(__ibmxl__)
|
||||
#if !defined(COMPILER_IBMXL)
|
||||
#define COMPILER_IBMXL
|
||||
#endif
|
||||
#elif !defined(COMPILER_CLANG)
|
||||
#if !defined(COMPILER_CLANG)
|
||||
#define COMPILER_CLANG
|
||||
#endif
|
||||
#elif defined(_MSC_VER)
|
||||
@ -42,6 +38,12 @@
|
||||
#define BENCHMARK_OS_CYGWIN 1
|
||||
#elif defined(_WIN32)
|
||||
#define BENCHMARK_OS_WINDOWS 1
|
||||
// WINAPI_FAMILY_PARTITION is defined in winapifamily.h.
|
||||
// We include windows.h which implicitly includes winapifamily.h for compatibility.
|
||||
#ifndef NOMINMAX
|
||||
#define NOMINMAX
|
||||
#endif
|
||||
#include <windows.h>
|
||||
#if defined(WINAPI_FAMILY_PARTITION)
|
||||
#if WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP)
|
||||
#define BENCHMARK_OS_WINDOWS_WIN32 1
|
||||
|
@ -85,15 +85,19 @@ std::string FormatKV(std::string const& key, int64_t value) {
|
||||
return ss.str();
|
||||
}
|
||||
|
||||
std::string FormatKV(std::string const& key, int value) {
|
||||
return FormatKV(key, static_cast<int64_t>(value));
|
||||
}
|
||||
|
||||
std::string FormatKV(std::string const& key, double value) {
|
||||
std::stringstream ss;
|
||||
ss << '"' << StrEscape(key) << "\": ";
|
||||
|
||||
if (std::isnan(value))
|
||||
if (std::isnan(value)) {
|
||||
ss << (value < 0 ? "-" : "") << "NaN";
|
||||
else if (std::isinf(value))
|
||||
} else if (std::isinf(value)) {
|
||||
ss << (value < 0 ? "-" : "") << "Infinity";
|
||||
else {
|
||||
} else {
|
||||
const auto max_digits10 =
|
||||
std::numeric_limits<decltype(value)>::max_digits10;
|
||||
const auto max_fractional_digits10 = max_digits10 - 1;
|
||||
@ -122,7 +126,7 @@ bool JSONReporter::ReportContext(const Context& context) {
|
||||
|
||||
out << indent << FormatKV("host_name", context.sys_info.name) << ",\n";
|
||||
|
||||
if (Context::executable_name) {
|
||||
if (Context::executable_name != nullptr) {
|
||||
out << indent << FormatKV("executable", Context::executable_name) << ",\n";
|
||||
}
|
||||
|
||||
@ -136,7 +140,7 @@ bool JSONReporter::ReportContext(const Context& context) {
|
||||
if (CPUInfo::Scaling::UNKNOWN != info.scaling) {
|
||||
out << indent
|
||||
<< FormatKV("cpu_scaling_enabled",
|
||||
info.scaling == CPUInfo::Scaling::ENABLED ? true : false)
|
||||
info.scaling == CPUInfo::Scaling::ENABLED)
|
||||
<< ",\n";
|
||||
}
|
||||
|
||||
@ -144,7 +148,7 @@ bool JSONReporter::ReportContext(const Context& context) {
|
||||
indent = std::string(6, ' ');
|
||||
std::string cache_indent(8, ' ');
|
||||
for (size_t i = 0; i < info.caches.size(); ++i) {
|
||||
auto& CI = info.caches[i];
|
||||
const auto& CI = info.caches[i];
|
||||
out << indent << "{\n";
|
||||
out << cache_indent << FormatKV("type", CI.type) << ",\n";
|
||||
out << cache_indent << FormatKV("level", static_cast<int64_t>(CI.level))
|
||||
@ -155,7 +159,9 @@ bool JSONReporter::ReportContext(const Context& context) {
|
||||
<< FormatKV("num_sharing", static_cast<int64_t>(CI.num_sharing))
|
||||
<< "\n";
|
||||
out << indent << "}";
|
||||
if (i != info.caches.size() - 1) out << ",";
|
||||
if (i != info.caches.size() - 1) {
|
||||
out << ",";
|
||||
}
|
||||
out << "\n";
|
||||
}
|
||||
indent = std::string(4, ' ');
|
||||
@ -163,16 +169,25 @@ bool JSONReporter::ReportContext(const Context& context) {
|
||||
out << indent << "\"load_avg\": [";
|
||||
for (auto it = info.load_avg.begin(); it != info.load_avg.end();) {
|
||||
out << *it++;
|
||||
if (it != info.load_avg.end()) out << ",";
|
||||
if (it != info.load_avg.end()) {
|
||||
out << ",";
|
||||
}
|
||||
}
|
||||
out << "],\n";
|
||||
|
||||
out << indent << FormatKV("library_version", GetBenchmarkVersion());
|
||||
out << ",\n";
|
||||
|
||||
#if defined(NDEBUG)
|
||||
const char build_type[] = "release";
|
||||
#else
|
||||
const char build_type[] = "debug";
|
||||
#endif
|
||||
out << indent << FormatKV("library_build_type", build_type);
|
||||
out << ",\n";
|
||||
|
||||
// NOTE: our json schema is not strictly tied to the library version!
|
||||
out << indent << FormatKV("json_schema_version", 1);
|
||||
|
||||
std::map<std::string, std::string>* global_context =
|
||||
internal::GetGlobalContext();
|
||||
@ -254,9 +269,12 @@ void JSONReporter::PrintRunData(Run const& run) {
|
||||
BENCHMARK_UNREACHABLE();
|
||||
}()) << ",\n";
|
||||
}
|
||||
if (run.error_occurred) {
|
||||
out << indent << FormatKV("error_occurred", run.error_occurred) << ",\n";
|
||||
out << indent << FormatKV("error_message", run.error_message) << ",\n";
|
||||
if (internal::SkippedWithError == run.skipped) {
|
||||
out << indent << FormatKV("error_occurred", true) << ",\n";
|
||||
out << indent << FormatKV("error_message", run.skip_message) << ",\n";
|
||||
} else if (internal::SkippedWithMessage == run.skipped) {
|
||||
out << indent << FormatKV("skipped", true) << ",\n";
|
||||
out << indent << FormatKV("skip_message", run.skip_message) << ",\n";
|
||||
}
|
||||
if (!run.report_big_o && !run.report_rms) {
|
||||
out << indent << FormatKV("iterations", run.iterations) << ",\n";
|
||||
@ -284,19 +302,21 @@ void JSONReporter::PrintRunData(Run const& run) {
|
||||
out << indent << FormatKV("rms", run.GetAdjustedCPUTime());
|
||||
}
|
||||
|
||||
for (auto& c : run.counters) {
|
||||
for (const auto& c : run.counters) {
|
||||
out << ",\n" << indent << FormatKV(c.first, c.second);
|
||||
}
|
||||
|
||||
if (run.memory_result) {
|
||||
const MemoryManager::Result memory_result = *run.memory_result;
|
||||
if (run.memory_result.memory_iterations > 0) {
|
||||
const auto& memory_result = run.memory_result;
|
||||
out << ",\n" << indent << FormatKV("allocs_per_iter", run.allocs_per_iter);
|
||||
out << ",\n"
|
||||
<< indent << FormatKV("max_bytes_used", memory_result.max_bytes_used);
|
||||
|
||||
auto report_if_present = [&out, &indent](const char* label, int64_t val) {
|
||||
if (val != MemoryManager::TombstoneValue)
|
||||
auto report_if_present = [&out, &indent](const std::string& label,
|
||||
int64_t val) {
|
||||
if (val != MemoryManager::TombstoneValue) {
|
||||
out << ",\n" << indent << FormatKV(label, val);
|
||||
}
|
||||
};
|
||||
|
||||
report_if_present("total_allocated_bytes",
|
||||
@ -310,7 +330,4 @@ void JSONReporter::PrintRunData(Run const& run) {
|
||||
out << '\n';
|
||||
}
|
||||
|
||||
const int64_t MemoryManager::TombstoneValue =
|
||||
std::numeric_limits<int64_t>::max();
|
||||
|
||||
} // end namespace benchmark
|
||||
|
14
src/log.h
14
src/log.h
@ -4,13 +4,6 @@
|
||||
#include <iostream>
|
||||
#include <ostream>
|
||||
|
||||
// NOTE: this is also defined in benchmark.h but we're trying to avoid a
|
||||
// dependency.
|
||||
// The _MSVC_LANG check should detect Visual Studio 2015 Update 3 and newer.
|
||||
#if __cplusplus >= 201103L || (defined(_MSVC_LANG) && _MSVC_LANG >= 201103L)
|
||||
#define BENCHMARK_HAS_CXX11
|
||||
#endif
|
||||
|
||||
namespace benchmark {
|
||||
namespace internal {
|
||||
|
||||
@ -31,13 +24,8 @@ class LogType {
|
||||
|
||||
// NOTE: we could use BENCHMARK_DISALLOW_COPY_AND_ASSIGN but we shouldn't have
|
||||
// a dependency on benchmark.h from here.
|
||||
#ifndef BENCHMARK_HAS_CXX11
|
||||
LogType(const LogType&);
|
||||
LogType& operator=(const LogType&);
|
||||
#else
|
||||
LogType(const LogType&) = delete;
|
||||
LogType& operator=(const LogType&) = delete;
|
||||
#endif
|
||||
};
|
||||
|
||||
template <class Tp>
|
||||
@ -61,7 +49,7 @@ inline int& LogLevel() {
|
||||
}
|
||||
|
||||
inline LogType& GetNullLogInstance() {
|
||||
static LogType null_log((std::ostream*)nullptr);
|
||||
static LogType null_log(static_cast<std::ostream*>(nullptr));
|
||||
return null_log;
|
||||
}
|
||||
|
||||
|
@ -26,103 +26,235 @@
|
||||
namespace benchmark {
|
||||
namespace internal {
|
||||
|
||||
constexpr size_t PerfCounterValues::kMaxCounters;
|
||||
|
||||
#if defined HAVE_LIBPFM
|
||||
|
||||
size_t PerfCounterValues::Read(const std::vector<int>& leaders) {
|
||||
// Create a pointer for multiple reads
|
||||
const size_t bufsize = values_.size() * sizeof(values_[0]);
|
||||
char* ptr = reinterpret_cast<char*>(values_.data());
|
||||
size_t size = bufsize;
|
||||
for (int lead : leaders) {
|
||||
auto read_bytes = ::read(lead, ptr, size);
|
||||
if (read_bytes >= ssize_t(sizeof(uint64_t))) {
|
||||
// Actual data bytes are all bytes minus initial padding
|
||||
std::size_t data_bytes =
|
||||
static_cast<std::size_t>(read_bytes) - sizeof(uint64_t);
|
||||
// This should be very cheap since it's in hot cache
|
||||
std::memmove(ptr, ptr + sizeof(uint64_t), data_bytes);
|
||||
// Increment our counters
|
||||
ptr += data_bytes;
|
||||
size -= data_bytes;
|
||||
} else {
|
||||
int err = errno;
|
||||
GetErrorLogInstance() << "Error reading lead " << lead << " errno:" << err
|
||||
<< " " << ::strerror(err) << "\n";
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
return (bufsize - size) / sizeof(uint64_t);
|
||||
}
|
||||
|
||||
const bool PerfCounters::kSupported = true;
|
||||
|
||||
bool PerfCounters::Initialize() { return pfm_initialize() == PFM_SUCCESS; }
|
||||
// Initializes libpfm only on the first call. Returns whether that single
|
||||
// initialization was successful.
|
||||
bool PerfCounters::Initialize() {
|
||||
// Function-scope static gets initialized only once on first call.
|
||||
static const bool success = []() {
|
||||
return pfm_initialize() == PFM_SUCCESS;
|
||||
}();
|
||||
return success;
|
||||
}
|
||||
|
||||
bool PerfCounters::IsCounterSupported(const std::string& name) {
|
||||
Initialize();
|
||||
perf_event_attr_t attr;
|
||||
std::memset(&attr, 0, sizeof(attr));
|
||||
pfm_perf_encode_arg_t arg;
|
||||
std::memset(&arg, 0, sizeof(arg));
|
||||
arg.attr = &attr;
|
||||
const int mode = PFM_PLM3; // user mode only
|
||||
int ret = pfm_get_os_event_encoding(name.c_str(), mode, PFM_OS_PERF_EVENT_EXT,
|
||||
&arg);
|
||||
return (ret == PFM_SUCCESS);
|
||||
}
|
||||
|
||||
PerfCounters PerfCounters::Create(
|
||||
const std::vector<std::string>& counter_names) {
|
||||
if (counter_names.empty()) {
|
||||
return NoCounters();
|
||||
if (!counter_names.empty()) {
|
||||
Initialize();
|
||||
}
|
||||
if (counter_names.size() > PerfCounterValues::kMaxCounters) {
|
||||
GetErrorLogInstance()
|
||||
<< counter_names.size()
|
||||
<< " counters were requested. The minimum is 1, the maximum is "
|
||||
<< PerfCounterValues::kMaxCounters << "\n";
|
||||
return NoCounters();
|
||||
}
|
||||
std::vector<int> counter_ids(counter_names.size());
|
||||
|
||||
const int mode = PFM_PLM3; // user mode only
|
||||
// Valid counters will populate these arrays but we start empty
|
||||
std::vector<std::string> valid_names;
|
||||
std::vector<int> counter_ids;
|
||||
std::vector<int> leader_ids;
|
||||
|
||||
// Resize to the maximum possible
|
||||
valid_names.reserve(counter_names.size());
|
||||
counter_ids.reserve(counter_names.size());
|
||||
|
||||
const int kCounterMode = PFM_PLM3; // user mode only
|
||||
|
||||
// Group leads will be assigned on demand. The idea is that once we cannot
|
||||
// create a counter descriptor, the reason is that this group has maxed out
|
||||
// so we set the group_id again to -1 and retry - giving the algorithm a
|
||||
// chance to create a new group leader to hold the next set of counters.
|
||||
int group_id = -1;
|
||||
|
||||
// Loop through all performance counters
|
||||
for (size_t i = 0; i < counter_names.size(); ++i) {
|
||||
const bool is_first = i == 0;
|
||||
struct perf_event_attr attr {};
|
||||
attr.size = sizeof(attr);
|
||||
const int group_id = !is_first ? counter_ids[0] : -1;
|
||||
// we are about to push into the valid names vector
|
||||
// check if we did not reach the maximum
|
||||
if (valid_names.size() == PerfCounterValues::kMaxCounters) {
|
||||
// Log a message if we maxed out and stop adding
|
||||
GetErrorLogInstance()
|
||||
<< counter_names.size() << " counters were requested. The maximum is "
|
||||
<< PerfCounterValues::kMaxCounters << " and " << valid_names.size()
|
||||
<< " were already added. All remaining counters will be ignored\n";
|
||||
// stop the loop and return what we have already
|
||||
break;
|
||||
}
|
||||
|
||||
// Check if this name is empty
|
||||
const auto& name = counter_names[i];
|
||||
if (name.empty()) {
|
||||
GetErrorLogInstance() << "A counter name was the empty string\n";
|
||||
return NoCounters();
|
||||
GetErrorLogInstance()
|
||||
<< "A performance counter name was the empty string\n";
|
||||
continue;
|
||||
}
|
||||
|
||||
// Here first means first in group, ie the group leader
|
||||
const bool is_first = (group_id < 0);
|
||||
|
||||
// This struct will be populated by libpfm from the counter string
|
||||
// and then fed into the syscall perf_event_open
|
||||
struct perf_event_attr attr {};
|
||||
attr.size = sizeof(attr);
|
||||
|
||||
// This is the input struct to libpfm.
|
||||
pfm_perf_encode_arg_t arg{};
|
||||
arg.attr = &attr;
|
||||
|
||||
const int pfm_get =
|
||||
pfm_get_os_event_encoding(name.c_str(), mode, PFM_OS_PERF_EVENT, &arg);
|
||||
const int pfm_get = pfm_get_os_event_encoding(name.c_str(), kCounterMode,
|
||||
PFM_OS_PERF_EVENT, &arg);
|
||||
if (pfm_get != PFM_SUCCESS) {
|
||||
GetErrorLogInstance() << "Unknown counter name: " << name << "\n";
|
||||
return NoCounters();
|
||||
GetErrorLogInstance()
|
||||
<< "Unknown performance counter name: " << name << "\n";
|
||||
continue;
|
||||
}
|
||||
attr.disabled = is_first;
|
||||
// Note: the man page for perf_event_create suggests inerit = true and
|
||||
|
||||
// We then proceed to populate the remaining fields in our attribute struct
|
||||
// Note: the man page for perf_event_create suggests inherit = true and
|
||||
// read_format = PERF_FORMAT_GROUP don't work together, but that's not the
|
||||
// case.
|
||||
attr.disabled = is_first;
|
||||
attr.inherit = true;
|
||||
attr.pinned = is_first;
|
||||
attr.exclude_kernel = true;
|
||||
attr.exclude_user = false;
|
||||
attr.exclude_hv = true;
|
||||
// Read all counters in one read.
|
||||
attr.read_format = PERF_FORMAT_GROUP;
|
||||
|
||||
// Read all counters in a group in one read.
|
||||
attr.read_format = PERF_FORMAT_GROUP; //| PERF_FORMAT_TOTAL_TIME_ENABLED |
|
||||
// PERF_FORMAT_TOTAL_TIME_RUNNING;
|
||||
|
||||
int id = -1;
|
||||
static constexpr size_t kNrOfSyscallRetries = 5;
|
||||
// Retry syscall as it was interrupted often (b/64774091).
|
||||
for (size_t num_retries = 0; num_retries < kNrOfSyscallRetries;
|
||||
++num_retries) {
|
||||
id = perf_event_open(&attr, 0, -1, group_id, 0);
|
||||
if (id >= 0 || errno != EINTR) {
|
||||
break;
|
||||
while (id < 0) {
|
||||
static constexpr size_t kNrOfSyscallRetries = 5;
|
||||
// Retry syscall as it was interrupted often (b/64774091).
|
||||
for (size_t num_retries = 0; num_retries < kNrOfSyscallRetries;
|
||||
++num_retries) {
|
||||
id = perf_event_open(&attr, 0, -1, group_id, 0);
|
||||
if (id >= 0 || errno != EINTR) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (id < 0) {
|
||||
// If the file descriptor is negative we might have reached a limit
|
||||
// in the current group. Set the group_id to -1 and retry
|
||||
if (group_id >= 0) {
|
||||
// Create a new group
|
||||
group_id = -1;
|
||||
} else {
|
||||
// At this point we have already retried to set a new group id and
|
||||
// failed. We then give up.
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// We failed to get a new file descriptor. We might have reached a hard
|
||||
// hardware limit that cannot be resolved even with group multiplexing
|
||||
if (id < 0) {
|
||||
GetErrorLogInstance()
|
||||
<< "Failed to get a file descriptor for " << name << "\n";
|
||||
GetErrorLogInstance() << "***WARNING** Failed to get a file descriptor "
|
||||
"for performance counter "
|
||||
<< name << ". Ignoring\n";
|
||||
|
||||
// We give up on this counter but try to keep going
|
||||
// as the others would be fine
|
||||
continue;
|
||||
}
|
||||
if (group_id < 0) {
|
||||
// This is a leader, store and assign it to the current file descriptor
|
||||
leader_ids.push_back(id);
|
||||
group_id = id;
|
||||
}
|
||||
// This is a valid counter, add it to our descriptor's list
|
||||
counter_ids.push_back(id);
|
||||
valid_names.push_back(name);
|
||||
}
|
||||
|
||||
// Loop through all group leaders activating them
|
||||
// There is another option of starting ALL counters in a process but
|
||||
// that would be far reaching an intrusion. If the user is using PMCs
|
||||
// by themselves then this would have a side effect on them. It is
|
||||
// friendlier to loop through all groups individually.
|
||||
for (int lead : leader_ids) {
|
||||
if (ioctl(lead, PERF_EVENT_IOC_ENABLE) != 0) {
|
||||
// This should never happen but if it does, we give up on the
|
||||
// entire batch as recovery would be a mess.
|
||||
GetErrorLogInstance() << "***WARNING*** Failed to start counters. "
|
||||
"Claring out all counters.\n";
|
||||
|
||||
// Close all performance counters
|
||||
for (int id : counter_ids) {
|
||||
::close(id);
|
||||
}
|
||||
|
||||
// Return an empty object so our internal state is still good and
|
||||
// the process can continue normally without impact
|
||||
return NoCounters();
|
||||
}
|
||||
|
||||
counter_ids[i] = id;
|
||||
}
|
||||
if (ioctl(counter_ids[0], PERF_EVENT_IOC_ENABLE) != 0) {
|
||||
GetErrorLogInstance() << "Failed to start counters\n";
|
||||
return NoCounters();
|
||||
}
|
||||
|
||||
return PerfCounters(counter_names, std::move(counter_ids));
|
||||
return PerfCounters(std::move(valid_names), std::move(counter_ids),
|
||||
std::move(leader_ids));
|
||||
}
|
||||
|
||||
void PerfCounters::CloseCounters() const {
|
||||
if (counter_ids_.empty()) {
|
||||
return;
|
||||
}
|
||||
ioctl(counter_ids_[0], PERF_EVENT_IOC_DISABLE);
|
||||
for (int lead : leader_ids_) {
|
||||
ioctl(lead, PERF_EVENT_IOC_DISABLE);
|
||||
}
|
||||
for (int fd : counter_ids_) {
|
||||
close(fd);
|
||||
}
|
||||
}
|
||||
#else // defined HAVE_LIBPFM
|
||||
size_t PerfCounterValues::Read(const std::vector<int>&) { return 0; }
|
||||
|
||||
const bool PerfCounters::kSupported = false;
|
||||
|
||||
bool PerfCounters::Initialize() { return false; }
|
||||
|
||||
bool PerfCounters::IsCounterSupported(const std::string&) { return false; }
|
||||
|
||||
PerfCounters PerfCounters::Create(
|
||||
const std::vector<std::string>& counter_names) {
|
||||
if (!counter_names.empty()) {
|
||||
GetErrorLogInstance() << "Performance counters not supported.";
|
||||
GetErrorLogInstance() << "Performance counters not supported.\n";
|
||||
}
|
||||
return NoCounters();
|
||||
}
|
||||
@ -130,31 +262,10 @@ PerfCounters PerfCounters::Create(
|
||||
void PerfCounters::CloseCounters() const {}
|
||||
#endif // defined HAVE_LIBPFM
|
||||
|
||||
Mutex PerfCountersMeasurement::mutex_;
|
||||
int PerfCountersMeasurement::ref_count_ = 0;
|
||||
PerfCounters PerfCountersMeasurement::counters_ = PerfCounters::NoCounters();
|
||||
|
||||
PerfCountersMeasurement::PerfCountersMeasurement(
|
||||
const std::vector<std::string>& counter_names)
|
||||
: start_values_(counter_names.size()), end_values_(counter_names.size()) {
|
||||
MutexLock l(mutex_);
|
||||
if (ref_count_ == 0) {
|
||||
counters_ = PerfCounters::Create(counter_names);
|
||||
}
|
||||
// We chose to increment it even if `counters_` ends up invalid,
|
||||
// so that we don't keep trying to create, and also since the dtor
|
||||
// will decrement regardless of `counters_`'s validity
|
||||
++ref_count_;
|
||||
|
||||
BM_CHECK(!counters_.IsValid() || counters_.names() == counter_names);
|
||||
}
|
||||
|
||||
PerfCountersMeasurement::~PerfCountersMeasurement() {
|
||||
MutexLock l(mutex_);
|
||||
--ref_count_;
|
||||
if (ref_count_ == 0) {
|
||||
counters_ = PerfCounters::NoCounters();
|
||||
}
|
||||
counters_ = PerfCounters::Create(counter_names);
|
||||
}
|
||||
|
||||
PerfCounters& PerfCounters::operator=(PerfCounters&& other) noexcept {
|
||||
@ -162,6 +273,7 @@ PerfCounters& PerfCounters::operator=(PerfCounters&& other) noexcept {
|
||||
CloseCounters();
|
||||
|
||||
counter_ids_ = std::move(other.counter_ids_);
|
||||
leader_ids_ = std::move(other.leader_ids_);
|
||||
counter_names_ = std::move(other.counter_names_);
|
||||
}
|
||||
return *this;
|
||||
|
@ -17,6 +17,7 @@
|
||||
|
||||
#include <array>
|
||||
#include <cstdint>
|
||||
#include <cstring>
|
||||
#include <memory>
|
||||
#include <vector>
|
||||
|
||||
@ -44,18 +45,21 @@ namespace internal {
|
||||
// The implementation ensures the storage is inlined, and allows 0-based
|
||||
// indexing into the counter values.
|
||||
// The object is used in conjunction with a PerfCounters object, by passing it
|
||||
// to Snapshot(). The values are populated such that
|
||||
// perfCounters->names()[i]'s value is obtained at position i (as given by
|
||||
// operator[]) of this object.
|
||||
class PerfCounterValues {
|
||||
// to Snapshot(). The Read() method relocates individual reads, discarding
|
||||
// the initial padding from each group leader in the values buffer such that
|
||||
// all user accesses through the [] operator are correct.
|
||||
class BENCHMARK_EXPORT PerfCounterValues {
|
||||
public:
|
||||
explicit PerfCounterValues(size_t nr_counters) : nr_counters_(nr_counters) {
|
||||
BM_CHECK_LE(nr_counters_, kMaxCounters);
|
||||
}
|
||||
|
||||
uint64_t operator[](size_t pos) const { return values_[kPadding + pos]; }
|
||||
// We are reading correctly now so the values don't need to skip padding
|
||||
uint64_t operator[](size_t pos) const { return values_[pos]; }
|
||||
|
||||
static constexpr size_t kMaxCounters = 3;
|
||||
// Increased the maximum to 32 only since the buffer
|
||||
// is std::array<> backed
|
||||
static constexpr size_t kMaxCounters = 32;
|
||||
|
||||
private:
|
||||
friend class PerfCounters;
|
||||
@ -66,7 +70,14 @@ class PerfCounterValues {
|
||||
sizeof(uint64_t) * (kPadding + nr_counters_)};
|
||||
}
|
||||
|
||||
static constexpr size_t kPadding = 1;
|
||||
// This reading is complex and as the goal of this class is to
|
||||
// abstract away the intrincacies of the reading process, this is
|
||||
// a better place for it
|
||||
size_t Read(const std::vector<int>& leaders);
|
||||
|
||||
// Move the padding to 2 due to the reading algorithm (1st padding plus a
|
||||
// current read padding)
|
||||
static constexpr size_t kPadding = 2;
|
||||
std::array<uint64_t, kPadding + kMaxCounters> values_;
|
||||
const size_t nr_counters_;
|
||||
};
|
||||
@ -79,10 +90,11 @@ class BENCHMARK_EXPORT PerfCounters final {
|
||||
// True iff this platform supports performance counters.
|
||||
static const bool kSupported;
|
||||
|
||||
bool IsValid() const { return !counter_names_.empty(); }
|
||||
// Returns an empty object
|
||||
static PerfCounters NoCounters() { return PerfCounters(); }
|
||||
|
||||
~PerfCounters() { CloseCounters(); }
|
||||
PerfCounters() = default;
|
||||
PerfCounters(PerfCounters&&) = default;
|
||||
PerfCounters(const PerfCounters&) = delete;
|
||||
PerfCounters& operator=(PerfCounters&&) noexcept;
|
||||
@ -92,11 +104,15 @@ class BENCHMARK_EXPORT PerfCounters final {
|
||||
// initialization here.
|
||||
static bool Initialize();
|
||||
|
||||
// Check if the given counter is supported, if the app wants to
|
||||
// check before passing
|
||||
static bool IsCounterSupported(const std::string& name);
|
||||
|
||||
// Return a PerfCounters object ready to read the counters with the names
|
||||
// specified. The values are user-mode only. The counter name format is
|
||||
// implementation and OS specific.
|
||||
// TODO: once we move to C++-17, this should be a std::optional, and then the
|
||||
// IsValid() boolean can be dropped.
|
||||
// In case of failure, this method will in the worst case return an
|
||||
// empty object whose state will still be valid.
|
||||
static PerfCounters Create(const std::vector<std::string>& counter_names);
|
||||
|
||||
// Take a snapshot of the current value of the counters into the provided
|
||||
@ -105,10 +121,7 @@ class BENCHMARK_EXPORT PerfCounters final {
|
||||
BENCHMARK_ALWAYS_INLINE bool Snapshot(PerfCounterValues* values) const {
|
||||
#ifndef BENCHMARK_OS_WINDOWS
|
||||
assert(values != nullptr);
|
||||
assert(IsValid());
|
||||
auto buffer = values->get_data_buffer();
|
||||
auto read_bytes = ::read(counter_ids_[0], buffer.first, buffer.second);
|
||||
return static_cast<size_t>(read_bytes) == buffer.second;
|
||||
return values->Read(leader_ids_) == counter_ids_.size();
|
||||
#else
|
||||
(void)values;
|
||||
return false;
|
||||
@ -120,13 +133,15 @@ class BENCHMARK_EXPORT PerfCounters final {
|
||||
|
||||
private:
|
||||
PerfCounters(const std::vector<std::string>& counter_names,
|
||||
std::vector<int>&& counter_ids)
|
||||
: counter_ids_(std::move(counter_ids)), counter_names_(counter_names) {}
|
||||
PerfCounters() = default;
|
||||
std::vector<int>&& counter_ids, std::vector<int>&& leader_ids)
|
||||
: counter_ids_(std::move(counter_ids)),
|
||||
leader_ids_(std::move(leader_ids)),
|
||||
counter_names_(counter_names) {}
|
||||
|
||||
void CloseCounters() const;
|
||||
|
||||
std::vector<int> counter_ids_;
|
||||
std::vector<int> leader_ids_;
|
||||
std::vector<std::string> counter_names_;
|
||||
};
|
||||
|
||||
@ -134,33 +149,25 @@ class BENCHMARK_EXPORT PerfCounters final {
|
||||
class BENCHMARK_EXPORT PerfCountersMeasurement final {
|
||||
public:
|
||||
PerfCountersMeasurement(const std::vector<std::string>& counter_names);
|
||||
~PerfCountersMeasurement();
|
||||
|
||||
// The only way to get to `counters_` is after ctor-ing a
|
||||
// `PerfCountersMeasurement`, which means that `counters_`'s state is, here,
|
||||
// decided (either invalid or valid) and won't change again even if a ctor is
|
||||
// concurrently running with this. This is preferring efficiency to
|
||||
// maintainability, because the address of the static can be known at compile
|
||||
// time.
|
||||
bool IsValid() const {
|
||||
MutexLock l(mutex_);
|
||||
return counters_.IsValid();
|
||||
}
|
||||
size_t num_counters() const { return counters_.num_counters(); }
|
||||
|
||||
BENCHMARK_ALWAYS_INLINE void Start() {
|
||||
assert(IsValid());
|
||||
MutexLock l(mutex_);
|
||||
std::vector<std::string> names() const { return counters_.names(); }
|
||||
|
||||
BENCHMARK_ALWAYS_INLINE bool Start() {
|
||||
if (num_counters() == 0) return true;
|
||||
// Tell the compiler to not move instructions above/below where we take
|
||||
// the snapshot.
|
||||
ClobberMemory();
|
||||
valid_read_ &= counters_.Snapshot(&start_values_);
|
||||
ClobberMemory();
|
||||
|
||||
return valid_read_;
|
||||
}
|
||||
|
||||
BENCHMARK_ALWAYS_INLINE bool Stop(
|
||||
std::vector<std::pair<std::string, double>>& measurements) {
|
||||
assert(IsValid());
|
||||
MutexLock l(mutex_);
|
||||
if (num_counters() == 0) return true;
|
||||
// Tell the compiler to not move instructions above/below where we take
|
||||
// the snapshot.
|
||||
ClobberMemory();
|
||||
@ -177,16 +184,12 @@ class BENCHMARK_EXPORT PerfCountersMeasurement final {
|
||||
}
|
||||
|
||||
private:
|
||||
static Mutex mutex_;
|
||||
GUARDED_BY(mutex_) static int ref_count_;
|
||||
GUARDED_BY(mutex_) static PerfCounters counters_;
|
||||
PerfCounters counters_;
|
||||
bool valid_read_ = true;
|
||||
PerfCounterValues start_values_;
|
||||
PerfCounterValues end_values_;
|
||||
};
|
||||
|
||||
BENCHMARK_UNUSED static bool perf_init_anchor = PerfCounters::Initialize();
|
||||
|
||||
} // namespace internal
|
||||
} // namespace benchmark
|
||||
|
||||
|
10
src/re.h
10
src/re.h
@ -33,7 +33,7 @@
|
||||
// Prefer C regex libraries when compiling w/o exceptions so that we can
|
||||
// correctly report errors.
|
||||
#if defined(BENCHMARK_HAS_NO_EXCEPTIONS) && \
|
||||
defined(BENCHMARK_HAVE_STD_REGEX) && \
|
||||
defined(HAVE_STD_REGEX) && \
|
||||
(defined(HAVE_GNU_POSIX_REGEX) || defined(HAVE_POSIX_REGEX))
|
||||
#undef HAVE_STD_REGEX
|
||||
#endif
|
||||
@ -121,15 +121,13 @@ inline bool Regex::Init(const std::string& spec, std::string* error) {
|
||||
if (ec != 0) {
|
||||
if (error) {
|
||||
size_t needed = regerror(ec, &re_, nullptr, 0);
|
||||
char* errbuf = new char[needed];
|
||||
regerror(ec, &re_, errbuf, needed);
|
||||
std::vector<char> errbuf(needed);
|
||||
regerror(ec, &re_, errbuf.data(), needed);
|
||||
|
||||
// regerror returns the number of bytes necessary to null terminate
|
||||
// the string, so we move that when assigning to error.
|
||||
BM_CHECK_NE(needed, 0);
|
||||
error->assign(errbuf, needed - 1);
|
||||
|
||||
delete[] errbuf;
|
||||
error->assign(errbuf.data(), needed - 1);
|
||||
}
|
||||
|
||||
return false;
|
||||
|
@@ -42,20 +42,23 @@ void BenchmarkReporter::PrintBasicContext(std::ostream *out,
Out << LocalDateTimeString() << "\n";
#endif

if (context.executable_name)
Out << "Running " << context.executable_name << "\n";
if (benchmark::BenchmarkReporter::Context::executable_name != nullptr) {
Out << "Running " << benchmark::BenchmarkReporter::Context::executable_name
<< "\n";
}

const CPUInfo &info = context.cpu_info;
Out << "Run on (" << info.num_cpus << " X "
<< (info.cycles_per_second / 1000000.0) << " MHz CPU "
<< ((info.num_cpus > 1) ? "s" : "") << ")\n";
if (info.caches.size() != 0) {
if (!info.caches.empty()) {
Out << "CPU Caches:\n";
for (auto &CInfo : info.caches) {
for (const auto &CInfo : info.caches) {
Out << " L" << CInfo.level << " " << CInfo.type << " "
<< (CInfo.size / 1024) << " KiB";
if (CInfo.num_sharing != 0)
if (CInfo.num_sharing != 0) {
Out << " (x" << (info.num_cpus / CInfo.num_sharing) << ")";
}
Out << "\n";
}
}
@@ -63,7 +66,9 @@ void BenchmarkReporter::PrintBasicContext(std::ostream *out,
Out << "Load Average: ";
for (auto It = info.load_avg.begin(); It != info.load_avg.end();) {
Out << StrFormat("%.2f", *It++);
if (It != info.load_avg.end()) Out << ", ";
if (It != info.load_avg.end()) {
Out << ", ";
}
}
Out << "\n";
}
@@ -105,13 +110,17 @@ std::string BenchmarkReporter::Run::benchmark_name() const {

double BenchmarkReporter::Run::GetAdjustedRealTime() const {
double new_time = real_accumulated_time * GetTimeUnitMultiplier(time_unit);
if (iterations != 0) new_time /= static_cast<double>(iterations);
if (iterations != 0) {
new_time /= static_cast<double>(iterations);
}
return new_time;
}

double BenchmarkReporter::Run::GetAdjustedCPUTime() const {
double new_time = cpu_accumulated_time * GetTimeUnitMultiplier(time_unit);
if (iterations != 0) new_time /= static_cast<double>(iterations);
if (iterations != 0) {
new_time /= static_cast<double>(iterations);
}
return new_time;
}
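The GetAdjusted*Time() hunks above only add braces; the arithmetic is unchanged: accumulated wall or CPU seconds are scaled to the report's time unit and divided by the iteration count. A small, self-contained illustration with numbers chosen here (not taken from the patch):

#include <cstdint>
#include <iostream>

int main() {
  const double real_accumulated_time = 2.5;  // seconds over the whole run
  const int64_t iterations = 1000000;
  const double unit_multiplier = 1e9;        // report in nanoseconds
  double new_time = real_accumulated_time * unit_multiplier;
  if (iterations != 0) {
    new_time /= static_cast<double>(iterations);
  }
  std::cout << new_time << " ns per iteration\n";  // prints 2500
}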
66 src/sleep.cc
@@ -1,66 +0,0 @@
// Copyright 2015 Google Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "sleep.h"

#include <cerrno>
#include <cstdlib>
#include <ctime>

#include "internal_macros.h"

#ifdef BENCHMARK_OS_WINDOWS
#include <windows.h>
#endif

#ifdef BENCHMARK_OS_ZOS
#include <unistd.h>
#endif

namespace benchmark {
#ifdef BENCHMARK_OS_WINDOWS
// Window's Sleep takes milliseconds argument.
void SleepForMilliseconds(int milliseconds) { Sleep(milliseconds); }
void SleepForSeconds(double seconds) {
SleepForMilliseconds(static_cast<int>(kNumMillisPerSecond * seconds));
}
#else // BENCHMARK_OS_WINDOWS
void SleepForMicroseconds(int microseconds) {
#ifdef BENCHMARK_OS_ZOS
// z/OS does not support nanosleep. Instead call sleep() and then usleep() to
// sleep for the remaining microseconds because usleep() will fail if its
// argument is greater than 1000000.
div_t sleepTime = div(microseconds, kNumMicrosPerSecond);
int seconds = sleepTime.quot;
while (seconds != 0) seconds = sleep(seconds);
while (usleep(sleepTime.rem) == -1 && errno == EINTR)
;
#else
struct timespec sleep_time;
sleep_time.tv_sec = microseconds / kNumMicrosPerSecond;
sleep_time.tv_nsec = (microseconds % kNumMicrosPerSecond) * kNumNanosPerMicro;
while (nanosleep(&sleep_time, &sleep_time) != 0 && errno == EINTR)
; // Ignore signals and wait for the full interval to elapse.
#endif
}

void SleepForMilliseconds(int milliseconds) {
SleepForMicroseconds(milliseconds * kNumMicrosPerMilli);
}

void SleepForSeconds(double seconds) {
SleepForMicroseconds(static_cast<int>(seconds * kNumMicrosPerSecond));
}
#endif // BENCHMARK_OS_WINDOWS
} // end namespace benchmark
15 src/sleep.h
@@ -1,15 +0,0 @@
#ifndef BENCHMARK_SLEEP_H_
#define BENCHMARK_SLEEP_H_

namespace benchmark {
const int kNumMillisPerSecond = 1000;
const int kNumMicrosPerMilli = 1000;
const int kNumMicrosPerSecond = kNumMillisPerSecond * 1000;
const int kNumNanosPerMicro = 1000;
const int kNumNanosPerSecond = kNumNanosPerMicro * kNumMicrosPerSecond;

void SleepForMilliseconds(int milliseconds);
void SleepForSeconds(double seconds);
} // end namespace benchmark

#endif // BENCHMARK_SLEEP_H_
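src/sleep.cc and src/sleep.h are deleted outright above. For code that still needs an equivalent, a hedged sketch of the same two helpers on top of the standard library (a suggestion for callers, not what the library now does internally) could be:

#include <chrono>
#include <thread>

// std::this_thread::sleep_for does the unit bookkeeping that sleep.cc used to
// do by hand with nanosleep()/usleep()/Sleep().
void SleepForMilliseconds(int milliseconds) {
  std::this_thread::sleep_for(std::chrono::milliseconds(milliseconds));
}

void SleepForSeconds(double seconds) {
  std::this_thread::sleep_for(std::chrono::duration<double>(seconds));
}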
@@ -26,61 +26,82 @@

namespace benchmark {

auto StatisticsSum = [](const std::vector<double>& v) {
const auto StatisticsSum = [](const std::vector<double>& v) {
return std::accumulate(v.begin(), v.end(), 0.0);
};

double StatisticsMean(const std::vector<double>& v) {
if (v.empty()) return 0.0;
return StatisticsSum(v) * (1.0 / v.size());
if (v.empty()) {
return 0.0;
}
return StatisticsSum(v) * (1.0 / static_cast<double>(v.size()));
}

double StatisticsMedian(const std::vector<double>& v) {
if (v.size() < 3) return StatisticsMean(v);
if (v.size() < 3) {
return StatisticsMean(v);
}
std::vector<double> copy(v);

auto center = copy.begin() + v.size() / 2;
std::nth_element(copy.begin(), center, copy.end());

// did we have an odd number of samples?
// if yes, then center is the median
// it no, then we are looking for the average between center and the value
// before
if (v.size() % 2 == 1) return *center;
auto center2 = copy.begin() + v.size() / 2 - 1;
std::nth_element(copy.begin(), center2, copy.end());
// Did we have an odd number of samples? If yes, then center is the median.
// If not, then we are looking for the average between center and the value
// before. Instead of resorting, we just look for the max value before it,
// which is not necessarily the element immediately preceding `center` Since
// `copy` is only partially sorted by `nth_element`.
if (v.size() % 2 == 1) {
return *center;
}
auto center2 = std::max_element(copy.begin(), center);
return (*center + *center2) / 2.0;
}

// Return the sum of the squares of this sample set
auto SumSquares = [](const std::vector<double>& v) {
const auto SumSquares = [](const std::vector<double>& v) {
return std::inner_product(v.begin(), v.end(), v.begin(), 0.0);
};

auto Sqr = [](const double dat) { return dat * dat; };
auto Sqrt = [](const double dat) {
const auto Sqr = [](const double dat) { return dat * dat; };
const auto Sqrt = [](const double dat) {
// Avoid NaN due to imprecision in the calculations
if (dat < 0.0) return 0.0;
if (dat < 0.0) {
return 0.0;
}
return std::sqrt(dat);
};

double StatisticsStdDev(const std::vector<double>& v) {
const auto mean = StatisticsMean(v);
if (v.empty()) return mean;
if (v.empty()) {
return mean;
}

// Sample standard deviation is undefined for n = 1
if (v.size() == 1) return 0.0;
if (v.size() == 1) {
return 0.0;
}

const double avg_squares = SumSquares(v) * (1.0 / v.size());
return Sqrt(v.size() / (v.size() - 1.0) * (avg_squares - Sqr(mean)));
const double avg_squares =
SumSquares(v) * (1.0 / static_cast<double>(v.size()));
return Sqrt(static_cast<double>(v.size()) /
(static_cast<double>(v.size()) - 1.0) *
(avg_squares - Sqr(mean)));
}

double StatisticsCV(const std::vector<double>& v) {
if (v.size() < 2) return 0.0;
if (v.size() < 2) {
return 0.0;
}

const auto stddev = StatisticsStdDev(v);
const auto mean = StatisticsMean(v);

if (std::fpclassify(mean) == FP_ZERO) {
return 0.0;
}

return stddev / mean;
}
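The rewritten StatisticsMedian() relies on a property worth spelling out: after std::nth_element(begin, center, end), every element in [begin, center) is less than or equal to *center, so for an even-sized sample the second middle value is simply the maximum of that left half and no second nth_element pass is needed. A standalone check of that technique (illustrative code, assumes a non-empty input):

#include <algorithm>
#include <iostream>
#include <vector>

double Median(std::vector<double> copy) {  // assumes !copy.empty()
  auto center = copy.begin() + copy.size() / 2;
  std::nth_element(copy.begin(), center, copy.end());
  if (copy.size() % 2 == 1) {
    return *center;
  }
  // Even count: the other middle value is the largest element left of center,
  // which nth_element already guarantees is <= *center.
  auto center2 = std::max_element(copy.begin(), center);
  return (*center + *center2) / 2.0;
}

int main() {
  std::cout << Median({3.0, 1.0, 4.0, 1.5}) << "\n";  // prints 2.25
  std::cout << Median({5.0, 1.0, 3.0}) << "\n";       // prints 3
}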
@@ -89,11 +110,10 @@ std::vector<BenchmarkReporter::Run> ComputeStats(
typedef BenchmarkReporter::Run Run;
std::vector<Run> results;

auto error_count =
std::count_if(reports.begin(), reports.end(),
[](Run const& run) { return run.error_occurred; });
auto error_count = std::count_if(reports.begin(), reports.end(),
[](Run const& run) { return run.skipped; });

if (reports.size() - error_count < 2) {
if (reports.size() - static_cast<size_t>(error_count) < 2) {
// We don't report aggregated data if there was a single run.
return results;
}
@@ -133,7 +153,9 @@ std::vector<BenchmarkReporter::Run> ComputeStats(
for (Run const& run : reports) {
BM_CHECK_EQ(reports[0].benchmark_name(), run.benchmark_name());
BM_CHECK_EQ(run_iterations, run.iterations);
if (run.error_occurred) continue;
if (run.skipped != 0u) {
continue;
}
real_accumulated_time_stat.emplace_back(run.real_accumulated_time);
cpu_accumulated_time_stat.emplace_back(run.cpu_accumulated_time);
// user counters
@@ -154,7 +176,7 @@ std::vector<BenchmarkReporter::Run> ComputeStats(
}

const double iteration_rescale_factor =
double(reports.size()) / double(run_iterations);
static_cast<double>(reports.size()) / static_cast<double>(run_iterations);

for (const auto& Stat : *reports[0].statistics) {
// Get the data from the accumulator to BenchmarkReporter::Run's.
@@ -175,7 +197,7 @@ std::vector<BenchmarkReporter::Run> ComputeStats(
// Similarly, if there are N repetitions with 1 iterations each,
// an aggregate will be computed over N measurements, not 1.
// Thus it is best to simply use the count of separate reports.
data.iterations = reports.size();
data.iterations = static_cast<IterationCount>(reports.size());

data.real_accumulated_time = Stat.compute_(real_accumulated_time_stat);
data.cpu_accumulated_time = Stat.compute_(cpu_accumulated_time_stat);
@@ -22,9 +22,10 @@

namespace benchmark {

// Return a vector containing the mean, median and standard devation information
// (and any user-specified info) for the specified list of reports. If 'reports'
// contains less than two non-errored runs an empty vector is returned
// Return a vector containing the mean, median and standard deviation
// information (and any user-specified info) for the specified list of reports.
// If 'reports' contains less than two non-errored runs an empty vector is
// returned
BENCHMARK_EXPORT
std::vector<BenchmarkReporter::Run> ComputeStats(
const std::vector<BenchmarkReporter::Run>& reports);
@@ -11,16 +11,17 @@
#include <sstream>

#include "arraysize.h"
#include "benchmark/benchmark.h"

namespace benchmark {
namespace {

// kilo, Mega, Giga, Tera, Peta, Exa, Zetta, Yotta.
const char kBigSIUnits[] = "kMGTPEZY";
const char* const kBigSIUnits[] = {"k", "M", "G", "T", "P", "E", "Z", "Y"};
// Kibi, Mebi, Gibi, Tebi, Pebi, Exbi, Zebi, Yobi.
const char kBigIECUnits[] = "KMGTPEZY";
const char* const kBigIECUnits[] = {"Ki", "Mi", "Gi", "Ti",
"Pi", "Ei", "Zi", "Yi"};
// milli, micro, nano, pico, femto, atto, zepto, yocto.
const char kSmallSIUnits[] = "munpfazy";
const char* const kSmallSIUnits[] = {"m", "u", "n", "p", "f", "a", "z", "y"};

// We require that all three arrays have the same size.
static_assert(arraysize(kBigSIUnits) == arraysize(kBigIECUnits),
@@ -28,11 +29,10 @@ static_assert(arraysize(kBigSIUnits) == arraysize(kBigIECUnits),
static_assert(arraysize(kSmallSIUnits) == arraysize(kBigSIUnits),
"Small SI and Big SI unit arrays must be the same size");

static const int64_t kUnitsSize = arraysize(kBigSIUnits);
const int64_t kUnitsSize = arraysize(kBigSIUnits);

void ToExponentAndMantissa(double val, double thresh, int precision,
double one_k, std::string* mantissa,
int64_t* exponent) {
void ToExponentAndMantissa(double val, int precision, double one_k,
std::string* mantissa, int64_t* exponent) {
std::stringstream mantissa_stream;

if (val < 0) {
@@ -43,8 +43,8 @@ void ToExponentAndMantissa(double val, double thresh, int precision,
// Adjust threshold so that it never excludes things which can't be rendered
// in 'precision' digits.
const double adjusted_threshold =
std::max(thresh, 1.0 / std::pow(10.0, precision));
const double big_threshold = adjusted_threshold * one_k;
std::max(1.0, 1.0 / std::pow(10.0, precision));
const double big_threshold = (adjusted_threshold * one_k) - 1;
const double small_threshold = adjusted_threshold;
// Values in ]simple_threshold,small_threshold[ will be printed as-is
const double simple_threshold = 0.01;
@@ -56,7 +56,7 @@ void ToExponentAndMantissa(double val, double thresh, int precision,
scaled /= one_k;
if (scaled <= big_threshold) {
mantissa_stream << scaled;
*exponent = i + 1;
*exponent = static_cast<int64_t>(i + 1);
*mantissa = mantissa_stream.str();
return;
}
@@ -87,42 +87,29 @@ void ToExponentAndMantissa(double val, double thresh, int precision,
}

std::string ExponentToPrefix(int64_t exponent, bool iec) {
if (exponent == 0) return "";
if (exponent == 0) {
return {};
}

const int64_t index = (exponent > 0 ? exponent - 1 : -exponent - 1);
if (index >= kUnitsSize) return "";
if (index >= kUnitsSize) {
return {};
}

const char* array =
const char* const* array =
(exponent > 0 ? (iec ? kBigIECUnits : kBigSIUnits) : kSmallSIUnits);
if (iec)
return array[index] + std::string("i");
else
return std::string(1, array[index]);

return std::string(array[index]);
}

std::string ToBinaryStringFullySpecified(double value, double threshold,
int precision, double one_k = 1024.0) {
std::string ToBinaryStringFullySpecified(double value, int precision,
Counter::OneK one_k) {
std::string mantissa;
int64_t exponent;
ToExponentAndMantissa(value, threshold, precision, one_k, &mantissa,
int64_t exponent = 0;
ToExponentAndMantissa(value, precision,
one_k == Counter::kIs1024 ? 1024.0 : 1000.0, &mantissa,
&exponent);
return mantissa + ExponentToPrefix(exponent, false);
}

} // end namespace

void AppendHumanReadable(int n, std::string* str) {
std::stringstream ss;
// Round down to the nearest SI prefix.
ss << ToBinaryStringFullySpecified(n, 1.0, 0);
*str += ss.str();
}

std::string HumanReadableNumber(double n, double one_k) {
// 1.1 means that figures up to 1.1k should be shown with the next unit down;
// this softens edge effects.
// 1 means that we should show one decimal place of precision.
return ToBinaryStringFullySpecified(n, 1.1, 1, one_k);
return mantissa + ExponentToPrefix(exponent, one_k == Counter::kIs1024);
}

std::string StrFormatImp(const char* msg, va_list args) {
@@ -132,7 +119,7 @@ std::string StrFormatImp(const char* msg, va_list args) {

// TODO(ericwf): use std::array for first attempt to avoid one memory
// allocation guess what the size might be
std::array<char, 256> local_buff;
std::array<char, 256> local_buff = {};

// 2015-10-08: vsnprintf is used instead of snd::vsnprintf due to a limitation
// in the android-ndk
@@ -141,9 +128,12 @@ std::string StrFormatImp(const char* msg, va_list args) {
va_end(args_cp);

// handle empty expansion
if (ret == 0) return std::string{};
if (static_cast<std::size_t>(ret) < local_buff.size())
if (ret == 0) {
return {};
}
if (static_cast<std::size_t>(ret) < local_buff.size()) {
return std::string(local_buff.data());
}

// we did not provide a long enough buffer on our first attempt.
// add 1 to size to account for null-byte in size cast to prevent overflow
@@ -155,6 +145,12 @@ std::string StrFormatImp(const char* msg, va_list args) {
return std::string(buff_ptr.get());
}

} // end namespace

std::string HumanReadableNumber(double n, Counter::OneK one_k) {
return ToBinaryStringFullySpecified(n, 1, one_k);
}

std::string StrFormat(const char* format, ...) {
va_list args;
va_start(args, format);
@@ -164,7 +160,9 @@ std::string StrFormat(const char* format, ...) {
}

std::vector<std::string> StrSplit(const std::string& str, char delim) {
if (str.empty()) return {};
if (str.empty()) {
return {};
}
std::vector<std::string> ret;
size_t first = 0;
size_t next = str.find(delim);
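The string_util.cc hunks replace the per-character prefix strings with arrays of string literals and thread Counter::OneK through instead of a raw double. The core scaling idea, dividing by one_k until the value fits and then indexing a prefix table, can be sketched independently of the library like this (the function and table names below are illustrative; only the prefix spellings mirror the hunk):

#include <iostream>
#include <string>

std::string Humanize(double value, bool iec) {
  const char* const kSI[] = {"k", "M", "G", "T", "P", "E", "Z", "Y"};
  const char* const kIEC[] = {"Ki", "Mi", "Gi", "Ti", "Pi", "Ei", "Zi", "Yi"};
  const double one_k = iec ? 1024.0 : 1000.0;
  int exponent = 0;
  while (value >= one_k && exponent < 8) {
    value /= one_k;
    ++exponent;
  }
  const std::string prefix =
      exponent == 0 ? "" : (iec ? kIEC : kSI)[exponent - 1];
  return std::to_string(value) + prefix;
}

int main() {
  std::cout << Humanize(1536.0, /*iec=*/true) << "\n";      // 1.500000Ki
  std::cout << Humanize(2500000.0, /*iec=*/false) << "\n";  // 2.500000M
}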
@@ -6,15 +6,14 @@
#include <utility>
#include <vector>

#include "benchmark/benchmark.h"
#include "benchmark/export.h"
#include "check.h"
#include "internal_macros.h"

namespace benchmark {

void AppendHumanReadable(int n, std::string* str);

std::string HumanReadableNumber(double n, double one_k = 1024.0);
BENCHMARK_EXPORT
std::string HumanReadableNumber(double n, Counter::OneK one_k);

BENCHMARK_EXPORT
#if defined(__MINGW32__)
Some files were not shown because too many files have changed in this diff.