Deprecate ubuntu-20.04 images in actions (#1971)

https://github.com/actions/runner-images/issues/11101
Fixed private macro name issue (#1968)
2025-04-21 17:00:28 +08:00 · 2025-04-16 11:29:10 +01:00 · 2025-04-11 15:02:03 +01:00 · 2025-04-11 12:25:46 +01:00 · 2025-03-29 10:49:25 +03:00 · 2025-03-27 18:10:05 +03:00
131 changed files with 5037 additions and 2937 deletions
--- a/.clang-tidy
+++ b/.clang-tidy
@@ -1,7 +1,37 @@
 ---
-Checks:          'clang-analyzer-*,readability-redundant-*,performance-*'
-WarningsAsErrors: 'clang-analyzer-*,readability-redundant-*,performance-*'
-HeaderFilterRegex: '.*'
-AnalyzeTemporaryDtors: false
+Checks: >
+  abseil-*,
+  bugprone-*,
+  clang-analyzer-*,
+  cppcoreguidelines-*,
+  google-*,
+  misc-*,
+  performance-*,
+  readability-*,
+  -clang-analyzer-deadcode*,
+  -clang-analyzer-optin*,
+  -readability-identifier-length
+WarningsAsErrors: ''
+HeaderFilterRegex: ''
 FormatStyle:     none
-User:            user
+CheckOptions:
+  llvm-else-after-return.WarnOnConditionVariables: 'false'
+  modernize-loop-convert.MinConfidence: reasonable
+  modernize-replace-auto-ptr.IncludeStyle: llvm
+  cert-str34-c.DiagnoseSignedUnsignedCharComparisons: 'false'
+  google-readability-namespace-comments.ShortNamespaceLines: '10'
+  cert-err33-c.CheckedFunctions: '::aligned_alloc;::asctime_s;::at_quick_exit;::atexit;::bsearch;::bsearch_s;::btowc;::c16rtomb;::c32rtomb;::calloc;::clock;::cnd_broadcast;::cnd_init;::cnd_signal;::cnd_timedwait;::cnd_wait;::ctime_s;::fclose;::fflush;::fgetc;::fgetpos;::fgets;::fgetwc;::fopen;::fopen_s;::fprintf;::fprintf_s;::fputc;::fputs;::fputwc;::fputws;::fread;::freopen;::freopen_s;::fscanf;::fscanf_s;::fseek;::fsetpos;::ftell;::fwprintf;::fwprintf_s;::fwrite;::fwscanf;::fwscanf_s;::getc;::getchar;::getenv;::getenv_s;::gets_s;::getwc;::getwchar;::gmtime;::gmtime_s;::localtime;::localtime_s;::malloc;::mbrtoc16;::mbrtoc32;::mbsrtowcs;::mbsrtowcs_s;::mbstowcs;::mbstowcs_s;::memchr;::mktime;::mtx_init;::mtx_lock;::mtx_timedlock;::mtx_trylock;::mtx_unlock;::printf_s;::putc;::putwc;::raise;::realloc;::remove;::rename;::scanf;::scanf_s;::setlocale;::setvbuf;::signal;::snprintf;::snprintf_s;::sprintf;::sprintf_s;::sscanf;::sscanf_s;::strchr;::strerror_s;::strftime;::strpbrk;::strrchr;::strstr;::strtod;::strtof;::strtoimax;::strtok;::strtok_s;::strtol;::strtold;::strtoll;::strtoul;::strtoull;::strtoumax;::strxfrm;::swprintf;::swprintf_s;::swscanf;::swscanf_s;::thrd_create;::thrd_detach;::thrd_join;::thrd_sleep;::time;::timespec_get;::tmpfile;::tmpfile_s;::tmpnam;::tmpnam_s;::tss_create;::tss_get;::tss_set;::ungetc;::ungetwc;::vfprintf;::vfprintf_s;::vfscanf;::vfscanf_s;::vfwprintf;::vfwprintf_s;::vfwscanf;::vfwscanf_s;::vprintf_s;::vscanf;::vscanf_s;::vsnprintf;::vsnprintf_s;::vsprintf;::vsprintf_s;::vsscanf;::vsscanf_s;::vswprintf;::vswprintf_s;::vswscanf;::vswscanf_s;::vwprintf_s;::vwscanf;::vwscanf_s;::wcrtomb;::wcschr;::wcsftime;::wcspbrk;::wcsrchr;::wcsrtombs;::wcsrtombs_s;::wcsstr;::wcstod;::wcstof;::wcstoimax;::wcstok;::wcstok_s;::wcstol;::wcstold;::wcstoll;::wcstombs;::wcstombs_s;::wcstoul;::wcstoull;::wcstoumax;::wcsxfrm;::wctob;::wctrans;::wctype;::wmemchr;::wprintf_s;::wscanf;::wscanf_s;'
+  cert-oop54-cpp.WarnOnlyIfThisHasSuspiciousField: 'false'
+  cert-dcl16-c.NewSuffixes: 'L;LL;LU;LLU'
+  google-readability-braces-around-statements.ShortStatementLines: '1'
+  cppcoreguidelines-non-private-member-variables-in-classes.IgnoreClassesWithAllMemberVariablesBeingPublic: 'true'
+  google-readability-namespace-comments.SpacesBeforeComments: '2'
+  modernize-loop-convert.MaxCopySize: '16'
+  modernize-pass-by-value.IncludeStyle: llvm
+  modernize-use-nullptr.NullMacros: 'NULL'
+  llvm-qualified-auto.AddConstToQualified: 'false'
+  modernize-loop-convert.NamingStyle: CamelCase
+  llvm-else-after-return.WarnOnUnfixable: 'false'
+  google-readability-function-size.StatementThreshold: '800'
+...
+
--- a/.clang-tidy.ignore
+++ b/.clang-tidy.ignore
@@ -0,0 +1 @@
+.*third_party/.*
--- a/.github/install_bazel.sh
+++ b/.github/install_bazel.sh
@@ -3,11 +3,10 @@ if ! bazel version; then
  if [ "$arch" == "aarch64" ]; then
    arch="arm64"
  fi
-  echo "Installing wget and downloading $arch Bazel binary from GitHub releases."
-  yum install -y wget
-  wget "https://github.com/bazelbuild/bazel/releases/download/6.3.0/bazel-6.3.0-linux-$arch" -O /usr/local/bin/bazel
-  chmod +x /usr/local/bin/bazel
+  echo "Downloading $arch Bazel binary from GitHub releases."
+  curl -L -o $HOME/bin/bazel --create-dirs "https://github.com/bazelbuild/bazel/releases/download/7.1.1/bazel-7.1.1-linux-$arch"
+  chmod +x $HOME/bin/bazel
 else
-  # bazel is installed for the correct architecture
+  # Bazel is installed for the correct architecture
  exit 0
 fi
--- a/.github/libcxx-setup.sh
+++ b/.github/libcxx-setup.sh
@@ -3,7 +3,12 @@
 set -e

 # Checkout LLVM sources
-git clone --depth=1 https://github.com/llvm/llvm-project.git llvm-project
+git clone --filter=blob:none --depth=1 --branch llvmorg-19.1.6 --no-checkout https://github.com/llvm/llvm-project.git llvm-project
+cd llvm-project
+git sparse-checkout set --cone
+git checkout llvmorg-19.1.6
+git sparse-checkout set cmake llvm/cmake runtimes libcxx libcxxabi
+cd ..

 ## Setup libc++ options
 if [ -z "$BUILD_32_BITS" ]; then
@@ -12,15 +17,19 @@ fi

 ## Build and install libc++ (Use unstable ABI for better sanitizer coverage)
 mkdir llvm-build && cd llvm-build
-cmake -DCMAKE_C_COMPILER=${CC}                  \
+cmake -GNinja                                   \
+      -DCMAKE_C_COMPILER=${CC}                  \
      -DCMAKE_CXX_COMPILER=${CXX}               \
      -DCMAKE_BUILD_TYPE=RelWithDebInfo         \
      -DCMAKE_INSTALL_PREFIX=/usr               \
      -DLIBCXX_ABI_UNSTABLE=OFF                 \
      -DLLVM_USE_SANITIZER=${LIBCXX_SANITIZER}  \
      -DLLVM_BUILD_32_BITS=${BUILD_32_BITS}     \
-      -DLLVM_ENABLE_RUNTIMES='libcxx;libcxxabi;libunwind' \
-      -G "Unix Makefiles" \
+      -DLIBCXXABI_USE_LLVM_UNWINDER=OFF         \
+      -DLLVM_INCLUDE_TESTS=OFF                  \
+      -DLIBCXX_INCLUDE_TESTS=OFF                \
+      -DLIBCXX_INCLUDE_BENCHMARKS=OFF           \
+      -DLLVM_ENABLE_RUNTIMES='libcxx;libcxxabi' \
      ../llvm-project/runtimes/
-make -j cxx cxxabi unwind
+cmake --build . -- cxx cxxabi
 cd ..
--- a/.github/workflows/bazel.yml
+++ b/.github/workflows/bazel.yml
@@ -4,20 +4,22 @@ on:
  push: {}
  pull_request: {}

+env:
+  CMAKE_GENERATOR: Ninja
+
 jobs:
  build_and_test_default:
-    name: bazel.${{ matrix.os }}.${{ matrix.bzlmod && 'bzlmod' || 'no_bzlmod' }}
+    name: bazel.${{ matrix.os }}
    runs-on: ${{ matrix.os }}
    strategy:
      fail-fast: false
      matrix:
        os: [ubuntu-latest, macos-latest, windows-latest]
-        bzlmod: [false, true]
    steps:
-    - uses: actions/checkout@v3
+    - uses: actions/checkout@v4

    - name: mount bazel cache
-      uses: actions/cache@v3
+      uses: actions/cache@v4
      env:
        cache-name: bazel-cache
      with:
@@ -28,8 +30,8 @@ jobs:

    - name: build
      run: |
-        bazel build ${{ matrix.bzlmod && '--enable_bzlmod' || '--noenable_bzlmod' }} //:benchmark //:benchmark_main //test/...
+        bazel build //:benchmark //:benchmark_main //test/...

    - name: test
      run: |
-        bazel test ${{ matrix.bzlmod && '--enable_bzlmod' || '--noenable_bzlmod' }} --test_output=all //test/...
+        bazel test --test_output=all //test/...
--- a/.github/workflows/build-and-test-min-cmake.yml
+++ b/.github/workflows/build-and-test-min-cmake.yml
@@ -6,6 +6,9 @@ on:
  pull_request:
    branches: [ main ]

+env:
+  CMAKE_GENERATOR: Ninja
+
 jobs:
  job:
    name: ${{ matrix.os }}.min-cmake
@@ -16,11 +19,11 @@ jobs:
        os: [ubuntu-latest, macos-latest]

    steps:
-      - uses: actions/checkout@v3
+      - uses: actions/checkout@v4

      - uses: lukka/get-cmake@latest
        with:
-          cmakeVersion: 3.10.0
+          cmakeVersion: 3.13.0

      - name: create build environment
        run: cmake -E make_directory ${{ runner.workspace }}/_build
--- a/.github/workflows/build-and-test-perfcounters.yml
+++ b/.github/workflows/build-and-test-perfcounters.yml
@@ -6,6 +6,9 @@ on:
  pull_request:
    branches: [ main ]

+env:
+  CMAKE_GENERATOR: Ninja
+
 jobs:
  job:
    # TODO(dominic): Extend this to include compiler and set through env: CC/CXX.
@@ -14,10 +17,10 @@ jobs:
    strategy:
      fail-fast: false
      matrix:
-        os: [ubuntu-22.04, ubuntu-20.04]
+        os: [ubuntu-latest]
        build_type: ['Release', 'Debug']
    steps:
-    - uses: actions/checkout@v3
+    - uses: actions/checkout@v4

    - name: install libpfm
      run: |
--- a/.github/workflows/build-and-test.yml
+++ b/.github/workflows/build-and-test.yml
@@ -6,6 +6,9 @@ on:
  pull_request:
    branches: [ main ]

+env:
+  CMAKE_GENERATOR: Ninja
+
 jobs:
  # TODO: add 32-bit builds (g++ and clang++) for ubuntu
  #   (requires g++-multilib and libc6:i386)
@@ -17,41 +20,30 @@ jobs:
    strategy:
      fail-fast: false
      matrix:
-        os: [ubuntu-22.04, ubuntu-20.04, macos-latest]
+        os: [ubuntu-24.04, ubuntu-22.04, ubuntu-24.04-arm, macos-latest]
        build_type: ['Release', 'Debug']
        compiler: ['g++', 'clang++']
        lib: ['shared', 'static']

    steps:
-      - uses: actions/checkout@v3
+      - name: Install dependencies (macos)
+        if: runner.os == 'macOS'
+        run: brew install ninja

-      - uses: lukka/get-cmake@latest
-
-      - name: create build environment
-        run: cmake -E make_directory ${{ runner.workspace }}/_build
-
-      - name: setup cmake initial cache
-        run: touch compiler-cache.cmake
-
-      - name: configure cmake
-        env:
-          CXX: ${{ matrix.compiler }}
-        shell: bash
-        working-directory: ${{ runner.workspace }}/_build
-        run: >
-          cmake -C ${{ github.workspace }}/compiler-cache.cmake
-          $GITHUB_WORKSPACE
-          -DBENCHMARK_DOWNLOAD_DEPENDENCIES=ON
-          -DBUILD_SHARED_LIBS=${{ matrix.lib == 'shared' }}
-          -DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
-          -DCMAKE_CXX_COMPILER=${{ env.CXX }}
-          -DCMAKE_CXX_VISIBILITY_PRESET=hidden
-          -DCMAKE_VISIBILITY_INLINES_HIDDEN=ON
+      - uses: actions/checkout@v4

      - name: build
-        shell: bash
-        working-directory: ${{ runner.workspace }}/_build
-        run: cmake --build . --config ${{ matrix.build_type }}
+        uses: threeal/cmake-action@v2.1.0
+        with:
+          build-dir: ${{ runner.workspace }}/_build
+          cxx-compiler: ${{ matrix.compiler }}
+          options: |
+            BENCHMARK_DOWNLOAD_DEPENDENCIES=ON
+            BUILD_SHARED_LIBS=${{ matrix.lib == 'shared' }}
+            CMAKE_BUILD_TYPE=${{ matrix.build_type }}
+            CMAKE_CXX_COMPILER=${{ matrix.compiler }}
+            CMAKE_CXX_VISIBILITY_PRESET=hidden
+            CMAKE_VISIBILITY_INLINES_HIDDEN=ON

      - name: test
        shell: bash
@@ -70,8 +62,6 @@ jobs:
        msvc:
          - VS-16-2019
          - VS-17-2022
-        arch:
-          - x64
        build_type:
          - Debug
          - Release
@@ -87,28 +77,75 @@ jobs:
            generator: 'Visual Studio 17 2022'

    steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v4

      - uses: lukka/get-cmake@latest

      - name: configure cmake
        run: >
-          cmake -S . -B _build/
-          -A ${{ matrix.arch }}
+          cmake -S . -B ${{ runner.workspace }}/_build/
          -G "${{ matrix.generator }}"
          -DBENCHMARK_DOWNLOAD_DEPENDENCIES=ON
          -DBUILD_SHARED_LIBS=${{ matrix.lib == 'shared' }}

+      - name: build
+        run: cmake --build ${{ runner.workspace }}/_build/ --config ${{ matrix.build_type }}
+
+      - name: test
+        run: ctest --test-dir ${{ runner.workspace }}/_build/ -C ${{ matrix.build_type }} -VV
+
+  msys2:
+    name: ${{ matrix.os }}.${{ matrix.build_type }}.${{ matrix.lib }}.${{ matrix.msys2.msystem }}
+    runs-on: ${{ matrix.os }}
+    defaults:
+        run:
+            shell: msys2 {0}
+    strategy:
+      fail-fast: false
+      matrix:
+        os: [ windows-latest ]
+        msys2:
+          - { msystem: MINGW64,    arch: x86_64,  family: GNU,  compiler: g++ }
+          - { msystem: CLANG64,    arch: x86_64,  family: LLVM, compiler: clang++ }
+          - { msystem: UCRT64,     arch: x86_64,  family: GNU,  compiler: g++ }
+        build_type:
+          - Debug
+          - Release
+        lib:
+          - shared
+          - static
+
+    steps:
+      - name: setup msys2
+        uses: msys2/setup-msys2@v2
+        with:
+          cache: false
+          msystem: ${{ matrix.msys2.msystem }}
+          update: true
+          install: >-
+            git
+            base-devel
+          pacboy: >-
+            gcc:p
+            clang:p
+            cmake:p
+            ninja:p
+
+      - uses: actions/checkout@v4
+
+      # NOTE: we can't use cmake actions here as we need to do everything in msys2 shell.
+      - name: configure cmake
+        env:
+          CXX: ${{ matrix.msys2.compiler }}
+        run: >
+          cmake -S . -B _build/
+          -GNinja
+          -DBENCHMARK_DOWNLOAD_DEPENDENCIES=ON
+          -DBUILD_SHARED_LIBS=${{ matrix.lib == 'shared' }}
+
      - name: build
        run: cmake --build _build/ --config ${{ matrix.build_type }}

-      - name: setup test environment
-        # Make sure gmock and benchmark DLLs can be found
-        run: >
-            echo "$((Get-Item .).FullName)/_build/bin/${{ matrix.build_type }}" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append;
-            echo "$((Get-Item .).FullName)/_build/src/${{ matrix.build_type }}" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append;
-
      - name: test
-        run: ctest --test-dir _build/ -C ${{ matrix.build_type }} -VV
-
-
+        working-directory: _build
+        run: ctest -C ${{ matrix.build_type }} -VV
--- a/.github/workflows/clang-format-lint.yml
+++ b/.github/workflows/clang-format-lint.yml
@@ -3,15 +3,17 @@ on:
  push: {}
  pull_request: {}

+env:
+  CMAKE_GENERATOR: Ninja
+
 jobs:
-  build:
+  job:
+    name: check-clang-format
    runs-on: ubuntu-latest

    steps:
-    - uses: actions/checkout@v3
-    - uses: DoozyX/clang-format-lint-action@v0.13
+    - uses: actions/checkout@v4
+    - uses: DoozyX/clang-format-lint-action@v0.18.2
      with:
        source: './include/benchmark ./src ./test'
-        extensions: 'h,cc'
-        clangFormatVersion: 12
-        style: Google
+        clangFormatVersion: 18
--- a/.github/workflows/clang-tidy-lint.yml
+++ b/.github/workflows/clang-tidy-lint.yml
@@ -4,6 +4,9 @@ on:
  push: {}
  pull_request: {}

+env:
+  CMAKE_GENERATOR: Ninja
+
 jobs:
  job:
    name: run-clang-tidy
@@ -11,17 +14,17 @@ jobs:
    strategy:
      fail-fast: false
    steps:
-    - uses: actions/checkout@v3
+    - uses: actions/checkout@v4

    - name: install clang-tidy
      run: sudo apt update && sudo apt -y install clang-tidy

    - name: create build environment
-      run: cmake -E make_directory ${{ runner.workspace }}/_build
+      run: cmake -E make_directory ${{ github.workspace }}/_build

    - name: configure cmake
      shell: bash
-      working-directory: ${{ runner.workspace }}/_build
+      working-directory: ${{ github.workspace }}/_build
      run: >
        cmake $GITHUB_WORKSPACE
        -DBENCHMARK_ENABLE_ASSEMBLY_TESTS=OFF
@@ -34,5 +37,5 @@ jobs:

    - name: run
      shell: bash
-      working-directory: ${{ runner.workspace }}/_build
-      run: run-clang-tidy
+      working-directory: ${{ github.workspace }}/_build
+      run: run-clang-tidy -config-file=$GITHUB_WORKSPACE/.clang-tidy
--- a/.github/workflows/doxygen.yml
+++ b/.github/workflows/doxygen.yml
@@ -6,13 +6,16 @@ on:
  pull_request:
    branches: [main]

+env:
+  CMAKE_GENERATOR: Ninja
+
 jobs:
  build-and-deploy:
    name: Build HTML documentation
    runs-on: ubuntu-latest
    steps:
    - name: Fetching sources
-      uses: actions/checkout@v3
+      uses: actions/checkout@v4

    - name: Installing build dependencies
      run: |
--- a/.github/workflows/pre-commit.yml
+++ b/.github/workflows/pre-commit.yml
@@ -0,0 +1,41 @@
+name: python + Bazel pre-commit checks
+
+on:
+  push:
+    branches: [ main ]
+  pull_request:
+    branches: [ main ]
+
+env:
+  CMAKE_GENERATOR: Ninja
+
+jobs:
+  pre-commit:
+    runs-on: ubuntu-latest
+    env:
+      MYPY_CACHE_DIR: "${{ github.workspace }}/.cache/mypy"
+      RUFF_CACHE_DIR: "${{ github.workspace }}/.cache/ruff"
+      PRE_COMMIT_HOME: "${{ github.workspace }}/.cache/pre-commit"
+
+    steps:
+    - uses: actions/checkout@v4
+      with:
+        fetch-depth: 0
+    - name: Set up Python
+      uses: actions/setup-python@v5
+      with:
+        python-version: 3.11
+        cache: pip
+        cache-dependency-path: pyproject.toml
+    - name: Install dependencies
+      run: python -m pip install ".[dev]"
+    - name: Cache pre-commit tools
+      uses: actions/cache@v4
+      with:
+        path: |
+          ${{ env.MYPY_CACHE_DIR }}
+          ${{ env.RUFF_CACHE_DIR }}
+          ${{ env.PRE_COMMIT_HOME }}
+        key: ${{ runner.os }}-${{ hashFiles('.pre-commit-config.yaml') }}-linter-cache
+    - name: Run pre-commit checks
+      run: pre-commit run --all-files --verbose --show-diff-on-failure
--- a/.github/workflows/pylint.yml
+++ b/.github/workflows/pylint.yml
@@ -1,28 +0,0 @@
-name: pylint
-
-on:
-  push:
-    branches: [ main ]
-  pull_request:
-    branches: [ main ]
-
-jobs:
-  pylint:
-
-    runs-on: ubuntu-latest
-
-    steps:
-    - uses: actions/checkout@v3
-    - name: Set up Python 3.8
-      uses: actions/setup-python@v1
-      with:
-        python-version: 3.8
-
-    - name: Install dependencies
-      run: |
-        python -m pip install --upgrade pip
-        pip install pylint pylint-exit conan
-
-    - name: Run pylint
-      run: |
-        pylint `find . -name '*.py'|xargs` || pylint-exit $?
--- a/.github/workflows/sanitizer.yml
+++ b/.github/workflows/sanitizer.yml
@@ -5,6 +5,7 @@ on:
  pull_request: {}

 env:
+  CMAKE_GENERATOR: Ninja
  UBSAN_OPTIONS: "print_stacktrace=1"

 jobs:
@@ -18,7 +19,7 @@ jobs:
        sanitizer: ['asan', 'ubsan', 'tsan', 'msan']

    steps:
-    - uses: actions/checkout@v3
+    - uses: actions/checkout@v4

    - name: configure msan env
      if: matrix.sanitizer == 'msan'
@@ -65,7 +66,7 @@ jobs:
      if: matrix.sanitizer != 'asan'
      run: |
        "${GITHUB_WORKSPACE}/.github/libcxx-setup.sh"
-        echo "EXTRA_CXX_FLAGS=-stdlib=libc++ -L ${GITHUB_WORKSPACE}/llvm-build/lib -lc++abi -Isystem${GITHUB_WORKSPACE}/llvm-build/include -Isystem${GITHUB_WORKSPACE}/llvm-build/include/c++/v1 -Wl,-rpath,${GITHUB_WORKSPACE}/llvm-build/lib" >> $GITHUB_ENV
+        echo "EXTRA_CXX_FLAGS=-stdlib=libc++ -L${GITHUB_WORKSPACE}/llvm-build/lib -lc++abi -I${GITHUB_WORKSPACE}/llvm-build/include/c++/v1 -Isystem${GITHUB_WORKSPACE}/llvm-build/include/c++/v1 -Wl,-rpath,${GITHUB_WORKSPACE}/llvm-build/lib" >> $GITHUB_ENV

    - name: create build environment
      run: cmake -E make_directory ${{ runner.workspace }}/_build
@@ -75,7 +76,7 @@ jobs:
      working-directory: ${{ runner.workspace }}/_build
      run: >
        VERBOSE=1
-        cmake $GITHUB_WORKSPACE
+        cmake -GNinja $GITHUB_WORKSPACE
        -DBENCHMARK_ENABLE_ASSEMBLY_TESTS=OFF
        -DBENCHMARK_ENABLE_LIBPFM=OFF
        -DBENCHMARK_DOWNLOAD_DEPENDENCIES=ON
--- a/.github/workflows/test_bindings.yml
+++ b/.github/workflows/test_bindings.yml
@@ -6,24 +6,28 @@ on:
  pull_request:
    branches: [main]

+env:
+  CMAKE_GENERATOR: Ninja
+
 jobs:
  python_bindings:
-    name: Test GBM Python bindings on ${{ matrix.os }}
+    name: Test GBM Python ${{ matrix.python-version }} bindings on ${{ matrix.os }}
    runs-on: ${{ matrix.os }}
    strategy:
      fail-fast: false
      matrix:
-        os: [ ubuntu-latest, macos-latest, windows-2019 ]
+        os: [ ubuntu-latest, macos-latest, windows-latest ]
+        python-version: [ "3.10", "3.11", "3.12", "3.13" ]

    steps:
-      - uses: actions/checkout@v3
-      - name: Set up Python
-        uses: actions/setup-python@v4
+      - uses: actions/checkout@v4
        with:
-          python-version: 3.11
-      - name: Install GBM Python bindings on ${{ matrix.os}}
-        run:
-          python -m pip install wheel .
-      - name: Run bindings example on ${{ matrix.os }}
-        run:
-          python bindings/python/google_benchmark/example.py
+          fetch-depth: 0
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v5
+        with:
+          python-version: ${{ matrix.python-version }}
+      - name: Install GBM Python bindings on ${{ matrix.os }}
+        run: python -m pip install .
+      - name: Run example on ${{ matrix.os }} under Python ${{ matrix.python-version }}
+        run: python bindings/python/google_benchmark/example.py
--- a/.github/workflows/wheels.yml
+++ b/.github/workflows/wheels.yml
@@ -6,26 +6,28 @@ on:
    types:
      - published

+env:
+  CMAKE_GENERATOR: Ninja
+
 jobs:
  build_sdist:
    name: Build source distribution
    runs-on: ubuntu-latest
    steps:
      - name: Check out repo
-        uses: actions/checkout@v3
-
-      - name: Install Python 3.11
-        uses: actions/setup-python@v4
+        uses: actions/checkout@v4
        with:
-          python-version: 3.11
-
-      - name: Build and check sdist
-        run: |
-          python setup.py sdist
-      - name: Upload sdist
-        uses: actions/upload-artifact@v3
+          fetch-depth: 0
+      - name: Install Python 3.12
+        uses: actions/setup-python@v5
        with:
-          name: dist
+          python-version: "3.12"
+      - run: python -m pip install build
+      - name: Build sdist
+        run: python -m build --sdist
+      - uses: actions/upload-artifact@v4
+        with:
+          name: dist-sdist
          path: dist/*.tar.gz

  build_wheels:
@@ -33,47 +35,57 @@ jobs:
    runs-on: ${{ matrix.os }}
    strategy:
      matrix:
-        os: [ubuntu-latest, macos-latest, windows-2019]
+        os: [ubuntu-latest, macos-13, macos-14, windows-latest]

    steps:
      - name: Check out Google Benchmark
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+
+      - uses: actions/setup-python@v5
+        name: Install Python 3.12
+        with:
+          python-version: "3.12"
+      - run: pip install --upgrade pip uv

      - name: Set up QEMU
        if: runner.os == 'Linux'
-        uses: docker/setup-qemu-action@v2
+        uses: docker/setup-qemu-action@v3
        with:
          platforms: all

      - name: Build wheels on ${{ matrix.os }} using cibuildwheel
-        uses: pypa/cibuildwheel@v2.14.1
+        uses: pypa/cibuildwheel@v2.22.0
        env:
-          CIBW_BUILD: 'cp38-* cp39-* cp310-* cp311-*'
+          CIBW_BUILD: "cp310-* cp311-* cp312-*"
+          CIBW_BUILD_FRONTEND: "build[uv]"
          CIBW_SKIP: "*-musllinux_*"
-          CIBW_TEST_SKIP: "*-macosx_arm64"
-          CIBW_ARCHS_LINUX: x86_64 aarch64
-          CIBW_ARCHS_MACOS: x86_64 arm64
-          CIBW_ARCHS_WINDOWS: AMD64
+          CIBW_ARCHS_LINUX: auto64 aarch64
+          CIBW_ARCHS_WINDOWS: auto64
          CIBW_BEFORE_ALL_LINUX: bash .github/install_bazel.sh
+          # Grab the rootless Bazel installation inside the container.
+          CIBW_ENVIRONMENT_LINUX: PATH=$PATH:$HOME/bin
          CIBW_TEST_COMMAND: python {project}/bindings/python/google_benchmark/example.py
+          # unused by Bazel, but needed explicitly by delocate on MacOS.
+          MACOSX_DEPLOYMENT_TARGET: "10.14"

      - name: Upload Google Benchmark ${{ matrix.os }} wheels
-        uses: actions/upload-artifact@v3
+        uses: actions/upload-artifact@v4
        with:
-          name: dist
-          path: ./wheelhouse/*.whl
+          name: dist-${{ matrix.os }}
+          path: wheelhouse/*.whl

  pypi_upload:
    name: Publish google-benchmark wheels to PyPI
    needs: [build_sdist, build_wheels]
    runs-on: ubuntu-latest
+    permissions:
+      id-token: write
    steps:
-    - uses: actions/download-artifact@v3
-      with:
-        name: dist
-        path: dist
-
-    - uses: pypa/gh-action-pypi-publish@v1.6.4
-      with:
-        user: __token__
-        password: ${{ secrets.PYPI_PASSWORD }}
+      - uses: actions/download-artifact@v4
+        with:
+          path: dist
+          pattern: dist-*
+          merge-multiple: true
+      - uses: pypa/gh-action-pypi-publish@release/v1
--- a/.gitignore
+++ b/.gitignore
@@ -46,6 +46,7 @@ rules.ninja

 # bazel output symlinks.
 bazel-*
+MODULE.bazel.lock

 # out-of-source build top-level folders.
 build/
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -0,0 +1,18 @@
+repos:
+  -   repo: https://github.com/keith/pre-commit-buildifier
+      rev: 8.0.1
+      hooks:
+      -   id: buildifier
+      -   id: buildifier-lint
+  - repo: https://github.com/pre-commit/mirrors-mypy
+    rev: v1.15.0
+    hooks:
+      - id: mypy
+        types_or: [ python, pyi ]
+        args: [ "--ignore-missing-imports", "--scripts-are-modules" ]
+  - repo: https://github.com/astral-sh/ruff-pre-commit
+    rev: v0.9.6
+    hooks:
+      - id: ruff
+        args: [ --fix, --exit-non-zero-on-fix ]
+      - id: ruff-format
--- a/.travis.yml
+++ b/.travis.yml
@@ -1,208 +0,0 @@
-sudo: required
-dist: trusty
-language: cpp
-
-matrix:
-  include:
-    - compiler: gcc
-      addons:
-        apt:
-          packages:
-            - lcov
-      env: COMPILER=g++ C_COMPILER=gcc BUILD_TYPE=Coverage
-    - compiler: gcc
-      addons:
-        apt:
-          packages:
-            - g++-multilib
-            - libc6:i386
-      env:
-        - COMPILER=g++
-        - C_COMPILER=gcc
-        - BUILD_TYPE=Debug
-        - BUILD_32_BITS=ON
-        - EXTRA_FLAGS="-m32"
-    - compiler: gcc
-      addons:
-        apt:
-          packages:
-            - g++-multilib
-            - libc6:i386
-      env:
-        - COMPILER=g++
-        - C_COMPILER=gcc
-        - BUILD_TYPE=Release
-        - BUILD_32_BITS=ON
-        - EXTRA_FLAGS="-m32"
-    - compiler: gcc
-      env:
-        - INSTALL_GCC6_FROM_PPA=1
-        - COMPILER=g++-6 C_COMPILER=gcc-6  BUILD_TYPE=Debug
-        - ENABLE_SANITIZER=1
-        - EXTRA_FLAGS="-fno-omit-frame-pointer -g -O2 -fsanitize=undefined,address -fuse-ld=gold"
-    # Clang w/ libc++
-    - compiler: clang
-      dist: xenial
-      addons:
-        apt:
-          packages:
-            clang-3.8
-      env:
-        - INSTALL_GCC6_FROM_PPA=1
-        - COMPILER=clang++-3.8 C_COMPILER=clang-3.8 BUILD_TYPE=Debug
-        - LIBCXX_BUILD=1
-        - EXTRA_CXX_FLAGS="-stdlib=libc++"
-    - compiler: clang
-      dist: xenial
-      addons:
-        apt:
-          packages:
-            clang-3.8
-      env:
-        - INSTALL_GCC6_FROM_PPA=1
-        - COMPILER=clang++-3.8 C_COMPILER=clang-3.8 BUILD_TYPE=Release
-        - LIBCXX_BUILD=1
-        - EXTRA_CXX_FLAGS="-stdlib=libc++"
-    # Clang w/ 32bit libc++
-    - compiler: clang
-      dist: xenial
-      addons:
-        apt:
-          packages:
-            - clang-3.8
-            - g++-multilib
-            - libc6:i386
-      env:
-        - INSTALL_GCC6_FROM_PPA=1
-        - COMPILER=clang++-3.8 C_COMPILER=clang-3.8 BUILD_TYPE=Debug
-        - LIBCXX_BUILD=1
-        - BUILD_32_BITS=ON
-        - EXTRA_FLAGS="-m32"
-        - EXTRA_CXX_FLAGS="-stdlib=libc++"
-    # Clang w/ 32bit libc++
-    - compiler: clang
-      dist: xenial
-      addons:
-        apt:
-          packages:
-            - clang-3.8
-            - g++-multilib
-            - libc6:i386
-      env:
-        - INSTALL_GCC6_FROM_PPA=1
-        - COMPILER=clang++-3.8 C_COMPILER=clang-3.8 BUILD_TYPE=Release
-        - LIBCXX_BUILD=1
-        - BUILD_32_BITS=ON
-        - EXTRA_FLAGS="-m32"
-        - EXTRA_CXX_FLAGS="-stdlib=libc++"
-    # Clang w/ libc++, ASAN, UBSAN
-    - compiler: clang
-      dist: xenial
-      addons:
-        apt:
-          packages:
-            clang-3.8
-      env:
-        - INSTALL_GCC6_FROM_PPA=1
-        - COMPILER=clang++-3.8 C_COMPILER=clang-3.8 BUILD_TYPE=Debug
-        - LIBCXX_BUILD=1 LIBCXX_SANITIZER="Undefined;Address"
-        - ENABLE_SANITIZER=1
-        - EXTRA_FLAGS="-g -O2 -fno-omit-frame-pointer -fsanitize=undefined,address -fno-sanitize-recover=all"
-        - EXTRA_CXX_FLAGS="-stdlib=libc++"
-        - UBSAN_OPTIONS=print_stacktrace=1
-    # Clang w/ libc++ and MSAN
-    - compiler: clang
-      dist: xenial
-      addons:
-        apt:
-          packages:
-            clang-3.8
-      env:
-        - INSTALL_GCC6_FROM_PPA=1
-        - COMPILER=clang++-3.8 C_COMPILER=clang-3.8 BUILD_TYPE=Debug
-        - LIBCXX_BUILD=1 LIBCXX_SANITIZER=MemoryWithOrigins
-        - ENABLE_SANITIZER=1
-        - EXTRA_FLAGS="-g -O2 -fno-omit-frame-pointer -fsanitize=memory -fsanitize-memory-track-origins"
-        - EXTRA_CXX_FLAGS="-stdlib=libc++"
-    # Clang w/ libc++ and MSAN
-    - compiler: clang
-      dist: xenial
-      addons:
-        apt:
-          packages:
-            clang-3.8
-      env:
-        - INSTALL_GCC6_FROM_PPA=1
-        - COMPILER=clang++-3.8 C_COMPILER=clang-3.8 BUILD_TYPE=RelWithDebInfo
-        - LIBCXX_BUILD=1 LIBCXX_SANITIZER=Thread
-        - ENABLE_SANITIZER=1
-        - EXTRA_FLAGS="-g -O2 -fno-omit-frame-pointer -fsanitize=thread -fno-sanitize-recover=all"
-        - EXTRA_CXX_FLAGS="-stdlib=libc++"
-    - os: osx
-      osx_image: xcode8.3
-      compiler: clang
-      env:
-        - COMPILER=clang++
-        - BUILD_TYPE=Release
-        - BUILD_32_BITS=ON
-        - EXTRA_FLAGS="-m32"
-
-before_script:
-  - if [ -n "${LIBCXX_BUILD}" ]; then
-      source .libcxx-setup.sh;
-    fi
-  - if [ -n "${ENABLE_SANITIZER}" ]; then
-      export EXTRA_OPTIONS="-DBENCHMARK_ENABLE_ASSEMBLY_TESTS=OFF";
-    else
-      export EXTRA_OPTIONS="";
-    fi
-  - mkdir -p build && cd build
-
-before_install:
-  - if [ -z "$BUILD_32_BITS" ]; then
-      export BUILD_32_BITS=OFF && echo disabling 32 bit build;
-    fi
-  - if [ -n "${INSTALL_GCC6_FROM_PPA}" ]; then
-      sudo add-apt-repository -y "ppa:ubuntu-toolchain-r/test";
-      sudo apt-get update --option Acquire::Retries=100 --option Acquire::http::Timeout="60";
-    fi
-
-install:
-  - if [ -n "${INSTALL_GCC6_FROM_PPA}" ]; then
-      travis_wait sudo -E apt-get -yq --no-install-suggests --no-install-recommends install g++-6;
-    fi
-  - if [ "${TRAVIS_OS_NAME}" == "linux" -a "${BUILD_32_BITS}" == "OFF" ]; then
-      travis_wait sudo -E apt-get -y --no-install-suggests --no-install-recommends install llvm-3.9-tools;
-      sudo cp /usr/lib/llvm-3.9/bin/FileCheck /usr/local/bin/;
-    fi
-  - if [ "${BUILD_TYPE}" == "Coverage" -a "${TRAVIS_OS_NAME}" == "linux" ]; then
-      PATH=~/.local/bin:${PATH};
-      pip install --user --upgrade pip;
-      travis_wait pip install --user cpp-coveralls;
-    fi
-  - if [ "${C_COMPILER}" == "gcc-7" -a "${TRAVIS_OS_NAME}" == "osx" ]; then
-      rm -f /usr/local/include/c++;
-      brew update;
-      travis_wait brew install gcc@7;
-    fi
-  - if [ "${TRAVIS_OS_NAME}" == "linux" ]; then
-      sudo apt-get update -qq;
-      sudo apt-get install -qq unzip cmake3;
-      wget https://github.com/bazelbuild/bazel/releases/download/3.2.0/bazel-3.2.0-installer-linux-x86_64.sh --output-document bazel-installer.sh;
-      travis_wait sudo bash bazel-installer.sh;
-    fi
-  - if [ "${TRAVIS_OS_NAME}" == "osx" ]; then
-      curl -L -o bazel-installer.sh https://github.com/bazelbuild/bazel/releases/download/3.2.0/bazel-3.2.0-installer-darwin-x86_64.sh;
-      travis_wait sudo bash bazel-installer.sh;
-    fi
-
-script:
-  - cmake -DCMAKE_C_COMPILER=${C_COMPILER} -DCMAKE_CXX_COMPILER=${COMPILER} -DCMAKE_BUILD_TYPE=${BUILD_TYPE} -DCMAKE_C_FLAGS="${EXTRA_FLAGS}" -DCMAKE_CXX_FLAGS="${EXTRA_FLAGS} ${EXTRA_CXX_FLAGS}" -DBENCHMARK_DOWNLOAD_DEPENDENCIES=ON -DBENCHMARK_BUILD_32_BITS=${BUILD_32_BITS} ${EXTRA_OPTIONS} ..
-  - make
-  - ctest -C ${BUILD_TYPE} --output-on-failure
-  - bazel test -c dbg --define google_benchmark.have_regex=posix --announce_rc --verbose_failures --test_output=errors --keep_going //test/...
-
-after_success:
-  - if [ "${BUILD_TYPE}" == "Coverage" -a "${TRAVIS_OS_NAME}" == "linux" ]; then
-      coveralls --include src --include include --gcov-options '\-lp' --root .. --build-root .;
-    fi
--- a/.ycm_extra_conf.py
+++ b/.ycm_extra_conf.py
@ -1,25 +1,30 @@
 import os
+
 import ycm_core

 # These are the compilation flags that will be used in case there's no
 # compilation database set (by default, one is not set).
 # CHANGE THIS LIST OF FLAGS. YES, THIS IS THE DROID YOU HAVE BEEN LOOKING FOR.
 flags = [
-'-Wall',
-'-Werror',
-'-pedantic-errors',
-'-std=c++0x',
-'-fno-strict-aliasing',
-'-O3',
-'-DNDEBUG',
-# ...and the same thing goes for the magic -x option which specifies the
-# language that the files to be compiled are written in. This is mostly
-# relevant for c++ headers.
-# For a C project, you would set this to 'c' instead of 'c++'.
-'-x', 'c++',
-'-I', 'include',
-'-isystem', '/usr/include',
-'-isystem', '/usr/local/include',
+    "-Wall",
+    "-Werror",
+    "-pedantic-errors",
+    "-std=c++0x",
+    "-fno-strict-aliasing",
+    "-O3",
+    "-DNDEBUG",
+    # ...and the same thing goes for the magic -x option which specifies the
+    # language that the files to be compiled are written in. This is mostly
+    # relevant for c++ headers.
+    # For a C project, you would set this to 'c' instead of 'c++'.
+    "-x",
+    "c++",
+    "-I",
+    "include",
+    "-isystem",
+    "/usr/include",
+    "-isystem",
+    "/usr/local/include",
 ]


@ -29,87 +34,87 @@ flags = [
 #
 # Most projects will NOT need to set this to anything; you can just change the
 # 'flags' list of compilation flags. Notice that YCM itself uses that approach.
-compilation_database_folder = ''
+compilation_database_folder = ""

-if os.path.exists( compilation_database_folder ):
-  database = ycm_core.CompilationDatabase( compilation_database_folder )
+if os.path.exists(compilation_database_folder):
+    database = ycm_core.CompilationDatabase(compilation_database_folder)
 else:
-  database = None
+    database = None
+
+SOURCE_EXTENSIONS = [".cc"]

-SOURCE_EXTENSIONS = [ '.cc' ]

 def DirectoryOfThisScript():
-  return os.path.dirname( os.path.abspath( __file__ ) )
+    return os.path.dirname(os.path.abspath(__file__))


-def MakeRelativePathsInFlagsAbsolute( flags, working_directory ):
-  if not working_directory:
-    return list( flags )
-  new_flags = []
-  make_next_absolute = False
-  path_flags = [ '-isystem', '-I', '-iquote', '--sysroot=' ]
-  for flag in flags:
-    new_flag = flag
+def MakeRelativePathsInFlagsAbsolute(flags, working_directory):
+    if not working_directory:
+        return list(flags)
+    new_flags = []
+    make_next_absolute = False
+    path_flags = ["-isystem", "-I", "-iquote", "--sysroot="]
+    for flag in flags:
+        new_flag = flag

-    if make_next_absolute:
-      make_next_absolute = False
-      if not flag.startswith( '/' ):
-        new_flag = os.path.join( working_directory, flag )
+        if make_next_absolute:
+            make_next_absolute = False
+            if not flag.startswith("/"):
+                new_flag = os.path.join(working_directory, flag)

-    for path_flag in path_flags:
-      if flag == path_flag:
-        make_next_absolute = True
-        break
+        for path_flag in path_flags:
+            if flag == path_flag:
+                make_next_absolute = True
+                break

-      if flag.startswith( path_flag ):
-        path = flag[ len( path_flag ): ]
-        new_flag = path_flag + os.path.join( working_directory, path )
-        break
+            if flag.startswith(path_flag):
+                path = flag[len(path_flag) :]
+                new_flag = path_flag + os.path.join(working_directory, path)
+                break

-    if new_flag:
-      new_flags.append( new_flag )
-  return new_flags
+        if new_flag:
+            new_flags.append(new_flag)
+    return new_flags


-def IsHeaderFile( filename ):
-  extension = os.path.splitext( filename )[ 1 ]
-  return extension in [ '.h', '.hxx', '.hpp', '.hh' ]
+def IsHeaderFile(filename):
+    extension = os.path.splitext(filename)[1]
+    return extension in [".h", ".hxx", ".hpp", ".hh"]


-def GetCompilationInfoForFile( filename ):
-  # The compilation_commands.json file generated by CMake does not have entries
-  # for header files. So we do our best by asking the db for flags for a
-  # corresponding source file, if any. If one exists, the flags for that file
-  # should be good enough.
-  if IsHeaderFile( filename ):
-    basename = os.path.splitext( filename )[ 0 ]
-    for extension in SOURCE_EXTENSIONS:
-      replacement_file = basename + extension
-      if os.path.exists( replacement_file ):
-        compilation_info = database.GetCompilationInfoForFile(
-          replacement_file )
-        if compilation_info.compiler_flags_:
-          return compilation_info
-    return None
-  return database.GetCompilationInfoForFile( filename )
+def GetCompilationInfoForFile(filename):
+    # The compile_commands.json file generated by CMake does not have
+    # entries for header files. So we do our best by asking the db for flags for
+    # a corresponding source file, if any. If one exists, the flags for that
+    # file should be good enough.
+    if IsHeaderFile(filename):
+        basename = os.path.splitext(filename)[0]
+        for extension in SOURCE_EXTENSIONS:
+            replacement_file = basename + extension
+            if os.path.exists(replacement_file):
+                compilation_info = database.GetCompilationInfoForFile(
+                    replacement_file
+                )
+                if compilation_info.compiler_flags_:
+                    return compilation_info
+        return None
+    return database.GetCompilationInfoForFile(filename)


-def FlagsForFile( filename, **kwargs ):
-  if database:
-    # Bear in mind that compilation_info.compiler_flags_ does NOT return a
-    # python list, but a "list-like" StringVec object
-    compilation_info = GetCompilationInfoForFile( filename )
-    if not compilation_info:
-      return None
+def FlagsForFile(filename, **kwargs):
+    if database:
+        # Bear in mind that compilation_info.compiler_flags_ does NOT return a
+        # python list, but a "list-like" StringVec object
+        compilation_info = GetCompilationInfoForFile(filename)
+        if not compilation_info:
+            return None

-    final_flags = MakeRelativePathsInFlagsAbsolute(
-      compilation_info.compiler_flags_,
-      compilation_info.compiler_working_dir_ )
-  else:
-    relative_to = DirectoryOfThisScript()
-    final_flags = MakeRelativePathsInFlagsAbsolute( flags, relative_to )
+        final_flags = MakeRelativePathsInFlagsAbsolute(
+            compilation_info.compiler_flags_,
+            compilation_info.compiler_working_dir_,
+        )
+    else:
+        relative_to = DirectoryOfThisScript()
+        final_flags = MakeRelativePathsInFlagsAbsolute(flags, relative_to)

-  return {
-    'flags': final_flags,
-    'do_cache': True
-  }
+    return {"flags": final_flags, "do_cache": True}
--- a/1
+++ b/1
@ -31,6 +31,7 @@ Evgeny Safronov <division494@gmail.com>
 Fabien Pichot <pichot.fabien@gmail.com>
 Federico Ficarelli <federico.ficarelli@gmail.com>
 Felix Homann <linuxaudio@showlabor.de>
+Gergely Meszaros <maetveis@gmail.com>
 Gergő Szitár <szitar.gergo@gmail.com>
 Google Inc.
 Henrique Bucher <hbucher@gmail.com>
--- a/BUILD.bazel
+++ b/BUILD.bazel
@ -1,29 +1,34 @@
+load("@rules_cc//cc:defs.bzl", "cc_library")
+
 licenses(["notice"])

-config_setting(
-    name = "qnx",
-    constraint_values = ["@platforms//os:qnx"],
-    values = {
-        "cpu": "x64_qnx",
-    },
-    visibility = [":__subpackages__"],
-)
+COPTS = [
+    "-pedantic",
+    "-pedantic-errors",
+    "-std=c++17",
+    "-Wall",
+    "-Wconversion",
+    "-Wextra",
+    "-Wshadow",
+    #    "-Wshorten-64-to-32",
+    "-Wfloat-equal",
+    "-fstrict-aliasing",
+    ## assert() calls are used a lot in tests upstream; they may be optimised out,
+    ## leading to unused-variable warnings.
+    "-Wno-unused-variable",
+    "-Werror=old-style-cast",
+]
+
+MSVC_COPTS = [
+    "/std:c++17",
+]

 config_setting(
    name = "windows",
    constraint_values = ["@platforms//os:windows"],
-    values = {
-        "cpu": "x64_windows",
-    },
    visibility = [":__subpackages__"],
 )

-config_setting(
-    name = "macos",
-    constraint_values = ["@platforms//os:macos"],
-    visibility = ["//visibility:public"],
-)
-
 config_setting(
    name = "perfcounters",
    define_values = {
@ -45,28 +50,35 @@ cc_library(
        "include/benchmark/benchmark.h",
        "include/benchmark/export.h",
    ],
-    linkopts = select({
-        ":windows": ["-DEFAULTLIB:shlwapi.lib"],
-        "//conditions:default": ["-pthread"],
-    }),
    copts = select({
-        ":windows": [],
-        "//conditions:default": ["-Werror=old-style-cast"],
+        ":windows": MSVC_COPTS,
+        "//conditions:default": COPTS,
    }),
-    strip_include_prefix = "include",
-    visibility = ["//visibility:public"],
-    # Only static linking is allowed; no .so will be produced.
-    # Using `defines` (i.e. not `local_defines`) means that no
-    # dependent rules need to bother about defining the macro.
-    linkstatic = True,
    defines = [
        "BENCHMARK_STATIC_DEFINE",
+        "BENCHMARK_VERSION=\\\"" + (module_version() if module_version() != None else "") + "\\\"",
    ] + select({
        ":perfcounters": ["HAVE_LIBPFM"],
        "//conditions:default": [],
    }),
+    includes = ["include"],
+    linkopts = select({
+        ":windows": ["-DEFAULTLIB:shlwapi.lib"],
+        "//conditions:default": ["-pthread"],
+    }),
+    # Only static linking is allowed; no .so will be produced.
+    # Using `defines` (i.e. not `local_defines`) means that no
+    # dependent rules need to bother about defining the macro.
+    linkstatic = True,
+    local_defines = [
+        # Turn on Large-file Support
+        "_FILE_OFFSET_BITS=64",
+        "_LARGEFILE64_SOURCE",
+        "_LARGEFILE_SOURCE",
+    ],
+    visibility = ["//visibility:public"],
    deps = select({
-        ":perfcounters": ["@libpfm//:libpfm"],
+        ":perfcounters": ["@libpfm"],
        "//conditions:default": [],
    }),
 )
@ -74,8 +86,11 @@ cc_library(
 cc_library(
    name = "benchmark_main",
    srcs = ["src/benchmark_main.cc"],
-    hdrs = ["include/benchmark/benchmark.h", "include/benchmark/export.h"],
-    strip_include_prefix = "include",
+    hdrs = [
+        "include/benchmark/benchmark.h",
+        "include/benchmark/export.h",
+    ],
+    includes = ["include"],
    visibility = ["//visibility:public"],
    deps = [":benchmark"],
 )
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -1,7 +1,7 @@
-# Require CMake 3.10. If available, use the policies up to CMake 3.22.
-cmake_minimum_required (VERSION 3.10...3.22)
+# Require CMake 3.13. If available, use the policies up to CMake 3.22.
+cmake_minimum_required (VERSION 3.13...3.22)

-project (benchmark VERSION 1.8.3 LANGUAGES CXX)
+project (benchmark VERSION 1.9.2 LANGUAGES CXX)

 option(BENCHMARK_ENABLE_TESTING "Enable testing of the benchmark library." ON)
 option(BENCHMARK_ENABLE_EXCEPTIONS "Enable the use of exceptions in the benchmark library." ON)
@ -21,7 +21,7 @@ if(BENCHMARK_FORCE_WERROR)
  set(BENCHMARK_ENABLE_WERROR ON)
 endif(BENCHMARK_FORCE_WERROR)

-if(NOT MSVC)
+if(NOT (MSVC OR CMAKE_CXX_SIMULATE_ID STREQUAL "MSVC"))
  option(BENCHMARK_BUILD_32_BITS "Build a 32 bit version of the library." OFF)
 else()
  set(BENCHMARK_BUILD_32_BITS OFF CACHE BOOL "Build a 32 bit version of the library - unsupported when using MSVC)" FORCE)
@ -45,7 +45,7 @@ option(BENCHMARK_ENABLE_LIBPFM "Enable performance counters provided by libpfm"
 set(CMAKE_CXX_VISIBILITY_PRESET hidden)
 set(CMAKE_VISIBILITY_INLINES_HIDDEN ON)

-if(MSVC)
+if(CMAKE_CXX_COMPILER_ID STREQUAL "MSVC")
    # As of CMake 3.18, CMAKE_SYSTEM_PROCESSOR is not set properly for MSVC and
    # cross-compilation (e.g. Host=x86_64, target=aarch64) requires using the
    # undocumented, but working variable.
@ -66,7 +66,7 @@ function(should_enable_assembly_tests)
      return()
    endif()
  endif()
-  if (MSVC)
+  if (MSVC OR CMAKE_CXX_SIMULATE_ID STREQUAL "MSVC")
    return()
  elseif(NOT CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64")
    return()
@ -104,17 +104,27 @@ get_git_version(GIT_VERSION)

 # If no git version can be determined, use the version
 # from the project() command
-if ("${GIT_VERSION}" STREQUAL "0.0.0")
-  set(VERSION "${benchmark_VERSION}")
+if ("${GIT_VERSION}" STREQUAL "v0.0.0")
+  set(VERSION "v${benchmark_VERSION}")
 else()
  set(VERSION "${GIT_VERSION}")
 endif()
+
+# Normalize version: drop "v" prefix, replace first "-" with ".",
+# drop everything after second "-" (including said "-").
+string(STRIP ${VERSION} VERSION)
+if(VERSION MATCHES v[^-]*-)
+   string(REGEX REPLACE "v([^-]*)-([0-9]+)-.*" "\\1.\\2"  NORMALIZED_VERSION ${VERSION})
+else()
+   string(REGEX REPLACE "v(.*)" "\\1" NORMALIZED_VERSION ${VERSION})
+endif()
+
 # Tell the user what versions we are using
-message(STATUS "Google Benchmark version: ${VERSION}")
+message(STATUS "Google Benchmark version: ${VERSION}, normalized to ${NORMALIZED_VERSION}")

 # The version of the libraries
-set(GENERIC_LIB_VERSION ${VERSION})
-string(SUBSTRING ${VERSION} 0 1 GENERIC_LIB_SOVERSION)
+set(GENERIC_LIB_VERSION ${NORMALIZED_VERSION})
+string(SUBSTRING ${NORMALIZED_VERSION} 0 1 GENERIC_LIB_SOVERSION)

 # Import our CMake modules
 include(AddCXXCompilerFlag)
@ -128,11 +138,7 @@ if (BENCHMARK_BUILD_32_BITS)
  add_required_cxx_compiler_flag(-m32)
 endif()

-if (MSVC)
-  set(BENCHMARK_CXX_STANDARD 14)
-else()
-  set(BENCHMARK_CXX_STANDARD 11)
-endif()
+set(BENCHMARK_CXX_STANDARD 17)

 set(CMAKE_CXX_STANDARD ${BENCHMARK_CXX_STANDARD})
 set(CMAKE_CXX_STANDARD_REQUIRED YES)
@ -142,8 +148,17 @@ if (MSVC)
  # Turn compiler warnings up to 11
  string(REGEX REPLACE "[-/]W[1-4]" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /W4")
+
+  # MP flag only applies to cl, not cl frontends to other compilers (e.g. clang-cl, icx-cl etc)
+  if(CMAKE_CXX_COMPILER_ID MATCHES MSVC)
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /MP")
+  endif()
  add_definitions(-D_CRT_SECURE_NO_WARNINGS)

+  if(BENCHMARK_ENABLE_WERROR)
+      add_cxx_compiler_flag(-WX)
+  endif()
+
  if (NOT BENCHMARK_ENABLE_EXCEPTIONS)
    add_cxx_compiler_flag(-EHs-)
    add_cxx_compiler_flag(-EHa-)
@ -170,12 +185,17 @@ if (MSVC)
    set(CMAKE_EXE_LINKER_FLAGS_MINSIZEREL "${CMAKE_EXE_LINKER_FLAGS_MINSIZEREL} /LTCG")
  endif()
 else()
+  # Turn on Large-file Support
+  add_definitions(-D_FILE_OFFSET_BITS=64)
+  add_definitions(-D_LARGEFILE64_SOURCE)
+  add_definitions(-D_LARGEFILE_SOURCE)
  # Turn compiler warnings up to 11
  add_cxx_compiler_flag(-Wall)
  add_cxx_compiler_flag(-Wextra)
  add_cxx_compiler_flag(-Wshadow)
  add_cxx_compiler_flag(-Wfloat-equal)
  add_cxx_compiler_flag(-Wold-style-cast)
+  add_cxx_compiler_flag(-Wconversion)
  if(BENCHMARK_ENABLE_WERROR)
      add_cxx_compiler_flag(-Werror)
  endif()
@ -286,17 +306,11 @@ if (BENCHMARK_USE_LIBCXX)
  endif()
 endif(BENCHMARK_USE_LIBCXX)

-set(EXTRA_CXX_FLAGS "")
-if (WIN32 AND "${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang")
-  # Clang on Windows fails to compile the regex feature check under C++11
-  set(EXTRA_CXX_FLAGS "-DCMAKE_CXX_STANDARD=14")
-endif()
-
 # C++ feature checks
 # Determine the correct regular expression engine to use
-cxx_feature_check(STD_REGEX ${EXTRA_CXX_FLAGS})
-cxx_feature_check(GNU_POSIX_REGEX ${EXTRA_CXX_FLAGS})
-cxx_feature_check(POSIX_REGEX ${EXTRA_CXX_FLAGS})
+cxx_feature_check(STD_REGEX)
+cxx_feature_check(GNU_POSIX_REGEX)
+cxx_feature_check(POSIX_REGEX)
 if(NOT HAVE_STD_REGEX AND NOT HAVE_GNU_POSIX_REGEX AND NOT HAVE_POSIX_REGEX)
  message(FATAL_ERROR "Failed to determine the source files for the regular expression backend")
 endif()
@ -312,7 +326,7 @@ find_package(Threads REQUIRED)
 cxx_feature_check(PTHREAD_AFFINITY)

 if (BENCHMARK_ENABLE_LIBPFM)
-  find_package(PFM)
+  find_package(PFM REQUIRED)
 endif()

 # Set up directories
--- a/3
+++ b/3
@ -42,6 +42,7 @@ Dominic Hamon <dma@stripysock.com> <dominic@google.com>
 Dominik Czarnota <dominik.b.czarnota@gmail.com>
 Dominik Korman <kormandominik@gmail.com>
 Donald Aingworth <donalds_junk_mail@yahoo.com>
+Doug Evans <xdje42@gmail.com>
 Eric Backus <eric_backus@alum.mit.edu>
 Eric Fiselier <eric@efcs.ca>
 Eugene Zhuk <eugene.zhuk@gmail.com>
@ -51,10 +52,12 @@ Fanbo Meng <fanbo.meng@ibm.com>
 Federico Ficarelli <federico.ficarelli@gmail.com>
 Felix Homann <linuxaudio@showlabor.de>
 Geoffrey Martin-Noble <gcmn@google.com> <gmngeoffrey@gmail.com>
+Gergely Meszaros <maetveis@gmail.com>
 Gergő Szitár <szitar.gergo@gmail.com>
 Hannes Hauswedell <h2@fsfe.org>
 Henrique Bucher <hbucher@gmail.com>
 Ismael Jimenez Martinez <ismael.jimenez.martinez@gmail.com>
+Iakov Sergeev <yahontu@gmail.com>
 Jern-Kuan Leong <jernkuan@gmail.com>
 JianXiong Zhou <zhoujianxiong2@gmail.com>
 Joao Paulo Magalhaes <joaoppmagalhaes@gmail.com>
--- a/MODULE.bazel
+++ b/MODULE.bazel
@ -1,12 +1,16 @@
-module(name = "google_benchmark", version="1.8.3")
+module(
+    name = "google_benchmark",
+    version = "1.9.2",
+)

-bazel_dep(name = "bazel_skylib", version = "1.4.1")
-bazel_dep(name = "platforms", version = "0.0.6")
-bazel_dep(name = "rules_foreign_cc", version = "0.9.0")
-bazel_dep(name = "rules_cc", version = "0.0.6")
-bazel_dep(name = "rules_python", version = "0.24.0", dev_dependency = True)
-bazel_dep(name = "googletest", version = "1.12.1", repo_name = "com_google_googletest", dev_dependency = True)
-bazel_dep(name = "libpfm", version = "4.11.0")
+bazel_dep(name = "bazel_skylib", version = "1.7.1")
+bazel_dep(name = "platforms", version = "0.0.10")
+bazel_dep(name = "rules_cc", version = "0.0.9")
+
+bazel_dep(name = "rules_python", version = "1.0.0", dev_dependency = True)
+bazel_dep(name = "googletest", version = "1.14.0", dev_dependency = True, repo_name = "com_google_googletest")
+
+bazel_dep(name = "libpfm", version = "4.11.0.bcr.1")

 # Register a toolchain for Python 3.9 to be able to build numpy. Python
 # versions >=3.10 are problematic.
@ -14,11 +18,24 @@ bazel_dep(name = "libpfm", version = "4.11.0")
 # of relying on the changing default version from rules_python.

 python = use_extension("@rules_python//python/extensions:python.bzl", "python", dev_dependency = True)
+python.toolchain(python_version = "3.8")
 python.toolchain(python_version = "3.9")
+python.toolchain(python_version = "3.10")
+python.toolchain(python_version = "3.11")
+python.toolchain(
+    is_default = True,
+    python_version = "3.12",
+)
+python.toolchain(python_version = "3.13")

 pip = use_extension("@rules_python//python/extensions:pip.bzl", "pip", dev_dependency = True)
 pip.parse(
-    hub_name="tools_pip_deps",
+    hub_name = "tools_pip_deps",
    python_version = "3.9",
-    requirements_lock="//tools:requirements.txt")
+    requirements_lock = "//tools:requirements.txt",
+)
 use_repo(pip, "tools_pip_deps")
+
+# -- bazel_dep definitions -- #
+
+bazel_dep(name = "nanobind_bazel", version = "2.5.0", dev_dependency = True)
--- a/README.md
+++ b/README.md
@ -50,15 +50,13 @@ IRC channels:

 ## Requirements

-The library can be used with C++03. However, it requires C++11 to build,
+The library can be used with C++11. However, it requires C++17 to build,
 including compiler and standard library support.

-The following minimum versions are required to build the library:
+_See [dependencies.md](docs/dependencies.md) for more details regarding supported
+compilers and standards._

-* GCC 4.8
-* Clang 3.4
-* Visual Studio 14 2015
-* Intel 2015 Update 1
+If you need a particular compiler to be supported, patches are very welcome.

 See [Platform-Specific Build Instructions](docs/platform_specific_build_instructions.md).

@ -80,7 +78,7 @@ $ cmake -E make_directory "build"
 # Generate build system files with cmake, and download any dependencies.
 $ cmake -E chdir "build" cmake -DBENCHMARK_DOWNLOAD_DEPENDENCIES=on -DCMAKE_BUILD_TYPE=Release ../
 # or, starting with CMake 3.13, use a simpler form:
-# cmake -DCMAKE_BUILD_TYPE=Release -S . -B "build"
+# cmake -DBENCHMARK_DOWNLOAD_DEPENDENCIES=on -DCMAKE_BUILD_TYPE=Release -S . -B "build"
 # Build the library.
 $ cmake --build "build" --config Release
 ```
--- a/20
+++ b/20
@ -4,19 +4,17 @@ load("//:bazel/benchmark_deps.bzl", "benchmark_deps")

 benchmark_deps()

-load("@rules_foreign_cc//foreign_cc:repositories.bzl", "rules_foreign_cc_dependencies")
+load("@rules_python//python:repositories.bzl", "py_repositories")

-rules_foreign_cc_dependencies()
+py_repositories()

-load("@rules_python//python:pip.bzl", pip3_install="pip_install")
+load("@rules_python//python:pip.bzl", "pip_parse")

-pip3_install(
-   name = "tools_pip_deps",
-   requirements = "//tools:requirements.txt",
+pip_parse(
+    name = "tools_pip_deps",
+    requirements_lock = "//tools:requirements.txt",
 )

-new_local_repository(
-    name = "python_headers",
-    build_file = "@//bindings/python:python_headers.BUILD",
-    path = "<PYTHON_INCLUDE_PATH>",  # May be overwritten by setup.py.
-)
+load("@tools_pip_deps//:requirements.bzl", "install_deps")
+
+install_deps()
--- a/bazel/benchmark_deps.bzl
+++ b/bazel/benchmark_deps.bzl
@ -1,5 +1,9 @@
-load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive")
+"""
+This file contains the Bazel build dependencies for Google Benchmark (both C++ source and Python bindings).
+"""
+
 load("@bazel_tools//tools/build_defs/repo:git.bzl", "new_git_repository")
+load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive")

 def benchmark_deps():
    """Loads dependencies required to build Google Benchmark."""
@ -7,48 +11,33 @@ def benchmark_deps():
    if "bazel_skylib" not in native.existing_rules():
        http_archive(
            name = "bazel_skylib",
-            sha256 = "f7be3474d42aae265405a592bb7da8e171919d74c16f082a5457840f06054728",
+            sha256 = "cd55a062e763b9349921f0f5db8c3933288dc8ba4f76dd9416aac68acee3cb94",
            urls = [
-                "https://mirror.bazel.build/github.com/bazelbuild/bazel-skylib/releases/download/1.2.1/bazel-skylib-1.2.1.tar.gz",
-                "https://github.com/bazelbuild/bazel-skylib/releases/download/1.2.1/bazel-skylib-1.2.1.tar.gz",
+                "https://mirror.bazel.build/github.com/bazelbuild/bazel-skylib/releases/download/1.5.0/bazel-skylib-1.5.0.tar.gz",
+                "https://github.com/bazelbuild/bazel-skylib/releases/download/1.5.0/bazel-skylib-1.5.0.tar.gz",
            ],
        )

-    if "rules_foreign_cc" not in native.existing_rules():
-        http_archive(
-            name = "rules_foreign_cc",
-            sha256 = "bcd0c5f46a49b85b384906daae41d277b3dc0ff27c7c752cc51e43048a58ec83",
-            strip_prefix = "rules_foreign_cc-0.7.1",
-            url = "https://github.com/bazelbuild/rules_foreign_cc/archive/0.7.1.tar.gz",
-        )
-
    if "rules_python" not in native.existing_rules():
        http_archive(
            name = "rules_python",
-            url = "https://github.com/bazelbuild/rules_python/releases/download/0.1.0/rules_python-0.1.0.tar.gz",
-            sha256 = "b6d46438523a3ec0f3cead544190ee13223a52f6a6765a29eae7b7cc24cc83a0",
-        )
-
-    if "com_google_absl" not in native.existing_rules():
-        http_archive(
-            name = "com_google_absl",
-            sha256 = "f41868f7a938605c92936230081175d1eae87f6ea2c248f41077c8f88316f111",
-            strip_prefix = "abseil-cpp-20200225.2",
-            urls = ["https://github.com/abseil/abseil-cpp/archive/20200225.2.tar.gz"],
+            sha256 = "e85ae30de33625a63eca7fc40a94fea845e641888e52f32b6beea91e8b1b2793",
+            strip_prefix = "rules_python-0.27.1",
+            url = "https://github.com/bazelbuild/rules_python/releases/download/0.27.1/rules_python-0.27.1.tar.gz",
        )

    if "com_google_googletest" not in native.existing_rules():
        new_git_repository(
            name = "com_google_googletest",
            remote = "https://github.com/google/googletest.git",
-            tag = "release-1.11.0",
+            tag = "release-1.12.1",
        )

    if "nanobind" not in native.existing_rules():
        new_git_repository(
            name = "nanobind",
            remote = "https://github.com/wjakob/nanobind.git",
-            tag = "v1.4.0",
+            tag = "v1.9.2",
            build_file = "@//bindings/python:nanobind.BUILD",
            recursive_init_submodules = True,
        )
--- a/bindings/python/BUILD
+++ b/bindings/python/BUILD
@ -1,3 +0,0 @@
-exports_files(glob(["*.BUILD"]))
-exports_files(["build_defs.bzl"])
-
--- a/bindings/python/build_defs.bzl
+++ b/bindings/python/build_defs.bzl
@ -1,25 +0,0 @@
-_SHARED_LIB_SUFFIX = {
-    "//conditions:default": ".so",
-    "//:windows": ".dll",
-}
-
-def py_extension(name, srcs, hdrs = [], copts = [], features = [], deps = []):
-    for shared_lib_suffix in _SHARED_LIB_SUFFIX.values():
-        shared_lib_name = name + shared_lib_suffix
-        native.cc_binary(
-            name = shared_lib_name,
-            linkshared = True,
-            linkstatic = True,
-            srcs = srcs + hdrs,
-            copts = copts,
-            features = features,
-            deps = deps,
-        )
-
-    return native.py_library(
-        name = name,
-        data = select({
-            platform: [name + shared_lib_suffix]
-            for platform, shared_lib_suffix in _SHARED_LIB_SUFFIX.items()
-        }),
-    )
--- a/bindings/python/google_benchmark/BUILD
+++ b/bindings/python/google_benchmark/BUILD
@ -1,4 +1,5 @@
-load("//bindings/python:build_defs.bzl", "py_extension")
+load("@nanobind_bazel//:build_defs.bzl", "nanobind_extension", "nanobind_stubgen")
+load("@rules_python//python:defs.bzl", "py_library", "py_test")

 py_library(
    name = "google_benchmark",
@ -9,22 +10,16 @@ py_library(
    ],
 )

-py_extension(
+nanobind_extension(
    name = "_benchmark",
    srcs = ["benchmark.cc"],
-    copts = [
-        "-fexceptions",
-        "-fno-strict-aliasing",
-    ],
-    features = [
-        "-use_header_modules",
-        "-parse_headers",
-    ],
-    deps = [
-        "//:benchmark",
-        "@nanobind",
-        "@python_headers",
-    ],
+    deps = ["//:benchmark"],
+)
+
+nanobind_stubgen(
+    name = "benchmark_stubgen",
+    marker_file = "bindings/python/google_benchmark/py.typed",
+    module = ":_benchmark",
 )

 py_test(
@ -37,4 +32,3 @@ py_test(
        ":google_benchmark",
    ],
 )
-
--- a/bindings/python/google_benchmark/init.py
+++ b/bindings/python/google_benchmark/init.py
@ -26,50 +26,31 @@ Example usage:
  if __name__ == '__main__':
    benchmark.main()
 """
+
 import atexit

 from absl import app
+
 from google_benchmark import _benchmark
 from google_benchmark._benchmark import (
-    Counter,
-    kNanosecond,
-    kMicrosecond,
-    kMillisecond,
-    kSecond,
-    oNone,
-    o1,
-    oN,
-    oNSquared,
-    oNCubed,
-    oLogN,
-    oNLogN,
-    oAuto,
-    oLambda,
-    State,
+    Counter as Counter,
+    State as State,
+    kMicrosecond as kMicrosecond,
+    kMillisecond as kMillisecond,
+    kNanosecond as kNanosecond,
+    kSecond as kSecond,
+    o1 as o1,
+    oAuto as oAuto,
+    oLambda as oLambda,
+    oLogN as oLogN,
+    oN as oN,
+    oNCubed as oNCubed,
+    oNLogN as oNLogN,
+    oNone as oNone,
+    oNSquared as oNSquared,
 )

-
-__all__ = [
-    "register",
-    "main",
-    "Counter",
-    "kNanosecond",
-    "kMicrosecond",
-    "kMillisecond",
-    "kSecond",
-    "oNone",
-    "o1",
-    "oN",
-    "oNSquared",
-    "oNCubed",
-    "oLogN",
-    "oNLogN",
-    "oAuto",
-    "oLambda",
-    "State",
-]
-
-__version__ = "1.8.3"
+__version__ = "1.9.2"


 class __OptionMaker:
@ -79,7 +60,8 @@ class __OptionMaker:
    """

    class Options:
-        """Pure data class to store options calls, along with the benchmarked function."""
+        """Pure data class to store options calls, along with the benchmarked
+        function."""

        def __init__(self, func):
            self.func = func
@ -97,14 +79,13 @@ class __OptionMaker:

-        # The function that get returned on @option.range(start=0, limit=1<<5).
+        # The function that gets returned on @option.range(start=0, limit=1<<5).
        def __builder_method(*args, **kwargs):
-
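-            # The decorator that get called, either with the benchmared function
-            # or the previous Options
+            # The decorator that gets called, either with the benchmarked
+            # function or the previous Options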
            def __decorator(func_or_options):
                options = self.make(func_or_options)
                options.builder_calls.append((builder_name, args, kwargs))
-                # The decorator returns Options so it is not technically a decorator
-                # and needs a final call to @register
+                # The decorator returns Options so it is not technically a
+                # decorator and needs a final call to @register
                return options

            return __decorator
@ -113,8 +94,8 @@ class __OptionMaker:


 # Alias for nicer API.
-# We have to instantiate an object, even if stateless, to be able to use __getattr__
-# on option.range
+# We have to instantiate an object, even if stateless, to be able to use
+# __getattr__ on option.range
 option = __OptionMaker()


@ -124,8 +105,8 @@ def register(undefined=None, *, name=None):
        # Decorator is called without parenthesis so we return a decorator
        return lambda f: register(f, name=name)

-    # We have either the function to benchmark (simple case) or an instance of Options
-    # (@option._ case).
+    # We have either the function to benchmark (simple case) or an instance of
+    # Options (@option._ case).
    options = __OptionMaker.make(undefined)

    if name is None:
--- a/bindings/python/google_benchmark/benchmark.cc
+++ b/bindings/python/google_benchmark/benchmark.cc
@ -118,7 +118,7 @@ NB_MODULE(_benchmark, m) {
  using benchmark::Counter;
  nb::class_<Counter> py_counter(m, "Counter");

-  nb::enum_<Counter::Flags>(py_counter, "Flags")
+  nb::enum_<Counter::Flags>(py_counter, "Flags", nb::is_arithmetic(), nb::is_flag())
      .value("kDefaults", Counter::Flags::kDefaults)
      .value("kIsRate", Counter::Flags::kIsRate)
      .value("kAvgThreads", Counter::Flags::kAvgThreads)
@ -129,8 +129,7 @@ NB_MODULE(_benchmark, m) {
      .value("kAvgIterations", Counter::Flags::kAvgIterations)
      .value("kAvgIterationsRate", Counter::Flags::kAvgIterationsRate)
      .value("kInvert", Counter::Flags::kInvert)
-      .export_values()
-      .def(nb::self | nb::self);
+      .export_values();

  nb::enum_<Counter::OneK>(py_counter, "OneK")
      .value("kIs1000", Counter::OneK::kIs1000)
@ -141,7 +140,8 @@ NB_MODULE(_benchmark, m) {
      .def(nb::init<double, Counter::Flags, Counter::OneK>(),
           nb::arg("value") = 0., nb::arg("flags") = Counter::kDefaults,
           nb::arg("k") = Counter::kIs1000)
-      .def("__init__", ([](Counter *c, double value) { new (c) Counter(value); }))
+      .def("__init__",
+           ([](Counter* c, double value) { new (c) Counter(value); }))
      .def_rw("value", &Counter::value)
      .def_rw("flags", &Counter::flags)
      .def_rw("oneK", &Counter::oneK)
--- a/bindings/python/google_benchmark/example.py
+++ b/bindings/python/google_benchmark/example.py
@ -13,7 +13,8 @@
 # limitations under the License.
 """Example of Python using C++ benchmark framework.

-To run this example, you must first install the `google_benchmark` Python package.
+To run this example, you must first install the `google_benchmark` Python
+package.

 To install using `setup.py`, download and extract the `google_benchmark` source.
 In the extracted directory, execute:
@ -38,6 +39,7 @@ def sum_million(state):
    while state:
        sum(range(1_000_000))

+
@benchmark.register
 def pause_timing(state):
    """Pause timing every iteration."""
@ -56,10 +58,11 @@ def skipped(state):
        state.skip_with_error("some error")
        return  # NOTE: You must explicitly return, or benchmark will continue.

-    ...  # Benchmark code would be here.
+    # Benchmark code would be here.


@benchmark.register
+@benchmark.option.use_manual_time()
 def manual_timing(state):
    while state:
        # Manually count Python CPU time
@ -76,7 +79,6 @@ def custom_counters(state):
    num_foo = 0.0
    while state:
        # Benchmark some code here
-        pass
        # Collect some custom metric named foo
        num_foo += 0.13

@ -85,7 +87,9 @@ def custom_counters(state):
    # Set a counter as a rate.
    state.counters["foo_rate"] = Counter(num_foo, Counter.kIsRate)
    #  Set a counter as an inverse of rate.
-    state.counters["foo_inv_rate"] = Counter(num_foo, Counter.kIsRate | Counter.kInvert)
+    state.counters["foo_inv_rate"] = Counter(
+        num_foo, Counter.kIsRate | Counter.kInvert
+    )
    # Set a counter as a thread-average quantity.
    state.counters["foo_avg"] = Counter(num_foo, Counter.kAvgThreads)
    # There's also a combined flag:
--- a/bindings/python/nanobind.BUILD
+++ b/bindings/python/nanobind.BUILD
@ -1,17 +0,0 @@
-cc_library(
-    name = "nanobind",
-    srcs = glob([
-        "src/*.cpp"
-    ]),
-    copts = ["-fexceptions"],
-    includes = ["include", "ext/robin_map/include"],
-    textual_hdrs = glob(
-        [
-            "include/**/*.h",
-            "src/*.h",
-            "ext/robin_map/include/tsl/*.h",
-        ],
-    ),
-    deps = ["@python_headers"],
-    visibility = ["//visibility:public"],
-)
--- a/bindings/python/python_headers.BUILD
+++ b/bindings/python/python_headers.BUILD
@ -1,6 +0,0 @@
-cc_library(
-    name = "python_headers",
-    hdrs = glob(["**/*.h"]),
-    includes = ["."],
-    visibility = ["//visibility:public"],
-)
--- a/cmake/CXXFeatureCheck.cmake
+++ b/cmake/CXXFeatureCheck.cmake
@ -40,7 +40,7 @@ function(cxx_feature_check FILE)
      message(STATUS "Cross-compiling to test ${FEATURE}")
      try_compile(COMPILE_${FEATURE}
              ${CMAKE_BINARY_DIR} ${CMAKE_CURRENT_SOURCE_DIR}/cmake/${FILE}.cpp
-              CXX_STANDARD 11
+              CXX_STANDARD 17
              CXX_STANDARD_REQUIRED ON
              CMAKE_FLAGS ${FEATURE_CHECK_CMAKE_FLAGS}
              LINK_LIBRARIES ${BENCHMARK_CXX_LIBRARIES}
@ -56,7 +56,7 @@ function(cxx_feature_check FILE)
      message(STATUS "Compiling and running to test ${FEATURE}")
      try_run(RUN_${FEATURE} COMPILE_${FEATURE}
              ${CMAKE_BINARY_DIR} ${CMAKE_CURRENT_SOURCE_DIR}/cmake/${FILE}.cpp
-              CXX_STANDARD 11
+              CXX_STANDARD 17
              CXX_STANDARD_REQUIRED ON
              CMAKE_FLAGS ${FEATURE_CHECK_CMAKE_FLAGS}
              LINK_LIBRARIES ${BENCHMARK_CXX_LIBRARIES}
--- a/cmake/Config.cmake.in
+++ b/cmake/Config.cmake.in
@ -4,4 +4,9 @@ include (CMakeFindDependencyMacro)

 find_dependency (Threads)

+if (@BENCHMARK_ENABLE_LIBPFM@)
+    list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_LIST_DIR}")
+    find_dependency (PFM)
+endif()
+
 include("${CMAKE_CURRENT_LIST_DIR}/@targets_export_name@.cmake")
--- a/cmake/GetGitVersion.cmake
+++ b/cmake/GetGitVersion.cmake
@ -20,38 +20,16 @@ set(__get_git_version INCLUDED)

 function(get_git_version var)
  if(GIT_EXECUTABLE)
-      execute_process(COMMAND ${GIT_EXECUTABLE} describe --tags --match "v[0-9]*.[0-9]*.[0-9]*" --abbrev=8
+      execute_process(COMMAND ${GIT_EXECUTABLE} describe --tags --match "v[0-9]*.[0-9]*.[0-9]*" --abbrev=8 --dirty
          WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}
          RESULT_VARIABLE status
-          OUTPUT_VARIABLE GIT_DESCRIBE_VERSION
+          OUTPUT_VARIABLE GIT_VERSION
          ERROR_QUIET)
      if(status)
-          set(GIT_DESCRIBE_VERSION "v0.0.0")
+          set(GIT_VERSION "v0.0.0")
      endif()
-      
-      string(STRIP ${GIT_DESCRIBE_VERSION} GIT_DESCRIBE_VERSION)
-      if(GIT_DESCRIBE_VERSION MATCHES v[^-]*-) 
-         string(REGEX REPLACE "v([^-]*)-([0-9]+)-.*" "\\1.\\2"  GIT_VERSION ${GIT_DESCRIBE_VERSION})
-      else()
-         string(REGEX REPLACE "v(.*)" "\\1" GIT_VERSION ${GIT_DESCRIBE_VERSION})
-      endif()
-
-      # Work out if the repository is dirty
-      execute_process(COMMAND ${GIT_EXECUTABLE} update-index -q --refresh
-          WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}
-          OUTPUT_QUIET
-          ERROR_QUIET)
-      execute_process(COMMAND ${GIT_EXECUTABLE} diff-index --name-only HEAD --
-          WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}
-          OUTPUT_VARIABLE GIT_DIFF_INDEX
-          ERROR_QUIET)
-      string(COMPARE NOTEQUAL "${GIT_DIFF_INDEX}" "" GIT_DIRTY)
-      if (${GIT_DIRTY})
-          set(GIT_DESCRIBE_VERSION "${GIT_DESCRIBE_VERSION}-dirty")
-      endif()
-      message(STATUS "git version: ${GIT_DESCRIBE_VERSION} normalized to ${GIT_VERSION}")
  else()
-      set(GIT_VERSION "0.0.0")
+      set(GIT_VERSION "v0.0.0")
  endif()

  set(${var} ${GIT_VERSION} PARENT_SCOPE)
--- a/cmake/GoogleTest.cmake.in
+++ b/cmake/GoogleTest.cmake.in
@ -1,4 +1,4 @@
-cmake_minimum_required(VERSION 2.8.12)
+cmake_minimum_required (VERSION 3.13...3.22)

 project(googletest-download NONE)

@ -34,11 +34,12 @@ else()
    message(SEND_ERROR "Did not find Google Test sources! Either pass correct path in GOOGLETEST_PATH, or enable BENCHMARK_DOWNLOAD_DEPENDENCIES, or disable BENCHMARK_USE_BUNDLED_GTEST, or disable BENCHMARK_ENABLE_GTEST_TESTS / BENCHMARK_ENABLE_TESTING.")
    return()
  else()
-    message(WARNING "Did not find Google Test sources! Fetching from web...")
+    message(STATUS "Did not find Google Test sources! Fetching from web...")
    ExternalProject_Add(
      googletest
      GIT_REPOSITORY    https://github.com/google/googletest.git
-      GIT_TAG           "release-1.11.0"
+      GIT_TAG           "v1.15.2"
+      GIT_SHALLOW       "ON"
      PREFIX            "${CMAKE_BINARY_DIR}"
      STAMP_DIR         "${CMAKE_BINARY_DIR}/stamp"
      DOWNLOAD_DIR      "${CMAKE_BINARY_DIR}/download"
--- a/cmake/benchmark.pc.in
+++ b/cmake/benchmark.pc.in
@ -5,8 +5,8 @@ includedir=@CMAKE_INSTALL_FULL_INCLUDEDIR@

 Name: @PROJECT_NAME@
 Description: Google microbenchmark framework
-Version: @VERSION@
+Version: @NORMALIZED_VERSION@

 Libs: -L${libdir} -lbenchmark
-Libs.private: -lpthread
+Libs.private: -lpthread @BENCHMARK_PRIVATE_LINK_LIBRARIES@
 Cflags: -I${includedir}
--- a/cmake/benchmark_main.pc.in
+++ b/cmake/benchmark_main.pc.in
@ -0,0 +1,7 @@
+libdir=@CMAKE_INSTALL_FULL_LIBDIR@
+
+Name: @PROJECT_NAME@
+Description: Google microbenchmark framework (with main() function)
+Version: @NORMALIZED_VERSION@
+Requires: benchmark
+Libs: -L${libdir} -lbenchmark_main
--- a/docs/dependencies.md
+++ b/docs/dependencies.md
@ -11,3 +11,9 @@ distributions include newer versions, for example:
 * Ubuntu 20.04 provides CMake 3.16.3
 * Debian 11.4 provides CMake 3.18.4
 * Ubuntu 22.04 provides CMake 3.22.1
+
+## Python
+
+The Python bindings require Python 3.10+ as of v1.9.0 (2024-08-16) for installation from PyPI.
+Building from source for older versions probably still works, though. See the [user guide](python_bindings.md) for details on how to build from source.
+The minimum theoretically supported version is Python 3.8, since the bindings generator used (nanobind) only supports Python 3.8+.
--- a/docs/python_bindings.md
+++ b/docs/python_bindings.md
@ -3,7 +3,7 @@
 Python bindings are available as wheels on [PyPI](https://pypi.org/project/google-benchmark/) for importing and 
 using Google Benchmark directly in Python. 
 Currently, pre-built wheels exist for macOS (both ARM64 and Intel x86), Linux x86-64 and 64-bit Windows.
-Supported Python versions are Python 3.7 - 3.10.
+Supported Python versions are Python 3.8 - 3.12.

 To install Google Benchmark's Python bindings, run:

@ -25,9 +25,9 @@ python3 -m venv venv --system-site-packages
 source venv/bin/activate  # .\venv\Scripts\Activate.ps1 on Windows

 # upgrade Python's system-wide packages
-python -m pip install --upgrade pip setuptools wheel
-# builds the wheel and stores it in the directory "wheelhouse".
-python -m pip wheel . -w wheelhouse
+python -m pip install --upgrade pip build
+# builds the wheel and stores it in the directory "dist".
+python -m build
 ```

 NB: Building wheels from source requires Bazel. For platform-specific instructions on how to install Bazel,
--- a/docs/reducing_variance.md
+++ b/docs/reducing_variance.md
@ -14,8 +14,6 @@ you might want to disable the CPU frequency scaling while running the
 benchmark, as well as consider other ways to stabilize the performance of
 your system while benchmarking.

-See [Reducing Variance](reducing_variance.md) for more information.
-
 Exactly how to do this depends on the Linux distribution,
 desktop environment, and installed programs.  Specific details are a moving
 target, so we will not attempt to exhaustively document them here.
@ -67,7 +65,7 @@ program.
 Reducing sources of variance is OS and architecture dependent, which is one
 reason some companies maintain machines dedicated to performance testing.

-Some of the easier and and effective ways of reducing variance on a typical
+Some of the easier and effective ways of reducing variance on a typical
 Linux workstation are:

 1. Use the performance governor as [discussed
@ -89,7 +87,7 @@ above](user_guide#disabling-cpu-frequency-scaling).
 4. Close other programs that do non-trivial things based on timers, such as
   your web browser, desktop environment, etc.
 5. Reduce the working set of your benchmark to fit within the L1 cache, but
-   do be aware that this may lead you to optimize for an unrelistic
+   do be aware that this may lead you to optimize for an unrealistic
   situation.

 Further resources on this topic:
--- a/docs/releasing.md
+++ b/docs/releasing.md
@ -8,27 +8,24 @@
    * `git log $(git describe --abbrev=0 --tags)..HEAD` gives you the list of
      commits between the last annotated tag and HEAD
    * Pick the most interesting.
-* Create one last commit that updates the version saved in `CMakeLists.txt`, `MODULE.bazel`
-  and the `__version__` variable in `bindings/python/google_benchmark/__init__.py`to the
-  release version you're creating. (This version will be used if benchmark is installed
-  from the archive you'll be creating in the next step.)
+* Create one last commit that updates the version saved in `CMakeLists.txt`, `MODULE.bazel`,
+  and `bindings/python/google_benchmark/__init__.py` to the release version you're creating.
+  (This version will be used if benchmark is installed from the archive you'll be creating
+  in the next step.)

 ```
-project (benchmark VERSION 1.8.0 LANGUAGES CXX)
+# CMakeLists.txt
+project (benchmark VERSION 1.9.0 LANGUAGES CXX)
 ```

 ```
-module(name = "com_github_google_benchmark", version="1.8.0")
+# MODULE.bazel
+module(name = "com_github_google_benchmark", version="1.9.0")
 ```

-```python
-# bindings/python/google_benchmark/__init__.py
-
-# ...
-
-__version__ = "1.8.0"  # <-- change this to the release version you are creating
-
-# ...
+```
+# google_benchmark/__init__.py
+__version__ = "1.9.0"
 ```

 * Create a release through github's interface
@ -38,4 +35,4 @@ __version__ = "1.8.0"  # <-- change this to the release version you are creating
      * `git tag -a -f <tag> <tag>`
      * `git push --force --tags origin`
 * Confirm that the "Build and upload Python wheels" action runs to completion
-    * run it manually if it hasn't run
+    * Run it manually if it hasn't run.
--- a/docs/user_guide.md
+++ b/docs/user_guide.md
@ -28,6 +28,8 @@

 [Templated Benchmarks](#templated-benchmarks)

+[Templated Benchmarks that take arguments](#templated-benchmarks-with-arguments)
+
 [Fixtures](#fixtures)

 [Custom Counters](#custom-counters)
@ -80,9 +82,9 @@ tabular data on stdout. Example tabular output looks like:
 ```
 Benchmark                               Time(ns)    CPU(ns) Iterations
 ----------------------------------------------------------------------
-BM_SetInsert/1024/1                        28928      29349      23853  133.097kB/s   33.2742k items/s
-BM_SetInsert/1024/8                        32065      32913      21375  949.487kB/s   237.372k items/s
-BM_SetInsert/1024/10                       33157      33648      21431  1.13369MB/s   290.225k items/s
+BM_SetInsert/1024/1                        28928      29349      23853  133.097kiB/s   33.2742k items/s
+BM_SetInsert/1024/8                        32065      32913      21375  949.487kiB/s   237.372k items/s
+BM_SetInsert/1024/10                       33157      33648      21431  1.13369MiB/s   290.225k items/s
 ```

 The JSON format outputs human readable json split into two top level attributes.
@ -165,6 +167,13 @@ line interface or by setting environment variables before execution. For every
 prevails). A complete list of CLI options is available running benchmarks
 with the `--help` switch.

+### Dry runs
+
+To confirm that benchmarks can run successfully without needing to wait for
+multiple repetitions and iterations, the `--benchmark_dry_run` flag can be
+used.  This will run the benchmarks as normal, but for 1 iteration and 1
+repetition only.
+
 <a name="running-a-subset-of-benchmarks" />

 ## Running a Subset of Benchmarks
@ -453,7 +462,7 @@ BENCHMARK(BM_SetInsert)->Apply(CustomArguments);

 ### Passing Arbitrary Arguments to a Benchmark

-In C++11 it is possible to define a benchmark that takes an arbitrary number
+It is possible to define a benchmark that takes an arbitrary number
 of extra arguments. The `BENCHMARK_CAPTURE(func, test_case_name, ...args)`
 macro creates a benchmark that invokes `func`  with the `benchmark::State` as
 the first argument followed by the specified `args...`.
@ -554,26 +563,47 @@ template <class Q> void BM_Sequential(benchmark::State& state) {
  state.SetBytesProcessed(
      static_cast<int64_t>(state.iterations())*state.range(0));
 }
-// C++03
-BENCHMARK_TEMPLATE(BM_Sequential, WaitQueue<int>)->Range(1<<0, 1<<10);

-// C++11 or newer, you can use the BENCHMARK macro with template parameters:
+// You can use the BENCHMARK macro with template parameters:
 BENCHMARK(BM_Sequential<WaitQueue<int>>)->Range(1<<0, 1<<10);

+// Old, legacy verbose C++03 syntax:
+BENCHMARK_TEMPLATE(BM_Sequential, WaitQueue<int>)->Range(1<<0, 1<<10);
+
 ```

 Three macros are provided for adding benchmark templates.

 ```c++
-#ifdef BENCHMARK_HAS_CXX11
 #define BENCHMARK(func<...>) // Takes any number of parameters.
-#else // C++ < C++11
-#define BENCHMARK_TEMPLATE(func, arg1)
-#endif
 #define BENCHMARK_TEMPLATE1(func, arg1)
 #define BENCHMARK_TEMPLATE2(func, arg1, arg2)
 ```

+<a name="templated-benchmarks-with-arguments" />
+
+## Templated Benchmarks that take arguments
+
+Sometimes there is a need to template benchmarks, and provide arguments to them.
+
+```c++
+template <class Q> void BM_Sequential_With_Step(benchmark::State& state, int step) {
+  Q q;
+  typename Q::value_type v;
+  for (auto _ : state) {
+    for (int i = state.range(0); i-=step; )
+      q.push(v);
+    for (int e = state.range(0); e-=step; )
+      q.Wait(&v);
+  }
+  // actually messages, not bytes:
+  state.SetBytesProcessed(
+      static_cast<int64_t>(state.iterations())*state.range(0));
+}
+
+BENCHMARK_TEMPLATE1_CAPTURE(BM_Sequential_With_Step, WaitQueue<int>, Step1, 1)->Range(1<<0, 1<<10);
+```
+
 <a name="fixtures" />

 ## Fixtures
@ -591,27 +621,29 @@ For Example:
 ```c++
 class MyFixture : public benchmark::Fixture {
 public:
-  void SetUp(const ::benchmark::State& state) {
+  void SetUp(::benchmark::State& state) {
  }

-  void TearDown(const ::benchmark::State& state) {
+  void TearDown(::benchmark::State& state) {
  }
 };

+// Defines and registers `FooTest` using the class `MyFixture`.
 BENCHMARK_F(MyFixture, FooTest)(benchmark::State& st) {
   for (auto _ : st) {
     ...
  }
 }

+// Only defines `BarTest` using the class `MyFixture`.
 BENCHMARK_DEFINE_F(MyFixture, BarTest)(benchmark::State& st) {
   for (auto _ : st) {
     ...
  }
 }
-/* BarTest is NOT registered */
+// `BarTest` is NOT registered.
 BENCHMARK_REGISTER_F(MyFixture, BarTest)->Threads(2);
-/* BarTest is now registered */
+// `BarTest` is now registered.
 ```

 ### Templated Fixtures
@ -627,19 +659,70 @@ For example:
 template<typename T>
 class MyFixture : public benchmark::Fixture {};

+// Defines and registers `IntTest` using the class template `MyFixture<int>`.
 BENCHMARK_TEMPLATE_F(MyFixture, IntTest, int)(benchmark::State& st) {
   for (auto _ : st) {
     ...
  }
 }

+// Only defines `DoubleTest` using the class template `MyFixture<double>`.
 BENCHMARK_TEMPLATE_DEFINE_F(MyFixture, DoubleTest, double)(benchmark::State& st) {
   for (auto _ : st) {
     ...
  }
 }
-
+// `DoubleTest` is NOT registered.
 BENCHMARK_REGISTER_F(MyFixture, DoubleTest)->Threads(2);
+// `DoubleTest` is now registered.
+```
+
+If you want to use a method template for your fixtures,
+which you instantiate afterward, use the following macros:
+
+* `BENCHMARK_TEMPLATE_METHOD_F(ClassName, Method)`
+* `BENCHMARK_TEMPLATE_INSTANTIATE_F(ClassName, Method, ...)`
+
+With these macros you can define one method for several instantiations.
+Example (using `MyFixture` from above):
+
+```c++
+// Defines `Test` using the class template `MyFixture`.
+BENCHMARK_TEMPLATE_METHOD_F(MyFixture, Test)(benchmark::State& st) {
+   for (auto _ : st) {
+     ...
+  }
+}
+
+// Instantiates and registers the benchmark `MyFixture<int>::Test`.
+BENCHMARK_TEMPLATE_INSTANTIATE_F(MyFixture, Test, int)->Threads(2);
+// Instantiates and registers the benchmark `MyFixture<double>::Test`.
+BENCHMARK_TEMPLATE_INSTANTIATE_F(MyFixture, Test, double)->Threads(4);
+```
+
+Inside the method definition of `BENCHMARK_TEMPLATE_METHOD_F` the type `Base` refers
+to the type of the instantiated fixture.
+Accesses to members of the fixture must be prefixed by `this->`.
+
+`BENCHMARK_TEMPLATE_METHOD_F` and `BENCHMARK_TEMPLATE_INSTANTIATE_F` can only be used
+if the fixture does not use non-type template parameters.
+If you want to pass values as template parameters, use e.g. `std::integral_constant`.
+For example:
+
+```c++
+template<typename Sz>
+class SizedFixture : public benchmark::Fixture {
+public:
+  static constexpr auto Size = Sz::value;
+  int myValue;
+};
+
+BENCHMARK_TEMPLATE_METHOD_F(SizedFixture, Test)(benchmark::State& st) {
+   for (auto _ : st) {
+     this->myValue = Base::Size;
+  }
+}
+
+BENCHMARK_TEMPLATE_INSTANTIATE_F(SizedFixture, Test, std::integral_constant<int, 5>)->Threads(2);
 ```

 <a name="custom-counters" />
@ -702,12 +785,10 @@ is 1k a 1000 (default, `benchmark::Counter::OneK::kIs1000`), or 1024
  state.counters["BytesProcessed"] = Counter(state.range(0), benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::OneK::kIs1024);
 ```

-When you're compiling in C++11 mode or later you can use `insert()` with
-`std::initializer_list`:
+You can use `insert()` with `std::initializer_list`:

 <!-- {% raw %} -->
 ```c++
-  // With C++11, this can be done:
  state.counters.insert({{"Foo", numFoos}, {"Bar", numBars}, {"Baz", numBazs}});
  // ... instead of:
  state.counters["Foo"] = numFoos;
@ -830,6 +911,46 @@ BENCHMARK(BM_test)->Range(8, 8<<10)->UseRealTime();

 Without `UseRealTime`, CPU time is used by default.

+### Manual Multithreaded Benchmarks
+
+Google Benchmark uses `std::thread` as its multithreading environment by default.
+If you want to use another multithreading environment (e.g. OpenMP), you can provide
+a factory function to your benchmark using the `ThreadRunner` function.
+The factory function takes the number of threads as argument and creates a custom class
+derived from `benchmark::ThreadRunnerBase`.
+This custom class must override the function
+`void RunThreads(const std::function<void(int)>& fn)`.
+`RunThreads` is called by the main thread and spawns the requested number of threads.
+Each spawned thread must call `fn(thread_index)`, where `thread_index` is its own
+thread index. Before `RunThreads` returns, all spawned threads must be joined.
+```c++
+class OpenMPThreadRunner : public benchmark::ThreadRunnerBase
+{
+public:
+  OpenMPThreadRunner(int num_threads)
+  : num_threads_(num_threads)
+  {}
+
+  void RunThreads(const std::function<void(int)>& fn) final
+  {
+#pragma omp parallel num_threads(num_threads_)
+    fn(omp_get_thread_num());
+  }
+
+private:
+  int num_threads_;
+};
+
+BENCHMARK(BM_MultiThreaded)
+  ->ThreadRunner([](int num_threads) {
+    return std::make_unique<OpenMPThreadRunner>(num_threads);
+  })
+  ->Threads(1)->Threads(2)->Threads(4);
+```
+The above example creates a parallel OpenMP region before it enters `BM_MultiThreaded`.
+The actual benchmark code can remain the same and is therefore not tied to a specific
+thread runner. The measurement does not include the time for creating and joining the
+threads.
+
 <a name="cpu-timers" />

 ## CPU Timers
@ -986,11 +1107,11 @@ in any way. `<expr>` may even be removed entirely when the result is already
 known. For example:

 ```c++
-  /* Example 1: `<expr>` is removed entirely. */
+  // Example 1: `<expr>` is removed entirely.
  int foo(int x) { return x + 42; }
  while (...) DoNotOptimize(foo(0)); // Optimized to DoNotOptimize(42);

-  /*  Example 2: Result of '<expr>' is only reused */
+  // Example 2: Result of '<expr>' is only reused.
  int bar(int) __attribute__((const));
  while (...) DoNotOptimize(bar(0)); // Optimized to:
  // int __result__ = bar(0);
@ -1068,6 +1189,7 @@ void BM_spin_empty(benchmark::State& state) {
 }

 BENCHMARK(BM_spin_empty)
+  ->Repetitions(3) // or add option --benchmark_repetitions=3
  ->ComputeStatistics("max", [](const std::vector<double>& v) -> double {
    return *(std::max_element(std::begin(v), std::end(v)));
  })
@ -1087,8 +1209,9 @@ void BM_spin_empty(benchmark::State& state) {
 }

 BENCHMARK(BM_spin_empty)
+  ->Repetitions(3) // or add option --benchmark_repetitions=3
  ->ComputeStatistics("ratio", [](const std::vector<double>& v) -> double {
-    return std::begin(v) / std::end(v);
+    return v.front() / v.back();
  }, benchmark::StatisticUnit::kPercentage)
  ->Arg(512);
 ```
@ -1108,6 +1231,21 @@ a report on the number of allocations, bytes used, etc.
 This data will then be reported alongside other performance data, currently
 only when using JSON output.

+<a name="profiling" />
+
+## Profiling
+
+It's often useful to also profile benchmarks in particular ways, in addition to
+CPU performance. For this reason, benchmark offers the `RegisterProfilerManager`
+method that allows a custom `ProfilerManager` to be injected.
+
+If set, the `ProfilerManager::AfterSetupStart` and
+`ProfilerManager::BeforeTeardownStop` methods will be called at the start and
+end of a separate benchmark run to allow user code to collect and report
+user-provided profile metrics.
+
+Output collected from this profiling run must be reported separately.
+
 <a name="using-register-benchmark" />

 ## Using RegisterBenchmark(name, fn, args...)
@ -1194,7 +1332,7 @@ static void BM_test_ranged_fo(benchmark::State & state) {

 ## A Faster KeepRunning Loop

-In C++11 mode, a ranged-based for loop should be used in preference to
+A range-based for loop should be used in preference to
 the `KeepRunning` loop for running the benchmarks. For example:

 ```c++
--- a/include/benchmark/benchmark.h
+++ b/include/benchmark/benchmark.h
@ -163,52 +163,33 @@ BENCHMARK(BM_test)->Unit(benchmark::kMillisecond);
 #ifndef BENCHMARK_BENCHMARK_H_
 #define BENCHMARK_BENCHMARK_H_

-// The _MSVC_LANG check should detect Visual Studio 2015 Update 3 and newer.
-#if __cplusplus >= 201103L || (defined(_MSVC_LANG) && _MSVC_LANG >= 201103L)
-#define BENCHMARK_HAS_CXX11
-#endif
-
-// This _MSC_VER check should detect VS 2017 v15.3 and newer.
-#if __cplusplus >= 201703L || \
-    (defined(_MSC_VER) && _MSC_VER >= 1911 && _MSVC_LANG >= 201703L)
-#define BENCHMARK_HAS_CXX17
-#endif
-
 #include <stdint.h>

 #include <algorithm>
+#include <atomic>
 #include <cassert>
 #include <cstddef>
+#include <functional>
+#include <initializer_list>
 #include <iosfwd>
 #include <limits>
 #include <map>
+#include <memory>
 #include <set>
 #include <string>
+#include <type_traits>
 #include <utility>
 #include <vector>

 #include "benchmark/export.h"

-#if defined(BENCHMARK_HAS_CXX11)
-#include <atomic>
-#include <initializer_list>
-#include <type_traits>
-#include <utility>
-#endif
-
 #if defined(_MSC_VER)
 #include <intrin.h>  // for _ReadWriteBarrier
 #endif

-#ifndef BENCHMARK_HAS_CXX11
-#define BENCHMARK_DISALLOW_COPY_AND_ASSIGN(TypeName) \
-  TypeName(const TypeName&);                         \
-  TypeName& operator=(const TypeName&)
-#else
 #define BENCHMARK_DISALLOW_COPY_AND_ASSIGN(TypeName) \
  TypeName(const TypeName&) = delete;                \
  TypeName& operator=(const TypeName&) = delete
-#endif

 #ifdef BENCHMARK_HAS_CXX17
 #define BENCHMARK_UNUSED [[maybe_unused]]
@ -284,28 +265,82 @@ BENCHMARK(BM_test)->Unit(benchmark::kMillisecond);
 #define BENCHMARK_UNREACHABLE() ((void)0)
 #endif

-#ifdef BENCHMARK_HAS_CXX11
-#define BENCHMARK_OVERRIDE override
+#if defined(__GNUC__)
+// Determine the cacheline size based on architecture
+#if defined(__i386__) || defined(__x86_64__)
+#define BENCHMARK_INTERNAL_CACHELINE_SIZE 64
+#elif defined(__powerpc64__)
+#define BENCHMARK_INTERNAL_CACHELINE_SIZE 128
+#elif defined(__aarch64__)
+#define BENCHMARK_INTERNAL_CACHELINE_SIZE 64
+#elif defined(__arm__)
+// Cache line sizes for ARM: These values are not strictly correct since
+// cache line sizes depend on implementations, not architectures.  There
+// are even implementations with cache line sizes configurable at boot
+// time.
+#if defined(__ARM_ARCH_5T__)
+#define BENCHMARK_INTERNAL_CACHELINE_SIZE 32
+#elif defined(__ARM_ARCH_7A__)
+#define BENCHMARK_INTERNAL_CACHELINE_SIZE 64
+#endif  // ARM_ARCH
+#endif  // arches
+#endif  // __GNUC__
+
+#ifndef BENCHMARK_INTERNAL_CACHELINE_SIZE
+// A reasonable default guess.  Note that overestimates tend to waste more
+// space, while underestimates tend to waste more time.
+#define BENCHMARK_INTERNAL_CACHELINE_SIZE 64
+#endif
+
+#if defined(__GNUC__)
+// Indicates that the declared object be cache aligned using
+// `BENCHMARK_INTERNAL_CACHELINE_SIZE` (see above).
+#define BENCHMARK_INTERNAL_CACHELINE_ALIGNED \
+  __attribute__((aligned(BENCHMARK_INTERNAL_CACHELINE_SIZE)))
+#elif defined(_MSC_VER)
+#define BENCHMARK_INTERNAL_CACHELINE_ALIGNED \
+  __declspec(align(BENCHMARK_INTERNAL_CACHELINE_SIZE))
 #else
-#define BENCHMARK_OVERRIDE
+#define BENCHMARK_INTERNAL_CACHELINE_ALIGNED
 #endif

 #if defined(_MSC_VER)
 #pragma warning(push)
 // C4251: <symbol> needs to have dll-interface to be used by clients of class
 #pragma warning(disable : 4251)
-#endif
+#endif  // _MSC_VER

 namespace benchmark {
+
+namespace internal {
+#if (__cplusplus < 201402L || (defined(_MSC_VER) && _MSVC_LANG < 201402L))
+template <typename T, typename... Args>
+std::unique_ptr<T> make_unique(Args&&... args) {
+  return std::unique_ptr<T>(new T(std::forward<Args>(args)...));
+}
+#else
+using ::std::make_unique;
+#endif
+}  // namespace internal
+
 class BenchmarkReporter;
+class State;
+
+using IterationCount = int64_t;
+
+// Define alias of Setup/Teardown callback function type
+using callback_function = std::function<void(const benchmark::State&)>;

 // Default number of minimum benchmark running time in seconds.
 const char kDefaultMinTimeStr[] = "0.5s";

+// Returns the version of the library.
+BENCHMARK_EXPORT std::string GetBenchmarkVersion();
+
 BENCHMARK_EXPORT void PrintDefaultHelp();

 BENCHMARK_EXPORT void Initialize(int* argc, char** argv,
-                                 void (*HelperPrinterf)() = PrintDefaultHelp);
+                                 void (*HelperPrintf)() = PrintDefaultHelp);
 BENCHMARK_EXPORT void Shutdown();

 // Report to stdout all arguments in 'argv' as unrecognized except the first.
@ -341,7 +376,7 @@ BENCHMARK_EXPORT BenchmarkReporter* CreateDefaultDisplayReporter();
 // The second and third overload use the specified 'display_reporter' and
 //  'file_reporter' respectively. 'file_reporter' will write to the file
 //  specified
-//   by '--benchmark_output'. If '--benchmark_output' is not given the
+//   by '--benchmark_out'. If '--benchmark_out' is not given the
 //  'file_reporter' is ignored.
 //
 // RETURNS: The number of matching benchmarks.
@ -374,14 +409,15 @@ BENCHMARK_EXPORT void SetDefaultTimeUnit(TimeUnit unit);
 // benchmark.
 class MemoryManager {
 public:
-  static const int64_t TombstoneValue;
+  static constexpr int64_t TombstoneValue = std::numeric_limits<int64_t>::max();

  struct Result {
    Result()
        : num_allocs(0),
          max_bytes_used(0),
          total_allocated_bytes(TombstoneValue),
-          net_heap_growth(TombstoneValue) {}
+          net_heap_growth(TombstoneValue),
+          memory_iterations(0) {}

    // The number of allocations made in total between Start and Stop.
    int64_t num_allocs;
@ -397,6 +433,8 @@ class MemoryManager {
    // ie., total_allocated_bytes - total_deallocated_bytes.
    // Init'ed to TombstoneValue if metric not available.
    int64_t net_heap_growth;
+
+    IterationCount memory_iterations;
  };

  virtual ~MemoryManager() {}
@ -413,6 +451,26 @@ class MemoryManager {
 BENCHMARK_EXPORT
 void RegisterMemoryManager(MemoryManager* memory_manager);

+// If a ProfilerManager is registered (via RegisterProfilerManager()), the
+// benchmark will be run an additional time under the profiler to collect and
+// report profile metrics for the run of the benchmark.
+class ProfilerManager {
+ public:
+  virtual ~ProfilerManager() {}
+
+  // This is called after `Setup()` code and right before the benchmark is run.
+  virtual void AfterSetupStart() = 0;
+
+  // This is called before `Teardown()` code and right after the benchmark
+  // completes.
+  virtual void BeforeTeardownStop() = 0;
+};
+
+// Register a ProfilerManager instance that will be used to collect and report
+// profile measurements for benchmark runs.
+BENCHMARK_EXPORT
+void RegisterProfilerManager(ProfilerManager* profiler_manager);
+
 // Add a key-value pair to output as part of the context stanza in the report.
 BENCHMARK_EXPORT
 void AddCustomContext(const std::string& key, const std::string& value);
@ -429,7 +487,8 @@ void UseCharPointer(char const volatile*);

 // Take ownership of the pointer and register the benchmark. Return the
 // registered benchmark.
-BENCHMARK_EXPORT Benchmark* RegisterBenchmarkInternal(Benchmark*);
+BENCHMARK_EXPORT Benchmark* RegisterBenchmarkInternal(
+    std::unique_ptr<Benchmark>);

 // Ensure that the standard streams are properly initialized in every TU.
 BENCHMARK_EXPORT int InitializeStreams();
@ -444,11 +503,9 @@ BENCHMARK_UNUSED static int stream_init_anchor = InitializeStreams();

 // Force the compiler to flush pending writes to global memory. Acts as an
 // effective read/write barrier
-#ifdef BENCHMARK_HAS_CXX11
 inline BENCHMARK_ALWAYS_INLINE void ClobberMemory() {
  std::atomic_signal_fence(std::memory_order_acq_rel);
 }
-#endif

 // The DoNotOptimize(...) function can be used to prevent a value or
 // expression from being optimized away by the compiler. This function is
@ -473,7 +530,6 @@ inline BENCHMARK_ALWAYS_INLINE void DoNotOptimize(Tp& value) {
 #endif
 }

-#ifdef BENCHMARK_HAS_CXX11
 template <class Tp>
 inline BENCHMARK_ALWAYS_INLINE void DoNotOptimize(Tp&& value) {
 #if defined(__clang__)
@ -482,8 +538,8 @@ inline BENCHMARK_ALWAYS_INLINE void DoNotOptimize(Tp&& value) {
  asm volatile("" : "+m,r"(value) : : "memory");
 #endif
 }
-#endif
-#elif defined(BENCHMARK_HAS_CXX11) && (__GNUC__ >= 5)
+// !defined(__GNUC__) || defined(__llvm__) || defined(__INTEL_COMPILER)
+#elif (__GNUC__ >= 5)
 // Workaround for a bug with full argument copy overhead with GCC.
 // See: #1340 and https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105519
 template <class Tp>
@ -539,37 +595,9 @@ inline BENCHMARK_ALWAYS_INLINE
    DoNotOptimize(Tp&& value) {
  asm volatile("" : "+m"(value) : : "memory");
 }
-
-#else
-// Fallback for GCC < 5. Can add some overhead because the compiler is forced
-// to use memory operations instead of operations with registers.
-// TODO: Remove if GCC < 5 will be unsupported.
-template <class Tp>
-BENCHMARK_DEPRECATED_MSG(
-    "The const-ref version of this method can permit "
-    "undesired compiler optimizations in benchmarks")
-inline BENCHMARK_ALWAYS_INLINE void DoNotOptimize(Tp const& value) {
-  asm volatile("" : : "m"(value) : "memory");
-}
-
-template <class Tp>
-inline BENCHMARK_ALWAYS_INLINE void DoNotOptimize(Tp& value) {
-  asm volatile("" : "+m"(value) : : "memory");
-}
-
-#ifdef BENCHMARK_HAS_CXX11
-template <class Tp>
-inline BENCHMARK_ALWAYS_INLINE void DoNotOptimize(Tp&& value) {
-  asm volatile("" : "+m"(value) : : "memory");
-}
-#endif
+// !defined(__GNUC__) || defined(__llvm__) || defined(__INTEL_COMPILER)
 #endif

-#ifndef BENCHMARK_HAS_CXX11
-inline BENCHMARK_ALWAYS_INLINE void ClobberMemory() {
-  asm volatile("" : : : "memory");
-}
-#endif
 #elif defined(_MSC_VER)
 template <class Tp>
 BENCHMARK_DEPRECATED_MSG(
@ -580,15 +608,9 @@ inline BENCHMARK_ALWAYS_INLINE void DoNotOptimize(Tp const& value) {
  _ReadWriteBarrier();
 }

-#ifndef BENCHMARK_HAS_CXX11
-inline BENCHMARK_ALWAYS_INLINE void ClobberMemory() { _ReadWriteBarrier(); }
-#endif
 #else
 template <class Tp>
-BENCHMARK_DEPRECATED_MSG(
-    "The const-ref version of this method can permit "
-    "undesired compiler optimizations in benchmarks")
-inline BENCHMARK_ALWAYS_INLINE void DoNotOptimize(Tp const& value) {
+inline BENCHMARK_ALWAYS_INLINE void DoNotOptimize(Tp&& value) {
  internal::UseCharPointer(&reinterpret_cast<char const volatile&>(value));
 }
 // FIXME Add ClobberMemory() for non-gnu and non-msvc compilers, before C++11.
@ -639,7 +661,7 @@ class Counter {
  Counter(double v = 0., Flags f = kDefaults, OneK k = kIs1000)
      : value(v), flags(f), oneK(k) {}

-  BENCHMARK_ALWAYS_INLINE operator double const &() const { return value; }
+  BENCHMARK_ALWAYS_INLINE operator double const&() const { return value; }
  BENCHMARK_ALWAYS_INLINE operator double&() { return value; }
 };

@ -660,13 +682,13 @@ typedef std::map<std::string, Counter> UserCounters;
 // calculated automatically to the best fit.
 enum BigO { oNone, o1, oN, oNSquared, oNCubed, oLogN, oNLogN, oAuto, oLambda };

-typedef int64_t IterationCount;
+typedef int64_t ComplexityN;

 enum StatisticUnit { kTime, kPercentage };

 // BigOFunc is passed to a benchmark in order to specify the asymptotic
 // computational complexity for the benchmark.
-typedef double(BigOFunc)(IterationCount);
+typedef double(BigOFunc)(ComplexityN);

 // StatisticsFunc is passed to a benchmark in order to compute some descriptive
 // statistics over all the measurements of some type
@ -688,12 +710,7 @@ class ThreadTimer;
 class ThreadManager;
 class PerfCountersMeasurement;

-enum AggregationReportMode
-#if defined(BENCHMARK_HAS_CXX11)
-    : unsigned
-#else
-#endif
-{
+enum AggregationReportMode : unsigned {
  // The mode has not been manually specified
  ARM_Unspecified = 0,
  // The mode is user-specified.
@ -708,11 +725,7 @@ enum AggregationReportMode
      ARM_FileReportAggregatesOnly | ARM_DisplayReportAggregatesOnly
 };

-enum Skipped
-#if defined(BENCHMARK_HAS_CXX11)
-    : unsigned
-#endif
-{
+enum Skipped : unsigned {
  NotSkipped = 0,
  SkippedWithMessage,
  SkippedWithError
@ -720,9 +733,14 @@ enum Skipped

 }  // namespace internal

+#if defined(_MSC_VER)
+#pragma warning(push)
+// C4324: 'benchmark::State': structure was padded due to alignment specifier
+#pragma warning(disable : 4324)
+#endif  // _MSC_VER
 // State is passed to a running Benchmark and contains state for the
 // benchmark to use.
-class BENCHMARK_EXPORT State {
+class BENCHMARK_EXPORT BENCHMARK_INTERNAL_CACHELINE_ALIGNED State {
 public:
  struct StateIterator;
  friend struct StateIterator;
@ -734,13 +752,13 @@ class BENCHMARK_EXPORT State {
  // have been called previously.
  //
  // NOTE: KeepRunning may not be used after calling either of these functions.
-  BENCHMARK_ALWAYS_INLINE StateIterator begin();
-  BENCHMARK_ALWAYS_INLINE StateIterator end();
+  inline BENCHMARK_ALWAYS_INLINE StateIterator begin();
+  inline BENCHMARK_ALWAYS_INLINE StateIterator end();

  // Returns true if the benchmark should continue through another iteration.
  // NOTE: A benchmark may not return from the test until KeepRunning() has
  // returned false.
-  bool KeepRunning();
+  inline bool KeepRunning();

  // Returns true iff the benchmark should run n more iterations.
  // REQUIRES: 'n' > 0.
@ -752,7 +770,7 @@ class BENCHMARK_EXPORT State {
  //   while (state.KeepRunningBatch(1000)) {
  //     // process 1000 elements
  //   }
-  bool KeepRunningBatch(IterationCount n);
+  inline bool KeepRunningBatch(IterationCount n);

  // REQUIRES: timer is running and 'SkipWithMessage(...)' or
  //   'SkipWithError(...)' has not been called by the current thread.
@ -863,10 +881,12 @@ class BENCHMARK_EXPORT State {
  // and complexity_n will
  // represent the length of N.
  BENCHMARK_ALWAYS_INLINE
-  void SetComplexityN(int64_t complexity_n) { complexity_n_ = complexity_n; }
+  void SetComplexityN(ComplexityN complexity_n) {
+    complexity_n_ = complexity_n;
+  }

  BENCHMARK_ALWAYS_INLINE
-  int64_t complexity_length_n() const { return complexity_n_; }
+  ComplexityN complexity_length_n() const { return complexity_n_; }

  // If this routine is called with items > 0, then an items/s
  // label is printed on the benchmark report line for the currently
@ -955,7 +975,7 @@ class BENCHMARK_EXPORT State {
  // items we don't need on the first cache line
  std::vector<int64_t> range_;

-  int64_t complexity_n_;
+  ComplexityN complexity_n_;

 public:
  // Container for user-defined counters.
@ -965,12 +985,13 @@ class BENCHMARK_EXPORT State {
  State(std::string name, IterationCount max_iters,
        const std::vector<int64_t>& ranges, int thread_i, int n_threads,
        internal::ThreadTimer* timer, internal::ThreadManager* manager,
-        internal::PerfCountersMeasurement* perf_counters_measurement);
+        internal::PerfCountersMeasurement* perf_counters_measurement,
+        ProfilerManager* profiler_manager);

  void StartKeepRunning();
  // Implementation of KeepRunning() and KeepRunningBatch().
  // is_batch must be true unless n is 1.
-  bool KeepRunningInternal(IterationCount n, bool is_batch);
+  inline bool KeepRunningInternal(IterationCount n, bool is_batch);
  void FinishKeepRunning();

  const std::string name_;
@ -980,9 +1001,13 @@ class BENCHMARK_EXPORT State {
  internal::ThreadTimer* const timer_;
  internal::ThreadManager* const manager_;
  internal::PerfCountersMeasurement* const perf_counters_measurement_;
+  ProfilerManager* const profiler_manager_;

  friend class internal::BenchmarkInstance;
 };
+#if defined(_MSC_VER)
+#pragma warning(pop)
+#endif  // _MSC_VER

 inline BENCHMARK_ALWAYS_INLINE bool State::KeepRunning() {
  return KeepRunningInternal(1, /*is_batch=*/false);
@ -1068,8 +1093,18 @@ inline BENCHMARK_ALWAYS_INLINE State::StateIterator State::end() {
  return StateIterator();
 }

+// Base class for user-defined multi-threading
+struct ThreadRunnerBase {
+  virtual ~ThreadRunnerBase() {}
+  virtual void RunThreads(const std::function<void(int)>& fn) = 0;
+};
+
 namespace internal {

+// Define alias of ThreadRunner factory function type
+using threadrunner_factory =
+    std::function<std::unique_ptr<ThreadRunnerBase>(int)>;
+
 typedef void(Function)(State&);

 // ------------------------------------------------------
@ -1124,12 +1159,12 @@ class BENCHMARK_EXPORT Benchmark {
  // Run this benchmark once for a number of values picked from the
  // ranges [start..limit].  (starts and limits are always picked.)
  // REQUIRES: The function passed to the constructor must accept arg1, arg2 ...
-  Benchmark* Ranges(const std::vector<std::pair<int64_t, int64_t> >& ranges);
+  Benchmark* Ranges(const std::vector<std::pair<int64_t, int64_t>>& ranges);

  // Run this benchmark once for each combination of values in the (cartesian)
  // product of the supplied argument lists.
  // REQUIRES: The function passed to the constructor must accept arg1, arg2 ...
-  Benchmark* ArgsProduct(const std::vector<std::vector<int64_t> >& arglists);
+  Benchmark* ArgsProduct(const std::vector<std::vector<int64_t>>& arglists);

  // Equivalent to ArgNames({name})
  Benchmark* ArgName(const std::string& name);
@ -1142,7 +1177,7 @@ class BENCHMARK_EXPORT Benchmark {
  // NOTE: This is a legacy C++03 interface provided for compatibility only.
  //   New code should use 'Ranges'.
  Benchmark* RangePair(int64_t lo1, int64_t hi1, int64_t lo2, int64_t hi2) {
-    std::vector<std::pair<int64_t, int64_t> > ranges;
+    std::vector<std::pair<int64_t, int64_t>> ranges;
    ranges.push_back(std::make_pair(lo1, hi1));
    ranges.push_back(std::make_pair(lo2, hi2));
    return Ranges(ranges);
@ -1160,15 +1195,15 @@ class BENCHMARK_EXPORT Benchmark {
  //
  // The callback will be passed a State object, which includes the number
  // of threads, thread-index, benchmark arguments, etc.
-  //
-  // The callback must not be NULL or self-deleting.
-  Benchmark* Setup(void (*setup)(const benchmark::State&));
-  Benchmark* Teardown(void (*teardown)(const benchmark::State&));
+  Benchmark* Setup(callback_function&&);
+  Benchmark* Setup(const callback_function&);
+  Benchmark* Teardown(callback_function&&);
+  Benchmark* Teardown(const callback_function&);

  // Pass this benchmark object to *func, which can customize
  // the benchmark by calling various methods like Arg, Args,
  // Threads, etc.
-  Benchmark* Apply(void (*func)(Benchmark* benchmark));
+  Benchmark* Apply(void (*custom_arguments)(Benchmark* benchmark));

  // Set the range multiplier for non-dense range. If not called, the range
  // multiplier kRangeMultiplier will be used.
@ -1274,6 +1309,9 @@ class BENCHMARK_EXPORT Benchmark {
  // Equivalent to ThreadRange(NumCPUs(), NumCPUs())
  Benchmark* ThreadPerCpu();

+  // Sets a user-defined threadrunner (see ThreadRunnerBase)
+  Benchmark* ThreadRunner(threadrunner_factory&& factory);
+
  virtual void Run(State& state) = 0;

  TimeUnit GetTimeUnit() const;
@ -1293,8 +1331,8 @@ class BENCHMARK_EXPORT Benchmark {

  std::string name_;
  AggregationReportMode aggregation_report_mode_;
-  std::vector<std::string> arg_names_;       // Args for all benchmark runs
-  std::vector<std::vector<int64_t> > args_;  // Args for all benchmark runs
+  std::vector<std::string> arg_names_;      // Args for all benchmark runs
+  std::vector<std::vector<int64_t>> args_;  // Args for all benchmark runs

  TimeUnit time_unit_;
  bool use_default_time_unit_;
@ -1312,21 +1350,12 @@ class BENCHMARK_EXPORT Benchmark {
  std::vector<Statistics> statistics_;
  std::vector<int> thread_counts_;

-  typedef void (*callback_function)(const benchmark::State&);
  callback_function setup_;
  callback_function teardown_;

-  Benchmark(Benchmark const&)
-#if defined(BENCHMARK_HAS_CXX11)
-      = delete
-#endif
-      ;
+  threadrunner_factory threadrunner_;

-  Benchmark& operator=(Benchmark const&)
-#if defined(BENCHMARK_HAS_CXX11)
-      = delete
-#endif
-      ;
+  BENCHMARK_DISALLOW_COPY_AND_ASSIGN(Benchmark);
 };

 }  // namespace internal
@ -1338,10 +1367,8 @@ class BENCHMARK_EXPORT Benchmark {
 internal::Benchmark* RegisterBenchmark(const std::string& name,
                                       internal::Function* fn);

-#if defined(BENCHMARK_HAS_CXX11)
 template <class Lambda>
 internal::Benchmark* RegisterBenchmark(const std::string& name, Lambda&& fn);
-#endif

 // Remove all registered benchmarks. All pointers to previously registered
 // benchmarks are invalidated.
@ -1355,71 +1382,56 @@ class BENCHMARK_EXPORT FunctionBenchmark : public Benchmark {
  FunctionBenchmark(const std::string& name, Function* func)
      : Benchmark(name), func_(func) {}

-  void Run(State& st) BENCHMARK_OVERRIDE;
+  void Run(State& st) override;

 private:
  Function* func_;
 };

-#ifdef BENCHMARK_HAS_CXX11
 template <class Lambda>
 class LambdaBenchmark : public Benchmark {
 public:
-  void Run(State& st) BENCHMARK_OVERRIDE { lambda_(st); }
+  void Run(State& st) override { lambda_(st); }

- private:
  template <class OLambda>
  LambdaBenchmark(const std::string& name, OLambda&& lam)
      : Benchmark(name), lambda_(std::forward<OLambda>(lam)) {}

+ private:
  LambdaBenchmark(LambdaBenchmark const&) = delete;
-
-  template <class Lam>  // NOLINTNEXTLINE(readability-redundant-declaration)
-  friend Benchmark* ::benchmark::RegisterBenchmark(const std::string&, Lam&&);
-
  Lambda lambda_;
 };
-#endif
 }  // namespace internal

 inline internal::Benchmark* RegisterBenchmark(const std::string& name,
                                              internal::Function* fn) {
-  // FIXME: this should be a `std::make_unique<>()` but we don't have C++14.
-  // codechecker_intentional [cplusplus.NewDeleteLeaks]
  return internal::RegisterBenchmarkInternal(
-      ::new internal::FunctionBenchmark(name, fn));
+      ::benchmark::internal::make_unique<internal::FunctionBenchmark>(name,
+                                                                      fn));
 }

-#ifdef BENCHMARK_HAS_CXX11
 template <class Lambda>
 internal::Benchmark* RegisterBenchmark(const std::string& name, Lambda&& fn) {
  using BenchType =
      internal::LambdaBenchmark<typename std::decay<Lambda>::type>;
-  // FIXME: this should be a `std::make_unique<>()` but we don't have C++14.
-  // codechecker_intentional [cplusplus.NewDeleteLeaks]
  return internal::RegisterBenchmarkInternal(
-      ::new BenchType(name, std::forward<Lambda>(fn)));
+      ::benchmark::internal::make_unique<BenchType>(name,
+                                                    std::forward<Lambda>(fn)));
 }
-#endif

-#if defined(BENCHMARK_HAS_CXX11) && \
-    (!defined(BENCHMARK_GCC_VERSION) || BENCHMARK_GCC_VERSION >= 409)
 template <class Lambda, class... Args>
 internal::Benchmark* RegisterBenchmark(const std::string& name, Lambda&& fn,
                                       Args&&... args) {
  return benchmark::RegisterBenchmark(
      name, [=](benchmark::State& st) { fn(st, args...); });
 }
-#else
-#define BENCHMARK_HAS_NO_VARIADIC_REGISTER_BENCHMARK
-#endif

 // The base class for all fixture tests.
 class Fixture : public internal::Benchmark {
 public:
  Fixture() : internal::Benchmark("") {}

-  void Run(State& st) BENCHMARK_OVERRIDE {
+  void Run(State& st) override {
    this->SetUp(st);
    this->BenchmarkCase(st);
    this->TearDown(st);
@ -1450,14 +1462,9 @@ class Fixture : public internal::Benchmark {
 #endif

 // Helpers for generating unique variable names
-#ifdef BENCHMARK_HAS_CXX11
 #define BENCHMARK_PRIVATE_NAME(...)                                      \
  BENCHMARK_PRIVATE_CONCAT(benchmark_uniq_, BENCHMARK_PRIVATE_UNIQUE_ID, \
                           __VA_ARGS__)
-#else
-#define BENCHMARK_PRIVATE_NAME(n) \
-  BENCHMARK_PRIVATE_CONCAT(benchmark_uniq_, BENCHMARK_PRIVATE_UNIQUE_ID, n)
-#endif  // BENCHMARK_HAS_CXX11

 #define BENCHMARK_PRIVATE_CONCAT(a, b, c) BENCHMARK_PRIVATE_CONCAT2(a, b, c)
 #define BENCHMARK_PRIVATE_CONCAT2(a, b, c) a##b##c
@ -1465,22 +1472,17 @@ class Fixture : public internal::Benchmark {
 #define BENCHMARK_PRIVATE_CONCAT_NAME(BaseClass, Method) \
  BaseClass##_##Method##_Benchmark

-#define BENCHMARK_PRIVATE_DECLARE(n)                                 \
-  static ::benchmark::internal::Benchmark* BENCHMARK_PRIVATE_NAME(n) \
-      BENCHMARK_UNUSED
+#define BENCHMARK_PRIVATE_DECLARE(n)                                           \
+  /* NOLINTNEXTLINE(misc-use-anonymous-namespace) */                           \
+  static ::benchmark::internal::Benchmark const* const BENCHMARK_PRIVATE_NAME( \
+      n) BENCHMARK_UNUSED

-#ifdef BENCHMARK_HAS_CXX11
-#define BENCHMARK(...)                                               \
-  BENCHMARK_PRIVATE_DECLARE(_benchmark_) =                           \
-      (::benchmark::internal::RegisterBenchmarkInternal(             \
-          new ::benchmark::internal::FunctionBenchmark(#__VA_ARGS__, \
-                                                       __VA_ARGS__)))
-#else
-#define BENCHMARK(n)                                     \
-  BENCHMARK_PRIVATE_DECLARE(n) =                         \
-      (::benchmark::internal::RegisterBenchmarkInternal( \
-          new ::benchmark::internal::FunctionBenchmark(#n, n)))
-#endif  // BENCHMARK_HAS_CXX11
+#define BENCHMARK(...)                                                \
+  BENCHMARK_PRIVATE_DECLARE(_benchmark_) =                            \
+      (::benchmark::internal::RegisterBenchmarkInternal(              \
+          ::benchmark::internal::make_unique<                         \
+              ::benchmark::internal::FunctionBenchmark>(#__VA_ARGS__, \
+                                                        __VA_ARGS__)))

 // Old-style macros
 #define BENCHMARK_WITH_ARG(n, a) BENCHMARK(n)->Arg((a))
@ -1490,8 +1492,6 @@ class Fixture : public internal::Benchmark {
 #define BENCHMARK_RANGE2(n, l1, h1, l2, h2) \
  BENCHMARK(n)->RangePair({{(l1), (h1)}, {(l2), (h2)}})

-#ifdef BENCHMARK_HAS_CXX11
-
 // Register a benchmark which invokes the function specified by `func`
 // with the additional arguments specified by `...`.
 //
@ -1504,14 +1504,13 @@ class Fixture : public internal::Benchmark {
 // /* Registers a benchmark named "BM_takes_args/int_string_test` */
 // BENCHMARK_CAPTURE(BM_takes_args, int_string_test, 42, std::string("abc"));
 #define BENCHMARK_CAPTURE(func, test_case_name, ...)     \
-  BENCHMARK_PRIVATE_DECLARE(func) =                      \
+  BENCHMARK_PRIVATE_DECLARE(_benchmark_) =               \
      (::benchmark::internal::RegisterBenchmarkInternal( \
-          new ::benchmark::internal::FunctionBenchmark(  \
+          ::benchmark::internal::make_unique<            \
+              ::benchmark::internal::FunctionBenchmark>( \
              #func "/" #test_case_name,                 \
              [](::benchmark::State& st) { func(st, __VA_ARGS__); })))

-#endif  // BENCHMARK_HAS_CXX11
-
 // This will register a benchmark for a templatized function.  For example:
 //
 // template<int arg>
@ -1523,33 +1522,56 @@ class Fixture : public internal::Benchmark {
 #define BENCHMARK_TEMPLATE1(n, a)                        \
  BENCHMARK_PRIVATE_DECLARE(n) =                         \
      (::benchmark::internal::RegisterBenchmarkInternal( \
-          new ::benchmark::internal::FunctionBenchmark(#n "<" #a ">", n<a>)))
+          ::benchmark::internal::make_unique<            \
+              ::benchmark::internal::FunctionBenchmark>(#n "<" #a ">", n<a>)))

-#define BENCHMARK_TEMPLATE2(n, a, b)                                         \
-  BENCHMARK_PRIVATE_DECLARE(n) =                                             \
-      (::benchmark::internal::RegisterBenchmarkInternal(                     \
-          new ::benchmark::internal::FunctionBenchmark(#n "<" #a "," #b ">", \
-                                                       n<a, b>)))
+#define BENCHMARK_TEMPLATE2(n, a, b)                                          \
+  BENCHMARK_PRIVATE_DECLARE(n) =                                              \
+      (::benchmark::internal::RegisterBenchmarkInternal(                      \
+          ::benchmark::internal::make_unique<                                 \
+              ::benchmark::internal::FunctionBenchmark>(#n "<" #a "," #b ">", \
+                                                        n<a, b>)))

-#ifdef BENCHMARK_HAS_CXX11
 #define BENCHMARK_TEMPLATE(n, ...)                       \
  BENCHMARK_PRIVATE_DECLARE(n) =                         \
      (::benchmark::internal::RegisterBenchmarkInternal( \
-          new ::benchmark::internal::FunctionBenchmark(  \
+          ::benchmark::internal::make_unique<            \
+              ::benchmark::internal::FunctionBenchmark>( \
              #n "<" #__VA_ARGS__ ">", n<__VA_ARGS__>)))
-#else
-#define BENCHMARK_TEMPLATE(n, a) BENCHMARK_TEMPLATE1(n, a)
-#endif

-#define BENCHMARK_PRIVATE_DECLARE_F(BaseClass, Method)          \
-  class BaseClass##_##Method##_Benchmark : public BaseClass {   \
-   public:                                                      \
-    BaseClass##_##Method##_Benchmark() {                        \
-      this->SetName(#BaseClass "/" #Method);                    \
-    }                                                           \
-                                                                \
-   protected:                                                   \
-    void BenchmarkCase(::benchmark::State&) BENCHMARK_OVERRIDE; \
+// This will register a benchmark for a templatized function,
+// with the additional arguments specified by `...`.
+//
+// For example:
+//
+// template <typename T, class ...ExtraArgs>
+// void BM_takes_args(benchmark::State& state, ExtraArgs&&... extra_args) {
+//  [...]
+// }
+// /* Registers a benchmark named "BM_takes_args<void>/int_string_test" */
+// BENCHMARK_TEMPLATE1_CAPTURE(BM_takes_args, void, int_string_test, 42,
+//                             std::string("abc"));
+#define BENCHMARK_TEMPLATE1_CAPTURE(func, a, test_case_name, ...) \
+  BENCHMARK_CAPTURE(func<a>, test_case_name, __VA_ARGS__)
+
+#define BENCHMARK_TEMPLATE2_CAPTURE(func, a, b, test_case_name, ...) \
+  BENCHMARK_PRIVATE_DECLARE(func) =                                  \
+      (::benchmark::internal::RegisterBenchmarkInternal(             \
+          ::benchmark::internal::make_unique<                        \
+              ::benchmark::internal::FunctionBenchmark>(             \
+              #func "<" #a "," #b ">"                                \
+                    "/" #test_case_name,                             \
+              [](::benchmark::State& st) { func<a, b>(st, __VA_ARGS__); })))
+
+#define BENCHMARK_PRIVATE_DECLARE_F(BaseClass, Method)        \
+  class BaseClass##_##Method##_Benchmark : public BaseClass { \
+   public:                                                    \
+    BaseClass##_##Method##_Benchmark() {                      \
+      this->SetName(#BaseClass "/" #Method);                  \
+    }                                                         \
+                                                              \
+   protected:                                                 \
+    void BenchmarkCase(::benchmark::State&) override;         \
  };

 #define BENCHMARK_TEMPLATE1_PRIVATE_DECLARE_F(BaseClass, Method, a) \
@ -1560,7 +1582,7 @@ class Fixture : public internal::Benchmark {
    }                                                               \
                                                                    \
   protected:                                                       \
-    void BenchmarkCase(::benchmark::State&) BENCHMARK_OVERRIDE;     \
+    void BenchmarkCase(::benchmark::State&) override;               \
  };

 #define BENCHMARK_TEMPLATE2_PRIVATE_DECLARE_F(BaseClass, Method, a, b) \
@ -1571,10 +1593,9 @@ class Fixture : public internal::Benchmark {
    }                                                                  \
                                                                       \
   protected:                                                          \
-    void BenchmarkCase(::benchmark::State&) BENCHMARK_OVERRIDE;        \
+    void BenchmarkCase(::benchmark::State&) override;                  \
  };

-#ifdef BENCHMARK_HAS_CXX11
 #define BENCHMARK_TEMPLATE_PRIVATE_DECLARE_F(BaseClass, Method, ...)       \
  class BaseClass##_##Method##_Benchmark : public BaseClass<__VA_ARGS__> { \
   public:                                                                 \
@ -1583,12 +1604,8 @@ class Fixture : public internal::Benchmark {
    }                                                                      \
                                                                           \
   protected:                                                              \
-    void BenchmarkCase(::benchmark::State&) BENCHMARK_OVERRIDE;            \
+    void BenchmarkCase(::benchmark::State&) override;                      \
  };
-#else
-#define BENCHMARK_TEMPLATE_PRIVATE_DECLARE_F(n, a) \
-  BENCHMARK_TEMPLATE1_PRIVATE_DECLARE_F(n, a)
-#endif

 #define BENCHMARK_DEFINE_F(BaseClass, Method)    \
  BENCHMARK_PRIVATE_DECLARE_F(BaseClass, Method) \
@ -1602,21 +1619,48 @@ class Fixture : public internal::Benchmark {
  BENCHMARK_TEMPLATE2_PRIVATE_DECLARE_F(BaseClass, Method, a, b) \
  void BENCHMARK_PRIVATE_CONCAT_NAME(BaseClass, Method)::BenchmarkCase

-#ifdef BENCHMARK_HAS_CXX11
 #define BENCHMARK_TEMPLATE_DEFINE_F(BaseClass, Method, ...)            \
  BENCHMARK_TEMPLATE_PRIVATE_DECLARE_F(BaseClass, Method, __VA_ARGS__) \
  void BENCHMARK_PRIVATE_CONCAT_NAME(BaseClass, Method)::BenchmarkCase
-#else
-#define BENCHMARK_TEMPLATE_DEFINE_F(BaseClass, Method, a) \
-  BENCHMARK_TEMPLATE1_DEFINE_F(BaseClass, Method, a)
-#endif

 #define BENCHMARK_REGISTER_F(BaseClass, Method) \
  BENCHMARK_PRIVATE_REGISTER_F(BENCHMARK_PRIVATE_CONCAT_NAME(BaseClass, Method))

-#define BENCHMARK_PRIVATE_REGISTER_F(TestName) \
-  BENCHMARK_PRIVATE_DECLARE(TestName) =        \
-      (::benchmark::internal::RegisterBenchmarkInternal(new TestName()))
+#define BENCHMARK_PRIVATE_REGISTER_F(TestName)           \
+  BENCHMARK_PRIVATE_DECLARE(TestName) =                  \
+      (::benchmark::internal::RegisterBenchmarkInternal( \
+          ::benchmark::internal::make_unique<TestName>()))
+
+#define BENCHMARK_TEMPLATE_PRIVATE_CONCAT_NAME_F(BaseClass, Method) \
+  BaseClass##_##Method##_BenchmarkTemplate
+
+#define BENCHMARK_TEMPLATE_METHOD_F(BaseClass, Method)              \
+  template <class... Args>                                          \
+  class BENCHMARK_TEMPLATE_PRIVATE_CONCAT_NAME_F(BaseClass, Method) \
+      : public BaseClass<Args...> {                                 \
+   protected:                                                       \
+    using Base = BaseClass<Args...>;                                \
+    void BenchmarkCase(::benchmark::State&) override;               \
+  };                                                                \
+  template <class... Args>                                          \
+  void BENCHMARK_TEMPLATE_PRIVATE_CONCAT_NAME_F(                    \
+      BaseClass, Method)<Args...>::BenchmarkCase
+
+#define BENCHMARK_TEMPLATE_PRIVATE_INSTANTIATE_F(BaseClass, Method,           \
+                                                 UniqueName, ...)             \
+  class UniqueName : public BENCHMARK_TEMPLATE_PRIVATE_CONCAT_NAME_F(         \
+                         BaseClass, Method)<__VA_ARGS__> {                    \
+   public:                                                                    \
+    UniqueName() { this->SetName(#BaseClass "<" #__VA_ARGS__ ">/" #Method); } \
+  };                                                                          \
+  BENCHMARK_PRIVATE_DECLARE(BaseClass##_##Method##_Benchmark) =               \
+      (::benchmark::internal::RegisterBenchmarkInternal(                      \
+          ::benchmark::internal::make_unique<UniqueName>()))
+
+#define BENCHMARK_TEMPLATE_INSTANTIATE_F(BaseClass, Method, ...)    \
+  BENCHMARK_TEMPLATE_PRIVATE_INSTANTIATE_F(                         \
+      BaseClass, Method, BENCHMARK_PRIVATE_NAME(BaseClass##Method), \
+      __VA_ARGS__)

 // This macro will define and register a benchmark within a fixture class.
 #define BENCHMARK_F(BaseClass, Method)           \
@ -1634,22 +1678,17 @@ class Fixture : public internal::Benchmark {
  BENCHMARK_REGISTER_F(BaseClass, Method);                       \
  void BENCHMARK_PRIVATE_CONCAT_NAME(BaseClass, Method)::BenchmarkCase

-#ifdef BENCHMARK_HAS_CXX11
 #define BENCHMARK_TEMPLATE_F(BaseClass, Method, ...)                   \
  BENCHMARK_TEMPLATE_PRIVATE_DECLARE_F(BaseClass, Method, __VA_ARGS__) \
  BENCHMARK_REGISTER_F(BaseClass, Method);                             \
  void BENCHMARK_PRIVATE_CONCAT_NAME(BaseClass, Method)::BenchmarkCase
-#else
-#define BENCHMARK_TEMPLATE_F(BaseClass, Method, a) \
-  BENCHMARK_TEMPLATE1_F(BaseClass, Method, a)
-#endif

 // Helper macro to create a main routine in a test that runs the benchmarks
 // Note the workaround for Hexagon simulator passing argc != 0, argv = NULL.
 #define BENCHMARK_MAIN()                                                \
  int main(int argc, char** argv) {                                     \
    char arg0_default[] = "benchmark";                                  \
-    char* args_default = arg0_default;                                  \
+    char* args_default = reinterpret_cast<char*>(arg0_default);         \
    if (!argv) {                                                        \
      argc = 1;                                                         \
      argv = &args_default;                                             \
@ -1729,7 +1768,7 @@ class BENCHMARK_EXPORT BenchmarkReporter {
    CPUInfo const& cpu_info;
    SystemInfo const& sys_info;
    // The number of chars in the longest benchmark name.
-    size_t name_field_width;
+    size_t name_field_width = 0;
    static const char* executable_name;
    Context();
  };
@ -1748,12 +1787,12 @@ class BENCHMARK_EXPORT BenchmarkReporter {
          real_accumulated_time(0),
          cpu_accumulated_time(0),
          max_heapbytes_used(0),
+          use_real_time_for_initial_big_o(false),
          complexity(oNone),
          complexity_lambda(),
          complexity_n(0),
          report_big_o(false),
          report_rms(false),
-          memory_result(NULL),
          allocs_per_iter(0.0) {}

    std::string benchmark_name() const;
@ -1790,10 +1829,14 @@ class BENCHMARK_EXPORT BenchmarkReporter {
    // This is set to 0.0 if memory tracing is not enabled.
    double max_heapbytes_used;

+    // By default Big-O is computed for CPU time, but that is not what you want
+    // to happen when manual time was requested, which is stored as real time.
+    bool use_real_time_for_initial_big_o;
+
    // Keep track of arguments to compute asymptotic complexity
    BigO complexity;
    BigOFunc* complexity_lambda;
-    int64_t complexity_n;
+    ComplexityN complexity_n;

    // what statistics to compute from the measurements
    const std::vector<internal::Statistics>* statistics;
@ -1805,7 +1848,7 @@ class BENCHMARK_EXPORT BenchmarkReporter {
    UserCounters counters;

    // Memory metrics.
-    const MemoryManager::Result* memory_result;
+    MemoryManager::Result memory_result;
    double allocs_per_iter;
  };

@ -1897,12 +1940,12 @@ class BENCHMARK_EXPORT ConsoleReporter : public BenchmarkReporter {
  explicit ConsoleReporter(OutputOptions opts_ = OO_Defaults)
      : output_options_(opts_), name_field_width_(0), printed_header_(false) {}

-  bool ReportContext(const Context& context) BENCHMARK_OVERRIDE;
-  void ReportRuns(const std::vector<Run>& reports) BENCHMARK_OVERRIDE;
+  bool ReportContext(const Context& context) override;
+  void ReportRuns(const std::vector<Run>& reports) override;

 protected:
-  virtual void PrintRunData(const Run& report);
-  virtual void PrintHeader(const Run& report);
+  virtual void PrintRunData(const Run& result);
+  virtual void PrintHeader(const Run& run);

  OutputOptions output_options_;
  size_t name_field_width_;
@ -1913,12 +1956,12 @@ class BENCHMARK_EXPORT ConsoleReporter : public BenchmarkReporter {
 class BENCHMARK_EXPORT JSONReporter : public BenchmarkReporter {
 public:
  JSONReporter() : first_report_(true) {}
-  bool ReportContext(const Context& context) BENCHMARK_OVERRIDE;
-  void ReportRuns(const std::vector<Run>& reports) BENCHMARK_OVERRIDE;
-  void Finalize() BENCHMARK_OVERRIDE;
+  bool ReportContext(const Context& context) override;
+  void ReportRuns(const std::vector<Run>& reports) override;
+  void Finalize() override;

 private:
-  void PrintRunData(const Run& report);
+  void PrintRunData(const Run& run);

  bool first_report_;
 };
@ -1928,11 +1971,11 @@ class BENCHMARK_EXPORT BENCHMARK_DEPRECATED_MSG(
    : public BenchmarkReporter {
 public:
  CSVReporter() : printed_header_(false) {}
-  bool ReportContext(const Context& context) BENCHMARK_OVERRIDE;
-  void ReportRuns(const std::vector<Run>& reports) BENCHMARK_OVERRIDE;
+  bool ReportContext(const Context& context) override;
+  void ReportRuns(const std::vector<Run>& reports) override;

 private:
-  void PrintRunData(const Run& report);
+  void PrintRunData(const Run& run);

  bool printed_header_;
  std::set<std::string> user_counter_names_;
--- a/pyproject.toml
+++ b/pyproject.toml
@ -1,36 +1,34 @@
 [build-system]
-requires = ["setuptools", "wheel"]
+requires = ["setuptools"]
 build-backend = "setuptools.build_meta"

 [project]
 name = "google_benchmark"
 description = "A library to benchmark code snippets."
-requires-python = ">=3.8"
-license = {file = "LICENSE"}
+requires-python = ">=3.10"
+license = { file = "LICENSE" }
 keywords = ["benchmark"]

-authors = [
-    {name = "Google", email = "benchmark-discuss@googlegroups.com"},
-]
+authors = [{ name = "Google", email = "benchmark-discuss@googlegroups.com" }]

 classifiers = [
    "Development Status :: 4 - Beta",
    "Intended Audience :: Developers",
    "Intended Audience :: Science/Research",
    "License :: OSI Approved :: Apache Software License",
-    "Programming Language :: Python :: 3.8",
-    "Programming Language :: Python :: 3.9",
    "Programming Language :: Python :: 3.10",
    "Programming Language :: Python :: 3.11",
+    "Programming Language :: Python :: 3.12",
    "Topic :: Software Development :: Testing",
    "Topic :: System :: Benchmark",
 ]

 dynamic = ["readme", "version"]

-dependencies = [
-    "absl-py>=0.7.1",
-]
+dependencies = ["absl-py>=0.7.1"]
+
+[project.optional-dependencies]
+dev = ["pre-commit>=3.3.3"]

 [project.urls]
 Homepage = "https://github.com/google/benchmark"
@ -39,12 +37,42 @@ Repository = "https://github.com/google/benchmark.git"
 Discord = "https://discord.gg/cz7UX7wKC2"

 [tool.setuptools]
-package-dir = {"" = "bindings/python"}
+package-dir = { "" = "bindings/python" }
 zip-safe = false

 [tool.setuptools.packages.find]
 where = ["bindings/python"]

 [tool.setuptools.dynamic]
-version = { attr = "google_benchmark.__version__" }
 readme = { file = "README.md", content-type = "text/markdown" }
+version = { attr = "google_benchmark.__version__" }
+
+[tool.mypy]
+check_untyped_defs = true
+disallow_incomplete_defs = true
+pretty = true
+python_version = "3.11"
+strict_optional = false
+warn_unreachable = true
+
+[[tool.mypy.overrides]]
+module = ["yaml"]
+ignore_missing_imports = true
+
+[tool.ruff]
+# Explicitly tell ruff the source directory so it correctly identifies the first-party package.
+src = ["bindings/python"]
+
+line-length = 80
+target-version = "py311"
+
+[tool.ruff.lint]
+# Enable pycodestyle (`E`, `W`), Pyflakes (`F`), isort (`I`), and the other rule sets selected below.
+select = ["ASYNC", "B", "C4", "C90", "E", "F", "I", "PERF", "PIE", "PT018", "RUF", "SIM", "UP", "W"]
+ignore = [
+    "PLW2901",  # redefined-loop-name
+    "UP031",    # printf-string-formatting
+]
+
+[tool.ruff.lint.isort]
+combine-as-imports = true
--- a/setup.py
+++ b/setup.py
@ -1,46 +1,71 @@
 import contextlib
 import os
 import platform
+import re
 import shutil
-import sysconfig
+import sys
+from collections.abc import Generator
 from pathlib import Path
+from typing import Any

 import setuptools
 from setuptools.command import build_ext

-
-PYTHON_INCLUDE_PATH_PLACEHOLDER = "<PYTHON_INCLUDE_PATH>"
-
 IS_WINDOWS = platform.system() == "Windows"
 IS_MAC = platform.system() == "Darwin"
+IS_LINUX = platform.system() == "Linux"
+
+# Hardcoded stable-ABI (SABI) options. Requires that every participating Python
+# interpreter (hermetic or not) is of the same major-minor version.
+py_limited_api = sys.version_info >= (3, 12)
+options = {"bdist_wheel": {"py_limited_api": "cp312"}} if py_limited_api else {}
+
+
+def is_cibuildwheel() -> bool:
+    return os.getenv("CIBUILDWHEEL") is not None


@contextlib.contextmanager
-def temp_fill_include_path(fp: str):
-    """Temporarily set the Python include path in a file."""
-    with open(fp, "r+") as f:
-        try:
-            content = f.read()
-            replaced = content.replace(
-                PYTHON_INCLUDE_PATH_PLACEHOLDER,
-                Path(sysconfig.get_paths()['include']).as_posix(),
+def _maybe_patch_toolchains() -> Generator[None, None, None]:
+    """
+    Patch rules_python toolchains to ignore root user error
+    when run in a Docker container on Linux in cibuildwheel.
+    """
+
+    def fmt_toolchain_args(matchobj):
+        suffix = "ignore_root_user_error = True"
+        callargs = matchobj.group(1)
+        # toolchain def is broken over multiple lines
+        if callargs.endswith("\n"):
+            callargs = callargs + "    " + suffix + ",\n"
+        # toolchain def is on one line.
+        else:
+            callargs = callargs + ", " + suffix
+        return "python.toolchain(" + callargs + ")"
+
+    CIBW_LINUX = is_cibuildwheel() and IS_LINUX
+    module_bazel = Path("MODULE.bazel")
+    content: str = module_bazel.read_text()
+    try:
+        if CIBW_LINUX:
+            module_bazel.write_text(
+                re.sub(
+                    r"python.toolchain\(([\w\"\s,.=]*)\)",
+                    fmt_toolchain_args,
+                    content,
+                )
            )
-            f.seek(0)
-            f.write(replaced)
-            f.truncate()
-            yield
-        finally:
-            # revert to the original content after exit
-            f.seek(0)
-            f.write(content)
-            f.truncate()
+        yield
+    finally:
+        if CIBW_LINUX:
+            module_bazel.write_text(content)


 class BazelExtension(setuptools.Extension):
    """A C/C++ extension that is defined as a Bazel BUILD target."""

-    def __init__(self, name: str, bazel_target: str):
-        super().__init__(name=name, sources=[])
+    def __init__(self, name: str, bazel_target: str, **kwargs: Any):
+        super().__init__(name=name, sources=[], **kwargs)

        self.bazel_target = bazel_target
        stripped_target = bazel_target.split("//")[-1]
@ -53,61 +78,89 @@ class BuildBazelExtension(build_ext.build_ext):
    def run(self):
        for ext in self.extensions:
            self.bazel_build(ext)
-        build_ext.build_ext.run(self)
+        # explicitly call `bazel shutdown` for graceful exit
+        self.spawn(["bazel", "shutdown"])

-    def bazel_build(self, ext: BazelExtension):
+    def copy_extensions_to_source(self):
+        """
+        Copy generated extensions into the source tree.
+        This is done in the ``bazel_build`` method, so it's not necessary to
+        do again in the `build_ext` base class.
+        """
+
+    def bazel_build(self, ext: BazelExtension) -> None:  # noqa: C901
        """Runs the bazel build to create the package."""
-        with temp_fill_include_path("WORKSPACE"):
-            temp_path = Path(self.build_temp)
+        temp_path = Path(self.build_temp)

-            bazel_argv = [
-                "bazel",
-                "build",
-                ext.bazel_target,
-                f"--symlink_prefix={temp_path / 'bazel-'}",
-                f"--compilation_mode={'dbg' if self.debug else 'opt'}",
-                # C++17 is required by nanobind
-                f"--cxxopt={'/std:c++17' if IS_WINDOWS else '-std=c++17'}",
-            ]
+        # We round to the minor version, which makes rules_python
+        # look up the latest available patch version internally.
+        python_version = "{}.{}".format(*sys.version_info[:2])

-            if IS_WINDOWS:
-                # Link with python*.lib.
-                for library_dir in self.library_dirs:
-                    bazel_argv.append("--linkopt=/LIBPATH:" + library_dir)
-            elif IS_MAC:
-                if platform.machine() == "x86_64":
-                    # C++17 needs macOS 10.14 at minimum
-                    bazel_argv.append("--macos_minimum_os=10.14")
+        bazel_argv = [
+            "bazel",
+            "run",
+            ext.bazel_target,
+            f"--symlink_prefix={temp_path / 'bazel-'}",
+            f"--compilation_mode={'dbg' if self.debug else 'opt'}",
+            # C++17 is required by nanobind
+            f"--cxxopt={'/std:c++17' if IS_WINDOWS else '-std=c++17'}",
+            f"--@rules_python//python/config_settings:python_version={python_version}",
+        ]

-                    # cross-compilation for Mac ARM64 on GitHub Mac x86 runners.
-                    # ARCHFLAGS is set by cibuildwheel before macOS wheel builds.
-                    archflags = os.getenv("ARCHFLAGS", "")
-                    if "arm64" in archflags:
-                        bazel_argv.append("--cpu=darwin_arm64")
-                        bazel_argv.append("--macos_cpus=arm64")
+        if ext.py_limited_api:
+            bazel_argv += ["--@nanobind_bazel//:py-limited-api=cp312"]

-                elif platform.machine() == "arm64":
-                    bazel_argv.append("--macos_minimum_os=11.0")
+        if IS_WINDOWS:
+            # Link with python*.lib.
+            for library_dir in self.library_dirs:
+                bazel_argv.append("--linkopt=/LIBPATH:" + library_dir)
+        elif IS_MAC:
+            # C++17 needs macOS 10.14 at minimum
+            bazel_argv.append("--macos_minimum_os=10.14")

+        with _maybe_patch_toolchains():
            self.spawn(bazel_argv)

-            shared_lib_suffix = '.dll' if IS_WINDOWS else '.so'
-            ext_name = ext.target_name + shared_lib_suffix
-            ext_bazel_bin_path = temp_path / 'bazel-bin' / ext.relpath / ext_name
+        if IS_WINDOWS:
+            suffix = ".pyd"
+        else:
+            suffix = ".abi3.so" if ext.py_limited_api else ".so"

-            ext_dest_path = Path(self.get_ext_fullpath(ext.name))
-            shutil.copyfile(ext_bazel_bin_path, ext_dest_path)
+        # copy the Bazel build artifacts into setuptools' libdir,
+        # from where the wheel is built.
+        pkgname = "google_benchmark"
+        pythonroot = Path("bindings") / "python" / "google_benchmark"
+        srcdir = temp_path / "bazel-bin" / pythonroot
+        libdir = Path(self.build_lib) / pkgname
+        for root, dirs, files in os.walk(srcdir, topdown=True):
+            # exclude runfiles directories and children.
+            dirs[:] = [d for d in dirs if "runfiles" not in d]

-            # explicitly call `bazel shutdown` for graceful exit
-            self.spawn(["bazel", "shutdown"])
+            for f in files:
+                fp = Path(f)
+                should_copy = False
+                # we do not want the bare .so file included
+                # when building for ABI3, so we require a
+                # full and exact match on the file extension.
+                if "".join(fp.suffixes) == suffix or fp.suffix == ".pyi":
+                    should_copy = True
+                elif Path(root) == srcdir and f == "py.typed":
+                    # copy py.typed, but only at the package root.
+                    should_copy = True
+
+                if should_copy:
+                    shutil.copyfile(root / fp, libdir / fp)


 setuptools.setup(
-    cmdclass=dict(build_ext=BuildBazelExtension),
+    cmdclass={"build_ext": BuildBazelExtension},
+    package_data={"google_benchmark": ["py.typed", "*.pyi"]},
    ext_modules=[
        BazelExtension(
            name="google_benchmark._benchmark",
-            bazel_target="//bindings/python/google_benchmark:_benchmark",
+            bazel_target="//bindings/python/google_benchmark:benchmark_stubgen",
+            py_limited_api=py_limited_api,
        )
    ],
+    options=options,
 )
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@ -1,4 +1,4 @@
-# Allow the source files to find headers in src/
+# Allow the source files to find headers in src/
 include(GNUInstallDirs)
 include_directories(${PROJECT_SOURCE_DIR}/src)

@ -28,10 +28,20 @@ target_include_directories(benchmark PUBLIC
  $<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}/include>
 )

+set_property(
+  SOURCE benchmark.cc
+  APPEND
+  PROPERTY COMPILE_DEFINITIONS
+  BENCHMARK_VERSION="${VERSION}"
+)
+
 # libpfm, if available
 if (PFM_FOUND)
  target_link_libraries(benchmark PRIVATE PFM::libpfm)
  target_compile_definitions(benchmark PRIVATE -DHAVE_LIBPFM)
+  install(
+      FILES "${PROJECT_SOURCE_DIR}/cmake/Modules/FindPFM.cmake"
+      DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake/${PROJECT_NAME}")
 endif()

 # pthread affinity, if available
@ -57,6 +67,7 @@ endif()
 # We need extra libraries on Solaris
 if(${CMAKE_SYSTEM_NAME} MATCHES "SunOS")
  target_link_libraries(benchmark PRIVATE kstat)
+  set(BENCHMARK_PRIVATE_LINK_LIBRARIES -lkstat)
 endif()

 if (NOT BUILD_SHARED_LIBS)
@ -79,6 +90,7 @@ set(generated_dir "${PROJECT_BINARY_DIR}")
 set(version_config "${generated_dir}/${PROJECT_NAME}ConfigVersion.cmake")
 set(project_config "${generated_dir}/${PROJECT_NAME}Config.cmake")
 set(pkg_config "${generated_dir}/${PROJECT_NAME}.pc")
+set(pkg_config_main "${generated_dir}/${PROJECT_NAME}_main.pc")
 set(targets_to_export benchmark benchmark_main)
 set(targets_export_name "${PROJECT_NAME}Targets")

@ -98,6 +110,7 @@ write_basic_package_version_file(
 )

 configure_file("${PROJECT_SOURCE_DIR}/cmake/benchmark.pc.in" "${pkg_config}" @ONLY)
+configure_file("${PROJECT_SOURCE_DIR}/cmake/benchmark_main.pc.in" "${pkg_config_main}" @ONLY)

 export (
  TARGETS ${targets_to_export}
@ -126,7 +139,7 @@ if (BENCHMARK_ENABLE_INSTALL)
      DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake/${PROJECT_NAME}")

  install(
-      FILES "${pkg_config}"
+      FILES "${pkg_config}" "${pkg_config_main}"
      DESTINATION "${CMAKE_INSTALL_LIBDIR}/pkgconfig")

  install(
--- a/src/benchmark.cc
+++ b/src/benchmark.cc
@ -46,7 +46,6 @@
 #include "commandlineflags.h"
 #include "complexity.h"
 #include "counter.h"
-#include "internal_macros.h"
 #include "log.h"
 #include "mutex.h"
 #include "perf_counters.h"
@ -92,6 +91,11 @@ BM_DEFINE_double(benchmark_min_warmup_time, 0.0);
 // standard deviation of the runs will be reported.
 BM_DEFINE_int32(benchmark_repetitions, 1);

+// If enabled, forces each benchmark to execute exactly one iteration and one
+// repetition, bypassing any configured
+// MinTime()/MinWarmUpTime()/Iterations()/Repetitions() settings.
+BM_DEFINE_bool(benchmark_dry_run, false);
+
 // If set, enable random interleaving of repetitions of all benchmarks.
 // See http://github.com/google/benchmark/issues/1051 for details.
 BM_DEFINE_bool(benchmark_enable_random_interleaving, false);
@ -146,21 +150,34 @@ BM_DEFINE_int32(v, 0);

 namespace internal {

+// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
 std::map<std::string, std::string>* global_context = nullptr;

 BENCHMARK_EXPORT std::map<std::string, std::string>*& GetGlobalContext() {
  return global_context;
 }

-// FIXME: wouldn't LTO mess this up?
-void UseCharPointer(char const volatile*) {}
+namespace {
+// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
+void const volatile* volatile global_force_escape_pointer;
+}  // namespace
+
+// FIXME: Verify if LTO still messes this up?
+void UseCharPointer(char const volatile* const v) {
+  // We want to escape the pointer `v` so that the compiler cannot eliminate
+  // the computations that produced it. To do that, we store the pointer into
+  // a volatile variable, since a volatile store is generally not something
+  // the compiler is allowed to elide.
+  global_force_escape_pointer = reinterpret_cast<void const volatile*>(v);
+}

 }  // namespace internal

 State::State(std::string name, IterationCount max_iters,
             const std::vector<int64_t>& ranges, int thread_i, int n_threads,
             internal::ThreadTimer* timer, internal::ThreadManager* manager,
-             internal::PerfCountersMeasurement* perf_counters_measurement)
+             internal::PerfCountersMeasurement* perf_counters_measurement,
+             ProfilerManager* profiler_manager)
    : total_iterations_(0),
      batch_leftover_(0),
      max_iterations(max_iters),
@ -174,7 +191,8 @@ State::State(std::string name, IterationCount max_iters,
      threads_(n_threads),
      timer_(timer),
      manager_(manager),
-      perf_counters_measurement_(perf_counters_measurement) {
+      perf_counters_measurement_(perf_counters_measurement),
+      profiler_manager_(profiler_manager) {
  BM_CHECK(max_iterations != 0) << "At least one iteration must be run";
  BM_CHECK_LT(thread_index_, threads_)
      << "thread_index must be less than threads";
@ -183,7 +201,7 @@ State::State(std::string name, IterationCount max_iters,
  // `PauseTiming`, a new `Counter` will be inserted the first time, which
  // won't have the flag.  Inserting them now also reduces the allocations
  // during the benchmark.
-  if (perf_counters_measurement_) {
+  if (perf_counters_measurement_ != nullptr) {
    for (const std::string& counter_name :
         perf_counters_measurement_->names()) {
      counters[counter_name] = Counter(0.0, Counter::kAvgIterations);
@ -199,7 +217,7 @@ State::State(std::string name, IterationCount max_iters,
 #if defined(__INTEL_COMPILER)
 #pragma warning push
 #pragma warning(disable : 1875)
-#elif defined(__GNUC__)
+#elif defined(__GNUC__) || defined(__clang__)
 #pragma GCC diagnostic push
 #pragma GCC diagnostic ignored "-Winvalid-offsetof"
 #endif
@ -217,7 +235,7 @@ State::State(std::string name, IterationCount max_iters,
      offsetof(State, skipped_) <= (cache_line_size - sizeof(skipped_)), "");
 #if defined(__INTEL_COMPILER)
 #pragma warning pop
-#elif defined(__GNUC__)
+#elif defined(__GNUC__) || defined(__clang__)
 #pragma GCC diagnostic pop
 #endif
 #if defined(__NVCC__)
@ -232,7 +250,7 @@ void State::PauseTiming() {
  // Add in time accumulated so far
  BM_CHECK(started_ && !finished_ && !skipped());
  timer_->StopTimer();
-  if (perf_counters_measurement_) {
+  if (perf_counters_measurement_ != nullptr) {
    std::vector<std::pair<std::string, double>> measurements;
    if (!perf_counters_measurement_->Stop(measurements)) {
      BM_CHECK(false) << "Perf counters read the value failed.";
@ -250,7 +268,7 @@ void State::PauseTiming() {
 void State::ResumeTiming() {
  BM_CHECK(started_ && !finished_ && !skipped());
  timer_->StartTimer();
-  if (perf_counters_measurement_) {
+  if (perf_counters_measurement_ != nullptr) {
    perf_counters_measurement_->Start();
  }
 }
@ -265,7 +283,9 @@ void State::SkipWithMessage(const std::string& msg) {
    }
  }
  total_iterations_ = 0;
-  if (timer_->running()) timer_->StopTimer();
+  if (timer_->running()) {
+    timer_->StopTimer();
+  }
 }

 void State::SkipWithError(const std::string& msg) {
@ -278,7 +298,9 @@ void State::SkipWithError(const std::string& msg) {
    }
  }
  total_iterations_ = 0;
-  if (timer_->running()) timer_->StopTimer();
+  if (timer_->running()) {
+    timer_->StopTimer();
+  }
 }

 void State::SetIterationTime(double seconds) {
@ -294,8 +316,13 @@ void State::StartKeepRunning() {
  BM_CHECK(!started_ && !finished_);
  started_ = true;
  total_iterations_ = skipped() ? 0 : max_iterations;
+  if (BENCHMARK_BUILTIN_EXPECT(profiler_manager_ != nullptr, false)) {
+    profiler_manager_->AfterSetupStart();
+  }
  manager_->StartStopBarrier();
-  if (!skipped()) ResumeTiming();
+  if (!skipped()) {
+    ResumeTiming();
+  }
 }

 void State::FinishKeepRunning() {
@ -307,6 +334,9 @@ void State::FinishKeepRunning() {
  total_iterations_ = 0;
  finished_ = true;
  manager_->StartStopBarrier();
+  if (BENCHMARK_BUILTIN_EXPECT(profiler_manager_ != nullptr, false)) {
+    profiler_manager_->BeforeTeardownStop();
+  }
 }

 namespace internal {
@ -315,7 +345,9 @@ namespace {
 // Flushes streams after invoking reporter methods that write to them. This
 // ensures users get timely updates even when streams are not line-buffered.
 void FlushStreams(BenchmarkReporter* reporter) {
-  if (!reporter) return;
+  if (reporter == nullptr) {
+    return;
+  }
  std::flush(reporter->GetOutputStream());
  std::flush(reporter->GetErrorStream());
 }
@ -328,16 +360,20 @@ void Report(BenchmarkReporter* display_reporter,
    assert(reporter);
    // If there are no aggregates, do output non-aggregates.
    aggregates_only &= !results.aggregates_only.empty();
-    if (!aggregates_only) reporter->ReportRuns(results.non_aggregates);
-    if (!results.aggregates_only.empty())
+    if (!aggregates_only) {
+      reporter->ReportRuns(results.non_aggregates);
+    }
+    if (!results.aggregates_only.empty()) {
      reporter->ReportRuns(results.aggregates_only);
+    }
  };

  report_one(display_reporter, run_results.display_report_aggregates_only,
             run_results);
-  if (file_reporter)
+  if (file_reporter != nullptr) {
    report_one(file_reporter, run_results.file_report_aggregates_only,
               run_results);
+  }

  FlushStreams(display_reporter);
  FlushStreams(file_reporter);
@ -358,10 +394,13 @@ void RunBenchmarks(const std::vector<BenchmarkInstance>& benchmarks,
        std::max<size_t>(name_field_width, benchmark.name().str().size());
    might_have_aggregates |= benchmark.repetitions() > 1;

-    for (const auto& Stat : benchmark.statistics())
+    for (const auto& Stat : benchmark.statistics()) {
      stat_field_width = std::max<size_t>(stat_field_width, Stat.name_.size());
+    }
+  }
+  if (might_have_aggregates) {
+    name_field_width += 1 + stat_field_width;
  }
-  if (might_have_aggregates) name_field_width += 1 + stat_field_width;

  // Print header here
  BenchmarkReporter::Context context;
@ -372,7 +411,7 @@ void RunBenchmarks(const std::vector<BenchmarkInstance>& benchmarks,
      per_family_reports;

  if (display_reporter->ReportContext(context) &&
-      (!file_reporter || file_reporter->ReportContext(context))) {
+      ((file_reporter == nullptr) || file_reporter->ReportContext(context))) {
    FlushStreams(display_reporter);
    FlushStreams(file_reporter);

@ -394,14 +433,17 @@ void RunBenchmarks(const std::vector<BenchmarkInstance>& benchmarks,
    // Loop through all benchmarks
    for (const BenchmarkInstance& benchmark : benchmarks) {
      BenchmarkReporter::PerFamilyRunReports* reports_for_family = nullptr;
-      if (benchmark.complexity() != oNone)
+      if (benchmark.complexity() != oNone) {
        reports_for_family = &per_family_reports[benchmark.family_index()];
-      benchmarks_with_threads += (benchmark.threads() > 1);
+      }
+      benchmarks_with_threads += static_cast<int>(benchmark.threads() > 1);
      runners.emplace_back(benchmark, &perfcounters, reports_for_family);
      int num_repeats_of_this_instance = runners.back().GetNumRepeats();
-      num_repetitions_total += num_repeats_of_this_instance;
-      if (reports_for_family)
+      num_repetitions_total +=
+          static_cast<size_t>(num_repeats_of_this_instance);
+      if (reports_for_family != nullptr) {
        reports_for_family->num_runs_total += num_repeats_of_this_instance;
+      }
    }
    assert(runners.size() == benchmarks.size() && "Unexpected runner count.");

@ -436,14 +478,17 @@ void RunBenchmarks(const std::vector<BenchmarkInstance>& benchmarks,
    for (size_t repetition_index : repetition_indices) {
      internal::BenchmarkRunner& runner = runners[repetition_index];
      runner.DoOneRepetition();
-      if (runner.HasRepeatsRemaining()) continue;
+      if (runner.HasRepeatsRemaining()) {
+        continue;
+      }
      // FIXME: report each repetition separately, not all of them in bulk.

      display_reporter->ReportRunsConfig(
          runner.GetMinTime(), runner.HasExplicitIters(), runner.GetIters());
-      if (file_reporter)
+      if (file_reporter != nullptr) {
        file_reporter->ReportRunsConfig(
            runner.GetMinTime(), runner.HasExplicitIters(), runner.GetIters());
+      }

      RunResults run_results = runner.GetResults();

@ -464,7 +509,9 @@ void RunBenchmarks(const std::vector<BenchmarkInstance>& benchmarks,
    }
  }
  display_reporter->Finalize();
-  if (file_reporter) file_reporter->Finalize();
+  if (file_reporter != nullptr) {
+    file_reporter->Finalize();
+  }
  FlushStreams(display_reporter);
  FlushStreams(file_reporter);
 }
@ -486,6 +533,7 @@ std::unique_ptr<BenchmarkReporter> CreateReporter(
    return PtrType(new CSVReporter());
  }
  std::cerr << "Unexpected format: '" << name << "'\n";
+  std::flush(std::cerr);
  std::exit(1);
 }

@ -524,7 +572,7 @@ ConsoleReporter::OutputOptions GetOutputOptions(bool force_no_color) {
 }  // end namespace internal

 BenchmarkReporter* CreateDefaultDisplayReporter() {
-  static auto default_display_reporter =
+  static auto* default_display_reporter =
      internal::CreateReporter(FLAGS_benchmark_format,
                               internal::GetOutputOptions())
          .release();
@ -558,14 +606,15 @@ size_t RunSpecifiedBenchmarks(BenchmarkReporter* display_reporter,
 size_t RunSpecifiedBenchmarks(BenchmarkReporter* display_reporter,
                              BenchmarkReporter* file_reporter,
                              std::string spec) {
-  if (spec.empty() || spec == "all")
+  if (spec.empty() || spec == "all") {
    spec = ".";  // Regexp that matches all benchmarks
+  }

  // Setup the reporters
  std::ofstream output_file;
  std::unique_ptr<BenchmarkReporter> default_display_reporter;
  std::unique_ptr<BenchmarkReporter> default_file_reporter;
-  if (!display_reporter) {
+  if (display_reporter == nullptr) {
    default_display_reporter.reset(CreateDefaultDisplayReporter());
    display_reporter = default_display_reporter.get();
  }
@ -573,19 +622,22 @@ size_t RunSpecifiedBenchmarks(BenchmarkReporter* display_reporter,
  auto& Err = display_reporter->GetErrorStream();

  std::string const& fname = FLAGS_benchmark_out;
-  if (fname.empty() && file_reporter) {
+  if (fname.empty() && (file_reporter != nullptr)) {
    Err << "A custom file reporter was provided but "
-           "--benchmark_out=<file> was not specified."
-        << std::endl;
+           "--benchmark_out=<file> was not specified.\n";
+    Out.flush();
+    Err.flush();
    std::exit(1);
  }
  if (!fname.empty()) {
    output_file.open(fname);
    if (!output_file.is_open()) {
-      Err << "invalid file name: '" << fname << "'" << std::endl;
+      Err << "invalid file name: '" << fname << "'\n";
+      Out.flush();
+      Err.flush();
      std::exit(1);
    }
-    if (!file_reporter) {
+    if (file_reporter == nullptr) {
      default_file_reporter = internal::CreateReporter(
          FLAGS_benchmark_out_format, FLAGS_benchmark_counters_tabular
                                          ? ConsoleReporter::OO_Tabular
@ -597,20 +649,29 @@ size_t RunSpecifiedBenchmarks(BenchmarkReporter* display_reporter,
  }

  std::vector<internal::BenchmarkInstance> benchmarks;
-  if (!FindBenchmarksInternal(spec, &benchmarks, &Err)) return 0;
+  if (!FindBenchmarksInternal(spec, &benchmarks, &Err)) {
+    Out.flush();
+    Err.flush();
+    return 0;
+  }

  if (benchmarks.empty()) {
    Err << "Failed to match any benchmarks against regex: " << spec << "\n";
+    Out.flush();
+    Err.flush();
    return 0;
  }

  if (FLAGS_benchmark_list_tests) {
-    for (auto const& benchmark : benchmarks)
+    for (auto const& benchmark : benchmarks) {
      Out << benchmark.name().str() << "\n";
+    }
  } else {
    internal::RunBenchmarks(benchmarks, display_reporter, file_reporter);
  }

+  Out.flush();
+  Err.flush();
  return benchmarks.size();
 }

@ -635,6 +696,14 @@ void RegisterMemoryManager(MemoryManager* manager) {
  internal::memory_manager = manager;
 }

+void RegisterProfilerManager(ProfilerManager* manager) {
+  // Don't allow overwriting an existing manager.
+  if (manager != nullptr) {
+    BM_CHECK_EQ(internal::profiler_manager, nullptr);
+  }
+  internal::profiler_manager = manager;
+}
+
 void AddCustomContext(const std::string& key, const std::string& value) {
  if (internal::global_context == nullptr) {
    internal::global_context = new std::map<std::string, std::string>();
@ -651,7 +720,9 @@ void (*HelperPrintf)();

 void PrintUsageAndExit() {
  HelperPrintf();
-  exit(0);
+  std::flush(std::cout);
+  std::flush(std::cerr);
+  std::exit(0);
 }

 void SetDefaultTimeUnitFromFlag(const std::string& time_unit_flag) {
@ -675,8 +746,8 @@ void SetDefaultTimeUnitFromFlag(const std::string& time_unit_flag) {
 void ParseCommandLineFlags(int* argc, char** argv) {
  using namespace benchmark;
  BenchmarkReporter::Context::executable_name =
-      (argc && *argc > 0) ? argv[0] : "unknown";
-  for (int i = 1; argc && i < *argc; ++i) {
+      ((argc != nullptr) && *argc > 0) ? argv[0] : "unknown";
+  for (int i = 1; (argc != nullptr) && i < *argc; ++i) {
    if (ParseBoolFlag(argv[i], "benchmark_list_tests",
                      &FLAGS_benchmark_list_tests) ||
        ParseStringFlag(argv[i], "benchmark_filter", &FLAGS_benchmark_filter) ||
@ -686,6 +757,7 @@ void ParseCommandLineFlags(int* argc, char** argv) {
                        &FLAGS_benchmark_min_warmup_time) ||
        ParseInt32Flag(argv[i], "benchmark_repetitions",
                       &FLAGS_benchmark_repetitions) ||
+        ParseBoolFlag(argv[i], "benchmark_dry_run", &FLAGS_benchmark_dry_run) ||
        ParseBoolFlag(argv[i], "benchmark_enable_random_interleaving",
                      &FLAGS_benchmark_enable_random_interleaving) ||
        ParseBoolFlag(argv[i], "benchmark_report_aggregates_only",
@ -706,7 +778,9 @@ void ParseCommandLineFlags(int* argc, char** argv) {
        ParseStringFlag(argv[i], "benchmark_time_unit",
                        &FLAGS_benchmark_time_unit) ||
        ParseInt32Flag(argv[i], "v", &FLAGS_v)) {
-      for (int j = i; j != *argc - 1; ++j) argv[j] = argv[j + 1];
+      for (int j = i; j != *argc - 1; ++j) {
+        argv[j] = argv[j + 1];
+      }

      --(*argc);
      --i;
@ -724,6 +798,9 @@ void ParseCommandLineFlags(int* argc, char** argv) {
  if (FLAGS_benchmark_color.empty()) {
    PrintUsageAndExit();
  }
+  if (FLAGS_benchmark_dry_run) {
+    AddCustomContext("dry_run", "true");
+  }
  for (const auto& kv : FLAGS_benchmark_context) {
    AddCustomContext(kv.first, kv.second);
  }
@ -736,6 +813,14 @@ int InitializeStreams() {

 }  // end namespace internal

+std::string GetBenchmarkVersion() {
+#ifdef BENCHMARK_VERSION
+  return {BENCHMARK_VERSION};
+#else
+  return {""};
+#endif
+}
+
 void PrintDefaultHelp() {
  fprintf(stdout,
          "benchmark"
@ -744,6 +829,7 @@ void PrintDefaultHelp() {
          "          [--benchmark_min_time=`<integer>x` OR `<float>s` ]\n"
          "          [--benchmark_min_warmup_time=<min_warmup_time>]\n"
          "          [--benchmark_repetitions=<num_repetitions>]\n"
+          "          [--benchmark_dry_run={true|false}]\n"
          "          [--benchmark_enable_random_interleaving={true|false}]\n"
          "          [--benchmark_report_aggregates_only={true|false}]\n"
          "          [--benchmark_display_aggregates_only={true|false}]\n"
--- a/src/benchmark_api_internal.cc
+++ b/src/benchmark_api_internal.cc
@ -27,7 +27,9 @@ BenchmarkInstance::BenchmarkInstance(Benchmark* benchmark, int family_idx,
      min_time_(benchmark_.min_time_),
      min_warmup_time_(benchmark_.min_warmup_time_),
      iterations_(benchmark_.iterations_),
-      threads_(thread_count) {
+      threads_(thread_count),
+      setup_(benchmark_.setup_),
+      teardown_(benchmark_.teardown_) {
  name_.function_name = benchmark_.name_;

  size_t arg_i = 0;
@ -84,33 +86,31 @@ BenchmarkInstance::BenchmarkInstance(Benchmark* benchmark, int family_idx,
  if (!benchmark_.thread_counts_.empty()) {
    name_.threads = StrFormat("threads:%d", threads_);
  }
-
-  setup_ = benchmark_.setup_;
-  teardown_ = benchmark_.teardown_;
 }

 State BenchmarkInstance::Run(
    IterationCount iters, int thread_id, internal::ThreadTimer* timer,
    internal::ThreadManager* manager,
-    internal::PerfCountersMeasurement* perf_counters_measurement) const {
+    internal::PerfCountersMeasurement* perf_counters_measurement,
+    ProfilerManager* profiler_manager) const {
  State st(name_.function_name, iters, args_, thread_id, threads_, timer,
-           manager, perf_counters_measurement);
+           manager, perf_counters_measurement, profiler_manager);
  benchmark_.Run(st);
  return st;
 }

 void BenchmarkInstance::Setup() const {
-  if (setup_) {
+  if (setup_ != nullptr) {
    State st(name_.function_name, /*iters*/ 1, args_, /*thread_id*/ 0, threads_,
-             nullptr, nullptr, nullptr);
+             nullptr, nullptr, nullptr, nullptr);
    setup_(st);
  }
 }

 void BenchmarkInstance::Teardown() const {
-  if (teardown_) {
+  if (teardown_ != nullptr) {
    State st(name_.function_name, /*iters*/ 1, args_, /*thread_id*/ 0, threads_,
-             nullptr, nullptr, nullptr);
+             nullptr, nullptr, nullptr, nullptr);
    teardown_(st);
  }
 }
--- a/src/benchmark_api_internal.h
+++ b/src/benchmark_api_internal.h
@ -17,9 +17,9 @@ namespace internal {
 // Information kept per benchmark we may want to run
 class BenchmarkInstance {
 public:
-  BenchmarkInstance(Benchmark* benchmark, int family_index,
-                    int per_family_instance_index,
-                    const std::vector<int64_t>& args, int threads);
+  BenchmarkInstance(Benchmark* benchmark, int family_idx,
+                    int per_family_instance_idx,
+                    const std::vector<int64_t>& args, int thread_count);

  const BenchmarkName& name() const { return name_; }
  int family_index() const { return family_index_; }
@ -41,10 +41,14 @@ class BenchmarkInstance {
  int threads() const { return threads_; }
  void Setup() const;
  void Teardown() const;
+  const auto& GetUserThreadRunnerFactory() const {
+    return benchmark_.threadrunner_;
+  }

  State Run(IterationCount iters, int thread_id, internal::ThreadTimer* timer,
            internal::ThreadManager* manager,
-            internal::PerfCountersMeasurement* perf_counters_measurement) const;
+            internal::PerfCountersMeasurement* perf_counters_measurement,
+            ProfilerManager* profiler_manager) const;

 private:
  BenchmarkName name_;
@ -67,9 +71,8 @@ class BenchmarkInstance {
  IterationCount iterations_;
  int threads_;  // Number of concurrent threads to use

-  typedef void (*callback_function)(const benchmark::State&);
-  callback_function setup_ = nullptr;
-  callback_function teardown_ = nullptr;
+  callback_function setup_;
+  callback_function teardown_;
 };

 bool FindBenchmarksInternal(const std::string& re,
--- a/src/benchmark_main.cc
+++ b/src/benchmark_main.cc
@ -14,5 +14,5 @@

 #include "benchmark/benchmark.h"

-BENCHMARK_EXPORT int main(int, char**);
+BENCHMARK_EXPORT int main(int /*argc*/, char** /*argv*/);
 BENCHMARK_MAIN();
--- a/src/benchmark_name.cc
+++ b/src/benchmark_name.cc
@ -27,8 +27,8 @@ size_t size_impl(const Head& head, const Tail&... tail) {
 }

 // Join a pack of std::strings using a delimiter
-// TODO: use absl::StrJoin
-void join_impl(std::string&, char) {}
+// TODO(dominic): use absl::StrJoin
+void join_impl(std::string& /*unused*/, char /*unused*/) {}

 template <typename Head, typename... Tail>
 void join_impl(std::string& s, const char delimiter, const Head& head,
--- a/src/benchmark_register.cc
+++ b/src/benchmark_register.cc
@ -53,13 +53,13 @@ namespace benchmark {

 namespace {
 // For non-dense Range, intermediate values are powers of kRangeMultiplier.
-static constexpr int kRangeMultiplier = 8;
+constexpr int kRangeMultiplier = 8;

 // The size of a benchmark family is the number of inputs to repeat
 // the benchmark on. If this is "large" then warn the user during configuration.
-static constexpr size_t kMaxFamilySize = 100;
+constexpr size_t kMaxFamilySize = 100;

-static constexpr char kDisabledPrefix[] = "DISABLED_";
+constexpr char kDisabledPrefix[] = "DISABLED_";
 }  // end namespace

 namespace internal {
@ -82,7 +82,7 @@ class BenchmarkFamilies {

  // Extract the list of benchmark instances that match the specified
  // regular expression.
-  bool FindBenchmarks(std::string re,
+  bool FindBenchmarks(std::string spec,
                      std::vector<BenchmarkInstance>* benchmarks,
                      std::ostream* Err);

@ -125,7 +125,7 @@ bool BenchmarkFamilies::FindBenchmarks(
    is_negative_filter = true;
  }
  if (!re.Init(spec, &error_msg)) {
-    Err << "Could not compile benchmark re: " << error_msg << std::endl;
+    Err << "Could not compile benchmark re: " << error_msg << '\n';
    return false;
  }

@ -140,7 +140,9 @@ bool BenchmarkFamilies::FindBenchmarks(
    int per_family_instance_index = 0;

    // Family was deleted or benchmark doesn't match
-    if (!family) continue;
+    if (!family) {
+      continue;
+    }

    if (family->ArgsCnt() == -1) {
      family->Args({});
@ -159,7 +161,9 @@ bool BenchmarkFamilies::FindBenchmarks(
    // reserve in the special case the regex ".", since we know the final
    // family size.  this doesn't take into account any disabled benchmarks
    // so worst case we reserve more than we need.
-    if (spec == ".") benchmarks->reserve(benchmarks->size() + family_size);
+    if (spec == ".") {
+      benchmarks->reserve(benchmarks->size() + family_size);
+    }

    for (auto const& args : family->args_) {
      for (int num_threads : *thread_counts) {
@ -177,7 +181,9 @@ bool BenchmarkFamilies::FindBenchmarks(

          // Only bump the next family index once we've established that
          // at least one instance of this family will be run.
-          if (next_family_index == family_index) ++next_family_index;
+          if (next_family_index == family_index) {
+            ++next_family_index;
+          }
        }
      }
    }
@ -185,11 +191,11 @@ bool BenchmarkFamilies::FindBenchmarks(
  return true;
 }

-Benchmark* RegisterBenchmarkInternal(Benchmark* bench) {
-  std::unique_ptr<Benchmark> bench_ptr(bench);
+Benchmark* RegisterBenchmarkInternal(std::unique_ptr<Benchmark> bench) {
+  Benchmark* bench_ptr = bench.get();
  BenchmarkFamilies* families = BenchmarkFamilies::GetInstance();
-  families->AddBenchmark(std::move(bench_ptr));
-  return bench;
+  families->AddBenchmark(std::move(bench));
+  return bench_ptr;
 }

 // FIXME: This function is a hack so that benchmark.cc can access
@ -218,9 +224,7 @@ Benchmark::Benchmark(const std::string& name)
      use_real_time_(false),
      use_manual_time_(false),
      complexity_(oNone),
-      complexity_lambda_(nullptr),
-      setup_(nullptr),
-      teardown_(nullptr) {
+      complexity_lambda_(nullptr) {
  ComputeStatistics("mean", StatisticsMean);
  ComputeStatistics("median", StatisticsMedian);
  ComputeStatistics("stddev", StatisticsStdDev);
@ -331,13 +335,25 @@ Benchmark* Benchmark::Apply(void (*custom_arguments)(Benchmark* benchmark)) {
  return this;
 }

-Benchmark* Benchmark::Setup(void (*setup)(const benchmark::State&)) {
+Benchmark* Benchmark::Setup(callback_function&& setup) {
+  BM_CHECK(setup != nullptr);
+  setup_ = std::forward<callback_function>(setup);
+  return this;
+}
+
+Benchmark* Benchmark::Setup(const callback_function& setup) {
  BM_CHECK(setup != nullptr);
  setup_ = setup;
  return this;
 }

-Benchmark* Benchmark::Teardown(void (*teardown)(const benchmark::State&)) {
+Benchmark* Benchmark::Teardown(callback_function&& teardown) {
+  BM_CHECK(teardown != nullptr);
+  teardown_ = std::forward<callback_function>(teardown);
+  return this;
+}
+
+Benchmark* Benchmark::Teardown(const callback_function& teardown) {
  BM_CHECK(teardown != nullptr);
  teardown_ = teardown;
  return this;
@ -468,13 +484,20 @@ Benchmark* Benchmark::ThreadPerCpu() {
  return this;
 }

+Benchmark* Benchmark::ThreadRunner(threadrunner_factory&& factory) {
+  threadrunner_ = std::move(factory);
+  return this;
+}
+
 void Benchmark::SetName(const std::string& name) { name_ = name; }

 const char* Benchmark::GetName() const { return name_.c_str(); }

 int Benchmark::ArgsCnt() const {
  if (args_.empty()) {
-    if (arg_names_.empty()) return -1;
+    if (arg_names_.empty()) {
+      return -1;
+    }
    return static_cast<int>(arg_names_.size());
  }
  return static_cast<int>(args_.front().size());
@ -482,8 +505,9 @@ int Benchmark::ArgsCnt() const {

 const char* Benchmark::GetArgName(int arg) const {
  BM_CHECK_GE(arg, 0);
-  BM_CHECK_LT(arg, static_cast<int>(arg_names_.size()));
-  return arg_names_[arg].c_str();
+  size_t uarg = static_cast<size_t>(arg);
+  BM_CHECK_LT(uarg, arg_names_.size());
+  return arg_names_[uarg].c_str();
 }

 TimeUnit Benchmark::GetTimeUnit() const {
--- a/src/benchmark_register.h
+++ b/src/benchmark_register.h
@ -24,7 +24,7 @@ typename std::vector<T>::iterator AddPowers(std::vector<T>* dst, T lo, T hi,
  static const T kmax = std::numeric_limits<T>::max();

  // Space out the values in multiples of "mult"
-  for (T i = static_cast<T>(1); i <= hi; i *= static_cast<T>(mult)) {
+  for (T i = static_cast<T>(1); i <= hi; i = static_cast<T>(i * mult)) {
    if (i >= lo) {
      dst->push_back(i);
    }
@ -52,7 +52,7 @@ void AddNegatedPowers(std::vector<T>* dst, T lo, T hi, int mult) {

  const auto it = AddPowers(dst, hi_complement, lo_complement, mult);

-  std::for_each(it, dst->end(), [](T& t) { t *= -1; });
+  std::for_each(it, dst->end(), [](T& t) { t = static_cast<T>(t * -1); });
  std::reverse(it, dst->end());
 }

--- a/src/benchmark_runner.cc
+++ b/src/benchmark_runner.cc
@ -34,6 +34,7 @@
 #include <cstdio>
 #include <cstdlib>
 #include <fstream>
+#include <functional>
 #include <iostream>
 #include <limits>
 #include <memory>
@ -46,7 +47,6 @@
 #include "commandlineflags.h"
 #include "complexity.h"
 #include "counter.h"
-#include "internal_macros.h"
 #include "log.h"
 #include "mutex.h"
 #include "perf_counters.h"
@ -58,13 +58,23 @@

 namespace benchmark {

+BM_DECLARE_bool(benchmark_dry_run);
+BM_DECLARE_string(benchmark_min_time);
+BM_DECLARE_double(benchmark_min_warmup_time);
+BM_DECLARE_int32(benchmark_repetitions);
+BM_DECLARE_bool(benchmark_report_aggregates_only);
+BM_DECLARE_bool(benchmark_display_aggregates_only);
+BM_DECLARE_string(benchmark_perf_counters);
+
 namespace internal {

 MemoryManager* memory_manager = nullptr;

+ProfilerManager* profiler_manager = nullptr;
+
 namespace {

-static constexpr IterationCount kMaxIterations = 1000000000;
+constexpr IterationCount kMaxIterations = 1000000000000;
 const double kDefaultMinTime =
    std::strtod(::benchmark::kDefaultMinTimeStr, /*p_end*/ nullptr);

@ -72,7 +82,7 @@ BenchmarkReporter::Run CreateRunReport(
    const benchmark::internal::BenchmarkInstance& b,
    const internal::ThreadManager::Result& results,
    IterationCount memory_iterations,
-    const MemoryManager::Result* memory_result, double seconds,
+    const MemoryManager::Result& memory_result, double seconds,
    int64_t repetition_index, int64_t repeats) {
  // Create report about this benchmark run.
  BenchmarkReporter::Run report;
@ -90,12 +100,13 @@ BenchmarkReporter::Run CreateRunReport(
  report.repetition_index = repetition_index;
  report.repetitions = repeats;

-  if (!report.skipped) {
+  if (report.skipped == 0u) {
    if (b.use_manual_time()) {
      report.real_accumulated_time = results.manual_time_used;
    } else {
      report.real_accumulated_time = results.real_time_used;
    }
+    report.use_real_time_for_initial_big_o = b.use_manual_time();
    report.cpu_accumulated_time = results.cpu_time_used;
    report.complexity_n = results.complexity_n;
    report.complexity = b.complexity();
@ -104,12 +115,12 @@ BenchmarkReporter::Run CreateRunReport(
    report.counters = results.counters;

    if (memory_iterations > 0) {
-      assert(memory_result != nullptr);
      report.memory_result = memory_result;
      report.allocs_per_iter =
-          memory_iterations ? static_cast<double>(memory_result->num_allocs) /
-                                  memory_iterations
-                            : 0;
+          memory_iterations != 0
+              ? static_cast<double>(memory_result.num_allocs) /
+                    static_cast<double>(memory_iterations)
+              : 0;
    }

    internal::Finish(&report.counters, results.iterations, seconds,
@ -122,14 +133,15 @@ BenchmarkReporter::Run CreateRunReport(
 // Adds the stats collected for the thread into manager->results.
 void RunInThread(const BenchmarkInstance* b, IterationCount iters,
                 int thread_id, ThreadManager* manager,
-                 PerfCountersMeasurement* perf_counters_measurement) {
+                 PerfCountersMeasurement* perf_counters_measurement,
+                 ProfilerManager* profiler_manager_) {
  internal::ThreadTimer timer(
      b->measure_process_cpu_time()
          ? internal::ThreadTimer::CreateProcessCpuTime()
          : internal::ThreadTimer::Create());

-  State st =
-      b->Run(iters, thread_id, &timer, manager, perf_counters_measurement);
+  State st = b->Run(iters, thread_id, &timer, manager,
+                    perf_counters_measurement, profiler_manager_);
  BM_CHECK(st.skipped() || st.iterations() >= st.max_iterations)
      << "Benchmark returned before State::KeepRunning() returned false!";
  {
@ -147,17 +159,23 @@ void RunInThread(const BenchmarkInstance* b, IterationCount iters,

 double ComputeMinTime(const benchmark::internal::BenchmarkInstance& b,
                      const BenchTimeType& iters_or_time) {
-  if (!IsZero(b.min_time())) return b.min_time();
+  if (!IsZero(b.min_time())) {
+    return b.min_time();
+  }
  // If the flag was used to specify number of iters, then return the default
  // min_time.
-  if (iters_or_time.tag == BenchTimeType::ITERS) return kDefaultMinTime;
+  if (iters_or_time.tag == BenchTimeType::ITERS) {
+    return kDefaultMinTime;
+  }

  return iters_or_time.time;
 }

 IterationCount ComputeIters(const benchmark::internal::BenchmarkInstance& b,
                            const BenchTimeType& iters_or_time) {
-  if (b.iterations() != 0) return b.iterations();
+  if (b.iterations() != 0) {
+    return b.iterations();
+  }

  // We've already concluded that this flag is currently used to pass
  // iters but do a check here again anyway.
@ -165,10 +183,42 @@ IterationCount ComputeIters(const benchmark::internal::BenchmarkInstance& b,
  return iters_or_time.iters;
 }

+class ThreadRunnerDefault : public ThreadRunnerBase {
+ public:
+  explicit ThreadRunnerDefault(int num_threads)
+      : pool(static_cast<size_t>(num_threads - 1)) {}
+
+  void RunThreads(const std::function<void(int)>& fn) final {
+    // Run all but one thread in separate threads
+    for (std::size_t ti = 0; ti < pool.size(); ++ti) {
+      pool[ti] = std::thread(fn, static_cast<int>(ti + 1));
+    }
+    // And run one thread here directly.
+    // (If we were asked to run just one thread, we don't create new threads.)
+    // Yes, we need to do this here *after* we start the separate threads.
+    fn(0);
+
+    // The main thread has finished. Now let's wait for the other threads.
+    for (std::thread& thread : pool) {
+      thread.join();
+    }
+  }
+
+ private:
+  std::vector<std::thread> pool;
+};
+
+std::unique_ptr<ThreadRunnerBase> GetThreadRunner(
+    const threadrunner_factory& userThreadRunnerFactory, int num_threads) {
+  return userThreadRunnerFactory
+             ? userThreadRunnerFactory(num_threads)
+             : std::make_unique<ThreadRunnerDefault>(num_threads);
+}
+
 }  // end namespace

 BenchTimeType ParseBenchMinTime(const std::string& value) {
-  BenchTimeType ret;
+  BenchTimeType ret = {};

  if (value.empty()) {
    ret.tag = BenchTimeType::TIME;
@ -177,7 +227,7 @@ BenchTimeType ParseBenchMinTime(const std::string& value) {
  }

  if (value.back() == 'x') {
-    char* p_end;
+    char* p_end = nullptr;
    // Reset errno before it's changed by strtol.
    errno = 0;
    IterationCount num_iters = std::strtol(value.c_str(), &p_end, 10);
@ -199,7 +249,7 @@ BenchTimeType ParseBenchMinTime(const std::string& value) {
                  "Eg., `30s` for 30-seconds.";
  }

-  char* p_end;
+  char* p_end = nullptr;
  // Reset errno before it's changed by strtod.
  errno = 0;
  double min_time = std::strtod(value.c_str(), &p_end);
@ -224,20 +274,30 @@ BenchmarkRunner::BenchmarkRunner(
    : b(b_),
      reports_for_family(reports_for_family_),
      parsed_benchtime_flag(ParseBenchMinTime(FLAGS_benchmark_min_time)),
-      min_time(ComputeMinTime(b_, parsed_benchtime_flag)),
-      min_warmup_time((!IsZero(b.min_time()) && b.min_warmup_time() > 0.0)
-                          ? b.min_warmup_time()
-                          : FLAGS_benchmark_min_warmup_time),
-      warmup_done(!(min_warmup_time > 0.0)),
-      repeats(b.repetitions() != 0 ? b.repetitions()
-                                   : FLAGS_benchmark_repetitions),
+      min_time(FLAGS_benchmark_dry_run
+                   ? 0
+                   : ComputeMinTime(b_, parsed_benchtime_flag)),
+      min_warmup_time(
+          FLAGS_benchmark_dry_run
+              ? 0
+              : ((!IsZero(b.min_time()) && b.min_warmup_time() > 0.0)
+                     ? b.min_warmup_time()
+                     : FLAGS_benchmark_min_warmup_time)),
+      warmup_done(FLAGS_benchmark_dry_run ? true : !(min_warmup_time > 0.0)),
+      repeats(FLAGS_benchmark_dry_run
+                  ? 1
+                  : (b.repetitions() != 0 ? b.repetitions()
+                                          : FLAGS_benchmark_repetitions)),
      has_explicit_iteration_count(b.iterations() != 0 ||
                                   parsed_benchtime_flag.tag ==
                                       BenchTimeType::ITERS),
-      pool(b.threads() - 1),
-      iters(has_explicit_iteration_count
-                ? ComputeIters(b_, parsed_benchtime_flag)
-                : 1),
+      thread_runner(
+          GetThreadRunner(b.GetUserThreadRunnerFactory(), b.threads())),
+      iters(FLAGS_benchmark_dry_run
+                ? 1
+                : (has_explicit_iteration_count
+                       ? ComputeIters(b_, parsed_benchtime_flag)
+                       : 1)),
      perf_counters_measurement_ptr(pcm_) {
  run_results.display_report_aggregates_only =
      (FLAGS_benchmark_report_aggregates_only ||
@ -246,10 +306,11 @@ BenchmarkRunner::BenchmarkRunner(
      FLAGS_benchmark_report_aggregates_only;
  if (b.aggregation_report_mode() != internal::ARM_Unspecified) {
    run_results.display_report_aggregates_only =
-        (b.aggregation_report_mode() &
-         internal::ARM_DisplayReportAggregatesOnly);
+        ((b.aggregation_report_mode() &
+          internal::ARM_DisplayReportAggregatesOnly) != 0u);
    run_results.file_report_aggregates_only =
-        (b.aggregation_report_mode() & internal::ARM_FileReportAggregatesOnly);
+        ((b.aggregation_report_mode() &
+          internal::ARM_FileReportAggregatesOnly) != 0u);
    BM_CHECK(FLAGS_benchmark_perf_counters.empty() ||
             (perf_counters_measurement_ptr->num_counters() == 0))
        << "Perf counters were requested but could not be set up.";
@ -262,19 +323,10 @@ BenchmarkRunner::IterationResults BenchmarkRunner::DoNIterations() {
  std::unique_ptr<internal::ThreadManager> manager;
  manager.reset(new internal::ThreadManager(b.threads()));

-  // Run all but one thread in separate threads
-  for (std::size_t ti = 0; ti < pool.size(); ++ti) {
-    pool[ti] = std::thread(&RunInThread, &b, iters, static_cast<int>(ti + 1),
-                           manager.get(), perf_counters_measurement_ptr);
-  }
-  // And run one thread here directly.
-  // (If we were asked to run just one thread, we don't create new threads.)
-  // Yes, we need to do this here *after* we start the separate threads.
-  RunInThread(&b, iters, 0, manager.get(), perf_counters_measurement_ptr);
-
-  // The main thread has finished. Now let's wait for the other threads.
-  manager->WaitForAllThreads();
-  for (std::thread& thread : pool) thread.join();
+  thread_runner->RunThreads([&](int thread_idx) {
+    RunInThread(&b, iters, thread_idx, manager.get(),
+                perf_counters_measurement_ptr, /*profiler_manager=*/nullptr);
+  });

  IterationResults i;
  // Acquire the measurements/counters from the manager, UNDER THE LOCK!
@ -286,12 +338,6 @@ BenchmarkRunner::IterationResults BenchmarkRunner::DoNIterations() {
  // And get rid of the manager.
  manager.reset();

-  // Adjust real/manual time stats since they were reported per thread.
-  i.results.real_time_used /= b.threads();
-  i.results.manual_time_used /= b.threads();
-  // If we were measuring whole-process CPU usage, adjust the CPU time too.
-  if (b.measure_process_cpu_time()) i.results.cpu_time_used /= b.threads();
-
  BM_VLOG(2) << "Ran in " << i.results.cpu_time_used << "/"
             << i.results.real_time_used << "\n";

@ -325,8 +371,8 @@ IterationCount BenchmarkRunner::PredictNumItersNeeded(

  // So what seems to be the sufficiently-large iteration count? Round up.
  const IterationCount max_next_iters = static_cast<IterationCount>(
-      std::lround(std::max(multiplier * static_cast<double>(i.iters),
-                           static_cast<double>(i.iters) + 1.0)));
+      std::llround(std::max(multiplier * static_cast<double>(i.iters),
+                            static_cast<double>(i.iters) + 1.0)));
  // But we do have *some* limits though..
  const IterationCount next_iters = std::min(max_next_iters, kMaxIterations);

@ -339,7 +385,7 @@ bool BenchmarkRunner::ShouldReportIterationResults(
  // Determine if this run should be reported;
  // Either it has run for a sufficient amount of time
  // or because an error was reported.
-  return i.results.skipped_ ||
+  return (i.results.skipped_ != 0u) || FLAGS_benchmark_dry_run ||
         i.iters >= kMaxIterations ||  // Too many iterations already.
         i.seconds >=
             GetMinTimeToApply() ||  // The elapsed time is large enough.
@ -400,6 +446,34 @@ void BenchmarkRunner::RunWarmUp() {
  }
 }

+MemoryManager::Result BenchmarkRunner::RunMemoryManager(
+    IterationCount memory_iterations) {
+  memory_manager->Start();
+  std::unique_ptr<internal::ThreadManager> manager;
+  manager.reset(new internal::ThreadManager(1));
+  b.Setup();
+  RunInThread(&b, memory_iterations, 0, manager.get(),
+              perf_counters_measurement_ptr,
+              /*profiler_manager=*/nullptr);
+  manager.reset();
+  b.Teardown();
+  MemoryManager::Result memory_result;
+  memory_manager->Stop(memory_result);
+  memory_result.memory_iterations = memory_iterations;
+  return memory_result;
+}
+
+void BenchmarkRunner::RunProfilerManager(IterationCount profile_iterations) {
+  std::unique_ptr<internal::ThreadManager> manager;
+  manager.reset(new internal::ThreadManager(1));
+  b.Setup();
+  RunInThread(&b, profile_iterations, 0, manager.get(),
+              /*perf_counters_measurement_ptr=*/nullptr,
+              /*profiler_manager=*/profiler_manager);
+  manager.reset();
+  b.Teardown();
+}
+
 void BenchmarkRunner::DoOneRepetition() {
  assert(HasRepeatsRemaining() && "Already done all repetitions?");

@ -410,7 +484,9 @@ void BenchmarkRunner::DoOneRepetition() {
  // this warmup never happened except the fact that warmup_done is set. Every
  // other manipulation of the BenchmarkRunner instance would be a bug! Please
  // fix it.
-  if (!warmup_done) RunWarmUp();
+  if (!warmup_done) {
+    RunWarmUp();
+  }

  IterationResults i;
  // We *may* be gradually increasing the length (iteration count)
@ -432,8 +508,10 @@ void BenchmarkRunner::DoOneRepetition() {
    const bool results_are_significant = !is_the_first_repetition ||
                                         has_explicit_iteration_count ||
                                         ShouldReportIterationResults(i);
-
-    if (results_are_significant) break;  // Good, let's report them!
+    // Good, let's report them!
+    if (results_are_significant) {
+      break;
+    }

    // Nope, bad iteration. Let's re-estimate the hopefully-sufficient
    // iteration count, and run the benchmark again...
@ -444,28 +522,21 @@ void BenchmarkRunner::DoOneRepetition() {
           "then we should have accepted the current iteration run.");
  }

-  // Oh, one last thing, we need to also produce the 'memory measurements'..
-  MemoryManager::Result* memory_result = nullptr;
+  // Produce memory measurements if requested.
+  MemoryManager::Result memory_result;
  IterationCount memory_iterations = 0;
  if (memory_manager != nullptr) {
-    // TODO(vyng): Consider making BenchmarkReporter::Run::memory_result an
-    // optional so we don't have to own the Result here.
-    // Can't do it now due to cxx03.
-    memory_results.push_back(MemoryManager::Result());
-    memory_result = &memory_results.back();
    // Only run a few iterations to reduce the impact of one-time
    // allocations in benchmarks that are not properly managed.
    memory_iterations = std::min<IterationCount>(16, iters);
-    memory_manager->Start();
-    std::unique_ptr<internal::ThreadManager> manager;
-    manager.reset(new internal::ThreadManager(1));
-    b.Setup();
-    RunInThread(&b, memory_iterations, 0, manager.get(),
-                perf_counters_measurement_ptr);
-    manager->WaitForAllThreads();
-    manager.reset();
-    b.Teardown();
-    memory_manager->Stop(*memory_result);
+    memory_result = RunMemoryManager(memory_iterations);
+  }
+
+  if (profiler_manager != nullptr) {
+    // We want to externally profile the benchmark for the same number of
+    // iterations because, for example, if we're tracing the benchmark then we
+    // want trace data to reasonably match PMU data.
+    RunProfilerManager(iters);
  }

  // Ok, now actually report.
@ -473,9 +544,11 @@ void BenchmarkRunner::DoOneRepetition() {
      CreateRunReport(b, i.results, memory_iterations, memory_result, i.seconds,
                      num_repetitions_done, repeats);

-  if (reports_for_family) {
+  if (reports_for_family != nullptr) {
    ++reports_for_family->num_runs_done;
-    if (!report.skipped) reports_for_family->Runs.push_back(report);
+    if (report.skipped == 0u) {
+      reports_for_family->Runs.push_back(report);
+    }
  }

  run_results.non_aggregates.push_back(report);
--- a/src/benchmark_runner.h
+++ b/src/benchmark_runner.h
@ -15,26 +15,20 @@
 #ifndef BENCHMARK_RUNNER_H_
 #define BENCHMARK_RUNNER_H_

+#include <memory>
 #include <thread>
 #include <vector>

 #include "benchmark_api_internal.h"
-#include "internal_macros.h"
 #include "perf_counters.h"
 #include "thread_manager.h"

 namespace benchmark {

-BM_DECLARE_string(benchmark_min_time);
-BM_DECLARE_double(benchmark_min_warmup_time);
-BM_DECLARE_int32(benchmark_repetitions);
-BM_DECLARE_bool(benchmark_report_aggregates_only);
-BM_DECLARE_bool(benchmark_display_aggregates_only);
-BM_DECLARE_string(benchmark_perf_counters);
-
 namespace internal {

 extern MemoryManager* memory_manager;
+extern ProfilerManager* profiler_manager;

 struct RunResults {
  std::vector<BenchmarkReporter::Run> non_aggregates;
@ -45,7 +39,7 @@ struct RunResults {
 };

 struct BENCHMARK_EXPORT BenchTimeType {
-  enum { ITERS, TIME } tag;
+  enum { UNSPECIFIED, ITERS, TIME } tag;
  union {
    IterationCount iters;
    double time;
@ -58,7 +52,7 @@ BenchTimeType ParseBenchMinTime(const std::string& value);
 class BenchmarkRunner {
 public:
  BenchmarkRunner(const benchmark::internal::BenchmarkInstance& b_,
-                  benchmark::internal::PerfCountersMeasurement* pmc_,
+                  benchmark::internal::PerfCountersMeasurement* pcm_,
                  BenchmarkReporter::PerFamilyRunReports* reports_for_family);

  int GetNumRepeats() const { return repeats; }
@ -96,9 +90,7 @@ class BenchmarkRunner {

  int num_repetitions_done = 0;

-  std::vector<std::thread> pool;
-
-  std::vector<MemoryManager::Result> memory_results;
+  std::unique_ptr<ThreadRunnerBase> thread_runner;

  IterationCount iters;  // preserved between repetitions!
  // So only the first repetition has to find/calculate it,
@ -113,6 +105,10 @@ class BenchmarkRunner {
  };
  IterationResults DoNIterations();

+  MemoryManager::Result RunMemoryManager(IterationCount memory_iterations);
+
+  void RunProfilerManager(IterationCount profile_iterations);
+
  IterationCount PredictNumItersNeeded(const IterationResults& i) const;

  bool ShouldReportIterationResults(const IterationResults& i) const;
--- a/src/check.cc
+++ b/src/check.cc
@ -3,7 +3,10 @@
 namespace benchmark {
 namespace internal {

-static AbortHandlerT* handler = &std::abort;
+namespace {
+// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
+AbortHandlerT* handler = &std::abort;
+}  // namespace

 BENCHMARK_EXPORT AbortHandlerT*& GetAbortHandler() { return handler; }

--- a/src/check.h
+++ b/src/check.h
@ -4,6 +4,7 @@
 #include <cmath>
 #include <cstdlib>
 #include <ostream>
+#include <string_view>

 #include "benchmark/export.h"
 #include "internal_macros.h"
@ -36,6 +37,8 @@ AbortHandlerT*& GetAbortHandler();

 BENCHMARK_NORETURN inline void CallAbortHandler() {
  GetAbortHandler()();
+  std::flush(std::cout);
+  std::flush(std::cerr);
  std::abort();  // fallback to enforce noreturn
 }

@ -44,7 +47,8 @@ BENCHMARK_NORETURN inline void CallAbortHandler() {
 // destructed.
 class CheckHandler {
 public:
-  CheckHandler(const char* check, const char* file, const char* func, int line)
+  CheckHandler(std::string_view check, std::string_view file,
+               std::string_view func, int line)
      : log_(GetErrorLogInstance()) {
    log_ << file << ":" << line << ": " << func << ": Check `" << check
         << "' failed. ";
@ -57,7 +61,7 @@ class CheckHandler {
 #pragma warning(disable : 4722)
 #endif
  BENCHMARK_NORETURN ~CheckHandler() BENCHMARK_NOEXCEPT_OP(false) {
-    log_ << std::endl;
+    log_ << '\n';
    CallAbortHandler();
  }
 #if defined(COMPILER_MSVC)
@ -78,9 +82,11 @@ class CheckHandler {
 // The BM_CHECK macro returns a std::ostream object that can have extra
 // information written to it.
 #ifndef NDEBUG
-#define BM_CHECK(b)                                                          \
-  (b ? ::benchmark::internal::GetNullLogInstance()                           \
-     : ::benchmark::internal::CheckHandler(#b, __FILE__, __func__, __LINE__) \
+#define BM_CHECK(b)                                          \
+  (b ? ::benchmark::internal::GetNullLogInstance()           \
+     : ::benchmark::internal::CheckHandler(                  \
+           std::string_view(#b), std::string_view(__FILE__), \
+           std::string_view(__func__), __LINE__)             \
           .GetLog())
 #else
 #define BM_CHECK(b) ::benchmark::internal::GetNullLogInstance()
--- a/src/colorprint.cc
+++ b/src/colorprint.cc
@ -135,22 +135,30 @@ void ColorPrintf(std::ostream& out, LogColor color, const char* fmt,
  // Gets the current text color.
  CONSOLE_SCREEN_BUFFER_INFO buffer_info;
  GetConsoleScreenBufferInfo(stdout_handle, &buffer_info);
-  const WORD old_color_attrs = buffer_info.wAttributes;
+  const WORD original_color_attrs = buffer_info.wAttributes;

  // We need to flush the stream buffers into the console before each
  // SetConsoleTextAttribute call lest it affect the text that is already
  // printed but has not yet reached the console.
-  fflush(stdout);
-  SetConsoleTextAttribute(stdout_handle,
-                          GetPlatformColorCode(color) | FOREGROUND_INTENSITY);
-  vprintf(fmt, args);
+  out.flush();

-  fflush(stdout);
-  // Restores the text color.
-  SetConsoleTextAttribute(stdout_handle, old_color_attrs);
+  const WORD original_background_attrs =
+      original_color_attrs & (BACKGROUND_RED | BACKGROUND_GREEN |
+                              BACKGROUND_BLUE | BACKGROUND_INTENSITY);
+
+  SetConsoleTextAttribute(stdout_handle, GetPlatformColorCode(color) |
+                                             FOREGROUND_INTENSITY |
+                                             original_background_attrs);
+  out << FormatString(fmt, args);
+
+  out.flush();
+  // Restores the text and background color.
+  SetConsoleTextAttribute(stdout_handle, original_color_attrs);
 #else
  const char* color_code = GetPlatformColorCode(color);
-  if (color_code) out << FormatString("\033[0;3%sm", color_code);
+  if (color_code != nullptr) {
+    out << FormatString("\033[0;3%sm", color_code);
+  }
  out << FormatString(fmt, args) << "\033[m";
 #endif
 }
@ -187,7 +195,7 @@ bool IsColorTerminal() {

  bool term_supports_color = false;
  for (const char* candidate : SUPPORTED_TERM_VALUES) {
-    if (term && 0 == strcmp(term, candidate)) {
+    if ((term != nullptr) && 0 == strcmp(term, candidate)) {
      term_supports_color = true;
      break;
    }
--- a/src/commandlineflags.cc
+++ b/src/commandlineflags.cc
@ -109,12 +109,13 @@ bool ParseKvPairs(const std::string& src_text, const char* str,
 // Returns the name of the environment variable corresponding to the
 // given flag.  For example, FlagToEnvVar("foo") will return
 // "BENCHMARK_FOO" in the open-source version.
-static std::string FlagToEnvVar(const char* flag) {
+std::string FlagToEnvVar(const char* flag) {
  const std::string flag_str(flag);

  std::string env_var;
-  for (size_t i = 0; i != flag_str.length(); ++i)
+  for (size_t i = 0; i != flag_str.length(); ++i) {
    env_var += static_cast<char>(::toupper(flag_str.c_str()[i]));
+  }

  return env_var;
 }
@ -167,7 +168,9 @@ std::map<std::string, std::string> KvPairsFromEnv(
  const std::string env_var = FlagToEnvVar(flag);
  const char* const value_str = getenv(env_var.c_str());

-  if (value_str == nullptr) return default_val;
+  if (value_str == nullptr) {
+    return default_val;
+  }

  std::map<std::string, std::string> value;
  if (!ParseKvPairs("Environment variable " + env_var, value_str, &value)) {
@ -184,23 +187,31 @@ std::map<std::string, std::string> KvPairsFromEnv(
 const char* ParseFlagValue(const char* str, const char* flag,
                           bool def_optional) {
  // str and flag must not be nullptr.
-  if (str == nullptr || flag == nullptr) return nullptr;
+  if (str == nullptr || flag == nullptr) {
+    return nullptr;
+  }

  // The flag must start with "--".
  const std::string flag_str = std::string("--") + std::string(flag);
  const size_t flag_len = flag_str.length();
-  if (strncmp(str, flag_str.c_str(), flag_len) != 0) return nullptr;
+  if (strncmp(str, flag_str.c_str(), flag_len) != 0) {
+    return nullptr;
+  }

  // Skips the flag name.
  const char* flag_end = str + flag_len;

  // When def_optional is true, it's OK to not have a "=value" part.
-  if (def_optional && (flag_end[0] == '\0')) return flag_end;
+  if (def_optional && (flag_end[0] == '\0')) {
+    return flag_end;
+  }

  // If def_optional is true and there are more characters after the
  // flag name, or if def_optional is false, there must be a '=' after
  // the flag name.
-  if (flag_end[0] != '=') return nullptr;
+  if (flag_end[0] != '=') {
+    return nullptr;
+  }

  // Returns the string after "=".
  return flag_end + 1;
@ -212,7 +223,9 @@ bool ParseBoolFlag(const char* str, const char* flag, bool* value) {
  const char* const value_str = ParseFlagValue(str, flag, true);

  // Aborts if the parsing failed.
-  if (value_str == nullptr) return false;
+  if (value_str == nullptr) {
+    return false;
+  }

  // Converts the string value to a bool.
  *value = IsTruthyFlagValue(value_str);
@ -225,7 +238,9 @@ bool ParseInt32Flag(const char* str, const char* flag, int32_t* value) {
  const char* const value_str = ParseFlagValue(str, flag, false);

  // Aborts if the parsing failed.
-  if (value_str == nullptr) return false;
+  if (value_str == nullptr) {
+    return false;
+  }

  // Sets *value to the value of the flag.
  return ParseInt32(std::string("The value of flag --") + flag, value_str,
@ -238,7 +253,9 @@ bool ParseDoubleFlag(const char* str, const char* flag, double* value) {
  const char* const value_str = ParseFlagValue(str, flag, false);

  // Aborts if the parsing failed.
-  if (value_str == nullptr) return false;
+  if (value_str == nullptr) {
+    return false;
+  }

  // Sets *value to the value of the flag.
  return ParseDouble(std::string("The value of flag --") + flag, value_str,
@ -251,7 +268,9 @@ bool ParseStringFlag(const char* str, const char* flag, std::string* value) {
  const char* const value_str = ParseFlagValue(str, flag, false);

  // Aborts if the parsing failed.
-  if (value_str == nullptr) return false;
+  if (value_str == nullptr) {
+    return false;
+  }

  *value = value_str;
  return true;
@ -262,11 +281,15 @@ bool ParseKeyValueFlag(const char* str, const char* flag,
                       std::map<std::string, std::string>* value) {
  const char* const value_str = ParseFlagValue(str, flag, false);

-  if (value_str == nullptr) return false;
+  if (value_str == nullptr) {
+    return false;
+  }

  for (const auto& kvpair : StrSplit(value_str, ',')) {
    const auto kv = StrSplit(kvpair, '=');
-    if (kv.size() != 2) return false;
+    if (kv.size() != 2) {
+      return false;
+    }
    value->emplace(kv[0], kv[1]);
  }

--- a/src/commandlineflags.h
+++ b/src/commandlineflags.h
@ -11,14 +11,17 @@
 #define FLAG(name) FLAGS_##name

 // Macros for declaring flags.
+// NOLINTBEGIN(cppcoreguidelines-avoid-non-const-global-variables)
 #define BM_DECLARE_bool(name) BENCHMARK_EXPORT extern bool FLAG(name)
 #define BM_DECLARE_int32(name) BENCHMARK_EXPORT extern int32_t FLAG(name)
 #define BM_DECLARE_double(name) BENCHMARK_EXPORT extern double FLAG(name)
 #define BM_DECLARE_string(name) BENCHMARK_EXPORT extern std::string FLAG(name)
 #define BM_DECLARE_kvpairs(name) \
  BENCHMARK_EXPORT extern std::map<std::string, std::string> FLAG(name)
+// NOLINTEND(cppcoreguidelines-avoid-non-const-global-variables)

 // Macros for defining flags.
+// NOLINTBEGIN(cppcoreguidelines-avoid-non-const-global-variables)
 #define BM_DEFINE_bool(name, default_val) \
  BENCHMARK_EXPORT bool FLAG(name) = benchmark::BoolFromEnv(#name, default_val)
 #define BM_DEFINE_int32(name, default_val) \
@ -33,6 +36,7 @@
 #define BM_DEFINE_kvpairs(name, default_val)                       \
  BENCHMARK_EXPORT std::map<std::string, std::string> FLAG(name) = \
      benchmark::KvPairsFromEnv(#name, default_val)
+// NOLINTEND(cppcoreguidelines-avoid-non-const-global-variables)

 namespace benchmark {

--- a/src/complexity.cc
+++ b/src/complexity.cc
@ -27,7 +27,6 @@ namespace benchmark {

 // Internal function to calculate the different scalability forms
 BigOFunc* FittingCurve(BigO complexity) {
-  static const double kLog2E = 1.44269504088896340736;
  switch (complexity) {
    case oN:
      return [](IterationCount n) -> double { return static_cast<double>(n); };
@ -36,13 +35,12 @@ BigOFunc* FittingCurve(BigO complexity) {
    case oNCubed:
      return [](IterationCount n) -> double { return std::pow(n, 3); };
    case oLogN:
-      /* Note: can't use log2 because Android's GNU STL lacks it */
-      return
-          [](IterationCount n) { return kLog2E * log(static_cast<double>(n)); };
+      return [](IterationCount n) -> double {
+        return std::log2(static_cast<double>(n));
+      };
    case oNLogN:
-      /* Note: can't use log2 because Android's GNU STL lacks it */
-      return [](IterationCount n) {
-        return kLog2E * n * log(static_cast<double>(n));
+      return [](IterationCount n) -> double {
+        return static_cast<double>(n) * std::log2(static_cast<double>(n));
      };
    case o1:
    default:
@ -75,12 +73,12 @@ std::string GetBigOString(BigO complexity) {
 // given by the lambda expression.
 //   - n             : Vector containing the size of the benchmark tests.
 //   - time          : Vector containing the times for the benchmark tests.
-//   - fitting_curve : lambda expression (e.g. [](int64_t n) {return n; };).
+//   - fitting_curve : lambda expression (e.g. [](ComplexityN n) {return n; };).

 // For a deeper explanation on the algorithm logic, please refer to
 // https://en.wikipedia.org/wiki/Least_squares#Least_squares,_regression_analysis_and_statistics

-LeastSq MinimalLeastSq(const std::vector<int64_t>& n,
+LeastSq MinimalLeastSq(const std::vector<ComplexityN>& n,
                       const std::vector<double>& time,
                       BigOFunc* fitting_curve) {
  double sigma_gn_squared = 0.0;
@ -105,12 +103,12 @@ LeastSq MinimalLeastSq(const std::vector<int64_t>& n,
  double rms = 0.0;
  for (size_t i = 0; i < n.size(); ++i) {
    double fit = result.coef * fitting_curve(n[i]);
-    rms += pow((time[i] - fit), 2);
+    rms += std::pow((time[i] - fit), 2);
  }

  // Normalized RMS by the mean of the observed values
-  double mean = sigma_time / n.size();
-  result.rms = sqrt(rms / n.size()) / mean;
+  double mean = sigma_time / static_cast<double>(n.size());
+  result.rms = std::sqrt(rms / static_cast<double>(n.size())) / mean;

  return result;
 }
@ -122,7 +120,7 @@ LeastSq MinimalLeastSq(const std::vector<int64_t>& n,
 //   - complexity : If different than oAuto, the fitting curve will stick to
 //                  this one. If it is oAuto, it will be calculated the best
 //                  fitting curve.
-LeastSq MinimalLeastSq(const std::vector<int64_t>& n,
+LeastSq MinimalLeastSq(const std::vector<ComplexityN>& n,
                       const std::vector<double>& time, const BigO complexity) {
  BM_CHECK_EQ(n.size(), time.size());
  BM_CHECK_GE(n.size(), 2);  // Do not compute fitting curve is less than two
@ -159,10 +157,12 @@ std::vector<BenchmarkReporter::Run> ComputeBigO(
  typedef BenchmarkReporter::Run Run;
  std::vector<Run> results;

-  if (reports.size() < 2) return results;
+  if (reports.size() < 2) {
+    return results;
+  }

  // Accumulators.
-  std::vector<int64_t> n;
+  std::vector<ComplexityN> n;
  std::vector<double> real_time;
  std::vector<double> cpu_time;

@ -171,8 +171,10 @@ std::vector<BenchmarkReporter::Run> ComputeBigO(
    BM_CHECK_GT(run.complexity_n, 0)
        << "Did you forget to call SetComplexityN?";
    n.push_back(run.complexity_n);
-    real_time.push_back(run.real_accumulated_time / run.iterations);
-    cpu_time.push_back(run.cpu_accumulated_time / run.iterations);
+    real_time.push_back(run.real_accumulated_time /
+                        static_cast<double>(run.iterations));
+    cpu_time.push_back(run.cpu_accumulated_time /
+                       static_cast<double>(run.iterations));
  }

  LeastSq result_cpu;
@ -182,8 +184,19 @@ std::vector<BenchmarkReporter::Run> ComputeBigO(
    result_cpu = MinimalLeastSq(n, cpu_time, reports[0].complexity_lambda);
    result_real = MinimalLeastSq(n, real_time, reports[0].complexity_lambda);
  } else {
-    result_cpu = MinimalLeastSq(n, cpu_time, reports[0].complexity);
-    result_real = MinimalLeastSq(n, real_time, result_cpu.complexity);
+    const BigO* InitialBigO = &reports[0].complexity;
+    const bool use_real_time_for_initial_big_o =
+        reports[0].use_real_time_for_initial_big_o;
+    if (use_real_time_for_initial_big_o) {
+      result_real = MinimalLeastSq(n, real_time, *InitialBigO);
+      InitialBigO = &result_real.complexity;
+      // The Big-O complexity for CPU time must have the same Big-O function!
+    }
+    result_cpu = MinimalLeastSq(n, cpu_time, *InitialBigO);
+    InitialBigO = &result_cpu.complexity;
+    if (!use_real_time_for_initial_big_o) {
+      result_real = MinimalLeastSq(n, real_time, *InitialBigO);
+    }
  }

  // Drop the 'args' when reporting complexity.
--- a/src/console_reporter.cc
+++ b/src/console_reporter.cc
@ -42,11 +42,15 @@ bool ConsoleReporter::ReportContext(const Context& context) {
  PrintBasicContext(&GetErrorStream(), context);

 #ifdef BENCHMARK_OS_WINDOWS
-  if ((output_options_ & OO_Color) && &std::cout != &GetOutputStream()) {
-    GetErrorStream()
-        << "Color printing is only supported for stdout on windows."
-           " Disabling color printing\n";
-    output_options_ = static_cast<OutputOptions>(output_options_ & ~OO_Color);
+  if ((output_options_ & OO_Color)) {
+    auto stdOutBuf = std::cout.rdbuf();
+    auto outStreamBuf = GetOutputStream().rdbuf();
+    if (stdOutBuf != outStreamBuf) {
+      GetErrorStream()
+          << "Color printing is only supported for stdout on windows."
+             " Disabling color printing\n";
+      output_options_ = static_cast<OutputOptions>(output_options_ & ~OO_Color);
+    }
  }
 #endif

@ -59,7 +63,7 @@ void ConsoleReporter::PrintHeader(const Run& run) {
      FormatString("%-*s %13s %15s %12s", static_cast<int>(name_field_width_),
                   "Benchmark", "Time", "CPU", "Iterations");
  if (!run.counters.empty()) {
-    if (output_options_ & OO_Tabular) {
+    if ((output_options_ & OO_Tabular) != 0) {
      for (auto const& c : run.counters) {
        str += FormatString(" %10s", c.first.c_str());
      }
@ -79,7 +83,7 @@ void ConsoleReporter::ReportRuns(const std::vector<Run>& reports) {
    bool print_header = !printed_header_;
    // --- or if the format is tabular and this run
    //     has different fields from the prev header
-    print_header |= (output_options_ & OO_Tabular) &&
+    print_header |= ((output_options_ & OO_Tabular) != 0) &&
                    (!internal::SameNames(run.counters, prev_counters_));
    if (print_header) {
      printed_header_ = true;
@ -93,8 +97,8 @@ void ConsoleReporter::ReportRuns(const std::vector<Run>& reports) {
  }
 }

-static void IgnoreColorPrint(std::ostream& out, LogColor, const char* fmt,
-                             ...) {
+static void IgnoreColorPrint(std::ostream& out, LogColor /*unused*/,
+                             const char* fmt, ...) {
  va_list args;
  va_start(args, fmt);
  out << FormatString(fmt, args);
@ -127,7 +131,7 @@ BENCHMARK_EXPORT
 void ConsoleReporter::PrintRunData(const Run& result) {
  typedef void(PrinterFn)(std::ostream&, LogColor, const char*, ...);
  auto& Out = GetOutputStream();
-  PrinterFn* printer = (output_options_ & OO_Color)
+  PrinterFn* printer = (output_options_ & OO_Color) != 0
                           ? static_cast<PrinterFn*>(ColorPrintf)
                           : IgnoreColorPrint;
  auto name_color =
@ -140,7 +144,8 @@ void ConsoleReporter::PrintRunData(const Run& result) {
            result.skip_message.c_str());
    printer(Out, COLOR_DEFAULT, "\n");
    return;
-  } else if (internal::SkippedWithMessage == result.skipped) {
+  }
+  if (internal::SkippedWithMessage == result.skipped) {
    printer(Out, COLOR_WHITE, "SKIPPED: \'%s\'", result.skip_message.c_str());
    printer(Out, COLOR_DEFAULT, "\n");
    return;
@ -174,9 +179,9 @@ void ConsoleReporter::PrintRunData(const Run& result) {
    printer(Out, COLOR_CYAN, "%10lld", result.iterations);
  }

-  for (auto& c : result.counters) {
+  for (const auto& c : result.counters) {
    const std::size_t cNameLen =
-        std::max(std::string::size_type(10), c.first.length());
+        std::max(static_cast<std::size_t>(10), c.first.length());
    std::string s;
    const char* unit = "";
    if (result.run_type == Run::RT_Aggregate &&
@ -185,10 +190,11 @@ void ConsoleReporter::PrintRunData(const Run& result) {
      unit = "%";
    } else {
      s = HumanReadableNumber(c.second.value, c.second.oneK);
-      if (c.second.flags & Counter::kIsRate)
-        unit = (c.second.flags & Counter::kInvert) ? "s" : "/s";
+      if ((c.second.flags & Counter::kIsRate) != 0) {
+        unit = (c.second.flags & Counter::kInvert) != 0 ? "s" : "/s";
+      }
    }
-    if (output_options_ & OO_Tabular) {
+    if ((output_options_ & OO_Tabular) != 0) {
      printer(Out, COLOR_DEFAULT, " %*s%s", cNameLen - strlen(unit), s.c_str(),
              unit);
    } else {
--- a/src/counter.cc
+++ b/src/counter.cc
@ -20,20 +20,20 @@ namespace internal {
 double Finish(Counter const& c, IterationCount iterations, double cpu_time,
              double num_threads) {
  double v = c.value;
-  if (c.flags & Counter::kIsRate) {
+  if ((c.flags & Counter::kIsRate) != 0) {
    v /= cpu_time;
  }
-  if (c.flags & Counter::kAvgThreads) {
+  if ((c.flags & Counter::kAvgThreads) != 0) {
    v /= num_threads;
  }
-  if (c.flags & Counter::kIsIterationInvariant) {
-    v *= iterations;
+  if ((c.flags & Counter::kIsIterationInvariant) != 0) {
+    v *= static_cast<double>(iterations);
  }
-  if (c.flags & Counter::kAvgIterations) {
-    v /= iterations;
+  if ((c.flags & Counter::kAvgIterations) != 0) {
+    v /= static_cast<double>(iterations);
  }

-  if (c.flags & Counter::kInvert) {  // Invert is *always* last.
+  if ((c.flags & Counter::kInvert) != 0) {  // Invert is *always* last.
    v = 1.0 / v;
  }
  return v;
@ -64,7 +64,9 @@ void Increment(UserCounters* l, UserCounters const& r) {
 }

 bool SameNames(UserCounters const& l, UserCounters const& r) {
-  if (&l == &r) return true;
+  if (&l == &r) {
+    return true;
+  }
  if (l.size() != r.size()) {
    return false;
  }
--- a/src/csv_reporter.cc
+++ b/src/csv_reporter.cc
@ -66,8 +66,10 @@ void CSVReporter::ReportRuns(const std::vector<Run>& reports) {
    // save the names of all the user counters
    for (const auto& run : reports) {
      for (const auto& cnt : run.counters) {
-        if (cnt.first == "bytes_per_second" || cnt.first == "items_per_second")
+        if (cnt.first == "bytes_per_second" ||
+            cnt.first == "items_per_second") {
          continue;
+        }
        user_counter_names_.insert(cnt.first);
      }
    }
@ -75,7 +77,9 @@ void CSVReporter::ReportRuns(const std::vector<Run>& reports) {
    // print the header
    for (auto B = elements.begin(); B != elements.end();) {
      Out << *B++;
-      if (B != elements.end()) Out << ",";
+      if (B != elements.end()) {
+        Out << ",";
+      }
    }
    for (auto B = user_counter_names_.begin();
         B != user_counter_names_.end();) {
@ -88,8 +92,10 @@ void CSVReporter::ReportRuns(const std::vector<Run>& reports) {
    // check that all the current counters are saved in the name set
    for (const auto& run : reports) {
      for (const auto& cnt : run.counters) {
-        if (cnt.first == "bytes_per_second" || cnt.first == "items_per_second")
+        if (cnt.first == "bytes_per_second" ||
+            cnt.first == "items_per_second") {
          continue;
+        }
        BM_CHECK(user_counter_names_.find(cnt.first) !=
                 user_counter_names_.end())
            << "All counters must be present in each run. "
@ -109,7 +115,7 @@ BENCHMARK_EXPORT
 void CSVReporter::PrintRunData(const Run& run) {
  std::ostream& Out = GetOutputStream();
  Out << CsvEscape(run.benchmark_name()) << ",";
-  if (run.skipped) {
+  if (run.skipped != 0u) {
    Out << std::string(elements.size() - 3, ',');
    Out << std::boolalpha << (internal::SkippedWithError == run.skipped) << ",";
    Out << CsvEscape(run.skip_message) << "\n";
@ -122,13 +128,21 @@ void CSVReporter::PrintRunData(const Run& run) {
  }
  Out << ",";

-  Out << run.GetAdjustedRealTime() << ",";
-  Out << run.GetAdjustedCPUTime() << ",";
+  if (run.run_type != Run::RT_Aggregate ||
+      run.aggregate_unit == StatisticUnit::kTime) {
+    Out << run.GetAdjustedRealTime() << ",";
+    Out << run.GetAdjustedCPUTime() << ",";
+  } else {
+    assert(run.aggregate_unit == StatisticUnit::kPercentage);
+    Out << run.real_accumulated_time << ",";
+    Out << run.cpu_accumulated_time << ",";
+  }

  // Do not print timeLabel on bigO and RMS report
  if (run.report_big_o) {
    Out << GetBigOString(run.complexity);
-  } else if (!run.report_rms) {
+  } else if (!run.report_rms &&
+             run.aggregate_unit != StatisticUnit::kPercentage) {
    Out << GetTimeUnitString(run.time_unit);
  }
  Out << ",";
--- a/src/cycleclock.h
+++ b/src/cycleclock.h
@ -70,7 +70,7 @@ inline BENCHMARK_ALWAYS_INLINE int64_t Now() {
  // frequency scaling).  Also note that when the Mac sleeps, this
  // counter pauses; it does not continue counting, nor does it
  // reset to zero.
-  return mach_absolute_time();
+  return static_cast<int64_t>(mach_absolute_time());
 #elif defined(BENCHMARK_OS_EMSCRIPTEN)
  // this goes above x86-specific code because old versions of Emscripten
  // define __x86_64__, although they have nothing to do with it.
@ -82,7 +82,7 @@ inline BENCHMARK_ALWAYS_INLINE int64_t Now() {
 #elif defined(__x86_64__) || defined(__amd64__)
  uint64_t low, high;
  __asm__ volatile("rdtsc" : "=a"(low), "=d"(high));
-  return (high << 32) | low;
+  return static_cast<int64_t>((high << 32) | low);
 #elif defined(__powerpc__) || defined(__ppc__)
  // This returns a time-base, which is not always precisely a cycle-count.
 #if defined(__powerpc64__) || defined(__ppc64__)
@ -181,33 +181,36 @@ inline BENCHMARK_ALWAYS_INLINE int64_t Now() {
 #elif defined(__s390__)  // Covers both s390 and s390x.
  // Return the CPU clock.
  uint64_t tsc;
-#if defined(BENCHMARK_OS_ZOS) && defined(COMPILER_IBMXL)
-  // z/OS XL compiler HLASM syntax.
+#if defined(BENCHMARK_OS_ZOS)
+  // z/OS HLASM syntax.
  asm(" stck %0" : "=m"(tsc) : : "cc");
 #else
+  // Linux on Z syntax.
  asm("stck %0" : "=Q"(tsc) : : "cc");
 #endif
  return tsc;
 #elif defined(__riscv)  // RISC-V
-  // Use RDCYCLE (and RDCYCLEH on riscv32)
+  // Use RDTIME (and RDTIMEH on riscv32).
+  // RDCYCLE is a privileged instruction since Linux 6.6.
 #if __riscv_xlen == 32
  uint32_t cycles_lo, cycles_hi0, cycles_hi1;
  // This asm also includes the PowerPC overflow handling strategy, as above.
  // Implemented in assembly because Clang insisted on branching.
  asm volatile(
-      "rdcycleh %0\n"
-      "rdcycle %1\n"
-      "rdcycleh %2\n"
+      "rdtimeh %0\n"
+      "rdtime %1\n"
+      "rdtimeh %2\n"
      "sub %0, %0, %2\n"
      "seqz %0, %0\n"
      "sub %0, zero, %0\n"
      "and %1, %1, %0\n"
      : "=r"(cycles_hi0), "=r"(cycles_lo), "=r"(cycles_hi1));
-  return (static_cast<uint64_t>(cycles_hi1) << 32) | cycles_lo;
+  return static_cast<int64_t>((static_cast<uint64_t>(cycles_hi1) << 32) |
+                              cycles_lo);
 #else
  uint64_t cycles;
-  asm volatile("rdcycle %0" : "=r"(cycles));
-  return cycles;
+  asm volatile("rdtime %0" : "=r"(cycles));
+  return static_cast<int64_t>(cycles);
 #endif
 #elif defined(__e2k__) || defined(__elbrus__)
  struct timeval tv;
@ -216,11 +219,33 @@ inline BENCHMARK_ALWAYS_INLINE int64_t Now() {
 #elif defined(__hexagon__)
  uint64_t pcycle;
  asm volatile("%0 = C15:14" : "=r"(pcycle));
-  return static_cast<double>(pcycle);
+  return static_cast<int64_t>(pcycle);
+#elif defined(__alpha__)
+  // Alpha has a cycle counter, the PCC register, but it is an unsigned 32-bit
+  // integer and thus wraps every ~4s, making using it for tick counts
+  // unreliable beyond this time range.  The real-time clock is low-precision,
+  // roughly ~1ms, but it is the only option that can reasonably count
+  // indefinitely.
+  struct timeval tv;
+  gettimeofday(&tv, nullptr);
+  return static_cast<int64_t>(tv.tv_sec) * 1000000 + tv.tv_usec;
+#elif defined(__hppa__) || defined(__linux__)
+  // Fallback for all other architectures with a recent Linux kernel, e.g.:
+  // HP PA-RISC provides a user-readable clock counter (cr16), but
+  // it's not synchronized across CPUs and only 32-bit wide when programs
+  // are built as 32-bit binaries.
+  // Same for SH-4 and possibly others.
+  // Use clock_gettime(CLOCK_MONOTONIC, ...) instead of gettimeofday
+  // because it provides nanosecond resolution.
+  // Initialize to always return 0 if clock_gettime fails.
+  struct timespec ts = {0, 0};
+  clock_gettime(CLOCK_MONOTONIC, &ts);
+  return static_cast<int64_t>(ts.tv_sec) * 1000000000 + ts.tv_nsec;
 #else
-// The soft failover to a generic implementation is automatic only for ARM.
-// For other platforms the developer is expected to make an attempt to create
-// a fast implementation and use generic version if nothing better is available.
+  // The soft failover to a generic implementation is automatic only for ARM.
+  // For other platforms the developer is expected to make an attempt to create
+  // a fast implementation and use generic version if nothing better is
+  // available.
 #error You need to define CycleTimer for your OS and CPU
 #endif
 }
--- a/src/internal_macros.h
+++ b/src/internal_macros.h
@ -11,11 +11,7 @@
 #endif

 #if defined(__clang__)
-  #if defined(__ibmxl__)
-    #if !defined(COMPILER_IBMXL)
-      #define COMPILER_IBMXL
-    #endif
-  #elif !defined(COMPILER_CLANG)
+  #if !defined(COMPILER_CLANG)
    #define COMPILER_CLANG
  #endif
 #elif defined(_MSC_VER)
--- a/src/json_reporter.cc
+++ b/src/json_reporter.cc
@ -85,15 +85,19 @@ std::string FormatKV(std::string const& key, int64_t value) {
  return ss.str();
 }

+std::string FormatKV(std::string const& key, int value) {
+  return FormatKV(key, static_cast<int64_t>(value));
+}
+
 std::string FormatKV(std::string const& key, double value) {
  std::stringstream ss;
  ss << '"' << StrEscape(key) << "\": ";

-  if (std::isnan(value))
+  if (std::isnan(value)) {
    ss << (value < 0 ? "-" : "") << "NaN";
-  else if (std::isinf(value))
+  } else if (std::isinf(value)) {
    ss << (value < 0 ? "-" : "") << "Infinity";
-  else {
+  } else {
    const auto max_digits10 =
        std::numeric_limits<decltype(value)>::max_digits10;
    const auto max_fractional_digits10 = max_digits10 - 1;
@ -122,7 +126,7 @@ bool JSONReporter::ReportContext(const Context& context) {

  out << indent << FormatKV("host_name", context.sys_info.name) << ",\n";

-  if (Context::executable_name) {
+  if (Context::executable_name != nullptr) {
    out << indent << FormatKV("executable", Context::executable_name) << ",\n";
  }

@ -136,7 +140,7 @@ bool JSONReporter::ReportContext(const Context& context) {
  if (CPUInfo::Scaling::UNKNOWN != info.scaling) {
    out << indent
        << FormatKV("cpu_scaling_enabled",
-                    info.scaling == CPUInfo::Scaling::ENABLED ? true : false)
+                    info.scaling == CPUInfo::Scaling::ENABLED)
        << ",\n";
  }

@ -144,7 +148,7 @@ bool JSONReporter::ReportContext(const Context& context) {
  indent = std::string(6, ' ');
  std::string cache_indent(8, ' ');
  for (size_t i = 0; i < info.caches.size(); ++i) {
-    auto& CI = info.caches[i];
+    const auto& CI = info.caches[i];
    out << indent << "{\n";
    out << cache_indent << FormatKV("type", CI.type) << ",\n";
    out << cache_indent << FormatKV("level", static_cast<int64_t>(CI.level))
@ -155,7 +159,9 @@ bool JSONReporter::ReportContext(const Context& context) {
        << FormatKV("num_sharing", static_cast<int64_t>(CI.num_sharing))
        << "\n";
    out << indent << "}";
-    if (i != info.caches.size() - 1) out << ",";
+    if (i != info.caches.size() - 1) {
+      out << ",";
+    }
    out << "\n";
  }
  indent = std::string(4, ' ');
@ -163,16 +169,25 @@ bool JSONReporter::ReportContext(const Context& context) {
  out << indent << "\"load_avg\": [";
  for (auto it = info.load_avg.begin(); it != info.load_avg.end();) {
    out << *it++;
-    if (it != info.load_avg.end()) out << ",";
+    if (it != info.load_avg.end()) {
+      out << ",";
+    }
  }
  out << "],\n";

+  out << indent << FormatKV("library_version", GetBenchmarkVersion());
+  out << ",\n";
+
 #if defined(NDEBUG)
  const char build_type[] = "release";
 #else
  const char build_type[] = "debug";
 #endif
  out << indent << FormatKV("library_build_type", build_type);
+  out << ",\n";
+
+  // NOTE: our json schema is not strictly tied to the library version!
+  out << indent << FormatKV("json_schema_version", 1);

  std::map<std::string, std::string>* global_context =
      internal::GetGlobalContext();
@ -287,20 +302,21 @@ void JSONReporter::PrintRunData(Run const& run) {
    out << indent << FormatKV("rms", run.GetAdjustedCPUTime());
  }

-  for (auto& c : run.counters) {
+  for (const auto& c : run.counters) {
    out << ",\n" << indent << FormatKV(c.first, c.second);
  }

-  if (run.memory_result) {
-    const MemoryManager::Result memory_result = *run.memory_result;
+  if (run.memory_result.memory_iterations > 0) {
+    const auto& memory_result = run.memory_result;
    out << ",\n" << indent << FormatKV("allocs_per_iter", run.allocs_per_iter);
    out << ",\n"
        << indent << FormatKV("max_bytes_used", memory_result.max_bytes_used);

    auto report_if_present = [&out, &indent](const std::string& label,
                                             int64_t val) {
-      if (val != MemoryManager::TombstoneValue)
+      if (val != MemoryManager::TombstoneValue) {
        out << ",\n" << indent << FormatKV(label, val);
+      }
    };

    report_if_present("total_allocated_bytes",
@ -314,7 +330,4 @@ void JSONReporter::PrintRunData(Run const& run) {
  out << '\n';
 }

-const int64_t MemoryManager::TombstoneValue =
-    std::numeric_limits<int64_t>::max();
-
 }  // end namespace benchmark
--- a/src/log.h
+++ b/src/log.h
@ -4,13 +4,6 @@
 #include <iostream>
 #include <ostream>

-// NOTE: this is also defined in benchmark.h but we're trying to avoid a
-// dependency.
-// The _MSVC_LANG check should detect Visual Studio 2015 Update 3 and newer.
-#if __cplusplus >= 201103L || (defined(_MSVC_LANG) && _MSVC_LANG >= 201103L)
-#define BENCHMARK_HAS_CXX11
-#endif
-
 namespace benchmark {
 namespace internal {

@ -31,13 +24,8 @@ class LogType {

  // NOTE: we could use BENCHMARK_DISALLOW_COPY_AND_ASSIGN but we shouldn't have
  // a dependency on benchmark.h from here.
-#ifndef BENCHMARK_HAS_CXX11
-  LogType(const LogType&);
-  LogType& operator=(const LogType&);
-#else
  LogType(const LogType&) = delete;
  LogType& operator=(const LogType&) = delete;
-#endif
 };

 template <class Tp>
--- a/src/perf_counters.cc
+++ b/src/perf_counters.cc
@ -26,8 +26,6 @@
 namespace benchmark {
 namespace internal {

-constexpr size_t PerfCounterValues::kMaxCounters;
-
 #if defined HAVE_LIBPFM

 size_t PerfCounterValues::Read(const std::vector<int>& leaders) {
@ -39,7 +37,8 @@ size_t PerfCounterValues::Read(const std::vector<int>& leaders) {
    auto read_bytes = ::read(lead, ptr, size);
    if (read_bytes >= ssize_t(sizeof(uint64_t))) {
      // Actual data bytes are all bytes minus initial padding
-      std::size_t data_bytes = read_bytes - sizeof(uint64_t);
+      std::size_t data_bytes =
+          static_cast<std::size_t>(read_bytes) - sizeof(uint64_t);
      // This should be very cheap since it's in hot cache
      std::memmove(ptr, ptr + sizeof(uint64_t), data_bytes);
      // Increment our counters
@ -156,7 +155,8 @@ PerfCounters PerfCounters::Create(
    attr.exclude_hv = true;

    // Read all counters in a group in one read.
-    attr.read_format = PERF_FORMAT_GROUP;
+    attr.read_format = PERF_FORMAT_GROUP;  //| PERF_FORMAT_TOTAL_TIME_ENABLED |
+                                           // PERF_FORMAT_TOTAL_TIME_RUNNING;

    int id = -1;
    while (id < 0) {
@ -216,7 +216,7 @@ PerfCounters PerfCounters::Create(
      GetErrorLogInstance() << "***WARNING*** Failed to start counters. "
                               "Claring out all counters.\n";

-      // Close all peformance counters
+      // Close all performance counters
      for (int id : counter_ids) {
        ::close(id);
      }
@ -254,7 +254,7 @@ bool PerfCounters::IsCounterSupported(const std::string&) { return false; }
 PerfCounters PerfCounters::Create(
    const std::vector<std::string>& counter_names) {
  if (!counter_names.empty()) {
-    GetErrorLogInstance() << "Performance counters not supported.";
+    GetErrorLogInstance() << "Performance counters not supported.\n";
  }
  return NoCounters();
 }
--- a/src/re.h
+++ b/src/re.h
@ -121,15 +121,13 @@ inline bool Regex::Init(const std::string& spec, std::string* error) {
  if (ec != 0) {
    if (error) {
      size_t needed = regerror(ec, &re_, nullptr, 0);
-      char* errbuf = new char[needed];
-      regerror(ec, &re_, errbuf, needed);
+      std::vector<char> errbuf(needed);
+      regerror(ec, &re_, errbuf.data(), needed);

      // regerror returns the number of bytes necessary to null terminate
      // the string, so we move that when assigning to error.
      BM_CHECK_NE(needed, 0);
-      error->assign(errbuf, needed - 1);
-
-      delete[] errbuf;
+      error->assign(errbuf.data(), needed - 1);
    }

    return false;
--- a/src/reporter.cc
+++ b/src/reporter.cc
@ -42,20 +42,23 @@ void BenchmarkReporter::PrintBasicContext(std::ostream *out,
  Out << LocalDateTimeString() << "\n";
 #endif

-  if (context.executable_name)
-    Out << "Running " << context.executable_name << "\n";
+  if (benchmark::BenchmarkReporter::Context::executable_name != nullptr) {
+    Out << "Running " << benchmark::BenchmarkReporter::Context::executable_name
+        << "\n";
+  }

  const CPUInfo &info = context.cpu_info;
  Out << "Run on (" << info.num_cpus << " X "
      << (info.cycles_per_second / 1000000.0) << " MHz CPU "
      << ((info.num_cpus > 1) ? "s" : "") << ")\n";
-  if (info.caches.size() != 0) {
+  if (!info.caches.empty()) {
    Out << "CPU Caches:\n";
-    for (auto &CInfo : info.caches) {
+    for (const auto &CInfo : info.caches) {
      Out << "  L" << CInfo.level << " " << CInfo.type << " "
          << (CInfo.size / 1024) << " KiB";
-      if (CInfo.num_sharing != 0)
+      if (CInfo.num_sharing != 0) {
        Out << " (x" << (info.num_cpus / CInfo.num_sharing) << ")";
+      }
      Out << "\n";
    }
  }
@ -63,7 +66,9 @@ void BenchmarkReporter::PrintBasicContext(std::ostream *out,
    Out << "Load Average: ";
    for (auto It = info.load_avg.begin(); It != info.load_avg.end();) {
      Out << StrFormat("%.2f", *It++);
-      if (It != info.load_avg.end()) Out << ", ";
+      if (It != info.load_avg.end()) {
+        Out << ", ";
+      }
    }
    Out << "\n";
  }
@ -105,13 +110,17 @@ std::string BenchmarkReporter::Run::benchmark_name() const {

 double BenchmarkReporter::Run::GetAdjustedRealTime() const {
  double new_time = real_accumulated_time * GetTimeUnitMultiplier(time_unit);
-  if (iterations != 0) new_time /= static_cast<double>(iterations);
+  if (iterations != 0) {
+    new_time /= static_cast<double>(iterations);
+  }
  return new_time;
 }

 double BenchmarkReporter::Run::GetAdjustedCPUTime() const {
  double new_time = cpu_accumulated_time * GetTimeUnitMultiplier(time_unit);
-  if (iterations != 0) new_time /= static_cast<double>(iterations);
+  if (iterations != 0) {
+    new_time /= static_cast<double>(iterations);
+  }
  return new_time;
 }

--- a/src/statistics.cc
+++ b/src/statistics.cc
@ -26,17 +26,21 @@

 namespace benchmark {

-auto StatisticsSum = [](const std::vector<double>& v) {
+const auto StatisticsSum = [](const std::vector<double>& v) {
  return std::accumulate(v.begin(), v.end(), 0.0);
 };

 double StatisticsMean(const std::vector<double>& v) {
-  if (v.empty()) return 0.0;
-  return StatisticsSum(v) * (1.0 / v.size());
+  if (v.empty()) {
+    return 0.0;
+  }
+  return StatisticsSum(v) * (1.0 / static_cast<double>(v.size()));
 }

 double StatisticsMedian(const std::vector<double>& v) {
-  if (v.size() < 3) return StatisticsMean(v);
+  if (v.size() < 3) {
+    return StatisticsMean(v);
+  }
  std::vector<double> copy(v);

  auto center = copy.begin() + v.size() / 2;
@ -47,40 +51,57 @@ double StatisticsMedian(const std::vector<double>& v) {
  // before.  Instead of resorting, we just look for the max value before it,
  // which is not necessarily the element immediately preceding `center` Since
  // `copy` is only partially sorted by `nth_element`.
-  if (v.size() % 2 == 1) return *center;
+  if (v.size() % 2 == 1) {
+    return *center;
+  }
  auto center2 = std::max_element(copy.begin(), center);
  return (*center + *center2) / 2.0;
 }

 // Return the sum of the squares of this sample set
-auto SumSquares = [](const std::vector<double>& v) {
+const auto SumSquares = [](const std::vector<double>& v) {
  return std::inner_product(v.begin(), v.end(), v.begin(), 0.0);
 };

-auto Sqr = [](const double dat) { return dat * dat; };
-auto Sqrt = [](const double dat) {
+const auto Sqr = [](const double dat) { return dat * dat; };
+const auto Sqrt = [](const double dat) {
  // Avoid NaN due to imprecision in the calculations
-  if (dat < 0.0) return 0.0;
+  if (dat < 0.0) {
+    return 0.0;
+  }
  return std::sqrt(dat);
 };

 double StatisticsStdDev(const std::vector<double>& v) {
  const auto mean = StatisticsMean(v);
-  if (v.empty()) return mean;
+  if (v.empty()) {
+    return mean;
+  }

  // Sample standard deviation is undefined for n = 1
-  if (v.size() == 1) return 0.0;
+  if (v.size() == 1) {
+    return 0.0;
+  }

-  const double avg_squares = SumSquares(v) * (1.0 / v.size());
-  return Sqrt(v.size() / (v.size() - 1.0) * (avg_squares - Sqr(mean)));
+  const double avg_squares =
+      SumSquares(v) * (1.0 / static_cast<double>(v.size()));
+  return Sqrt(static_cast<double>(v.size()) /
+              (static_cast<double>(v.size()) - 1.0) *
+              (avg_squares - Sqr(mean)));
 }

 double StatisticsCV(const std::vector<double>& v) {
-  if (v.size() < 2) return 0.0;
+  if (v.size() < 2) {
+    return 0.0;
+  }

  const auto stddev = StatisticsStdDev(v);
  const auto mean = StatisticsMean(v);

+  if (std::fpclassify(mean) == FP_ZERO) {
+    return 0.0;
+  }
+
  return stddev / mean;
 }

@ -92,7 +113,7 @@ std::vector<BenchmarkReporter::Run> ComputeStats(
  auto error_count = std::count_if(reports.begin(), reports.end(),
                                   [](Run const& run) { return run.skipped; });

-  if (reports.size() - error_count < 2) {
+  if (reports.size() - static_cast<size_t>(error_count) < 2) {
    // We don't report aggregated data if there was a single run.
    return results;
  }
@ -132,7 +153,9 @@ std::vector<BenchmarkReporter::Run> ComputeStats(
  for (Run const& run : reports) {
    BM_CHECK_EQ(reports[0].benchmark_name(), run.benchmark_name());
    BM_CHECK_EQ(run_iterations, run.iterations);
-    if (run.skipped) continue;
+    if (run.skipped != 0u) {
+      continue;
+    }
    real_accumulated_time_stat.emplace_back(run.real_accumulated_time);
    cpu_accumulated_time_stat.emplace_back(run.cpu_accumulated_time);
    // user counters
@ -153,7 +176,7 @@ std::vector<BenchmarkReporter::Run> ComputeStats(
  }

  const double iteration_rescale_factor =
-      double(reports.size()) / double(run_iterations);
+      static_cast<double>(reports.size()) / static_cast<double>(run_iterations);

  for (const auto& Stat : *reports[0].statistics) {
    // Get the data from the accumulator to BenchmarkReporter::Run's.
@ -174,7 +197,7 @@ std::vector<BenchmarkReporter::Run> ComputeStats(
    // Similarly, if there are N repetitions with 1 iterations each,
    // an aggregate will be computed over N measurements, not 1.
    // Thus it is best to simply use the count of separate reports.
-    data.iterations = reports.size();
+    data.iterations = static_cast<IterationCount>(reports.size());

    data.real_accumulated_time = Stat.compute_(real_accumulated_time_stat);
    data.cpu_accumulated_time = Stat.compute_(cpu_accumulated_time_stat);
--- a/src/string_util.cc
+++ b/src/string_util.cc
@ -29,7 +29,7 @@ static_assert(arraysize(kBigSIUnits) == arraysize(kBigIECUnits),
 static_assert(arraysize(kSmallSIUnits) == arraysize(kBigSIUnits),
              "Small SI and Big SI unit arrays must be the same size");

-static const int64_t kUnitsSize = arraysize(kBigSIUnits);
+const int64_t kUnitsSize = arraysize(kBigSIUnits);

 void ToExponentAndMantissa(double val, int precision, double one_k,
                           std::string* mantissa, int64_t* exponent) {
@ -56,7 +56,7 @@ void ToExponentAndMantissa(double val, int precision, double one_k,
      scaled /= one_k;
      if (scaled <= big_threshold) {
        mantissa_stream << scaled;
-        *exponent = i + 1;
+        *exponent = static_cast<int64_t>(i + 1);
        *mantissa = mantissa_stream.str();
        return;
      }
@ -87,10 +87,14 @@ void ToExponentAndMantissa(double val, int precision, double one_k,
 }

 std::string ExponentToPrefix(int64_t exponent, bool iec) {
-  if (exponent == 0) return "";
+  if (exponent == 0) {
+    return {};
+  }

  const int64_t index = (exponent > 0 ? exponent - 1 : -exponent - 1);
-  if (index >= kUnitsSize) return "";
+  if (index >= kUnitsSize) {
+    return {};
+  }

  const char* const* array =
      (exponent > 0 ? (iec ? kBigIECUnits : kBigSIUnits) : kSmallSIUnits);
@ -101,7 +105,7 @@ std::string ExponentToPrefix(int64_t exponent, bool iec) {
 std::string ToBinaryStringFullySpecified(double value, int precision,
                                         Counter::OneK one_k) {
  std::string mantissa;
-  int64_t exponent;
+  int64_t exponent = 0;
  ToExponentAndMantissa(value, precision,
                        one_k == Counter::kIs1024 ? 1024.0 : 1000.0, &mantissa,
                        &exponent);
@ -115,7 +119,7 @@ std::string StrFormatImp(const char* msg, va_list args) {

  // TODO(ericwf): use std::array for first attempt to avoid one memory
  // allocation guess what the size might be
-  std::array<char, 256> local_buff;
+  std::array<char, 256> local_buff = {};

  // 2015-10-08: vsnprintf is used instead of snd::vsnprintf due to a limitation
  // in the android-ndk
@ -124,9 +128,12 @@ std::string StrFormatImp(const char* msg, va_list args) {
  va_end(args_cp);

  // handle empty expansion
-  if (ret == 0) return std::string{};
-  if (static_cast<std::size_t>(ret) < local_buff.size())
+  if (ret == 0) {
+    return {};
+  }
+  if (static_cast<std::size_t>(ret) < local_buff.size()) {
    return std::string(local_buff.data());
+  }

  // we did not provide a long enough buffer on our first attempt.
  // add 1 to size to account for null-byte in size cast to prevent overflow
@ -153,7 +160,9 @@ std::string StrFormat(const char* format, ...) {
 }

 std::vector<std::string> StrSplit(const std::string& str, char delim) {
-  if (str.empty()) return {};
+  if (str.empty()) {
+    return {};
+  }
  std::vector<std::string> ret;
  size_t first = 0;
  size_t next = str.find(delim);
--- a/src/string_util.h
+++ b/src/string_util.h
@ -9,7 +9,6 @@
 #include "benchmark/benchmark.h"
 #include "benchmark/export.h"
 #include "check.h"
-#include "internal_macros.h"

 namespace benchmark {

--- a/src/sysinfo.cc
+++ b/src/sysinfo.cc
@ -15,6 +15,10 @@
 #include "internal_macros.h"

 #ifdef BENCHMARK_OS_WINDOWS
+#if !defined(WINVER) || WINVER < 0x0600
+#undef WINVER
+#define WINVER 0x0600
+#endif  // WINVER handling
 #include <shlwapi.h>
 #undef StrCat  // Don't let StrCat in string_util.h be renamed to lstrcatA
 #include <versionhelpers.h>
@ -72,7 +76,6 @@
 #include "benchmark/benchmark.h"
 #include "check.h"
 #include "cycleclock.h"
-#include "internal_macros.h"
 #include "log.h"
 #include "string_util.h"
 #include "timers.h"
@ -80,7 +83,7 @@
 namespace benchmark {
 namespace {

-void PrintImp(std::ostream& out) { out << std::endl; }
+void PrintImp(std::ostream& out) { out << '\n'; }

 template <class First, class... Rest>
 void PrintImp(std::ostream& out, First&& f, Rest&&... rest) {
@ -91,6 +94,7 @@ void PrintImp(std::ostream& out, First&& f, Rest&&... rest) {
 template <class... Args>
 BENCHMARK_NORETURN void PrintErrorAndDie(Args&&... args) {
  PrintImp(std::cerr, std::forward<Args>(args)...);
+  std::cerr << std::flush;
  std::exit(EXIT_FAILURE);
 }

@ -116,7 +120,7 @@ struct ValueUnion {

  explicit ValueUnion(std::size_t buff_size)
      : size(sizeof(DataT) + buff_size),
-        buff(::new (std::malloc(size)) DataT(), &std::free) {}
+        buff(::new(std::malloc(size)) DataT(), &std::free) {}

  ValueUnion(ValueUnion&& other) = default;

@ -149,16 +153,16 @@ ValueUnion GetSysctlImp(std::string const& name) {
  int mib[2];

  mib[0] = CTL_HW;
-  if ((name == "hw.ncpu") || (name == "hw.cpuspeed")) {
+  if ((name == "hw.ncpuonline") || (name == "hw.cpuspeed")) {
    ValueUnion buff(sizeof(int));

-    if (name == "hw.ncpu") {
-      mib[1] = HW_NCPU;
+    if (name == "hw.ncpuonline") {
+      mib[1] = HW_NCPUONLINE;
    } else {
      mib[1] = HW_CPUSPEED;
    }

-    if (sysctl(mib, 2, buff.data(), &buff.Size, nullptr, 0) == -1) {
+    if (sysctl(mib, 2, buff.data(), &buff.size, nullptr, 0) == -1) {
      return ValueUnion();
    }
    return buff;
@ -208,14 +212,18 @@ template <class ArgT>
 bool ReadFromFile(std::string const& fname, ArgT* arg) {
  *arg = ArgT();
  std::ifstream f(fname.c_str());
-  if (!f.is_open()) return false;
+  if (!f.is_open()) {
+    return false;
+  }
  f >> *arg;
  return f.good();
 }

 CPUInfo::Scaling CpuScaling(int num_cpus) {
  // We don't have a valid CPU count, so don't even bother.
-  if (num_cpus <= 0) return CPUInfo::Scaling::UNKNOWN;
+  if (num_cpus <= 0) {
+    return CPUInfo::Scaling::UNKNOWN;
+  }
 #if defined(BENCHMARK_OS_QNX)
  return CPUInfo::Scaling::UNKNOWN;
 #elif !defined(BENCHMARK_OS_WINDOWS)
@ -226,8 +234,9 @@ CPUInfo::Scaling CpuScaling(int num_cpus) {
  for (int cpu = 0; cpu < num_cpus; ++cpu) {
    std::string governor_file =
        StrCat("/sys/devices/system/cpu/cpu", cpu, "/cpufreq/scaling_governor");
-    if (ReadFromFile(governor_file, &res) && res != "performance")
+    if (ReadFromFile(governor_file, &res) && res != "performance") {
      return CPUInfo::Scaling::ENABLED;
+    }
  }
  return CPUInfo::Scaling::DISABLED;
 #else
@ -242,7 +251,7 @@ int CountSetBitsInCPUMap(std::string val) {
    CPUMask mask(benchmark::stoul(part, nullptr, 16));
    return static_cast<int>(mask.count());
  };
-  std::size_t pos;
+  std::size_t pos = 0;
  int total = 0;
  while ((pos = val.find(',')) != std::string::npos) {
    total += CountBits(val.substr(0, pos));
@ -263,28 +272,35 @@ std::vector<CPUInfo::CacheInfo> GetCacheSizesFromKVFS() {
    CPUInfo::CacheInfo info;
    std::string fpath = StrCat(dir, "index", idx++, "/");
    std::ifstream f(StrCat(fpath, "size").c_str());
-    if (!f.is_open()) break;
+    if (!f.is_open()) {
+      break;
+    }
    std::string suffix;
    f >> info.size;
-    if (f.fail())
+    if (f.fail()) {
      PrintErrorAndDie("Failed while reading file '", fpath, "size'");
+    }
    if (f.good()) {
      f >> suffix;
-      if (f.bad())
+      if (f.bad()) {
        PrintErrorAndDie(
            "Invalid cache size format: failed to read size suffix");
-      else if (f && suffix != "K")
+      } else if (f && suffix != "K") {
        PrintErrorAndDie("Invalid cache size format: Expected bytes ", suffix);
-      else if (suffix == "K")
+      } else if (suffix == "K") {
        info.size *= 1024;
+      }
    }
-    if (!ReadFromFile(StrCat(fpath, "type"), &info.type))
+    if (!ReadFromFile(StrCat(fpath, "type"), &info.type)) {
      PrintErrorAndDie("Failed to read from file ", fpath, "type");
-    if (!ReadFromFile(StrCat(fpath, "level"), &info.level))
+    }
+    if (!ReadFromFile(StrCat(fpath, "level"), &info.level)) {
      PrintErrorAndDie("Failed to read from file ", fpath, "level");
+    }
    std::string map_str;
-    if (!ReadFromFile(StrCat(fpath, "shared_cpu_map"), &map_str))
+    if (!ReadFromFile(StrCat(fpath, "shared_cpu_map"), &map_str)) {
      PrintErrorAndDie("Failed to read from file ", fpath, "shared_cpu_map");
+    }
    info.num_sharing = CountSetBitsInCPUMap(map_str);
    res.push_back(info);
  }
@ -329,15 +345,18 @@ std::vector<CPUInfo::CacheInfo> GetCacheSizesWindows() {
  using UPtr = std::unique_ptr<PInfo, decltype(&std::free)>;
  GetLogicalProcessorInformation(nullptr, &buffer_size);
  UPtr buff(static_cast<PInfo*>(std::malloc(buffer_size)), &std::free);
-  if (!GetLogicalProcessorInformation(buff.get(), &buffer_size))
+  if (!GetLogicalProcessorInformation(buff.get(), &buffer_size)) {
    PrintErrorAndDie("Failed during call to GetLogicalProcessorInformation: ",
                     GetLastError());
+  }

  PInfo* it = buff.get();
  PInfo* end = buff.get() + (buffer_size / sizeof(PInfo));

  for (; it != end; ++it) {
-    if (it->Relationship != RelationCache) continue;
+    if (it->Relationship != RelationCache) {
+      continue;
+    }
    using BitSet = std::bitset<sizeof(ULONG_PTR) * CHAR_BIT>;
    BitSet b(it->ProcessorMask);
    // To prevent duplicates, only consider caches where CPU 0 is specified
@ -346,9 +365,14 @@ std::vector<CPUInfo::CacheInfo> GetCacheSizesWindows() {
    CPUInfo::CacheInfo C;
    C.num_sharing = static_cast<int>(b.count());
    C.level = cache.Level;
-    C.size = cache.Size;
+    C.size = static_cast<int>(cache.Size);
    C.type = "Unknown";
    switch (cache.Type) {
+// Windows SDK version >= 10.0.26100.0
+#ifdef NTDDI_WIN11_GE
+      case CacheUnknown:
+        break;
+#endif
      case CacheUnified:
        C.type = "Unified";
        break;
@ -456,6 +480,8 @@ std::string GetSystemName() {
 #define HOST_NAME_MAX 256
 #elif defined(BENCHMARK_OS_SOLARIS)
 #define HOST_NAME_MAX MAXHOSTNAMELEN
+#elif defined(BENCHMARK_OS_ZOS)
+#define HOST_NAME_MAX _POSIX_HOST_NAME_MAX
 #else
 #pragma message("HOST_NAME_MAX not defined. using 64")
 #define HOST_NAME_MAX 64
@ -463,34 +489,19 @@ std::string GetSystemName() {
 #endif  // def HOST_NAME_MAX
  char hostname[HOST_NAME_MAX];
  int retVal = gethostname(hostname, HOST_NAME_MAX);
-  if (retVal != 0) return std::string("");
-  return std::string(hostname);
+  return retVal != 0 ? std::string() : std::string(hostname);
 #endif  // Catch-all POSIX block.
 }

-int GetNumCPUs() {
-#ifdef BENCHMARK_HAS_SYSCTL
-  int num_cpu = -1;
-  if (GetSysctl("hw.ncpu", &num_cpu)) return num_cpu;
-  fprintf(stderr, "Err: %s\n", strerror(errno));
-  std::exit(EXIT_FAILURE);
-#elif defined(BENCHMARK_OS_WINDOWS)
+int GetNumCPUsImpl() {
+#ifdef BENCHMARK_OS_WINDOWS
  SYSTEM_INFO sysinfo;
  // Use memset as opposed to = {} to avoid GCC missing initializer false
  // positives.
  std::memset(&sysinfo, 0, sizeof(SYSTEM_INFO));
  GetSystemInfo(&sysinfo);
-  return sysinfo.dwNumberOfProcessors;  // number of logical
-                                        // processors in the current
-                                        // group
-#elif defined(BENCHMARK_OS_SOLARIS)
-  // Returns -1 in case of a failure.
-  long num_cpu = sysconf(_SC_NPROCESSORS_ONLN);
-  if (num_cpu < 0) {
-    fprintf(stderr, "sysconf(_SC_NPROCESSORS_ONLN) failed with error: %s\n",
-            strerror(errno));
-  }
-  return (int)num_cpu;
+  // number of logical processors in the current group
+  return static_cast<int>(sysinfo.dwNumberOfProcessors);
 #elif defined(BENCHMARK_OS_QNX)
  return static_cast<int>(_syspage_ptr->num_cpu);
 #elif defined(BENCHMARK_OS_QURT)
@ -498,76 +509,71 @@ int GetNumCPUs() {
  if (qurt_sysenv_get_max_hw_threads(&hardware_threads) != QURT_EOK) {
    hardware_threads.max_hthreads = 1;
  }
-  return hardware_threads.max_hthreads;
+  return static_cast<int>(hardware_threads.max_hthreads);
+#elif defined(BENCHMARK_HAS_SYSCTL)
+  // *BSD, macOS
+  int num_cpu = -1;
+  constexpr auto* hwncpu =
+#if defined BENCHMARK_OS_MACOSX
+      "hw.logicalcpu";
+#elif defined(HW_NCPUONLINE)
+      "hw.ncpuonline";
 #else
-  int num_cpus = 0;
-  int max_id = -1;
-  std::ifstream f("/proc/cpuinfo");
-  if (!f.is_open()) {
-    std::cerr << "failed to open /proc/cpuinfo\n";
-    return -1;
-  }
-  const std::string Key = "processor";
-  std::string ln;
-  while (std::getline(f, ln)) {
-    if (ln.empty()) continue;
-    std::size_t split_idx = ln.find(':');
-    std::string value;
-#if defined(__s390__)
-    // s390 has another format in /proc/cpuinfo
-    // it needs to be parsed differently
-    if (split_idx != std::string::npos)
-      value = ln.substr(Key.size() + 1, split_idx - Key.size() - 1);
-#else
-    if (split_idx != std::string::npos) value = ln.substr(split_idx + 1);
+      "hw.ncpu";
 #endif
-    if (ln.size() >= Key.size() && ln.compare(0, Key.size(), Key) == 0) {
-      num_cpus++;
-      if (!value.empty()) {
-        const int cur_id = benchmark::stoi(value);
-        max_id = std::max(cur_id, max_id);
-      }
-    }
+  if (GetSysctl(hwncpu, &num_cpu)) return num_cpu;
+  PrintErrorAndDie("Err: ", strerror(errno));
+#elif defined(_SC_NPROCESSORS_ONLN)
+  // Linux, Solaris, AIX, Haiku, WASM, etc.
+  // Returns -1 in case of a failure.
+  int num_cpu = static_cast<int>(sysconf(_SC_NPROCESSORS_ONLN));
+  if (num_cpu < 0) {
+    PrintErrorAndDie("sysconf(_SC_NPROCESSORS_ONLN) failed with error: ",
+                     strerror(errno));
  }
-  if (f.bad()) {
-    std::cerr << "Failure reading /proc/cpuinfo\n";
-    return -1;
-  }
-  if (!f.eof()) {
-    std::cerr << "Failed to read to end of /proc/cpuinfo\n";
-    return -1;
-  }
-  f.close();
-
-  if ((max_id + 1) != num_cpus) {
-    fprintf(stderr,
-            "CPU ID assignments in /proc/cpuinfo seem messed up."
-            " This is usually caused by a bad BIOS.\n");
-  }
-  return num_cpus;
+  return num_cpu;
+#else
+  // Fallback, no other API exists.
+  return -1;
 #endif
  BENCHMARK_UNREACHABLE();
 }

+int GetNumCPUs() {
+  int num_cpus = GetNumCPUsImpl();
+  if (num_cpus < 1) {
+    std::cerr << "Unable to extract number of CPUs.\n";
+    // There must be at least one CPU on which we're running.
+    num_cpus = 1;
+  }
+  return num_cpus;
+}
+
 class ThreadAffinityGuard final {
 public:
  ThreadAffinityGuard() : reset_affinity(SetAffinity()) {
-    if (!reset_affinity)
+    if (!reset_affinity) {
      std::cerr << "***WARNING*** Failed to set thread affinity. Estimated CPU "
-                   "frequency may be incorrect."
-                << std::endl;
+                   "frequency may be incorrect.\n";
+    }
  }

  ~ThreadAffinityGuard() {
-    if (!reset_affinity) return;
+    if (!reset_affinity) {
+      return;
+    }

 #if defined(BENCHMARK_HAS_PTHREAD_AFFINITY)
    int ret = pthread_setaffinity_np(self, sizeof(previous_affinity),
                                     &previous_affinity);
-    if (ret == 0) return;
+    if (ret == 0) {
+      return;
+    }
 #elif defined(BENCHMARK_OS_WINDOWS_WIN32)
    DWORD_PTR ret = SetThreadAffinityMask(self, previous_affinity);
-    if (ret != 0) return;
+    if (ret != 0) {
+      return;
+    }
 #endif  // def BENCHMARK_HAS_PTHREAD_AFFINITY
    PrintErrorAndDie("Failed to reset thread affinity");
  }
@ -580,26 +586,32 @@ class ThreadAffinityGuard final {
 private:
  bool SetAffinity() {
 #if defined(BENCHMARK_HAS_PTHREAD_AFFINITY)
-    int ret;
+    int ret = 0;
    self = pthread_self();
    ret = pthread_getaffinity_np(self, sizeof(previous_affinity),
                                 &previous_affinity);
-    if (ret != 0) return false;
+    if (ret != 0) {
+      return false;
+    }

    cpu_set_t affinity;
    memcpy(&affinity, &previous_affinity, sizeof(affinity));

    bool is_first_cpu = true;

-    for (int i = 0; i < CPU_SETSIZE; ++i)
+    for (int i = 0; i < CPU_SETSIZE; ++i) {
      if (CPU_ISSET(i, &affinity)) {
-        if (is_first_cpu)
+        if (is_first_cpu) {
          is_first_cpu = false;
-        else
+        } else {
          CPU_CLR(i, &affinity);
+        }
      }
+    }

-    if (is_first_cpu) return false;
+    if (is_first_cpu) {
+      return false;
+    }

    ret = pthread_setaffinity_np(self, sizeof(affinity), &affinity);
    return ret == 0;
@ -614,8 +626,8 @@ class ThreadAffinityGuard final {
  }

 #if defined(BENCHMARK_HAS_PTHREAD_AFFINITY)
-  pthread_t self;
-  cpu_set_t previous_affinity;
+  pthread_t self{};
+  cpu_set_t previous_affinity{};
 #elif defined(BENCHMARK_OS_WINDOWS_WIN32)
  HANDLE self;
  DWORD_PTR previous_affinity;
@ -629,7 +641,7 @@ double GetCPUCyclesPerSecond(CPUInfo::Scaling scaling) {
  (void)scaling;

 #if defined BENCHMARK_OS_LINUX || defined BENCHMARK_OS_CYGWIN
-  long freq;
+  long freq = 0;

  // If the kernel is exporting the tsc frequency use that. There are issues
  // where cpuinfo_max_freq cannot be relied on because the BIOS may be
@ -651,7 +663,7 @@ double GetCPUCyclesPerSecond(CPUInfo::Scaling scaling) {
                      &freq)) {
    // The value is in kHz (as the file name suggests).  For example, on a
    // 2GHz warpstation, the file contains the value "2000000".
-    return freq * 1000.0;
+    return static_cast<double>(freq) * 1000.0;
  }

  const double error_value = -1;
@ -664,7 +676,9 @@ double GetCPUCyclesPerSecond(CPUInfo::Scaling scaling) {
  }

  auto StartsWithKey = [](std::string const& Value, std::string const& Key) {
-    if (Key.size() > Value.size()) return false;
+    if (Key.size() > Value.size()) {
+      return false;
+    }
    auto Cmp = [&](char X, char Y) {
      return std::tolower(X) == std::tolower(Y);
    };
@ -673,22 +687,30 @@ double GetCPUCyclesPerSecond(CPUInfo::Scaling scaling) {

  std::string ln;
  while (std::getline(f, ln)) {
-    if (ln.empty()) continue;
+    if (ln.empty()) {
+      continue;
+    }
    std::size_t split_idx = ln.find(':');
    std::string value;
-    if (split_idx != std::string::npos) value = ln.substr(split_idx + 1);
+    if (split_idx != std::string::npos) {
+      value = ln.substr(split_idx + 1);
+    }
    // When parsing the "cpu MHz" and "bogomips" (fallback) entries, we only
    // accept positive values. Some environments (virtual machines) report zero,
    // which would cause infinite looping in WallTime_Init.
    if (StartsWithKey(ln, "cpu MHz")) {
      if (!value.empty()) {
        double cycles_per_second = benchmark::stod(value) * 1000000.0;
-        if (cycles_per_second > 0) return cycles_per_second;
+        if (cycles_per_second > 0) {
+          return cycles_per_second;
+        }
      }
    } else if (StartsWithKey(ln, "bogomips")) {
      if (!value.empty()) {
        bogo_clock = benchmark::stod(value) * 1000000.0;
-        if (bogo_clock < 0.0) bogo_clock = error_value;
+        if (bogo_clock < 0.0) {
+          bogo_clock = error_value;
+        }
      }
    }
  }
@ -704,7 +726,9 @@ double GetCPUCyclesPerSecond(CPUInfo::Scaling scaling) {
  // If we found the bogomips clock, but nothing better, we'll use it (but
  // we're not happy about it); otherwise, fallback to the rough estimation
  // below.
-  if (bogo_clock >= 0.0) return bogo_clock;
+  if (bogo_clock >= 0.0) {
+    return bogo_clock;
+  }

 #elif defined BENCHMARK_HAS_SYSCTL
  constexpr auto* freqStr =
@ -719,9 +743,13 @@ double GetCPUCyclesPerSecond(CPUInfo::Scaling scaling) {
 #endif
  unsigned long long hz = 0;
 #if defined BENCHMARK_OS_OPENBSD
-  if (GetSysctl(freqStr, &hz)) return hz * 1000000;
+  if (GetSysctl(freqStr, &hz)) {
+    return static_cast<double>(hz * 1000000);
+  }
 #else
-  if (GetSysctl(freqStr, &hz)) return hz;
+  if (GetSysctl(freqStr, &hz)) {
+    return static_cast<double>(hz);
+  }
 #endif
  fprintf(stderr, "Unable to determine clock rate from sysctl: %s: %s\n",
          freqStr, strerror(errno));
@ -737,9 +765,10 @@ double GetCPUCyclesPerSecond(CPUInfo::Scaling scaling) {
      SUCCEEDED(
          SHGetValueA(HKEY_LOCAL_MACHINE,
                      "HARDWARE\\DESCRIPTION\\System\\CentralProcessor\\0",
-                      "~MHz", nullptr, &data, &data_size)))
+                      "~MHz", nullptr, &data, &data_size))) {
    return static_cast<double>(static_cast<int64_t>(data) *
                               static_cast<int64_t>(1000 * 1000));  // was mhz
+  }
 #elif defined(BENCHMARK_OS_SOLARIS)
  kstat_ctl_t* kc = kstat_open();
  if (!kc) {
@ -771,8 +800,9 @@ double GetCPUCyclesPerSecond(CPUInfo::Scaling scaling) {
  kstat_close(kc);
  return clock_hz;
 #elif defined(BENCHMARK_OS_QNX)
-  return static_cast<double>((int64_t)(SYSPAGE_ENTRY(cpuinfo)->speed) *
-                             (int64_t)(1000 * 1000));
+  return static_cast<double>(
+      static_cast<int64_t>(SYSPAGE_ENTRY(cpuinfo)->speed) *
+      static_cast<int64_t>(1000 * 1000));
 #elif defined(BENCHMARK_OS_QURT)
  // QuRT doesn't provide any API to query Hexagon frequency.
  return 1000000000;
@ -820,7 +850,10 @@ std::vector<double> GetLoadAvg() {
     !(defined(__ANDROID__) && __ANDROID_API__ < 29)
   static constexpr int kMaxSamples = 3;
   std::vector<double> res(kMaxSamples, 0.0);
-  const int nelem = getloadavg(res.data(), kMaxSamples);
+  // getloadavg() returns -1 on failure. Clamp the signed result before
+  // converting to size_t so a failure still trips the `nelem < 1` check below.
+  const int num_samples = getloadavg(res.data(), kMaxSamples);
+  const size_t nelem = num_samples < 0 ? 0 : static_cast<size_t>(num_samples);
   if (nelem < 1) {
     res.clear();
   } else {
--- a/src/thread_manager.h
+++ b/src/thread_manager.h
@ -11,30 +11,15 @@ namespace internal {

 class ThreadManager {
 public:
-  explicit ThreadManager(int num_threads)
-      : alive_threads_(num_threads), start_stop_barrier_(num_threads) {}
+  explicit ThreadManager(int num_threads) : start_stop_barrier_(num_threads) {}

  Mutex& GetBenchmarkMutex() const RETURN_CAPABILITY(benchmark_mutex_) {
    return benchmark_mutex_;
  }

-  bool StartStopBarrier() EXCLUDES(end_cond_mutex_) {
-    return start_stop_barrier_.wait();
-  }
+  bool StartStopBarrier() { return start_stop_barrier_.wait(); }

-  void NotifyThreadComplete() EXCLUDES(end_cond_mutex_) {
-    start_stop_barrier_.removeThread();
-    if (--alive_threads_ == 0) {
-      MutexLock lock(end_cond_mutex_);
-      end_condition_.notify_all();
-    }
-  }
-
-  void WaitForAllThreads() EXCLUDES(end_cond_mutex_) {
-    MutexLock lock(end_cond_mutex_);
-    end_condition_.wait(lock.native_handle(),
-                        [this]() { return alive_threads_ == 0; });
-  }
+  void NotifyThreadComplete() { start_stop_barrier_.removeThread(); }

  struct Result {
    IterationCount iterations = 0;
@ -51,10 +36,7 @@ class ThreadManager {

 private:
  mutable Mutex benchmark_mutex_;
-  std::atomic<int> alive_threads_;
  Barrier start_stop_barrier_;
-  Mutex end_cond_mutex_;
-  Condition end_condition_;
 };

 }  // namespace internal
--- a/src/timers.cc
+++ b/src/timers.cc
@ -102,12 +102,14 @@ double MakeTime(thread_basic_info_data_t const& info) {
 #endif
 #if defined(CLOCK_PROCESS_CPUTIME_ID) || defined(CLOCK_THREAD_CPUTIME_ID)
 double MakeTime(struct timespec const& ts) {
-  return ts.tv_sec + (static_cast<double>(ts.tv_nsec) * 1e-9);
+  return static_cast<double>(ts.tv_sec) +
+         (static_cast<double>(ts.tv_nsec) * 1e-9);
 }
 #endif

-BENCHMARK_NORETURN static void DiagnoseAndExit(const char* msg) {
-  std::cerr << "ERROR: " << msg << std::endl;
+BENCHMARK_NORETURN void DiagnoseAndExit(const char* msg) {
+  std::cerr << "ERROR: " << msg << '\n';
+  std::flush(std::cerr);
  std::exit(EXIT_FAILURE);
 }

@ -125,8 +127,12 @@ double ProcessCPUUsage() {
    return MakeTime(kernel_time, user_time);
  DiagnoseAndExit("GetProccessTimes() failed");
 #elif defined(BENCHMARK_OS_QURT)
+  // Note that qurt_timer_get_ticks() is no longer documented as of SDK 5.3.0,
+  // and doesn't appear to work on at least some devices (eg Samsung S22),
+  // so let's use the actually-documented and apparently-equivalent
+  // qurt_sysclock_get_hw_ticks() call instead.
  return static_cast<double>(
-             qurt_timer_timetick_to_us(qurt_timer_get_ticks())) *
+             qurt_timer_timetick_to_us(qurt_sysclock_get_hw_ticks())) *
         1.0e-6;
 #elif defined(BENCHMARK_OS_EMSCRIPTEN)
  // clock_gettime(CLOCK_PROCESS_CPUTIME_ID, ...) returns 0 on Emscripten.
@ -137,9 +143,10 @@ double ProcessCPUUsage() {
 #elif defined(CLOCK_PROCESS_CPUTIME_ID) && !defined(BENCHMARK_OS_MACOSX)
  // FIXME We want to use clock_gettime, but its not available in MacOS 10.11.
  // See https://github.com/google/benchmark/pull/292
-  struct timespec spec;
-  if (clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &spec) == 0)
+  struct timespec spec {};
+  if (clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &spec) == 0) {
    return MakeTime(spec);
+  }
  DiagnoseAndExit("clock_gettime(CLOCK_PROCESS_CPUTIME_ID, ...) failed");
 #else
  struct rusage ru;
@ -159,8 +166,12 @@ double ThreadCPUUsage() {
                 &user_time);
  return MakeTime(kernel_time, user_time);
 #elif defined(BENCHMARK_OS_QURT)
+  // Note that qurt_timer_get_ticks() is no longer documented as of SDK 5.3.0,
+  // and doesn't appear to work on at least some devices (e.g. Samsung S22),
+  // so let's use the actually-documented and apparently-equivalent
+  // qurt_sysclock_get_hw_ticks() call instead.
  return static_cast<double>(
-             qurt_timer_timetick_to_us(qurt_timer_get_ticks())) *
+             qurt_timer_timetick_to_us(qurt_sysclock_get_hw_ticks())) *
         1.0e-6;
 #elif defined(BENCHMARK_OS_MACOSX)
  // FIXME We want to use clock_gettime, but its not available in MacOS 10.11.
@ -181,13 +192,18 @@ double ThreadCPUUsage() {
  // RTEMS doesn't support CLOCK_THREAD_CPUTIME_ID. See
  // https://github.com/RTEMS/rtems/blob/master/cpukit/posix/src/clockgettime.c
  return ProcessCPUUsage();
+#elif defined(BENCHMARK_OS_ZOS)
+  // z/OS doesn't support CLOCK_THREAD_CPUTIME_ID.
+  return ProcessCPUUsage();
 #elif defined(BENCHMARK_OS_SOLARIS)
  struct rusage ru;
  if (getrusage(RUSAGE_LWP, &ru) == 0) return MakeTime(ru);
  DiagnoseAndExit("getrusage(RUSAGE_LWP, ...) failed");
 #elif defined(CLOCK_THREAD_CPUTIME_ID)
-  struct timespec ts;
-  if (clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts) == 0) return MakeTime(ts);
+  struct timespec ts {};
+  if (clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts) == 0) {
+    return MakeTime(ts);
+  }
  DiagnoseAndExit("clock_gettime(CLOCK_THREAD_CPUTIME_ID, ...) failed");
 #else
 #error Per-thread timing is not available on your system.
@ -201,9 +217,9 @@ std::string LocalDateTimeString() {
  const std::size_t kTzOffsetLen = 6;
  const std::size_t kTimestampLen = 19;

-  std::size_t tz_len;
-  std::size_t timestamp_len;
-  long int offset_minutes;
+  std::size_t tz_len = 0;
+  std::size_t timestamp_len = 0;
+  long int offset_minutes = 0;
  char tz_offset_sign = '+';
  // tz_offset is set in one of three ways:
  // * strftime with %z - This either returns empty or the ISO 8601 time.  The
@ -223,7 +239,7 @@ std::string LocalDateTimeString() {
 #if defined(BENCHMARK_OS_WINDOWS)
  std::tm* timeinfo_p = ::localtime(&now);
 #else
-  std::tm timeinfo;
+  std::tm timeinfo{};
  std::tm* timeinfo_p = &timeinfo;
  ::localtime_r(&now, &timeinfo);
 #endif
@ -241,9 +257,9 @@ std::string LocalDateTimeString() {
      tz_offset_sign = '-';
    }

-    tz_len =
+    tz_len = static_cast<size_t>(
        ::snprintf(tz_offset, sizeof(tz_offset), "%c%02li:%02li",
-                   tz_offset_sign, offset_minutes / 100, offset_minutes % 100);
+                   tz_offset_sign, offset_minutes / 100, offset_minutes % 100));
    BM_CHECK(tz_len == kTzOffsetLen);
    ((void)tz_len);  // Prevent unused variable warning in optimized build.
  } else {
--- a/src/timers.h
+++ b/src/timers.h
@ -15,6 +15,29 @@ double ChildrenCPUUsage();
 // Return the CPU usage of the current thread
 double ThreadCPUUsage();

+#if defined(BENCHMARK_OS_QURT)
+
+// The std::chrono clocks' now() can return 0 on some Hexagon devices;
+// this reads the value of a 56-bit, 19.2MHz hardware counter
+// and converts it to seconds. Unlike std::chrono, this doesn't
+// return an absolute time, but since ChronoClockNow() is only used
+// to compute elapsed time, this shouldn't matter.
+struct QuRTClock {
+  typedef uint64_t rep;
+  typedef std::ratio<1, 19200000> period;
+  typedef std::chrono::duration<rep, period> duration;
+  typedef std::chrono::time_point<QuRTClock> time_point;
+  static const bool is_steady = false;
+
+  static time_point now() {
+    unsigned long long count;
+    asm volatile(" %0 = c31:30 " : "=r"(count));
+    return time_point(static_cast<duration>(count));
+  }
+};
+
+#else
+
 #if defined(HAVE_STEADY_CLOCK)
 template <bool HighResIsSteady = std::chrono::high_resolution_clock::is_steady>
 struct ChooseSteadyClock {
@ -25,10 +48,14 @@ template <>
 struct ChooseSteadyClock<false> {
  typedef std::chrono::steady_clock type;
 };
+#endif  // HAVE_STEADY_CLOCK
+
 #endif

 struct ChooseClockType {
-#if defined(HAVE_STEADY_CLOCK)
+#if defined(BENCHMARK_OS_QURT)
+  typedef QuRTClock type;
+#elif defined(HAVE_STEADY_CLOCK)
  typedef ChooseSteadyClock<>::type type;
 #else
  typedef std::chrono::high_resolution_clock type;
--- a/test/BUILD
+++ b/test/BUILD
@ -10,7 +10,7 @@ platform(
 TEST_COPTS = [
    "-pedantic",
    "-pedantic-errors",
-    "-std=c++11",
+    "-std=c++17",
    "-Wall",
    "-Wconversion",
    "-Wextra",
@ -18,6 +18,14 @@ TEST_COPTS = [
    #    "-Wshorten-64-to-32",
    "-Wfloat-equal",
    "-fstrict-aliasing",
+    ## assert() is used a lot in tests upstream; it may be optimised out, leading to
+    ## unused-variable warnings.
+    "-Wno-unused-variable",
+    "-Werror=old-style-cast",
+]
+
+TEST_MSVC_OPTS = [
+    "/std:c++17",
 ]

 # Some of the issues with DoNotOptimize only occur when optimization is enabled
@ -32,6 +40,7 @@ PER_SRC_TEST_ARGS = {
    "repetitions_test.cc": [" --benchmark_repetitions=3"],
    "spec_arg_test.cc": ["--benchmark_filter=BM_NotChosen"],
    "spec_arg_verbosity_test.cc": ["--v=42"],
+    "complexity_test.cc": ["--benchmark_min_time=1000000x"],
 }

 cc_library(
@ -40,7 +49,7 @@ cc_library(
    srcs = ["output_test_helper.cc"],
    hdrs = ["output_test.h"],
    copts = select({
-        "//:windows": [],
+        "//:windows": TEST_MSVC_OPTS,
        "//conditions:default": TEST_COPTS,
    }),
    deps = [
@ -56,7 +65,7 @@ cc_library(
        size = "small",
        srcs = [test_src],
        copts = select({
-            "//:windows": [],
+            "//:windows": TEST_MSVC_OPTS,
            "//conditions:default": TEST_COPTS,
        }) + PER_SRC_COPTS.get(test_src, []),
        deps = [
@ -77,7 +86,7 @@ cc_library(
        srcs = [test_src],
        args = TEST_ARGS + PER_SRC_TEST_ARGS.get(test_src, []),
        copts = select({
-            "//:windows": [],
+            "//:windows": TEST_MSVC_OPTS,
            "//conditions:default": TEST_COPTS,
        }) + PER_SRC_COPTS.get(test_src, []),
        deps = [
@ -93,25 +102,24 @@ cc_library(
        ["*_test.cc"],
        exclude = [
            "*_assembly_test.cc",
-            "cxx03_test.cc",
+            "cxx11_test.cc",
            "link_main_test.cc",
        ],
    )
 ]

 cc_test(
-    name = "cxx03_test",
+    name = "cxx11_test",
    size = "small",
-    srcs = ["cxx03_test.cc"],
-    copts = TEST_COPTS + ["-std=c++03"],
+    srcs = ["cxx11_test.cc"],
+    copts = TEST_COPTS + ["-std=c++11"],
    target_compatible_with = select({
        "//:windows": ["@platforms//:incompatible"],
        "//conditions:default": [],
    }),
    deps = [
        ":output_test_helper",
-        "//:benchmark",
-        "//:benchmark_internal_headers",
+        "//:benchmark_main",
    ],
 )

@ -120,7 +128,7 @@ cc_test(
    size = "small",
    srcs = ["link_main_test.cc"],
    copts = select({
-        "//:windows": [],
+        "//:windows": TEST_MSVC_OPTS,
        "//conditions:default": TEST_COPTS,
    }),
    deps = ["//:benchmark_main"],
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@ -1,10 +1,12 @@
-# Enable the tests
+#Enable the tests

 set(THREADS_PREFER_PTHREAD_FLAG ON)

 find_package(Threads REQUIRED)
 include(CheckCXXCompilerFlag)

+add_cxx_compiler_flag(-Wno-unused-variable)
+
 # NOTE: Some tests use `<cassert>` to perform the test. Therefore we must
 # strip -DNDEBUG from the default CMake flags in DEBUG mode.
 string(TOUPPER "${CMAKE_BUILD_TYPE}" uppercase_CMAKE_BUILD_TYPE)
@ -62,30 +64,50 @@ macro(compile_output_test name)
          ${BENCHMARK_CXX_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT})
 endmacro(compile_output_test)

+macro(benchmark_add_test)
+  add_test(${ARGV})
+  if(WIN32 AND BUILD_SHARED_LIBS)
+    cmake_parse_arguments(TEST "" "NAME" "" ${ARGN})
+    set_tests_properties(${TEST_NAME} PROPERTIES ENVIRONMENT_MODIFICATION "PATH=path_list_prepend:$<TARGET_FILE_DIR:benchmark::benchmark>")
+  endif()
+endmacro(benchmark_add_test)
+
 # Demonstration executable
+
+compile_benchmark_test_with_main(cxx11_test)
+if(DEFINED MSVC)
+  # MSVC does not really support C++11.
+  set_property(TARGET cxx11_test PROPERTY CXX_STANDARD 14)
+else()
+  set_property(TARGET cxx11_test PROPERTY CXX_STANDARD 11)
+endif()
+set_property(TARGET cxx11_test PROPERTY CXX_STANDARD_REQUIRED ON)
+set_property(TARGET cxx11_test PROPERTY CXX_EXTENSIONS OFF)
+benchmark_add_test(NAME cxx11_test COMMAND cxx11_test --benchmark_min_time=0.01s)
+
 compile_benchmark_test(benchmark_test)
-add_test(NAME benchmark COMMAND benchmark_test --benchmark_min_time=0.01s)
+benchmark_add_test(NAME benchmark COMMAND benchmark_test --benchmark_min_time=0.01s)

 compile_benchmark_test(spec_arg_test)
-add_test(NAME spec_arg COMMAND spec_arg_test --benchmark_filter=BM_NotChosen)
+benchmark_add_test(NAME spec_arg COMMAND spec_arg_test --benchmark_filter=BM_NotChosen)

 compile_benchmark_test(spec_arg_verbosity_test)
-add_test(NAME spec_arg_verbosity COMMAND spec_arg_verbosity_test --v=42)
+benchmark_add_test(NAME spec_arg_verbosity COMMAND spec_arg_verbosity_test --v=42)

 compile_benchmark_test(benchmark_setup_teardown_test)
-add_test(NAME benchmark_setup_teardown COMMAND benchmark_setup_teardown_test)
+benchmark_add_test(NAME benchmark_setup_teardown COMMAND benchmark_setup_teardown_test)

 compile_benchmark_test(filter_test)
 macro(add_filter_test name filter expect)
-  add_test(NAME ${name} COMMAND filter_test --benchmark_min_time=0.01s --benchmark_filter=${filter} ${expect})
-  add_test(NAME ${name}_list_only COMMAND filter_test --benchmark_list_tests --benchmark_filter=${filter} ${expect})
+  benchmark_add_test(NAME ${name} COMMAND filter_test --benchmark_min_time=0.01s --benchmark_filter=${filter} ${expect})
+  benchmark_add_test(NAME ${name}_list_only COMMAND filter_test --benchmark_list_tests --benchmark_filter=${filter} ${expect})
 endmacro(add_filter_test)

 compile_benchmark_test(benchmark_min_time_flag_time_test)
-add_test(NAME min_time_flag_time COMMAND benchmark_min_time_flag_time_test)
+benchmark_add_test(NAME min_time_flag_time COMMAND benchmark_min_time_flag_time_test)

 compile_benchmark_test(benchmark_min_time_flag_iters_test)
-add_test(NAME min_time_flag_iters COMMAND benchmark_min_time_flag_iters_test)
+benchmark_add_test(NAME min_time_flag_iters COMMAND benchmark_min_time_flag_iters_test)

 add_filter_test(filter_simple "Foo" 3)
 add_filter_test(filter_simple_negative "-Foo" 2)
@ -107,19 +129,19 @@ add_filter_test(filter_regex_end ".*Ba$" 1)
 add_filter_test(filter_regex_end_negative "-.*Ba$" 4)

 compile_benchmark_test(options_test)
-add_test(NAME options_benchmarks COMMAND options_test --benchmark_min_time=0.01s)
+benchmark_add_test(NAME options_benchmarks COMMAND options_test --benchmark_min_time=0.01s)

 compile_benchmark_test(basic_test)
-add_test(NAME basic_benchmark COMMAND basic_test --benchmark_min_time=0.01s)
+benchmark_add_test(NAME basic_benchmark COMMAND basic_test --benchmark_min_time=0.01s)

 compile_output_test(repetitions_test)
-add_test(NAME repetitions_benchmark COMMAND repetitions_test --benchmark_min_time=0.01s --benchmark_repetitions=3)
+benchmark_add_test(NAME repetitions_benchmark COMMAND repetitions_test --benchmark_min_time=0.01s --benchmark_repetitions=3)

 compile_benchmark_test(diagnostics_test)
-add_test(NAME diagnostics_test COMMAND diagnostics_test --benchmark_min_time=0.01s)
+benchmark_add_test(NAME diagnostics_test COMMAND diagnostics_test --benchmark_min_time=0.01s)

 compile_benchmark_test(skip_with_error_test)
-add_test(NAME skip_with_error_test COMMAND skip_with_error_test --benchmark_min_time=0.01s)
+benchmark_add_test(NAME skip_with_error_test COMMAND skip_with_error_test --benchmark_min_time=0.01s)

 compile_benchmark_test(donotoptimize_test)
 # Enable errors for deprecated deprecations (DoNotOptimize(Tp const& value)).
@ -132,90 +154,70 @@ check_cxx_compiler_flag(-O3 BENCHMARK_HAS_O3_FLAG)
 if (BENCHMARK_HAS_O3_FLAG)
  set_target_properties(donotoptimize_test PROPERTIES COMPILE_FLAGS "-O3")
 endif()
-add_test(NAME donotoptimize_test COMMAND donotoptimize_test --benchmark_min_time=0.01s)
+benchmark_add_test(NAME donotoptimize_test COMMAND donotoptimize_test --benchmark_min_time=0.01s)

 compile_benchmark_test(fixture_test)
-add_test(NAME fixture_test COMMAND fixture_test --benchmark_min_time=0.01s)
+benchmark_add_test(NAME fixture_test COMMAND fixture_test --benchmark_min_time=0.01s)

 compile_benchmark_test(register_benchmark_test)
-add_test(NAME register_benchmark_test COMMAND register_benchmark_test --benchmark_min_time=0.01s)
+benchmark_add_test(NAME register_benchmark_test COMMAND register_benchmark_test --benchmark_min_time=0.01s)

 compile_benchmark_test(map_test)
-add_test(NAME map_test COMMAND map_test --benchmark_min_time=0.01s)
+benchmark_add_test(NAME map_test COMMAND map_test --benchmark_min_time=0.01s)

 compile_benchmark_test(multiple_ranges_test)
-add_test(NAME multiple_ranges_test COMMAND multiple_ranges_test --benchmark_min_time=0.01s)
+benchmark_add_test(NAME multiple_ranges_test COMMAND multiple_ranges_test --benchmark_min_time=0.01s)

 compile_benchmark_test(args_product_test)
-add_test(NAME args_product_test COMMAND args_product_test --benchmark_min_time=0.01s)
+benchmark_add_test(NAME args_product_test COMMAND args_product_test --benchmark_min_time=0.01s)

 compile_benchmark_test_with_main(link_main_test)
-add_test(NAME link_main_test COMMAND link_main_test --benchmark_min_time=0.01s)
+benchmark_add_test(NAME link_main_test COMMAND link_main_test --benchmark_min_time=0.01s)

 compile_output_test(reporter_output_test)
-add_test(NAME reporter_output_test COMMAND reporter_output_test --benchmark_min_time=0.01s)
+benchmark_add_test(NAME reporter_output_test COMMAND reporter_output_test --benchmark_min_time=0.01s)

 compile_output_test(templated_fixture_test)
-add_test(NAME templated_fixture_test COMMAND templated_fixture_test --benchmark_min_time=0.01s)
+benchmark_add_test(NAME templated_fixture_test COMMAND templated_fixture_test --benchmark_min_time=0.01s)
+
+compile_output_test(templated_fixture_method_test)
+benchmark_add_test(NAME templated_fixture_method_test COMMAND templated_fixture_method_test --benchmark_min_time=0.01s)

 compile_output_test(user_counters_test)
-add_test(NAME user_counters_test COMMAND user_counters_test --benchmark_min_time=0.01s)
+benchmark_add_test(NAME user_counters_test COMMAND user_counters_test --benchmark_min_time=0.01s)

 compile_output_test(perf_counters_test)
-add_test(NAME perf_counters_test COMMAND perf_counters_test --benchmark_min_time=0.01s --benchmark_perf_counters=CYCLES,BRANCHES)
+benchmark_add_test(NAME perf_counters_test COMMAND perf_counters_test --benchmark_min_time=0.01s --benchmark_perf_counters=CYCLES,INSTRUCTIONS)

 compile_output_test(internal_threading_test)
-add_test(NAME internal_threading_test COMMAND internal_threading_test --benchmark_min_time=0.01s)
+benchmark_add_test(NAME internal_threading_test COMMAND internal_threading_test --benchmark_min_time=0.01s)
+
+compile_output_test(manual_threading_test)
+benchmark_add_test(NAME manual_threading_test COMMAND manual_threading_test --benchmark_min_time=0.01s)

 compile_output_test(report_aggregates_only_test)
-add_test(NAME report_aggregates_only_test COMMAND report_aggregates_only_test --benchmark_min_time=0.01s)
+benchmark_add_test(NAME report_aggregates_only_test COMMAND report_aggregates_only_test --benchmark_min_time=0.01s)

 compile_output_test(display_aggregates_only_test)
-add_test(NAME display_aggregates_only_test COMMAND display_aggregates_only_test --benchmark_min_time=0.01s)
+benchmark_add_test(NAME display_aggregates_only_test COMMAND display_aggregates_only_test --benchmark_min_time=0.01s)

 compile_output_test(user_counters_tabular_test)
-add_test(NAME user_counters_tabular_test COMMAND user_counters_tabular_test --benchmark_counters_tabular=true --benchmark_min_time=0.01s)
+benchmark_add_test(NAME user_counters_tabular_test COMMAND user_counters_tabular_test --benchmark_counters_tabular=true --benchmark_min_time=0.01s)

 compile_output_test(user_counters_thousands_test)
-add_test(NAME user_counters_thousands_test COMMAND user_counters_thousands_test --benchmark_min_time=0.01s)
+benchmark_add_test(NAME user_counters_thousands_test COMMAND user_counters_thousands_test --benchmark_min_time=0.01s)

 compile_output_test(memory_manager_test)
-add_test(NAME memory_manager_test COMMAND memory_manager_test --benchmark_min_time=0.01s)
+benchmark_add_test(NAME memory_manager_test COMMAND memory_manager_test --benchmark_min_time=0.01s)

-# MSVC does not allow to set the language standard to C++98/03.
-if(NOT CMAKE_CXX_COMPILER_ID STREQUAL "MSVC")
-  compile_benchmark_test(cxx03_test)
-  set_target_properties(cxx03_test
-      PROPERTIES
-      CXX_STANDARD 98
-      CXX_STANDARD_REQUIRED YES)
-  # libstdc++ provides different definitions within <map> between dialects. When
-  # LTO is enabled and -Werror is specified GCC diagnoses this ODR violation
-  # causing the test to fail to compile. To prevent this we explicitly disable
-  # the warning.
-  check_cxx_compiler_flag(-Wno-odr BENCHMARK_HAS_WNO_ODR)
-  check_cxx_compiler_flag(-Wno-lto-type-mismatch BENCHMARK_HAS_WNO_LTO_TYPE_MISMATCH)
-  # Cannot set_target_properties multiple times here because the warnings will
-  # be overwritten on each call
-  set (DISABLE_LTO_WARNINGS "")
-  if (BENCHMARK_HAS_WNO_ODR)
-    set(DISABLE_LTO_WARNINGS "${DISABLE_LTO_WARNINGS} -Wno-odr")
-  endif()
-  if (BENCHMARK_HAS_WNO_LTO_TYPE_MISMATCH)
-    set(DISABLE_LTO_WARNINGS "${DISABLE_LTO_WARNINGS} -Wno-lto-type-mismatch")
-  endif()
-  set_target_properties(cxx03_test PROPERTIES LINK_FLAGS "${DISABLE_LTO_WARNINGS}")
-  add_test(NAME cxx03 COMMAND cxx03_test --benchmark_min_time=0.01s)
-endif()
+compile_output_test(profiler_manager_test)
+benchmark_add_test(NAME profiler_manager_test COMMAND profiler_manager_test --benchmark_min_time=0.01s)
+
+compile_benchmark_test(profiler_manager_iterations_test)
+benchmark_add_test(NAME profiler_manager_iterations COMMAND profiler_manager_iterations_test)

-# Attempt to work around flaky test failures when running on Appveyor servers.
-if (DEFINED ENV{APPVEYOR})
-  set(COMPLEXITY_MIN_TIME "0.5s")
-else()
-  set(COMPLEXITY_MIN_TIME "0.01s")
-endif()
 compile_output_test(complexity_test)
-add_test(NAME complexity_benchmark COMMAND complexity_test --benchmark_min_time=${COMPLEXITY_MIN_TIME})
+benchmark_add_test(NAME complexity_benchmark COMMAND complexity_test --benchmark_min_time=1000000x)

 ###############################################################################
 # GoogleTest Unit Tests
@ -230,7 +232,12 @@ if (BENCHMARK_ENABLE_GTEST_TESTS)

  macro(add_gtest name)
    compile_gtest(${name})
-    add_test(NAME ${name} COMMAND ${name})
+    benchmark_add_test(NAME ${name} COMMAND ${name})
+    if(WIN32 AND BUILD_SHARED_LIBS)
+      set_tests_properties(${name} PROPERTIES
+        ENVIRONMENT_MODIFICATION "PATH=path_list_prepend:$<TARGET_FILE_DIR:benchmark::benchmark>;PATH=path_list_prepend:$<TARGET_FILE_DIR:gmock_main>"
+      )
+    endif()
  endmacro()

  add_gtest(benchmark_gtest)
@ -242,6 +249,9 @@ if (BENCHMARK_ENABLE_GTEST_TESTS)
  add_gtest(perf_counters_gtest)
  add_gtest(time_unit_gtest)
  add_gtest(min_time_parse_gtest)
+  add_gtest(profiler_manager_gtest)
+  add_gtest(benchmark_setup_teardown_cb_types_gtest)
+  add_gtest(memory_results_gtest)
 endif(BENCHMARK_ENABLE_GTEST_TESTS)

 ###############################################################################
@ -283,7 +293,7 @@ if (${CMAKE_BUILD_TYPE_LOWER} MATCHES "coverage")
      COMMAND ${LCOV} -q -a before.lcov -a after.lcov --output-file final.lcov
      COMMAND ${LCOV} -q -r final.lcov "'${CMAKE_SOURCE_DIR}/test/*'" -o final.lcov
      COMMAND ${GENHTML} final.lcov -o lcov --demangle-cpp --sort -p "${CMAKE_BINARY_DIR}" -t benchmark
-      DEPENDS filter_test benchmark_test options_test basic_test fixture_test cxx03_test complexity_test
+      DEPENDS filter_test benchmark_test options_test basic_test fixture_test complexity_test
      WORKING_DIRECTORY ${CMAKE_BINARY_DIR}
      COMMENT "Running LCOV"
    )
--- a/test/basic_test.cc
+++ b/test/basic_test.cc
@ -5,7 +5,8 @@

 void BM_empty(benchmark::State& state) {
  for (auto _ : state) {
-    auto iterations = state.iterations();
+    auto iterations = static_cast<double>(state.iterations()) *
+                      static_cast<double>(state.iterations());
    benchmark::DoNotOptimize(iterations);
  }
 }
@ -142,7 +143,6 @@ void BM_RangedFor(benchmark::State& state) {
 }
 BENCHMARK(BM_RangedFor);

-#ifdef BENCHMARK_HAS_CXX11
 template <typename T>
 void BM_OneTemplateFunc(benchmark::State& state) {
  auto arg = state.range(0);
@ -167,8 +167,6 @@ void BM_TwoTemplateFunc(benchmark::State& state) {
 BENCHMARK(BM_TwoTemplateFunc<int, double>)->Arg(1);
 BENCHMARK(BM_TwoTemplateFunc<double, int>)->Arg(1);

-#endif  // BENCHMARK_HAS_CXX11
-
 // Ensure that StateIterator provides all the necessary typedefs required to
 // instantiate std::iterator_traits.
 static_assert(
--- a/test/benchmark_gtest.cc
+++ b/test/benchmark_gtest.cc
@ -38,7 +38,7 @@ TEST(AddRangeTest, Advanced64) {

 TEST(AddRangeTest, FullRange8) {
  std::vector<int8_t> dst;
-  AddRange(&dst, int8_t{1}, std::numeric_limits<int8_t>::max(), int8_t{8});
+  AddRange(&dst, int8_t{1}, std::numeric_limits<int8_t>::max(), 8);
  EXPECT_THAT(
      dst, testing::ElementsAre(int8_t{1}, int8_t{8}, int8_t{64}, int8_t{127}));
 }
--- a/test/benchmark_min_time_flag_iters_test.cc
+++ b/test/benchmark_min_time_flag_iters_test.cc
@ -13,11 +13,11 @@ namespace {

 class TestReporter : public benchmark::ConsoleReporter {
 public:
-  virtual bool ReportContext(const Context& context) BENCHMARK_OVERRIDE {
+  bool ReportContext(const Context& context) override {
    return ConsoleReporter::ReportContext(context);
  };

-  virtual void ReportRuns(const std::vector<Run>& report) BENCHMARK_OVERRIDE {
+  void ReportRuns(const std::vector<Run>& report) override {
    assert(report.size() == 1);
    iter_nums_.push_back(report[0].iterations);
    ConsoleReporter::ReportRuns(report);
@ -25,7 +25,7 @@ class TestReporter : public benchmark::ConsoleReporter {

  TestReporter() {}

-  virtual ~TestReporter() {}
+  ~TestReporter() override {}

  const std::vector<benchmark::IterationCount>& GetIters() const {
    return iter_nums_;
@ -46,11 +46,13 @@ BENCHMARK(BM_MyBench);
 int main(int argc, char** argv) {
  // Make a fake argv and append the new --benchmark_min_time=<foo> to it.
  int fake_argc = argc + 1;
-  const char** fake_argv = new const char*[static_cast<size_t>(fake_argc)];
-  for (int i = 0; i < argc; ++i) fake_argv[i] = argv[i];
-  fake_argv[argc] = "--benchmark_min_time=4x";
+  std::vector<const char*> fake_argv(static_cast<size_t>(fake_argc));
+  for (size_t i = 0; i < static_cast<size_t>(argc); ++i) {
+    fake_argv[i] = argv[i];
+  }
+  fake_argv[static_cast<size_t>(argc)] = "--benchmark_min_time=4x";

-  benchmark::Initialize(&fake_argc, const_cast<char**>(fake_argv));
+  benchmark::Initialize(&fake_argc, const_cast<char**>(fake_argv.data()));

  TestReporter test_reporter;
  const size_t returned_count =
@ -61,6 +63,5 @@ int main(int argc, char** argv) {
  const std::vector<benchmark::IterationCount> iters = test_reporter.GetIters();
  assert(!iters.empty() && iters[0] == 4);

-  delete[] fake_argv;
  return 0;
 }
--- a/test/benchmark_min_time_flag_time_test.cc
+++ b/test/benchmark_min_time_flag_time_test.cc
@ -19,23 +19,23 @@ typedef int64_t IterationCount;

 class TestReporter : public benchmark::ConsoleReporter {
 public:
-  virtual bool ReportContext(const Context& context) BENCHMARK_OVERRIDE {
+  bool ReportContext(const Context& context) override {
    return ConsoleReporter::ReportContext(context);
  };

-  virtual void ReportRuns(const std::vector<Run>& report) BENCHMARK_OVERRIDE {
+  void ReportRuns(const std::vector<Run>& report) override {
    assert(report.size() == 1);
    ConsoleReporter::ReportRuns(report);
  };

-  virtual void ReportRunsConfig(double min_time, bool /* has_explicit_iters */,
-                                IterationCount /* iters */) BENCHMARK_OVERRIDE {
+  void ReportRunsConfig(double min_time, bool /* has_explicit_iters */,
+                        IterationCount /* iters */) override {
    min_times_.push_back(min_time);
  }

  TestReporter() {}

-  virtual ~TestReporter() {}
+  ~TestReporter() override {}

  const std::vector<double>& GetMinTimes() const { return min_times_; }

@ -71,20 +71,21 @@ BENCHMARK(BM_MyBench);
 int main(int argc, char** argv) {
  // Make a fake argv and append the new --benchmark_min_time=<foo> to it.
  int fake_argc = argc + 1;
-  const char** fake_argv = new const char*[static_cast<size_t>(fake_argc)];
+  std::vector<const char*> fake_argv(static_cast<size_t>(fake_argc));

-  for (int i = 0; i < argc; ++i) fake_argv[i] = argv[i];
+  for (size_t i = 0; i < static_cast<size_t>(argc); ++i) {
+    fake_argv[i] = argv[i];
+  }

  const char* no_suffix = "--benchmark_min_time=4";
  const char* with_suffix = "--benchmark_min_time=4.0s";
  double expected = 4.0;

-  fake_argv[argc] = no_suffix;
-  DoTestHelper(&fake_argc, fake_argv, expected);
+  fake_argv[static_cast<size_t>(argc)] = no_suffix;
+  DoTestHelper(&fake_argc, fake_argv.data(), expected);

-  fake_argv[argc] = with_suffix;
-  DoTestHelper(&fake_argc, fake_argv, expected);
+  fake_argv[static_cast<size_t>(argc)] = with_suffix;
+  DoTestHelper(&fake_argc, fake_argv.data(), expected);

-  delete[] fake_argv;
  return 0;
 }
--- a/test/benchmark_random_interleaving_gtest.cc
+++ b/test/benchmark_random_interleaving_gtest.cc
@ -34,7 +34,8 @@ class EventQueue : public std::queue<std::string> {
  }
 };

-EventQueue* queue = new EventQueue();
+// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
+EventQueue* const queue = new EventQueue();

 class NullReporter : public BenchmarkReporter {
 public:
@ -48,7 +49,7 @@ class BenchmarkTest : public testing::Test {

  static void TeardownHook(int /* num_threads */) { queue->push("Teardown"); }

-  void Execute(const std::string& pattern) {
+  static void Execute(const std::string& pattern) {
    queue->Clear();

    std::unique_ptr<BenchmarkReporter> reporter(new NullReporter());
--- a/test/benchmark_setup_teardown_cb_types_gtest.cc
+++ b/test/benchmark_setup_teardown_cb_types_gtest.cc
@ -0,0 +1,126 @@
+#include "benchmark/benchmark.h"
+#include "gtest/gtest.h"
+
+using benchmark::BenchmarkReporter;
+using benchmark::callback_function;
+using benchmark::ClearRegisteredBenchmarks;
+using benchmark::RegisterBenchmark;
+using benchmark::RunSpecifiedBenchmarks;
+using benchmark::State;
+using benchmark::internal::Benchmark;
+
+static int functor_called = 0;
+struct Functor {
+  void operator()(const benchmark::State& /*unused*/) { functor_called++; }
+};
+
+class NullReporter : public BenchmarkReporter {
+ public:
+  bool ReportContext(const Context& /*context*/) override { return true; }
+  void ReportRuns(const std::vector<Run>& /* report */) override {}
+};
+
+class BenchmarkTest : public testing::Test {
+ public:
+  Benchmark* bm;
+  NullReporter null_reporter;
+
+  int setup_calls;
+  int teardown_calls;
+
+  void SetUp() override {
+    setup_calls = 0;
+    teardown_calls = 0;
+    functor_called = 0;
+
+    bm = RegisterBenchmark("BM", [](State& st) {
+      for (auto _ : st) {
+      }
+    });
+    bm->Iterations(1);
+  }
+
+  void TearDown() override { ClearRegisteredBenchmarks(); }
+};
+
+// Test that Setup/Teardown can correctly take a lambda expression (copied)
+TEST_F(BenchmarkTest, LambdaTestCopy) {
+  auto setup_lambda = [this](const State&) { setup_calls++; };
+  auto teardown_lambda = [this](const State&) { teardown_calls++; };
+  bm->Setup(setup_lambda);
+  bm->Teardown(teardown_lambda);
+  RunSpecifiedBenchmarks(&null_reporter);
+  EXPECT_EQ(setup_calls, 1);
+  EXPECT_EQ(teardown_calls, 1);
+}
+
+// Test that Setup/Teardown can correctly take a lambda expression (moved)
+TEST_F(BenchmarkTest, LambdaTestMove) {
+  auto setup_lambda = [this](const State&) { setup_calls++; };
+  auto teardown_lambda = [this](const State&) { teardown_calls++; };
+  bm->Setup(std::move(setup_lambda));
+  bm->Teardown(std::move(teardown_lambda));
+  RunSpecifiedBenchmarks(&null_reporter);
+  EXPECT_EQ(setup_calls, 1);
+  EXPECT_EQ(teardown_calls, 1);
+}
+
+// Test that Setup/Teardown can correctly take std::function
+TEST_F(BenchmarkTest, CallbackFunctionCopy) {
+  callback_function setup_lambda = [this](const State&) { setup_calls++; };
+  callback_function teardown_lambda = [this](const State&) {
+    teardown_calls++;
+  };
+  bm->Setup(setup_lambda);
+  bm->Teardown(teardown_lambda);
+  RunSpecifiedBenchmarks(&null_reporter);
+  EXPECT_EQ(setup_calls, 1);
+  EXPECT_EQ(teardown_calls, 1);
+}
+
+// Test that Setup/Teardown can correctly take std::function
+TEST_F(BenchmarkTest, CallbackFunctionMove) {
+  callback_function setup_lambda = [this](const State&) { setup_calls++; };
+  callback_function teardown_lambda = [this](const State&) {
+    teardown_calls++;
+  };
+  bm->Setup(std::move(setup_lambda));
+  bm->Teardown(std::move(teardown_lambda));
+  RunSpecifiedBenchmarks(&null_reporter);
+  EXPECT_EQ(setup_calls, 1);
+  EXPECT_EQ(teardown_calls, 1);
+}
+
+// Test that Setup/Teardown can correctly take functors
+TEST_F(BenchmarkTest, FunctorCopy) {
+  Functor func;
+  bm->Setup(func);
+  bm->Teardown(func);
+  RunSpecifiedBenchmarks(&null_reporter);
+  EXPECT_EQ(functor_called, 2);
+}
+
+// Test that Setup/Teardown can correctly take functors
+TEST_F(BenchmarkTest, FunctorMove) {
+  Functor func1;
+  Functor func2;
+  bm->Setup(std::move(func1));
+  bm->Teardown(std::move(func2));
+  RunSpecifiedBenchmarks(&null_reporter);
+  EXPECT_EQ(functor_called, 2);
+}
+
+// Test that Setup/Teardown cannot take nullptr
+TEST_F(BenchmarkTest, NullptrTest) {
+#if GTEST_HAS_DEATH_TEST
+  // Tests only runnable in debug mode (when BM_CHECK is enabled).
+#ifndef NDEBUG
+#ifndef TEST_BENCHMARK_LIBRARY_HAS_NO_ASSERTIONS
+  EXPECT_DEATH(bm->Setup(nullptr), "setup != nullptr");
+  EXPECT_DEATH(bm->Teardown(nullptr), "teardown != nullptr");
+#else
+  GTEST_SKIP() << "Test skipped because BM_CHECK is disabled";
+#endif
+#endif
+#endif
+}
--- a/test/benchmark_setup_teardown_test.cc
+++ b/test/benchmark_setup_teardown_test.cc
@ -10,10 +10,12 @@

 // Test that Setup() and Teardown() are called exactly once
 // for each benchmark run (single-threaded).
+namespace {
 namespace singlethreaded {
 static int setup_call = 0;
 static int teardown_call = 0;
 }  // namespace singlethreaded
+}  // namespace
 static void DoSetup1(const benchmark::State& state) {
  ++singlethreaded::setup_call;

@ -40,11 +42,13 @@ BENCHMARK(BM_with_setup)
    ->Teardown(DoTeardown1);

 // Test that Setup() and Teardown() are called once for each group of threads.
+namespace {
 namespace concurrent {
 static std::atomic<int> setup_call(0);
 static std::atomic<int> teardown_call(0);
 static std::atomic<int> func_call(0);
 }  // namespace concurrent
+}  // namespace

 static void DoSetup2(const benchmark::State& state) {
  concurrent::setup_call.fetch_add(1, std::memory_order_acquire);
@ -71,16 +75,18 @@ BENCHMARK(BM_concurrent)
    ->Threads(15);

 // Testing interaction with Fixture::Setup/Teardown
+namespace {
 namespace fixture_interaction {
 int setup = 0;
 int fixture_setup = 0;
 }  // namespace fixture_interaction
+}  // namespace

 #define FIXTURE_BECHMARK_NAME MyFixture

 class FIXTURE_BECHMARK_NAME : public ::benchmark::Fixture {
 public:
-  void SetUp(const ::benchmark::State&) override {
+  void SetUp(const ::benchmark::State& /*unused*/) override {
    fixture_interaction::fixture_setup++;
  }

@ -92,7 +98,7 @@ BENCHMARK_F(FIXTURE_BECHMARK_NAME, BM_WithFixture)(benchmark::State& st) {
  }
 }

-static void DoSetupWithFixture(const benchmark::State&) {
+static void DoSetupWithFixture(const benchmark::State& /*unused*/) {
  fixture_interaction::setup++;
 }

@ -110,7 +116,7 @@ namespace repetitions {
 int setup = 0;
 }

-static void DoSetupWithRepetitions(const benchmark::State&) {
+static void DoSetupWithRepetitions(const benchmark::State& /*unused*/) {
  repetitions::setup++;
 }
 static void BM_WithRep(benchmark::State& state) {
--- a/test/benchmark_test.cc
+++ b/test/benchmark_test.cc
@ -12,10 +12,12 @@
 #include <list>
 #include <map>
 #include <mutex>
+#include <optional>
 #include <set>
 #include <sstream>
 #include <string>
 #include <thread>
+#include <type_traits>
 #include <utility>
 #include <vector>

@ -43,18 +45,24 @@ double CalculatePi(int depth) {

 std::set<int64_t> ConstructRandomSet(int64_t size) {
  std::set<int64_t> s;
-  for (int i = 0; i < size; ++i) s.insert(s.end(), i);
+  for (int i = 0; i < size; ++i) {
+    s.insert(s.end(), i);
+  }
  return s;
 }

+// NOLINTBEGIN(cppcoreguidelines-avoid-non-const-global-variables)
 std::mutex test_vector_mu;
-std::vector<int>* test_vector = nullptr;
+std::optional<std::vector<int>> test_vector;
+// NOLINTEND(cppcoreguidelines-avoid-non-const-global-variables)

 }  // end namespace

 static void BM_Factorial(benchmark::State& state) {
  int fac_42 = 0;
-  for (auto _ : state) fac_42 = Factorial(8);
+  for (auto _ : state) {
+    fac_42 = Factorial(8);
+  }
  // Prevent compiler optimizations
  std::stringstream ss;
  ss << fac_42;
@ -65,7 +73,9 @@ BENCHMARK(BM_Factorial)->UseRealTime();

 static void BM_CalculatePiRange(benchmark::State& state) {
  double pi = 0.0;
-  for (auto _ : state) pi = CalculatePi(static_cast<int>(state.range(0)));
+  for (auto _ : state) {
+    pi = CalculatePi(static_cast<int>(state.range(0)));
+  }
  std::stringstream ss;
  ss << pi;
  state.SetLabel(ss.str());
@ -89,7 +99,9 @@ static void BM_SetInsert(benchmark::State& state) {
    state.PauseTiming();
    data = ConstructRandomSet(state.range(0));
    state.ResumeTiming();
-    for (int j = 0; j < state.range(1); ++j) data.insert(rand());
+    for (int j = 0; j < state.range(1); ++j) {
+      data.insert(rand());
+    }
  }
  state.SetItemsProcessed(state.iterations() * state.range(1));
  state.SetBytesProcessed(state.iterations() * state.range(1) *
@ -107,7 +119,9 @@ static void BM_Sequential(benchmark::State& state) {
  ValueType v = 42;
  for (auto _ : state) {
    Container c;
-    for (int64_t i = state.range(0); --i;) c.push_back(v);
+    for (int64_t i = state.range(0); --i;) {
+      c.push_back(v);
+    }
  }
  const int64_t items_processed = state.iterations() * state.range(0);
  state.SetItemsProcessed(items_processed);
@ -117,9 +131,7 @@ BENCHMARK_TEMPLATE2(BM_Sequential, std::vector<int>, int)
    ->Range(1 << 0, 1 << 10);
 BENCHMARK_TEMPLATE(BM_Sequential, std::list<int>)->Range(1 << 0, 1 << 10);
 // Test the variadic version of BENCHMARK_TEMPLATE in C++11 and beyond.
-#ifdef BENCHMARK_HAS_CXX11
 BENCHMARK_TEMPLATE(BM_Sequential, std::vector<int>, int)->Arg(512);
-#endif

 static void BM_StringCompare(benchmark::State& state) {
  size_t len = static_cast<size_t>(state.range(0));
@ -135,19 +147,20 @@ BENCHMARK(BM_StringCompare)->Range(1, 1 << 20);
 static void BM_SetupTeardown(benchmark::State& state) {
  if (state.thread_index() == 0) {
    // No need to lock test_vector_mu here as this is running single-threaded.
-    test_vector = new std::vector<int>();
+    test_vector = std::vector<int>();
  }
  int i = 0;
  for (auto _ : state) {
    std::lock_guard<std::mutex> l(test_vector_mu);
-    if (i % 2 == 0)
+    if (i % 2 == 0) {
      test_vector->push_back(i);
-    else
+    } else {
      test_vector->pop_back();
+    }
    ++i;
  }
  if (state.thread_index() == 0) {
-    delete test_vector;
+    test_vector.reset();
  }
 }
 BENCHMARK(BM_SetupTeardown)->ThreadPerCpu();
@ -155,8 +168,9 @@ BENCHMARK(BM_SetupTeardown)->ThreadPerCpu();
 static void BM_LongTest(benchmark::State& state) {
  double tracker = 0.0;
  for (auto _ : state) {
-    for (int i = 0; i < state.range(0); ++i)
+    for (int i = 0; i < state.range(0); ++i) {
      benchmark::DoNotOptimize(tracker += i);
+    }
  }
 }
 BENCHMARK(BM_LongTest)->Range(1 << 16, 1 << 28);
@ -168,7 +182,7 @@ static void BM_ParallelMemset(benchmark::State& state) {
  int to = from + thread_size;

  if (state.thread_index() == 0) {
-    test_vector = new std::vector<int>(static_cast<size_t>(size));
+    test_vector = std::vector<int>(static_cast<size_t>(size));
  }

  for (auto _ : state) {
@ -180,7 +194,7 @@ static void BM_ParallelMemset(benchmark::State& state) {
  }

  if (state.thread_index() == 0) {
-    delete test_vector;
+    test_vector.reset();
  }
 }
 BENCHMARK(BM_ParallelMemset)->Arg(10 << 20)->ThreadRange(1, 4);
@ -209,8 +223,6 @@ static void BM_ManualTiming(benchmark::State& state) {
 BENCHMARK(BM_ManualTiming)->Range(1, 1 << 14)->UseRealTime();
 BENCHMARK(BM_ManualTiming)->Range(1, 1 << 14)->UseManualTime();

-#ifdef BENCHMARK_HAS_CXX11
-
 template <class... Args>
 void BM_with_args(benchmark::State& state, Args&&...) {
  for (auto _ : state) {
@ -226,7 +238,30 @@ void BM_non_template_args(benchmark::State& state, int, double) {
 }
 BENCHMARK_CAPTURE(BM_non_template_args, basic_test, 0, 0);

-#endif  // BENCHMARK_HAS_CXX11
+template <class T, class U, class... ExtraArgs>
+void BM_template2_capture(benchmark::State& state, ExtraArgs&&... extra_args) {
+  static_assert(std::is_same<T, void>::value, "");
+  static_assert(std::is_same<U, char*>::value, "");
+  static_assert(std::is_same<ExtraArgs..., unsigned int>::value, "");
+  unsigned int dummy[sizeof...(ExtraArgs)] = {extra_args...};
+  assert(dummy[0] == 42);
+  for (auto _ : state) {
+  }
+}
+BENCHMARK_TEMPLATE2_CAPTURE(BM_template2_capture, void, char*, foo, 42U);
+BENCHMARK_CAPTURE((BM_template2_capture<void, char*>), foo, 42U);
+
+template <class T, class... ExtraArgs>
+void BM_template1_capture(benchmark::State& state, ExtraArgs&&... extra_args) {
+  static_assert(std::is_same<T, void>::value, "");
+  static_assert(std::is_same<ExtraArgs..., unsigned long>::value, "");
+  unsigned long dummy[sizeof...(ExtraArgs)] = {extra_args...};
+  assert(dummy[0] == 24);
+  for (auto _ : state) {
+  }
+}
+BENCHMARK_TEMPLATE1_CAPTURE(BM_template1_capture, void, foo, 24UL);
+BENCHMARK_CAPTURE(BM_template1_capture<void>, foo, 24UL);

 static void BM_DenseThreadRanges(benchmark::State& st) {
  switch (st.range(0)) {
@ -268,7 +303,8 @@ static void BM_templated_test(benchmark::State& state) {
  }
 }

-static auto BM_templated_test_double = BM_templated_test<std::complex<double>>;
+static const auto BM_templated_test_double =
+    BM_templated_test<std::complex<double>>;
 BENCHMARK(BM_templated_test_double);

 BENCHMARK_MAIN();
--- a/test/complexity_test.cc
+++ b/test/complexity_test.cc
@ -11,7 +11,7 @@
 namespace {

 #define ADD_COMPLEXITY_CASES(...) \
-  int CONCAT(dummy, __LINE__) = AddComplexityTest(__VA_ARGS__)
+  const int CONCAT(dummy, __LINE__) = AddComplexityTest(__VA_ARGS__)

 int AddComplexityTest(const std::string &test_name,
                      const std::string &big_o_test_name,
@ -69,35 +69,44 @@ int AddComplexityTest(const std::string &test_name,

 void BM_Complexity_O1(benchmark::State &state) {
  for (auto _ : state) {
-    for (int i = 0; i < 1024; ++i) {
-      benchmark::DoNotOptimize(i);
+    // This test requires a non-zero CPU time to avoid divide-by-zero
+    benchmark::DoNotOptimize(state.iterations());
+    double tmp = static_cast<double>(state.iterations());
+    benchmark::DoNotOptimize(tmp);
+    for (benchmark::IterationCount i = 0; i < state.iterations(); ++i) {
+      benchmark::DoNotOptimize(state.iterations());
+      tmp *= static_cast<double>(state.iterations());
+      benchmark::DoNotOptimize(tmp);
    }
+
+    // always 42ns per iteration
+    state.SetIterationTime(42 * 1e-9);
  }
  state.SetComplexityN(state.range(0));
 }
-BENCHMARK(BM_Complexity_O1)->Range(1, 1 << 18)->Complexity(benchmark::o1);
-BENCHMARK(BM_Complexity_O1)->Range(1, 1 << 18)->Complexity();
 BENCHMARK(BM_Complexity_O1)
    ->Range(1, 1 << 18)
+    ->UseManualTime()
+    ->Complexity(benchmark::o1);
+BENCHMARK(BM_Complexity_O1)->Range(1, 1 << 18)->UseManualTime()->Complexity();
+BENCHMARK(BM_Complexity_O1)
+    ->Range(1, 1 << 18)
+    ->UseManualTime()
    ->Complexity([](benchmark::IterationCount) { return 1.0; });

-const char *one_test_name = "BM_Complexity_O1";
-const char *big_o_1_test_name = "BM_Complexity_O1_BigO";
-const char *rms_o_1_test_name = "BM_Complexity_O1_RMS";
-const char *enum_big_o_1 = "\\([0-9]+\\)";
-// FIXME: Tolerate both '(1)' and 'lgN' as output when the complexity is auto
-// deduced.
-// See https://github.com/google/benchmark/issues/272
-const char *auto_big_o_1 = "(\\([0-9]+\\))|(lgN)";
-const char *lambda_big_o_1 = "f\\(N\\)";
+constexpr char one_test_name[] = "BM_Complexity_O1/manual_time";
+constexpr char big_o_1_test_name[] = "BM_Complexity_O1/manual_time_BigO";
+constexpr char rms_o_1_test_name[] = "BM_Complexity_O1/manual_time_RMS";
+constexpr char enum_auto_big_o_1[] = "\\([0-9]+\\)";
+constexpr char lambda_big_o_1[] = "f\\(N\\)";

 // Add enum tests
 ADD_COMPLEXITY_CASES(one_test_name, big_o_1_test_name, rms_o_1_test_name,
-                     enum_big_o_1, /*family_index=*/0);
+                     enum_auto_big_o_1, /*family_index=*/0);

-// Add auto enum tests
+// Add auto tests
 ADD_COMPLEXITY_CASES(one_test_name, big_o_1_test_name, rms_o_1_test_name,
-                     auto_big_o_1, /*family_index=*/1);
+                     enum_auto_big_o_1, /*family_index=*/1);

 // Add lambda tests
 ADD_COMPLEXITY_CASES(one_test_name, big_o_1_test_name, rms_o_1_test_name,
@ -107,96 +116,121 @@ ADD_COMPLEXITY_CASES(one_test_name, big_o_1_test_name, rms_o_1_test_name,
 // --------------------------- Testing BigO O(N) --------------------------- //
 // ========================================================================= //

-std::vector<int> ConstructRandomVector(int64_t size) {
-  std::vector<int> v;
-  v.reserve(static_cast<size_t>(size));
-  for (int i = 0; i < size; ++i) {
-    v.push_back(static_cast<int>(std::rand() % size));
-  }
-  return v;
-}
-
 void BM_Complexity_O_N(benchmark::State &state) {
-  auto v = ConstructRandomVector(state.range(0));
-  // Test worst case scenario (item not in vector)
-  const int64_t item_not_in_vector = state.range(0) * 2;
  for (auto _ : state) {
-    auto it = std::find(v.begin(), v.end(), item_not_in_vector);
-    benchmark::DoNotOptimize(it);
+    // This test requires a non-zero CPU time to avoid divide-by-zero
+    benchmark::DoNotOptimize(state.iterations());
+    double tmp = static_cast<double>(state.iterations());
+    benchmark::DoNotOptimize(tmp);
+    for (benchmark::IterationCount i = 0; i < state.iterations(); ++i) {
+      benchmark::DoNotOptimize(state.iterations());
+      tmp *= static_cast<double>(state.iterations());
+      benchmark::DoNotOptimize(tmp);
+    }
+
+    // 42ns per iteration per entry
+    state.SetIterationTime(static_cast<double>(state.range(0)) * 42 * 1e-9);
  }
  state.SetComplexityN(state.range(0));
 }
 BENCHMARK(BM_Complexity_O_N)
    ->RangeMultiplier(2)
-    ->Range(1 << 10, 1 << 16)
+    ->Range(1 << 10, 1 << 20)
+    ->UseManualTime()
    ->Complexity(benchmark::oN);
 BENCHMARK(BM_Complexity_O_N)
    ->RangeMultiplier(2)
-    ->Range(1 << 10, 1 << 16)
+    ->Range(1 << 10, 1 << 20)
+    ->UseManualTime()
+    ->Complexity();
+BENCHMARK(BM_Complexity_O_N)
+    ->RangeMultiplier(2)
+    ->Range(1 << 10, 1 << 20)
+    ->UseManualTime()
    ->Complexity([](benchmark::IterationCount n) -> double {
      return static_cast<double>(n);
    });
-BENCHMARK(BM_Complexity_O_N)
-    ->RangeMultiplier(2)
-    ->Range(1 << 10, 1 << 16)
-    ->Complexity();

-const char *n_test_name = "BM_Complexity_O_N";
-const char *big_o_n_test_name = "BM_Complexity_O_N_BigO";
-const char *rms_o_n_test_name = "BM_Complexity_O_N_RMS";
-const char *enum_auto_big_o_n = "N";
-const char *lambda_big_o_n = "f\\(N\\)";
+constexpr char n_test_name[] = "BM_Complexity_O_N/manual_time";
+constexpr char big_o_n_test_name[] = "BM_Complexity_O_N/manual_time_BigO";
+constexpr char rms_o_n_test_name[] = "BM_Complexity_O_N/manual_time_RMS";
+constexpr char enum_auto_big_o_n[] = "N";
+constexpr char lambda_big_o_n[] = "f\\(N\\)";

 // Add enum tests
 ADD_COMPLEXITY_CASES(n_test_name, big_o_n_test_name, rms_o_n_test_name,
                     enum_auto_big_o_n, /*family_index=*/3);

+// Add auto tests
+ADD_COMPLEXITY_CASES(n_test_name, big_o_n_test_name, rms_o_n_test_name,
+                     enum_auto_big_o_n, /*family_index=*/4);
+
 // Add lambda tests
 ADD_COMPLEXITY_CASES(n_test_name, big_o_n_test_name, rms_o_n_test_name,
-                     lambda_big_o_n, /*family_index=*/4);
+                     lambda_big_o_n, /*family_index=*/5);

 // ========================================================================= //
-// ------------------------- Testing BigO O(N*lgN) ------------------------- //
+// ------------------------- Testing BigO O(NlgN) ------------------------- //
 // ========================================================================= //

+static const double kLog2E = 1.44269504088896340736;
 static void BM_Complexity_O_N_log_N(benchmark::State &state) {
-  auto v = ConstructRandomVector(state.range(0));
  for (auto _ : state) {
-    std::sort(v.begin(), v.end());
+    // This test requires a non-zero CPU time to avoid divide-by-zero
+    benchmark::DoNotOptimize(state.iterations());
+    double tmp = static_cast<double>(state.iterations());
+    benchmark::DoNotOptimize(tmp);
+    for (benchmark::IterationCount i = 0; i < state.iterations(); ++i) {
+      benchmark::DoNotOptimize(state.iterations());
+      tmp *= static_cast<double>(state.iterations());
+      benchmark::DoNotOptimize(tmp);
+    }
+
+    state.SetIterationTime(static_cast<double>(state.range(0)) * kLog2E *
+                           std::log(state.range(0)) * 42 * 1e-9);
  }
  state.SetComplexityN(state.range(0));
 }
-static const double kLog2E = 1.44269504088896340736;
 BENCHMARK(BM_Complexity_O_N_log_N)
    ->RangeMultiplier(2)
-    ->Range(1 << 10, 1 << 16)
+    ->Range(1 << 10, 1U << 24)
+    ->UseManualTime()
    ->Complexity(benchmark::oNLogN);
 BENCHMARK(BM_Complexity_O_N_log_N)
    ->RangeMultiplier(2)
-    ->Range(1 << 10, 1 << 16)
-    ->Complexity([](benchmark::IterationCount n) {
-      return kLog2E * static_cast<double>(n) * log(static_cast<double>(n));
-    });
+    ->Range(1 << 10, 1U << 24)
+    ->UseManualTime()
+    ->Complexity();
 BENCHMARK(BM_Complexity_O_N_log_N)
    ->RangeMultiplier(2)
-    ->Range(1 << 10, 1 << 16)
-    ->Complexity();
+    ->Range(1 << 10, 1U << 24)
+    ->UseManualTime()
+    ->Complexity([](benchmark::IterationCount n) {
+      return kLog2E * static_cast<double>(n) * std::log(static_cast<double>(n));
+    });

-const char *n_lg_n_test_name = "BM_Complexity_O_N_log_N";
-const char *big_o_n_lg_n_test_name = "BM_Complexity_O_N_log_N_BigO";
-const char *rms_o_n_lg_n_test_name = "BM_Complexity_O_N_log_N_RMS";
-const char *enum_auto_big_o_n_lg_n = "NlgN";
-const char *lambda_big_o_n_lg_n = "f\\(N\\)";
+constexpr char n_lg_n_test_name[] = "BM_Complexity_O_N_log_N/manual_time";
+constexpr char big_o_n_lg_n_test_name[] =
+    "BM_Complexity_O_N_log_N/manual_time_BigO";
+constexpr char rms_o_n_lg_n_test_name[] =
+    "BM_Complexity_O_N_log_N/manual_time_RMS";
+constexpr char enum_auto_big_o_n_lg_n[] = "NlgN";
+constexpr char lambda_big_o_n_lg_n[] = "f\\(N\\)";

 // Add enum tests
 ADD_COMPLEXITY_CASES(n_lg_n_test_name, big_o_n_lg_n_test_name,
                     rms_o_n_lg_n_test_name, enum_auto_big_o_n_lg_n,
                     /*family_index=*/6);

-// Add lambda tests
+// NOTE: auto big-o is wrong.
+ADD_COMPLEXITY_CASES(n_lg_n_test_name, big_o_n_lg_n_test_name,
+                     rms_o_n_lg_n_test_name, enum_auto_big_o_n_lg_n,
+                     /*family_index=*/7);
+
+// Add lambda tests
 ADD_COMPLEXITY_CASES(n_lg_n_test_name, big_o_n_lg_n_test_name,
                     rms_o_n_lg_n_test_name, lambda_big_o_n_lg_n,
-                     /*family_index=*/7);
+                     /*family_index=*/8);

 // ========================================================================= //
 // -------- Testing formatting of Complexity with captured args ------------ //
@ -205,21 +239,31 @@ ADD_COMPLEXITY_CASES(n_lg_n_test_name, big_o_n_lg_n_test_name,
 void BM_ComplexityCaptureArgs(benchmark::State &state, int n) {
  for (auto _ : state) {
    // This test requires a non-zero CPU time to avoid divide-by-zero
-    auto iterations = state.iterations();
-    benchmark::DoNotOptimize(iterations);
+    benchmark::DoNotOptimize(state.iterations());
+    double tmp = static_cast<double>(state.iterations());
+    benchmark::DoNotOptimize(tmp);
+    for (benchmark::IterationCount i = 0; i < state.iterations(); ++i) {
+      benchmark::DoNotOptimize(state.iterations());
+      tmp *= static_cast<double>(state.iterations());
+      benchmark::DoNotOptimize(tmp);
+    }
+
+    state.SetIterationTime(static_cast<double>(state.range(0)) * 42 * 1e-9);
  }
  state.SetComplexityN(n);
 }

 BENCHMARK_CAPTURE(BM_ComplexityCaptureArgs, capture_test, 100)
+    ->UseManualTime()
    ->Complexity(benchmark::oN)
    ->Ranges({{1, 2}, {3, 4}});

 const std::string complexity_capture_name =
-    "BM_ComplexityCaptureArgs/capture_test";
+    "BM_ComplexityCaptureArgs/capture_test/manual_time";

 ADD_COMPLEXITY_CASES(complexity_capture_name, complexity_capture_name + "_BigO",
-                     complexity_capture_name + "_RMS", "N", /*family_index=*/9);
+                     complexity_capture_name + "_RMS", "N",
+                     /*family_index=*/9);

 // ========================================================================= //
 // --------------------------- TEST CASES END ------------------------------ //
--- a/test/cxx03_test.cc
+++ b/test/cxx03_test.cc
@ -1,62 +0,0 @@
-#undef NDEBUG
-#include <cassert>
-#include <cstddef>
-
-#include "benchmark/benchmark.h"
-
-#if __cplusplus >= 201103L
-#error C++11 or greater detected. Should be C++03.
-#endif
-
-#ifdef BENCHMARK_HAS_CXX11
-#error C++11 or greater detected by the library. BENCHMARK_HAS_CXX11 is defined.
-#endif
-
-void BM_empty(benchmark::State& state) {
-  while (state.KeepRunning()) {
-    volatile benchmark::IterationCount x = state.iterations();
-    ((void)x);
-  }
-}
-BENCHMARK(BM_empty);
-
-// The new C++11 interface for args/ranges requires initializer list support.
-// Therefore we provide the old interface to support C++03.
-void BM_old_arg_range_interface(benchmark::State& state) {
-  assert((state.range(0) == 1 && state.range(1) == 2) ||
-         (state.range(0) == 5 && state.range(1) == 6));
-  while (state.KeepRunning()) {
-  }
-}
-BENCHMARK(BM_old_arg_range_interface)->ArgPair(1, 2)->RangePair(5, 5, 6, 6);
-
-template <class T, class U>
-void BM_template2(benchmark::State& state) {
-  BM_empty(state);
-}
-BENCHMARK_TEMPLATE2(BM_template2, int, long);
-
-template <class T>
-void BM_template1(benchmark::State& state) {
-  BM_empty(state);
-}
-BENCHMARK_TEMPLATE(BM_template1, long);
-BENCHMARK_TEMPLATE1(BM_template1, int);
-
-template <class T>
-struct BM_Fixture : public ::benchmark::Fixture {};
-
-BENCHMARK_TEMPLATE_F(BM_Fixture, BM_template1, long)(benchmark::State& state) {
-  BM_empty(state);
-}
-BENCHMARK_TEMPLATE1_F(BM_Fixture, BM_template2, int)(benchmark::State& state) {
-  BM_empty(state);
-}
-
-void BM_counters(benchmark::State& state) {
-  BM_empty(state);
-  state.counters["Foo"] = 2;
-}
-BENCHMARK(BM_counters);
-
-BENCHMARK_MAIN();
--- a/test/cxx11_test.cc
+++ b/test/cxx11_test.cc
@ -0,0 +1,12 @@
+#include "benchmark/benchmark.h"
+
+#if defined(_MSC_VER)
+#if _MSVC_LANG != 201402L
+// MSVC, even in C++11 mode, does not claim to be in C++11 mode.
+#error "Trying to compile C++11 test with wrong C++ standard"
+#endif  //  _MSVC_LANG
+#else   // Non-MSVC
+#if __cplusplus != 201103L
+#error "Trying to compile C++11 test with wrong C++ standard"
+#endif  // __cplusplus
+#endif  // _MSC_VER
--- a/test/diagnostics_test.cc
+++ b/test/diagnostics_test.cc
@ -46,14 +46,19 @@ void try_invalid_pause_resume(benchmark::State& state) {
 void BM_diagnostic_test(benchmark::State& state) {
  static bool called_once = false;

-  if (called_once == false) try_invalid_pause_resume(state);
+  if (!called_once) {
+    try_invalid_pause_resume(state);
+  }

  for (auto _ : state) {
-    auto iterations = state.iterations();
+    auto iterations = static_cast<double>(state.iterations()) *
+                      static_cast<double>(state.iterations());
    benchmark::DoNotOptimize(iterations);
  }

-  if (called_once == false) try_invalid_pause_resume(state);
+  if (!called_once) {
+    try_invalid_pause_resume(state);
+  }

  called_once = true;
 }
@ -62,14 +67,19 @@ BENCHMARK(BM_diagnostic_test);
 void BM_diagnostic_test_keep_running(benchmark::State& state) {
  static bool called_once = false;

-  if (called_once == false) try_invalid_pause_resume(state);
+  if (!called_once) {
+    try_invalid_pause_resume(state);
+  }

  while (state.KeepRunning()) {
-    auto iterations = state.iterations();
+    auto iterations = static_cast<double>(state.iterations()) *
+                      static_cast<double>(state.iterations());
    benchmark::DoNotOptimize(iterations);
  }

-  if (called_once == false) try_invalid_pause_resume(state);
+  if (!called_once) {
+    try_invalid_pause_resume(state);
+  }

  called_once = true;
 }
@ -80,7 +90,7 @@ int main(int argc, char* argv[]) {
  // This test is exercising functionality for debug builds, which are not
  // available in release builds. Skip the test if we are in that environment
  // to avoid a test failure.
-  std::cout << "Diagnostic test disabled in release build" << std::endl;
+  std::cout << "Diagnostic test disabled in release build\n";
  (void)argc;
  (void)argv;
 #else
--- a/test/donotoptimize_test.cc
+++ b/test/donotoptimize_test.cc
@ -4,7 +4,7 @@

 namespace {
 #if defined(__GNUC__)
-std::int64_t double_up(const std::int64_t x) __attribute__((const));
+std::int64_t double_up(std::int64_t x) __attribute__((const));
 #endif
 std::int64_t double_up(const std::int64_t x) { return x * 2; }
 }  // namespace
@ -26,7 +26,7 @@ struct BitRef {
  BitRef(int i, unsigned char& b) : index(i), byte(b) {}
 };

-int main(int, char*[]) {
+int main(int /*unused*/, char* /*unused*/[]) {
  // this test verifies compilation of DoNotOptimize() for some types

  char buffer1[1] = "";
@ -62,8 +62,6 @@ int main(int, char*[]) {
  BitRef lval = BitRef::Make();
  benchmark::DoNotOptimize(lval);

-#ifdef BENCHMARK_HAS_CXX11
  // Check that accept rvalue.
  benchmark::DoNotOptimize(BitRef::Make());
-#endif
 }
--- a/test/filter_test.cc
+++ b/test/filter_test.cc
@ -71,9 +71,10 @@ BENCHMARK(BM_FooBa);

 int main(int argc, char** argv) {
  bool list_only = false;
-  for (int i = 0; i < argc; ++i)
+  for (int i = 0; i < argc; ++i) {
    list_only |= std::string(argv[i]).find("--benchmark_list_tests") !=
                 std::string::npos;
+  }

  benchmark::Initialize(&argc, argv);

@ -84,13 +85,13 @@ int main(int argc, char** argv) {
  if (argc == 2) {
    // Make sure we ran all of the tests
    std::stringstream ss(argv[1]);
-    int64_t expected_return;
+    int64_t expected_return = 0;
    ss >> expected_return;

    if (returned_count != expected_return) {
      std::cerr << "ERROR: Expected " << expected_return
                << " tests to match the filter but returned_count = "
-                << returned_count << std::endl;
+                << returned_count << '\n';
      return -1;
    }

@ -99,7 +100,7 @@ int main(int argc, char** argv) {
    if (reports_count != expected_reports) {
      std::cerr << "ERROR: Expected " << expected_reports
                << " tests to be run but reported_count = " << reports_count
-                << std::endl;
+                << '\n';
      return -1;
    }

@ -108,7 +109,7 @@ int main(int argc, char** argv) {
    if (num_families != expected_reports) {
      std::cerr << "ERROR: Expected " << expected_reports
                << " test families to be run but num_families = "
-                << num_families << std::endl;
+                << num_families << '\n';
      return -1;
    }
  }
--- a/test/internal_threading_test.cc
+++ b/test/internal_threading_test.cc
@ -22,8 +22,9 @@ void MyBusySpinwait() {
    const auto elapsed = now - start;

    if (std::chrono::duration<double, std::chrono::seconds::period>(elapsed) >=
-        time_frame)
+        time_frame) {
      return;
+    }
  }
 }

--- a/Show More
+++ b/Show More