From c0a103e851c148b478a2eeb0a221e70089684fbc Mon Sep 17 00:00:00 2001 From: Tyler Neely Date: Tue, 22 Nov 2022 16:00:06 +0000 Subject: [PATCH] Do not advance the clock with every message, as this prevents messages of a certain request depth from ever completing --- src/io/simulator/simulator_handle.cpp | 19 +++++++++++++------ src/io/simulator/simulator_stats.hpp | 2 +- tests/simulation/raft.cpp | 1 + 3 files changed, 15 insertions(+), 7 deletions(-) diff --git a/src/io/simulator/simulator_handle.cpp b/src/io/simulator/simulator_handle.cpp index a521943f8..f216f3c01 100644 --- a/src/io/simulator/simulator_handle.cpp +++ b/src/io/simulator/simulator_handle.cpp @@ -100,15 +100,22 @@ bool SimulatorHandle::MaybeTickSimulator() { } const Duration clock_advance = std::chrono::microseconds{time_distrib_(rng_)}; - cluster_wide_time_microseconds_ += clock_advance; - stats_.elapsed_time = cluster_wide_time_microseconds_ - config_.start_time; - spdlog::info("simulator progressing: clock advanced by {}", clock_advance.count()); + // We don't always want to advance the clock with every message that we deliver because + // when we advance it for every message, it causes timeouts to occur for any "request path" + // over a certain length. Alternatively, we don't want to simply deliver multiple messages + // in a single simulator tick because that would reduce the amount of concurrent message + // mixing that may naturally occur in production. This approach is to mod the random clock + // advance by a prime number (hopefully avoiding most harmonic effects that would be introduced + // by only advancing the clock by an even amount etc...) and only advancing the clock close to + // half of the time. + if (clock_advance.count() % 97 > 49) { + spdlog::info("simulator progressing: clock advanced by {}", clock_advance.count()); + cluster_wide_time_microseconds_ += clock_advance; + stats_.elapsed_time = cluster_wide_time_microseconds_ - config_.start_time; + } if (cluster_wide_time_microseconds_ >= config_.abort_time) { - if (should_shut_down_) { - return true; - } spdlog::error( "Cluster has executed beyond its configured abort_time, and something may be failing to make progress " "in an expected amount of time."); diff --git a/src/io/simulator/simulator_stats.hpp b/src/io/simulator/simulator_stats.hpp index b02dc9f2a..1ae52ef28 100644 --- a/src/io/simulator/simulator_stats.hpp +++ b/src/io/simulator/simulator_stats.hpp @@ -30,7 +30,7 @@ struct SimulatorStats { friend bool operator==(const SimulatorStats & /* lhs */, const SimulatorStats & /* rhs */) = default; friend std::ostream &operator<<(std::ostream &in, const SimulatorStats &stats) { - auto elapsed_ms = std::chrono::duration_cast(stats.elapsed_time).count() / 1000; + auto elapsed_ms = std::chrono::duration_cast(stats.elapsed_time).count(); std::string formated = fmt::format( "SimulatorStats {{ total_messages: {}, dropped_messages: {}, timed_out_requests: {}, total_requests: {}, " diff --git a/tests/simulation/raft.cpp b/tests/simulation/raft.cpp index f2dd82e9d..d725b0712 100644 --- a/tests/simulation/raft.cpp +++ b/tests/simulation/raft.cpp @@ -185,6 +185,7 @@ std::pair RunSimulation(SimulatorConf auto write_cas_response_result = client.SendWriteRequest(cas_req); if (write_cas_response_result.HasError()) { + spdlog::debug("timed out"); // timed out continue; }