Make raft tests fully deterministic for rng_seeds between 0 and 500 at 1% message loss

This commit is contained in:
Tyler Neely 2022-11-17 21:32:55 +00:00
parent 9c3d683942
commit f6017697d6
4 changed files with 25 additions and 22 deletions

View File

@ -75,12 +75,14 @@ bool SimulatorHandle::MaybeTickSimulator() {
return false; return false;
} }
spdlog::info("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ simulator tick ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~");
stats_.simulator_ticks++; stats_.simulator_ticks++;
bool fired_cv = false; bool fired_cv = false;
bool timed_anything_out = TimeoutPromisesPastDeadline(); bool timed_anything_out = TimeoutPromisesPastDeadline();
if (timed_anything_out) { if (timed_anything_out) {
spdlog::info("simulator progressing: timed out a request");
fired_cv = true; fired_cv = true;
blocked_on_receive_.clear(); blocked_on_receive_.clear();
cv_.notify_all(); cv_.notify_all();
@ -95,6 +97,7 @@ bool SimulatorHandle::MaybeTickSimulator() {
cluster_wide_time_microseconds_ += clock_advance; cluster_wide_time_microseconds_ += clock_advance;
if (!fired_cv) { if (!fired_cv) {
spdlog::info("simulator progressing: clock advanced by {}", clock_advance.count());
cv_.notify_all(); cv_.notify_all();
blocked_on_receive_.clear(); blocked_on_receive_.clear();
fired_cv = true; fired_cv = true;
@ -109,6 +112,9 @@ bool SimulatorHandle::MaybeTickSimulator() {
"in an expected amount of time."); "in an expected amount of time.");
throw utils::BasicException{"Cluster has executed beyond its configured abort_time"}; throw utils::BasicException{"Cluster has executed beyond its configured abort_time"};
} }
// if the clock is advanced, no messages should be delivered also.
// do that in a future tick.
return true; return true;
} }
@ -141,7 +147,7 @@ bool SimulatorHandle::MaybeTickSimulator() {
if (should_drop || normal_timeout) { if (should_drop || normal_timeout) {
stats_.timed_out_requests++; stats_.timed_out_requests++;
dop.promise.TimeOut(); dop.promise.TimeOut();
spdlog::info("timing out request"); spdlog::info("timing out request ");
} else { } else {
stats_.total_responses++; stats_.total_responses++;
Duration response_latency = cluster_wide_time_microseconds_ - dop.requested_at; Duration response_latency = cluster_wide_time_microseconds_ - dop.requested_at;
@ -153,15 +159,20 @@ bool SimulatorHandle::MaybeTickSimulator() {
} else if (should_drop) { } else if (should_drop) {
// don't add it anywhere, let it drop // don't add it anywhere, let it drop
spdlog::info("silently dropping request"); spdlog::info("silently dropping request");
// we don't want to reset the block list here
return true;
} else { } else {
// add to can_receive_ if not // add to can_receive_ if not
spdlog::info("adding message to can_receive_"); spdlog::info("adding message to can_receive_ from {} to {}", opaque_message.from_address.last_known_port,
opaque_message.to_address.last_known_port);
const auto &[om_vec, inserted] = const auto &[om_vec, inserted] =
can_receive_.try_emplace(to_address.ToPartialAddress(), std::vector<OpaqueMessage>()); can_receive_.try_emplace(to_address.ToPartialAddress(), std::vector<OpaqueMessage>());
om_vec->second.emplace_back(std::move(opaque_message)); om_vec->second.emplace_back(std::move(opaque_message));
} }
if (!fired_cv) { if (!fired_cv) {
spdlog::info("simulator progressing: handled a message");
cv_.notify_all(); cv_.notify_all();
blocked_on_receive_.clear(); blocked_on_receive_.clear();
fired_cv = true; fired_cv = true;

View File

@ -171,12 +171,15 @@ class SimulatorHandle {
} }
if (!should_shut_down_) { if (!should_shut_down_) {
if (!blocked_on_receive_.contains(receiver)) {
blocked_on_receive_.emplace(receiver); blocked_on_receive_.emplace(receiver);
cv_.notify_all();
spdlog::info("blocking receiver {}", receiver.ToPartialAddress().port); spdlog::info("blocking receiver {}", receiver.ToPartialAddress().port);
cv_.notify_all();
}
cv_.wait(lock); cv_.wait(lock);
} }
} }
spdlog::info("timing out receiver {}", receiver.ToPartialAddress().port);
return TimedOut{}; return TimedOut{};
} }

View File

@ -34,12 +34,7 @@ class SimulatorTransport {
template <Message RequestT, Message ResponseT> template <Message RequestT, Message ResponseT>
ResponseFuture<ResponseT> Request(Address to_address, Address from_address, RequestT request, Duration timeout) { ResponseFuture<ResponseT> Request(Address to_address, Address from_address, RequestT request, Duration timeout) {
std::function<bool()> maybe_tick_simulator = [this] { std::function<bool()> maybe_tick_simulator = [this] { return simulator_handle_->MaybeTickSimulator(); };
spdlog::info("client calling MaybeTickSimulator");
bool ret = simulator_handle_->MaybeTickSimulator();
spdlog::info("client returned from MaybeTickSimulator");
return ret;
};
return simulator_handle_->template SubmitRequest<RequestT, ResponseT>(to_address, from_address, std::move(request), return simulator_handle_->template SubmitRequest<RequestT, ResponseT>(to_address, from_address, std::move(request),
timeout, std::move(maybe_tick_simulator)); timeout, std::move(maybe_tick_simulator));

View File

@ -240,13 +240,6 @@ std::pair<SimulatorStats, LatencyHistogramSummaries> RunSimulation(SimulatorConf
spdlog::info("========================== SUCCESS :) =========================="); spdlog::info("========================== SUCCESS :) ==========================");
return std::make_pair(simulator.Stats(), cli_io.ResponseLatencies()); return std::make_pair(simulator.Stats(), cli_io.ResponseLatencies());
/*
this is implicit in jthread's dtor
srv_thread_1.join();
srv_thread_2.join();
srv_thread_3.join();
*/
} }
int main() { int main() {
@ -264,25 +257,26 @@ int main() {
}; };
for (int i = 0; i < n_tests; i++) { for (int i = 0; i < n_tests; i++) {
spdlog::error("========================== NEW SIMULATION SEED {} ==========================", i);
spdlog::info("\tTime\t\tTerm\tPort\tRole\t\tMessage\n"); spdlog::info("\tTime\t\tTerm\tPort\tRole\t\tMessage\n");
// this is vital to cause tests to behave differently across runs // this is vital to cause tests to behave differently across runs
config.rng_seed = i; config.rng_seed = i;
spdlog::info("========================== NEW SIMULATION SEED {} ==========================", i);
auto [sim_stats_1, latency_stats_1] = RunSimulation(config); auto [sim_stats_1, latency_stats_1] = RunSimulation(config);
spdlog::info("========================== NEW SIMULATION SEED {} ==========================", i);
auto [sim_stats_2, latency_stats_2] = RunSimulation(config); auto [sim_stats_2, latency_stats_2] = RunSimulation(config);
if (sim_stats_1 != sim_stats_2 || latency_stats_1 != latency_stats_2) { if (sim_stats_1 != sim_stats_2 || latency_stats_1 != latency_stats_2) {
spdlog::error("simulator stats diverged across runs"); spdlog::error("simulator stats diverged across runs for test rng_seed {}", i);
spdlog::error("run 1 simulator stats: {}", sim_stats_1); spdlog::error("run 1 simulator stats: {}", sim_stats_1);
spdlog::error("run 2 simulator stats: {}", sim_stats_2); spdlog::error("run 2 simulator stats: {}", sim_stats_2);
spdlog::error("run 1 latency:\n{}", latency_stats_1.SummaryTable()); spdlog::error("run 1 latency:\n{}", latency_stats_1.SummaryTable());
spdlog::error("run 2 latency:\n{}", latency_stats_2.SummaryTable()); spdlog::error("run 2 latency:\n{}", latency_stats_2.SummaryTable());
std::terminate(); std::terminate();
} }
spdlog::error("run 1 simulator stats: {}", sim_stats_1); spdlog::info("run 1 simulator stats: {}", sim_stats_1);
spdlog::error("run 2 simulator stats: {}", sim_stats_2); spdlog::info("run 2 simulator stats: {}", sim_stats_2);
} }
spdlog::info("passed {} tests!", n_tests); spdlog::info("passed {} tests!", n_tests);