Fix a race condition that occurs when logging from a detached thread in the cluster property test. Improve the ShardManager destructor and log statements.

2022-11-04 11:14:39 +00:00 · 2022-11-04 11:14:39 +00:00 · 8598f6edf4
commit 8598f6edf4
parent 9203616283
4 changed files with 28 additions and 4 deletions
--- a/src/storage/v3/shard_manager.hpp
+++ b/src/storage/v3/shard_manager.hpp
@ -105,6 +105,8 @@ class ShardManager {
      worker.Push(shard_worker::ShutDown{});
    }

+    workers_.clear();
+
    // The jthread handles for our shard worker threads will be
    // blocked on implicitly when worker_handles_ is destroyed.
  }
--- a/src/storage/v3/shard_worker.hpp
+++ b/src/storage/v3/shard_worker.hpp
@ -147,7 +147,7 @@ class ShardWorker {
  }

  Time Cron() {
-    spdlog::info("running ShardManager::Cron, address {}", io_.GetAddress().ToString());
+    spdlog::info("running ShardWorker::Cron, address {}", io_.GetAddress().ToString());
    Time now = io_.Now();

    while (!cron_schedule_.empty()) {
--- a/tests/simulation/cluster_property_test.cpp
+++ b/tests/simulation/cluster_property_test.cpp
@ -18,6 +18,7 @@
 #include <gtest/gtest.h>
 #include <rapidcheck.h>
 #include <rapidcheck/gtest.h>
+#include <spdlog/cfg/env.h>

 #include "generated_operations.hpp"
 #include "io/simulator/simulator_config.hpp"
@ -35,6 +36,8 @@ using storage::v3::kMaximumCronInterval;
 RC_GTEST_PROP(RandomClusterConfig, HappyPath, (ClusterConfig cluster_config, NonEmptyOpVec ops)) {
  // TODO(tyler) set abort_time to something more restrictive than Time::max()

+  spdlog::cfg::load_env_levels();
+
  SimulatorConfig sim_config{
      .drop_percent = 0,
      .perform_timeouts = false,
--- a/tests/simulation/test_cluster.hpp
+++ b/tests/simulation/test_cluster.hpp
@ -194,6 +194,22 @@ void ExecuteOp(msgs::ShardRequestManager<SimulatorTransport> &shard_request_mana
  }
 }

+/// Scope guard for a std::jthread: on destruction it detaches `handle` if
+/// `detach` is still true and the thread is joinable. It exists so that an
+/// uncaught exception does not leave us waiting on a thread that would never
+/// receive a ShutDown message, which would cause the test to hang forever.
+/// Set `detach` to false to skip detaching; `handle` must outlive this guard.
+struct DetachIfDropped {
+  std::jthread &handle;
+  bool detach = true;
+
+  ~DetachIfDropped() {
+    if (detach && handle.joinable()) {
+      handle.detach();
+    }
+  }
+};
+
 void RunClusterSimulation(const SimulatorConfig &sim_config, const ClusterConfig &cluster_config,
                          const std::vector<Op> &ops) {
  spdlog::info("========================== NEW SIMULATION ==========================");
@ -217,9 +233,7 @@ void RunClusterSimulation(const SimulatorConfig &sim_config, const ClusterConfig

  auto mm_thread_1 = std::jthread(RunMachine, std::move(mm_1));

-  // Need to detach this thread so that the destructor does not
-  // block before we can propagate assertion failures.
-  mm_thread_1.detach();
+  auto detach_on_error = DetachIfDropped{.handle = mm_thread_1};

  // TODO(tyler) clarify addresses of coordinator etc... as it's a mess

@ -236,6 +250,11 @@ void RunClusterSimulation(const SimulatorConfig &sim_config, const ClusterConfig
    std::visit([&](auto &o) { ExecuteOp(shard_request_manager, correctness_model, o); }, op.inner);
  }

+  // We have now completed our workload without failing any assertions, so we can
+  // disable detaching the worker thread, which will cause the mm_thread_1 jthread
+  // to be joined when this function returns.
+  detach_on_error.detach = false;
+
  simulator.ShutDown();

  SimulatorStats stats = simulator.Stats();