Fix Raft failure discovered in apollo run 479391

Summary: We noticed a Raft test failure (https://apollo.memgraph.io/runs/479391/). This diff should fix it.

Reviewers: ipaljak
Reviewed By: ipaljak
Subscribers: mferencevic, pullbot
Differential Revision: https://phabricator.memgraph.io/D1865
2019-02-14 14:56:44 +01:00 · 2019-02-14 14:56:44 +01:00 · ed75e45541
commit ed75e45541
parent d1eeaa8de0
5 changed files with 22 additions and 10 deletions
--- a/src/raft/raft_server.cpp
+++ b/src/raft/raft_server.cpp
@ -717,7 +717,7 @@ void RaftServer::SendLogEntries(
    return;
  }

-  if (current_term_ != request_term || exiting_) {
+  if (current_term_ != request_term || mode_ != Mode::LEADER || exiting_) {
    return;
  }

@ -800,7 +800,7 @@ void RaftServer::SendSnapshot(uint16_t peer_id,
    return;
  }

-  if (current_term_ != request_term || exiting_) {
+  if (current_term_ != request_term || mode_ != Mode::LEADER || exiting_) {
    return;
  }

--- a/tests/integration/apollo_runs.yaml
+++ b/tests/integration/apollo_runs.yaml
@ -53,7 +53,7 @@

 - name: integration__ha_basic
  cd: ha/basic
-  commands: ./runner.py
+  commands: TIMEOUT=480 ./runner.py
  infiles:
    - runner.py # runner script
    - raft.json # raft configuration
--- a/tests/integration/ha/basic/runner.py
+++ b/tests/integration/ha/basic/runner.py
@ -19,10 +19,12 @@ from ha_test import HaTestBase
 class HaBasicTest(HaTestBase):
    def execute_step(self, step, expected_results):
        if step == "create":
+            print("Executing create query")
            client = subprocess.Popen([self.tester_binary, "--step", "create",
                "--cluster_size", str(self.cluster_size)])

        elif step == "count":
+            print("Executing count query")
            client = subprocess.Popen([self.tester_binary, "--step", "count",
                "--cluster_size", str(self.cluster_size), "--expected_results",
                str(expected_results)])
@ -33,6 +35,7 @@ class HaBasicTest(HaTestBase):
        try:
            code = client.wait(timeout=30)
        except subprocess.TimeoutExpired as e:
+            print("HA client timed out!")
            client.kill()
            return 1

@ -55,9 +58,10 @@ class HaBasicTest(HaTestBase):

            # Kill workers.
            for worker_id in partition:
+                print("Killing worker {}".format(worker_id))
                self.kill_worker(worker_id)

-            time.sleep(2) # allow some time for possible leader re-election
+            time.sleep(5) # allow some time for possible leader re-election

            if random.random() < 0.7:
                assert self.execute_step("create", expected_results) == 0, \
@ -69,8 +73,11 @@ class HaBasicTest(HaTestBase):

            # Bring workers back to life.
            for worker_id in partition:
+                print("Starting worker {}".format(worker_id))
                self.start_worker(worker_id)

+            time.sleep(5) # allow some time for possible leader re-election
+
        # Check that no data was lost.
        assert self.execute_step("count", expected_results) == 0, \
                "Error while executing count query"
--- a/tests/integration/ha/basic/tester.cpp
+++ b/tests/integration/ha/basic/tester.cpp
@ -64,6 +64,7 @@ int main(int argc, char **argv) {
        // This one seems to be down, continue.
        continue;
      }
+      LOG(INFO) << "Current Raft cluster leader is " << i;
    }
    if (!successfull) {
      LOG(INFO) << "Couldn't find Raft cluster leader, retrying.";
--- a/tests/integration/ha/ha_test.py
+++ b/tests/integration/ha/ha_test.py
@ -30,12 +30,7 @@ class HaTestBase:


    def __del__(self):
-        for worker in self.workers:
-            if worker is None: continue
-            worker.kill()
-            worker.wait()
-        self.workers.clear()
-        self.coordination_config_file.close()
+        self.destroy_cluster()


    def start_cluster(self):
@ -46,6 +41,15 @@ class HaTestBase:
        time.sleep(5)


+    def destroy_cluster(self):
+        for worker in self.workers:
+            if worker is None: continue
+            worker.kill()
+            worker.wait()
+        self.workers.clear()
+        self.coordination_config_file.close()
+
+
    def kill_worker(self, worker_id):
        assert worker_id >= 0 and worker_id < self.cluster_size, \
                "Invalid worker ID {}".format(worker_id)