Fix Raft failure discovered in apollo run 479391
Summary: We noticed a Raft test failure (https://apollo.memgraph.io/runs/479391/). This diff should fix it. Reviewers: ipaljak. Reviewed By: ipaljak. Subscribers: mferencevic, pullbot. Differential Revision: https://phabricator.memgraph.io/D1865
This commit is contained in:
parent
d1eeaa8de0
commit
ed75e45541
@ -717,7 +717,7 @@ void RaftServer::SendLogEntries(
|
||||
return;
|
||||
}
|
||||
|
||||
if (current_term_ != request_term || exiting_) {
|
||||
if (current_term_ != request_term || mode_ != Mode::LEADER || exiting_) {
|
||||
return;
|
||||
}
|
||||
|
||||
@ -800,7 +800,7 @@ void RaftServer::SendSnapshot(uint16_t peer_id,
|
||||
return;
|
||||
}
|
||||
|
||||
if (current_term_ != request_term || exiting_) {
|
||||
if (current_term_ != request_term || mode_ != Mode::LEADER || exiting_) {
|
||||
return;
|
||||
}
|
||||
|
||||
|
@ -53,7 +53,7 @@
|
||||
|
||||
- name: integration__ha_basic
|
||||
cd: ha/basic
|
||||
commands: ./runner.py
|
||||
commands: TIMEOUT=480 ./runner.py
|
||||
infiles:
|
||||
- runner.py # runner script
|
||||
- raft.json # raft configuration
|
||||
|
@ -19,10 +19,12 @@ from ha_test import HaTestBase
|
||||
class HaBasicTest(HaTestBase):
|
||||
def execute_step(self, step, expected_results):
|
||||
if step == "create":
|
||||
print("Executing create query")
|
||||
client = subprocess.Popen([self.tester_binary, "--step", "create",
|
||||
"--cluster_size", str(self.cluster_size)])
|
||||
|
||||
elif step == "count":
|
||||
print("Executing count query")
|
||||
client = subprocess.Popen([self.tester_binary, "--step", "count",
|
||||
"--cluster_size", str(self.cluster_size), "--expected_results",
|
||||
str(expected_results)])
|
||||
@ -33,6 +35,7 @@ class HaBasicTest(HaTestBase):
|
||||
try:
|
||||
code = client.wait(timeout=30)
|
||||
except subprocess.TimeoutExpired as e:
|
||||
print("HA client timed out!")
|
||||
client.kill()
|
||||
return 1
|
||||
|
||||
@ -55,9 +58,10 @@ class HaBasicTest(HaTestBase):
|
||||
|
||||
# Kill workers.
|
||||
for worker_id in partition:
|
||||
print("Killing worker {}".format(worker_id))
|
||||
self.kill_worker(worker_id)
|
||||
|
||||
time.sleep(2) # allow some time for possible leader re-election
|
||||
time.sleep(5) # allow some time for possible leader re-election
|
||||
|
||||
if random.random() < 0.7:
|
||||
assert self.execute_step("create", expected_results) == 0, \
|
||||
@ -69,8 +73,11 @@ class HaBasicTest(HaTestBase):
|
||||
|
||||
# Bring workers back to life.
|
||||
for worker_id in partition:
|
||||
print("Starting worker {}".format(worker_id))
|
||||
self.start_worker(worker_id)
|
||||
|
||||
time.sleep(5) # allow some time for possible leader re-election
|
||||
|
||||
# Check that no data was lost.
|
||||
assert self.execute_step("count", expected_results) == 0, \
|
||||
"Error while executing count query"
|
||||
|
@ -64,6 +64,7 @@ int main(int argc, char **argv) {
|
||||
// This one seems to be down, continue.
|
||||
continue;
|
||||
}
|
||||
LOG(INFO) << "Current Raft cluster leader is " << i;
|
||||
}
|
||||
if (!successfull) {
|
||||
LOG(INFO) << "Couldn't find Raft cluster leader, retrying.";
|
||||
|
@ -30,12 +30,7 @@ class HaTestBase:
|
||||
|
||||
|
||||
def __del__(self):
|
||||
for worker in self.workers:
|
||||
if worker is None: continue
|
||||
worker.kill()
|
||||
worker.wait()
|
||||
self.workers.clear()
|
||||
self.coordination_config_file.close()
|
||||
self.destroy_cluster()
|
||||
|
||||
|
||||
def start_cluster(self):
|
||||
@ -46,6 +41,15 @@ class HaTestBase:
|
||||
time.sleep(5)
|
||||
|
||||
|
||||
def destroy_cluster(self):
|
||||
for worker in self.workers:
|
||||
if worker is None: continue
|
||||
worker.kill()
|
||||
worker.wait()
|
||||
self.workers.clear()
|
||||
self.coordination_config_file.close()
|
||||
|
||||
|
||||
def kill_worker(self, worker_id):
|
||||
assert worker_id >= 0 and worker_id < self.cluster_size, \
|
||||
"Invalid worker ID {}".format(worker_id)
|
||||
|
Loading…
Reference in New Issue
Block a user