Skip to content

Commit 89bb8cc

Browse files
authored
Merge pull request #1228 from liangxin1300/20230803_stop_cluster_all_crmsh45
[crmsh-4.5] Fix: ui_cluster: Improve the process of 'crm cluster stop' (bsc#1213889)
2 parents 8b3c138 + 1fcb08c commit 89bb8cc

File tree

5 files changed

+191
-91
lines changed

5 files changed

+191
-91
lines changed

crmsh/ui_cluster.py

Lines changed: 57 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -190,40 +190,73 @@ def do_start(self, context, *args):
190190
for node in node_list:
191191
logger.info("The cluster stack started on {}".format(node))
192192

193+
@staticmethod
194+
def _node_ready_to_stop_cluster_service(node):
195+
"""
196+
Check if the specific node is ready to stop cluster service
197+
198+
If both corosync.service and pacemaker.service are active, return True
199+
If some services started, stop them first and return False
200+
"""
201+
corosync_active = utils.service_is_active("corosync.service", remote_addr=node)
202+
sbd_active = utils.service_is_active("sbd.service", remote_addr=node)
203+
pacemaker_active = utils.service_is_active("pacemaker.service", remote_addr=node)
204+
205+
if not corosync_active:
206+
if sbd_active:
207+
utils.stop_service("corosync", remote_addr=node)
208+
logger.info(f"The cluster stack stopped on {node}")
209+
else:
210+
logger.info(f"The cluster stack already stopped on {node}")
211+
return False
212+
213+
elif not pacemaker_active:
214+
utils.stop_service("corosync", remote_addr=node)
215+
logger.info("The cluster stack stopped on {}".format(node))
216+
return False
217+
218+
return True
219+
220+
@staticmethod
221+
def _wait_for_dc(node=None):
222+
"""
223+
Wait for the cluster's DC to become available
224+
"""
225+
if not utils.service_is_active("pacemaker.service", remote_addr=node):
226+
return
227+
228+
dc_deadtime = utils.get_property("dc-deadtime", peer=node) or str(constants.DC_DEADTIME_DEFAULT)
229+
dc_timeout = int(dc_deadtime.strip('s')) + 5
230+
try:
231+
utils.check_function_with_timeout(utils.get_dc, wait_timeout=dc_timeout, peer=node)
232+
except TimeoutError:
233+
logger.error("No DC found currently, please wait if the cluster is still starting")
234+
raise utils.TerminateSubCommand
235+
236+
@staticmethod
237+
def _set_dlm(node=None):
238+
"""
239+
When dlm is running and quorum is lost, before stopping the cluster service, set
240+
enable_quorum_fencing=0, enable_quorum_lockspace=0 for dlm config option
241+
"""
242+
if utils.is_dlm_running(node) and not utils.is_quorate(node):
243+
logger.debug("Quorum is lost; Set enable_quorum_fencing=0 and enable_quorum_lockspace=0 for dlm")
244+
utils.set_dlm_option(peer=node, enable_quorum_fencing=0, enable_quorum_lockspace=0)
245+
193246
@command.skill_level('administrator')
194247
def do_stop(self, context, *args):
195248
'''
196249
Stops the cluster stack on all nodes or specific node(s)
197250
'''
198251
node_list = parse_option_for_nodes(context, *args)
199-
for node in node_list[:]:
200-
if not utils.service_is_active("corosync.service", remote_addr=node):
201-
if utils.service_is_active("sbd.service", remote_addr=node):
202-
utils.stop_service("corosync", remote_addr=node)
203-
logger.info("The cluster stack stopped on {}".format(node))
204-
else:
205-
logger.info("The cluster stack already stopped on {}".format(node))
206-
node_list.remove(node)
207-
elif not utils.service_is_active("pacemaker.service", remote_addr=node):
208-
utils.stop_service("corosync", remote_addr=node)
209-
logger.info("The cluster stack stopped on {}".format(node))
210-
node_list.remove(node)
252+
node_list = [n for n in node_list if self._node_ready_to_stop_cluster_service(n)]
211253
if not node_list:
212254
return
255+
logger.debug(f"stop node list: {node_list}")
213256

214-
dc_deadtime = utils.get_property("dc-deadtime") or str(constants.DC_DEADTIME_DEFAULT)
215-
dc_timeout = int(dc_deadtime.strip('s')) + 5
216-
try:
217-
utils.check_function_with_timeout(utils.get_dc, wait_timeout=dc_timeout)
218-
except TimeoutError:
219-
logger.error("No DC found currently, please wait if the cluster is still starting")
220-
return False
257+
self._wait_for_dc(node_list[0])
221258

222-
# When dlm running and quorum is lost, before stop cluster service, should set
223-
# enable_quorum_fencing=0, enable_quorum_lockspace=0 for dlm config option
224-
if utils.is_dlm_running() and not utils.is_quorate():
225-
logger.debug("Quorum is lost; Set enable_quorum_fencing=0 and enable_quorum_lockspace=0 for dlm")
226-
utils.set_dlm_option(enable_quorum_fencing=0, enable_quorum_lockspace=0)
259+
self._set_dlm(node_list[0])
227260

228261
# Stop pacemaker since it can make sure cluster has quorum until stop corosync
229262
node_list = utils.stop_service("pacemaker", node_list=node_list)

crmsh/utils.py

Lines changed: 38 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -1142,14 +1142,14 @@ def append_file(dest, src):
11421142
return False
11431143

11441144

1145-
def get_dc():
1145+
def get_dc(peer=None):
11461146
cmd = "crmadmin -D -t 1"
1147-
rc, s, _ = get_stdout_stderr(add_sudo(cmd))
1148-
if rc != 0:
1147+
out = get_stdout_or_raise_error(add_sudo(cmd), remote=peer, no_raise=True)
1148+
if not out:
11491149
return None
1150-
if not s.startswith("Designated"):
1150+
if not out.startswith("Designated"):
11511151
return None
1152-
return s.split()[-1]
1152+
return out.split()[-1]
11531153

11541154

11551155
def wait4dc(what="", show_progress=True):
@@ -3204,47 +3204,62 @@ def is_standby(node):
32043204
return re.search(r'Node\s+{}:\s+standby'.format(node), out) is not None
32053205

32063206

3207-
def get_dlm_option_dict():
3207+
def get_dlm_option_dict(peer=None):
32083208
"""
32093209
Get dlm config option dictionary
32103210
"""
3211-
out = get_stdout_or_raise_error("dlm_tool dump_config")
3211+
out = get_stdout_or_raise_error("dlm_tool dump_config", remote=peer)
32123212
return dict(re.findall("(\w+)=(\w+)", out))
32133213

32143214

3215-
def set_dlm_option(**kargs):
3215+
def set_dlm_option(peer=None, **kargs):
32163216
"""
32173217
Set dlm option
32183218
"""
3219-
dlm_option_dict = get_dlm_option_dict()
3219+
dlm_option_dict = get_dlm_option_dict(peer=peer)
32203220
for option, value in kargs.items():
32213221
if option not in dlm_option_dict:
3222-
raise ValueError('"{}" is not dlm config option'.format(option))
3222+
raise ValueError(f'"{option}" is not dlm config option')
32233223
if dlm_option_dict[option] != value:
3224-
get_stdout_or_raise_error('dlm_tool set_config "{}={}"'.format(option, value))
3224+
get_stdout_or_raise_error(f'dlm_tool set_config "{option}={value}"', remote=peer)
32253225

32263226

3227-
def is_dlm_running():
3227+
def is_dlm_running(peer=None):
32283228
"""
32293229
Check if dlm ra controld is running
32303230
"""
3231-
from . import xmlutil
3232-
return xmlutil.CrmMonXmlParser.is_resource_started(constants.DLM_CONTROLD_RA)
3231+
return is_resource_running(constants.DLM_CONTROLD_RA, peer=peer)
3232+
3233+
3234+
def has_resource_configured(ra_type, peer=None):
3235+
"""
3236+
Check if the RA is configured
3237+
"""
3238+
out = get_stdout_or_raise_error("crm_mon -1rR", remote=peer)
3239+
return re.search(ra_type, out) is not None
32333240

32343241

3235-
def is_dlm_configured():
3242+
def is_resource_running(ra_type, peer=None):
3243+
"""
3244+
Check if the RA is running
3245+
"""
3246+
out = get_stdout_or_raise_error("crm_mon -1rR", remote=peer)
3247+
patt = f"\({ra_type}\):\s*Started"
3248+
return re.search(patt, out) is not None
3249+
3250+
3251+
def is_dlm_configured(peer=None):
32363252
"""
32373253
Check if dlm configured
32383254
"""
3239-
from . import xmlutil
3240-
return xmlutil.CrmMonXmlParser.is_resource_configured(constants.DLM_CONTROLD_RA)
3255+
return has_resource_configured(constants.DLM_CONTROLD_RA, peer=peer)
32413256

32423257

3243-
def is_quorate():
3258+
def is_quorate(peer=None):
32443259
"""
32453260
Check if the cluster is quorate
32463261
"""
3247-
out = get_stdout_or_raise_error("corosync-quorumtool -s", success_val_list=[0, 2])
3262+
out = get_stdout_or_raise_error("corosync-quorumtool -s", remote=peer, success_val_list=[0, 2])
32483263
res = re.search(r'Quorate:\s+(.*)', out)
32493264
if res:
32503265
return res.group(1) == "Yes"
@@ -3270,7 +3285,7 @@ def get_pcmk_delay_max(two_node_without_qdevice=False):
32703285
return 0
32713286

32723287

3273-
def get_property(name, property_type="crm_config"):
3288+
def get_property(name, property_type="crm_config", peer=None):
32743289
"""
32753290
Get cluster properties
32763291
@@ -3281,8 +3296,7 @@ def get_property(name, property_type="crm_config"):
32813296
cmd = "CIB_file={} sudo --preserve-env=CIB_file crm configure get_property {}".format(cib_path, name)
32823297
else:
32833298
cmd = "sudo crm_attribute -t {} -n {} -Gq".format(property_type, name)
3284-
rc, stdout, _ = get_stdout_stderr(cmd)
3285-
return stdout if rc == 0 else None
3299+
return get_stdout_or_raise_error(cmd, remote=peer, no_raise=True)
32863300

32873301

32883302
def check_no_quorum_policy_with_dlm():
@@ -3428,7 +3442,7 @@ def detect_file(_file, remote=None):
34283442
return rc
34293443

34303444

3431-
def check_function_with_timeout(check_function, wait_timeout=30, interval=1):
3445+
def check_function_with_timeout(check_function, wait_timeout=30, interval=1, *args, **kwargs):
34323446
"""
34333447
Run check_function in a loop
34343448
Return when check_function is true
@@ -3437,7 +3451,7 @@ def check_function_with_timeout(check_function, wait_timeout=30, interval=1):
34373451
current_time = int(time.time())
34383452
timeout = current_time + wait_timeout
34393453
while current_time <= timeout:
3440-
if check_function():
3454+
if check_function(*args, **kwargs):
34413455
return
34423456
time.sleep(interval)
34433457
current_time = int(time.time())

test/features/bootstrap_bugs.feature

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -132,6 +132,20 @@ Feature: Regression test for bootstrap bugs
132132
When Run "crm cluster stop" on "hanode1"
133133
Then Service "corosync" is "stopped" on "hanode1"
134134

135+
@clean
136+
Scenario: Can't stop all nodes' cluster service when local node's service is down (bsc#1213889)
137+
Given Cluster service is "stopped" on "hanode1"
138+
And Cluster service is "stopped" on "hanode2"
139+
When Run "crm cluster init -y" on "hanode1"
140+
Then Cluster service is "started" on "hanode1"
141+
When Run "crm cluster join -c hanode1 -y" on "hanode2"
142+
Then Cluster service is "started" on "hanode2"
143+
When Wait for DC
144+
And Run "crm cluster stop" on "hanode1"
145+
And Run "crm cluster stop --all" on "hanode1"
146+
Then Cluster service is "stopped" on "hanode1"
147+
And Cluster service is "stopped" on "hanode2"
148+
135149
@skip_non_root
136150
@clean
137151
Scenario: crm cluster join default behavior change in ssh key handling (bsc#1210693)

test/unittests/test_ui_cluster.py

Lines changed: 73 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -80,52 +80,91 @@ def test_do_start(self, mock_parse_nodes, mock_active, mock_start, mock_qdevice_
8080
mock_qdevice_configured.assert_called_once_with()
8181
mock_info.assert_called_once_with("The cluster stack started on node1")
8282

83-
@mock.patch('logging.Logger.info')
84-
@mock.patch('crmsh.utils.service_is_active')
83+
@mock.patch('crmsh.ui_cluster.Cluster._wait_for_dc')
84+
@mock.patch('crmsh.ui_cluster.Cluster._node_ready_to_stop_cluster_service')
8585
@mock.patch('crmsh.ui_cluster.parse_option_for_nodes')
86-
def test_do_stop_already_stopped(self, mock_parse_nodes, mock_active, mock_info):
86+
def test_do_stop_return(self, mock_parse_nodes, mock_node_ready_to_stop_cluster_service, mock_dc):
87+
mock_parse_nodes.return_value = ["node1", "node2"]
88+
mock_node_ready_to_stop_cluster_service.side_effect = [False, False]
89+
8790
context_inst = mock.Mock()
88-
mock_parse_nodes.return_value = ["node1"]
89-
mock_active.side_effect = [False, False]
90-
self.ui_cluster_inst.do_stop(context_inst, "node1")
91-
mock_active.assert_has_calls([
92-
mock.call("corosync.service", remote_addr="node1"),
93-
mock.call("sbd.service", remote_addr="node1")
94-
])
95-
mock_info.assert_called_once_with("The cluster stack already stopped on node1")
91+
self.ui_cluster_inst.do_stop(context_inst, "node1", "node2")
92+
93+
mock_parse_nodes.assert_called_once_with(context_inst, "node1", "node2")
94+
mock_node_ready_to_stop_cluster_service.assert_has_calls([mock.call("node1"), mock.call("node2")])
95+
mock_dc.assert_not_called()
9696

9797
@mock.patch('logging.Logger.debug')
9898
@mock.patch('logging.Logger.info')
99-
@mock.patch('crmsh.utils.stop_service')
100-
@mock.patch('crmsh.utils.set_dlm_option')
101-
@mock.patch('crmsh.utils.is_quorate')
102-
@mock.patch('crmsh.utils.is_dlm_running')
103-
@mock.patch('crmsh.utils.get_dc')
104-
@mock.patch('crmsh.utils.check_function_with_timeout')
105-
@mock.patch('crmsh.utils.get_property')
10699
@mock.patch('crmsh.utils.service_is_active')
100+
@mock.patch('crmsh.utils.stop_service')
101+
@mock.patch('crmsh.ui_cluster.Cluster._set_dlm')
102+
@mock.patch('crmsh.ui_cluster.Cluster._wait_for_dc')
103+
@mock.patch('crmsh.ui_cluster.Cluster._node_ready_to_stop_cluster_service')
107104
@mock.patch('crmsh.ui_cluster.parse_option_for_nodes')
108-
def test_do_stop(self, mock_parse_nodes, mock_active, mock_get_property, mock_check, mock_get_dc, mock_dlm_running, mock_is_quorate, mock_set_dlm, mock_stop, mock_info, mock_debug):
105+
def test_do_stop(self, mock_parse_nodes, mock_node_ready_to_stop_cluster_service, mock_dc,
106+
mock_set_dlm, mock_stop, mock_is_active, mock_info, mock_debug):
107+
mock_parse_nodes.return_value = ["node1", "node2"]
108+
mock_node_ready_to_stop_cluster_service.side_effect = [True, False]
109+
mock_stop.side_effect = [["node1"], ["node1"], ["node1"]]
110+
mock_is_active.return_value = True
111+
109112
context_inst = mock.Mock()
110-
mock_stop.side_effect = [["node1"], ["ndoe1"], ["node1"]]
111-
mock_parse_nodes.return_value = ["node1"]
112-
mock_active.side_effect = [True, True, True]
113-
mock_dlm_running.return_value = True
114-
mock_is_quorate.return_value = False
115-
mock_get_property.return_value = "20s"
113+
self.ui_cluster_inst.do_stop(context_inst, "node1", "node2")
116114

117-
self.ui_cluster_inst.do_stop(context_inst, "node1")
115+
mock_parse_nodes.assert_called_once_with(context_inst, "node1", "node2")
116+
mock_node_ready_to_stop_cluster_service.assert_has_calls([mock.call("node1"), mock.call("node2")])
117+
mock_debug.assert_called_once_with("stop node list: ['node1']")
118+
mock_dc.assert_called_once_with("node1")
119+
mock_set_dlm.assert_called_once_with("node1")
120+
mock_stop.assert_has_calls([
121+
mock.call("pacemaker", node_list=["node1"]),
122+
mock.call("corosync-qdevice.service", node_list=["node1"]),
123+
mock.call("corosync", node_list=["node1"]),
124+
])
125+
mock_info.assert_called_once_with("The cluster stack stopped on node1")
118126

119-
mock_active.assert_has_calls([
127+
@mock.patch('logging.Logger.info')
128+
@mock.patch('crmsh.utils.stop_service')
129+
@mock.patch('crmsh.utils.service_is_active')
130+
def test__node_ready_to_stop_cluster_service_corosync(self, mock_is_active, mock_stop, mock_info):
131+
mock_is_active.side_effect = [False, True, False]
132+
res = self.ui_cluster_inst._node_ready_to_stop_cluster_service("node1")
133+
assert res is False
134+
mock_is_active.assert_has_calls([
120135
mock.call("corosync.service", remote_addr="node1"),
136+
mock.call("sbd.service", remote_addr="node1"),
121137
mock.call("pacemaker.service", remote_addr="node1"),
122-
mock.call("corosync-qdevice.service")
123138
])
124-
mock_stop.assert_has_calls([
125-
mock.call("pacemaker", node_list=["node1"]),
126-
mock.call("corosync-qdevice.service", node_list=["node1"]),
127-
mock.call("corosync", node_list=["node1"])
139+
mock_stop.assert_called_once_with("corosync", remote_addr="node1")
140+
mock_info.assert_called_once_with("The cluster stack stopped on node1")
141+
142+
@mock.patch('logging.Logger.info')
143+
@mock.patch('crmsh.utils.stop_service')
144+
@mock.patch('crmsh.utils.service_is_active')
145+
def test__node_ready_to_stop_cluster_service_pacemaker(self, mock_is_active, mock_stop, mock_info):
146+
mock_is_active.side_effect = [True, True, False]
147+
res = self.ui_cluster_inst._node_ready_to_stop_cluster_service("node1")
148+
assert res is False
149+
mock_is_active.assert_has_calls([
150+
mock.call("corosync.service", remote_addr="node1"),
151+
mock.call("sbd.service", remote_addr="node1"),
152+
mock.call("pacemaker.service", remote_addr="node1"),
128153
])
154+
mock_stop.assert_called_once_with("corosync", remote_addr="node1")
129155
mock_info.assert_called_once_with("The cluster stack stopped on node1")
130-
mock_debug.assert_called_once_with("Quorum is lost; Set enable_quorum_fencing=0 and enable_quorum_lockspace=0 for dlm")
131-
mock_check.assert_called_once_with(mock_get_dc, wait_timeout=25)
156+
157+
@mock.patch('logging.Logger.info')
158+
@mock.patch('crmsh.utils.stop_service')
159+
@mock.patch('crmsh.utils.service_is_active')
160+
def test__node_ready_to_stop_cluster_service(self, mock_is_active, mock_stop, mock_info):
161+
mock_is_active.side_effect = [True, True, True]
162+
res = self.ui_cluster_inst._node_ready_to_stop_cluster_service("node1")
163+
assert res is True
164+
mock_is_active.assert_has_calls([
165+
mock.call("corosync.service", remote_addr="node1"),
166+
mock.call("sbd.service", remote_addr="node1"),
167+
mock.call("pacemaker.service", remote_addr="node1"),
168+
])
169+
mock_info.assert_not_called()
170+
mock_stop.assert_not_called()

0 commit comments

Comments
 (0)