Skip to content

Commit f8b0f1b

Browse files
committed
check and fix
1 parent 264b228 commit f8b0f1b

File tree

4 files changed

+192
-38
lines changed

4 files changed

+192
-38
lines changed

crmsh/bootstrap.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,7 @@
4646
from .sh import ShellUtils
4747
from .ui_node import NodeMgmt
4848
from .user_of_host import UserOfHost, UserNotFoundError
49-
from .sbd import SBDUtils, SBDManager, SBDTimeout
49+
from .sbd import SBDUtils, SBDManager, SBDTimeout, SBDTimeoutChecker
5050
from . import watchdog
5151
import crmsh.healthcheck
5252

@@ -2743,7 +2743,7 @@ def adjust_stonith_timeout(with_sbd: bool = False):
27432743
Adjust stonith-timeout for sbd and other scenarios
27442744
"""
27452745
if ServiceManager().service_is_active(constants.SBD_SERVICE) or with_sbd:
2746-
SBDTimeout.adjust_sbd_timeout_related_cluster_configuration()
2746+
SBDTimeoutChecker(fix=True, warn=False).check_and_fix()
27472747
else:
27482748
value = get_stonith_timeout_generally_expected()
27492749
if value:

crmsh/sbd.py

Lines changed: 166 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
from . import xmlutil
1111
from . import watchdog
1212
from . import parallax
13+
from . import healthcheck
1314
from .service_manager import ServiceManager
1415
from .sh import ShellUtils
1516

@@ -196,6 +197,7 @@ def __init__(self, context=None):
196197
self.sbd_watchdog_timeout = self.SBD_WATCHDOG_TIMEOUT_DEFAULT
197198
self.stonith_watchdog_timeout = None
198199
self.two_node_without_qdevice = False
200+
self.qdevice_sync_timeout = None
199201
if self.context:
200202
self._initialize_timeout_in_bootstrap()
201203

@@ -292,6 +294,8 @@ def _load_configurations(self):
292294
self.disk_based = False
293295
self.sbd_watchdog_timeout = SBDTimeout.get_sbd_watchdog_timeout()
294296
self.stonith_watchdog_timeout = SBDTimeout.get_stonith_watchdog_timeout_expected()
297+
# Only record the qdevice sync timeout when qdevice is both configured and
# its service is active; otherwise qdevice_sync_timeout keeps its prior value.
if corosync.is_qdevice_configured() and ServiceManager().service_is_active("corosync-qdevice.service"):
    self.qdevice_sync_timeout = utils.get_qdevice_sync_timeout()
295299
self.sbd_delay_start_value_expected = self.get_sbd_delay_start_expected() if utils.detect_virt() else "no"
296300
self.sbd_delay_start_value_from_config = SBDUtils.get_sbd_value_from_config("SBD_DELAY_START")
297301

@@ -359,53 +363,187 @@ def get_sbd_systemd_start_timeout() -> int:
359363
out = sh.cluster_shell().get_stdout_or_raise_error(SBDTimeout.SHOW_SBD_START_TIMEOUT_CMD)
360364
return utils.get_systemd_timeout_start_in_sec(out)
361365

362-
def adjust_systemd_start_timeout(self):
366+
367+
class SBDTimeoutChecker(SBDTimeout):
    '''
    Check the sbd timeout related settings one by one and, when requested,
    fix every setting whose check fails.

    Each _check_* method returns True when the setting is acceptable and
    False otherwise; the matching _adjust_* method applies the fix.
    '''

    def __init__(self, fix=False, warn=True):
        '''
        fix:  when True, check_and_fix() runs the fix function of a failed check
        warn: when True, a failed check logs a warning on its first run
        '''
        super().__init__()
        self.fix = fix
        self.warning_during_check = warn

    def check_and_fix(self) -> bool:
        '''
        Return True if all checks pass (after optional fixes).
        Return False at the first failed check when fixing is disabled.
        Raise healthcheck.FixFailure if a check still fails after its fix ran.
        '''
        checks_and_fixes = [
            # failure name, check function, fix function
            ("SBD disk metadata",
                self._check_sbd_disk_metadata, self._adjust_sbd_disk_metadata),
            ("SBD_WATCHDOG_TIMEOUT",
                self._check_sbd_watchdog_timeout, self._adjust_sbd_watchdog_timeout),
            ("SBD_DELAY_START",
                self._check_sbd_delay_start, self._adjust_sbd_delay_start),
            ("systemd start timeout for sbd.service",
                self._check_sbd_systemd_start_timeout, self._adjust_sbd_systemd_start_timeout),
            ("stonith-watchdog-timeout property",
                self._check_stonith_watchdog_timeout, self._adjust_stonith_watchdog_timeout),
            ("stonith-timeout property",
                self._check_stonith_timeout, self._adjust_stonith_timeout),
        ]

        self._load_configurations()
        for name, check_func, fix_func in checks_and_fixes:
            if check_func(warn=self.warning_during_check):
                continue
            if not self.fix:
                return False
            fix_func()
            # Later checks depend on the values just changed, so reload
            # before verifying the fix and before moving on
            self._load_configurations()
            if not check_func(warn=False):
                raise healthcheck.FixFailure(f"Failed to fix: {name}")

        return True

    def _check_sbd_disk_metadata(self, warn=True) -> bool:
        '''
        Check msgwait and watchdog timeout for disk-based sbd
        '''
        if self.disk_based and self.sbd_msgwait < 2*self.sbd_watchdog_timeout:
            if warn:
                logger.warning("It's recommended that msgwait(now %d) >= 2*watchdog timeout(now %d)",
                               self.sbd_msgwait, self.sbd_watchdog_timeout)
            return False
        return True

    def _adjust_sbd_disk_metadata(self):
        '''
        Adjust msgwait to 2*watchdog timeout for disk-based sbd
        '''
        advised_msgwait = 2*self.sbd_watchdog_timeout
        logger.info("Adjusting sbd msgwait to %d", advised_msgwait)
        cmd = f"crm sbd configure msgwait-timeout={advised_msgwait} watchdog-timeout={self.sbd_watchdog_timeout}"
        output = sh.cluster_shell().get_stdout_or_raise_error(cmd)
        if output:
            print(output)

    def _check_sbd_watchdog_timeout(self, warn=True) -> bool:
        '''
        Check SBD_WATCHDOG_TIMEOUT for disk-less sbd

        Always passes for disk-based sbd or when no qdevice sync timeout was loaded.
        '''
        if self.disk_based or not self.qdevice_sync_timeout:
            return True
        # NOTE(review): the message says ">" while the comparison accepts
        # equality -- confirm which one is intended
        if self.sbd_watchdog_timeout < self.qdevice_sync_timeout:
            if warn:
                logger.warning("It's recommended that SBD_WATCHDOG_TIMEOUT(now %d) > qdevice sync timeout(now %d)",
                               self.sbd_watchdog_timeout, self.qdevice_sync_timeout)
            return False
        return True

    def _adjust_sbd_watchdog_timeout(self):
        '''
        Adjust SBD_WATCHDOG_TIMEOUT for disk-less sbd
        '''
        advised_watchdog_timeout = self.qdevice_sync_timeout + SBDTimeout.QDEVICE_SYNC_TIMEOUT_MARGIN
        SBDManager.update_sbd_configuration({"SBD_WATCHDOG_TIMEOUT": str(advised_watchdog_timeout)})
        logger.info("Adjusting SBD_WATCHDOG_TIMEOUT to %d", advised_watchdog_timeout)

    def _check_sbd_delay_start(self, warn=True) -> bool:
        '''
        Check SBD_DELAY_START

        Fails when the expected value is "no" but the configured one differs,
        when the configured value is missing or not numeric, or when it is
        smaller than the expected value. A configured value larger than the
        expected one is accepted.
        '''
        expected_value = str(self.sbd_delay_start_value_expected)
        config_value = self.sbd_delay_start_value_from_config
        if expected_value == config_value:
            return True
        # "not config_value" guards re.search() against a missing setting
        if expected_value == "no" \
                or not config_value \
                or (not re.search(r'\d+', config_value)) \
                or (int(expected_value) > int(config_value)):
            if warn:
                logger.warning("It's recommended that SBD_DELAY_START is set to %s, current value is %s",
                               expected_value, config_value)
            return False
        # Must be an explicit True: an implicit None would be treated as a
        # failed check by check_and_fix()
        return True

    def _adjust_sbd_delay_start(self):
        '''
        Adjust SBD_DELAY_START
        '''
        SBDManager.update_sbd_configuration({"SBD_DELAY_START": str(self.sbd_delay_start_value_expected)})

    def _check_sbd_systemd_start_timeout(self, warn=True) -> bool:
        '''
        Check systemd start timeout for sbd.service

        Always passes when SBD_DELAY_START is unset or "no".
        '''
        if not self.sbd_delay_start_value_from_config or self.sbd_delay_start_value_from_config == "no":
            return True
        systemd_start_timeout = SBDTimeout.get_sbd_systemd_start_timeout()
        if systemd_start_timeout > int(self.sbd_delay_start_value_from_config):
            return True
        if warn:
            logger.warning("It's recommended that systemd start timeout for sbd.service is set to %d, current value is %d",
                           int(1.2*int(self.sbd_delay_start_value_from_config)), systemd_start_timeout)
        return False

    def _adjust_sbd_systemd_start_timeout(self):
        '''
        Adjust systemd start timeout for sbd.service

        Writes a drop-in with TimeoutSec set to 1.2 * SBD_DELAY_START, syncs
        the drop-in directory and reloads systemd.
        '''
        systemd_start_timeout = int(1.2*int(self.sbd_delay_start_value_from_config))
        utils.mkdirp(SBDManager.SBD_SYSTEMD_DELAY_START_DIR)
        sbd_delay_start_file = f"{SBDManager.SBD_SYSTEMD_DELAY_START_DIR}/sbd_delay_start.conf"
        utils.str2file(f"[Service]\nTimeoutSec={systemd_start_timeout}", sbd_delay_start_file)
        bootstrap.sync_path(SBDManager.SBD_SYSTEMD_DELAY_START_DIR)
        utils.cluster_run_cmd("systemctl daemon-reload")
        logger.info("Adjusted systemd start timeout for sbd.service to %d", systemd_start_timeout)

    def _check_stonith_watchdog_timeout(self, warn=True) -> bool:
        '''
        Check stonith-watchdog-timeout property

        Disk-based sbd: passes only when the property is not set.
        Disk-less sbd: passes only when the property is set and
        >= 2*SBD_WATCHDOG_TIMEOUT.
        '''
        value = utils.get_property("stonith-watchdog-timeout", get_default=False)
        if self.disk_based:
            if not value:
                return True
            # The result must not depend on "warn", otherwise the re-check
            # after a fix (warn=False) could never report a failure
            if warn:
                logger.warning("It's recommended to remove stonith-watchdog-timeout property when using disk-based SBD")
            return False
        # An unset property fails the check instead of crashing in int()
        if not value or int(value) < 2*self.sbd_watchdog_timeout:
            if warn:
                logger.warning("It's recommended that stonith-watchdog-timeout(now %s) >= 2*SBD_WATCHDOG_TIMEOUT(now %d)",
                               value, self.sbd_watchdog_timeout)
            return False
        return True

    def _adjust_stonith_watchdog_timeout(self):
        '''
        Adjust stonith-watchdog-timeout property

        Removes the property for disk-based sbd, otherwise sets it to the
        expected value.
        '''
        if self.disk_based:
            utils.delete_property("stonith-watchdog-timeout")
            logger.info("Removed stonith-watchdog-timeout property")
        else:
            advised_value = SBDTimeout.get_stonith_watchdog_timeout_expected()
            utils.set_property("stonith-watchdog-timeout", advised_value)
            logger.info("Adjusted stonith-watchdog-timeout to %d", advised_value)

    def _check_stonith_timeout(self, warn=True) -> bool:
        '''
        Check stonith-timeout property

        Fails when the property is unset or smaller than the expected value.
        '''
        value = utils.get_property("stonith-timeout", get_default=False)
        expected_value = self.get_stonith_timeout_expected()
        if not value or int(value) < expected_value:
            if warn:
                logger.warning("It's recommended that stonith-timeout is set to %d, current value is %s",
                               expected_value, value)
            return False
        return True

    def _adjust_stonith_timeout(self):
        '''
        Adjust stonith-timeout property
        '''
        expected_value = self.get_stonith_timeout_expected()
        utils.set_property("stonith-timeout", expected_value)
        logger.info("Adjusted stonith-timeout to %d", expected_value)
409547

410548

411549
class SBDManager:
@@ -497,7 +635,6 @@ def update_configuration(self) -> None:
497635
utils.sysconfig_set(self.SYSCONFIG_SBD, **self.update_dict)
498636
if self.cluster_is_running:
499637
bootstrap.sync_path(self.SYSCONFIG_SBD)
500-
logger.info("Already synced %s to all nodes", self.SYSCONFIG_SBD)
501638

502639
@classmethod
503640
def update_sbd_configuration(cls, update_dict: typing.Dict[str, str]) -> None:

crmsh/ui_cluster.py

Lines changed: 21 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@
2828
from .sh import ShellUtils
2929
from .ui_node import parse_option_for_nodes
3030
from . import constants
31-
31+
from . import sbd
3232

3333
from . import log
3434
from .utils import TerminateSubCommand
@@ -793,18 +793,17 @@ def do_geo_init_arbitrator(self, context, *args):
793793
bootstrap.bootstrap_arbitrator(geo_context)
794794
return True
795795

796-
@command.completers(compl.choice([
797-
'hawk2',
798-
'sles16',
799-
]))
796+
HEALTH_COMPONENTS = ['hawk2', 'sles16', 'sbd']
797+
798+
@command.completers(compl.choice(HEALTH_COMPONENTS))
800799
def do_health(self, context, *args):
801800
'''
802801
Extensive health check.
803802
'''
804803
if not args:
805804
return Cluster._do_health_legacy()
806805
parser = argparse.ArgumentParser('health')
807-
parser.add_argument('component', choices=['hawk2', 'sles16'])
806+
parser.add_argument('component', choices=Cluster.HEALTH_COMPONENTS)
808807
parser.add_argument('-f', '--fix', action='store_true')
809808
parsed_args, remaining_args = parser.parse_known_args(args)
810809
match parsed_args.component:
@@ -837,8 +836,23 @@ def do_health(self, context, *args):
837836
return True
838837
else:
839838
logger.error("hawk2: passwordless ssh authentication: FAIL.")
840-
logger.warning('Please run "crm cluster health hawk2 --fix"')
839+
logger.warning('Please run "crm cluster health hawk2 --fix".')
841840
return False
841+
case 'sbd':
842+
fix = parsed_args.fix
843+
try:
844+
result = sbd.SBDTimeoutChecker(fix=fix).check_and_fix()
845+
except healthcheck.FixFailure as e:
846+
logger.error('%s', e)
847+
return False
848+
if result:
849+
logger.info("SBD: Check sbd timeout settings: OK.")
850+
return True
851+
else:
852+
logger.error("SBD: Check sbd timeout settings: FAIL.")
853+
if not fix:
854+
logger.warning('Please run "crm cluster health sbd --fix".')
855+
return False
842856
case 'sles16':
843857
try:
844858
if parsed_args.fix:

crmsh/ui_sbd.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -253,6 +253,9 @@ def _configure_show(self, args) -> None:
253253
print()
254254
self._show_property()
255255

256+
print()
257+
sbd.SBDTimeoutChecker().check_and_fix()
258+
256259
def _parse_args(self, args: tuple[str, ...]) -> dict[str, int|str]:
257260
'''
258261
Parse arguments and verify them

0 commit comments

Comments
 (0)