| 
10 | 10 | from . import xmlutil  | 
11 | 11 | from . import watchdog  | 
12 | 12 | from . import parallax  | 
 | 13 | +from . import healthcheck  | 
13 | 14 | from .service_manager import ServiceManager  | 
14 | 15 | from .sh import ShellUtils  | 
15 | 16 | 
 
  | 
@@ -196,6 +197,7 @@ def __init__(self, context=None):  | 
196 | 197 |         self.sbd_watchdog_timeout = self.SBD_WATCHDOG_TIMEOUT_DEFAULT  | 
197 | 198 |         self.stonith_watchdog_timeout = None  | 
198 | 199 |         self.two_node_without_qdevice = False  | 
 | 200 | +        self.qdevice_sync_timeout = None  | 
199 | 201 |         if self.context:  | 
200 | 202 |             self._initialize_timeout_in_bootstrap()  | 
201 | 203 | 
 
  | 
@@ -292,6 +294,8 @@ def _load_configurations(self):  | 
292 | 294 |             self.disk_based = False  | 
293 | 295 |             self.sbd_watchdog_timeout = SBDTimeout.get_sbd_watchdog_timeout()  | 
294 | 296 |             self.stonith_watchdog_timeout = SBDTimeout.get_stonith_watchdog_timeout_expected()  | 
 | 297 | +            if corosync.is_qdevice_configured() and ServiceManager().service_is_active("corosync-qdevice.service")  | 
 | 298 | +                self.qdevice_sync_timeout = utils.get_qdevice_sync_timeout()  | 
295 | 299 |         self.sbd_delay_start_value_expected = self.get_sbd_delay_start_expected() if utils.detect_virt() else "no"  | 
296 | 300 |         self.sbd_delay_start_value_from_config = SBDUtils.get_sbd_value_from_config("SBD_DELAY_START")  | 
297 | 301 | 
 
  | 
@@ -359,53 +363,187 @@ def get_sbd_systemd_start_timeout() -> int:  | 
359 | 363 |         out = sh.cluster_shell().get_stdout_or_raise_error(SBDTimeout.SHOW_SBD_START_TIMEOUT_CMD)  | 
360 | 364 |         return utils.get_systemd_timeout_start_in_sec(out)  | 
361 | 365 | 
 
  | 
362 |  | -    def adjust_systemd_start_timeout(self):  | 
 | 366 | + | 
 | 367 | +class SBDTimeoutChecker(SBDTimeout):  | 
 | 368 | + | 
 | 369 | +    def __init__(self, fix=False, warn=True):  | 
 | 370 | +        super().__init__()  | 
 | 371 | +        self.fix = fix  | 
 | 372 | +        self.warning_during_check = warn  | 
 | 373 | + | 
 | 374 | +    def check_and_fix(self) -> bool:  | 
 | 375 | +        '''  | 
 | 376 | +        Return True if all checks pass (after optional fixes),  | 
 | 377 | +        otherwise False or raise healthcheck.FixFailure if fixing fails  | 
 | 378 | +        '''  | 
 | 379 | +        checks_and_fixes = [  | 
 | 380 | +            # failure name, check function, fix function  | 
 | 381 | +            ("SBD disk metadata",  | 
 | 382 | +             self._check_sbd_disk_metadata, self._adjust_sbd_disk_metadata),  | 
 | 383 | +            ("SBD_WATCHDOG_TIMEOUT",  | 
 | 384 | +             self._check_sbd_watchdog_timeout, self._adjust_sbd_watchdog_timeout),  | 
 | 385 | +            ("SBD_DELAY_START",  | 
 | 386 | +             self._check_sbd_delay_start, self._adjust_sbd_delay_start),  | 
 | 387 | +            ("systemd start timeout for sbd.service",  | 
 | 388 | +             self._check_sbd_systemd_start_timeout, self._adjust_sbd_systemd_start_timeout),  | 
 | 389 | +            ("stonith-watchdog-timeout property",  | 
 | 390 | +             self._check_stonith_watchdog_timeout, self._adjust_stonith_watchdog_timeout),  | 
 | 391 | +            ("stonith-timeout property",  | 
 | 392 | +             self._check_stonith_timeout, self._adjust_stonith_timeout)  | 
 | 393 | +        ]  | 
 | 394 | + | 
 | 395 | +        self._load_configurations()  | 
 | 396 | +        for name, check_func, fix_func in checks_and_fixes:  | 
 | 397 | +            if check_func(warn=self.warning_during_check):  | 
 | 398 | +                continue  | 
 | 399 | +            if not self.fix:  | 
 | 400 | +                return False  | 
 | 401 | +            fix_func()  | 
 | 402 | +            self._load_configurations()  | 
 | 403 | +            if not check_func(warn=False):  | 
 | 404 | +                raise healthcheck.FixFailure(f"Failed to fix: {name}")  | 
 | 405 | + | 
 | 406 | +        return True  | 
 | 407 | + | 
 | 408 | +    def _check_sbd_disk_metadata(self, warn=True) -> bool:  | 
363 | 409 |         '''  | 
364 |  | -        Adjust start timeout for sbd when set SBD_DELAY_START  | 
 | 410 | +        Check msgwait and watchdog timeout for disk-based sbd  | 
365 | 411 |         '''  | 
366 |  | -        sbd_delay_start_value = SBDUtils.get_sbd_value_from_config("SBD_DELAY_START")  | 
367 |  | -        if sbd_delay_start_value == "no":  | 
368 |  | -            return  | 
 | 412 | +        if self.disk_based and self.sbd_msgwait < 2*self.sbd_watchdog_timeout:  | 
 | 413 | +            if warn:  | 
 | 414 | +                logger.warning("It's recommended that msgwait(now %d) >= 2*watchdog timeout(now %d)",  | 
 | 415 | +                               self.sbd_msgwait, self.sbd_watchdog_timeout)  | 
 | 416 | +            return False  | 
 | 417 | +        return True  | 
369 | 418 | 
 
  | 
370 |  | -        start_timeout = SBDTimeout.get_sbd_systemd_start_timeout()  | 
371 |  | -        if start_timeout > int(sbd_delay_start_value):  | 
372 |  | -            return  | 
 | 419 | +    def _adjust_sbd_disk_metadata(self):  | 
 | 420 | +        '''  | 
 | 421 | +        Adjust msgwait to 2*watchdog timeout for disk-based sbd  | 
 | 422 | +        '''  | 
 | 423 | +        advised_msgwait = 2*self.sbd_watchdog_timeout  | 
 | 424 | +        logger.info("Adjusting sbd msgwait to %d", advised_msgwait)  | 
 | 425 | +        cmd = f"crm sbd configure msgwait-timeout={advised_msgwait} watchdog-timeout={self.sbd_watchdog_timeout}"  | 
 | 426 | +        output = sh.cluster_shell().get_stdout_or_raise_error(cmd)  | 
 | 427 | +        if output:  | 
 | 428 | +            print(output)  | 
373 | 429 | 
 
  | 
374 |  | -        utils.mkdirp(SBDManager.SBD_SYSTEMD_DELAY_START_DIR)  | 
375 |  | -        sbd_delay_start_file = "{}/sbd_delay_start.conf".format(SBDManager.SBD_SYSTEMD_DELAY_START_DIR)  | 
376 |  | -        utils.str2file("[Service]\nTimeoutSec={}".format(int(1.2*int(sbd_delay_start_value))), sbd_delay_start_file)  | 
377 |  | -        bootstrap.sync_path(SBDManager.SBD_SYSTEMD_DELAY_START_DIR)  | 
378 |  | -        utils.cluster_run_cmd("systemctl daemon-reload")  | 
 | 430 | +    def _check_sbd_watchdog_timeout(self, warn=True) -> bool:  | 
 | 431 | +        '''  | 
 | 432 | +        Check SBD_WATCHDOG_TIMEOUT for disk-less sbd  | 
 | 433 | +        '''  | 
 | 434 | +        if self.disk_based or not self.qdevice_sync_timeout:  | 
 | 435 | +            return True  | 
 | 436 | +        if self.sbd_watchdog_timeout < self.qdevice_sync_timeout:  | 
 | 437 | +            if warn:  | 
 | 438 | +                logger.warning("It's recommended that SBD_WATCHDOG_TIMEOUT(now %d) > qdevice sync timeout(now %d)",  | 
 | 439 | +                               self.sbd_watchdog_timeout, self.qdevice_sync_timeout)  | 
 | 440 | +            return False  | 
 | 441 | +        return True  | 
379 | 442 | 
 
  | 
380 |  | -    def adjust_stonith_timeout(self):  | 
 | 443 | +    def _adjust_sbd_watchdog_timeout(self):  | 
381 | 444 |         '''  | 
382 |  | -        Adjust stonith-timeout property  | 
 | 445 | +        Adjust SBD_WATCHDOG_TIMEOUT for disk-less sbd  | 
383 | 446 |         '''  | 
384 |  | -        utils.set_property("stonith-timeout", self.get_stonith_timeout_expected(), conditional=True)  | 
 | 447 | +        advised_watchdog_timeout = self.qdevice_sync_timeout + SBDTimeout.QDEVICE_SYNC_TIMEOUT_MARGIN  | 
 | 448 | +        SBDManager.update_sbd_configuration({"SBD_WATCHDOG_TIMEOUT": str(advised_watchdog_timeout)})  | 
 | 449 | +        logger.info("Adjusting SBD_WATCHDOG_TIMEOUT to %d", advised_watchdog_timeout)  | 
385 | 450 | 
 
  | 
386 |  | -    def adjust_sbd_delay_start(self):  | 
 | 451 | +    def _check_sbd_delay_start(self, warn=True) -> bool:  | 
387 | 452 |         '''  | 
388 |  | -        Adjust SBD_DELAY_START in /etc/sysconfig/sbd  | 
 | 453 | +        Check SBD_DELAY_START  | 
389 | 454 |         '''  | 
390 | 455 |         expected_value = str(self.sbd_delay_start_value_expected)  | 
391 | 456 |         config_value = self.sbd_delay_start_value_from_config  | 
392 | 457 |         if expected_value == config_value:  | 
393 |  | -            return  | 
 | 458 | +            return True  | 
394 | 459 |         if expected_value == "no" \  | 
395 | 460 |                 or (not re.search(r'\d+', config_value)) \  | 
396 | 461 |                 or (int(expected_value) > int(config_value)):  | 
397 |  | -            SBDManager.update_sbd_configuration({"SBD_DELAY_START": expected_value})  | 
 | 462 | +            if warn:  | 
 | 463 | +                logger.warning("It's recommended that SBD_DELAY_START is set to %s, current value is %s",  | 
 | 464 | +                               expected_value, config_value)  | 
 | 465 | +            return False  | 
398 | 466 | 
 
  | 
399 |  | -    @classmethod  | 
400 |  | -    def adjust_sbd_timeout_related_cluster_configuration(cls):  | 
 | 467 | +    def _adjust_sbd_delay_start(self):  | 
401 | 468 |         '''  | 
402 |  | -        Adjust sbd timeout related configurations  | 
 | 469 | +        Adjust SBD_DELAY_START  | 
403 | 470 |         '''  | 
404 |  | -        cls_inst = cls()  | 
405 |  | -        cls_inst._load_configurations()  | 
406 |  | -        cls_inst.adjust_sbd_delay_start()  | 
407 |  | -        cls_inst.adjust_stonith_timeout()  | 
408 |  | -        cls_inst.adjust_systemd_start_timeout()  | 
 | 471 | +        SBDManager.update_sbd_configuration({"SBD_DELAY_START": str(self.sbd_delay_start_value_expected)})  | 
 | 472 | + | 
 | 473 | +    def _check_sbd_systemd_start_timeout(self, warn=True) -> bool:  | 
 | 474 | +        '''  | 
 | 475 | +        Check systemd start timeout for sbd.service  | 
 | 476 | +        '''  | 
 | 477 | +        if not self.sbd_delay_start_value_from_config or self.sbd_delay_start_value_from_config == "no":  | 
 | 478 | +            return True  | 
 | 479 | +        systemd_start_timeout = SBDTimeout.get_sbd_systemd_start_timeout()  | 
 | 480 | +        if systemd_start_timeout > int(self.sbd_delay_start_value_from_config):  | 
 | 481 | +            return True  | 
 | 482 | +        if warn:  | 
 | 483 | +            logger.warning("It's recommended that systemd start timeout for sbd.service is set to %d, current value is %d",  | 
 | 484 | +                           int(1.2*int(self.sbd_delay_start_value_from_config)), systemd_start_timeout)  | 
 | 485 | +        return False  | 
 | 486 | + | 
 | 487 | +    def _adjust_sbd_systemd_start_timeout(self):  | 
 | 488 | +        '''  | 
 | 489 | +        Adjust systemd start timeout for sbd.service  | 
 | 490 | +        '''  | 
 | 491 | +        systemd_start_timeout = int(1.2*int(self.sbd_delay_start_value_from_config))  | 
 | 492 | +        utils.mkdirp(SBDManager.SBD_SYSTEMD_DELAY_START_DIR)  | 
 | 493 | +        sbd_delay_start_file = f"{SBDManager.SBD_SYSTEMD_DELAY_START_DIR}/sbd_delay_start.conf"  | 
 | 494 | +        utils.str2file(f"[Service]\nTimeoutSec={systemd_start_timeout}", sbd_delay_start_file)  | 
 | 495 | +        bootstrap.sync_path(SBDManager.SBD_SYSTEMD_DELAY_START_DIR)  | 
 | 496 | +        utils.cluster_run_cmd("systemctl daemon-reload")  | 
 | 497 | +        logger.info("Adjusted systemd start timeout for sbd.service to %d", systemd_start_timeout)  | 
 | 498 | + | 
 | 499 | +    def _check_stonith_watchdog_timeout(self, warn=True) -> bool:  | 
 | 500 | +        '''  | 
 | 501 | +        Check stonith-watchdog-timeout property  | 
 | 502 | +        '''  | 
 | 503 | +        value = utils.get_property("stonith-watchdog-timeout", get_default=False)  | 
 | 504 | +        if self.disk_based:  | 
 | 505 | +            if value and warn:  | 
 | 506 | +                logger.warning("It's recommended to remove stonith-watchdog-timeout property when using disk-based SBD")  | 
 | 507 | +                return False  | 
 | 508 | +        elif int(value) < 2*self.sbd_watchdog_timeout:  | 
 | 509 | +            if warn:  | 
 | 510 | +                logger.warning("It's recommended that stonith-watchdog-timeout(now %s) >= 2*SBD_WATCHDOG_TIMEOUT(now %d)",  | 
 | 511 | +                               value, self.sbd_watchdog_timeout)  | 
 | 512 | +            return False  | 
 | 513 | +        return True  | 
 | 514 | + | 
 | 515 | +    def _adjust_stonith_watchdog_timeout(self):  | 
 | 516 | +        '''  | 
 | 517 | +        Adjust stonith-watchdog-timeout property  | 
 | 518 | +        '''  | 
 | 519 | +        if self.disk_based:  | 
 | 520 | +            utils.delete_property("stonith-watchdog-timeout")  | 
 | 521 | +            logger.info("Removed stonith-watchdog-timeout property")  | 
 | 522 | +        else:  | 
 | 523 | +            adviced_value = SBDTimeout.get_stonith_watchdog_timeout_expected()  | 
 | 524 | +            utils.set_property("stonith-watchdog-timeout", adviced_value)  | 
 | 525 | +            logger.info("Adjusted stonith-watchdog-timeout to %d", adviced_value)  | 
 | 526 | + | 
 | 527 | +    def _check_stonith_timeout(self, warn=True) -> bool:  | 
 | 528 | +        '''  | 
 | 529 | +        Check stonith-timeout property  | 
 | 530 | +        '''  | 
 | 531 | +        value = utils.get_property("stonith-timeout", get_default=False)  | 
 | 532 | +        expected_value = self.get_stonith_timeout_expected()  | 
 | 533 | +        if not value or int(value) < expected_value:  | 
 | 534 | +            if warn:  | 
 | 535 | +                logger.warning("It's recommended that stonith-timeout is set to %d, current value is %s",  | 
 | 536 | +                               expected_value, value)  | 
 | 537 | +            return False  | 
 | 538 | +        return True  | 
 | 539 | + | 
 | 540 | +    def _adjust_stonith_timeout(self):  | 
 | 541 | +        '''  | 
 | 542 | +        Adjust stonith-timeout property  | 
 | 543 | +        '''  | 
 | 544 | +        expected_value = self.get_stonith_timeout_expected()  | 
 | 545 | +        utils.set_property("stonith-timeout", expected_value)  | 
 | 546 | +        logger.info("Adjusted stonith-timeout to %d", expected_value)  | 
409 | 547 | 
 
  | 
410 | 548 | 
 
  | 
411 | 549 | class SBDManager:  | 
@@ -497,7 +635,6 @@ def update_configuration(self) -> None:  | 
497 | 635 |         utils.sysconfig_set(self.SYSCONFIG_SBD, **self.update_dict)  | 
498 | 636 |         if self.cluster_is_running:  | 
499 | 637 |             bootstrap.sync_path(self.SYSCONFIG_SBD)  | 
500 |  | -            logger.info("Already synced %s to all nodes", self.SYSCONFIG_SBD)  | 
501 | 638 | 
 
  | 
502 | 639 |     @classmethod  | 
503 | 640 |     def update_sbd_configuration(cls, update_dict: typing.Dict[str, str]) -> None:  | 
 | 
0 commit comments