-
Couldn't load subscription status.
- Fork 749
Module graceful shutdown support #4031
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from 29 commits
72c209b
d6b341b
b796e12
4927a4b
2d224b4
65b5cc8
060d016
ffbb12a
ec7ee57
1372b53
c8729ec
19e6bec
ff61ba1
af7f3bc
3f2bf0b
7cdc4bb
7608e49
7164fae
40a08dd
cb72aa6
3af519d
ecc59af
2543f2a
20d831f
6cb4402
ea8e199
85ec74e
a590fb5
b7ca97a
630b03b
5473ae2
1418c9f
92ddd66
beff3ab
efabd48
f325579
e3d1eae
1ae6723
1976511
7b96ef1
e7afc0b
f15b887
ebce12d
3d56466
54a6d63
f25aa22
01236f0
f9c5fa4
782d4bf
037a76e
4b1de85
60d156c
5101978
915d485
c2121bd
41bfc57
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -8,9 +8,30 @@ | |
| from utilities_common.chassis import is_smartswitch, get_all_dpus | ||
| from datetime import datetime, timedelta | ||
|
|
||
| # New imports to use centralized APIs | ||
| try: | ||
| # Prefer swsscommon SonicV2Connector | ||
| from swsscommon.swsscommon import SonicV2Connector | ||
| except ImportError: | ||
| # Fallback (if ever needed) | ||
| from swsssdk import SonicV2Connector | ||
|
|
||
| from sonic_platform_base.module_base import ModuleBase | ||
|
|
||
| TIMEOUT_SECS = 10 | ||
| # CLI uses a single conservative ceiling for timeouts when breaking a stuck transition. | ||
| # (Platform-specific per-op timeouts are applied by platform code during the transition itself.) | ||
| TRANSITION_TIMEOUT = timedelta(seconds=240) # 4 minutes | ||
|
|
||
| _MB_SINGLETON = None | ||
| _STATE_DB_CONN = None | ||
|
|
||
| def _module_base(): | ||
| """Return a cached ModuleBase instance.""" | ||
| global _MB_SINGLETON | ||
| if _MB_SINGLETON is None: | ||
| _MB_SINGLETON = ModuleBase() | ||
| return _MB_SINGLETON | ||
|
|
||
| class StateDBHelper: | ||
| def __init__(self, sonic_db): | ||
|
|
@@ -41,6 +62,118 @@ def modules(): | |
| pass | ||
|
|
||
|
|
||
| # Centralized-transition helpers (use ModuleBase) | ||
| def _state_db_conn(): | ||
| """Return a cached SonicV2Connector for STATE_DB (lazy init).""" | ||
| global _STATE_DB_CONN | ||
| if _STATE_DB_CONN is None: | ||
| conn = SonicV2Connector() | ||
| try: | ||
| conn.connect(conn.STATE_DB) | ||
| except Exception: | ||
| # Some bindings autoconnect; keep tolerant behavior | ||
| pass | ||
| _STATE_DB_CONN = conn | ||
| return _STATE_DB_CONN | ||
|
|
||
|
|
||
| def _transition_entry(module_name: str) -> dict: | ||
| """Read the transition entry for a module via ModuleBase centralized API.""" | ||
| mb = _module_base() | ||
| conn = _state_db_conn() | ||
| return mb.get_module_state_transition(conn, module_name) or {} | ||
|
|
||
|
|
||
| def _transition_in_progress(module_name: str) -> bool: | ||
vvolam marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| """Return True if STATE_DB marks the module’s transition as in progress. | ||
|
|
||
| Uses `_transition_entry(module_name)` and checks whether | ||
| `state_transition_in_progress` is exactly the string "True" (strict check). | ||
| """ | ||
| entry = _transition_entry(module_name) | ||
| return entry.get("state_transition_in_progress", "False") == "True" | ||
|
|
||
|
|
||
| def _mark_transition_start(module_name: str, transition_type: str): | ||
| """Set transition via centralized API.""" | ||
| mb = _module_base() | ||
| conn = _state_db_conn() | ||
| mb.set_module_state_transition(conn, module_name, transition_type) | ||
|
|
||
|
|
||
| def _mark_transition_clear(module_name: str): | ||
| """Clear transition via centralized API.""" | ||
| mb = _module_base() | ||
| conn = _state_db_conn() | ||
| mb.clear_module_state_transition(conn, module_name) | ||
|
|
||
|
|
||
| def _transition_timed_out(module_name: str) -> bool: | ||
| """CLI-side safety ceiling (4 minutes) to break a stuck transition.""" | ||
| mb = _module_base() | ||
| conn = _state_db_conn() | ||
| return mb.is_module_state_transition_timed_out(conn, module_name, int(TRANSITION_TIMEOUT.total_seconds())) | ||
|
|
||
|
|
||
| # shared helper | ||
| def _block_if_conflicting_transition(chassis_module_name: str, conflict_type: str, target_oper_status: str) -> bool: | ||
| """ | ||
| Gate a CLI action if a conflicting module transition is still in progress. | ||
|
|
||
| This helper reads the centralized transition state (via `_transition_entry()`) | ||
| and the current `oper_status` from `CHASSIS_MODULE_TABLE`. It **blocks** | ||
| (returns True) when: | ||
| 1) the module currently has `state_transition_in_progress == True`, and | ||
| 2) the last recorded `transition_type` matches the requested `conflict_type` | ||
| (e.g., "startup", "shutdown", or "reboot"), and | ||
| 3) the module has **not** yet reached the `target_oper_status` expected to | ||
| resolve that transition (e.g., "Online" after startup, "Offline" after shutdown). | ||
|
|
||
| If the above conditions are not all true, the function allows the caller to proceed | ||
| (returns False). | ||
|
|
||
| Args: | ||
| chassis_module_name: Module name (e.g., "DPU0"). | ||
| conflict_type: The transition type that would conflict with the caller's action. | ||
| Typical values: "startup", "shutdown", "reboot". | ||
| target_oper_status: The oper status that indicates the conflicting transition has | ||
| effectively completed from the caller’s perspective (e.g., "Online" for startup, | ||
| "Offline" for shutdown). | ||
|
|
||
| Returns: | ||
| bool: True if the function **blocked** the action (a conflicting transition is underway | ||
| and the module hasn't reached `target_oper_status` yet); False if the caller may proceed. | ||
|
|
||
| Side Effects: | ||
| - Prints a user-facing message via `click.echo(...)` when blocking. | ||
| - No database writes are performed. | ||
|
|
||
| Notes: | ||
| - Truthiness of `state_transition_in_progress` is normalized with a case-insensitive | ||
| string check, so both boolean True and string "True" are handled. | ||
| - Missing or empty transition rows result in no block (returns False). | ||
| - There is an inherent race window between this check and the subsequent action; | ||
| callers should treat this as a best-effort gate and keep operations idempotent. | ||
|
|
||
| Example: | ||
| # Block shutdown if a startup is still running and the module is not yet Online | ||
| if _block_if_conflicting_transition("DPU0", "startup", "Online"): | ||
| return # tell CLI to try again later | ||
| """ | ||
| entry = _transition_entry(chassis_module_name) or {} | ||
| in_prog = str(entry.get("state_transition_in_progress", "False")).lower() == "true" | ||
|
||
| last_type = entry.get("transition_type") | ||
|
|
||
| # Current oper_status (keep this simple read from STATE_DB) | ||
| conn = _state_db_conn() | ||
| row = conn.get_all(conn.STATE_DB, f"CHASSIS_MODULE_TABLE|{chassis_module_name}") or {} | ||
| oper = row.get("oper_status") | ||
|
|
||
| if in_prog and last_type == conflict_type and oper != target_oper_status: | ||
| click.echo(f"Module {chassis_module_name} has a {conflict_type} transition underway; try again later.") | ||
| return True | ||
| return False | ||
|
|
||
| def ensure_statedb_connected(db): | ||
| if not hasattr(db, 'statedb'): | ||
| chassisdb = db.db | ||
|
|
@@ -58,42 +191,6 @@ def get_config_module_state(db, chassis_module_name): | |
| else: | ||
| return fvs['admin_status'] | ||
|
|
||
|
|
||
| def get_state_transition_in_progress(db, chassis_module_name): | ||
| ensure_statedb_connected(db) | ||
| fvs = db.statedb.get_entry('CHASSIS_MODULE_TABLE', chassis_module_name) | ||
| value = fvs.get('state_transition_in_progress', 'False') if fvs else 'False' | ||
| return value | ||
|
|
||
|
|
||
| def set_state_transition_in_progress(db, chassis_module_name, value): | ||
| ensure_statedb_connected(db) | ||
| state_db = db.statedb | ||
| entry = state_db.get_entry('CHASSIS_MODULE_TABLE', chassis_module_name) or {} | ||
| entry['state_transition_in_progress'] = value | ||
| if value == 'True': | ||
| entry['transition_start_time'] = datetime.utcnow().isoformat() | ||
| else: | ||
| entry.pop('transition_start_time', None) | ||
| state_db.delete_field('CHASSIS_MODULE_TABLE', chassis_module_name, 'transition_start_time') | ||
| state_db.set_entry('CHASSIS_MODULE_TABLE', chassis_module_name, entry) | ||
|
|
||
|
|
||
| def is_transition_timed_out(db, chassis_module_name): | ||
| ensure_statedb_connected(db) | ||
| state_db = db.statedb | ||
| fvs = state_db.get_entry('CHASSIS_MODULE_TABLE', chassis_module_name) | ||
| if not fvs: | ||
| return False | ||
| start_time_str = fvs.get('transition_start_time') | ||
| if not start_time_str: | ||
| return False | ||
| try: | ||
| start_time = datetime.fromisoformat(start_time_str) | ||
| except ValueError: | ||
| return False | ||
| return datetime.utcnow() - start_time > TRANSITION_TIMEOUT | ||
|
|
||
| # | ||
| # Name: check_config_module_state_with_timeout | ||
| # return: True: timeout, False: not timeout | ||
|
|
@@ -182,15 +279,20 @@ def shutdown_chassis_module(db, chassis_module_name): | |
| return | ||
|
|
||
| if is_smartswitch(): | ||
| if get_state_transition_in_progress(db, chassis_module_name) == 'True': | ||
| if is_transition_timed_out(db, chassis_module_name): | ||
| set_state_transition_in_progress(db, chassis_module_name, 'False') | ||
| if _transition_in_progress(chassis_module_name): | ||
| if _transition_timed_out(chassis_module_name): | ||
| _mark_transition_clear(chassis_module_name) | ||
| click.echo(f"Previous transition for module {chassis_module_name} timed out. Proceeding with shutdown.") | ||
| else: | ||
| click.echo(f"Module {chassis_module_name} state transition is already in progress") | ||
| return | ||
| else: | ||
| set_state_transition_in_progress(db, chassis_module_name, 'True') | ||
| # Use centralized API & shared helper (minimal change) | ||
| if _block_if_conflicting_transition(chassis_module_name, | ||
| conflict_type="startup", | ||
| target_oper_status="Online"): | ||
| return | ||
| _mark_transition_start(chassis_module_name, "shutdown") | ||
|
|
||
| click.echo(f"Shutting down chassis module {chassis_module_name}") | ||
| fvs = { | ||
|
|
@@ -229,15 +331,20 @@ def startup_chassis_module(db, chassis_module_name): | |
| return | ||
|
|
||
| if is_smartswitch(): | ||
| if get_state_transition_in_progress(db, chassis_module_name) == 'True': | ||
| if is_transition_timed_out(db, chassis_module_name): | ||
| set_state_transition_in_progress(db, chassis_module_name, 'False') | ||
| if _transition_in_progress(chassis_module_name): | ||
| if _transition_timed_out(chassis_module_name): | ||
| _mark_transition_clear(chassis_module_name) | ||
| click.echo(f"Previous transition for module {chassis_module_name} timed out. Proceeding with startup.") | ||
| else: | ||
| click.echo(f"Module {chassis_module_name} state transition is already in progress") | ||
| return | ||
| else: | ||
| set_state_transition_in_progress(db, chassis_module_name, 'True') | ||
| # Use centralized API & shared helper (minimal change) | ||
| if _block_if_conflicting_transition(chassis_module_name, | ||
| conflict_type="shutdown", | ||
| target_oper_status="Offline"): | ||
| return | ||
| _mark_transition_start(chassis_module_name, "startup") | ||
|
|
||
| click.echo(f"Starting up chassis module {chassis_module_name}") | ||
| fvs = { | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
why is this fallback import needed?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@gpunathilell SONiC has two Python bindings for RedisDB: the newer swsscommon and the older swsssdk. Different images/containers (or older platform builds) may ship only one of them.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
older platform builds will use older branches, and not master. Do we need to support swssdk usage? This would mask any import issues and using swssdk connectors (which has not been in development for over two years) may cause issues
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@gpunathilell I'm ok removing the fallback option. @vvolam Hope you are ok as well?