Skip to content
Open
Show file tree
Hide file tree
Changes from 29 commits
Commits
Show all changes
56 commits
Select commit Hold shift + click to select a range
72c209b
refactored based on the revised HLD
rameshraghupathy Aug 13, 2025
d6b341b
refactored based on the revised HLD
rameshraghupathy Aug 13, 2025
b796e12
refactored based on the revised HLD
rameshraghupathy Aug 13, 2025
4927a4b
resolving SA issues
rameshraghupathy Aug 18, 2025
2d224b4
resolving SA issues
rameshraghupathy Aug 18, 2025
65b5cc8
using CHASSIS_MODULE_TABLE
rameshraghupathy Aug 20, 2025
060d016
using CHASSIS_MODULE_TABLE
rameshraghupathy Aug 20, 2025
ffbb12a
Refactored for graceful shutdown
rameshraghupathy Aug 24, 2025
ec7ee57
fixed sa warnings
rameshraghupathy Aug 25, 2025
1372b53
Refactored for graceful shutdown
rameshraghupathy Aug 25, 2025
c8729ec
fixed sa warnings
rameshraghupathy Aug 25, 2025
19e6bec
Refactored for graceful shutdown
rameshraghupathy Aug 25, 2025
ff61ba1
Refactored for graceful shutdown, fixing UT
rameshraghupathy Aug 26, 2025
af7f3bc
Fixing SA issue
rameshraghupathy Aug 26, 2025
3f2bf0b
Fixing ut
rameshraghupathy Aug 26, 2025
7cdc4bb
Fixing SA warnings
rameshraghupathy Aug 26, 2025
7608e49
Fixing SA warnings
rameshraghupathy Aug 27, 2025
7164fae
Fixing ut
rameshraghupathy Aug 27, 2025
40a08dd
workign on coverage
rameshraghupathy Aug 28, 2025
cb72aa6
workign on coverage
rameshraghupathy Aug 28, 2025
3af519d
Refactored for graceful shutdown, fixing UT - Final round of tweaks
rameshraghupathy Sep 8, 2025
ecc59af
Refactored for graceful shutdown, fixing UT - Final round of tweaks
rameshraghupathy Sep 9, 2025
2543f2a
Refactored for graceful shutdown, fixing UT - Final round of tweaks
rameshraghupathy Sep 9, 2025
20d831f
Addressed copilot PR comments
rameshraghupathy Sep 15, 2025
6cb4402
Addressed SA error
rameshraghupathy Sep 15, 2025
ea8e199
Addressed SA error
rameshraghupathy Sep 15, 2025
85ec74e
Addressed SA error
rameshraghupathy Sep 15, 2025
a590fb5
Addressed PR comments
rameshraghupathy Sep 18, 2025
b7ca97a
Addressed PR comments
rameshraghupathy Sep 18, 2025
630b03b
Fixed SA errors
rameshraghupathy Sep 18, 2025
5473ae2
Fixed SA errors
rameshraghupathy Sep 18, 2025
1418c9f
Fixed a ut failure
rameshraghupathy Sep 18, 2025
92ddd66
working on coverage
rameshraghupathy Sep 18, 2025
beff3ab
working on coverage
rameshraghupathy Sep 18, 2025
efabd48
working on coverage
rameshraghupathy Sep 19, 2025
f325579
working on coverage
rameshraghupathy Sep 19, 2025
e3d1eae
working on coverage
rameshraghupathy Sep 19, 2025
1ae6723
working on coverage
rameshraghupathy Sep 19, 2025
1976511
working on coverage
rameshraghupathy Sep 19, 2025
7b96ef1
working on coverage
rameshraghupathy Sep 19, 2025
e7afc0b
working on coverage
rameshraghupathy Sep 19, 2025
f15b887
Addressed PR comments
rameshraghupathy Sep 26, 2025
ebce12d
Addressed PR comments
rameshraghupathy Sep 26, 2025
3d56466
Addressed review comments related to refactoring
rameshraghupathy Oct 1, 2025
54a6d63
Fixing test failures
rameshraghupathy Oct 1, 2025
f25aa22
Fixing test failures
rameshraghupathy Oct 1, 2025
01236f0
Improving coverage
rameshraghupathy Oct 2, 2025
f9c5fa4
Fixing test failures
rameshraghupathy Oct 2, 2025
782d4bf
Fixing test failures
rameshraghupathy Oct 2, 2025
037a76e
Fixing test failures
rameshraghupathy Oct 2, 2025
4b1de85
Fixing test failures
rameshraghupathy Oct 2, 2025
60d156c
Aliging with the module_base.py changes such as common API, timezone,…
rameshraghupathy Oct 21, 2025
5101978
Merge branch 'master' into graceful-shutdown
rameshraghupathy Oct 21, 2025
915d485
Fixing test failures
rameshraghupathy Oct 21, 2025
c2121bd
Fixing test failures
rameshraghupathy Oct 21, 2025
41bfc57
Fixing test failures
rameshraghupathy Oct 21, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
195 changes: 151 additions & 44 deletions config/chassis_modules.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,30 @@
from utilities_common.chassis import is_smartswitch, get_all_dpus
from datetime import datetime, timedelta

# New imports to use centralized APIs
try:
# Prefer swsscommon SonicV2Connector
from swsscommon.swsscommon import SonicV2Connector
except ImportError:
# Fallback (if ever needed)
from swsssdk import SonicV2Connector
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why is this fallback import needed?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@gpunathilell SONiC has two Python bindings for RedisDB: the newer swsscommon and the older swsssdk. Different images/containers (or older platform builds) may ship only one of them.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

older platform builds will use older branches, and not master. Do we need to support swssdk usage? This would mask any import issues and using swssdk connectors (which has not been in development for over two years) may cause issues

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@gpunathilell I'm ok removing the fallback option. @vvolam Hope you are ok as well?


from sonic_platform_base.module_base import ModuleBase

TIMEOUT_SECS = 10
# CLI uses a single conservative ceiling for timeouts when breaking a stuck transition.
# (Platform-specific per-op timeouts are applied by platform code during the transition itself.)
TRANSITION_TIMEOUT = timedelta(seconds=240) # 4 minutes

_MB_SINGLETON = None
_STATE_DB_CONN = None

def _module_base():
"""Return a cached ModuleBase instance."""
global _MB_SINGLETON
if _MB_SINGLETON is None:
_MB_SINGLETON = ModuleBase()
return _MB_SINGLETON

class StateDBHelper:
def __init__(self, sonic_db):
Expand Down Expand Up @@ -41,6 +62,118 @@ def modules():
pass


# Centralized-transition helpers (use ModuleBase)
def _state_db_conn():
"""Return a cached SonicV2Connector for STATE_DB (lazy init)."""
global _STATE_DB_CONN
if _STATE_DB_CONN is None:
conn = SonicV2Connector()
try:
conn.connect(conn.STATE_DB)
except Exception:
# Some bindings autoconnect; keep tolerant behavior
pass
_STATE_DB_CONN = conn
return _STATE_DB_CONN


def _transition_entry(module_name: str) -> dict:
"""Read the transition entry for a module via ModuleBase centralized API."""
mb = _module_base()
conn = _state_db_conn()
return mb.get_module_state_transition(conn, module_name) or {}


def _transition_in_progress(module_name: str) -> bool:
"""Return True if STATE_DB marks the module’s transition as in progress.

Uses `_transition_entry(module_name)` and checks whether
`state_transition_in_progress` is exactly the string "True" (strict check).
"""
entry = _transition_entry(module_name)
return entry.get("state_transition_in_progress", "False") == "True"


def _mark_transition_start(module_name: str, transition_type: str):
"""Set transition via centralized API."""
mb = _module_base()
conn = _state_db_conn()
mb.set_module_state_transition(conn, module_name, transition_type)


def _mark_transition_clear(module_name: str):
"""Clear transition via centralized API."""
mb = _module_base()
conn = _state_db_conn()
mb.clear_module_state_transition(conn, module_name)


def _transition_timed_out(module_name: str) -> bool:
"""CLI-side safety ceiling (4 minutes) to break a stuck transition."""
mb = _module_base()
conn = _state_db_conn()
return mb.is_module_state_transition_timed_out(conn, module_name, int(TRANSITION_TIMEOUT.total_seconds()))


# shared helper
def _block_if_conflicting_transition(chassis_module_name: str, conflict_type: str, target_oper_status: str) -> bool:
"""
Gate a CLI action if a conflicting module transition is still in progress.

This helper reads the centralized transition state (via `_transition_entry()`)
and the current `oper_status` from `CHASSIS_MODULE_TABLE`. It **blocks**
(returns True) when:
1) the module currently has `state_transition_in_progress == True`, and
2) the last recorded `transition_type` matches the requested `conflict_type`
(e.g., "startup", "shutdown", or "reboot"), and
3) the module has **not** yet reached the `target_oper_status` expected to
resolve that transition (e.g., "Online" after startup, "Offline" after shutdown).

If the above conditions are not all true, the function allows the caller to proceed
(returns False).

Args:
chassis_module_name: Module name (e.g., "DPU0").
conflict_type: The transition type that would conflict with the caller's action.
Typical values: "startup", "shutdown", "reboot".
target_oper_status: The oper status that indicates the conflicting transition has
effectively completed from the caller’s perspective (e.g., "Online" for startup,
"Offline" for shutdown).

Returns:
bool: True if the function **blocked** the action (a conflicting transition is underway
and the module hasn't reached `target_oper_status` yet); False if the caller may proceed.

Side Effects:
- Prints a user-facing message via `click.echo(...)` when blocking.
- No database writes are performed.

Notes:
- Truthiness of `state_transition_in_progress` is normalized with a case-insensitive
string check, so both boolean True and string "True" are handled.
- Missing or empty transition rows result in no block (returns False).
- There is an inherent race window between this check and the subsequent action;
callers should treat this as a best-effort gate and keep operations idempotent.

Example:
# Block shutdown if a startup is still running and the module is not yet Online
if _block_if_conflicting_transition("DPU0", "startup", "Online"):
return # tell CLI to try again later
"""
entry = _transition_entry(chassis_module_name) or {}
in_prog = str(entry.get("state_transition_in_progress", "False")).lower() == "true"
Copy link

Copilot AI Sep 29, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The string conversion and case normalization str(...).lower() == \"true\" is complex for a boolean check. Consider using a helper function or direct comparison with expected string values.

Copilot uses AI. Check for mistakes.
last_type = entry.get("transition_type")

# Current oper_status (keep this simple read from STATE_DB)
conn = _state_db_conn()
row = conn.get_all(conn.STATE_DB, f"CHASSIS_MODULE_TABLE|{chassis_module_name}") or {}
oper = row.get("oper_status")

if in_prog and last_type == conflict_type and oper != target_oper_status:
click.echo(f"Module {chassis_module_name} has a {conflict_type} transition underway; try again later.")
return True
return False

def ensure_statedb_connected(db):
if not hasattr(db, 'statedb'):
chassisdb = db.db
Expand All @@ -58,42 +191,6 @@ def get_config_module_state(db, chassis_module_name):
else:
return fvs['admin_status']


def get_state_transition_in_progress(db, chassis_module_name):
ensure_statedb_connected(db)
fvs = db.statedb.get_entry('CHASSIS_MODULE_TABLE', chassis_module_name)
value = fvs.get('state_transition_in_progress', 'False') if fvs else 'False'
return value


def set_state_transition_in_progress(db, chassis_module_name, value):
ensure_statedb_connected(db)
state_db = db.statedb
entry = state_db.get_entry('CHASSIS_MODULE_TABLE', chassis_module_name) or {}
entry['state_transition_in_progress'] = value
if value == 'True':
entry['transition_start_time'] = datetime.utcnow().isoformat()
else:
entry.pop('transition_start_time', None)
state_db.delete_field('CHASSIS_MODULE_TABLE', chassis_module_name, 'transition_start_time')
state_db.set_entry('CHASSIS_MODULE_TABLE', chassis_module_name, entry)


def is_transition_timed_out(db, chassis_module_name):
ensure_statedb_connected(db)
state_db = db.statedb
fvs = state_db.get_entry('CHASSIS_MODULE_TABLE', chassis_module_name)
if not fvs:
return False
start_time_str = fvs.get('transition_start_time')
if not start_time_str:
return False
try:
start_time = datetime.fromisoformat(start_time_str)
except ValueError:
return False
return datetime.utcnow() - start_time > TRANSITION_TIMEOUT

#
# Name: check_config_module_state_with_timeout
# return: True: timeout, False: not timeout
Expand Down Expand Up @@ -182,15 +279,20 @@ def shutdown_chassis_module(db, chassis_module_name):
return

if is_smartswitch():
if get_state_transition_in_progress(db, chassis_module_name) == 'True':
if is_transition_timed_out(db, chassis_module_name):
set_state_transition_in_progress(db, chassis_module_name, 'False')
if _transition_in_progress(chassis_module_name):
if _transition_timed_out(chassis_module_name):
_mark_transition_clear(chassis_module_name)
click.echo(f"Previous transition for module {chassis_module_name} timed out. Proceeding with shutdown.")
else:
click.echo(f"Module {chassis_module_name} state transition is already in progress")
return
else:
set_state_transition_in_progress(db, chassis_module_name, 'True')
# Use centralized API & shared helper (minimal change)
if _block_if_conflicting_transition(chassis_module_name,
conflict_type="startup",
target_oper_status="Online"):
return
_mark_transition_start(chassis_module_name, "shutdown")

click.echo(f"Shutting down chassis module {chassis_module_name}")
fvs = {
Expand Down Expand Up @@ -229,15 +331,20 @@ def startup_chassis_module(db, chassis_module_name):
return

if is_smartswitch():
if get_state_transition_in_progress(db, chassis_module_name) == 'True':
if is_transition_timed_out(db, chassis_module_name):
set_state_transition_in_progress(db, chassis_module_name, 'False')
if _transition_in_progress(chassis_module_name):
if _transition_timed_out(chassis_module_name):
_mark_transition_clear(chassis_module_name)
click.echo(f"Previous transition for module {chassis_module_name} timed out. Proceeding with startup.")
else:
click.echo(f"Module {chassis_module_name} state transition is already in progress")
return
else:
set_state_transition_in_progress(db, chassis_module_name, 'True')
# Use centralized API & shared helper (minimal change)
if _block_if_conflicting_transition(chassis_module_name,
conflict_type="shutdown",
target_oper_status="Offline"):
return
_mark_transition_start(chassis_module_name, "startup")

click.echo(f"Starting up chassis module {chassis_module_name}")
fvs = {
Expand Down
Loading
Loading