
Commit bd3de9d

Merge pull request #3831 from bhouse-nexthop/bhouse-nexthop/gcu-perf
Generic Configuration Updater (GCU) performance enhancements

#### What I did

Generic Configuration Updater is extremely slow. Using the Python profiler it was possible to identify the worst offenders, where changes could be made without affecting the overall algorithm or the HLD design documentation.

Brief overview of changes:

* Avoid `copy.deepcopy()` calls where possible.
* Don't run validation twice back to back.
* Move ConfigDB path <-> xpath conversion logic to sonic-yang-mgmt where it belongs, enhance it to support schema conversion (not just data), and add caching.
* Sort table keys by the number of schema backlinks and must statements for the node, to better guess the right order of the patches to generate, rather than using alphabetical order, which is likely to cause validation failures.
* Add the ability to group patches together into a single commit in cases where it is known they will not cause issues, e.g. grouping parameter updates under the same key (a sketch of this idea follows the commit message).
* When applying changes, do not re-read the configuration from Redis twice between each applied patch (this is **extremely** slow, and actually hid a race condition). We are mutating the configuration while a lock is held, so we know the expected before and after state. There is still a final validation to ensure nothing went sideways.

Dependencies:

* sonic-yang-mgmt enhancements: sonic-net/sonic-buildimage#22254
* sonic-yang-mgmt parse uses/grouping: sonic-net/sonic-buildimage#21907
* sonic-utilities rely on sonic-yang-mgmt uses/grouping handling: #3814

Stats below (the stats need both this and the sonic-utilities PR to be relevant).

<ins>**Original Performance:**</ins>

Dry Run:

```
time sudo config replace -d ./config_db.json
...
real    2m51.588s
user    2m23.777s
sys     0m25.300s
```

Full:

```
time sudo config replace ./config_db.json
...
real    14m53.772s
user    12m2.376s
sys     2m8.908s
```

<ins>**With Patch**:</ins>

Dry Run:

```
time sudo config replace -d ./config_db.json
...
real    0m59.602s
user    0m56.434s
sys     0m2.110s
```

Full:

```
time sudo config replace ./config_db.json
...
real    1m54.303s
user    0m58.482s
sys     0m2.545s
```

So that's roughly a 3x improvement for dry run and a 7.5x improvement for a full commit. There is room for improvement on the full commit due to a `sleep(1)` being used between each patch because of a race condition found in the prior code (which was hidden by a costly sanity check that has been removed).
2 parents 3994fcd + 21c4da0 commit bd3de9d
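As a rough illustration of the patch-grouping bullet above: operations that touch fields under the same ConfigDB table entry can be batched into one change instead of being validated and applied separately. The helper names and patch operations below are invented for this sketch and are not the actual GCU code.

```python
from itertools import groupby


def table_and_key(op):
    """Return the first two JSON-pointer segments ("/TABLE/key/...") of a patch op."""
    segments = op["path"].lstrip("/").split("/")
    return tuple(segments[:2])


def group_ops_by_entry(ops):
    """Yield lists of patch operations that all target the same ConfigDB table entry."""
    for _, grouped in groupby(sorted(ops, key=table_and_key), key=table_and_key):
        yield list(grouped)


ops = [
    {"op": "replace", "path": "/PORT/Ethernet0/mtu", "value": "9100"},
    {"op": "replace", "path": "/PORT/Ethernet0/fec", "value": "rs"},
    {"op": "replace", "path": "/PORT/Ethernet4/mtu", "value": "9100"},
]
print([len(group) for group in group_ops_by_entry(ops)])  # -> [2, 1]
```

Each group would then be emitted as one change, so several field updates to the same key no longer trigger separate validation rounds.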

12 files changed, +1094 -1255 lines


generic_config_updater/change_applier.py

Lines changed: 36 additions & 15 deletions
```diff
@@ -5,11 +5,12 @@
 import importlib
 import os
 import tempfile
+import time
 from collections import defaultdict
 from swsscommon.swsscommon import ConfigDBConnector
 from sonic_py_common import multi_asic
 from .gu_common import GenericConfigUpdaterError, genericUpdaterLogging
-from .gu_common import get_config_db_as_json
+from .gu_common import JsonChange

 SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__))
 UPDATER_CONF_FILE = f"{SCRIPT_DIR}/gcu_services_validator.conf.json"
@@ -64,8 +65,8 @@ class DryRunChangeApplier:
     def __init__(self, config_wrapper):
         self.config_wrapper = config_wrapper

-    def apply(self, change):
-        self.config_wrapper.apply_change_to_config_db(change)
+    def apply(self, current_configdb: dict, change: JsonChange) -> dict:
+        return self.config_wrapper.apply_change_to_config_db(current_configdb, change)

     def remove_backend_tables_from_config(self, data):
         return data
@@ -137,25 +138,45 @@ def _report_mismatch(self, run_data, upd_data):
         log_error("run_data vs expected_data: {}".format(
             str(jsondiff.diff(run_data, upd_data))[0:40]))

-    def apply(self, change):
-        run_data = get_config_db_as_json(self.scope)
-        upd_data = prune_empty_table(change.apply(copy.deepcopy(run_data)))
+    def apply(self, current_configdb: dict, change: JsonChange) -> dict:
+        run_data = current_configdb
+        upd_data = prune_empty_table(change.apply(run_data, in_place=False))
         upd_keys = defaultdict(dict)

         for tbl in sorted(set(run_data.keys()).union(set(upd_data.keys()))):
             self._upd_data(tbl, run_data.get(tbl, {}), upd_data.get(tbl, {}), upd_keys)

         ret = self._services_validate(run_data, upd_data, upd_keys)
-        if not ret:
-            run_data = get_config_db_as_json(self.scope)
-            self.remove_backend_tables_from_config(upd_data)
-            self.remove_backend_tables_from_config(run_data)
-            if upd_data != run_data:
-                self._report_mismatch(run_data, upd_data)
-                ret = -1
-        if ret:
+        # The above function returns 0 on success as it uses shell return codes
+        if ret != 0:
             log_error("Failed to apply Json change")
-        return ret
+
+        # There was a sanity check in this position originally that appeared
+        # to be development-time code to ensure things were operating correctly.
+        # It would retrieve the configdb from Redis and perform transformation
+        # and comparison. It's not possible for the configuration to not be what
+        # we expect since we have a known state we are mutating with a lock.
+        # That said, we are leaving in the final configuration comparison in
+        # PatchApplier "just in case".
+        #
+        # However, this code did hide a pretty nasty race condition since there
+        # is no feedback loop for when config_db changes are actually consumed.
+        # This check would consume high CPU and would take a good amount of
+        # time (0.5s - 1s).
+        #
+        # The below sleep is functionally equivalent in terms of preventing the
+        # race condition (without the high CPU that might cause other control
+        # plane issues), but is of course not the proper fix.
+        #
+        # An upstream SONiC issue will be opened for the race condition, and
+        # until resolved leaving this comment in place for future reference.
+        time.sleep(1)
+
+        # Interestingly this function returns the updated data and doesn't
+        # propagate an error. Maybe it should? Or are exceptions thrown
+        # from _upd_data on failure? We seem to intentionally only log on
+        # _services_validate()
+        return upd_data

     def remove_backend_tables_from_config(self, data):
         for key in self.backend_tables:
```
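A note on the `change.apply(run_data, in_place=False)` call introduced above: assuming `JsonChange` forwards to the `jsonpatch` package (which is how the `in_place` keyword reads), an out-of-place apply returns a new document and leaves the input untouched, which is what allows dropping the explicit `copy.deepcopy()`. A minimal sketch of that contract using `jsonpatch` directly:

```python
import jsonpatch

run_data = {"PORT": {"Ethernet0": {"mtu": "9100"}}}
change = jsonpatch.JsonPatch([
    {"op": "replace", "path": "/PORT/Ethernet0/mtu", "value": "1500"},
])

# Out-of-place application returns the updated document without mutating run_data,
# so the caller no longer needs its own deepcopy before applying the change.
upd_data = change.apply(run_data, in_place=False)
assert run_data["PORT"]["Ethernet0"]["mtu"] == "9100"
assert upd_data["PORT"]["Ethernet0"]["mtu"] == "1500"
```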

generic_config_updater/generic_updater.py

Lines changed: 2 additions & 1 deletion
```diff
@@ -142,9 +142,10 @@ def apply(self, patch, sort=True):
         # Apply changes in order
         self.logger.log_notice(f"{scope}: applying {changes_len} change{'s' if changes_len != 1 else ''} " \
                                f"in order{':' if changes_len > 0 else '.'}")
+        current_config = old_config
         for change in changes:
             self.logger.log_notice(f"  * {change}")
-            self.changeapplier.apply(change)
+            current_config = self.changeapplier.apply(current_config, change)

         # Validate config updated successfully
         self.logger.log_notice(f"{scope}: verifying patch updates are reflected on ConfigDB.")
```
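The loop change above is the heart of the "don't re-read Redis between patches" fix: each applier's `apply()` now receives the in-memory configuration and returns the updated document, which becomes the input for the next change. A stripped-down sketch of that calling pattern with a stand-in applier (class and function names here are invented for illustration):

```python
class PassthroughApplier:
    """Apply each change purely in memory and hand the result back to the caller."""

    def apply(self, current_configdb: dict, change) -> dict:
        # A real applier would also validate affected services and push the delta
        # to ConfigDB; here the change object only needs an out-of-place apply().
        return change.apply(current_configdb, in_place=False)


def apply_changes(old_config: dict, changes, applier) -> dict:
    """Thread the configuration through the applier instead of re-reading Redis."""
    current_config = old_config
    for change in changes:
        current_config = applier.apply(current_config, change)
    return current_config  # a final validation against ConfigDB can still run afterwards
```

This mirrors the new `DryRunChangeApplier`/`ChangeApplier` signatures in change_applier.py, where `apply()` now returns `upd_data`.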
