Skip to content

Commit e1b5372

Browse files
committed
feat: Fix Ceph version lookup for newer releases
Ceph made various breaking changes to the status output in the `octopus` release in 2020, including (but not limited to) the removal of the entire `mon_status` command. The DataDog check, as it was, only checked whether the release was exactly 'octopus', and did not account for any later release. Given that `octopus` is now 5 years old, and there doesn't seem to be anything in the responses that gives a semantic version or similar to compare numerically, it's easiest to just _assume_ that we will get stats the New Way ™️, and to also try the old way if the `mon_status` content exists in the `raw` map and the new way failed.
1 parent d444872 commit e1b5372

File tree

1 file changed

+28
-30
lines changed

1 file changed

+28
-30
lines changed

ceph/datadog_checks/ceph/ceph.py

Lines changed: 28 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,6 @@ class Ceph(AgentCheck):
4343

4444
def __init__(self, name, init_config, instances):
4545
super(Ceph, self).__init__(name, init_config, instances)
46-
self._octopus = False
4746

4847
def _collect_raw(self, ceph_cmd, ceph_cluster, instance):
4948
use_sudo = _is_affirmative(instance.get('use_sudo', False))
@@ -58,6 +57,7 @@ def _collect_raw(self, ceph_cmd, ceph_cluster, instance):
5857
ceph_args = '{} --cluster {}'.format(ceph_args, ceph_cluster)
5958

6059
raw = {}
60+
# `mon_status` is only a valid command in versions of Ceph prior to `octopus` (released 2020-03-23)
6161
for cmd in ('mon_status', 'status', 'df detail', 'osd pool stats', 'osd perf', 'health detail', 'osd metadata'):
6262
try:
6363
args = '{} {} -fjson'.format(ceph_args, cmd)
@@ -73,24 +73,22 @@ def _collect_raw(self, ceph_cmd, ceph_cluster, instance):
7373
mon_map = raw.get('status', {}).get('monmap')
7474
if mon_map is None:
7575
raise RuntimeError("Could not detect Ceph release series")
76-
if 'min_mon_release_name' in mon_map and mon_map['min_mon_release_name'] == 'octopus':
77-
self.log.debug("Detected octopus version of ceph...")
78-
self._octopus = True
79-
else:
80-
self._octopus = False
8176

8277
return raw
8378

8479
def _extract_tags(self, raw, instance):
8580
tags = instance.get('tags', [])
8681
fsid = None
87-
if self._octopus:
82+
try:
8883
fsid = raw['status']['fsid']
89-
elif 'mon_status' in raw:
90-
fsid = raw['mon_status']['monmap']['fsid']
84+
except KeyError:
85+
if 'mon_status' in raw:
86+
fsid = raw['mon_status']['monmap']['fsid']
87+
else:
88+
self.log.debug("Could not find fsid")
89+
90+
if 'mon_status' in raw:
9191
tags.append(self.NAMESPACE + '_mon_state:%s' % raw['mon_status']['state'])
92-
else:
93-
self.log.debug("Could not find fsid")
9492

9593
if fsid is not None:
9694
tags.append(self.NAMESPACE + '_fsid:%s' % fsid)
@@ -276,29 +274,29 @@ def _extract_metrics(self, raw, tags):
276274
except KeyError:
277275
self.log.debug('Error retrieving pgstatus metrics')
278276

279-
if self._octopus:
280-
try:
281-
num_mons = int(raw['status']['monmap']['num_mons'])
282-
self.gauge(self.NAMESPACE + '.num_mons', num_mons, tags)
283-
except KeyError:
284-
self.log.debug('Error retrieving num_mons metric')
285-
else:
286-
try:
287-
num_mons = len(raw['mon_status']['monmap']['mons'])
288-
self.gauge(self.NAMESPACE + '.num_mons', num_mons, tags)
289-
except KeyError:
290-
self.log.debug('Error retrieving mon_status metrics')
277+
try:
278+
num_mons = int(raw['status']['monmap']['num_mons'])
279+
self.gauge(self.NAMESPACE + '.num_mons', num_mons, tags)
280+
except KeyError:
281+
if 'mon_status' in raw:
282+
try:
283+
num_mons = len(raw['mon_status']['monmap']['mons'])
284+
self.gauge(self.NAMESPACE + '.num_mons', num_mons, tags)
285+
except KeyError:
286+
self.log.debug('Error retrieving mon_status metrics')
291287

292-
try:
293-
num_mons_active = len(raw['mon_status']['quorum'])
294-
self.gauge(self.NAMESPACE + '.num_mons.active', num_mons_active, tags)
295-
except KeyError:
296-
self.log.debug('Error retrieving mon_status quorum metrics')
288+
try:
289+
num_mons_active = len(raw['mon_status']['quorum'])
290+
self.gauge(self.NAMESPACE + '.num_mons.active', num_mons_active, tags)
291+
except KeyError:
292+
self.log.debug('Error retrieving mon_status quorum metrics')
293+
else:
294+
self.log.debug('Error retrieving num_mons metric')
297295

298296
try:
299297
stats = raw['df_detail']['stats']
300-
if not self._octopus:
301-
self._publish(stats, self.gauge, ['total_objects'], tags)
298+
# This will only work on Ceph versions prior to `octopus`, but will catch+return on later versions
299+
self._publish(stats, self.gauge, ['total_objects'], tags)
302300
used = float(stats['total_used_bytes'])
303301
total = float(stats['total_bytes'])
304302
if total > 0:

0 commit comments

Comments
 (0)