Skip to content

Commit 9fb8e6e

Browse files
committed
feat: Fix Ceph version lookup for newer releases
Ceph made various breaking changes to the status output in the `octopus` release in 2020, including (but not limited to) removing the `mon_status` command entirely. The DataDog check as it stood only checked whether the release was exactly 'octopus', and did not recognise any later release. Given that `octopus` is now 5 years old, and there doesn't seem to be anything in the responses that gives a semantic version or similar to compare numerically, it's easiest to just _assume_ that we will get stats the New Way ™️, and to fall back to the old way if the new way failed and the `mon_status` content exists in the `raw` map.
1 parent d444872 commit 9fb8e6e

File tree

1 file changed

+27
-31
lines changed

1 file changed

+27
-31
lines changed

ceph/datadog_checks/ceph/ceph.py

Lines changed: 27 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,6 @@ class Ceph(AgentCheck):
4343

4444
def __init__(self, name, init_config, instances):
4545
super(Ceph, self).__init__(name, init_config, instances)
46-
self._octopus = False
4746

4847
def _collect_raw(self, ceph_cmd, ceph_cluster, instance):
4948
use_sudo = _is_affirmative(instance.get('use_sudo', False))
@@ -58,6 +57,7 @@ def _collect_raw(self, ceph_cmd, ceph_cluster, instance):
5857
ceph_args = '{} --cluster {}'.format(ceph_args, ceph_cluster)
5958

6059
raw = {}
60+
# `mon_status` is only a valid command in versions of Ceph prior to `octopus` (released 2020-03-23)
6161
for cmd in ('mon_status', 'status', 'df detail', 'osd pool stats', 'osd perf', 'health detail', 'osd metadata'):
6262
try:
6363
args = '{} {} -fjson'.format(ceph_args, cmd)
@@ -73,24 +73,20 @@ def _collect_raw(self, ceph_cmd, ceph_cluster, instance):
7373
mon_map = raw.get('status', {}).get('monmap')
7474
if mon_map is None:
7575
raise RuntimeError("Could not detect Ceph release series")
76-
if 'min_mon_release_name' in mon_map and mon_map['min_mon_release_name'] == 'octopus':
77-
self.log.debug("Detected octopus version of ceph...")
78-
self._octopus = True
79-
else:
80-
self._octopus = False
8176

8277
return raw
8378

8479
def _extract_tags(self, raw, instance):
8580
tags = instance.get('tags', [])
8681
fsid = None
87-
if self._octopus:
82+
try:
8883
fsid = raw['status']['fsid']
89-
elif 'mon_status' in raw:
90-
fsid = raw['mon_status']['monmap']['fsid']
91-
tags.append(self.NAMESPACE + '_mon_state:%s' % raw['mon_status']['state'])
92-
else:
93-
self.log.debug("Could not find fsid")
84+
except KeyError:
85+
if 'mon_status' in raw:
86+
fsid = raw['mon_status']['monmap']['fsid']
87+
tags.append(self.NAMESPACE + '_mon_state:%s' % raw['mon_status']['state'])
88+
else:
89+
self.log.debug("Could not find fsid")
9490

9591
if fsid is not None:
9692
tags.append(self.NAMESPACE + '_fsid:%s' % fsid)
@@ -276,29 +272,29 @@ def _extract_metrics(self, raw, tags):
276272
except KeyError:
277273
self.log.debug('Error retrieving pgstatus metrics')
278274

279-
if self._octopus:
280-
try:
281-
num_mons = int(raw['status']['monmap']['num_mons'])
282-
self.gauge(self.NAMESPACE + '.num_mons', num_mons, tags)
283-
except KeyError:
284-
self.log.debug('Error retrieving num_mons metric')
285-
else:
286-
try:
287-
num_mons = len(raw['mon_status']['monmap']['mons'])
288-
self.gauge(self.NAMESPACE + '.num_mons', num_mons, tags)
289-
except KeyError:
290-
self.log.debug('Error retrieving mon_status metrics')
275+
try:
276+
num_mons = int(raw['status']['monmap']['num_mons'])
277+
self.gauge(self.NAMESPACE + '.num_mons', num_mons, tags)
278+
except KeyError:
279+
if 'mon_status' in raw:
280+
try:
281+
num_mons = len(raw['mon_status']['monmap']['mons'])
282+
self.gauge(self.NAMESPACE + '.num_mons', num_mons, tags)
283+
except KeyError:
284+
self.log.debug('Error retrieving mon_status metrics')
291285

292-
try:
293-
num_mons_active = len(raw['mon_status']['quorum'])
294-
self.gauge(self.NAMESPACE + '.num_mons.active', num_mons_active, tags)
295-
except KeyError:
296-
self.log.debug('Error retrieving mon_status quorum metrics')
286+
try:
287+
num_mons_active = len(raw['mon_status']['quorum'])
288+
self.gauge(self.NAMESPACE + '.num_mons.active', num_mons_active, tags)
289+
except KeyError:
290+
self.log.debug('Error retrieving mon_status quorum metrics')
291+
else:
292+
self.log.debug('Error retrieving num_mons metric')
297293

298294
try:
299295
stats = raw['df_detail']['stats']
300-
if not self._octopus:
301-
self._publish(stats, self.gauge, ['total_objects'], tags)
296+
# This will only work on Ceph versions prior to `octopus`, but will catch+return on later versions
297+
self._publish(stats, self.gauge, ['total_objects'], tags)
302298
used = float(stats['total_used_bytes'])
303299
total = float(stats['total_bytes'])
304300
if total > 0:

0 commit comments

Comments
 (0)