From 9e52bc2175cefa9d0f138b6f04841b9ed3e012be Mon Sep 17 00:00:00 2001 From: Titus Ou <58448745+pensan-tou@users.noreply.github.com> Date: Wed, 27 Aug 2025 10:49:07 -0700 Subject: [PATCH 1/2] remove grafana dashboards (#311) * remove grafana dashboards * link to 1.4.0 branch --- grafana/README.md | 6 +- grafana/dashboard_gpu.json | 2130 ----------------------- grafana/dashboard_job.json | 2828 ------------------------------- grafana/dashboard_node.json | 2440 -------------------------- grafana/dashboard_overview.json | 2369 -------------------------- 5 files changed, 1 insertion(+), 9772 deletions(-) delete mode 100644 grafana/dashboard_gpu.json delete mode 100644 grafana/dashboard_job.json delete mode 100644 grafana/dashboard_node.json delete mode 100644 grafana/dashboard_overview.json diff --git a/grafana/README.md b/grafana/README.md index 2765369c..ce0e125e 100644 --- a/grafana/README.md +++ b/grafana/README.md @@ -1,7 +1,3 @@ # Grafana Dashboards -### Variables - -Variables can be configured at any time in each dashboard's **Settings > Variables** section. - -**g_metrics_prefix**: string to prefix names of metrics queries (e.g. gpu_gfx_activity -> amd_gpu_gfx_activity) +Grafana dashboards can be found in the [ROCm/device-metrics-exporter](https://github.com/ROCm/device-metrics-exporter) repository under the [grafana](https://github.com/ROCm/device-metrics-exporter/tree/release-v1.4.0/grafana) directory. diff --git a/grafana/dashboard_gpu.json b/grafana/dashboard_gpu.json deleted file mode 100644 index d9403251..00000000 --- a/grafana/dashboard_gpu.json +++ /dev/null @@ -1,2130 +0,0 @@ -{ - "__inputs": [ - { - "name": "DS_PROMETHEUS", - "label": "Prometheus", - "description": "", - "type": "datasource", - "pluginId": "prometheus", - "pluginName": "Prometheus" - }, - { - "name": "DS_EXPRESSION", - "label": "Expression", - "description": "", - "type": "datasource", - "pluginId": "__expr__" - } - ], - "__elements": {}, - "__requires": [ - { - "type": "datasource", - "id": "__expr__", - "version": "1.0.0" - }, - { - "type": "panel", - "id": "gauge", - "name": "Gauge", - "version": "" - }, - { - "type": "grafana", - "id": "grafana", - "name": "Grafana", - "version": "11.2.2" - }, - { - "type": "datasource", - "id": "prometheus", - "name": "Prometheus", - "version": "1.0.0" - }, - { - "type": "panel", - "id": "stat", - "name": "Stat", - "version": "" - }, - { - "type": "panel", - "id": "text", - "name": "Text", - "version": "" - }, - { - "type": "panel", - "id": "timeseries", - "name": "Time series", - "version": "" - } - ], - "annotations": { - "list": [ - { - "builtIn": 1, - "datasource": { - "type": "grafana", - "uid": "-- Grafana --" - }, - "enable": true, - "hide": true, - "iconColor": "rgba(0, 211, 255, 1)", - "name": "Annotations & Alerts", - "type": "dashboard" - } - ] - }, - "description": "View by GPU", - "editable": true, - "fiscalYearStartMonth": 0, - "graphTooltip": 0, - "id": null, - "links": [], - "panels": [ - { - "gridPos": { - "h": 10, - "w": 3, - "x": 0, - "y": 0 - }, - "id": 23, - "options": { - "code": { - "language": "plaintext", - "showLineNumbers": false, - "showMiniMap": false - }, - "content": "##### VENDOR\n${g_card_vendor}\n\n##### SERIES\n${g_card_series}\n\n##### MODEL\n${g_card_model}\n\n##### SERIAL\n${g_serial_number}\n\n##### UUID\n${g_gpu_uuid}", - "mode": "markdown" - }, - "pluginVersion": "11.2.2", - "type": "text" - }, - { - "gridPos": { - "h": 6, - "w": 3, - "x": 3, - "y": 0 - }, - "id": 33, - "options": { - "code": { - "language": "plaintext", - "showLineNumbers": false, - "showMiniMap": false - }, - "content": "#### HOST\n${g_hostname}\n\n#### GPU ID\n${g_gpu_id}\n\n#### PARTITION\n${g_gpu_partition_id}", - "mode": "markdown" - }, - "pluginVersion": "11.2.2", - "type": "text" - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "description": "GPU package power usage", - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 700 - } - ] - }, - "unit": "watt" - }, - "overrides": [] - }, - "gridPos": { - "h": 4, - "w": 4, - "x": 6, - "y": 0 - }, - "id": 3, - "options": { - "minVizHeight": 75, - "minVizWidth": 75, - "orientation": "auto", - "reduceOptions": { - "calcs": [ - "last" - ], - "fields": "", - "values": false - }, - "showThresholdLabels": false, - "showThresholdMarkers": true, - "sizing": "auto" - }, - "pluginVersion": "11.2.2", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "disableTextWrap": false, - "editorMode": "builder", - "exemplar": false, - "expr": "${g_metrics_prefix}gpu_average_package_power{gpu_uuid=\"$g_gpu_uuid\", hostname=\"$g_hostname\"}", - "fullMetaSearch": false, - "hide": false, - "includeNullMetadata": true, - "instant": false, - "legendFormat": "{{hostname}}[{{gpu_id}}]", - "range": true, - "refId": "A", - "useBackend": false - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "disableTextWrap": false, - "editorMode": "builder", - "expr": "${g_metrics_prefix}gpu_package_power{gpu_uuid=\"$g_gpu_uuid\", hostname=\"$g_hostname\"}", - "fullMetaSearch": false, - "hide": false, - "includeNullMetadata": true, - "instant": false, - "legendFormat": "{{hostname}}[{{gpu_id}}]", - "range": true, - "refId": "B", - "useBackend": false - } - ], - "title": "GPU Power Usage", - "transformations": [ - { - "id": "calculateField", - "options": { - "alias": "GPU Package Power", - "mode": "reduceRow", - "reduce": { - "reducer": "sum" - }, - "replaceFields": true - } - } - ], - "type": "gauge" - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "description": "Current maximum PCIe speed", - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - } - ] - }, - "unit": "MBs" - }, - "overrides": [] - }, - "gridPos": { - "h": 5, - "w": 3, - "x": 10, - "y": 0 - }, - "id": 22, - "maxDataPoints": 60, - "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "auto", - "orientation": "auto", - "percentChangeColorMode": "standard", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "showPercentChange": false, - "textMode": "auto", - "wideLayout": true - }, - "pluginVersion": "11.2.2", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "disableTextWrap": false, - "editorMode": "builder", - "exemplar": false, - "expr": "${g_metrics_prefix}pcie_max_speed{gpu_uuid=\"$g_gpu_uuid\", hostname=\"$g_hostname\"}", - "fullMetaSearch": false, - "includeNullMetadata": true, - "instant": false, - "legendFormat": "{{hostname}}[{{gpu_id}}]", - "range": true, - "refId": "A", - "useBackend": false - } - ], - "title": "PCIe Max Speed", - "transformations": [ - { - "id": "calculateField", - "options": { - "alias": "PCIe Max Speed", - "mode": "reduceRow", - "reduce": { - "reducer": "sum" - }, - "replaceFields": true - } - } - ], - "type": "stat" - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - } - }, - "overrides": [] - }, - "gridPos": { - "h": 3, - "w": 8, - "x": 13, - "y": 0 - }, - "id": 34, - "options": { - "colorMode": "value", - "graphMode": "none", - "justifyMode": "auto", - "orientation": "auto", - "percentChangeColorMode": "standard", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "showPercentChange": false, - "textMode": "auto", - "wideLayout": true - }, - "pluginVersion": "11.2.2", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "disableTextWrap": false, - "editorMode": "builder", - "exemplar": false, - "expr": "delta(${g_metrics_prefix}pcie_recovery_count{gpu_uuid=\"$g_gpu_uuid\", hostname=\"$g_hostname\"}[$__interval])", - "fullMetaSearch": false, - "includeNullMetadata": true, - "instant": true, - "legendFormat": "Recovery", - "range": false, - "refId": "A", - "useBackend": false - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "disableTextWrap": false, - "editorMode": "builder", - "exemplar": false, - "expr": "delta(${g_metrics_prefix}pcie_replay_count{gpu_uuid=\"$g_gpu_uuid\", hostname=\"$g_hostname\"}[$__interval])", - "fullMetaSearch": false, - "hide": false, - "includeNullMetadata": true, - "instant": true, - "legendFormat": "Replay", - "range": false, - "refId": "B", - "useBackend": false - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "disableTextWrap": false, - "editorMode": "builder", - "exemplar": false, - "expr": "delta(${g_metrics_prefix}pcie_replay_rollover_count{gpu_uuid=\"$g_gpu_uuid\", hostname=\"$g_hostname\"}[$__interval])", - "fullMetaSearch": false, - "hide": false, - "includeNullMetadata": true, - "instant": true, - "legendFormat": "Replay Rollover", - "range": false, - "refId": "C", - "useBackend": false - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "disableTextWrap": false, - "editorMode": "builder", - "exemplar": false, - "expr": "delta(${g_metrics_prefix}pcie_nack_received_count{gpu_uuid=\"$g_gpu_uuid\", hostname=\"$g_hostname\"}[$__interval])", - "fullMetaSearch": false, - "hide": false, - "includeNullMetadata": true, - "instant": true, - "legendFormat": "NACK Received", - "range": false, - "refId": "D", - "useBackend": false - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "disableTextWrap": false, - "editorMode": "builder", - "exemplar": false, - "expr": "delta(${g_metrics_prefix}pcie_nack_sent_count{gpu_uuid=\"$g_gpu_uuid\", hostname=\"$g_hostname\"}[$__interval])", - "fullMetaSearch": false, - "hide": false, - "includeNullMetadata": true, - "instant": true, - "legendFormat": "NACK Sent", - "range": false, - "refId": "E", - "useBackend": false - } - ], - "title": "PCIe Counts", - "type": "stat" - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "description": "Most recent health status of GPU", - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [ - { - "options": { - "0": { - "color": "red", - "index": 0, - "text": "Unhealthy" - }, - "1": { - "color": "green", - "index": 1, - "text": "Healthy" - } - }, - "type": "value" - } - ], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - } - ] - } - }, - "overrides": [] - }, - "gridPos": { - "h": 4, - "w": 3, - "x": 21, - "y": 0 - }, - "id": 41, - "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "auto", - "orientation": "auto", - "percentChangeColorMode": "standard", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "showPercentChange": false, - "textMode": "auto", - "wideLayout": true - }, - "pluginVersion": "11.2.2", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "disableTextWrap": false, - "editorMode": "builder", - "expr": "${g_metrics_prefix}gpu_health{gpu_uuid=\"$g_gpu_uuid\", hostname=\"$g_hostname\"}", - "fullMetaSearch": false, - "includeNullMetadata": true, - "instant": false, - "legendFormat": "__auto", - "range": true, - "refId": "A", - "useBackend": false - } - ], - "title": "Health", - "type": "stat" - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "description": "Current PCIe bandwidth over time", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisBorderShow": false, - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "barWidthFactor": 0.6, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "insertNulls": false, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "displayName": "PCIe Bandwidth", - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "MBs" - }, - "overrides": [] - }, - "gridPos": { - "h": 7, - "w": 8, - "x": 13, - "y": 3 - }, - "id": 37, - "maxDataPoints": 60, - "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "single", - "sort": "none" - } - }, - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "disableTextWrap": false, - "editorMode": "builder", - "expr": "${g_metrics_prefix}pcie_bandwidth{gpu_uuid=\"$g_gpu_uuid\"}", - "fullMetaSearch": false, - "includeNullMetadata": true, - "instant": false, - "legendFormat": "__auto", - "range": true, - "refId": "A", - "useBackend": false - } - ], - "title": "PCIe Bandwidth", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "description": "% used VRAM of the GPU", - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "percent" - }, - "overrides": [] - }, - "gridPos": { - "h": 3, - "w": 4, - "x": 6, - "y": 4 - }, - "id": 1, - "maxDataPoints": 60, - "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "auto", - "orientation": "auto", - "percentChangeColorMode": "standard", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "/^VRAM Usage$/", - "values": false - }, - "showPercentChange": false, - "textMode": "auto", - "wideLayout": true - }, - "pluginVersion": "11.2.2", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "disableTextWrap": false, - "editorMode": "builder", - "exemplar": false, - "expr": "sum(${g_metrics_prefix}gpu_used_vram{gpu_uuid=\"$g_gpu_uuid\"})", - "fullMetaSearch": false, - "includeNullMetadata": true, - "instant": false, - "legendFormat": "__auto", - "range": true, - "refId": "A", - "useBackend": false - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "disableTextWrap": false, - "editorMode": "builder", - "expr": "sum(${g_metrics_prefix}gpu_total_vram{gpu_uuid=\"$g_gpu_uuid\"})", - "fullMetaSearch": false, - "hide": false, - "includeNullMetadata": true, - "instant": false, - "legendFormat": "__auto", - "range": true, - "refId": "B", - "useBackend": false - }, - { - "datasource": { - "type": "__expr__", - "uid": "${DS_EXPRESSION}" - }, - "expression": "($A / $B) * 100", - "hide": false, - "refId": "VRAM Usage", - "type": "math" - } - ], - "title": "VRAM Usage", - "type": "stat" - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - } - }, - "overrides": [] - }, - "gridPos": { - "h": 3, - "w": 3, - "x": 21, - "y": 4 - }, - "id": 38, - "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "auto", - "orientation": "auto", - "percentChangeColorMode": "standard", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "showPercentChange": false, - "textMode": "auto", - "wideLayout": true - }, - "pluginVersion": "11.2.2", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "disableTextWrap": false, - "editorMode": "builder", - "exemplar": false, - "expr": "sum(delta(${g_metrics_prefix}gpu_ecc_correct_total{gpu_uuid=\"$g_gpu_uuid\"}[$__interval]))", - "fullMetaSearch": false, - "includeNullMetadata": true, - "instant": true, - "legendFormat": "Total Correctable ECC", - "range": false, - "refId": "A", - "useBackend": false - } - ], - "title": "Total Correctable ECC", - "type": "stat" - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "description": "Current PCIe bandwidth", - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - } - ] - }, - "unit": "MBs" - }, - "overrides": [] - }, - "gridPos": { - "h": 5, - "w": 3, - "x": 10, - "y": 5 - }, - "id": 19, - "maxDataPoints": 60, - "options": { - "colorMode": "value", - "graphMode": "none", - "justifyMode": "auto", - "orientation": "auto", - "percentChangeColorMode": "standard", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "showPercentChange": false, - "textMode": "auto", - "wideLayout": true - }, - "pluginVersion": "11.2.2", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "disableTextWrap": false, - "editorMode": "builder", - "exemplar": false, - "expr": "${g_metrics_prefix}pcie_bandwidth{gpu_uuid=\"$g_gpu_uuid\", hostname=\"$g_hostname\"}", - "fullMetaSearch": false, - "includeNullMetadata": true, - "instant": true, - "legendFormat": "{{hostname}}[{{gpu_id}}]", - "range": false, - "refId": "A", - "useBackend": false - } - ], - "title": "PCIe Bandwidth", - "type": "stat" - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "gridPos": { - "h": 4, - "w": 3, - "x": 3, - "y": 6 - }, - "id": 25, - "options": { - "code": { - "language": "plaintext", - "showLineNumbers": false, - "showMiniMap": false - }, - "content": "#### VBIOS\n${g_gpu_vbios}\n\n#### DRIVER\n${g_driver}", - "mode": "markdown" - }, - "pluginVersion": "11.2.2", - "type": "text" - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "description": "Accumulated energy consumed", - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "displayName": "Energy Consumed", - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "joule" - }, - "overrides": [] - }, - "gridPos": { - "h": 3, - "w": 4, - "x": 6, - "y": 7 - }, - "id": 8, - "options": { - "colorMode": "value", - "graphMode": "none", - "justifyMode": "auto", - "orientation": "auto", - "percentChangeColorMode": "standard", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "showPercentChange": false, - "textMode": "value", - "wideLayout": true - }, - "pluginVersion": "11.2.2", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "disableTextWrap": false, - "editorMode": "builder", - "exemplar": false, - "expr": "delta(${g_metrics_prefix}gpu_energy_consumed{gpu_uuid=\"$g_gpu_uuid\", hostname=\"$g_hostname\"}[$__interval])", - "fullMetaSearch": false, - "hide": true, - "includeNullMetadata": true, - "instant": true, - "legendFormat": "{{hostname}}[{{gpu_id}}]", - "range": false, - "refId": "A", - "useBackend": false - }, - { - "datasource": { - "type": "__expr__", - "uid": "${DS_EXPRESSION}" - }, - "expression": "$A / 1000000", - "hide": false, - "refId": "Joules", - "type": "math" - } - ], - "title": "Energy Consumed", - "type": "stat" - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - } - }, - "overrides": [] - }, - "gridPos": { - "h": 3, - "w": 3, - "x": 21, - "y": 7 - }, - "id": 39, - "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "auto", - "orientation": "auto", - "percentChangeColorMode": "standard", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "showPercentChange": false, - "textMode": "auto", - "wideLayout": true - }, - "pluginVersion": "11.2.2", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "disableTextWrap": false, - "editorMode": "builder", - "exemplar": false, - "expr": "sum(delta(${g_metrics_prefix}gpu_ecc_uncorrect_total{gpu_uuid=\"$g_gpu_uuid\"}[$__interval]))", - "fullMetaSearch": false, - "includeNullMetadata": true, - "instant": true, - "legendFormat": "Total Uncorrectable ECC", - "range": false, - "refId": "A", - "useBackend": false - } - ], - "title": "Total Uncorrectable ECC", - "type": "stat" - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "description": "Current GFX activity", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisBorderShow": false, - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "barWidthFactor": 0.6, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "insertNulls": false, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "displayName": "GPU Usage", - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "percent" - }, - "overrides": [] - }, - "gridPos": { - "h": 7, - "w": 8, - "x": 0, - "y": 10 - }, - "id": 9, - "maxDataPoints": 60, - "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "single", - "sort": "none" - } - }, - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "disableTextWrap": false, - "editorMode": "builder", - "expr": "${g_metrics_prefix}gpu_gfx_activity{gpu_uuid=\"$g_gpu_uuid\"}", - "fullMetaSearch": false, - "includeNullMetadata": true, - "instant": false, - "legendFormat": "__auto", - "range": true, - "refId": "A", - "useBackend": false - } - ], - "title": "GPU Usage", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "description": "Used VRAM on the GPU over time", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisBorderShow": false, - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "barWidthFactor": 0.6, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "insertNulls": false, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "displayName": "Used VRAM", - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "percent" - }, - "overrides": [] - }, - "gridPos": { - "h": 7, - "w": 8, - "x": 8, - "y": 10 - }, - "id": 30, - "maxDataPoints": 60, - "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "single", - "sort": "none" - } - }, - "pluginVersion": "11.2.2", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "disableTextWrap": false, - "editorMode": "builder", - "expr": "${g_metrics_prefix}gpu_used_vram{gpu_uuid=\"$g_gpu_uuid\"}", - "fullMetaSearch": false, - "hide": true, - "includeNullMetadata": true, - "instant": false, - "legendFormat": "Used VRAM", - "range": true, - "refId": "A", - "useBackend": false - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "disableTextWrap": false, - "editorMode": "builder", - "expr": "clamp_min(${g_metrics_prefix}gpu_total_vram{gpu_uuid=\"$g_gpu_uuid\"}, 1)", - "fullMetaSearch": false, - "hide": true, - "includeNullMetadata": true, - "instant": false, - "legendFormat": "Total VRAM", - "range": true, - "refId": "B", - "useBackend": false - }, - { - "datasource": { - "type": "__expr__", - "uid": "${DS_EXPRESSION}" - }, - "downsampler": "mean", - "expression": "($A / $B) * 100", - "hide": false, - "refId": "C", - "type": "math", - "upsampler": "fillna" - } - ], - "title": "Used VRAM", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "description": "GPU package power, in Watts", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisBorderShow": false, - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "barWidthFactor": 0.6, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "insertNulls": false, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "displayName": "GPU Power", - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "watt" - }, - "overrides": [] - }, - "gridPos": { - "h": 7, - "w": 8, - "x": 16, - "y": 10 - }, - "id": 6, - "maxDataPoints": 60, - "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "single", - "sort": "none" - } - }, - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "disableTextWrap": false, - "editorMode": "builder", - "expr": "${g_metrics_prefix}gpu_package_power{gpu_uuid=\"$g_gpu_uuid\", card_model=~\"102-G30211-00|102-G30211-0C|102-G30211-4C|102-G30212-0C|102-G30213-00|102-G30213-0C\"}", - "fullMetaSearch": false, - "includeNullMetadata": true, - "instant": false, - "legendFormat": "__auto", - "range": true, - "refId": "A", - "useBackend": false - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "editorMode": "builder", - "expr": "${g_metrics_prefix}gpu_average_package_power{gpu_uuid=\"$g_gpu_uuid\", card_model!~\"102-G30211-00|102-G30211-0C|102-G30211-4C|102-G30212-0C|102-G30213-00|102-G30213-0C\"}", - "hide": false, - "instant": false, - "legendFormat": "__auto", - "range": true, - "refId": "B" - } - ], - "title": "GPU Package Power", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "description": "Current GPU temperature, in Celsius", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisBorderShow": false, - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "barWidthFactor": 0.6, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "insertNulls": false, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "displayName": "GPU Temperature", - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "celsius" - }, - "overrides": [] - }, - "gridPos": { - "h": 7, - "w": 8, - "x": 0, - "y": 17 - }, - "id": 32, - "maxDataPoints": 60, - "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "single", - "sort": "none" - } - }, - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "disableTextWrap": false, - "editorMode": "builder", - "expr": "${g_metrics_prefix}gpu_edge_temperature{gpu_uuid=\"$g_gpu_uuid\", card_model!~\"102-G30211-00|102-G30211-0C|102-G30211-4C|102-G30212-0C|102-G30213-00|102-G30213-0C\"}", - "fullMetaSearch": false, - "includeNullMetadata": true, - "instant": false, - "legendFormat": "__auto", - "range": true, - "refId": "A", - "useBackend": false - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "disableTextWrap": false, - "editorMode": "builder", - "expr": "${g_metrics_prefix}gpu_junction_temperature{gpu_uuid=\"$g_gpu_uuid\", card_model=~\"102-G30211-00|102-G30211-0C|102-G30211-4C|102-G30212-0C|102-G30213-00|102-G30213-0C\"}", - "fullMetaSearch": false, - "hide": false, - "includeNullMetadata": true, - "instant": false, - "legendFormat": "__auto", - "range": true, - "refId": "B", - "useBackend": false - } - ], - "title": "GPU Temperature", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "description": "Current memory temperature, in Celsius", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisBorderShow": false, - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "barWidthFactor": 0.6, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "insertNulls": false, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "displayName": "Memory Temperature", - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "celsius" - }, - "overrides": [] - }, - "gridPos": { - "h": 7, - "w": 8, - "x": 8, - "y": 17 - }, - "id": 14, - "maxDataPoints": 60, - "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "single", - "sort": "none" - } - }, - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "disableTextWrap": false, - "editorMode": "builder", - "expr": "${g_metrics_prefix}gpu_memory_temperature{gpu_uuid=\"$g_gpu_uuid\"}", - "fullMetaSearch": false, - "includeNullMetadata": true, - "instant": false, - "legendFormat": "__auto", - "range": true, - "refId": "A", - "useBackend": false - } - ], - "title": "Memory Temperature", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "description": "Current temperatures, in Celsius:\n- 4 HBM temperatures\n- Edge temperature\n- Junction/hotspot temperature", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisBorderShow": false, - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "barWidthFactor": 0.6, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "insertNulls": false, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "celsius" - }, - "overrides": [] - }, - "gridPos": { - "h": 7, - "w": 8, - "x": 16, - "y": 17 - }, - "id": 10, - "maxDataPoints": 60, - "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "single", - "sort": "none" - } - }, - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "disableTextWrap": false, - "editorMode": "builder", - "expr": "${g_metrics_prefix}gpu_hbm_temperature{gpu_uuid=\"$g_gpu_uuid\"}", - "fullMetaSearch": false, - "includeNullMetadata": true, - "instant": false, - "legendFormat": "HBM - {{hbm_index}}", - "range": true, - "refId": "A", - "useBackend": false - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "disableTextWrap": false, - "editorMode": "builder", - "expr": "${g_metrics_prefix}gpu_edge_temperature{gpu_uuid=\"$g_gpu_uuid\"}", - "fullMetaSearch": false, - "hide": false, - "includeNullMetadata": true, - "instant": false, - "legendFormat": "Edge Temperature", - "range": true, - "refId": "B", - "useBackend": false - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "disableTextWrap": false, - "editorMode": "builder", - "expr": "${g_metrics_prefix}gpu_junction_temperature{gpu_uuid=\"$g_gpu_uuid\"}", - "fullMetaSearch": false, - "hide": false, - "includeNullMetadata": true, - "instant": false, - "legendFormat": "Junction Temperature", - "range": true, - "refId": "C", - "useBackend": false - } - ], - "title": "Temperature Sensors", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "gridPos": { - "h": 2, - "w": 2, - "x": 22, - "y": 24 - }, - "id": 40, - "options": { - "code": { - "language": "plaintext", - "showLineNumbers": false, - "showMiniMap": false - }, - "content": "main", - "mode": "markdown" - }, - "pluginVersion": "11.2.2", - "title": "Version", - "type": "text" - } - ], - "refresh": "", - "schemaVersion": 39, - "tags": [], - "templating": { - "list": [ - { - "current": { - "selected": false, - "text": "", - "value": "" - }, - "description": "string to prefix names of metrics queries (e.g. gpu_gfx_activity -> amd_gpu_gfx_activity)", - "hide": 2, - "label": "Metrics Prefix", - "name": "g_metrics_prefix", - "options": [ - { - "selected": true, - "text": "", - "value": "" - } - ], - "query": "", - "skipUrlSync": false, - "type": "textbox" - }, - { - "current": {}, - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "definition": "label_values({cluster_name=\"$g_cluster_name\", hostname=\"$g_hostname\", gpu_id=\"$g_gpu_id\", gpu_partition_id=\"$g_gpu_partition_id\"},gpu_uuid)", - "hide": 2, - "includeAll": false, - "label": "GPU UUID", - "multi": false, - "name": "g_gpu_uuid", - "options": [], - "query": { - "qryType": 1, - "query": "label_values({cluster_name=\"$g_cluster_name\", hostname=\"$g_hostname\", gpu_id=\"$g_gpu_id\", gpu_partition_id=\"$g_gpu_partition_id\"},gpu_uuid)", - "refId": "PrometheusVariableQueryEditor-VariableQuery" - }, - "refresh": 2, - "regex": "", - "skipUrlSync": false, - "sort": 7, - "type": "query" - }, - { - "current": {}, - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "definition": "label_values(cluster_name)", - "hide": 0, - "includeAll": false, - "label": "Cluster", - "multi": false, - "name": "g_cluster_name", - "options": [], - "query": { - "qryType": 1, - "query": "label_values(cluster_name)", - "refId": "PrometheusVariableQueryEditor-VariableQuery" - }, - "refresh": 2, - "regex": "", - "skipUrlSync": false, - "sort": 7, - "type": "query" - }, - { - "current": {}, - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "definition": "label_values({cluster_name=\"$g_cluster_name\"},hostname)", - "hide": 0, - "includeAll": false, - "label": "Hostname", - "multi": false, - "name": "g_hostname", - "options": [], - "query": { - "qryType": 1, - "query": "label_values({cluster_name=\"$g_cluster_name\"},hostname)", - "refId": "PrometheusVariableQueryEditor-VariableQuery" - }, - "refresh": 2, - "regex": "", - "skipUrlSync": false, - "sort": 7, - "type": "query" - }, - { - "current": {}, - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "definition": "label_values({hostname=\"$g_hostname\"},gpu_id)", - "hide": 0, - "includeAll": false, - "label": "GPU ID", - "multi": false, - "name": "g_gpu_id", - "options": [], - "query": { - "qryType": 1, - "query": "label_values({hostname=\"$g_hostname\"},gpu_id)", - "refId": "PrometheusVariableQueryEditor-VariableQuery" - }, - "refresh": 2, - "regex": "", - "skipUrlSync": false, - "sort": 7, - "type": "query" - }, - { - "current": {}, - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "definition": "label_values({hostname=\"$g_hostname\", gpu_id=\"$g_gpu_id\"},gpu_partition_id)", - "hide": 0, - "includeAll": true, - "label": "Partition", - "multi": false, - "name": "g_gpu_partition_id", - "options": [], - "query": { - "qryType": 1, - "query": "label_values({hostname=\"$g_hostname\", gpu_id=\"$g_gpu_id\"},gpu_partition_id)", - "refId": "PrometheusVariableQueryEditor-VariableQuery" - }, - "refresh": 2, - "regex": "", - "skipUrlSync": false, - "sort": 7, - "type": "query" - }, - { - "current": {}, - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "definition": "label_values({gpu_uuid=\"$g_gpu_uuid\"},vbios_version)", - "hide": 2, - "includeAll": false, - "multi": false, - "name": "g_gpu_vbios", - "options": [], - "query": { - "qryType": 1, - "query": "label_values({gpu_uuid=\"$g_gpu_uuid\"},vbios_version)", - "refId": "PrometheusVariableQueryEditor-VariableQuery" - }, - "refresh": 2, - "regex": "", - "skipUrlSync": false, - "sort": 0, - "type": "query" - }, - { - "current": {}, - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "definition": "label_values({gpu_uuid=\"$g_gpu_uuid\"},driver_version)", - "hide": 2, - "includeAll": false, - "multi": false, - "name": "g_driver", - "options": [], - "query": { - "qryType": 1, - "query": "label_values({gpu_uuid=\"$g_gpu_uuid\"},driver_version)", - "refId": "PrometheusVariableQueryEditor-VariableQuery" - }, - "refresh": 2, - "regex": "", - "skipUrlSync": false, - "sort": 0, - "type": "query" - }, - { - "current": {}, - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "definition": "label_values({gpu_uuid=\"$g_gpu_uuid\"},card_vendor)", - "hide": 2, - "includeAll": false, - "multi": false, - "name": "g_card_vendor", - "options": [], - "query": { - "qryType": 1, - "query": "label_values({gpu_uuid=\"$g_gpu_uuid\"},card_vendor)", - "refId": "PrometheusVariableQueryEditor-VariableQuery" - }, - "refresh": 2, - "regex": "", - "skipUrlSync": false, - "sort": 0, - "type": "query" - }, - { - "current": {}, - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "definition": "label_values({gpu_uuid=\"$g_gpu_uuid\"},card_series)", - "hide": 2, - "includeAll": false, - "multi": false, - "name": "g_card_series", - "options": [], - "query": { - "qryType": 1, - "query": "label_values({gpu_uuid=\"$g_gpu_uuid\"},card_series)", - "refId": "PrometheusVariableQueryEditor-VariableQuery" - }, - "refresh": 2, - "regex": "", - "skipUrlSync": false, - "sort": 0, - "type": "query" - }, - { - "current": {}, - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "definition": "label_values({gpu_uuid=\"$g_gpu_uuid\"},card_model)", - "hide": 2, - "includeAll": false, - "multi": false, - "name": "g_card_model", - "options": [], - "query": { - "qryType": 1, - "query": "label_values({gpu_uuid=\"$g_gpu_uuid\"},card_model)", - "refId": "PrometheusVariableQueryEditor-VariableQuery" - }, - "refresh": 2, - "regex": "", - "skipUrlSync": false, - "sort": 0, - "type": "query" - }, - { - "current": {}, - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "definition": "label_values({gpu_uuid=\"$g_gpu_uuid\"},serial_number)", - "hide": 2, - "includeAll": false, - "multi": false, - "name": "g_serial_number", - "options": [], - "query": { - "qryType": 1, - "query": "label_values({gpu_uuid=\"$g_gpu_uuid\"},serial_number)", - "refId": "PrometheusVariableQueryEditor-VariableQuery" - }, - "refresh": 2, - "regex": "", - "skipUrlSync": false, - "sort": 0, - "type": "query" - } - ] - }, - "time": { - "from": "now-24h", - "to": "now" - }, - "timepicker": {}, - "timezone": "browser", - "title": "GPU", - "uid": "ae0aj8euc43r4b", - "version": 1, - "weekStart": "" -} \ No newline at end of file diff --git a/grafana/dashboard_job.json b/grafana/dashboard_job.json deleted file mode 100644 index cf940076..00000000 --- a/grafana/dashboard_job.json +++ /dev/null @@ -1,2828 +0,0 @@ -{ - "__inputs": [ - { - "name": "DS_PROMETHEUS", - "label": "Prometheus", - "description": "", - "type": "datasource", - "pluginId": "prometheus", - "pluginName": "Prometheus" - }, - { - "name": "DS_EXPRESSION", - "label": "Expression", - "description": "", - "type": "datasource", - "pluginId": "__expr__" - } - ], - "__elements": {}, - "__requires": [ - { - "type": "datasource", - "id": "__expr__", - "version": "1.0.0" - }, - { - "type": "grafana", - "id": "grafana", - "name": "Grafana", - "version": "11.2.2" - }, - { - "type": "datasource", - "id": "prometheus", - "name": "Prometheus", - "version": "1.0.0" - }, - { - "type": "panel", - "id": "stat", - "name": "Stat", - "version": "" - }, - { - "type": "panel", - "id": "table", - "name": "Table", - "version": "" - }, - { - "type": "panel", - "id": "text", - "name": "Text", - "version": "" - }, - { - "type": "panel", - "id": "timeseries", - "name": "Time series", - "version": "" - } - ], - "annotations": { - "list": [ - { - "builtIn": 1, - "datasource": { - "type": "grafana", - "uid": "-- Grafana --" - }, - "enable": true, - "hide": true, - "iconColor": "rgba(0, 211, 255, 1)", - "name": "Annotations & Alerts", - "type": "dashboard" - } - ] - }, - "editable": true, - "fiscalYearStartMonth": 0, - "graphTooltip": 0, - "id": null, - "links": [], - "panels": [ - { - "description": "Name of the job ID or pod", - "gridPos": { - "h": 4, - "w": 5, - "x": 0, - "y": 0 - }, - "id": 2, - "options": { - "code": { - "language": "plaintext", - "showLineNumbers": false, - "showMiniMap": false - }, - "content": "${g_job_id}\n${g_pod}", - "mode": "markdown" - }, - "pluginVersion": "11.2.2", - "title": "Name", - "type": "text" - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "description": "% of VRAM used across all GPUs this job or pod is running on", - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "percentunit" - }, - "overrides": [] - }, - "gridPos": { - "h": 4, - "w": 3, - "x": 5, - "y": 0 - }, - "id": 10, - "maxDataPoints": 60, - "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "auto", - "orientation": "auto", - "percentChangeColorMode": "standard", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "showPercentChange": false, - "textMode": "auto", - "wideLayout": true - }, - "pluginVersion": "11.2.2", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "disableTextWrap": false, - "editorMode": "code", - "expr": "sum(${g_metrics_prefix}gpu_used_vram{job_id!=\"\", job_id=\"$g_job_id\"})", - "fullMetaSearch": false, - "hide": true, - "includeNullMetadata": true, - "instant": false, - "legendFormat": "__auto", - "range": true, - "refId": "A", - "useBackend": false - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "editorMode": "builder", - "expr": "sum(${g_metrics_prefix}gpu_total_vram{job_id!=\"\", job_id=\"$g_job_id\"})", - "hide": true, - "instant": false, - "legendFormat": "__auto", - "range": true, - "refId": "B" - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "disableTextWrap": false, - "editorMode": "builder", - "expr": "sum(${g_metrics_prefix}gpu_used_vram{pod!=\"\", pod=\"$g_pod\"})", - "fullMetaSearch": false, - "hide": true, - "includeNullMetadata": true, - "instant": false, - "legendFormat": "__auto", - "range": true, - "refId": "C", - "useBackend": false - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "disableTextWrap": false, - "editorMode": "builder", - "expr": "sum(${g_metrics_prefix}gpu_total_vram{pod!=\"\", pod=\"$g_pod\"})", - "fullMetaSearch": false, - "hide": true, - "includeNullMetadata": true, - "instant": false, - "legendFormat": "__auto", - "range": true, - "refId": "D", - "useBackend": false - }, - { - "datasource": { - "type": "__expr__", - "uid": "${DS_EXPRESSION}" - }, - "expression": "$A/$B", - "hide": false, - "refId": "Job VRAM Usage", - "type": "math" - }, - { - "datasource": { - "type": "__expr__", - "uid": "${DS_EXPRESSION}" - }, - "expression": "$C/$D", - "hide": false, - "refId": "Pod VRAM Usage", - "type": "math" - } - ], - "title": "Memory Usage", - "type": "stat" - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "description": "Maximum PCIe speed", - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - } - ] - }, - "unit": "MBs" - }, - "overrides": [] - }, - "gridPos": { - "h": 3, - "w": 4, - "x": 8, - "y": 0 - }, - "id": 12, - "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "auto", - "orientation": "auto", - "percentChangeColorMode": "standard", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "showPercentChange": false, - "textMode": "auto", - "wideLayout": true - }, - "pluginVersion": "11.2.2", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "disableTextWrap": false, - "editorMode": "builder", - "exemplar": false, - "expr": "max(${g_metrics_prefix}pcie_max_speed{job_id!=\"\", job_id=\"$g_job_id\"})", - "fullMetaSearch": false, - "hide": false, - "includeNullMetadata": true, - "instant": false, - "legendFormat": "{{job_id}}", - "range": true, - "refId": "A", - "useBackend": false - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "disableTextWrap": false, - "editorMode": "builder", - "exemplar": false, - "expr": "max(${g_metrics_prefix}pcie_max_speed{pod!=\"\", pod=\"$g_pod\"})", - "fullMetaSearch": false, - "hide": false, - "includeNullMetadata": true, - "instant": false, - "legendFormat": "{{pod}}", - "range": true, - "refId": "B", - "useBackend": false - } - ], - "title": "PCIe Max Speed", - "type": "stat" - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - } - }, - "overrides": [] - }, - "gridPos": { - "h": 6, - "w": 4, - "x": 12, - "y": 0 - }, - "id": 15, - "options": { - "colorMode": "value", - "graphMode": "none", - "justifyMode": "auto", - "orientation": "auto", - "percentChangeColorMode": "standard", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "showPercentChange": false, - "textMode": "auto", - "wideLayout": true - }, - "pluginVersion": "11.2.2", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "disableTextWrap": false, - "editorMode": "code", - "expr": "sum(delta(${g_metrics_prefix}pcie_recovery_count{job_id!=\"\", job_id=\"$g_job_id\"}[$__interval]))", - "fullMetaSearch": false, - "includeNullMetadata": true, - "instant": false, - "legendFormat": "Recovery", - "range": true, - "refId": "A", - "useBackend": false - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "disableTextWrap": false, - "editorMode": "code", - "expr": "sum(delta(${g_metrics_prefix}pcie_recovery_count{pod!=\"\", pod=\"$g_pod\"}[$__interval]))", - "fullMetaSearch": false, - "hide": false, - "includeNullMetadata": true, - "instant": false, - "legendFormat": "Recovery", - "range": true, - "refId": "B", - "useBackend": false - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "editorMode": "code", - "expr": "sum(delta(${g_metrics_prefix}pcie_replay_count{job_id!=\"\", job_id=\"$g_job_id\"}[$__interval]))", - "hide": false, - "instant": false, - "legendFormat": "Replay", - "range": true, - "refId": "C" - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "editorMode": "code", - "expr": "sum(delta(${g_metrics_prefix}pcie_replay_count{pod!=\"\", pod=\"$g_pod\"}[$__interval]))", - "hide": false, - "instant": false, - "legendFormat": "Replay", - "range": true, - "refId": "D" - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "editorMode": "code", - "expr": "sum(delta(${g_metrics_prefix}pcie_replay_rollover_count{job_id!=\"\", job_id=\"$g_job_id\"}[$__interval]))", - "hide": false, - "instant": false, - "legendFormat": "Replay Rollover", - "range": true, - "refId": "E" - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "editorMode": "code", - "expr": "sum(delta(${g_metrics_prefix}pcie_replay_rollover_count{pod!=\"\", pod=\"$g_pod\"}[$__interval]))", - "hide": false, - "instant": false, - "legendFormat": "Replay Rollover", - "range": true, - "refId": "F" - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "editorMode": "code", - "expr": "sum(delta(${g_metrics_prefix}pcie_nack_received_count{job_id!=\"\", job_id=\"$g_job_id\"}[$__interval]))", - "hide": false, - "instant": false, - "legendFormat": "NACK Received", - "range": true, - "refId": "G" - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "editorMode": "code", - "expr": "sum(delta(${g_metrics_prefix}pcie_nack_received_count{pod!=\"\", pod=\"$g_pod\"}[$__interval]))", - "hide": false, - "instant": false, - "legendFormat": "NACK Received", - "range": true, - "refId": "H" - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "editorMode": "code", - "expr": "sum(delta(${g_metrics_prefix}pcie_nack_sent_count{job_id!=\"\", job_id=\"$g_job_id\"}[$__interval]))", - "hide": false, - "instant": false, - "legendFormat": "NACK Sent", - "range": true, - "refId": "I" - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "editorMode": "code", - "expr": "sum(delta(${g_metrics_prefix}pcie_nack_sent_count{pod!=\"\", pod=\"$g_pod\"}[$__interval]))", - "hide": false, - "instant": false, - "legendFormat": "NACK Sent", - "range": true, - "refId": "J" - } - ], - "title": "PCIe Counts", - "type": "stat" - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "description": "Top 5 current GPU GFX Activity, labeled with hostname and GPU ID.", - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "links": [ - { - "targetBlank": true, - "title": "Go to GPU Dashboard", - "url": "/d/ae0aj8euc43r4b/gpu?var-g_gpu_uuid=${__field.labels.gpu_uuid}&var-g_hostname=${__field.labels.hostname}&var-g_gpu_id=${__field.labels.gpu_id}" - } - ], - "mappings": [], - "noValue": "0", - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "percent" - }, - "overrides": [] - }, - "gridPos": { - "h": 12, - "w": 4, - "x": 16, - "y": 0 - }, - "id": 16, - "maxDataPoints": 60, - "options": { - "colorMode": "value", - "graphMode": "none", - "justifyMode": "auto", - "orientation": "horizontal", - "percentChangeColorMode": "standard", - "reduceOptions": { - "calcs": [ - "max" - ], - "fields": "", - "values": false - }, - "showPercentChange": false, - "textMode": "auto", - "wideLayout": true - }, - "pluginVersion": "11.2.2", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "disableTextWrap": false, - "editorMode": "code", - "exemplar": false, - "expr": "topk(5, ${g_metrics_prefix}gpu_gfx_activity{job_id!=\"\", job_id=\"$g_job_id\"})", - "fullMetaSearch": false, - "hide": false, - "includeNullMetadata": true, - "instant": false, - "legendFormat": "{{hostname}} | {{gpu_id}}", - "range": true, - "refId": "A", - "useBackend": false - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "disableTextWrap": false, - "editorMode": "builder", - "exemplar": false, - "expr": "topk(5, ${g_metrics_prefix}gpu_gfx_activity{pod!=\"\", pod=\"$g_pod\"})", - "fullMetaSearch": false, - "hide": false, - "includeNullMetadata": true, - "instant": false, - "legendFormat": "{{hostname}} | {{gpu_id}}", - "range": true, - "refId": "B", - "useBackend": false - } - ], - "title": "Top 5 GPU Usage", - "type": "stat" - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "description": "Top 5 GPUs with highest VRAM used, labeled with hostname and GPU ID.", - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "links": [ - { - "targetBlank": true, - "title": "Go to GPU Dashboard", - "url": "/d/ae0aj8euc43r4b/gpu?var-g_gpu_uuid=${__field.labels.gpu_uuid}&var-g_hostname=${__field.labels.hostname}&var-g_gpu_id=${__field.labels.gpu_id}" - } - ], - "mappings": [], - "noValue": "0", - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "decmbytes" - }, - "overrides": [] - }, - "gridPos": { - "h": 12, - "w": 4, - "x": 20, - "y": 0 - }, - "id": 17, - "options": { - "colorMode": "value", - "graphMode": "none", - "justifyMode": "auto", - "orientation": "horizontal", - "percentChangeColorMode": "standard", - "reduceOptions": { - "calcs": [ - "max" - ], - "fields": "", - "values": false - }, - "showPercentChange": false, - "textMode": "auto", - "wideLayout": true - }, - "pluginVersion": "11.2.2", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "disableTextWrap": false, - "editorMode": "builder", - "exemplar": false, - "expr": "topk(5, ${g_metrics_prefix}gpu_used_vram{job_id!=\"\", job_id=\"$g_job_id\"})", - "fullMetaSearch": false, - "includeNullMetadata": true, - "instant": false, - "legendFormat": "{{hostname}} | {{gpu_id}}", - "range": true, - "refId": "A", - "useBackend": false - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "disableTextWrap": false, - "editorMode": "code", - "exemplar": false, - "expr": "topk(5, ${g_metrics_prefix}gpu_used_vram{pod!=\"\", pod=\"$g_pod\"})", - "fullMetaSearch": false, - "hide": false, - "includeNullMetadata": true, - "instant": false, - "legendFormat": "{{hostname}} | {{gpu_id}}", - "range": true, - "refId": "B", - "useBackend": false - } - ], - "title": "Top 5 Used VRAM", - "type": "stat" - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "description": "Total current PCIe bandwidth", - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "MBs" - }, - "overrides": [] - }, - "gridPos": { - "h": 3, - "w": 4, - "x": 8, - "y": 3 - }, - "id": 13, - "maxDataPoints": 60, - "options": { - "colorMode": "value", - "graphMode": "none", - "justifyMode": "auto", - "orientation": "auto", - "percentChangeColorMode": "standard", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "showPercentChange": false, - "textMode": "auto", - "wideLayout": true - }, - "pluginVersion": "11.2.2", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "disableTextWrap": false, - "editorMode": "code", - "exemplar": false, - "expr": "sum(${g_metrics_prefix}pcie_bandwidth{job_id!=\"\", job_id=\"$g_job_id\"})", - "fullMetaSearch": false, - "includeNullMetadata": true, - "instant": false, - "legendFormat": "PCIe Bandwidth", - "range": true, - "refId": "A", - "useBackend": false - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "disableTextWrap": false, - "editorMode": "builder", - "exemplar": false, - "expr": "sum(${g_metrics_prefix}pcie_bandwidth{pod!=\"\", pod=\"$g_pod\"})", - "fullMetaSearch": false, - "hide": false, - "includeNullMetadata": true, - "instant": false, - "legendFormat": "PCIe Bandwidth", - "range": true, - "refId": "B", - "useBackend": false - } - ], - "title": "Total Current PCIe Bandwidth", - "type": "stat" - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "description": "\\# of compute nodes used by the job", - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "links": [ - { - "targetBlank": true, - "title": "Go to Compute Node Dashboard", - "url": "/d/de1q9vq97fe2oc/compute-node" - } - ], - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - } - }, - "overrides": [] - }, - "gridPos": { - "h": 4, - "w": 3, - "x": 0, - "y": 4 - }, - "id": 11, - "options": { - "colorMode": "value", - "graphMode": "none", - "justifyMode": "auto", - "orientation": "auto", - "percentChangeColorMode": "standard", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "showPercentChange": false, - "textMode": "auto", - "wideLayout": true - }, - "pluginVersion": "11.2.2", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "editorMode": "code", - "expr": "sum(group by(hostname) (${g_metrics_prefix}gpu_gfx_activity{job_id!=\"\", job_id=\"$g_job_id\"}))", - "instant": false, - "legendFormat": "# of compute nodes used", - "range": true, - "refId": "A" - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "editorMode": "code", - "expr": "sum(group by(hostname) (${g_metrics_prefix}gpu_gfx_activity{pod!=\"\", pod=\"$g_pod\"}))", - "hide": false, - "instant": false, - "legendFormat": "# of compute nodes used", - "range": true, - "refId": "B" - } - ], - "title": "Compute Nodes", - "type": "stat" - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "description": "Total package power usage, in Watts", - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - } - ] - }, - "unit": "watt" - }, - "overrides": [] - }, - "gridPos": { - "h": 4, - "w": 5, - "x": 3, - "y": 4 - }, - "id": 9, - "maxDataPoints": 60, - "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "auto", - "orientation": "auto", - "percentChangeColorMode": "standard", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "showPercentChange": false, - "textMode": "auto", - "wideLayout": true - }, - "pluginVersion": "11.2.2", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "disableTextWrap": false, - "editorMode": "builder", - "expr": "sum(${g_metrics_prefix}gpu_average_package_power{job_id!=\"\", job_id=\"$g_job_id\"})", - "fullMetaSearch": false, - "hide": true, - "includeNullMetadata": true, - "instant": false, - "legendFormat": "Average Job Package Power Usage", - "range": true, - "refId": "A", - "useBackend": false - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "disableTextWrap": false, - "editorMode": "builder", - "expr": "sum(${g_metrics_prefix}gpu_package_power{job_id!=\"\", job_id=\"$g_job_id\"})", - "fullMetaSearch": false, - "hide": true, - "includeNullMetadata": true, - "instant": false, - "legendFormat": "Job Package Power Usage", - "range": true, - "refId": "B", - "useBackend": false - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "disableTextWrap": false, - "editorMode": "builder", - "expr": "sum(${g_metrics_prefix}gpu_average_package_power{pod!=\"\", pod=\"$g_pod\"})", - "fullMetaSearch": false, - "hide": true, - "includeNullMetadata": true, - "instant": false, - "legendFormat": "Average Pod Package Power Usage", - "range": true, - "refId": "C", - "useBackend": false - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "disableTextWrap": false, - "editorMode": "builder", - "expr": "sum(${g_metrics_prefix}gpu_package_power{pod!=\"\", pod=\"$g_pod\"})", - "fullMetaSearch": false, - "hide": true, - "includeNullMetadata": true, - "instant": false, - "legendFormat": "Pod Package Power Usage", - "range": true, - "refId": "D", - "useBackend": false - }, - { - "datasource": { - "type": "__expr__", - "uid": "${DS_EXPRESSION}" - }, - "expression": "$A+$B", - "hide": false, - "refId": "Total Job Package Power Usage", - "type": "math" - }, - { - "datasource": { - "type": "__expr__", - "uid": "${DS_EXPRESSION}" - }, - "expression": "$C+$D", - "hide": false, - "refId": "Total Pod Package Power Usage", - "type": "math" - } - ], - "title": "Total Power Usage", - "type": "stat" - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "description": "Total PCIe bandwidth over time", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisBorderShow": false, - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "barWidthFactor": 0.6, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "insertNulls": false, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "displayName": "Total PCIe Bandwidth", - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "MBs" - }, - "overrides": [] - }, - "gridPos": { - "h": 6, - "w": 8, - "x": 8, - "y": 6 - }, - "id": 14, - "maxDataPoints": 60, - "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "single", - "sort": "none" - } - }, - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "disableTextWrap": false, - "editorMode": "builder", - "expr": "sum(${g_metrics_prefix}pcie_bandwidth{job_id!=\"\", job_id=\"$g_job_id\"})", - "fullMetaSearch": false, - "includeNullMetadata": true, - "instant": false, - "legendFormat": "__auto", - "range": true, - "refId": "A", - "useBackend": false - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "disableTextWrap": false, - "editorMode": "builder", - "expr": "sum(${g_metrics_prefix}pcie_bandwidth{pod!=\"\", pod=\"$g_pod\"})", - "fullMetaSearch": false, - "hide": false, - "includeNullMetadata": true, - "instant": false, - "legendFormat": "__auto", - "range": true, - "refId": "B", - "useBackend": false - } - ], - "title": "Total PCIe Bandwidth", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "description": "\\# of GPUs allocated by the job", - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "links": [ - { - "targetBlank": true, - "title": "Go to GPU Dashboard", - "url": "/d/ae0aj8euc43r4b/gpu" - } - ], - "mappings": [], - "noValue": "0", - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - } - ] - } - }, - "overrides": [ - { - "matcher": { - "id": "byFrameRefID", - "options": "Unhealthy (jobs)" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "red", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byFrameRefID", - "options": "Unhealthy (pods)" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "red", - "mode": "fixed" - } - } - ] - } - ] - }, - "gridPos": { - "h": 4, - "w": 3, - "x": 0, - "y": 8 - }, - "id": 3, - "options": { - "colorMode": "value", - "graphMode": "none", - "justifyMode": "auto", - "orientation": "auto", - "percentChangeColorMode": "standard", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "showPercentChange": false, - "text": { - "titleSize": 9 - }, - "textMode": "auto", - "wideLayout": true - }, - "pluginVersion": "11.2.2", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "disableTextWrap": false, - "editorMode": "builder", - "exemplar": false, - "expr": "sum(group by(gpu_uuid) (${g_metrics_prefix}gpu_gfx_activity{job_id!=\"\", job_id=\"$g_job_id\"}))", - "fullMetaSearch": false, - "hide": false, - "includeNullMetadata": true, - "instant": true, - "legendFormat": "Allocated by Jobs", - "range": false, - "refId": "A", - "useBackend": false - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "disableTextWrap": false, - "editorMode": "builder", - "exemplar": false, - "expr": "sum(group by(gpu_uuid) (${g_metrics_prefix}gpu_gfx_activity{pod!=\"\", pod=\"$g_pod\"}))", - "fullMetaSearch": false, - "hide": false, - "includeNullMetadata": true, - "instant": true, - "legendFormat": "Allocated by Pods", - "range": false, - "refId": "B", - "useBackend": false - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "editorMode": "code", - "exemplar": false, - "expr": "sum(group by(gpu_id) (${g_metrics_prefix}gpu_gfx_activity{job_id!=\"\", job_id=\"$g_job_id\"} > 0))", - "hide": false, - "instant": true, - "legendFormat": "Busy GPUs", - "range": false, - "refId": "C" - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "editorMode": "code", - "exemplar": false, - "expr": "sum(group by(gpu_id) (${g_metrics_prefix}gpu_gfx_activity{pod!=\"\", pod=\"$g_pod\"} > 0))", - "hide": false, - "instant": true, - "legendFormat": "Busy GPUs", - "range": false, - "refId": "D" - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "editorMode": "code", - "exemplar": false, - "expr": "sum(group by(gpu_uuid) (${g_metrics_prefix}gpu_health{job_id!=\"\", job_id=\"$g_job_id\"} < 1))", - "hide": false, - "instant": true, - "legendFormat": "Unhealthy GPUs", - "range": false, - "refId": "Unhealthy (jobs)" - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "editorMode": "code", - "exemplar": false, - "expr": "sum(group by(gpu_uuid) (${g_metrics_prefix}gpu_health{pod!=\"\", pod=\"$g_pod\"} < 1))", - "hide": false, - "instant": true, - "legendFormat": "Unhealthy GPUs", - "range": false, - "refId": "Unhealthy (pods)" - } - ], - "title": "Allocated GPUs", - "type": "stat" - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "description": "", - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [], - "noValue": "0", - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - } - }, - "overrides": [] - }, - "gridPos": { - "h": 4, - "w": 5, - "x": 3, - "y": 8 - }, - "id": 18, - "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "auto", - "orientation": "horizontal", - "percentChangeColorMode": "standard", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "showPercentChange": false, - "textMode": "auto", - "wideLayout": true - }, - "pluginVersion": "11.2.2", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "disableTextWrap": false, - "editorMode": "code", - "exemplar": false, - "expr": "sum(delta(${g_metrics_prefix}gpu_ecc_correct_total{job_id!=\"\", job_id=\"$g_job_id\"}[$__interval]))", - "fullMetaSearch": false, - "includeNullMetadata": true, - "instant": true, - "legendFormat": "Correctable", - "range": false, - "refId": "A", - "useBackend": false - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "disableTextWrap": false, - "editorMode": "code", - "exemplar": false, - "expr": "sum(delta(${g_metrics_prefix}gpu_ecc_correct_total{pod!=\"\", pod=\"$g_pod\"}[$__interval]))", - "fullMetaSearch": false, - "hide": false, - "includeNullMetadata": true, - "instant": true, - "legendFormat": "Correctable", - "range": false, - "refId": "B", - "useBackend": false - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "disableTextWrap": false, - "editorMode": "code", - "exemplar": false, - "expr": "sum(delta(${g_metrics_prefix}gpu_ecc_uncorrect_total{job_id!=\"\", job_id=\"$g_job_id\"}[$__interval]))", - "fullMetaSearch": false, - "hide": false, - "includeNullMetadata": true, - "instant": true, - "legendFormat": "Uncorrectable", - "range": false, - "refId": "C", - "useBackend": false - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "disableTextWrap": false, - "editorMode": "code", - "exemplar": false, - "expr": "sum(delta(${g_metrics_prefix}gpu_ecc_uncorrect_total{pod!=\"\", pod=\"$g_pod\"}[$__interval]))", - "fullMetaSearch": false, - "hide": false, - "includeNullMetadata": true, - "instant": true, - "legendFormat": "Uncorrectable", - "range": false, - "refId": "D", - "useBackend": false - } - ], - "title": "Total ECC Counts", - "type": "stat" - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "description": "Average of current GFX activity", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisBorderShow": false, - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "barWidthFactor": 0.6, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "insertNulls": false, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "displayName": "GPU Usage", - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - } - }, - "overrides": [] - }, - "gridPos": { - "h": 6, - "w": 8, - "x": 0, - "y": 12 - }, - "id": 1, - "maxDataPoints": 60, - "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "single", - "sort": "none" - } - }, - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "disableTextWrap": false, - "editorMode": "builder", - "expr": "avg(${g_metrics_prefix}gpu_gfx_activity{job_id!=\"\", job_id=\"$g_job_id\"})", - "fullMetaSearch": false, - "includeNullMetadata": true, - "instant": false, - "legendFormat": "__auto", - "range": true, - "refId": "A", - "useBackend": false - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "disableTextWrap": false, - "editorMode": "builder", - "expr": "avg(${g_metrics_prefix}gpu_gfx_activity{pod!=\"\", pod=\"$g_pod\"})", - "fullMetaSearch": false, - "hide": false, - "includeNullMetadata": true, - "instant": false, - "legendFormat": "__auto", - "range": true, - "refId": "B", - "useBackend": false - } - ], - "title": "Average GPU Usage", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "description": "% of used VRAM across GPUs the job or pod is running on", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisBorderShow": false, - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "barWidthFactor": 0.6, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "insertNulls": false, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [], - "noValue": "0", - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "percentunit" - }, - "overrides": [] - }, - "gridPos": { - "h": 6, - "w": 8, - "x": 8, - "y": 12 - }, - "id": 5, - "maxDataPoints": 60, - "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "single", - "sort": "none" - } - }, - "pluginVersion": "11.2.0", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "disableTextWrap": false, - "editorMode": "builder", - "expr": "sum(${g_metrics_prefix}gpu_used_vram{job_id!=\"\", job_id=\"$g_job_id\"})", - "fullMetaSearch": false, - "hide": true, - "includeNullMetadata": true, - "instant": false, - "legendFormat": "__auto", - "range": true, - "refId": "A", - "useBackend": false - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "disableTextWrap": false, - "editorMode": "builder", - "expr": "sum(${g_metrics_prefix}gpu_total_vram{job_id!=\"\", job_id=\"$g_job_id\"})", - "fullMetaSearch": false, - "hide": true, - "includeNullMetadata": true, - "instant": false, - "legendFormat": "__auto", - "range": true, - "refId": "B", - "useBackend": false - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "disableTextWrap": false, - "editorMode": "builder", - "expr": "sum(${g_metrics_prefix}gpu_used_vram{pod!=\"\", pod=\"$g_pod\"})", - "fullMetaSearch": false, - "hide": true, - "includeNullMetadata": true, - "instant": false, - "legendFormat": "__auto", - "range": true, - "refId": "C", - "useBackend": false - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "disableTextWrap": false, - "editorMode": "builder", - "expr": "sum(${g_metrics_prefix}gpu_total_vram{pod!=\"\", pod=\"$g_pod\"})", - "fullMetaSearch": false, - "hide": true, - "includeNullMetadata": true, - "instant": false, - "legendFormat": "__auto", - "range": true, - "refId": "D", - "useBackend": false - }, - { - "datasource": { - "type": "__expr__", - "uid": "${DS_EXPRESSION}" - }, - "expression": "$A/$B", - "hide": false, - "refId": "Job Memory Usage", - "type": "math" - }, - { - "datasource": { - "type": "__expr__", - "uid": "${DS_EXPRESSION}" - }, - "expression": "$C/$D", - "hide": false, - "refId": "Pod Memory Usage", - "type": "math" - } - ], - "title": "Used VRAM", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "description": "Average GPU package power", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisBorderShow": false, - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "barWidthFactor": 0.6, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "insertNulls": false, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "displayName": "GPU Power", - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "watt" - }, - "overrides": [] - }, - "gridPos": { - "h": 6, - "w": 8, - "x": 16, - "y": 12 - }, - "id": 7, - "maxDataPoints": 60, - "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "single", - "sort": "none" - } - }, - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "disableTextWrap": false, - "editorMode": "builder", - "expr": "avg(${g_metrics_prefix}gpu_package_power{job_id!=\"\", job_id=\"$g_job_id\", card_model=~\"102-G30211-00|102-G30211-0C|102-G30211-4C|102-G30212-0C|102-G30213-00|102-G30213-0C\"})", - "fullMetaSearch": false, - "includeNullMetadata": true, - "instant": false, - "legendFormat": "__auto", - "range": true, - "refId": "A", - "useBackend": false - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "disableTextWrap": false, - "editorMode": "builder", - "expr": "avg(${g_metrics_prefix}gpu_package_power{pod!=\"\", pod=\"$g_pod\", card_model=~\"102-G30211-00|102-G30211-0C|102-G30211-4C|102-G30212-0C|102-G30213-00|102-G30213-0C\"})", - "fullMetaSearch": false, - "hide": false, - "includeNullMetadata": true, - "instant": false, - "legendFormat": "__auto", - "range": true, - "refId": "B", - "useBackend": false - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "editorMode": "builder", - "expr": "avg(${g_metrics_prefix}gpu_average_package_power{job_id!=\"\", job_id=\"$g_job_id\", card_model!~\"102-G30211-00|102-G30211-0C|102-G30211-4C|102-G30212-0C|102-G30213-00|102-G30213-0C\"})", - "hide": false, - "instant": false, - "legendFormat": "__auto", - "range": true, - "refId": "C" - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "editorMode": "builder", - "expr": "avg(${g_metrics_prefix}gpu_average_package_power{pod!=\"\", pod=\"$g_pod\", card_model!~\"102-G30211-00|102-G30211-0C|102-G30211-4C|102-G30212-0C|102-G30213-00|102-G30213-0C\"})", - "hide": false, - "instant": false, - "legendFormat": "__auto", - "range": true, - "refId": "D" - } - ], - "title": "Average GPU Power", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "description": "Average of temperature of currently used GPUs", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisBorderShow": false, - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "barWidthFactor": 0.6, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "insertNulls": false, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "displayName": "GPU Temperature", - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "celsius" - }, - "overrides": [] - }, - "gridPos": { - "h": 6, - "w": 8, - "x": 0, - "y": 18 - }, - "id": 4, - "maxDataPoints": 60, - "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "single", - "sort": "none" - } - }, - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "disableTextWrap": false, - "editorMode": "builder", - "expr": "avg(${g_metrics_prefix}gpu_edge_temperature{job_id!=\"\", job_id=\"$g_job_id\", card_model!~\"102-G30211-00|102-G30211-0C|102-G30211-4C|102-G30212-0C|102-G30213-00|102-G30213-0C\"})", - "fullMetaSearch": false, - "includeNullMetadata": true, - "instant": false, - "legendFormat": "__auto", - "range": true, - "refId": "A", - "useBackend": false - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "disableTextWrap": false, - "editorMode": "builder", - "expr": "avg(${g_metrics_prefix}gpu_edge_temperature{pod!=\"\", pod=\"$g_pod\", card_model!~\"102-G30211-00|102-G30211-0C|102-G30211-4C|102-G30212-0C|102-G30213-00|102-G30213-0C\"})", - "fullMetaSearch": false, - "hide": false, - "includeNullMetadata": true, - "instant": false, - "legendFormat": "__auto", - "range": true, - "refId": "B", - "useBackend": false - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "disableTextWrap": false, - "editorMode": "builder", - "expr": "avg(${g_metrics_prefix}gpu_junction_temperature{job_id!=\"\", job_id=\"$g_job_id\", card_model=~\"102-G30211-00|102-G30211-0C|102-G30211-4C|102-G30212-0C|102-G30213-00|102-G30213-0C\"})", - "fullMetaSearch": false, - "hide": false, - "includeNullMetadata": true, - "instant": false, - "legendFormat": "__auto", - "range": true, - "refId": "C", - "useBackend": false - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "disableTextWrap": false, - "editorMode": "builder", - "expr": "avg(${g_metrics_prefix}gpu_junction_temperature{pod!=\"\", pod=\"$g_pod\", card_model=~\"102-G30211-00|102-G30211-0C|102-G30211-4C|102-G30212-0C|102-G30213-00|102-G30213-0C\"})", - "fullMetaSearch": false, - "hide": false, - "includeNullMetadata": true, - "instant": false, - "legendFormat": "__auto", - "range": true, - "refId": "D", - "useBackend": false - } - ], - "title": "Average GPU Temperature", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "description": "Average memory temperature of currently used GPUs", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisBorderShow": false, - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "barWidthFactor": 0.6, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "insertNulls": false, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "displayName": "Memory Temperature", - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "celsius" - }, - "overrides": [] - }, - "gridPos": { - "h": 6, - "w": 8, - "x": 8, - "y": 18 - }, - "id": 6, - "maxDataPoints": 60, - "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "single", - "sort": "none" - } - }, - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "disableTextWrap": false, - "editorMode": "builder", - "expr": "avg(${g_metrics_prefix}gpu_memory_temperature{job_id!=\"\", job_id=\"$g_job_id\"})", - "fullMetaSearch": false, - "includeNullMetadata": true, - "instant": false, - "legendFormat": "__auto", - "range": true, - "refId": "A", - "useBackend": false - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "disableTextWrap": false, - "editorMode": "builder", - "expr": "avg(${g_metrics_prefix}gpu_memory_temperature{pod!=\"\", pod=\"$g_pod\"})", - "fullMetaSearch": false, - "hide": false, - "includeNullMetadata": true, - "instant": false, - "legendFormat": "__auto", - "range": true, - "refId": "B", - "useBackend": false - } - ], - "title": "Average Memory Temperature", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "description": "Averages of current temperatures across GPUs used by the job or pod, in Celsius:\n- 4 HBM temperatures\n- Edge temperature\n- Junction/hotspot temperature", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisBorderShow": false, - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "barWidthFactor": 0.6, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "insertNulls": false, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - } - }, - "overrides": [] - }, - "gridPos": { - "h": 6, - "w": 8, - "x": 16, - "y": 18 - }, - "id": 8, - "maxDataPoints": 60, - "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "single", - "sort": "none" - } - }, - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "disableTextWrap": false, - "editorMode": "builder", - "expr": "avg(${g_metrics_prefix}gpu_hbm_temperature{job_id!=\"\", job_id=\"$g_job_id\"})", - "fullMetaSearch": false, - "includeNullMetadata": true, - "instant": false, - "legendFormat": "HBM Temperature", - "range": true, - "refId": "A", - "useBackend": false - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "disableTextWrap": false, - "editorMode": "builder", - "expr": "avg(${g_metrics_prefix}gpu_edge_temperature{job_id!=\"\", job_id=\"$g_job_id\"})", - "fullMetaSearch": false, - "hide": false, - "includeNullMetadata": true, - "instant": false, - "legendFormat": "Edge Temperature", - "range": true, - "refId": "B", - "useBackend": false - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "disableTextWrap": false, - "editorMode": "builder", - "expr": "avg(${g_metrics_prefix}gpu_junction_temperature{job_id!=\"\", job_id=\"$g_job_id\"})", - "fullMetaSearch": false, - "hide": false, - "includeNullMetadata": true, - "instant": false, - "legendFormat": "Junction Temperature", - "range": true, - "refId": "C", - "useBackend": false - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "disableTextWrap": false, - "editorMode": "builder", - "expr": "avg(${g_metrics_prefix}gpu_hbm_temperature{pod!=\"\", pod=\"$g_pod\"})", - "fullMetaSearch": false, - "hide": false, - "includeNullMetadata": true, - "instant": false, - "legendFormat": "HBM Temperature", - "range": true, - "refId": "D", - "useBackend": false - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "disableTextWrap": false, - "editorMode": "builder", - "expr": "avg(${g_metrics_prefix}gpu_edge_temperature{pod!=\"\", pod=\"$g_pod\"})", - "fullMetaSearch": false, - "hide": false, - "includeNullMetadata": true, - "instant": false, - "legendFormat": "Edge Temperature", - "range": true, - "refId": "E", - "useBackend": false - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "disableTextWrap": false, - "editorMode": "builder", - "expr": "avg(${g_metrics_prefix}gpu_junction_temperature{pod!=\"\", pod=\"$g_pod\"})", - "fullMetaSearch": false, - "hide": false, - "includeNullMetadata": true, - "instant": false, - "legendFormat": "Junction Temperature", - "range": true, - "refId": "F", - "useBackend": false - } - ], - "title": "Average Sensor Temperatures", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "description": "List of all GPUs used by this job. Health is the last known status of the GPU during a running job in the selected time range.", - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "custom": { - "align": "auto", - "cellOptions": { - "type": "auto" - }, - "filterable": true, - "inspect": false - }, - "links": [], - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - } - ] - } - }, - "overrides": [ - { - "matcher": { - "id": "byName", - "options": "HOSTNAME" - }, - "properties": [ - { - "id": "links", - "value": [ - { - "targetBlank": true, - "title": "Go to Compute Node Dashboard", - "url": "/d/de1q9vq97fe2oc/compute-node?var-g_hostname=${__data.fields.HOSTNAME}" - } - ] - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "GPU ID" - }, - "properties": [ - { - "id": "custom.width", - "value": 100 - }, - { - "id": "links", - "value": [ - { - "targetBlank": true, - "title": "Go to GPU Dashboard", - "url": "/d/ae0aj8euc43r4b/gpu?var-g_gpu_uuid=${__data.fields[\"gpu_uuid\"]}&var-g_hostname=${__data.fields.HOSTNAME}&var-g_gpu_id=${__data.fields[\"GPU ID\"]}" - } - ] - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "GPU UUID" - }, - "properties": [ - { - "id": "custom.width", - "value": 320 - }, - { - "id": "links", - "value": [ - { - "targetBlank": true, - "title": "Go to GPU Dashboard", - "url": "/d/ae0aj8euc43r4b/gpu?var-g_gpu_uuid=${__data.fields[\"gpu_uuid\"]}&var-g_hostname=${__data.fields.HOSTNAME}&var-g_gpu_id=${__data.fields[\"GPU ID\"]}" - } - ] - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "HEALTH" - }, - "properties": [ - { - "id": "custom.width", - "value": 105 - }, - { - "id": "mappings", - "value": [ - { - "options": { - "0": { - "color": "red", - "index": 0, - "text": "unhealthy" - }, - "1": { - "color": "green", - "index": 1, - "text": "healthy" - } - }, - "type": "value" - } - ] - }, - { - "id": "custom.cellOptions", - "value": { - "type": "color-text" - } - } - ] - } - ] - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 24 - }, - "id": 19, - "options": { - "cellHeight": "sm", - "footer": { - "countRows": false, - "enablePagination": true, - "fields": "", - "reducer": [ - "sum" - ], - "show": false - }, - "showHeader": true, - "sortBy": [ - { - "desc": false, - "displayName": "HEALTH" - }, - { - "desc": false, - "displayName": "HOSTNAME" - }, - { - "desc": false, - "displayName": "GPU ID" - } - ] - }, - "pluginVersion": "11.2.2", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "editorMode": "code", - "expr": "${g_metrics_prefix}gpu_health{job_id!=\"\", job_id=\"$g_job_id\"} or vector(0)", - "instant": false, - "legendFormat": "__auto", - "range": true, - "refId": "A" - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "editorMode": "code", - "expr": "${g_metrics_prefix}gpu_health{pod!=\"\", pod=\"$g_pod\"} or vector(0)", - "hide": false, - "instant": false, - "legendFormat": "__auto", - "range": true, - "refId": "B" - } - ], - "title": "All GPUs", - "transformations": [ - { - "id": "reduce", - "options": { - "labelsToFields": true, - "reducers": [ - "last" - ] - } - }, - { - "id": "groupBy", - "options": { - "fields": { - "Last": { - "aggregations": [ - "last" - ], - "operation": "aggregate" - }, - "gpu_id": { - "aggregations": [ - "last" - ], - "operation": "aggregate" - }, - "gpu_uuid": { - "aggregations": [], - "operation": "groupby" - }, - "hostname": { - "aggregations": [ - "last" - ], - "operation": "aggregate" - } - } - } - }, - { - "disabled": true, - "id": "filterFieldsByName", - "options": {} - }, - { - "id": "organize", - "options": { - "excludeByName": {}, - "includeByName": {}, - "indexByName": { - "gpu_id (last)": 1, - "gpu_uuid": 2, - "hostname (last)": 0 - }, - "renameByName": { - "Last (last)": "HEALTH", - "gpu_id (last)": "GPU ID", - "gpu_uuid": "GPU UUID", - "hostname (last)": "HOSTNAME" - } - } - }, - { - "id": "filterByValue", - "options": { - "filters": [ - { - "config": { - "id": "isNull", - "options": {} - }, - "fieldName": "GPU UUID" - } - ], - "match": "any", - "type": "exclude" - } - } - ], - "type": "table" - }, - { - "gridPos": { - "h": 2, - "w": 2, - "x": 22, - "y": 24 - }, - "id": 20, - "options": { - "code": { - "language": "plaintext", - "showLineNumbers": false, - "showMiniMap": false - }, - "content": "main", - "mode": "markdown" - }, - "pluginVersion": "11.2.2", - "title": "Version", - "type": "text" - } - ], - "schemaVersion": 39, - "tags": [], - "templating": { - "list": [ - { - "current": { - "selected": false, - "text": "", - "value": "" - }, - "description": "string to prefix names of metrics queries (e.g. gpu_gfx_activity -> amd_gpu_gfx_activity)", - "hide": 2, - "label": "Metrics Prefix", - "name": "g_metrics_prefix", - "options": [ - { - "selected": true, - "text": "", - "value": "" - } - ], - "query": "", - "skipUrlSync": false, - "type": "textbox" - }, - { - "current": {}, - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "definition": "label_values(cluster_name)", - "hide": 0, - "includeAll": false, - "label": "Cluster", - "multi": false, - "name": "g_cluster_name", - "options": [], - "query": { - "qryType": 1, - "query": "label_values(cluster_name)", - "refId": "PrometheusVariableQueryEditor-VariableQuery" - }, - "refresh": 2, - "regex": "", - "skipUrlSync": false, - "sort": 7, - "type": "query" - }, - { - "allValue": "+", - "current": {}, - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "definition": "label_values({cluster_name=\"$g_cluster_name\"},job_id)", - "hide": 0, - "includeAll": false, - "label": "Job", - "multi": false, - "name": "g_job_id", - "options": [], - "query": { - "qryType": 1, - "query": "label_values({cluster_name=\"$g_cluster_name\"},job_id)", - "refId": "PrometheusVariableQueryEditor-VariableQuery" - }, - "refresh": 2, - "regex": "", - "skipUrlSync": false, - "sort": 7, - "type": "query" - }, - { - "allValue": "+", - "current": {}, - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "definition": "label_values({cluster_name=\"$g_cluster_name\"},pod)", - "hide": 0, - "includeAll": false, - "label": "Pod", - "multi": false, - "name": "g_pod", - "options": [], - "query": { - "qryType": 1, - "query": "label_values({cluster_name=\"$g_cluster_name\"},pod)", - "refId": "PrometheusVariableQueryEditor-VariableQuery" - }, - "refresh": 2, - "regex": "", - "skipUrlSync": false, - "sort": 7, - "type": "query" - } - ] - }, - "time": { - "from": "now-6h", - "to": "now" - }, - "timepicker": {}, - "timezone": "browser", - "title": "Job", - "uid": "ce1x81pyv3dvkb", - "version": 1, - "weekStart": "" -} \ No newline at end of file diff --git a/grafana/dashboard_node.json b/grafana/dashboard_node.json deleted file mode 100644 index 3b46ac87..00000000 --- a/grafana/dashboard_node.json +++ /dev/null @@ -1,2440 +0,0 @@ -{ - "__inputs": [ - { - "name": "DS_PROMETHEUS", - "label": "Prometheus", - "description": "", - "type": "datasource", - "pluginId": "prometheus", - "pluginName": "Prometheus" - }, - { - "name": "DS_EXPRESSION", - "label": "Expression", - "description": "", - "type": "datasource", - "pluginId": "__expr__" - } - ], - "__elements": {}, - "__requires": [ - { - "type": "datasource", - "id": "__expr__", - "version": "1.0.0" - }, - { - "type": "grafana", - "id": "grafana", - "name": "Grafana", - "version": "11.2.2" - }, - { - "type": "datasource", - "id": "prometheus", - "name": "Prometheus", - "version": "1.0.0" - }, - { - "type": "panel", - "id": "stat", - "name": "Stat", - "version": "" - }, - { - "type": "panel", - "id": "table", - "name": "Table", - "version": "" - }, - { - "type": "panel", - "id": "text", - "name": "Text", - "version": "" - }, - { - "type": "panel", - "id": "timeseries", - "name": "Time series", - "version": "" - } - ], - "annotations": { - "list": [ - { - "builtIn": 1, - "datasource": { - "type": "grafana", - "uid": "-- Grafana --" - }, - "enable": true, - "hide": true, - "iconColor": "rgba(0, 211, 255, 1)", - "name": "Annotations & Alerts", - "type": "dashboard" - } - ] - }, - "editable": true, - "fiscalYearStartMonth": 0, - "graphTooltip": 0, - "id": null, - "links": [], - "panels": [ - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "description": "Number of GPUs on the compute node", - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "links": [ - { - "targetBlank": true, - "title": "Go to GPU Dashboard", - "url": "/d/ae0aj8euc43r4b/gpu" - } - ], - "mappings": [], - "noValue": "0", - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - } - ] - } - }, - "overrides": [] - }, - "gridPos": { - "h": 3, - "w": 2, - "x": 0, - "y": 0 - }, - "id": 1, - "options": { - "colorMode": "value", - "graphMode": "none", - "justifyMode": "auto", - "orientation": "auto", - "percentChangeColorMode": "standard", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "showPercentChange": false, - "textMode": "auto", - "wideLayout": true - }, - "pluginVersion": "11.2.2", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "disableTextWrap": false, - "editorMode": "builder", - "exemplar": false, - "expr": "sum(group by(gpu_id) (${g_metrics_prefix}gpu_gfx_activity{hostname=\"$g_hostname\"}))", - "fullMetaSearch": false, - "includeNullMetadata": true, - "instant": false, - "legendFormat": "# of GPUs", - "range": true, - "refId": "A", - "useBackend": false - } - ], - "title": "GPUs", - "type": "stat" - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "description": "Number of jobs running on this compute node", - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "links": [ - { - "targetBlank": true, - "title": "Go to Job Dashboard", - "url": "/d/ce1x81pyv3dvkb/job" - } - ], - "mappings": [], - "noValue": "0", - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - } - ] - } - }, - "overrides": [] - }, - "gridPos": { - "h": 3, - "w": 2, - "x": 2, - "y": 0 - }, - "id": 8, - "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "auto", - "orientation": "auto", - "percentChangeColorMode": "standard", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "showPercentChange": false, - "textMode": "auto", - "wideLayout": true - }, - "pluginVersion": "11.2.2", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "disableTextWrap": false, - "editorMode": "code", - "exemplar": false, - "expr": "sum(group by(job_id) (${g_metrics_prefix}gpu_gfx_activity{hostname=\"$g_hostname\", job_id!=\"\"}))", - "fullMetaSearch": false, - "includeNullMetadata": true, - "instant": true, - "legendFormat": "Jobs", - "range": false, - "refId": "A", - "useBackend": false - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "editorMode": "code", - "exemplar": false, - "expr": "sum(group by(pod) (${g_metrics_prefix}gpu_gfx_activity{hostname=\"$g_hostname\", pod!=\"\"}))", - "hide": false, - "instant": true, - "legendFormat": "Jobs (Pods)", - "range": false, - "refId": "B" - } - ], - "title": "Jobs", - "type": "stat" - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "description": "\\# of GPUs allocated by jobs", - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "links": [ - { - "targetBlank": true, - "title": "Go to GPU Dashboard", - "url": "/d/ae0aj8euc43r4b/gpu" - } - ], - "mappings": [], - "noValue": "0", - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - } - ] - } - }, - "overrides": [ - { - "matcher": { - "id": "byFrameRefID", - "options": "Unhealthy" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "red", - "mode": "fixed" - } - } - ] - } - ] - }, - "gridPos": { - "h": 8, - "w": 4, - "x": 4, - "y": 0 - }, - "id": 19, - "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "auto", - "orientation": "vertical", - "percentChangeColorMode": "standard", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "showPercentChange": false, - "text": { - "titleSize": 13 - }, - "textMode": "auto", - "wideLayout": true - }, - "pluginVersion": "11.2.2", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "disableTextWrap": false, - "editorMode": "builder", - "exemplar": false, - "expr": "sum(group by(gpu_id) (${g_metrics_prefix}gpu_gfx_activity{hostname=\"$g_hostname\", job_id!=\"\"}))", - "fullMetaSearch": false, - "hide": false, - "includeNullMetadata": true, - "instant": true, - "legendFormat": "Allocated by Jobs", - "range": false, - "refId": "A", - "useBackend": false - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "disableTextWrap": false, - "editorMode": "builder", - "exemplar": false, - "expr": "sum(group by(gpu_id) (${g_metrics_prefix}gpu_gfx_activity{hostname=\"$g_hostname\", pod!=\"\"}))", - "fullMetaSearch": false, - "hide": false, - "includeNullMetadata": true, - "instant": true, - "legendFormat": "Allocated by Pods", - "range": false, - "refId": "B", - "useBackend": false - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "disableTextWrap": false, - "editorMode": "code", - "exemplar": false, - "expr": "sum(group by(gpu_id) (${g_metrics_prefix}gpu_gfx_activity{hostname=\"$g_hostname\"} > 0))", - "fullMetaSearch": false, - "hide": false, - "includeNullMetadata": true, - "instant": true, - "legendFormat": "Busy GPUs", - "range": false, - "refId": "C", - "useBackend": false - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "editorMode": "code", - "exemplar": false, - "expr": "count(${g_metrics_prefix}gpu_health{hostname=\"$g_hostname\"} < 1)", - "hide": false, - "instant": true, - "legendFormat": "Unhealthy GPUs", - "range": false, - "refId": "Unhealthy" - } - ], - "title": "Allocated GPUs", - "type": "stat" - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "description": "Current maximum PCIe speed", - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [], - "min": -5, - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "MBs" - }, - "overrides": [] - }, - "gridPos": { - "h": 3, - "w": 4, - "x": 8, - "y": 0 - }, - "id": 12, - "maxDataPoints": 60, - "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "auto", - "orientation": "auto", - "percentChangeColorMode": "standard", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "showPercentChange": false, - "textMode": "auto", - "wideLayout": true - }, - "pluginVersion": "11.2.2", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "disableTextWrap": false, - "editorMode": "builder", - "expr": "${g_metrics_prefix}pcie_max_speed{hostname=\"$g_hostname\"}", - "fullMetaSearch": false, - "includeNullMetadata": true, - "legendFormat": "__auto", - "range": true, - "refId": "A", - "useBackend": false - } - ], - "title": "PCIe Max Speed", - "transformations": [ - { - "id": "calculateField", - "options": { - "alias": "PCIe Max Speed", - "mode": "reduceRow", - "reduce": { - "reducer": "last" - }, - "replaceFields": true - } - } - ], - "type": "stat" - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - } - }, - "overrides": [] - }, - "gridPos": { - "h": 6, - "w": 4, - "x": 12, - "y": 0 - }, - "id": 15, - "options": { - "colorMode": "value", - "graphMode": "none", - "justifyMode": "auto", - "orientation": "auto", - "percentChangeColorMode": "standard", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "showPercentChange": false, - "textMode": "auto", - "wideLayout": true - }, - "pluginVersion": "11.2.2", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "disableTextWrap": false, - "editorMode": "builder", - "exemplar": false, - "expr": "sum(delta(${g_metrics_prefix}pcie_recovery_count{hostname=\"$g_hostname\"}[$__interval]))", - "fullMetaSearch": false, - "includeNullMetadata": true, - "instant": true, - "legendFormat": "Recovery", - "range": false, - "refId": "A", - "useBackend": false - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "disableTextWrap": false, - "editorMode": "builder", - "exemplar": false, - "expr": "sum(delta(${g_metrics_prefix}pcie_replay_count{hostname=\"$g_hostname\"}[$__interval]))", - "fullMetaSearch": false, - "hide": false, - "includeNullMetadata": true, - "instant": true, - "legendFormat": "Replay", - "range": false, - "refId": "B", - "useBackend": false - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "disableTextWrap": false, - "editorMode": "builder", - "exemplar": false, - "expr": "sum(delta(${g_metrics_prefix}pcie_replay_rollover_count{hostname=\"$g_hostname\"}[$__interval]))", - "fullMetaSearch": false, - "hide": false, - "includeNullMetadata": true, - "instant": true, - "legendFormat": "Replay Rollover", - "range": false, - "refId": "C", - "useBackend": false - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "disableTextWrap": false, - "editorMode": "builder", - "exemplar": false, - "expr": "sum(delta(${g_metrics_prefix}pcie_nack_received_count{hostname=\"$g_hostname\"}[$__interval]))", - "fullMetaSearch": false, - "hide": false, - "includeNullMetadata": true, - "instant": true, - "legendFormat": "NACK Received", - "range": false, - "refId": "D", - "useBackend": false - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "disableTextWrap": false, - "editorMode": "builder", - "exemplar": false, - "expr": "sum(delta(${g_metrics_prefix}pcie_nack_sent_count{hostname=\"$g_hostname\"}[$__interval]))", - "fullMetaSearch": false, - "hide": false, - "includeNullMetadata": true, - "instant": true, - "legendFormat": "NACK Sent", - "range": false, - "refId": "E", - "useBackend": false - } - ], - "title": "PCIe Counts", - "type": "stat" - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "description": "Top 5 jobs by GPU usage, listed by the index of the GPU the job is running on and the pod/job ID", - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "links": [ - { - "targetBlank": true, - "title": "Go to Job Dashboard", - "url": "/d/ce1x81pyv3dvkb/job?var-g_job_id=${__field.labels.job_id}&var-g_pod=${__field.labels.pod}" - } - ], - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "percent" - }, - "overrides": [] - }, - "gridPos": { - "h": 12, - "w": 4, - "x": 16, - "y": 0 - }, - "id": 16, - "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "auto", - "orientation": "horizontal", - "percentChangeColorMode": "standard", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "showPercentChange": false, - "textMode": "auto", - "wideLayout": true - }, - "pluginVersion": "11.2.2", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "disableTextWrap": false, - "editorMode": "builder", - "exemplar": false, - "expr": "topk(5, ${g_metrics_prefix}gpu_gfx_activity{hostname=\"$g_hostname\"})", - "fullMetaSearch": false, - "includeNullMetadata": true, - "instant": true, - "legendFormat": "[GPU: {{gpu_id}}] {{pod}}{{job_id}}", - "range": false, - "refId": "A", - "useBackend": false - } - ], - "title": "Top 5 Jobs by GPU Usage", - "type": "stat" - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "description": "Top 5 GPUs with highest VRAM used, labeled with GPU ID.", - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "links": [ - { - "targetBlank": true, - "title": "Go to GPU Dashboard", - "url": "/d/ae0aj8euc43r4b/gpu?var-g_gpu_uuid=${__field.labels.gpu_uuid}&var-g_hostname=${__field.labels.hostname}&var-g_gpu_id=${__field.labels.gpu_id}" - } - ], - "mappings": [], - "noValue": "0", - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 160000 - } - ] - }, - "unit": "decmbytes" - }, - "overrides": [] - }, - "gridPos": { - "h": 12, - "w": 4, - "x": 20, - "y": 0 - }, - "id": 17, - "options": { - "colorMode": "value", - "graphMode": "none", - "justifyMode": "auto", - "orientation": "horizontal", - "percentChangeColorMode": "standard", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "showPercentChange": false, - "textMode": "auto", - "wideLayout": true - }, - "pluginVersion": "11.2.2", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "disableTextWrap": false, - "editorMode": "builder", - "exemplar": false, - "expr": "topk(5, ${g_metrics_prefix}gpu_used_vram{hostname=\"$g_hostname\"})", - "fullMetaSearch": false, - "includeNullMetadata": true, - "instant": true, - "legendFormat": "GPU: {{gpu_id}}", - "range": false, - "refId": "A", - "useBackend": false - } - ], - "title": "Top 5 Used VRAM", - "type": "stat" - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "description": "Total package power usage, in Watts", - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - } - ] - }, - "unit": "watt" - }, - "overrides": [] - }, - "gridPos": { - "h": 3, - "w": 4, - "x": 0, - "y": 3 - }, - "id": 9, - "maxDataPoints": 60, - "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "auto", - "orientation": "auto", - "percentChangeColorMode": "standard", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "showPercentChange": false, - "textMode": "auto", - "wideLayout": true - }, - "pluginVersion": "11.2.2", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "disableTextWrap": false, - "editorMode": "builder", - "expr": "sum(${g_metrics_prefix}gpu_average_package_power{hostname=\"$g_hostname\"})", - "fullMetaSearch": false, - "hide": true, - "includeNullMetadata": true, - "legendFormat": "Average Package Power Usage", - "range": true, - "refId": "A", - "useBackend": false - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "disableTextWrap": false, - "editorMode": "builder", - "expr": "sum(${g_metrics_prefix}gpu_package_power{hostname=\"$g_hostname\"})", - "fullMetaSearch": false, - "hide": true, - "includeNullMetadata": true, - "instant": false, - "legendFormat": "Package Power Usage", - "range": true, - "refId": "B", - "useBackend": false - }, - { - "datasource": { - "type": "__expr__", - "uid": "${DS_EXPRESSION}" - }, - "expression": "$A+$B", - "hide": false, - "refId": "Total Package Power Usage", - "type": "math" - } - ], - "title": "Total Power Usage", - "type": "stat" - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - } - ] - }, - "unit": "GBs" - }, - "overrides": [] - }, - "gridPos": { - "h": 3, - "w": 4, - "x": 8, - "y": 3 - }, - "id": 13, - "maxDataPoints": 60, - "options": { - "colorMode": "value", - "graphMode": "none", - "justifyMode": "auto", - "orientation": "auto", - "percentChangeColorMode": "standard", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "showPercentChange": false, - "textMode": "auto", - "wideLayout": true - }, - "pluginVersion": "11.2.2", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "disableTextWrap": false, - "editorMode": "builder", - "expr": "sum(${g_metrics_prefix}pcie_bandwidth{hostname=\"$g_hostname\"})", - "fullMetaSearch": false, - "includeNullMetadata": true, - "legendFormat": "Total current PCIe bandwidth", - "range": true, - "refId": "A", - "useBackend": false - } - ], - "title": "Total Current PCIe Bandwidth", - "type": "stat" - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "percent" - }, - "overrides": [] - }, - "gridPos": { - "h": 3, - "w": 4, - "x": 0, - "y": 6 - }, - "id": 10, - "maxDataPoints": 60, - "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "auto", - "orientation": "auto", - "percentChangeColorMode": "standard", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "showPercentChange": false, - "textMode": "auto", - "wideLayout": true - }, - "pluginVersion": "11.2.2", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "disableTextWrap": false, - "editorMode": "builder", - "expr": "sum(${g_metrics_prefix}gpu_used_vram{hostname=\"$g_hostname\"})", - "fullMetaSearch": false, - "hide": true, - "includeNullMetadata": true, - "legendFormat": "__auto", - "range": true, - "refId": "A", - "useBackend": false - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "disableTextWrap": false, - "editorMode": "builder", - "expr": "sum(${g_metrics_prefix}gpu_total_vram{hostname=\"$g_hostname\"})", - "fullMetaSearch": false, - "hide": true, - "includeNullMetadata": true, - "instant": false, - "legendFormat": "__auto", - "range": true, - "refId": "B", - "useBackend": false - }, - { - "datasource": { - "type": "__expr__", - "uid": "${DS_EXPRESSION}" - }, - "expression": "($A / $B) * 100", - "hide": false, - "refId": "Memory Usage", - "type": "math" - } - ], - "title": "Memory Usage", - "type": "stat" - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisBorderShow": false, - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "barWidthFactor": 0.6, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "insertNulls": false, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "displayName": "PCIe Bandwidth", - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "percent" - }, - "overrides": [] - }, - "gridPos": { - "h": 6, - "w": 8, - "x": 8, - "y": 6 - }, - "id": 14, - "maxDataPoints": 60, - "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "single", - "sort": "none" - } - }, - "pluginVersion": "11.3.0", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "disableTextWrap": false, - "editorMode": "builder", - "expr": "sum(${g_metrics_prefix}pcie_bandwidth{hostname=\"$g_hostname\"})", - "fullMetaSearch": false, - "includeNullMetadata": true, - "legendFormat": "__auto", - "range": true, - "refId": "A", - "useBackend": false - } - ], - "title": "Total PCIe Bandwidth", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "description": "Total ECC counts across all GPUs in this compute node", - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - } - }, - "overrides": [] - }, - "gridPos": { - "h": 4, - "w": 4, - "x": 4, - "y": 8 - }, - "id": 18, - "options": { - "colorMode": "value", - "graphMode": "none", - "justifyMode": "auto", - "orientation": "horizontal", - "percentChangeColorMode": "standard", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "showPercentChange": false, - "textMode": "auto", - "wideLayout": true - }, - "pluginVersion": "11.2.2", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "disableTextWrap": false, - "editorMode": "builder", - "expr": "sum(delta(${g_metrics_prefix}gpu_ecc_correct_total{hostname=\"$g_hostname\"}[$__interval]))", - "fullMetaSearch": false, - "includeNullMetadata": true, - "legendFormat": "Correctable", - "range": true, - "refId": "A", - "useBackend": false - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "disableTextWrap": false, - "editorMode": "builder", - "expr": "sum(delta(${g_metrics_prefix}gpu_ecc_uncorrect_total{hostname=\"$g_hostname\"}[$__interval]))", - "fullMetaSearch": false, - "hide": false, - "includeNullMetadata": true, - "instant": false, - "legendFormat": "Uncorrectable", - "range": true, - "refId": "B", - "useBackend": false - } - ], - "title": "Total ECC Counts", - "type": "stat" - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "joule" - }, - "overrides": [] - }, - "gridPos": { - "h": 3, - "w": 4, - "x": 0, - "y": 9 - }, - "id": 11, - "maxDataPoints": 60, - "options": { - "colorMode": "value", - "graphMode": "none", - "justifyMode": "auto", - "orientation": "auto", - "percentChangeColorMode": "standard", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "showPercentChange": false, - "textMode": "auto", - "wideLayout": true - }, - "pluginVersion": "11.2.2", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "disableTextWrap": false, - "editorMode": "builder", - "expr": "sum(delta(${g_metrics_prefix}gpu_energy_consumed{hostname=\"$g_hostname\"}[$__interval]))", - "fullMetaSearch": false, - "hide": true, - "includeNullMetadata": true, - "legendFormat": "__auto", - "range": true, - "refId": "A", - "useBackend": false - }, - { - "datasource": { - "type": "__expr__", - "uid": "${DS_EXPRESSION}" - }, - "expression": "$A / 1000000", - "hide": false, - "refId": "Joules", - "type": "math" - } - ], - "title": "Energy Consumed", - "type": "stat" - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "description": "Average of current CPU GFX activity", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisBorderShow": false, - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "barWidthFactor": 0.6, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "insertNulls": false, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "displayName": "GPU Usage", - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "percent" - }, - "overrides": [] - }, - "gridPos": { - "h": 6, - "w": 8, - "x": 0, - "y": 12 - }, - "id": 2, - "maxDataPoints": 60, - "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "single", - "sort": "none" - } - }, - "pluginVersion": "11.3.0", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "disableTextWrap": false, - "editorMode": "builder", - "expr": "avg(${g_metrics_prefix}gpu_gfx_activity{hostname=\"$g_hostname\"})", - "fullMetaSearch": false, - "includeNullMetadata": true, - "legendFormat": "__auto", - "range": true, - "refId": "A", - "useBackend": false - } - ], - "title": "Average GPU Usage", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "description": "Used VRAM over time", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisBorderShow": false, - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "barWidthFactor": 0.6, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "insertNulls": false, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "displayName": "Used VRAM", - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "percent" - }, - "overrides": [] - }, - "gridPos": { - "h": 6, - "w": 8, - "x": 8, - "y": 12 - }, - "id": 4, - "maxDataPoints": 60, - "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "single", - "sort": "none" - } - }, - "pluginVersion": "11.3.0", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "disableTextWrap": false, - "editorMode": "builder", - "expr": "sum(${g_metrics_prefix}gpu_used_vram{hostname=\"$g_hostname\"})", - "fullMetaSearch": false, - "hide": true, - "includeNullMetadata": true, - "legendFormat": "Used VRAM", - "range": true, - "refId": "A", - "useBackend": false - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "disableTextWrap": false, - "editorMode": "builder", - "expr": "sum(${g_metrics_prefix}gpu_total_vram{hostname=\"$g_hostname\"})", - "fullMetaSearch": false, - "hide": true, - "includeNullMetadata": true, - "instant": false, - "legendFormat": "Total VRAM", - "range": true, - "refId": "B", - "useBackend": false - }, - { - "datasource": { - "type": "__expr__", - "uid": "${DS_EXPRESSION}" - }, - "expression": "($A / $B) * 100", - "hide": false, - "refId": "C", - "type": "math" - } - ], - "title": "Used VRAM", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "description": "Average GPU package power, in Watts", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisBorderShow": false, - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "barWidthFactor": 0.6, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "insertNulls": false, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "displayName": "GPU Power", - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "watt" - }, - "overrides": [] - }, - "gridPos": { - "h": 6, - "w": 8, - "x": 16, - "y": 12 - }, - "id": 6, - "maxDataPoints": 60, - "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "single", - "sort": "none" - } - }, - "pluginVersion": "11.3.0", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "disableTextWrap": false, - "editorMode": "builder", - "expr": "avg(${g_metrics_prefix}gpu_package_power{hostname=\"$g_hostname\", card_model=~\"102-G30211-00|102-G30211-0C|102-G30211-4C|102-G30212-0C|102-G30213-00|102-G30213-0C\"})", - "fullMetaSearch": false, - "includeNullMetadata": true, - "legendFormat": "__auto", - "range": true, - "refId": "A", - "useBackend": false - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "editorMode": "builder", - "expr": "avg(${g_metrics_prefix}gpu_average_package_power{hostname=\"$g_hostname\", card_model!~\"102-G30211-00|102-G30211-0C|102-G30211-4C|102-G30212-0C|102-G30213-00|102-G30213-0C\"})", - "hide": false, - "instant": false, - "legendFormat": "__auto", - "range": true, - "refId": "B" - } - ], - "title": "Average GPU Power", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "description": "Current temperature, in Celsius", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisBorderShow": false, - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "barWidthFactor": 0.6, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "insertNulls": false, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "displayName": "GPU Temperature", - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "celsius" - }, - "overrides": [ - { - "__systemRef": "hideSeriesFrom", - "matcher": { - "id": "byNames", - "options": { - "mode": "exclude", - "names": [ - "GPU Temperature" - ], - "prefix": "All except:", - "readOnly": true - } - }, - "properties": [ - { - "id": "custom.hideFrom", - "value": { - "legend": false, - "tooltip": false, - "viz": true - } - } - ] - } - ] - }, - "gridPos": { - "h": 6, - "w": 8, - "x": 0, - "y": 18 - }, - "id": 3, - "maxDataPoints": 60, - "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "single", - "sort": "none" - } - }, - "pluginVersion": "11.3.0", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "disableTextWrap": false, - "editorMode": "builder", - "expr": "avg(${g_metrics_prefix}gpu_edge_temperature{hostname=\"$g_hostname\", card_model!~\"102-G30211-00|102-G30211-0C|102-G30211-4C|102-G30212-0C|102-G30213-00|102-G30213-0C\"})", - "fullMetaSearch": false, - "hide": false, - "includeNullMetadata": true, - "legendFormat": "Edge Temperature", - "range": true, - "refId": "A", - "useBackend": false - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "disableTextWrap": false, - "editorMode": "builder", - "expr": "avg(${g_metrics_prefix}gpu_junction_temperature{hostname=\"$g_hostname\", card_model=~\"102-G30211-00|102-G30211-0C|102-G30211-4C|102-G30212-0C|102-G30213-00|102-G30213-0C\"})", - "fullMetaSearch": false, - "hide": false, - "includeNullMetadata": true, - "instant": false, - "legendFormat": "Junction Temperature", - "range": true, - "refId": "B", - "useBackend": false - } - ], - "title": "Average GPU Temperature", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "description": "Current memory temperature, in Celsius", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisBorderShow": false, - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "barWidthFactor": 0.6, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "insertNulls": false, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "displayName": "Memory Temperature", - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "celsius" - }, - "overrides": [] - }, - "gridPos": { - "h": 6, - "w": 8, - "x": 8, - "y": 18 - }, - "id": 5, - "maxDataPoints": 60, - "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "single", - "sort": "none" - } - }, - "pluginVersion": "11.3.0", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "disableTextWrap": false, - "editorMode": "builder", - "expr": "avg(${g_metrics_prefix}gpu_memory_temperature{hostname=\"$g_hostname\"})", - "fullMetaSearch": false, - "includeNullMetadata": true, - "legendFormat": "__auto", - "range": true, - "refId": "A", - "useBackend": false - } - ], - "title": "Average Memory Temperature", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "description": "Average current temperatures, in Celsius, from:\n- HBM (4 sensors)\n- Edge\n- Junction/hotspot", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisBorderShow": false, - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "barWidthFactor": 0.6, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "insertNulls": false, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "celsius" - }, - "overrides": [] - }, - "gridPos": { - "h": 6, - "w": 8, - "x": 16, - "y": 18 - }, - "id": 7, - "maxDataPoints": 60, - "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "single", - "sort": "none" - } - }, - "pluginVersion": "11.3.0", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "disableTextWrap": false, - "editorMode": "builder", - "expr": "avg(${g_metrics_prefix}gpu_hbm_temperature{hostname=\"$g_hostname\"})", - "fullMetaSearch": false, - "includeNullMetadata": true, - "legendFormat": "HBM Temperature", - "range": true, - "refId": "A", - "useBackend": false - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "disableTextWrap": false, - "editorMode": "builder", - "expr": "avg(${g_metrics_prefix}gpu_edge_temperature{hostname=\"$g_hostname\"})", - "fullMetaSearch": false, - "hide": false, - "includeNullMetadata": true, - "instant": false, - "legendFormat": "Edge Temperature", - "range": true, - "refId": "B", - "useBackend": false - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "disableTextWrap": false, - "editorMode": "builder", - "expr": "avg(${g_metrics_prefix}gpu_junction_temperature{hostname=\"$g_hostname\"})", - "fullMetaSearch": false, - "hide": false, - "includeNullMetadata": true, - "instant": false, - "legendFormat": "Junction Temperature", - "range": true, - "refId": "C", - "useBackend": false - } - ], - "title": "Average Sensor Temperatures", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "description": "List of all GPUs in this compute node. Click on the GPU ID values to go to that GPU's dashboard (opens new tab).", - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "custom": { - "align": "auto", - "cellOptions": { - "type": "auto" - }, - "filterable": true, - "inspect": false - }, - "links": [], - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - } - ] - } - }, - "overrides": [ - { - "matcher": { - "id": "byName", - "options": "GPU ID" - }, - "properties": [ - { - "id": "custom.width", - "value": 100 - }, - { - "id": "links", - "value": [ - { - "targetBlank": true, - "title": "Go to GPU Dashboard", - "url": "/d/ae0aj8euc43r4b/gpu?var-g_gpu_uuid=${__data.fields[\"gpu_uuid\"]}&var-g_hostname=${__data.fields.HOSTNAME}&var-g_gpu_id=${__data.fields[\"GPU ID\"]}" - } - ] - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "GPU UUID" - }, - "properties": [ - { - "id": "custom.width", - "value": 320 - }, - { - "id": "links", - "value": [ - { - "targetBlank": true, - "title": "Go to GPU Dashboard", - "url": "/d/ae0aj8euc43r4b/gpu?var-g_gpu_uuid=${__data.fields[\"gpu_uuid\"]}&var-g_hostname=${__data.fields.HOSTNAME}&var-g_gpu_id=${__data.fields[\"GPU ID\"]}" - } - ] - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "HEALTH" - }, - "properties": [ - { - "id": "custom.width", - "value": 105 - }, - { - "id": "mappings", - "value": [ - { - "options": { - "0": { - "color": "red", - "index": 0, - "text": "unhealthy" - }, - "1": { - "color": "green", - "index": 1, - "text": "healthy" - } - }, - "type": "value" - } - ] - }, - { - "id": "custom.cellOptions", - "value": { - "type": "color-text" - } - } - ] - } - ] - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 24 - }, - "id": 20, - "options": { - "cellHeight": "sm", - "footer": { - "countRows": false, - "enablePagination": true, - "fields": "", - "reducer": [ - "sum" - ], - "show": false - }, - "showHeader": true, - "sortBy": [ - { - "desc": false, - "displayName": "HEALTH" - }, - { - "desc": false, - "displayName": "HOSTNAME" - }, - { - "desc": false, - "displayName": "GPU ID" - } - ] - }, - "pluginVersion": "11.2.2", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "disableTextWrap": false, - "editorMode": "builder", - "expr": "${g_metrics_prefix}gpu_health{hostname=\"$g_hostname\"}", - "fullMetaSearch": false, - "includeNullMetadata": true, - "instant": false, - "legendFormat": "__auto", - "range": true, - "refId": "A", - "useBackend": false - } - ], - "title": "All GPUs", - "transformations": [ - { - "id": "reduce", - "options": { - "labelsToFields": true, - "reducers": [ - "last" - ] - } - }, - { - "id": "groupBy", - "options": { - "fields": { - "Last": { - "aggregations": [ - "last" - ], - "operation": "aggregate" - }, - "gpu_id": { - "aggregations": [ - "last" - ], - "operation": "aggregate" - }, - "gpu_uuid": { - "aggregations": [], - "operation": "groupby" - }, - "hostname": { - "aggregations": [ - "last" - ], - "operation": "aggregate" - } - } - } - }, - { - "disabled": true, - "id": "filterFieldsByName", - "options": {} - }, - { - "id": "organize", - "options": { - "excludeByName": {}, - "includeByName": {}, - "indexByName": { - "gpu_id (last)": 1, - "gpu_uuid": 2, - "hostname (last)": 0 - }, - "renameByName": { - "Last (last)": "HEALTH", - "gpu_id (last)": "GPU ID", - "gpu_uuid": "GPU UUID", - "hostname (last)": "HOSTNAME" - } - } - } - ], - "type": "table" - }, - { - "gridPos": { - "h": 2, - "w": 2, - "x": 22, - "y": 24 - }, - "id": 21, - "options": { - "code": { - "language": "plaintext", - "showLineNumbers": false, - "showMiniMap": false - }, - "content": "main", - "mode": "markdown" - }, - "pluginVersion": "11.2.2", - "title": "Version", - "type": "text" - } - ], - "schemaVersion": 39, - "tags": [], - "templating": { - "list": [ - { - "current": { - "selected": false, - "text": "", - "value": "" - }, - "description": "string to prefix names of metrics queries (e.g. gpu_gfx_activity -> amd_gpu_gfx_activity)", - "hide": 2, - "label": "Metrics Prefix", - "name": "g_metrics_prefix", - "options": [ - { - "selected": true, - "text": "", - "value": "" - } - ], - "query": "", - "skipUrlSync": false, - "type": "textbox" - }, - { - "current": {}, - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "definition": "label_values(cluster_name)", - "hide": 0, - "includeAll": false, - "label": "Cluster", - "multi": false, - "name": "g_cluster_name", - "options": [], - "query": { - "qryType": 1, - "query": "label_values(cluster_name)", - "refId": "PrometheusVariableQueryEditor-VariableQuery" - }, - "refresh": 2, - "regex": "", - "skipUrlSync": false, - "sort": 7, - "type": "query" - }, - { - "current": {}, - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "definition": "label_values({cluster_name=\"$g_cluster_name\"},hostname)", - "hide": 0, - "includeAll": false, - "label": "Compute Node", - "multi": false, - "name": "g_hostname", - "options": [], - "query": { - "qryType": 1, - "query": "label_values({cluster_name=\"$g_cluster_name\"},hostname)", - "refId": "PrometheusVariableQueryEditor-VariableQuery" - }, - "refresh": 2, - "regex": "", - "skipUrlSync": false, - "sort": 7, - "type": "query" - } - ] - }, - "time": { - "from": "now-6h", - "to": "now" - }, - "timepicker": {}, - "timezone": "browser", - "title": "Compute Node", - "uid": "de1q9vq97fe2oc", - "version": 1, - "weekStart": "" -} \ No newline at end of file diff --git a/grafana/dashboard_overview.json b/grafana/dashboard_overview.json deleted file mode 100644 index 595f86b0..00000000 --- a/grafana/dashboard_overview.json +++ /dev/null @@ -1,2369 +0,0 @@ -{ - "__inputs": [ - { - "name": "DS_PROMETHEUS", - "label": "Prometheus", - "description": "", - "type": "datasource", - "pluginId": "prometheus", - "pluginName": "Prometheus" - }, - { - "name": "DS_EXPRESSION", - "label": "Expression", - "description": "", - "type": "datasource", - "pluginId": "__expr__" - } - ], - "__elements": {}, - "__requires": [ - { - "type": "datasource", - "id": "__expr__", - "version": "1.0.0" - }, - { - "type": "grafana", - "id": "grafana", - "name": "Grafana", - "version": "11.2.2" - }, - { - "type": "datasource", - "id": "prometheus", - "name": "Prometheus", - "version": "1.0.0" - }, - { - "type": "panel", - "id": "stat", - "name": "Stat", - "version": "" - }, - { - "type": "panel", - "id": "table", - "name": "Table", - "version": "" - }, - { - "type": "panel", - "id": "text", - "name": "Text", - "version": "" - }, - { - "type": "panel", - "id": "timeseries", - "name": "Time series", - "version": "" - } - ], - "annotations": { - "list": [ - { - "builtIn": 1, - "datasource": { - "type": "grafana", - "uid": "-- Grafana --" - }, - "enable": true, - "hide": true, - "iconColor": "rgba(0, 211, 255, 1)", - "name": "Annotations & Alerts", - "type": "dashboard" - } - ] - }, - "description": "Overview of the system", - "editable": true, - "fiscalYearStartMonth": 0, - "graphTooltip": 0, - "id": null, - "links": [], - "panels": [ - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "description": "Number of GPUs in the node", - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "links": [ - { - "targetBlank": true, - "title": "Go to GPU Dashboard", - "url": "/d/ae0aj8euc43r4b/gpu" - } - ], - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - } - ] - } - }, - "overrides": [ - { - "matcher": { - "id": "byFrameRefID", - "options": "Unhealthy" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "red", - "mode": "fixed" - } - } - ] - } - ] - }, - "gridPos": { - "h": 6, - "w": 2, - "x": 0, - "y": 0 - }, - "id": 4, - "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "auto", - "orientation": "auto", - "percentChangeColorMode": "standard", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "showPercentChange": false, - "textMode": "auto", - "wideLayout": true - }, - "pluginVersion": "11.2.2", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "disableTextWrap": false, - "editorMode": "builder", - "exemplar": false, - "expr": "sum(${g_metrics_prefix}gpu_nodes_total{cluster_name=\"$g_cluster_name\"})", - "fullMetaSearch": false, - "includeNullMetadata": true, - "instant": true, - "legendFormat": "GPUs", - "range": false, - "refId": "A", - "useBackend": false - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "disableTextWrap": false, - "editorMode": "code", - "exemplar": false, - "expr": "count(${g_metrics_prefix}gpu_health{cluster_name=\"$g_cluster_name\"} < 1)", - "fullMetaSearch": false, - "hide": false, - "includeNullMetadata": true, - "instant": true, - "legendFormat": "Unhealthy GPUs", - "range": false, - "refId": "Unhealthy", - "useBackend": false - } - ], - "title": "GPUs", - "type": "stat" - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "description": "Number of jobs", - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "links": [ - { - "targetBlank": true, - "title": "Go to Job Dashboard", - "url": "/d/ce1x81pyv3dvkb/job" - } - ], - "mappings": [], - "noValue": "0", - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - } - ] - } - }, - "overrides": [] - }, - "gridPos": { - "h": 6, - "w": 2, - "x": 2, - "y": 0 - }, - "id": 22, - "options": { - "colorMode": "value", - "graphMode": "none", - "justifyMode": "auto", - "orientation": "auto", - "percentChangeColorMode": "standard", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "showPercentChange": false, - "textMode": "auto", - "wideLayout": true - }, - "pluginVersion": "11.2.2", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "disableTextWrap": false, - "editorMode": "builder", - "exemplar": false, - "expr": "sum(group by(job_id) (${g_metrics_prefix}gpu_gfx_activity{cluster_name=\"$g_cluster_name\", job_id!=\"\"}))", - "fullMetaSearch": false, - "includeNullMetadata": true, - "instant": false, - "legendFormat": "Jobs", - "range": true, - "refId": "A", - "useBackend": false - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "editorMode": "builder", - "exemplar": false, - "expr": "sum(group by(pod) (${g_metrics_prefix}gpu_gfx_activity{cluster_name=\"$g_cluster_name\", pod!=\"\"}))", - "hide": false, - "instant": false, - "legendFormat": "Pods", - "range": true, - "refId": "B" - } - ], - "title": "Jobs", - "type": "stat" - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "description": "Total power usage, in Watts", - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - } - ] - }, - "unit": "watt" - }, - "overrides": [] - }, - "gridPos": { - "h": 4, - "w": 4, - "x": 4, - "y": 0 - }, - "id": 2, - "maxDataPoints": 60, - "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "auto", - "orientation": "auto", - "percentChangeColorMode": "standard", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "showPercentChange": false, - "textMode": "auto", - "wideLayout": true - }, - "pluginVersion": "11.2.2", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "disableTextWrap": false, - "editorMode": "builder", - "expr": "sum(${g_metrics_prefix}gpu_average_package_power{cluster_name=\"$g_cluster_name\"})", - "fullMetaSearch": false, - "hide": true, - "includeNullMetadata": true, - "instant": false, - "legendFormat": "Average Package Power (W)", - "range": true, - "refId": "A", - "useBackend": false - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "disableTextWrap": false, - "editorMode": "builder", - "expr": "sum(${g_metrics_prefix}gpu_package_power{cluster_name=\"$g_cluster_name\"})", - "fullMetaSearch": false, - "hide": true, - "includeNullMetadata": true, - "instant": false, - "legendFormat": "Current Package Power (W)", - "range": true, - "refId": "B", - "useBackend": false - }, - { - "datasource": { - "type": "__expr__", - "uid": "${DS_EXPRESSION}" - }, - "expression": "$A+$B", - "hide": false, - "refId": "Total Package Power Usage", - "type": "math" - } - ], - "title": "Total Power Usage", - "type": "stat" - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "description": "Current maximum PCIe speed", - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - } - ] - }, - "unit": "MBs" - }, - "overrides": [] - }, - "gridPos": { - "h": 3, - "w": 4, - "x": 8, - "y": 0 - }, - "id": 20, - "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "auto", - "orientation": "auto", - "percentChangeColorMode": "standard", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "showPercentChange": false, - "textMode": "auto", - "wideLayout": true - }, - "pluginVersion": "11.2.2", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "disableTextWrap": false, - "editorMode": "builder", - "expr": "count_values(\"count\", ${g_metrics_prefix}pcie_max_speed{cluster_name=\"$g_cluster_name\"})", - "fullMetaSearch": false, - "includeNullMetadata": true, - "instant": false, - "legendFormat": "__auto", - "range": true, - "refId": "A", - "useBackend": false - } - ], - "title": "PCIe Max Speed", - "transformations": [ - { - "disabled": true, - "id": "reduce", - "options": { - "includeTimeField": false, - "labelsToFields": false, - "mode": "seriesToRows", - "reducers": [ - "uniqueValues" - ] - } - } - ], - "type": "stat" - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "description": "", - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - } - }, - "overrides": [] - }, - "gridPos": { - "h": 6, - "w": 4, - "x": 12, - "y": 0 - }, - "id": 18, - "options": { - "colorMode": "value", - "graphMode": "none", - "justifyMode": "auto", - "orientation": "auto", - "percentChangeColorMode": "standard", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "showPercentChange": false, - "textMode": "auto", - "wideLayout": true - }, - "pluginVersion": "11.2.2", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "disableTextWrap": false, - "editorMode": "builder", - "expr": "sum(delta(${g_metrics_prefix}pcie_recovery_count{cluster_name=\"$g_cluster_name\"}[$__interval]))", - "fullMetaSearch": false, - "includeNullMetadata": true, - "instant": false, - "legendFormat": "Recovery", - "range": true, - "refId": "A", - "useBackend": false - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "disableTextWrap": false, - "editorMode": "builder", - "expr": "sum(delta(${g_metrics_prefix}pcie_replay_count{cluster_name=\"$g_cluster_name\"}[$__interval]))", - "fullMetaSearch": false, - "hide": false, - "includeNullMetadata": true, - "instant": false, - "legendFormat": "Replay", - "range": true, - "refId": "B", - "useBackend": false - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "disableTextWrap": false, - "editorMode": "builder", - "expr": "sum(delta(${g_metrics_prefix}pcie_replay_rollover_count{cluster_name=\"$g_cluster_name\"}[$__interval]))", - "fullMetaSearch": false, - "hide": false, - "includeNullMetadata": true, - "instant": false, - "legendFormat": "Replay Rollover", - "range": true, - "refId": "C", - "useBackend": false - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "disableTextWrap": false, - "editorMode": "builder", - "expr": "sum(delta(${g_metrics_prefix}pcie_nack_received_count{cluster_name=\"$g_cluster_name\"}[$__interval]))", - "fullMetaSearch": false, - "hide": false, - "includeNullMetadata": true, - "instant": false, - "legendFormat": "NACK Received", - "range": true, - "refId": "D", - "useBackend": false - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "disableTextWrap": false, - "editorMode": "builder", - "expr": "sum(delta(${g_metrics_prefix}pcie_nack_sent_count{cluster_name=\"$g_cluster_name\"}[$__interval]))", - "fullMetaSearch": false, - "hide": false, - "includeNullMetadata": true, - "instant": false, - "legendFormat": "NACK Sent", - "range": true, - "refId": "E", - "useBackend": false - } - ], - "title": "PCIe Counts", - "type": "stat" - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "description": "Top 5 current GPU GFX Activity, labeled with hostname, GPU ID, and job ID.", - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "links": [ - { - "targetBlank": true, - "title": "Go to GPU dashboard", - "url": "/d/ae0aj8euc43r4b/gpu?var-g_gpu_uuid=${__field.labels.gpu_uuid}&var-g_hostname=${__field.labels.hostname}&var-g_gpu_id=${__field.labels.gpu_id}" - } - ], - "mappings": [], - "noValue": "0", - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "percent" - }, - "overrides": [] - }, - "gridPos": { - "h": 12, - "w": 4, - "x": 16, - "y": 0 - }, - "id": 21, - "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "auto", - "orientation": "horizontal", - "percentChangeColorMode": "standard", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "showPercentChange": false, - "textMode": "auto", - "wideLayout": true - }, - "pluginVersion": "11.2.2", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "disableTextWrap": false, - "editorMode": "builder", - "exemplar": false, - "expr": "topk by() (5, ${g_metrics_prefix}gpu_gfx_activity{cluster_name=\"$g_cluster_name\"})", - "fullMetaSearch": false, - "includeNullMetadata": true, - "instant": true, - "interval": "", - "legendFormat": "{{hostname}} | {{gpu_id}} | {{job_id}}", - "range": false, - "refId": "A", - "useBackend": false - } - ], - "title": "Top 5 GPU Usage", - "type": "stat" - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "description": "Top 5 GPUs with highest VRAM used, labeled with hostname and GPU ID.", - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 160000 - } - ] - }, - "unit": "decmbytes" - }, - "overrides": [] - }, - "gridPos": { - "h": 12, - "w": 4, - "x": 20, - "y": 0 - }, - "id": 26, - "options": { - "colorMode": "value", - "graphMode": "none", - "justifyMode": "auto", - "orientation": "horizontal", - "percentChangeColorMode": "standard", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "showPercentChange": false, - "textMode": "auto", - "wideLayout": true - }, - "pluginVersion": "11.2.2", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "disableTextWrap": false, - "editorMode": "builder", - "exemplar": false, - "expr": "topk(5, ${g_metrics_prefix}gpu_used_vram{cluster_name=\"$g_cluster_name\"})", - "fullMetaSearch": false, - "includeNullMetadata": true, - "instant": true, - "legendFormat": "{{hostname}} | {{gpu_id}}", - "range": false, - "refId": "A", - "useBackend": false - } - ], - "title": "Top 5 Used VRAM", - "type": "stat" - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "description": "", - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - } - ] - }, - "unit": "MBs" - }, - "overrides": [] - }, - "gridPos": { - "h": 3, - "w": 4, - "x": 8, - "y": 3 - }, - "id": 17, - "maxDataPoints": 60, - "options": { - "colorMode": "value", - "graphMode": "none", - "justifyMode": "auto", - "orientation": "auto", - "percentChangeColorMode": "standard", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "showPercentChange": false, - "textMode": "auto", - "wideLayout": true - }, - "pluginVersion": "11.2.2", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "disableTextWrap": false, - "editorMode": "builder", - "expr": "sum(${g_metrics_prefix}pcie_bandwidth{cluster_name=\"$g_cluster_name\"})", - "fullMetaSearch": false, - "includeNullMetadata": true, - "instant": false, - "legendFormat": "Total current PCIe bandwidth", - "range": true, - "refId": "A", - "useBackend": false - } - ], - "title": "Current Total PCIe Bandwidth", - "type": "stat" - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "description": "", - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [], - "thresholds": { - "mode": "percentage", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "percent" - }, - "overrides": [] - }, - "gridPos": { - "h": 4, - "w": 4, - "x": 4, - "y": 4 - }, - "id": 24, - "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "auto", - "orientation": "auto", - "percentChangeColorMode": "standard", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "/^Memory Usage$/", - "values": false - }, - "showPercentChange": false, - "textMode": "auto", - "wideLayout": true - }, - "pluginVersion": "11.2.2", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "disableTextWrap": false, - "editorMode": "builder", - "expr": "sum(${g_metrics_prefix}gpu_used_vram{cluster_name=\"$g_cluster_name\"})", - "fullMetaSearch": false, - "includeNullMetadata": true, - "instant": false, - "legendFormat": "__auto", - "range": true, - "refId": "A", - "useBackend": false - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "disableTextWrap": false, - "editorMode": "builder", - "expr": "sum(${g_metrics_prefix}gpu_total_vram{cluster_name=\"$g_cluster_name\"})", - "fullMetaSearch": false, - "hide": false, - "includeNullMetadata": true, - "instant": false, - "legendFormat": "__auto", - "range": true, - "refId": "B", - "useBackend": false - }, - { - "datasource": { - "type": "__expr__", - "uid": "${DS_EXPRESSION}" - }, - "expression": "($A / $B) * 100", - "hide": false, - "refId": "Memory Usage", - "type": "math" - } - ], - "title": "Memory Usage", - "type": "stat" - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "description": "\\# of GPUs allocated by jobs", - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "links": [ - { - "targetBlank": true, - "title": "Go to GPU Dashboard", - "url": "/d/ae0aj8euc43r4b/gpu" - } - ], - "mappings": [], - "noValue": "0", - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - } - ] - } - }, - "overrides": [] - }, - "gridPos": { - "h": 6, - "w": 4, - "x": 0, - "y": 6 - }, - "id": 34, - "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "auto", - "orientation": "vertical", - "percentChangeColorMode": "standard", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "showPercentChange": false, - "text": { - "titleSize": 13 - }, - "textMode": "auto", - "wideLayout": true - }, - "pluginVersion": "11.2.2", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "editorMode": "builder", - "exemplar": false, - "expr": "sum(group by(gpu_uuid) (${g_metrics_prefix}gpu_gfx_activity{cluster_name=\"$g_cluster_name\", job_id!=\"\"}))", - "instant": true, - "legendFormat": "Allocated by Jobs", - "range": false, - "refId": "A" - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "editorMode": "builder", - "exemplar": false, - "expr": "sum(group by(gpu_uuid) (${g_metrics_prefix}gpu_gfx_activity{cluster_name=\"$g_cluster_name\", pod!=\"\"}))", - "hide": false, - "instant": true, - "legendFormat": "Allocated by Pods", - "range": false, - "refId": "B" - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "editorMode": "code", - "exemplar": false, - "expr": "sum(group by(gpu_uuid) (${g_metrics_prefix}gpu_gfx_activity{cluster_name=\"$g_cluster_name\"} > 0))", - "hide": false, - "instant": true, - "legendFormat": "Busy GPUs", - "range": false, - "refId": "C" - } - ], - "title": "Allocated GPUs", - "type": "stat" - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "description": "Total PCIe Bandwidth over time", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisBorderShow": false, - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "barWidthFactor": 0.6, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "insertNulls": false, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "MBs" - }, - "overrides": [] - }, - "gridPos": { - "h": 6, - "w": 8, - "x": 8, - "y": 6 - }, - "id": 32, - "maxDataPoints": 60, - "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "single", - "sort": "none" - } - }, - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "disableTextWrap": false, - "editorMode": "builder", - "expr": "sum(${g_metrics_prefix}pcie_bandwidth{cluster_name=\"$g_cluster_name\"})", - "fullMetaSearch": false, - "includeNullMetadata": true, - "instant": false, - "legendFormat": "Total PCIe Bandwidth", - "range": true, - "refId": "A", - "useBackend": false - } - ], - "title": "Total PCIe Bandwidth", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "description": "Total accumulated energy consumed", - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "joule" - }, - "overrides": [] - }, - "gridPos": { - "h": 4, - "w": 4, - "x": 4, - "y": 8 - }, - "id": 8, - "maxDataPoints": 60, - "options": { - "colorMode": "value", - "graphMode": "none", - "justifyMode": "auto", - "orientation": "auto", - "percentChangeColorMode": "standard", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "/^Joules$/", - "values": false - }, - "showPercentChange": false, - "textMode": "auto", - "wideLayout": true - }, - "pluginVersion": "11.2.2", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "disableTextWrap": false, - "editorMode": "builder", - "expr": "sum(delta(${g_metrics_prefix}gpu_energy_consumed{cluster_name=\"$g_cluster_name\"}[$__interval]))", - "fullMetaSearch": false, - "includeNullMetadata": true, - "instant": false, - "legendFormat": "Accumulated Energy Consumed (nJ)", - "range": true, - "refId": "A", - "useBackend": false - }, - { - "datasource": { - "type": "__expr__", - "uid": "${DS_EXPRESSION}" - }, - "expression": "$A / 1000000", - "hide": false, - "refId": "Joules", - "type": "math" - } - ], - "title": "Energy Consumed", - "type": "stat" - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "description": "Average of current GPU GFX activity", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisBorderShow": false, - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "barWidthFactor": 0.6, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "insertNulls": false, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "percent" - }, - "overrides": [] - }, - "gridPos": { - "h": 6, - "w": 8, - "x": 0, - "y": 12 - }, - "id": 9, - "maxDataPoints": 60, - "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "single", - "sort": "none" - } - }, - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "disableTextWrap": false, - "editorMode": "builder", - "expr": "avg(${g_metrics_prefix}gpu_gfx_activity{cluster_name=\"$g_cluster_name\"})", - "fullMetaSearch": false, - "includeNullMetadata": true, - "instant": false, - "legendFormat": "GPU Usage", - "range": true, - "refId": "A", - "useBackend": false - } - ], - "title": "Average GPU Usage", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "description": "Used VRAM over time", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisBorderShow": false, - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "barWidthFactor": 0.6, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "insertNulls": false, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "displayName": "Used VRAM", - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "percent" - }, - "overrides": [] - }, - "gridPos": { - "h": 6, - "w": 8, - "x": 8, - "y": 12 - }, - "id": 27, - "maxDataPoints": 60, - "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "single", - "sort": "none" - } - }, - "pluginVersion": "11.2.2", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "disableTextWrap": false, - "editorMode": "builder", - "expr": "sum(${g_metrics_prefix}gpu_used_vram{cluster_name=\"$g_cluster_name\"})", - "fullMetaSearch": false, - "hide": true, - "includeNullMetadata": true, - "instant": false, - "legendFormat": "Used VRAM", - "range": true, - "refId": "A", - "useBackend": false - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "disableTextWrap": false, - "editorMode": "builder", - "expr": "sum(${g_metrics_prefix}gpu_total_vram{cluster_name=\"$g_cluster_name\"})", - "fullMetaSearch": false, - "hide": true, - "includeNullMetadata": true, - "instant": false, - "legendFormat": "Total VRAM", - "range": true, - "refId": "B", - "useBackend": false - }, - { - "datasource": { - "type": "__expr__", - "uid": "${DS_EXPRESSION}" - }, - "expression": "($A / $B) * 100", - "hide": false, - "refId": "C", - "type": "math" - } - ], - "title": "Used VRAM", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "description": "Average GPU package power, in Watts", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisBorderShow": false, - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "barWidthFactor": 0.6, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "insertNulls": false, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "watt" - }, - "overrides": [] - }, - "gridPos": { - "h": 6, - "w": 8, - "x": 16, - "y": 12 - }, - "id": 6, - "maxDataPoints": 60, - "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "single", - "sort": "none" - } - }, - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "disableTextWrap": false, - "editorMode": "builder", - "expr": "avg(${g_metrics_prefix}gpu_package_power{cluster_name=\"$g_cluster_name\", card_model=~\"102-G30211-00|102-G30211-0C|102-G30211-4C|102-G30212-0C|102-G30213-00|102-G30213-0C\"})", - "fullMetaSearch": false, - "includeNullMetadata": true, - "instant": false, - "legendFormat": "GPU Power", - "range": true, - "refId": "A", - "useBackend": false - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "editorMode": "builder", - "expr": "avg(${g_metrics_prefix}gpu_average_package_power{cluster_name=\"$g_cluster_name\", card_model!~\"102-G30211-00|102-G30211-0C|102-G30211-4C|102-G30212-0C|102-G30213-00|102-G30213-0C\"})", - "hide": false, - "instant": false, - "legendFormat": "GPU Power", - "range": true, - "refId": "B" - } - ], - "title": "Average GPU Power", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "description": "Current temperature, in Celsius", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisBorderShow": false, - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "barWidthFactor": 0.6, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "insertNulls": false, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "displayName": "GPU Temperature", - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "celsius" - }, - "overrides": [] - }, - "gridPos": { - "h": 6, - "w": 8, - "x": 0, - "y": 18 - }, - "id": 31, - "maxDataPoints": 60, - "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "single", - "sort": "none" - } - }, - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "disableTextWrap": false, - "editorMode": "builder", - "expr": "avg(${g_metrics_prefix}gpu_edge_temperature{cluster_name=\"$g_cluster_name\", card_model!~\"102-G30211-00|102-G30211-0C|102-G30211-4C|102-G30212-0C|102-G30213-00|102-G30213-0C\"})", - "fullMetaSearch": false, - "hide": false, - "includeNullMetadata": true, - "instant": false, - "legendFormat": "Edge Temperature", - "range": true, - "refId": "A", - "useBackend": false - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "disableTextWrap": false, - "editorMode": "builder", - "expr": "avg(${g_metrics_prefix}gpu_junction_temperature{cluster_name=\"$g_cluster_name\", card_model=~\"102-G30211-00|102-G30211-0C|102-G30211-4C|102-G30212-0C|102-G30213-00|102-G30213-0C\"})", - "fullMetaSearch": false, - "hide": false, - "includeNullMetadata": true, - "instant": false, - "legendFormat": "Junction Temperature", - "range": true, - "refId": "B", - "useBackend": false - } - ], - "title": "Average GPU Temperature", - "transformations": [ - { - "id": "merge", - "options": {} - } - ], - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "description": "Current memory temperature, in Celsius", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisBorderShow": false, - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "barWidthFactor": 0.6, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "insertNulls": false, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "celsius" - }, - "overrides": [] - }, - "gridPos": { - "h": 6, - "w": 8, - "x": 8, - "y": 18 - }, - "id": 14, - "maxDataPoints": 60, - "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "single", - "sort": "none" - } - }, - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "disableTextWrap": false, - "editorMode": "builder", - "expr": "avg(${g_metrics_prefix}gpu_memory_temperature{cluster_name=\"$g_cluster_name\"})", - "fullMetaSearch": false, - "includeNullMetadata": true, - "instant": false, - "legendFormat": "Memory Temperature", - "range": true, - "refId": "A", - "useBackend": false - } - ], - "title": "Average Memory Temperature", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "description": "Average current temperatures, in Celsius, from:\n- HBM (4 sensors)\n- Edge\n- Junction/hotspot", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisBorderShow": false, - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "barWidthFactor": 0.6, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "insertNulls": false, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "celsius" - }, - "overrides": [] - }, - "gridPos": { - "h": 6, - "w": 8, - "x": 16, - "y": 18 - }, - "id": 10, - "maxDataPoints": 60, - "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "single", - "sort": "none" - } - }, - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "disableTextWrap": false, - "editorMode": "builder", - "expr": "avg(${g_metrics_prefix}gpu_hbm_temperature{cluster_name=\"$g_cluster_name\"})", - "fullMetaSearch": false, - "includeNullMetadata": true, - "instant": false, - "legendFormat": "HBM Temperature", - "range": true, - "refId": "A", - "useBackend": false - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "disableTextWrap": false, - "editorMode": "builder", - "expr": "avg(${g_metrics_prefix}gpu_edge_temperature{cluster_name=\"$g_cluster_name\"})", - "fullMetaSearch": false, - "hide": false, - "includeNullMetadata": true, - "instant": false, - "legendFormat": "Edge Temperature", - "range": true, - "refId": "B", - "useBackend": false - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "disableTextWrap": false, - "editorMode": "builder", - "expr": "avg(${g_metrics_prefix}gpu_junction_temperature{cluster_name=\"$g_cluster_name\"})", - "fullMetaSearch": false, - "hide": false, - "includeNullMetadata": true, - "instant": false, - "legendFormat": "Junction Temperature", - "range": true, - "refId": "C", - "useBackend": false - } - ], - "title": "Average Sensor Temperatures", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "description": "List of all GPUs.", - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "custom": { - "align": "auto", - "cellOptions": { - "type": "auto" - }, - "filterable": true, - "inspect": false - }, - "links": [], - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - } - ] - } - }, - "overrides": [ - { - "matcher": { - "id": "byName", - "options": "HOSTNAME" - }, - "properties": [ - { - "id": "links", - "value": [ - { - "targetBlank": true, - "title": "Go to Compute Node Dashboard", - "url": "/d/de1q9vq97fe2oc/compute-node?var-g_hostname=${__data.fields.HOSTNAME}" - } - ] - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "GPU ID" - }, - "properties": [ - { - "id": "custom.width", - "value": 100 - }, - { - "id": "links", - "value": [ - { - "targetBlank": true, - "title": "Go to GPU Dashboard", - "url": "/d/ae0aj8euc43r4b/gpu?var-g_gpu_uuid=${__data.fields[\"gpu_uuid\"]}&var-g_hostname=${__data.fields.HOSTNAME}&var-g_gpu_id=${__data.fields[\"GPU ID\"]}" - } - ] - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "GPU UUID" - }, - "properties": [ - { - "id": "custom.width", - "value": 320 - }, - { - "id": "links", - "value": [ - { - "targetBlank": true, - "title": "Go to GPU Dashboard", - "url": "/d/ae0aj8euc43r4b/gpu?var-g_gpu_uuid=${__data.fields[\"gpu_uuid\"]}&var-g_hostname=${__data.fields.HOSTNAME}&var-g_gpu_id=${__data.fields[\"GPU ID\"]}" - } - ] - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "HEALTH" - }, - "properties": [ - { - "id": "custom.width", - "value": 105 - }, - { - "id": "mappings", - "value": [ - { - "options": { - "0": { - "color": "red", - "index": 0, - "text": "unhealthy" - }, - "1": { - "color": "green", - "index": 1, - "text": "healthy" - } - }, - "type": "value" - } - ] - }, - { - "id": "custom.cellOptions", - "value": { - "type": "color-text" - } - } - ] - } - ] - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 24 - }, - "id": 33, - "options": { - "cellHeight": "sm", - "footer": { - "countRows": false, - "enablePagination": true, - "fields": "", - "reducer": [ - "sum" - ], - "show": false - }, - "showHeader": true, - "sortBy": [ - { - "desc": false, - "displayName": "HEALTH" - }, - { - "desc": false, - "displayName": "HOSTNAME" - }, - { - "desc": false, - "displayName": "GPU ID" - } - ] - }, - "pluginVersion": "11.2.2", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "disableTextWrap": false, - "editorMode": "builder", - "expr": "${g_metrics_prefix}gpu_health{cluster_name=\"$g_cluster_name\"}", - "fullMetaSearch": false, - "includeNullMetadata": true, - "instant": false, - "legendFormat": "__auto", - "range": true, - "refId": "A", - "useBackend": false - } - ], - "title": "All GPUs", - "transformations": [ - { - "id": "reduce", - "options": { - "labelsToFields": true, - "reducers": [ - "last" - ] - } - }, - { - "id": "groupBy", - "options": { - "fields": { - "GPU UUID": { - "aggregations": [], - "operation": "groupby" - }, - "Last": { - "aggregations": [ - "last" - ], - "operation": "aggregate" - }, - "gpu_id": { - "aggregations": [ - "last" - ], - "operation": "aggregate" - }, - "gpu_uuid": { - "aggregations": [], - "operation": "groupby" - }, - "hostname": { - "aggregations": [ - "last" - ], - "operation": "aggregate" - } - } - } - }, - { - "disabled": true, - "id": "filterFieldsByName", - "options": { - "byVariable": false, - "include": { - "names": [ - "gpu_uuid", - "gpu_id (last)", - "hostname (last)" - ] - } - } - }, - { - "id": "organize", - "options": { - "excludeByName": { - "gpu_uuid": false, - "pod": false - }, - "includeByName": {}, - "indexByName": { - "gpu_id (last)": 1, - "gpu_uuid": 2, - "hostname (last)": 0 - }, - "renameByName": { - "Last (last)": "HEALTH", - "gpu_id": "GPU ID", - "gpu_id (last)": "GPU ID", - "gpu_uuid": "GPU UUID", - "hostname": "HOSTNAME", - "hostname (last)": "HOSTNAME", - "pod": "POD" - } - } - } - ], - "type": "table" - }, - { - "gridPos": { - "h": 2, - "w": 2, - "x": 22, - "y": 24 - }, - "id": 35, - "options": { - "code": { - "language": "plaintext", - "showLineNumbers": false, - "showMiniMap": false - }, - "content": "main", - "mode": "markdown" - }, - "pluginVersion": "11.2.2", - "title": "Version", - "type": "text" - } - ], - "schemaVersion": 39, - "tags": [], - "templating": { - "list": [ - { - "current": { - "selected": false, - "text": "", - "value": "" - }, - "description": "string to prefix names of metrics queries (e.g. gpu_gfx_activity -> amd_gpu_gfx_activity)", - "hide": 2, - "label": "Metrics Prefix", - "name": "g_metrics_prefix", - "options": [ - { - "selected": true, - "text": "", - "value": "" - } - ], - "query": "", - "skipUrlSync": false, - "type": "textbox" - }, - { - "current": {}, - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "definition": "label_values(GPU_UUID)", - "hide": 2, - "includeAll": false, - "multi": false, - "name": "g_gpu_uuids", - "options": [], - "query": { - "qryType": 1, - "query": "label_values(GPU_UUID)", - "refId": "PrometheusVariableQueryEditor-VariableQuery" - }, - "refresh": 2, - "regex": "", - "skipUrlSync": false, - "sort": 0, - "type": "query" - }, - { - "current": {}, - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "definition": "label_values(job_id)", - "hide": 2, - "includeAll": false, - "multi": false, - "name": "g_job_ids", - "options": [], - "query": { - "qryType": 1, - "query": "label_values(job_id)", - "refId": "PrometheusVariableQueryEditor-VariableQuery" - }, - "refresh": 2, - "regex": "", - "skipUrlSync": false, - "sort": 0, - "type": "query" - }, - { - "current": {}, - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "definition": "label_values(cluster_name)", - "hide": 0, - "includeAll": false, - "label": "Cluster", - "multi": false, - "name": "g_cluster_name", - "options": [], - "query": { - "qryType": 1, - "query": "label_values(cluster_name)", - "refId": "PrometheusVariableQueryEditor-VariableQuery" - }, - "refresh": 2, - "regex": "", - "skipUrlSync": false, - "sort": 7, - "type": "query" - } - ] - }, - "time": { - "from": "now-24h", - "to": "now" - }, - "timepicker": {}, - "timezone": "browser", - "title": "Overview", - "uid": "fe0ayyeeoxvy8f", - "version": 1, - "weekStart": "" -} \ No newline at end of file From 89079816907093f9f6dd6dcd2a787803df2519a1 Mon Sep 17 00:00:00 2001 From: Titus Ou Date: Wed, 27 Aug 2025 14:53:49 -0700 Subject: [PATCH 2/2] point to main branch --- grafana/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/grafana/README.md b/grafana/README.md index ce0e125e..1f2ca896 100644 --- a/grafana/README.md +++ b/grafana/README.md @@ -1,3 +1,3 @@ # Grafana Dashboards -Grafana dashboards can be found in the [ROCm/device-metrics-exporter](https://github.com/ROCm/device-metrics-exporter) repository under the [grafana](https://github.com/ROCm/device-metrics-exporter/tree/release-v1.4.0/grafana) directory. +Grafana dashboards can be found in the [ROCm/device-metrics-exporter](https://github.com/ROCm/device-metrics-exporter) repository under the [grafana](https://github.com/ROCm/device-metrics-exporter/tree/main/grafana) directory.