Skip to content

Commit 69278a2

Browse files
spraveenio authored and sajmera-pensando committed
update examples
1 parent c15ff0c commit 69278a2

File tree

2 files changed

+67
-9
lines changed

2 files changed

+67
-9
lines changed

example/metricsExporter/config.json

Lines changed: 33 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -27,7 +27,7 @@
2727
"PCIE_NAC_RECEIVED_COUNT",
2828
"GPU_CLOCK",
2929
"GPU_POWER_USAGE",
30-
"GPU_TOTAL_MEMORY",
30+
"GPU_TOTAL_VRAM",
3131
"GPU_ECC_CORRECT_TOTAL",
3232
"GPU_ECC_UNCORRECT_TOTAL",
3333
"GPU_ECC_CORRECT_SDMA",
@@ -89,7 +89,10 @@
8989
"GPU_ECC_CORRECT_IH",
9090
"GPU_ECC_UNCORRECT_IH",
9191
"GPU_ECC_CORRECT_MPIO",
92-
"GPU_ECC_UNCORRECT_MPIO"
92+
"GPU_ECC_UNCORRECT_MPIO",
93+
"GPU_HEALTH",
94+
"GPU_XGMI_LINK_RX",
95+
"GPU_XGMI_LINK_TX"
9396
],
9497
"Labels": [
9598
"GPU_UUID",
@@ -107,7 +110,33 @@
107110
"CARD_VENDOR",
108111
"DRIVER_VERSION",
109112
"VBIOS_VERSION",
110-
"HOSTNAME"
111-
]
113+
"HOSTNAME",
114+
"GPU_PARTITION_ID",
115+
"GPU_COMPUTE_PARTITION_TYPE"
116+
],
117+
"HealthThresholds" : {
118+
"GPU_ECC_UNCORRECT_SDMA" : 0,
119+
"GPU_ECC_UNCORRECT_GFX" : 0,
120+
"GPU_ECC_UNCORRECT_MMHUB" : 0,
121+
"GPU_ECC_UNCORRECT_ATHUB" : 0,
122+
"GPU_ECC_UNCORRECT_BIF" : 0,
123+
"GPU_ECC_UNCORRECT_HDP" : 0,
124+
"GPU_ECC_UNCORRECT_XGMI_WAFL" : 0,
125+
"GPU_ECC_UNCORRECT_DF" : 0,
126+
"GPU_ECC_UNCORRECT_SMN" : 0,
127+
"GPU_ECC_UNCORRECT_SEM" : 0,
128+
"GPU_ECC_UNCORRECT_MP0" : 0,
129+
"GPU_ECC_UNCORRECT_MP1" : 0,
130+
"GPU_ECC_UNCORRECT_FUSE" : 0,
131+
"GPU_ECC_UNCORRECT_UMC" : 0,
132+
"GPU_ECC_UNCORRECT_MCA" : 0,
133+
"GPU_ECC_UNCORRECT_VCN" : 0,
134+
"GPU_ECC_UNCORRECT_JPEG" : 0,
135+
"GPU_ECC_UNCORRECT_IH" : 0,
136+
"GPU_ECC_UNCORRECT_MPIO" : 0
137+
},
138+
"CustomLabels" : {
139+
"CLUSTER_NAME" : "amdgpu-k8s-metrics-exporter"
140+
}
112141
}
113142
}

example/metricsExporter/configmap.yaml

Lines changed: 34 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -2,7 +2,7 @@ apiVersion: v1
22
kind: ConfigMap
33
metadata:
44
name: gpu-config
5-
namespace: kube-amd-gpu
5+
namespace: mynamespace
66
data:
77
config.json: |
88
{
@@ -34,7 +34,7 @@ data:
3434
"PCIE_NAC_RECEIVED_COUNT",
3535
"GPU_CLOCK",
3636
"GPU_POWER_USAGE",
37-
"GPU_TOTAL_VRAM",
37+
"GPU_TOTAL_MEMORY",
3838
"GPU_ECC_CORRECT_TOTAL",
3939
"GPU_ECC_UNCORRECT_TOTAL",
4040
"GPU_ECC_CORRECT_SDMA",
@@ -96,7 +96,10 @@ data:
9696
"GPU_ECC_CORRECT_IH",
9797
"GPU_ECC_UNCORRECT_IH",
9898
"GPU_ECC_CORRECT_MPIO",
99-
"GPU_ECC_UNCORRECT_MPIO"
99+
"GPU_ECC_UNCORRECT_MPIO",
100+
"GPU_HEALTH",
101+
"GPU_XGMI_LINK_RX",
102+
"GPU_XGMI_LINK_TX"
100103
],
101104
"Labels": [
102105
"GPU_UUID",
@@ -114,8 +117,34 @@ data:
114117
"CARD_VENDOR",
115118
"DRIVER_VERSION",
116119
"VBIOS_VERSION",
117-
"HOSTNAME"
118-
]
120+
"HOSTNAME",
121+
"GPU_PARTITION_ID",
122+
"GPU_COMPUTE_PARTITION_TYPE"
123+
],
124+
"HealthThresholds" : {
125+
"GPU_ECC_UNCORRECT_SDMA" : 0,
126+
"GPU_ECC_UNCORRECT_GFX" : 0,
127+
"GPU_ECC_UNCORRECT_MMHUB" : 0,
128+
"GPU_ECC_UNCORRECT_ATHUB" : 0,
129+
"GPU_ECC_UNCORRECT_BIF" : 0,
130+
"GPU_ECC_UNCORRECT_HDP" : 0,
131+
"GPU_ECC_UNCORRECT_XGMI_WAFL" : 0,
132+
"GPU_ECC_UNCORRECT_DF" : 0,
133+
"GPU_ECC_UNCORRECT_SMN" : 0,
134+
"GPU_ECC_UNCORRECT_SEM" : 0,
135+
"GPU_ECC_UNCORRECT_MP0" : 0,
136+
"GPU_ECC_UNCORRECT_MP1" : 0,
137+
"GPU_ECC_UNCORRECT_FUSE" : 0,
138+
"GPU_ECC_UNCORRECT_UMC" : 0,
139+
"GPU_ECC_UNCORRECT_MCA" : 0,
140+
"GPU_ECC_UNCORRECT_VCN" : 0,
141+
"GPU_ECC_UNCORRECT_JPEG" : 0,
142+
"GPU_ECC_UNCORRECT_IH" : 0,
143+
"GPU_ECC_UNCORRECT_MPIO" : 0
144+
},
145+
"CustomLabels" : {
146+
"CLUSTER_NAME" : "amdgpu-k8s-metrics-exporter"
147+
}
119148
}
120149
}
121150

0 commit comments

Comments (0)