@@ -1051,7 +1051,7 @@ func (s *E2ESuite) TestWorkloadRequestedGPUs(c *C) {
1051
1051
s .verifyDeviceConfigStatus (devCfg , c )
1052
1052
s .verifyNodeGPULabel (devCfg , c )
1053
1053
1054
- ret , err := utils .GetAMDGPUCount (ctx , s .clientSet )
1054
+ ret , err := utils .GetAMDGPUCount (ctx , s .clientSet , "gpu" )
1055
1055
if err != nil {
1056
1056
logger .Errorf ("error: %v" , err )
1057
1057
}
@@ -1078,7 +1078,7 @@ func (s *E2ESuite) TestWorkloadRequestedGPUs(c *C) {
1078
1078
err = utils .DeployRocmPods (context .TODO (), s .clientSet , res )
1079
1079
assert .NoError (c , err , "failed to deploy pods" )
1080
1080
s .verifyROCMPOD (true , c )
1081
- err = utils .VerifyROCMPODResourceCount (ctx , s .clientSet , gpuReqCount )
1081
+ err = utils .VerifyROCMPODResourceCount (ctx , s .clientSet , gpuReqCount , "gpu" )
1082
1082
assert .NoError (c , err , fmt .Sprintf ("%v" , err ))
1083
1083
1084
1084
// delete
@@ -1092,6 +1092,244 @@ func (s *E2ESuite) TestWorkloadRequestedGPUs(c *C) {
1092
1092
assert .NoError (c , err , "failed to reboot nodes" )
1093
1093
}
1094
1094
1095
+ func (s * E2ESuite ) TestWorkloadRequestedGPUsHomogeneousSingle (c * C ) {
1096
+ if s .simEnable {
1097
+ c .Skip ("Skipping for non amd gpu testbed" )
1098
+ }
1099
+ if ! dcmImageDefined {
1100
+ c .Skip ("skip DCM test because E2E_DCM_IMAGE is not defined" )
1101
+ }
1102
+
1103
+ s .configMapHelper (c )
1104
+
1105
+ logger .Infof ("Add node label after pod comes up" )
1106
+ time .Sleep (30 * time .Second )
1107
+
1108
+ nodes := utils .GetAMDGpuWorker (s .clientSet , s .openshift )
1109
+ nodeNames := make ([]string , 0 )
1110
+ for _ , node := range nodes {
1111
+ nodeNames = append (nodeNames , node .Name )
1112
+ }
1113
+ for _ , nodeName := range nodeNames {
1114
+ s .addRemoveNodeLabels (nodeName , "e2e_profile2" )
1115
+ }
1116
+
1117
+ logs := s .getLogs ()
1118
+ if strings .Contains (logs , "Partition completed successfully" ) && (! strings .Contains (logs , "ERROR" )) && (s .eventHelper ("SuccessfullyPartitioned" , "Normal" )) {
1119
+ logger .Infof ("Successfully tested homogenous default partitioning" )
1120
+ } else {
1121
+ logger .Errorf ("Failure test homogenous partitioning" )
1122
+ }
1123
+ devCfgDcm := s .getDeviceConfigForDCM (c )
1124
+ s .deleteDeviceConfig (devCfgDcm , c )
1125
+
1126
+ time .Sleep (60 * time .Second )
1127
+
1128
+ ctx := context .TODO ()
1129
+ logger .Infof ("create %v" , s .cfgName )
1130
+ devCfg := s .getDeviceConfig (c )
1131
+ driverEnable := false
1132
+ devCfg .Spec .Driver .Enable = & driverEnable
1133
+ s .createDeviceConfig (devCfg , c )
1134
+ s .checkNFDWorkerStatus (s .ns , c , "" )
1135
+ s .checkNodeLabellerStatus (s .ns , c , devCfg )
1136
+ s .verifyDeviceConfigStatus (devCfg , c )
1137
+ s .verifyNodeGPULabel (devCfg , c )
1138
+
1139
+ ret , err := utils .GetAMDGPUCount (ctx , s .clientSet , "gpu" )
1140
+ if err != nil {
1141
+ logger .Errorf ("error: %v" , err )
1142
+ }
1143
+ var minGPU int = 10000
1144
+ for _ , v := range ret {
1145
+ if v < minGPU {
1146
+ minGPU = v
1147
+ }
1148
+ }
1149
+ assert .Greater (c , minGPU , 0 , "did not find any server with amd gpu" )
1150
+
1151
+ gpuLimitCount := minGPU
1152
+ gpuReqCount := minGPU
1153
+
1154
+ res := & v1.ResourceRequirements {
1155
+ Limits : v1.ResourceList {
1156
+ "amd.com/gpu" : resource .MustParse (fmt .Sprintf ("%d" , gpuLimitCount )),
1157
+ },
1158
+ Requests : v1.ResourceList {
1159
+ "amd.com/gpu" : resource .MustParse (fmt .Sprintf ("%d" , gpuReqCount )),
1160
+ },
1161
+ }
1162
+
1163
+ err = utils .DeployRocmPods (context .TODO (), s .clientSet , res )
1164
+ assert .NoError (c , err , "failed to deploy pods" )
1165
+ err = utils .VerifyROCMPODResourceCount (ctx , s .clientSet , gpuReqCount , "gpu" )
1166
+ assert .NoError (c , err , fmt .Sprintf ("%v" , err ))
1167
+
1168
+ // delete
1169
+ s .deleteDeviceConfig (devCfg , c )
1170
+
1171
+ err = utils .DelRocmPods (context .TODO (), s .clientSet )
1172
+ assert .NoError (c , err , "failed to remove rocm pods" )
1173
+ }
1174
+
1175
+ func (s * E2ESuite ) TestWorkloadRequestedGPUsHomogeneousMixed (c * C ) {
1176
+ if s .simEnable {
1177
+ c .Skip ("Skipping for non amd gpu testbed" )
1178
+ }
1179
+ if ! dcmImageDefined {
1180
+ c .Skip ("skip DCM test because E2E_DCM_IMAGE is not defined" )
1181
+ }
1182
+
1183
+ s .configMapHelper (c )
1184
+
1185
+ logger .Infof ("Add node label after pod comes up" )
1186
+ time .Sleep (30 * time .Second )
1187
+
1188
+ nodes := utils .GetAMDGpuWorker (s .clientSet , s .openshift )
1189
+ nodeNames := make ([]string , 0 )
1190
+ for _ , node := range nodes {
1191
+ nodeNames = append (nodeNames , node .Name )
1192
+ }
1193
+ for _ , nodeName := range nodeNames {
1194
+ s .addRemoveNodeLabels (nodeName , "e2e_profile2" )
1195
+ }
1196
+
1197
+ logs := s .getLogs ()
1198
+ if strings .Contains (logs , "Partition completed successfully" ) && (! strings .Contains (logs , "ERROR" )) && (s .eventHelper ("SuccessfullyPartitioned" , "Normal" )) {
1199
+ logger .Infof ("Successfully tested homogeneous partitioning" )
1200
+ } else {
1201
+ logger .Errorf ("Failure test homogeneous partitioning" )
1202
+ }
1203
+ devCfgDcm := s .getDeviceConfigForDCM (c )
1204
+ s .deleteDeviceConfig (devCfgDcm , c )
1205
+ time .Sleep (60 * time .Second )
1206
+ ctx := context .TODO ()
1207
+ logger .Infof ("create %v" , s .cfgName )
1208
+ devCfg := s .getDeviceConfig (c )
1209
+ driverEnable := false
1210
+ devCfg .Spec .Driver .Enable = & driverEnable
1211
+ devCfg .Spec .DevicePlugin .DevicePluginArguments = map [string ]string {"resource_naming_strategy" : "mixed" }
1212
+ s .createDeviceConfig (devCfg , c )
1213
+ s .checkNFDWorkerStatus (s .ns , c , "" )
1214
+ s .checkNodeLabellerStatus (s .ns , c , devCfg )
1215
+ s .verifyDeviceConfigStatus (devCfg , c )
1216
+
1217
+ ret , err := utils .GetAMDGPUCount (ctx , s .clientSet , "cpx_nps4" )
1218
+ if err != nil {
1219
+ logger .Errorf ("error: %v" , err )
1220
+ }
1221
+ var minGPU int = 10000
1222
+ for _ , v := range ret {
1223
+ if v < minGPU {
1224
+ minGPU = v
1225
+ }
1226
+ }
1227
+ assert .Greater (c , minGPU , 0 , "did not find any server with amd gpu" )
1228
+
1229
+ gpuLimitCount := minGPU
1230
+ gpuReqCount := minGPU
1231
+
1232
+ res := & v1.ResourceRequirements {
1233
+ Limits : v1.ResourceList {
1234
+ "amd.com/cpx_nps4" : resource .MustParse (fmt .Sprintf ("%d" , gpuLimitCount )),
1235
+ },
1236
+ Requests : v1.ResourceList {
1237
+ "amd.com/cpx_nps4" : resource .MustParse (fmt .Sprintf ("%d" , gpuReqCount )),
1238
+ },
1239
+ }
1240
+
1241
+ err = utils .DeployRocmPods (context .TODO (), s .clientSet , res )
1242
+ assert .NoError (c , err , "failed to deploy pods" )
1243
+ err = utils .VerifyROCMPODResourceCount (ctx , s .clientSet , gpuReqCount , "cpx_nps4" )
1244
+ assert .NoError (c , err , fmt .Sprintf ("%v" , err ))
1245
+
1246
+ // delete
1247
+ s .deleteDeviceConfig (devCfg , c )
1248
+
1249
+ err = utils .DelRocmPods (context .TODO (), s .clientSet )
1250
+ assert .NoError (c , err , "failed to remove rocm pods" )
1251
+
1252
+ }
1253
+
1254
+ func (s * E2ESuite ) TestWorkloadRequestedGPUsHeterogeneousMixed (c * C ) {
1255
+ if s .simEnable {
1256
+ c .Skip ("Skipping for non amd gpu testbed" )
1257
+ }
1258
+ if ! dcmImageDefined {
1259
+ c .Skip ("skip DCM test because E2E_DCM_IMAGE is not defined" )
1260
+ }
1261
+
1262
+ s .configMapHelper (c )
1263
+
1264
+ logger .Infof ("Add node label after pod comes up" )
1265
+ time .Sleep (30 * time .Second )
1266
+
1267
+ nodes := utils .GetAMDGpuWorker (s .clientSet , s .openshift )
1268
+ nodeNames := make ([]string , 0 )
1269
+ for _ , node := range nodes {
1270
+ nodeNames = append (nodeNames , node .Name )
1271
+ }
1272
+ for _ , nodeName := range nodeNames {
1273
+ s .addRemoveNodeLabels (nodeName , "e2e_profile1" )
1274
+ }
1275
+
1276
+ logs := s .getLogs ()
1277
+ if strings .Contains (logs , "Partition completed successfully" ) && (! strings .Contains (logs , "ERROR" )) && (s .eventHelper ("SuccessfullyPartitioned" , "Normal" )) {
1278
+ logger .Infof ("Successfully tested homogeneous partitioning" )
1279
+ } else {
1280
+ logger .Errorf ("Failure test heterogenous partitioning" )
1281
+ }
1282
+ devCfgDcm := s .getDeviceConfigForDCM (c )
1283
+ s .deleteDeviceConfig (devCfgDcm , c )
1284
+ time .Sleep (60 * time .Second )
1285
+
1286
+ ctx := context .TODO ()
1287
+ logger .Infof ("create %v" , s .cfgName )
1288
+ devCfg := s .getDeviceConfig (c )
1289
+ driverEnable := false
1290
+ devCfg .Spec .Driver .Enable = & driverEnable
1291
+ devCfg .Spec .DevicePlugin .DevicePluginArguments = map [string ]string {"resource_naming_strategy" : "mixed" }
1292
+ s .createDeviceConfig (devCfg , c )
1293
+ s .checkNFDWorkerStatus (s .ns , c , "" )
1294
+ s .checkNodeLabellerStatus (s .ns , c , devCfg )
1295
+ s .verifyDeviceConfigStatus (devCfg , c )
1296
+
1297
+ ret , err := utils .GetAMDGPUCount (ctx , s .clientSet , "cpx_nps1" )
1298
+ if err != nil {
1299
+ logger .Errorf ("error: %v" , err )
1300
+ }
1301
+ var minGPU int = 10000
1302
+ for _ , v := range ret {
1303
+ if v < minGPU {
1304
+ minGPU = v
1305
+ }
1306
+ }
1307
+ assert .Greater (c , minGPU , 0 , "did not find any server with amd gpu" )
1308
+
1309
+ gpuLimitCount := minGPU
1310
+ gpuReqCount := minGPU
1311
+
1312
+ res := & v1.ResourceRequirements {
1313
+ Limits : v1.ResourceList {
1314
+ "amd.com/cpx_nps1" : resource .MustParse (fmt .Sprintf ("%d" , gpuLimitCount )),
1315
+ },
1316
+ Requests : v1.ResourceList {
1317
+ "amd.com/cpx_nps1" : resource .MustParse (fmt .Sprintf ("%d" , gpuReqCount )),
1318
+ },
1319
+ }
1320
+
1321
+ err = utils .DeployRocmPods (context .TODO (), s .clientSet , res )
1322
+ assert .NoError (c , err , "failed to deploy pods" )
1323
+ err = utils .VerifyROCMPODResourceCount (ctx , s .clientSet , gpuReqCount , "cpx_nps1" )
1324
+ assert .NoError (c , err , fmt .Sprintf ("%v" , err ))
1325
+
1326
+ // delete
1327
+ s .deleteDeviceConfig (devCfg , c )
1328
+
1329
+ err = utils .DelRocmPods (context .TODO (), s .clientSet )
1330
+ assert .NoError (c , err , "failed to remove rocm pods" )
1331
+ }
1332
+
1095
1333
func (s * E2ESuite ) TestKubeRbacProxyClusterIP (c * C ) {
1096
1334
_ , err := s .dClient .DeviceConfigs (s .ns ).Get ("deviceconfig-kuberbac-clusterip" , metav1.GetOptions {})
1097
1335
assert .Errorf (c , err , "config deviceconfig-kuberbac-clusterip exists" )
0 commit comments