Skip to content

Commit 42692bd

Browse files
sriram-30sajmera-pensando
authored and committed
Device Plugin e2e for homogeneous/heterogeneous with single/mixed strategy
1 parent 5aaffe5 commit 42692bd

File tree

4 files changed

+251
-18
lines changed

4 files changed

+251
-18
lines changed

tests/e2e/cluster_test.go

Lines changed: 240 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1051,7 +1051,7 @@ func (s *E2ESuite) TestWorkloadRequestedGPUs(c *C) {
10511051
s.verifyDeviceConfigStatus(devCfg, c)
10521052
s.verifyNodeGPULabel(devCfg, c)
10531053

1054-
ret, err := utils.GetAMDGPUCount(ctx, s.clientSet)
1054+
ret, err := utils.GetAMDGPUCount(ctx, s.clientSet, "gpu")
10551055
if err != nil {
10561056
logger.Errorf("error: %v", err)
10571057
}
@@ -1078,7 +1078,7 @@ func (s *E2ESuite) TestWorkloadRequestedGPUs(c *C) {
10781078
err = utils.DeployRocmPods(context.TODO(), s.clientSet, res)
10791079
assert.NoError(c, err, "failed to deploy pods")
10801080
s.verifyROCMPOD(true, c)
1081-
err = utils.VerifyROCMPODResourceCount(ctx, s.clientSet, gpuReqCount)
1081+
err = utils.VerifyROCMPODResourceCount(ctx, s.clientSet, gpuReqCount, "gpu")
10821082
assert.NoError(c, err, fmt.Sprintf("%v", err))
10831083

10841084
// delete
@@ -1092,6 +1092,244 @@ func (s *E2ESuite) TestWorkloadRequestedGPUs(c *C) {
10921092
assert.NoError(c, err, "failed to reboot nodes")
10931093
}
10941094

1095+
func (s *E2ESuite) TestWorkloadRequestedGPUsHomogeneousSingle(c *C) {
1096+
if s.simEnable {
1097+
c.Skip("Skipping for non amd gpu testbed")
1098+
}
1099+
if !dcmImageDefined {
1100+
c.Skip("skip DCM test because E2E_DCM_IMAGE is not defined")
1101+
}
1102+
1103+
s.configMapHelper(c)
1104+
1105+
logger.Infof("Add node label after pod comes up")
1106+
time.Sleep(30 * time.Second)
1107+
1108+
nodes := utils.GetAMDGpuWorker(s.clientSet, s.openshift)
1109+
nodeNames := make([]string, 0)
1110+
for _, node := range nodes {
1111+
nodeNames = append(nodeNames, node.Name)
1112+
}
1113+
for _, nodeName := range nodeNames {
1114+
s.addRemoveNodeLabels(nodeName, "e2e_profile2")
1115+
}
1116+
1117+
logs := s.getLogs()
1118+
if strings.Contains(logs, "Partition completed successfully") && (!strings.Contains(logs, "ERROR")) && (s.eventHelper("SuccessfullyPartitioned", "Normal")) {
1119+
logger.Infof("Successfully tested homogenous default partitioning")
1120+
} else {
1121+
logger.Errorf("Failure test homogenous partitioning")
1122+
}
1123+
devCfgDcm := s.getDeviceConfigForDCM(c)
1124+
s.deleteDeviceConfig(devCfgDcm, c)
1125+
1126+
time.Sleep(60 * time.Second)
1127+
1128+
ctx := context.TODO()
1129+
logger.Infof("create %v", s.cfgName)
1130+
devCfg := s.getDeviceConfig(c)
1131+
driverEnable := false
1132+
devCfg.Spec.Driver.Enable = &driverEnable
1133+
s.createDeviceConfig(devCfg, c)
1134+
s.checkNFDWorkerStatus(s.ns, c, "")
1135+
s.checkNodeLabellerStatus(s.ns, c, devCfg)
1136+
s.verifyDeviceConfigStatus(devCfg, c)
1137+
s.verifyNodeGPULabel(devCfg, c)
1138+
1139+
ret, err := utils.GetAMDGPUCount(ctx, s.clientSet, "gpu")
1140+
if err != nil {
1141+
logger.Errorf("error: %v", err)
1142+
}
1143+
var minGPU int = 10000
1144+
for _, v := range ret {
1145+
if v < minGPU {
1146+
minGPU = v
1147+
}
1148+
}
1149+
assert.Greater(c, minGPU, 0, "did not find any server with amd gpu")
1150+
1151+
gpuLimitCount := minGPU
1152+
gpuReqCount := minGPU
1153+
1154+
res := &v1.ResourceRequirements{
1155+
Limits: v1.ResourceList{
1156+
"amd.com/gpu": resource.MustParse(fmt.Sprintf("%d", gpuLimitCount)),
1157+
},
1158+
Requests: v1.ResourceList{
1159+
"amd.com/gpu": resource.MustParse(fmt.Sprintf("%d", gpuReqCount)),
1160+
},
1161+
}
1162+
1163+
err = utils.DeployRocmPods(context.TODO(), s.clientSet, res)
1164+
assert.NoError(c, err, "failed to deploy pods")
1165+
err = utils.VerifyROCMPODResourceCount(ctx, s.clientSet, gpuReqCount, "gpu")
1166+
assert.NoError(c, err, fmt.Sprintf("%v", err))
1167+
1168+
// delete
1169+
s.deleteDeviceConfig(devCfg, c)
1170+
1171+
err = utils.DelRocmPods(context.TODO(), s.clientSet)
1172+
assert.NoError(c, err, "failed to remove rocm pods")
1173+
}
1174+
1175+
func (s *E2ESuite) TestWorkloadRequestedGPUsHomogeneousMixed(c *C) {
1176+
if s.simEnable {
1177+
c.Skip("Skipping for non amd gpu testbed")
1178+
}
1179+
if !dcmImageDefined {
1180+
c.Skip("skip DCM test because E2E_DCM_IMAGE is not defined")
1181+
}
1182+
1183+
s.configMapHelper(c)
1184+
1185+
logger.Infof("Add node label after pod comes up")
1186+
time.Sleep(30 * time.Second)
1187+
1188+
nodes := utils.GetAMDGpuWorker(s.clientSet, s.openshift)
1189+
nodeNames := make([]string, 0)
1190+
for _, node := range nodes {
1191+
nodeNames = append(nodeNames, node.Name)
1192+
}
1193+
for _, nodeName := range nodeNames {
1194+
s.addRemoveNodeLabels(nodeName, "e2e_profile2")
1195+
}
1196+
1197+
logs := s.getLogs()
1198+
if strings.Contains(logs, "Partition completed successfully") && (!strings.Contains(logs, "ERROR")) && (s.eventHelper("SuccessfullyPartitioned", "Normal")) {
1199+
logger.Infof("Successfully tested homogeneous partitioning")
1200+
} else {
1201+
logger.Errorf("Failure test homogeneous partitioning")
1202+
}
1203+
devCfgDcm := s.getDeviceConfigForDCM(c)
1204+
s.deleteDeviceConfig(devCfgDcm, c)
1205+
time.Sleep(60 * time.Second)
1206+
ctx := context.TODO()
1207+
logger.Infof("create %v", s.cfgName)
1208+
devCfg := s.getDeviceConfig(c)
1209+
driverEnable := false
1210+
devCfg.Spec.Driver.Enable = &driverEnable
1211+
devCfg.Spec.DevicePlugin.DevicePluginArguments = map[string]string{"resource_naming_strategy": "mixed"}
1212+
s.createDeviceConfig(devCfg, c)
1213+
s.checkNFDWorkerStatus(s.ns, c, "")
1214+
s.checkNodeLabellerStatus(s.ns, c, devCfg)
1215+
s.verifyDeviceConfigStatus(devCfg, c)
1216+
1217+
ret, err := utils.GetAMDGPUCount(ctx, s.clientSet, "cpx_nps4")
1218+
if err != nil {
1219+
logger.Errorf("error: %v", err)
1220+
}
1221+
var minGPU int = 10000
1222+
for _, v := range ret {
1223+
if v < minGPU {
1224+
minGPU = v
1225+
}
1226+
}
1227+
assert.Greater(c, minGPU, 0, "did not find any server with amd gpu")
1228+
1229+
gpuLimitCount := minGPU
1230+
gpuReqCount := minGPU
1231+
1232+
res := &v1.ResourceRequirements{
1233+
Limits: v1.ResourceList{
1234+
"amd.com/cpx_nps4": resource.MustParse(fmt.Sprintf("%d", gpuLimitCount)),
1235+
},
1236+
Requests: v1.ResourceList{
1237+
"amd.com/cpx_nps4": resource.MustParse(fmt.Sprintf("%d", gpuReqCount)),
1238+
},
1239+
}
1240+
1241+
err = utils.DeployRocmPods(context.TODO(), s.clientSet, res)
1242+
assert.NoError(c, err, "failed to deploy pods")
1243+
err = utils.VerifyROCMPODResourceCount(ctx, s.clientSet, gpuReqCount, "cpx_nps4")
1244+
assert.NoError(c, err, fmt.Sprintf("%v", err))
1245+
1246+
// delete
1247+
s.deleteDeviceConfig(devCfg, c)
1248+
1249+
err = utils.DelRocmPods(context.TODO(), s.clientSet)
1250+
assert.NoError(c, err, "failed to remove rocm pods")
1251+
1252+
}
1253+
1254+
func (s *E2ESuite) TestWorkloadRequestedGPUsHeterogeneousMixed(c *C) {
1255+
if s.simEnable {
1256+
c.Skip("Skipping for non amd gpu testbed")
1257+
}
1258+
if !dcmImageDefined {
1259+
c.Skip("skip DCM test because E2E_DCM_IMAGE is not defined")
1260+
}
1261+
1262+
s.configMapHelper(c)
1263+
1264+
logger.Infof("Add node label after pod comes up")
1265+
time.Sleep(30 * time.Second)
1266+
1267+
nodes := utils.GetAMDGpuWorker(s.clientSet, s.openshift)
1268+
nodeNames := make([]string, 0)
1269+
for _, node := range nodes {
1270+
nodeNames = append(nodeNames, node.Name)
1271+
}
1272+
for _, nodeName := range nodeNames {
1273+
s.addRemoveNodeLabels(nodeName, "e2e_profile1")
1274+
}
1275+
1276+
logs := s.getLogs()
1277+
if strings.Contains(logs, "Partition completed successfully") && (!strings.Contains(logs, "ERROR")) && (s.eventHelper("SuccessfullyPartitioned", "Normal")) {
1278+
logger.Infof("Successfully tested homogeneous partitioning")
1279+
} else {
1280+
logger.Errorf("Failure test heterogenous partitioning")
1281+
}
1282+
devCfgDcm := s.getDeviceConfigForDCM(c)
1283+
s.deleteDeviceConfig(devCfgDcm, c)
1284+
time.Sleep(60 * time.Second)
1285+
1286+
ctx := context.TODO()
1287+
logger.Infof("create %v", s.cfgName)
1288+
devCfg := s.getDeviceConfig(c)
1289+
driverEnable := false
1290+
devCfg.Spec.Driver.Enable = &driverEnable
1291+
devCfg.Spec.DevicePlugin.DevicePluginArguments = map[string]string{"resource_naming_strategy": "mixed"}
1292+
s.createDeviceConfig(devCfg, c)
1293+
s.checkNFDWorkerStatus(s.ns, c, "")
1294+
s.checkNodeLabellerStatus(s.ns, c, devCfg)
1295+
s.verifyDeviceConfigStatus(devCfg, c)
1296+
1297+
ret, err := utils.GetAMDGPUCount(ctx, s.clientSet, "cpx_nps1")
1298+
if err != nil {
1299+
logger.Errorf("error: %v", err)
1300+
}
1301+
var minGPU int = 10000
1302+
for _, v := range ret {
1303+
if v < minGPU {
1304+
minGPU = v
1305+
}
1306+
}
1307+
assert.Greater(c, minGPU, 0, "did not find any server with amd gpu")
1308+
1309+
gpuLimitCount := minGPU
1310+
gpuReqCount := minGPU
1311+
1312+
res := &v1.ResourceRequirements{
1313+
Limits: v1.ResourceList{
1314+
"amd.com/cpx_nps1": resource.MustParse(fmt.Sprintf("%d", gpuLimitCount)),
1315+
},
1316+
Requests: v1.ResourceList{
1317+
"amd.com/cpx_nps1": resource.MustParse(fmt.Sprintf("%d", gpuReqCount)),
1318+
},
1319+
}
1320+
1321+
err = utils.DeployRocmPods(context.TODO(), s.clientSet, res)
1322+
assert.NoError(c, err, "failed to deploy pods")
1323+
err = utils.VerifyROCMPODResourceCount(ctx, s.clientSet, gpuReqCount, "cpx_nps1")
1324+
assert.NoError(c, err, fmt.Sprintf("%v", err))
1325+
1326+
// delete
1327+
s.deleteDeviceConfig(devCfg, c)
1328+
1329+
err = utils.DelRocmPods(context.TODO(), s.clientSet)
1330+
assert.NoError(c, err, "failed to remove rocm pods")
1331+
}
1332+
10951333
func (s *E2ESuite) TestKubeRbacProxyClusterIP(c *C) {
10961334
_, err := s.dClient.DeviceConfigs(s.ns).Get("deviceconfig-kuberbac-clusterip", metav1.GetOptions{})
10971335
assert.Errorf(c, err, "config deviceconfig-kuberbac-clusterip exists")

tests/e2e/dcm_e2e_test.go

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -72,7 +72,7 @@ func (s *E2ESuite) addRemoveNodeLabels(nodeName string, selectedProfile string)
7272
logger.Infof("Error adding node lbels: %s\n", err.Error())
7373
return
7474
}
75-
time.Sleep(15 * time.Second)
75+
time.Sleep(45 * time.Second)
7676
// Allow partition to happen
7777
err = utils.DeleteNodeLabel(s.clientSet, nodeName, "dcm.amd.com/gpu-config-profile")
7878
_ = utils.DeleteNodeLabel(s.clientSet, nodeName, "dcm.amd.com/apply-gpu-config-profile")
@@ -269,6 +269,7 @@ func (s *E2ESuite) createConfigMap() GPUConfigProfiles {
269269
{
270270
ComputePartition: "CPX",
271271
MemoryPartition: "NPS4",
272+
NumGPUsAssigned: 1,
272273
},
273274
}
274275

tests/e2e/testrunner_test.go

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -200,7 +200,7 @@ func (s *E2ESuite) createTestRunnerConfigmap(valid bool, devCfg *v1alpha1.Device
200200
}
201201

202202
func (s *E2ESuite) scheduleWorkloadOnNodeWithMaxGPUs(c *C) string {
203-
ret, err := utils.GetAMDGPUCount(context.TODO(), s.clientSet)
203+
ret, err := utils.GetAMDGPUCount(context.TODO(), s.clientSet, "gpu")
204204
if err != nil {
205205
logger.Errorf("error: %v", err)
206206
}
@@ -228,7 +228,7 @@ func (s *E2ESuite) scheduleWorkloadOnNodeWithMaxGPUs(c *C) string {
228228

229229
err = utils.DeployRocmPods(context.TODO(), s.clientSet, res)
230230
assert.NoError(c, err, "failed to deploy pods")
231-
err = utils.VerifyROCMPODResourceCount(context.TODO(), s.clientSet, gpuReqCount)
231+
err = utils.VerifyROCMPODResourceCount(context.TODO(), s.clientSet, gpuReqCount, "gpu")
232232
assert.NoError(c, err, fmt.Sprintf("%v", err))
233233

234234
return nodeWithMaxGPU
@@ -730,7 +730,7 @@ func (s *E2ESuite) TestTestRunnerLogsExport(c *C) {
730730

731731
func (s *E2ESuite) getGPUNodeName() (nodeWithMaxGPU string) {
732732
var maxPerNodeGPU int = 0
733-
ret, err := utils.GetAMDGPUCount(context.TODO(), s.clientSet)
733+
ret, err := utils.GetAMDGPUCount(context.TODO(), s.clientSet, "gpu")
734734
if err != nil {
735735
logger.Printf("Unable to fetch gpu nodes. Error %v", err)
736736
return

tests/e2e/utils/utils.go

Lines changed: 6 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -598,14 +598,6 @@ func GetWorkerNodes(cl *kubernetes.Clientset) []*v1.Node {
598598
func GetAMDGpuWorker(cl *kubernetes.Clientset, isOpenshift bool) []v1.Node {
599599
ret := make([]v1.Node, 0)
600600
labelSelector := labels.NewSelector()
601-
if !isOpenshift {
602-
r, _ := labels.NewRequirement(
603-
"node-role.kubernetes.io/control-plane",
604-
selection.DoesNotExist,
605-
nil,
606-
)
607-
labelSelector = labelSelector.Add(*r)
608-
}
609601
r, _ := labels.NewRequirement(
610602
"feature.node.kubernetes.io/amd-gpu",
611603
selection.Equals,
@@ -766,7 +758,7 @@ func DelRocmPodsByNodeNames(ctx context.Context, cl *kubernetes.Clientset,
766758

767759
}
768760

769-
func GetAMDGPUCount(ctx context.Context, cl *kubernetes.Clientset) (map[string]int, error) {
761+
func GetAMDGPUCount(ctx context.Context, cl *kubernetes.Clientset, resourceType string) (map[string]int, error) {
770762

771763
ret := make(map[string]int)
772764
// Get the list of nodes
@@ -777,7 +769,8 @@ func GetAMDGPUCount(ctx context.Context, cl *kubernetes.Clientset) (map[string]i
777769

778770
// Iterate over the nodes and count AMD GPUs
779771
for _, node := range nodes.Items {
780-
if val, ok := node.Status.Capacity["amd.com/gpu"]; ok {
772+
resourceKey := v1.ResourceName("amd.com/" + resourceType)
773+
if val, ok := node.Status.Capacity[resourceKey]; ok {
781774
num, err := strconv.ParseInt(val.String(), 10, 64)
782775
if err != nil {
783776
log.Infof("error: %v", err)
@@ -790,7 +783,7 @@ func GetAMDGPUCount(ctx context.Context, cl *kubernetes.Clientset) (map[string]i
790783
}
791784

792785
func VerifyROCMPODResourceCount(ctx context.Context, cl *kubernetes.Clientset,
793-
gpuReqCount int) error {
786+
gpuReqCount int, resourceType string) error {
794787

795788
its, err := cl.CoreV1().Pods("").List(ctx,
796789
metav1.ListOptions{
@@ -805,7 +798,8 @@ func VerifyROCMPODResourceCount(ctx context.Context, cl *kubernetes.Clientset,
805798
continue
806799
}
807800

808-
if gpu, ok := cntr.Resources.Requests["amd.com/gpu"]; ok {
801+
resourceKey := v1.ResourceName("amd.com/" + resourceType)
802+
if gpu, ok := cntr.Resources.Requests[resourceKey]; ok {
809803
gpuAssignedCount := int(gpu.Value())
810804
if gpuReqCount < gpuAssignedCount {
811805
return fmt.Errorf("gpu requested %d got %d",

0 commit comments

Comments (0)