diff --git a/habana/pytorch/training/buildspec-1-10-syai-1-3.yml b/.deprecated/habana/pytorch/training/buildspec-1-10-syai-1-3.yml
similarity index 100%
rename from habana/pytorch/training/buildspec-1-10-syai-1-3.yml
rename to .deprecated/habana/pytorch/training/buildspec-1-10-syai-1-3.yml
diff --git a/habana/pytorch/training/buildspec-1-10-syai-1-4.yml b/.deprecated/habana/pytorch/training/buildspec-1-10-syai-1-4.yml
similarity index 100%
rename from habana/pytorch/training/buildspec-1-10-syai-1-4.yml
rename to .deprecated/habana/pytorch/training/buildspec-1-10-syai-1-4.yml
diff --git a/habana/pytorch/training/buildspec-1-11-syai-1-5.yml b/.deprecated/habana/pytorch/training/buildspec-1-11-syai-1-5.yml
similarity index 100%
rename from habana/pytorch/training/buildspec-1-11-syai-1-5.yml
rename to .deprecated/habana/pytorch/training/buildspec-1-11-syai-1-5.yml
diff --git a/habana/pytorch/training/buildspec.yml b/.deprecated/habana/pytorch/training/buildspec.yml
similarity index 100%
rename from habana/pytorch/training/buildspec.yml
rename to .deprecated/habana/pytorch/training/buildspec.yml
diff --git a/habana/pytorch/training/docker/1.10/py3/example/Dockerfile.hpu b/.deprecated/habana/pytorch/training/docker/1.10/py3/example/Dockerfile.hpu
similarity index 100%
rename from habana/pytorch/training/docker/1.10/py3/example/Dockerfile.hpu
rename to .deprecated/habana/pytorch/training/docker/1.10/py3/example/Dockerfile.hpu
diff --git a/habana/pytorch/training/docker/1.10/py3/synapseai1.3.0/Dockerfile.hpu b/.deprecated/habana/pytorch/training/docker/1.10/py3/synapseai1.3.0/Dockerfile.hpu
similarity index 100%
rename from habana/pytorch/training/docker/1.10/py3/synapseai1.3.0/Dockerfile.hpu
rename to .deprecated/habana/pytorch/training/docker/1.10/py3/synapseai1.3.0/Dockerfile.hpu
diff --git a/habana/pytorch/training/docker/1.10/py3/synapseai1.4.1/Dockerfile.hpu b/.deprecated/habana/pytorch/training/docker/1.10/py3/synapseai1.4.1/Dockerfile.hpu
similarity index 100%
rename from habana/pytorch/training/docker/1.10/py3/synapseai1.4.1/Dockerfile.hpu
rename to .deprecated/habana/pytorch/training/docker/1.10/py3/synapseai1.4.1/Dockerfile.hpu
diff --git a/habana/pytorch/training/docker/1.11/py3/example/Dockerfile.hpu b/.deprecated/habana/pytorch/training/docker/1.11/py3/example/Dockerfile.hpu
similarity index 100%
rename from habana/pytorch/training/docker/1.11/py3/example/Dockerfile.hpu
rename to .deprecated/habana/pytorch/training/docker/1.11/py3/example/Dockerfile.hpu
diff --git a/habana/pytorch/training/docker/1.11/py3/synapseai1.5.0/Dockerfile.hpu b/.deprecated/habana/pytorch/training/docker/1.11/py3/synapseai1.5.0/Dockerfile.hpu
similarity index 100%
rename from habana/pytorch/training/docker/1.11/py3/synapseai1.5.0/Dockerfile.hpu
rename to .deprecated/habana/pytorch/training/docker/1.11/py3/synapseai1.5.0/Dockerfile.hpu
diff --git a/habana/pytorch/training/docker/1.12/py3/example/Dockerfile.hpu b/.deprecated/habana/pytorch/training/docker/1.12/py3/example/Dockerfile.hpu
similarity index 100%
rename from habana/pytorch/training/docker/1.12/py3/example/Dockerfile.hpu
rename to .deprecated/habana/pytorch/training/docker/1.12/py3/example/Dockerfile.hpu
diff --git a/habana/pytorch/training/docker/1.12/py3/synapseai1.6.0/Dockerfile.hpu b/.deprecated/habana/pytorch/training/docker/1.12/py3/synapseai1.6.0/Dockerfile.hpu
similarity index 100%
rename from habana/pytorch/training/docker/1.12/py3/synapseai1.6.0/Dockerfile.hpu
rename to .deprecated/habana/pytorch/training/docker/1.12/py3/synapseai1.6.0/Dockerfile.hpu
diff --git a/habana/pytorch/training/docker/build_artifacts/__init__.py b/.deprecated/habana/pytorch/training/docker/build_artifacts/__init__.py
similarity index 100%
rename from habana/pytorch/training/docker/build_artifacts/__init__.py
rename to .deprecated/habana/pytorch/training/docker/build_artifacts/__init__.py
diff --git a/habana/pytorch/training/docker/build_artifacts/changehostname.c b/.deprecated/habana/pytorch/training/docker/build_artifacts/changehostname.c
similarity index 100%
rename from habana/pytorch/training/docker/build_artifacts/changehostname.c
rename to .deprecated/habana/pytorch/training/docker/build_artifacts/changehostname.c
diff --git a/habana/pytorch/training/docker/build_artifacts/dockerd-entrypoint.py b/.deprecated/habana/pytorch/training/docker/build_artifacts/dockerd-entrypoint.py
similarity index 100%
rename from habana/pytorch/training/docker/build_artifacts/dockerd-entrypoint.py
rename to .deprecated/habana/pytorch/training/docker/build_artifacts/dockerd-entrypoint.py
diff --git a/habana/pytorch/training/docker/build_artifacts/start_with_right_hostname.sh b/.deprecated/habana/pytorch/training/docker/build_artifacts/start_with_right_hostname.sh
similarity index 100%
rename from habana/pytorch/training/docker/build_artifacts/start_with_right_hostname.sh
rename to .deprecated/habana/pytorch/training/docker/build_artifacts/start_with_right_hostname.sh
diff --git a/habana/tensorflow/training/buildspec-2-7-syai-1-2.yml b/.deprecated/habana/tensorflow/training/buildspec-2-7-syai-1-2.yml
similarity index 100%
rename from habana/tensorflow/training/buildspec-2-7-syai-1-2.yml
rename to .deprecated/habana/tensorflow/training/buildspec-2-7-syai-1-2.yml
diff --git a/habana/tensorflow/training/buildspec-2-8-syai-1-3.yml b/.deprecated/habana/tensorflow/training/buildspec-2-8-syai-1-3.yml
similarity index 100%
rename from habana/tensorflow/training/buildspec-2-8-syai-1-3.yml
rename to .deprecated/habana/tensorflow/training/buildspec-2-8-syai-1-3.yml
diff --git a/habana/tensorflow/training/buildspec-2-8-syai-1-4.yml b/.deprecated/habana/tensorflow/training/buildspec-2-8-syai-1-4.yml
similarity index 100%
rename from habana/tensorflow/training/buildspec-2-8-syai-1-4.yml
rename to .deprecated/habana/tensorflow/training/buildspec-2-8-syai-1-4.yml
diff --git a/habana/tensorflow/training/buildspec-2-9-syai-1-5.yml b/.deprecated/habana/tensorflow/training/buildspec-2-9-syai-1-5.yml
similarity index 100%
rename from habana/tensorflow/training/buildspec-2-9-syai-1-5.yml
rename to .deprecated/habana/tensorflow/training/buildspec-2-9-syai-1-5.yml
diff --git a/habana/tensorflow/training/buildspec-2-9-syai-1-6.yml b/.deprecated/habana/tensorflow/training/buildspec-2-9-syai-1-6.yml
similarity index 100%
rename from habana/tensorflow/training/buildspec-2-9-syai-1-6.yml
rename to .deprecated/habana/tensorflow/training/buildspec-2-9-syai-1-6.yml
diff --git a/habana/tensorflow/training/buildspec.yml b/.deprecated/habana/tensorflow/training/buildspec.yml
similarity index 100%
rename from habana/tensorflow/training/buildspec.yml
rename to .deprecated/habana/tensorflow/training/buildspec.yml
diff --git a/habana/tensorflow/training/docker/2.7/py3/example/Dockerfile.hpu b/.deprecated/habana/tensorflow/training/docker/2.7/py3/example/Dockerfile.hpu
similarity index 100%
rename from habana/tensorflow/training/docker/2.7/py3/example/Dockerfile.hpu
rename to .deprecated/habana/tensorflow/training/docker/2.7/py3/example/Dockerfile.hpu
diff --git a/habana/tensorflow/training/docker/2.7/py3/synapseai1.2.0/Dockerfile.hpu b/.deprecated/habana/tensorflow/training/docker/2.7/py3/synapseai1.2.0/Dockerfile.hpu
similarity index 100%
rename from habana/tensorflow/training/docker/2.7/py3/synapseai1.2.0/Dockerfile.hpu
rename to .deprecated/habana/tensorflow/training/docker/2.7/py3/synapseai1.2.0/Dockerfile.hpu
diff --git a/habana/tensorflow/training/docker/2.8/py3/example/Dockerfile.hpu b/.deprecated/habana/tensorflow/training/docker/2.8/py3/example/Dockerfile.hpu
similarity index 100%
rename from habana/tensorflow/training/docker/2.8/py3/example/Dockerfile.hpu
rename to .deprecated/habana/tensorflow/training/docker/2.8/py3/example/Dockerfile.hpu
diff --git a/habana/tensorflow/training/docker/2.8/py3/synapseai1.3.0/Dockerfile.hpu b/.deprecated/habana/tensorflow/training/docker/2.8/py3/synapseai1.3.0/Dockerfile.hpu
similarity index 100%
rename from habana/tensorflow/training/docker/2.8/py3/synapseai1.3.0/Dockerfile.hpu
rename to .deprecated/habana/tensorflow/training/docker/2.8/py3/synapseai1.3.0/Dockerfile.hpu
diff --git a/habana/tensorflow/training/docker/2.8/py3/synapseai1.4.1/Dockerfile.hpu b/.deprecated/habana/tensorflow/training/docker/2.8/py3/synapseai1.4.1/Dockerfile.hpu
similarity index 100%
rename from habana/tensorflow/training/docker/2.8/py3/synapseai1.4.1/Dockerfile.hpu
rename to .deprecated/habana/tensorflow/training/docker/2.8/py3/synapseai1.4.1/Dockerfile.hpu
diff --git a/habana/tensorflow/training/docker/2.9/py3/example/Dockerfile.hpu b/.deprecated/habana/tensorflow/training/docker/2.9/py3/example/Dockerfile.hpu
similarity index 100%
rename from habana/tensorflow/training/docker/2.9/py3/example/Dockerfile.hpu
rename to .deprecated/habana/tensorflow/training/docker/2.9/py3/example/Dockerfile.hpu
diff --git a/habana/tensorflow/training/docker/2.9/py3/synapseai1.5.0/Dockerfile.hpu b/.deprecated/habana/tensorflow/training/docker/2.9/py3/synapseai1.5.0/Dockerfile.hpu
similarity index 100%
rename from habana/tensorflow/training/docker/2.9/py3/synapseai1.5.0/Dockerfile.hpu
rename to .deprecated/habana/tensorflow/training/docker/2.9/py3/synapseai1.5.0/Dockerfile.hpu
diff --git a/habana/tensorflow/training/docker/2.9/py3/synapseai1.6.0/Dockerfile.hpu b/.deprecated/habana/tensorflow/training/docker/2.9/py3/synapseai1.6.0/Dockerfile.hpu
similarity index 100%
rename from habana/tensorflow/training/docker/2.9/py3/synapseai1.6.0/Dockerfile.hpu
rename to .deprecated/habana/tensorflow/training/docker/2.9/py3/synapseai1.6.0/Dockerfile.hpu
diff --git a/habana/tensorflow/training/docker/build_artifacts/__init__.py b/.deprecated/habana/tensorflow/training/docker/build_artifacts/__init__.py
similarity index 100%
rename from habana/tensorflow/training/docker/build_artifacts/__init__.py
rename to .deprecated/habana/tensorflow/training/docker/build_artifacts/__init__.py
diff --git a/habana/tensorflow/training/docker/build_artifacts/dockerd-entrypoint.py b/.deprecated/habana/tensorflow/training/docker/build_artifacts/dockerd-entrypoint.py
similarity index 100%
rename from habana/tensorflow/training/docker/build_artifacts/dockerd-entrypoint.py
rename to .deprecated/habana/tensorflow/training/docker/build_artifacts/dockerd-entrypoint.py
diff --git a/mxnet/inference/buildspec-eia.yml b/.deprecated/mxnet/inference/buildspec-eia.yml
similarity index 100%
rename from mxnet/inference/buildspec-eia.yml
rename to .deprecated/mxnet/inference/buildspec-eia.yml
diff --git a/mxnet/inference/buildspec-graviton.yml b/.deprecated/mxnet/inference/buildspec-graviton.yml
similarity index 100%
rename from mxnet/inference/buildspec-graviton.yml
rename to .deprecated/mxnet/inference/buildspec-graviton.yml
diff --git a/mxnet/inference/buildspec-neuron.yml b/.deprecated/mxnet/inference/buildspec-neuron.yml
similarity index 100%
rename from mxnet/inference/buildspec-neuron.yml
rename to .deprecated/mxnet/inference/buildspec-neuron.yml
diff --git a/mxnet/inference/buildspec.yml b/.deprecated/mxnet/inference/buildspec.yml
similarity index 100%
rename from mxnet/inference/buildspec.yml
rename to .deprecated/mxnet/inference/buildspec.yml
diff --git a/mxnet/inference/docker/1.5.1/py2/Dockerfile.eia b/.deprecated/mxnet/inference/docker/1.5.1/py2/Dockerfile.eia
similarity index 100%
rename from mxnet/inference/docker/1.5.1/py2/Dockerfile.eia
rename to .deprecated/mxnet/inference/docker/1.5.1/py2/Dockerfile.eia
diff --git a/mxnet/inference/docker/1.5.1/py3/Dockerfile.eia b/.deprecated/mxnet/inference/docker/1.5.1/py3/Dockerfile.eia
similarity index 100%
rename from mxnet/inference/docker/1.5.1/py3/Dockerfile.eia
rename to .deprecated/mxnet/inference/docker/1.5.1/py3/Dockerfile.eia
diff --git a/mxnet/inference/docker/1.5.1/py3/Dockerfile.neuron b/.deprecated/mxnet/inference/docker/1.5.1/py3/Dockerfile.neuron
similarity index 100%
rename from mxnet/inference/docker/1.5.1/py3/Dockerfile.neuron
rename to .deprecated/mxnet/inference/docker/1.5.1/py3/Dockerfile.neuron
diff --git a/mxnet/inference/docker/1.6.0/py2/Dockerfile.cpu b/.deprecated/mxnet/inference/docker/1.6.0/py2/Dockerfile.cpu
similarity index 100%
rename from mxnet/inference/docker/1.6.0/py2/Dockerfile.cpu
rename to .deprecated/mxnet/inference/docker/1.6.0/py2/Dockerfile.cpu
diff --git a/mxnet/inference/docker/1.6.0/py2/cu101/Dockerfile.gpu b/.deprecated/mxnet/inference/docker/1.6.0/py2/cu101/Dockerfile.gpu
similarity index 100%
rename from mxnet/inference/docker/1.6.0/py2/cu101/Dockerfile.gpu
rename to .deprecated/mxnet/inference/docker/1.6.0/py2/cu101/Dockerfile.gpu
diff --git a/mxnet/inference/docker/1.6.0/py3/Dockerfile.cpu b/.deprecated/mxnet/inference/docker/1.6.0/py3/Dockerfile.cpu
similarity index 100%
rename from mxnet/inference/docker/1.6.0/py3/Dockerfile.cpu
rename to .deprecated/mxnet/inference/docker/1.6.0/py3/Dockerfile.cpu
diff --git a/mxnet/inference/docker/1.6.0/py3/cu101/Dockerfile.gpu b/.deprecated/mxnet/inference/docker/1.6.0/py3/cu101/Dockerfile.gpu
similarity index 100%
rename from mxnet/inference/docker/1.6.0/py3/cu101/Dockerfile.gpu
rename to .deprecated/mxnet/inference/docker/1.6.0/py3/cu101/Dockerfile.gpu
diff --git a/mxnet/inference/docker/1.7.0/py3/Dockerfile.cpu b/.deprecated/mxnet/inference/docker/1.7.0/py3/Dockerfile.cpu
similarity index 100%
rename from mxnet/inference/docker/1.7.0/py3/Dockerfile.cpu
rename to .deprecated/mxnet/inference/docker/1.7.0/py3/Dockerfile.cpu
diff --git a/mxnet/inference/docker/1.7.0/py3/Dockerfile.eia b/.deprecated/mxnet/inference/docker/1.7.0/py3/Dockerfile.eia
similarity index 100%
rename from mxnet/inference/docker/1.7.0/py3/Dockerfile.eia
rename to .deprecated/mxnet/inference/docker/1.7.0/py3/Dockerfile.eia
diff --git a/mxnet/inference/docker/1.7.0/py3/cu101/Dockerfile.gpu b/.deprecated/mxnet/inference/docker/1.7.0/py3/cu101/Dockerfile.gpu
similarity index 100%
rename from mxnet/inference/docker/1.7.0/py3/cu101/Dockerfile.gpu
rename to .deprecated/mxnet/inference/docker/1.7.0/py3/cu101/Dockerfile.gpu
diff --git a/mxnet/inference/docker/1.8/py3/Dockerfile.cpu b/.deprecated/mxnet/inference/docker/1.8/py3/Dockerfile.cpu
similarity index 100%
rename from mxnet/inference/docker/1.8/py3/Dockerfile.cpu
rename to .deprecated/mxnet/inference/docker/1.8/py3/Dockerfile.cpu
diff --git a/mxnet/inference/docker/1.8/py3/Dockerfile.cpu.os_scan_allowlist.json b/.deprecated/mxnet/inference/docker/1.8/py3/Dockerfile.cpu.os_scan_allowlist.json
similarity index 100%
rename from mxnet/inference/docker/1.8/py3/Dockerfile.cpu.os_scan_allowlist.json
rename to .deprecated/mxnet/inference/docker/1.8/py3/Dockerfile.cpu.os_scan_allowlist.json
diff --git a/mxnet/inference/docker/1.8/py3/apt-upgrade-list-cpu.txt b/.deprecated/mxnet/inference/docker/1.8/py3/apt-upgrade-list-cpu.txt
similarity index 100%
rename from mxnet/inference/docker/1.8/py3/apt-upgrade-list-cpu.txt
rename to .deprecated/mxnet/inference/docker/1.8/py3/apt-upgrade-list-cpu.txt
diff --git a/mxnet/inference/docker/1.8/py3/cu110/Dockerfile.gpu b/.deprecated/mxnet/inference/docker/1.8/py3/cu110/Dockerfile.gpu
similarity index 100%
rename from mxnet/inference/docker/1.8/py3/cu110/Dockerfile.gpu
rename to .deprecated/mxnet/inference/docker/1.8/py3/cu110/Dockerfile.gpu
diff --git a/mxnet/inference/docker/1.8/py3/cu110/Dockerfile.gpu.os_scan_allowlist.json b/.deprecated/mxnet/inference/docker/1.8/py3/cu110/Dockerfile.gpu.os_scan_allowlist.json
similarity index 100%
rename from mxnet/inference/docker/1.8/py3/cu110/Dockerfile.gpu.os_scan_allowlist.json
rename to .deprecated/mxnet/inference/docker/1.8/py3/cu110/Dockerfile.gpu.os_scan_allowlist.json
diff --git a/mxnet/inference/docker/1.8/py3/cu110/apt-upgrade-list-gpu.txt b/.deprecated/mxnet/inference/docker/1.8/py3/cu110/apt-upgrade-list-gpu.txt
similarity index 100%
rename from mxnet/inference/docker/1.8/py3/cu110/apt-upgrade-list-gpu.txt
rename to .deprecated/mxnet/inference/docker/1.8/py3/cu110/apt-upgrade-list-gpu.txt
diff --git a/mxnet/inference/docker/1.8/py3/sdk1.17.1/Dockerfile.neuron b/.deprecated/mxnet/inference/docker/1.8/py3/sdk1.17.1/Dockerfile.neuron
similarity index 100%
rename from mxnet/inference/docker/1.8/py3/sdk1.17.1/Dockerfile.neuron
rename to .deprecated/mxnet/inference/docker/1.8/py3/sdk1.17.1/Dockerfile.neuron
diff --git a/mxnet/inference/docker/1.8/py3/sdk1.17.1/Dockerfile.neuron.os_scan_allowlist.json b/.deprecated/mxnet/inference/docker/1.8/py3/sdk1.17.1/Dockerfile.neuron.os_scan_allowlist.json
similarity index 100%
rename from mxnet/inference/docker/1.8/py3/sdk1.17.1/Dockerfile.neuron.os_scan_allowlist.json
rename to .deprecated/mxnet/inference/docker/1.8/py3/sdk1.17.1/Dockerfile.neuron.os_scan_allowlist.json
diff --git a/mxnet/inference/docker/1.8/py3/sdk1.17.1/apt-upgrade-list-neuron.txt b/.deprecated/mxnet/inference/docker/1.8/py3/sdk1.17.1/apt-upgrade-list-neuron.txt
similarity index 100%
rename from mxnet/inference/docker/1.8/py3/sdk1.17.1/apt-upgrade-list-neuron.txt
rename to .deprecated/mxnet/inference/docker/1.8/py3/sdk1.17.1/apt-upgrade-list-neuron.txt
diff --git a/mxnet/inference/docker/1.8/py3/sdk1.18.0/Dockerfile.neuron b/.deprecated/mxnet/inference/docker/1.8/py3/sdk1.18.0/Dockerfile.neuron
similarity index 100%
rename from mxnet/inference/docker/1.8/py3/sdk1.18.0/Dockerfile.neuron
rename to .deprecated/mxnet/inference/docker/1.8/py3/sdk1.18.0/Dockerfile.neuron
diff --git a/mxnet/inference/docker/1.8/py3/sdk1.18.0/Dockerfile.neuron.os_scan_allowlist.json b/.deprecated/mxnet/inference/docker/1.8/py3/sdk1.18.0/Dockerfile.neuron.os_scan_allowlist.json
similarity index 100%
rename from mxnet/inference/docker/1.8/py3/sdk1.18.0/Dockerfile.neuron.os_scan_allowlist.json
rename to .deprecated/mxnet/inference/docker/1.8/py3/sdk1.18.0/Dockerfile.neuron.os_scan_allowlist.json
diff --git a/mxnet/inference/docker/1.8/py3/sdk1.18.0/apt-upgrade-list-neuron.txt b/.deprecated/mxnet/inference/docker/1.8/py3/sdk1.18.0/apt-upgrade-list-neuron.txt
similarity index 100%
rename from mxnet/inference/docker/1.8/py3/sdk1.18.0/apt-upgrade-list-neuron.txt
rename to .deprecated/mxnet/inference/docker/1.8/py3/sdk1.18.0/apt-upgrade-list-neuron.txt
diff --git a/mxnet/inference/docker/1.8/py3/sdk2.5.0/Dockerfile.neuron b/.deprecated/mxnet/inference/docker/1.8/py3/sdk2.5.0/Dockerfile.neuron
similarity index 100%
rename from mxnet/inference/docker/1.8/py3/sdk2.5.0/Dockerfile.neuron
rename to .deprecated/mxnet/inference/docker/1.8/py3/sdk2.5.0/Dockerfile.neuron
diff --git a/mxnet/inference/docker/1.8/py3/sdk2.5.0/Dockerfile.neuron.os_scan_allowlist.json b/.deprecated/mxnet/inference/docker/1.8/py3/sdk2.5.0/Dockerfile.neuron.os_scan_allowlist.json
similarity index 100%
rename from mxnet/inference/docker/1.8/py3/sdk2.5.0/Dockerfile.neuron.os_scan_allowlist.json
rename to .deprecated/mxnet/inference/docker/1.8/py3/sdk2.5.0/Dockerfile.neuron.os_scan_allowlist.json
diff --git a/mxnet/inference/docker/1.9/py3/Dockerfile.cpu b/.deprecated/mxnet/inference/docker/1.9/py3/Dockerfile.cpu
similarity index 100%
rename from mxnet/inference/docker/1.9/py3/Dockerfile.cpu
rename to .deprecated/mxnet/inference/docker/1.9/py3/Dockerfile.cpu
diff --git a/mxnet/inference/docker/1.9/py3/Dockerfile.ec2.cpu.os_scan_allowlist.json b/.deprecated/mxnet/inference/docker/1.9/py3/Dockerfile.ec2.cpu.os_scan_allowlist.json
similarity index 100%
rename from mxnet/inference/docker/1.9/py3/Dockerfile.ec2.cpu.os_scan_allowlist.json
rename to .deprecated/mxnet/inference/docker/1.9/py3/Dockerfile.ec2.cpu.os_scan_allowlist.json
diff --git a/mxnet/inference/docker/1.9/py3/Dockerfile.graviton.ec2.cpu b/.deprecated/mxnet/inference/docker/1.9/py3/Dockerfile.graviton.ec2.cpu
similarity index 100%
rename from mxnet/inference/docker/1.9/py3/Dockerfile.graviton.ec2.cpu
rename to .deprecated/mxnet/inference/docker/1.9/py3/Dockerfile.graviton.ec2.cpu
diff --git a/mxnet/inference/docker/1.9/py3/Dockerfile.sagemaker.cpu.os_scan_allowlist.json b/.deprecated/mxnet/inference/docker/1.9/py3/Dockerfile.sagemaker.cpu.os_scan_allowlist.json
similarity index 100%
rename from mxnet/inference/docker/1.9/py3/Dockerfile.sagemaker.cpu.os_scan_allowlist.json
rename to .deprecated/mxnet/inference/docker/1.9/py3/Dockerfile.sagemaker.cpu.os_scan_allowlist.json
diff --git a/mxnet/inference/docker/1.9/py3/cu112/Dockerfile.ec2.gpu.os_scan_allowlist.json b/.deprecated/mxnet/inference/docker/1.9/py3/cu112/Dockerfile.ec2.gpu.os_scan_allowlist.json
similarity index 100%
rename from mxnet/inference/docker/1.9/py3/cu112/Dockerfile.ec2.gpu.os_scan_allowlist.json
rename to .deprecated/mxnet/inference/docker/1.9/py3/cu112/Dockerfile.ec2.gpu.os_scan_allowlist.json
diff --git a/mxnet/inference/docker/1.9/py3/cu112/Dockerfile.gpu b/.deprecated/mxnet/inference/docker/1.9/py3/cu112/Dockerfile.gpu
similarity index 100%
rename from mxnet/inference/docker/1.9/py3/cu112/Dockerfile.gpu
rename to .deprecated/mxnet/inference/docker/1.9/py3/cu112/Dockerfile.gpu
diff --git a/mxnet/inference/docker/1.9/py3/cu112/Dockerfile.sagemaker.gpu.os_scan_allowlist.json b/.deprecated/mxnet/inference/docker/1.9/py3/cu112/Dockerfile.sagemaker.gpu.os_scan_allowlist.json
similarity index 100%
rename from mxnet/inference/docker/1.9/py3/cu112/Dockerfile.sagemaker.gpu.os_scan_allowlist.json
rename to .deprecated/mxnet/inference/docker/1.9/py3/cu112/Dockerfile.sagemaker.gpu.os_scan_allowlist.json
diff --git a/mxnet/inference/docker/__init__.py b/.deprecated/mxnet/inference/docker/__init__.py
similarity index 100%
rename from mxnet/inference/docker/__init__.py
rename to .deprecated/mxnet/inference/docker/__init__.py
diff --git a/mxnet/inference/docker/artifacts/__init__.py b/.deprecated/mxnet/inference/docker/artifacts/__init__.py
similarity index 100%
rename from mxnet/inference/docker/artifacts/__init__.py
rename to .deprecated/mxnet/inference/docker/artifacts/__init__.py
diff --git a/mxnet/inference/docker/artifacts/config.properties b/.deprecated/mxnet/inference/docker/artifacts/config.properties
similarity index 100%
rename from mxnet/inference/docker/artifacts/config.properties
rename to .deprecated/mxnet/inference/docker/artifacts/config.properties
diff --git a/mxnet/inference/docker/artifacts/mms-ec2-entrypoint.py b/.deprecated/mxnet/inference/docker/artifacts/mms-ec2-entrypoint.py
similarity index 100%
rename from mxnet/inference/docker/artifacts/mms-ec2-entrypoint.py
rename to .deprecated/mxnet/inference/docker/artifacts/mms-ec2-entrypoint.py
diff --git a/mxnet/inference/docker/artifacts/mms-entrypoint.py b/.deprecated/mxnet/inference/docker/artifacts/mms-entrypoint.py
similarity index 100%
rename from mxnet/inference/docker/artifacts/mms-entrypoint.py
rename to .deprecated/mxnet/inference/docker/artifacts/mms-entrypoint.py
diff --git a/mxnet/inference/docker/artifacts/modelserver-neuron.sh b/.deprecated/mxnet/inference/docker/artifacts/modelserver-neuron.sh
similarity index 100%
rename from mxnet/inference/docker/artifacts/modelserver-neuron.sh
rename to .deprecated/mxnet/inference/docker/artifacts/modelserver-neuron.sh
diff --git a/mxnet/inference/docker/artifacts/neuron-entrypoint.py b/.deprecated/mxnet/inference/docker/artifacts/neuron-entrypoint.py
similarity index 100%
rename from mxnet/inference/docker/artifacts/neuron-entrypoint.py
rename to .deprecated/mxnet/inference/docker/artifacts/neuron-entrypoint.py
diff --git a/mxnet/inference/docker/artifacts/neuron-monitor.sh b/.deprecated/mxnet/inference/docker/artifacts/neuron-monitor.sh
similarity index 100%
rename from mxnet/inference/docker/artifacts/neuron-monitor.sh
rename to .deprecated/mxnet/inference/docker/artifacts/neuron-monitor.sh
diff --git a/mxnet/training/buildspec.yml b/.deprecated/mxnet/training/buildspec.yml
similarity index 100%
rename from mxnet/training/buildspec.yml
rename to .deprecated/mxnet/training/buildspec.yml
diff --git a/mxnet/training/docker/1.6.0/py2/Dockerfile.cpu b/.deprecated/mxnet/training/docker/1.6.0/py2/Dockerfile.cpu
similarity index 100%
rename from mxnet/training/docker/1.6.0/py2/Dockerfile.cpu
rename to .deprecated/mxnet/training/docker/1.6.0/py2/Dockerfile.cpu
diff --git a/mxnet/training/docker/1.6.0/py2/cu101/Dockerfile.gpu b/.deprecated/mxnet/training/docker/1.6.0/py2/cu101/Dockerfile.gpu
similarity index 100%
rename from mxnet/training/docker/1.6.0/py2/cu101/Dockerfile.gpu
rename to .deprecated/mxnet/training/docker/1.6.0/py2/cu101/Dockerfile.gpu
diff --git a/mxnet/training/docker/1.6.0/py2/example/Dockerfile.gpu b/.deprecated/mxnet/training/docker/1.6.0/py2/example/Dockerfile.gpu
similarity index 100%
rename from mxnet/training/docker/1.6.0/py2/example/Dockerfile.gpu
rename to .deprecated/mxnet/training/docker/1.6.0/py2/example/Dockerfile.gpu
diff --git a/mxnet/training/docker/1.6.0/py3/Dockerfile.cpu b/.deprecated/mxnet/training/docker/1.6.0/py3/Dockerfile.cpu
similarity index 100%
rename from mxnet/training/docker/1.6.0/py3/Dockerfile.cpu
rename to .deprecated/mxnet/training/docker/1.6.0/py3/Dockerfile.cpu
diff --git a/mxnet/training/docker/1.6.0/py3/cu101/Dockerfile.gpu b/.deprecated/mxnet/training/docker/1.6.0/py3/cu101/Dockerfile.gpu
similarity index 100%
rename from mxnet/training/docker/1.6.0/py3/cu101/Dockerfile.gpu
rename to .deprecated/mxnet/training/docker/1.6.0/py3/cu101/Dockerfile.gpu
diff --git a/mxnet/training/docker/1.6.0/py3/example/Dockerfile.gpu b/.deprecated/mxnet/training/docker/1.6.0/py3/example/Dockerfile.gpu
similarity index 100%
rename from mxnet/training/docker/1.6.0/py3/example/Dockerfile.gpu
rename to .deprecated/mxnet/training/docker/1.6.0/py3/example/Dockerfile.gpu
diff --git a/mxnet/training/docker/1.7.0/py3/Dockerfile.cpu b/.deprecated/mxnet/training/docker/1.7.0/py3/Dockerfile.cpu
similarity index 100%
rename from mxnet/training/docker/1.7.0/py3/Dockerfile.cpu
rename to .deprecated/mxnet/training/docker/1.7.0/py3/Dockerfile.cpu
diff --git a/mxnet/training/docker/1.7.0/py3/cu101/Dockerfile.gpu b/.deprecated/mxnet/training/docker/1.7.0/py3/cu101/Dockerfile.gpu
similarity index 100%
rename from mxnet/training/docker/1.7.0/py3/cu101/Dockerfile.gpu
rename to .deprecated/mxnet/training/docker/1.7.0/py3/cu101/Dockerfile.gpu
diff --git a/mxnet/training/docker/1.7.0/py3/example/Dockerfile.gpu b/.deprecated/mxnet/training/docker/1.7.0/py3/example/Dockerfile.gpu
similarity index 100%
rename from mxnet/training/docker/1.7.0/py3/example/Dockerfile.gpu
rename to .deprecated/mxnet/training/docker/1.7.0/py3/example/Dockerfile.gpu
diff --git a/mxnet/training/docker/1.8/py3/Dockerfile.cpu b/.deprecated/mxnet/training/docker/1.8/py3/Dockerfile.cpu
similarity index 100%
rename from mxnet/training/docker/1.8/py3/Dockerfile.cpu
rename to .deprecated/mxnet/training/docker/1.8/py3/Dockerfile.cpu
diff --git a/mxnet/training/docker/1.8/py3/Dockerfile.cpu.os_scan_allowlist.json b/.deprecated/mxnet/training/docker/1.8/py3/Dockerfile.cpu.os_scan_allowlist.json
similarity index 100%
rename from mxnet/training/docker/1.8/py3/Dockerfile.cpu.os_scan_allowlist.json
rename to .deprecated/mxnet/training/docker/1.8/py3/Dockerfile.cpu.os_scan_allowlist.json
diff --git a/mxnet/training/docker/1.8/py3/apt-upgrade-list-cpu.txt b/.deprecated/mxnet/training/docker/1.8/py3/apt-upgrade-list-cpu.txt
similarity index 100%
rename from mxnet/training/docker/1.8/py3/apt-upgrade-list-cpu.txt
rename to .deprecated/mxnet/training/docker/1.8/py3/apt-upgrade-list-cpu.txt
diff --git a/mxnet/training/docker/1.8/py3/cu110/Dockerfile.gpu b/.deprecated/mxnet/training/docker/1.8/py3/cu110/Dockerfile.gpu
similarity index 100%
rename from mxnet/training/docker/1.8/py3/cu110/Dockerfile.gpu
rename to .deprecated/mxnet/training/docker/1.8/py3/cu110/Dockerfile.gpu
diff --git a/mxnet/training/docker/1.8/py3/cu110/Dockerfile.gpu.os_scan_allowlist.json b/.deprecated/mxnet/training/docker/1.8/py3/cu110/Dockerfile.gpu.os_scan_allowlist.json
similarity index 100%
rename from mxnet/training/docker/1.8/py3/cu110/Dockerfile.gpu.os_scan_allowlist.json
rename to .deprecated/mxnet/training/docker/1.8/py3/cu110/Dockerfile.gpu.os_scan_allowlist.json
diff --git a/mxnet/training/docker/1.8/py3/cu110/apt-upgrade-list-gpu.txt b/.deprecated/mxnet/training/docker/1.8/py3/cu110/apt-upgrade-list-gpu.txt
similarity index 100%
rename from mxnet/training/docker/1.8/py3/cu110/apt-upgrade-list-gpu.txt
rename to .deprecated/mxnet/training/docker/1.8/py3/cu110/apt-upgrade-list-gpu.txt
diff --git a/mxnet/training/docker/1.8/py3/example/Dockerfile.gpu b/.deprecated/mxnet/training/docker/1.8/py3/example/Dockerfile.gpu
similarity index 100%
rename from mxnet/training/docker/1.8/py3/example/Dockerfile.gpu
rename to .deprecated/mxnet/training/docker/1.8/py3/example/Dockerfile.gpu
diff --git a/mxnet/training/docker/1.9/py3/Dockerfile.cpu b/.deprecated/mxnet/training/docker/1.9/py3/Dockerfile.cpu
similarity index 100%
rename from mxnet/training/docker/1.9/py3/Dockerfile.cpu
rename to .deprecated/mxnet/training/docker/1.9/py3/Dockerfile.cpu
diff --git a/mxnet/training/docker/1.9/py3/cu112/Dockerfile.gpu b/.deprecated/mxnet/training/docker/1.9/py3/cu112/Dockerfile.gpu
similarity index 100%
rename from mxnet/training/docker/1.9/py3/cu112/Dockerfile.gpu
rename to .deprecated/mxnet/training/docker/1.9/py3/cu112/Dockerfile.gpu
diff --git a/mxnet/training/docker/1.9/py3/example/Dockerfile.gpu b/.deprecated/mxnet/training/docker/1.9/py3/example/Dockerfile.gpu
similarity index 100%
rename from mxnet/training/docker/1.9/py3/example/Dockerfile.gpu
rename to .deprecated/mxnet/training/docker/1.9/py3/example/Dockerfile.gpu
diff --git a/mxnet/training/docker/__init__.py b/.deprecated/mxnet/training/docker/__init__.py
similarity index 100%
rename from mxnet/training/docker/__init__.py
rename to .deprecated/mxnet/training/docker/__init__.py
diff --git a/mxnet/training/docker/artifacts/__init__.py b/.deprecated/mxnet/training/docker/artifacts/__init__.py
similarity index 100%
rename from mxnet/training/docker/artifacts/__init__.py
rename to .deprecated/mxnet/training/docker/artifacts/__init__.py
diff --git a/mxnet/training/docker/artifacts/dockerd-entrypoint.py b/.deprecated/mxnet/training/docker/artifacts/dockerd-entrypoint.py
similarity index 100%
rename from mxnet/training/docker/artifacts/dockerd-entrypoint.py
rename to .deprecated/mxnet/training/docker/artifacts/dockerd-entrypoint.py
diff --git a/stabilityai/pytorch/inference/buildspec.yml b/.deprecated/stabilityai/pytorch/inference/buildspec.yml
similarity index 100%
rename from stabilityai/pytorch/inference/buildspec.yml
rename to .deprecated/stabilityai/pytorch/inference/buildspec.yml
diff --git a/stabilityai/pytorch/inference/docker/2.0/py3/cu118/Dockerfile.sagemaker.gpu b/.deprecated/stabilityai/pytorch/inference/docker/2.0/py3/cu118/Dockerfile.sagemaker.gpu
similarity index 100%
rename from stabilityai/pytorch/inference/docker/2.0/py3/cu118/Dockerfile.sagemaker.gpu
rename to .deprecated/stabilityai/pytorch/inference/docker/2.0/py3/cu118/Dockerfile.sagemaker.gpu
diff --git a/stabilityai/pytorch/inference/docker/build_artifacts/torchserve-stabilityai-entrypoint.py b/.deprecated/stabilityai/pytorch/inference/docker/build_artifacts/torchserve-stabilityai-entrypoint.py
similarity index 100%
rename from stabilityai/pytorch/inference/docker/build_artifacts/torchserve-stabilityai-entrypoint.py
rename to .deprecated/stabilityai/pytorch/inference/docker/build_artifacts/torchserve-stabilityai-entrypoint.py
diff --git a/stabilityai/pytorch/training/buildspec.yml b/.deprecated/stabilityai/pytorch/training/buildspec.yml
similarity index 100%
rename from stabilityai/pytorch/training/buildspec.yml
rename to .deprecated/stabilityai/pytorch/training/buildspec.yml
diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS
index b478a20d6ee7..9d62f1fbfeb4 100644
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@@ -7,29 +7,16 @@
 # review when someone opens a pull request.
 * @aws/dl-containers
-
 # Common files that is managed by multiple owners
 available_images.md @aws/dl-containers @aws/dlc-autogluon-reviewers @aws/dlc-neuron-reviewers @aws/dlc-eia-reviewers @aws/dlc-trcomp-reviewers @aws/dlc-lmi-reviewers @aws/sagemaker-1p-algorithms @aws/dlc-pytorch-reviewers @aws/dlc-triton-reviewers
-release_images_inference.yml @aws/dl-containers @aws/dlc-autogluon-reviewers @aws/dlc-neuron-reviewers @aws/dlc-eia-reviewers @aws/dlc-trcomp-reviewers @aws/dlc-lmi-reviewers @aws/sagemaker-1p-algorithms @aws/dlc-pytorch-reviewers @aws/dlc-triton-reviewers
-release_images_training.yml @aws/dl-containers @aws/dlc-autogluon-reviewers @aws/dlc-neuron-reviewers @aws/dlc-eia-reviewers @aws/dlc-trcomp-reviewers @aws/sagemaker-1p-algorithms @aws/dlc-pytorch-reviewers
-.release_images_template.yml @aws/dl-containers @aws/dlc-autogluon-reviewers @aws/dlc-neuron-reviewers @aws/dlc-eia-reviewers @aws/dlc-trcomp-reviewers @aws/dlc-pytorch-reviewers @aws/dlc-triton-reviewers
-data/ignore_ids_safety_scan.json @aws/dl-containers @aws/sagemaker-1p-algorithms
+
 # Any files modified under autogluon/ will be assigned to the autogluon reviewer team
 autogluon/ @aws/dlc-autogluon-reviewers
 test/sagemaker_tests/autogluon @aws/dlc-autogluon-reviewers
 
-# Any PR with a file with "pytorch" in it will be assigned to the conda reviewer team
-*pytorch* @aws/dlc-pytorch-reviewers
-
 # Any PR with a file with "neuron" in it will be assigned to the neuron reviewer team
 *neuron* @aws/dlc-neuron-reviewers
 
-# Any PR with a file with "eia" in it will be assigned to the EI reviewer team
-*eia* @aws/dlc-eia-reviewers
-
-# Any file under dgl_tests dir will require review from DGL team
-dgl_tests/ @aws/dlc-dgl-reviewers
-
 # Any PR with a file with "trcomp" in it will be assigned to the SM Training Compiler team
 *trcomp* @aws/dlc-trcomp-reviewers
@@ -41,15 +28,3 @@ dgl_tests/ @aws/dlc-dgl-reviewers
 
 # Any PR with a file with "smmodelparallel" in it will be assigned to the SM ModelParallel team
 *smmodelparallel* @aws/dlc-sm-model-parallel-reviewers
-
-# Any PR with a file with "triton" in it will be assigned to the SM Triton/ModelServing team
-*triton* @aws/dlc-triton-reviewers
-
-# Files under stabilityai/ and huggingface/ directories can be directly reviewed by below teams
-stabilityai/ @aws/dl-containers
-huggingface/ @aws/dl-containers
-test/sagemaker_tests/huggingface/ @aws/dl-containers
-test/sagemaker_tests/huggingface_pytorch/ @aws/dl-containers
-test/sagemaker_tests/huggingface_tensorflow/ @aws/dl-containers
-test/sagemaker_tests/pytorch/inference/integration/sagemaker/test_stabilityai.py @aws/dl-containers
-test/sagemaker_tests/pytorch/inference/resources/stabilityai/ @aws/dl-containers
diff --git a/.github/ISSUE_TEMPLATE/feature_request.md b/.github/ISSUE_TEMPLATE/feature_request.md
deleted file mode 100644
index 1a6ed0963ba1..000000000000
--- a/.github/ISSUE_TEMPLATE/feature_request.md
+++ /dev/null
@@ -1,31 +0,0 @@
----
-name: Feature request
-about: Suggest an idea for this project
-title: "[feature-request]"
-labels: ''
-assignees: ''
-
----
-
-Checklist
-- [ ] I've prepended issue tag with type of change: [feature]
-- [ ] (If applicable) I've documented below the DLC image/dockerfile this relates to
-- [ ] (If applicable) I've documented the tests I've run on the DLC image
-- [ ] I'm using an existing DLC image listed here: https://docs.aws.amazon.com/deep-learning-containers/latest/devguide/deep-learning-containers-images.html
-- [ ] I've built my own container based off DLC (and I've attached the code used to build my own image)
-
-*Concise Description:*
-
-*DLC image/dockerfile:*
-
-*Is your feature request related to a problem? Please describe.*
-A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]
-
-*Describe the solution you'd like*
-A clear and concise description of what you want to happen.
-
-*Describe alternatives you've considered*
-A clear and concise description of any alternative solutions or features you've considered.
-
-*Additional context*
-Add any other context or screenshots about the feature request here.
diff --git a/.github/ISSUE_TEMPLATE/pending_change.md b/.github/ISSUE_TEMPLATE/pending_change.md
deleted file mode 100644
index 0aab0dcbd5f7..000000000000
--- a/.github/ISSUE_TEMPLATE/pending_change.md
+++ /dev/null
@@ -1,22 +0,0 @@
----
-name: Pending Change
-about: Call out changes to be made at a future date
-title: "[pending-change]"
-labels: ''
-assignees: ''
-
----
-
-## Checklist
-- [ ] I've prepended issue tag with type of change: [feature]
-- [ ] (If applicable) I've documented below the DLC image/dockerfile this relates to
-- [ ] (If applicable) I've documented below the test files this relates to
-
-#### Concise Description:
-*Add a concise description of what change needs to be made, why it must be made, and when it must be made*
-
-#### DLC image/dockerfile:
-*List of images or dockerfiles that this change applies to*
-
-#### Additional context
-*Add any other context about the pending change here.*
diff --git a/.release_images_template.yml b/.release_images_template.yml
deleted file mode 100644
index 071652985eb2..000000000000
--- a/.release_images_template.yml
+++ /dev/null
@@ -1,1357 +0,0 @@
----
-release_images:
-  1:
-    framework: "pytorch"
-    version: "1.9.1"
-    arch_type: "x86"
-    inference:
-      device_types: ["neuron"]
-      python_versions: ["py37"]
-      os_version: "ubuntu18.04"
-      neuron_sdk_version: "sdk1.16.1"
-      example: False # [Default: False] Set to True to denote that this image is an Example image
-      disable_sm_tag: True # [Default: False] Set to True to prevent SageMaker Abbreviated Tags from being attached
-                           # to images being published.
-      force_release: False # [Default: False] Set to True to force images to be published even if the same image
-                           # has already been published. Re-released image will have minor version incremented by 1.
- 2: - framework: "mxnet" - version: "1.8.0" - arch_type: "x86" - training: - device_types: ["cpu", "gpu"] - python_versions: ["py37"] - os_version: "ubuntu16.04" - cuda_version: "cu110" - example: False - disable_sm_tag: False - force_release: False - inference: - device_types: ["cpu", "gpu"] - python_versions: ["py37"] - os_version: "ubuntu16.04" - cuda_version: "cu110" - example: False - disable_sm_tag: False - force_release: False - 3: - framework: "mxnet" - version: "1.8.0" - arch_type: "x86" - training: - device_types: ["gpu"] - python_versions: ["py37"] - os_version: "ubuntu16.04" - cuda_version: "cu110" - example: True - disable_sm_tag: False - force_release: False - 4: - framework: "mxnet" - version: "1.7.0" - arch_type: "x86" - training: - device_types: ["cpu","gpu"] - python_versions: ["py36"] - os_version: "ubuntu16.04" - cuda_version: "cu101" - example: False - disable_sm_tag: False # [Default: False] This option is not used by Example images - force_release: False - inference: - device_types: ["cpu","gpu"] - python_versions: ["py36"] - os_version: "ubuntu16.04" - cuda_version: "cu101" - example: False - disable_sm_tag: False # [Default: False] This option is not used by Example images - force_release: False - 5: - framework: "mxnet" - version: "1.7.0" - arch_type: "x86" - training: - device_types: ["gpu"] - python_versions: ["py36"] - os_version: "ubuntu16.04" - cuda_version: "cu101" - example: True - disable_sm_tag: False # [Default: False] This option is not used by Example images - force_release: False - 6: - framework: "tensorflow" - version: "2.4.3" - arch_type: "x86" - training: - device_types: ["cpu","gpu"] - python_versions: ["py37"] - os_version: "ubuntu18.04" - cuda_version: "cu110" - example: False - disable_sm_tag: False - force_release: False - inference: - device_types: ["cpu","gpu"] - python_versions: ["py37"] - os_version: "ubuntu18.04" - cuda_version: "cu110" - example: False - disable_sm_tag: False # [Default: False] This option is not used by Example images - force_release: False - 7: - framework: "tensorflow" - version: "2.4.3" - arch_type: "x86" - training: - device_types: ["gpu"] - python_versions: ["py37"] - os_version: "ubuntu18.04" - cuda_version: "cu110" - example: True - disable_sm_tag: False - force_release: False - 8: - framework: "pytorch" - version: "1.8.1" - arch_type: "x86" - training: - device_types: ["cpu", "gpu"] - python_versions: ["py36"] - os_version: "ubuntu18.04" - cuda_version: "cu111" - example: False # [Default: False] Set to True to denote that this image is an Example image - disable_sm_tag: False # [Default: False] Set to True to prevent SageMaker Abbreviated Tags from being attached - # to images being published. - force_release: False # [Default: False] Set to True to force images to be published even if the same image - # has already been published. Re-released image will have minor version incremented by 1. - inference: - device_types: ["cpu", "gpu"] - python_versions: ["py36"] - os_version: "ubuntu18.04" - cuda_version: "cu111" - example: False # [Default: False] Set to True to denote that this image is an Example image - disable_sm_tag: False # [Default: False] Set to True to prevent SageMaker Abbreviated Tags from being attached - # to images being published. - force_release: False # [Default: False] Set to True to force images to be published even if the same image - # has already been published. Re-released image will have minor version incremented by 1. 
- 9: - framework: "pytorch" - version: "1.8.1" - arch_type: "x86" - training: - device_types: ["gpu"] - python_versions: ["py36"] - os_version: "ubuntu18.04" - cuda_version: "cu111" - example: True - disable_sm_tag: False # [Default: False] This option is not used by Example images - force_release: False - 10: - framework: "pytorch" - version: "1.7.1" - arch_type: "x86" - inference: - device_types: ["cpu", "gpu"] - python_versions: ["py36"] - os_version: "ubuntu18.04" - cuda_version: "cu110" - example: False - disable_sm_tag: False # [Default: False] This option is not used by Example images - force_release: False - 11: - framework: "huggingface_tensorflow" - version: "2.4.3" - hf_transformers: "4.10.2" - arch_type: "x86" - inference: - device_types: ["cpu", "gpu"] - python_versions: [ "py37" ] - os_version: "ubuntu18.04" - cuda_version: "cu110" - example: False - disable_sm_tag: False # [Default: False] This option is not used by Example images - force_release: False - 12: - framework: "pytorch" - version: "1.6.0" - arch_type: "x86" - inference: - device_types: ["cpu", "gpu"] - python_versions: ["py36"] - os_version: "ubuntu16.04" - cuda_version: "cu101" - example: False - disable_sm_tag: False # [Default: False] This option is not used by Example images - force_release: False - 13: - framework: "tensorflow" - version: "2.5.1" - arch_type: "x86" - inference: - device_types: ["cpu","gpu"] - python_versions: ["py37"] - os_version: "ubuntu18.04" - cuda_version: "cu112" - example: False - disable_sm_tag: False # [Default: False] This option is not used by Example images - force_release: False - 14: - framework: "huggingface_pytorch" - version: "1.8.1" - hf_transformers: "4.10.2" - arch_type: "x86" - training: - device_types: ["gpu"] - python_versions: [ "py36" ] - os_version: "ubuntu18.04" - cuda_version: "cu111" - example: False - disable_sm_tag: False # [Default: False] This option is not used by Example images - force_release: False - 15: - framework: "pytorch" - version: "1.9.1" - arch_type: "x86" - training: - device_types: [ "cpu", "gpu" ] - python_versions: [ "py38" ] - os_version: "ubuntu20.04" - cuda_version: "cu111" - example: False # [Default: False] Set to True to denote that this image is an Example image - disable_sm_tag: False # [Default: False] Set to True to prevent SageMaker Abbreviated Tags from being attached - # to images being published. - force_release: False # [Default: False] Set to True to force images to be published even if the same image - # has already been published. Re-released image will have minor version incremented by 1. 
- 16: - framework: "pytorch" - version: "1.9.1" - arch_type: "x86" - training: - device_types: [ "gpu" ] - python_versions: [ "py38" ] - os_version: "ubuntu20.04" - cuda_version: "cu111" - example: True - disable_sm_tag: False # [Default: False] This option is not used by Example images - force_release: False - 17: - framework: "autogluon" - version: "0.2.1" - arch_type: "x86" - training: - device_types: ["cpu","gpu"] - python_versions: ["py37"] - os_version: "ubuntu18.04" - cuda_version: "cu102" - example: False - disable_sm_tag: False # [Default: False] This option is not used by Example images - force_release: False - 18: - framework: "huggingface_pytorch" - version: "1.8.1" - hf_transformers: "4.10.2" - arch_type: "x86" - inference: - device_types: [ "cpu", "gpu" ] - python_versions: [ "py36" ] - os_version: "ubuntu18.04" - cuda_version: "cu111" - example: False - disable_sm_tag: False # [Default: False] This option is not used by Example images - force_release: False - 19: - framework: "huggingface_tensorflow" - version: "2.4.3" - hf_transformers: "4.10.2" - arch_type: "x86" - training: - device_types: [ "gpu" ] - python_versions: [ "py37" ] - os_version: "ubuntu18.04" - cuda_version: "cu110" - example: False - disable_sm_tag: False # [Default: False] This option is not used by Example images - force_release: False - 20: - framework: "tensorflow" - version: "2.6.2" - arch_type: "x86" - training: - device_types: [ "cpu", "gpu" ] - python_versions: [ "py38" ] - os_version: "ubuntu20.04" - cuda_version: "cu112" - example: False # [Default: False] Set to True to denote that this image is an Example image - disable_sm_tag: False # [Default: False] Set to True to prevent SageMaker Abbreviated Tags from being attached - # to images being published. - force_release: False # [Default: False] Set to True to force images to be published even if the same image - # has already been published. Re-released image will have minor version incremented by 1. 
- 21: - framework: "tensorflow" - version: "2.6.2" - arch_type: "x86" - training: - device_types: [ "gpu" ] - python_versions: [ "py38" ] - os_version: "ubuntu20.04" - cuda_version: "cu112" - example: True - disable_sm_tag: False # [Default: False] This option is not used by Example images - force_release: False - 22: - framework: "autogluon" - version: "0.3.1" - arch_type: "x86" - training: - device_types: ["cpu","gpu"] - python_versions: ["py37"] - os_version: "ubuntu18.04" - cuda_version: "cu102" - example: False - disable_sm_tag: False # [Default: False] This option is not used by Example images - force_release: False - 23: - framework: "autogluon" - version: "0.3.1" - arch_type: "x86" - inference: - device_types: ["cpu"] - python_versions: [ "py37" ] - os_version: "ubuntu16.04" - example: False - disable_sm_tag: False - force_release: False - 24: - framework: "huggingface_tensorflow" - version: "2.5.1" - hf_transformers: "4.12.3" - arch_type: "x86" - training: - device_types: [ "gpu" ] - python_versions: [ "py37" ] - os_version: "ubuntu18.04" - cuda_version: "cu112" - example: False - disable_sm_tag: False # [Default: False] This option is not used by Example images - force_release: False - 25: - framework: "huggingface_tensorflow" - version: "2.5.1" - hf_transformers: "4.12.3" - arch_type: "x86" - inference: - device_types: [ "cpu", "gpu" ] - python_versions: [ "py37" ] - os_version: "ubuntu18.04" - cuda_version: "cu112" - example: False - disable_sm_tag: False # [Default: False] This option is not used by Example images - force_release: False - 26: - framework: "huggingface_pytorch" - version: "1.9.1" - hf_transformers: "4.12.3" - arch_type: "x86" - training: - device_types: [ "gpu" ] - python_versions: [ "py38" ] - os_version: "ubuntu20.04" - cuda_version: "cu111" - example: False - disable_sm_tag: False # [Default: False] This option is not used by Example images - force_release: False - 27: - framework: "huggingface_pytorch" - version: "1.9.1" - hf_transformers: "4.12.3" - arch_type: "x86" - inference: - device_types: [ "cpu", "gpu" ] - python_versions: [ "py38" ] - os_version: "ubuntu20.04" - cuda_version: "cu111" - example: False - disable_sm_tag: False # [Default: False] This option is not used by Example images - force_release: False - 28: - framework: "mxnet" - version: "1.8.0" - arch_type: "x86" - inference: - device_types: ["neuron"] - python_versions: ["py37"] - os_version: "ubuntu18.04" - neuron_sdk_version: "sdk1.16.1" - example: False - disable_sm_tag: True - force_release: False - 29: - framework: "tensorflow" - version: "2.5.1" - arch_type: "x86" - inference: - device_types: ["neuron"] - python_versions: ["py37"] - os_version: "ubuntu18.04" - neuron_sdk_version: "sdk1.16.1" - example: False - disable_sm_tag: True - force_release: False - 30: - framework: "tensorflow" - version: "1.15.5" - arch_type: "x86" - inference: - device_types: ["neuron"] - python_versions: ["py37"] - os_version: "ubuntu18.04" - neuron_sdk_version: "sdk1.16.1" - example: False - disable_sm_tag: True - force_release: False - 31: - framework: "pytorch" - version: "1.10.0" - customer_type: "ec2" - arch_type: "graviton" - inference: - device_types: ["cpu"] - python_versions: [ "py38" ] - os_version: "ubuntu20.04" - example: False # [Default: False] Set to True to denote that this image is an Example image - disable_sm_tag: True # [Default: False] Set to True to prevent SageMaker Abbreviated Tags from being attached - # to images being published. 
- force_release: False # [Default: False] Set to True to force images to be published even if the same image - # has already been published. Re-released image will have minor version incremented by 1. - 32: - framework: "tensorflow" - version: "2.7.0" - customer_type: "ec2" - arch_type: "graviton" - inference: - device_types: ["cpu"] - python_versions: [ "py38" ] - os_version: "ubuntu20.04" - example: False # [Default: False] Set to True to denote that this image is an Example image - disable_sm_tag: True # [Default: False] Set to True to prevent SageMaker Abbreviated Tags from being attached - # to images being published. - force_release: False # [Default: False] Set to True to force images to be published even if the same image - # has already been published. Re-released image will have minor version incremented by 1. - 33: - framework: "tensorflow" - version: "2.5.2" - arch_type: "x86" - inference: - device_types: ["neuron"] - python_versions: ["py37"] - os_version: "ubuntu18.04" - neuron_sdk_version: "sdk1.17.0" - example: False - disable_sm_tag: True - force_release: False - 34: - framework: "pytorch" - version: "1.10.1" - arch_type: "x86" - inference: - device_types: ["neuron"] - python_versions: ["py37"] - os_version: "ubuntu18.04" - neuron_sdk_version: "sdk1.17.0" - example: False - disable_sm_tag: True - force_release: False - 35: - framework: "mxnet" - version: "1.9.0" - customer_type: "ec2" - arch_type: "x86" - training: - device_types: [ "cpu", "gpu" ] - python_versions: [ "py38" ] - os_version: "ubuntu20.04" - cuda_version: "cu112" - example: False # [Default: False] Set to True to denote that this image is an Example image - disable_sm_tag: False # [Default: False] Set to True to prevent SageMaker Abbreviated Tags from being attached - # to images being published. - force_release: False # [Default: False] Set to True to force images to be published even if the same image - # has already been published. Re-released image will have minor version incremented by 1. - inference: - device_types: ["cpu", "gpu"] - python_versions: ["py38"] - os_version: "ubuntu20.04" - cuda_version: "cu112" - example: False # [Default: False] Set to True to denote that this image is an Example image - disable_sm_tag: False # [Default: False] Set to True to prevent SageMaker Abbreviated Tags from being attached - # to images being published. - force_release: False # [Default: False] Set to True to force images to be published even if the same image - # has already been published. Re-released image will have minor version incremented by 1. - 36: - framework: "mxnet" - version: "1.9.0" - customer_type: "sagemaker" - arch_type: "x86" - training: - device_types: [ "cpu", "gpu" ] - python_versions: [ "py38" ] - os_version: "ubuntu20.04" - cuda_version: "cu112" - example: False # [Default: False] Set to True to denote that this image is an Example image - disable_sm_tag: False # [Default: False] Set to True to prevent SageMaker Abbreviated Tags from being attached - # to images being published. - force_release: False # [Default: False] Set to True to force images to be published even if the same image - # has already been published. Re-released image will have minor version incremented by 1. 
- inference: - device_types: ["cpu", "gpu"] - python_versions: ["py38"] - os_version: "ubuntu20.04" - cuda_version: "cu112" - example: False # [Default: False] Set to True to denote that this image is an Example image - disable_sm_tag: False # [Default: False] Set to True to prevent SageMaker Abbreviated Tags from being attached - # to images being published. - force_release: False # [Default: False] Set to True to force images to be published even if the same image - # has already been published. Re-released image will have minor version incremented by 1. - 37: - framework: "mxnet" - version: "1.9.0" - customer_type: "ec2" - arch_type: "x86" - training: - device_types: [ "gpu" ] - python_versions: [ "py38" ] - os_version: "ubuntu20.04" - cuda_version: "cu112" - example: True # [Default: False] Set to True to denote that this image is an Example image - disable_sm_tag: False # [Default: False] Set to True to prevent SageMaker Abbreviated Tags from being attached - # to images being published. - force_release: False # [Default: False] Set to True to force images to be published even if the same image - # has already been published. Re-released image will have minor version incremented by 1. - 38: - framework: "tensorflow" - version: "2.8.0" - customer_type: "ec2" - arch_type: "x86" - training: - device_types: ["cpu","gpu"] - python_versions: ["py39"] - os_version: "ubuntu20.04" - cuda_version: "cu112" - example: False - disable_sm_tag: True - force_release: False - inference: - device_types: ["cpu","gpu"] - python_versions: ["py39"] - os_version: "ubuntu20.04" - cuda_version: "cu112" - example: False - disable_sm_tag: True # [Default: False] This option is not used by Example images - force_release: False - 39: - framework: "tensorflow" - version: "2.8.0" - customer_type: "ec2" - arch_type: "x86" - training: - device_types: ["gpu"] - python_versions: ["py39"] - os_version: "ubuntu20.04" - cuda_version: "cu112" - example: True - disable_sm_tag: True - force_release: False - 40: - framework: "huggingface_tensorflow" - version: "2.6.3" - arch_type: "x86" - hf_transformers: "4.17.0" - training: - device_types: [ "gpu" ] - python_versions: [ "py38" ] - os_version: "ubuntu20.04" - cuda_version: "cu112" - example: False # [Default: False] Set to True to denote that this image is an Example image - disable_sm_tag: False # [Default: False] Set to True to prevent SageMaker Abbreviated Tags from being attached - # to images being published. - force_release: False # [Default: False] Set to True to force images to be published even if the same image - # has already been published. Re-released image will have minor version incremented by 1. - inference: - device_types: [ "cpu", "gpu" ] - python_versions: [ "py38" ] - os_version: "ubuntu20.04" - cuda_version: "cu112" - example: False # [Default: False] Set to True to denote that this image is an Example image - disable_sm_tag: False # [Default: False] Set to True to prevent SageMaker Abbreviated Tags from being attached - # to images being published. - force_release: False # [Default: False] Set to True to force images to be published even if the same image - # has already been published. Re-released image will have minor version incremented by 1. 
- 41: - framework: "pytorch" - version: "1.11.0" - arch_type: "x86" - customer_type: "ec2" - training: - device_types: [ "cpu", "gpu" ] - python_versions: [ "py38" ] - os_version: "ubuntu20.04" - cuda_version: "cu115" - example: False # [Default: False] Set to True to denote that this image is an Example image - disable_sm_tag: False # [Default: False] Set to True to prevent SageMaker Abbreviated Tags from being attached - # to images being published. - force_release: False # [Default: False] Set to True to force images to be published even if the same image - # has already been published. Re-released image will have minor version incremented by 1. - inference: - device_types: [ "cpu", "gpu" ] - python_versions: [ "py38" ] - os_version: "ubuntu20.04" - cuda_version: "cu115" - example: False # [Default: False] Set to True to denote that this image is an Example image - disable_sm_tag: False # [Default: False] Set to True to prevent SageMaker Abbreviated Tags from being attached - # to images being published. - force_release: False # [Default: False] Set to True to force images to be published even if the same image - # has already been published. Re-released image will have minor version incremented by 1. - 42: - framework: "pytorch" - version: "1.11.0" - arch_type: "x86" - customer_type: "ec2" - training: - device_types: [ "gpu" ] - python_versions: [ "py38" ] - os_version: "ubuntu20.04" - cuda_version: "cu115" - example: True # [Default: False] Set to True to denote that this image is an Example image - disable_sm_tag: False # [Default: False] Set to True to prevent SageMaker Abbreviated Tags from being attached - # to images being published. - force_release: False # [Default: False] Set to True to force images to be published even if the same image - # has already been published. Re-released image will have minor version incremented by 1. - 43: - framework: "huggingface_pytorch" - version: "1.10.2" - arch_type: "x86" - hf_transformers: "4.17.0" - training: - device_types: [ "gpu" ] - python_versions: [ "py38" ] - os_version: "ubuntu20.04" - cuda_version: "cu113" - example: False # [Default: False] Set to True to denote that this image is an Example image - disable_sm_tag: False # [Default: False] Set to True to prevent SageMaker Abbreviated Tags from being attached - # to images being published. - force_release: False # [Default: False] Set to True to force images to be published even if the same image - # has already been published. Re-released image will have minor version incremented by 1. - inference: - device_types: [ "cpu", "gpu" ] - python_versions: [ "py38" ] - os_version: "ubuntu20.04" - cuda_version: "cu113" - example: False # [Default: False] Set to True to denote that this image is an Example image - disable_sm_tag: False # [Default: False] Set to True to prevent SageMaker Abbreviated Tags from being attached - # to images being published. - force_release: False # [Default: False] Set to True to force images to be published even if the same image - # has already been published. Re-released image will have minor version incremented by 1. 
- 44: - framework: "mxnet" - version: "1.9.0" - customer_type: "ec2" - arch_type: "x86" - training: - device_types: [ "cpu", "gpu" ] - python_versions: [ "py38" ] - os_version: "ubuntu20.04" - cuda_version: "cu112" - example: False # [Default: False] Set to True to denote that this image is an Example image - disable_sm_tag: False # [Default: False] Set to True to prevent SageMaker Abbreviated Tags from being attached - # to images being published. - force_release: False # [Default: False] Set to True to force images to be published even if the same image - # has already been published. Re-released image will have minor version incremented by 1. - inference: - device_types: ["cpu", "gpu"] - python_versions: ["py38"] - os_version: "ubuntu20.04" - cuda_version: "cu112" - example: False # [Default: False] Set to True to denote that this image is an Example image - disable_sm_tag: False # [Default: False] Set to True to prevent SageMaker Abbreviated Tags from being attached - # to images being published. - force_release: False # [Default: False] Set to True to force images to be published even if the same image - # has already been published. Re-released image will have minor version incremented by 1. - 45: - framework: "mxnet" - version: "1.9.0" - customer_type: "ec2" - arch_type: "x86" - training: - device_types: [ "gpu" ] - python_versions: [ "py38" ] - os_version: "ubuntu20.04" - cuda_version: "cu112" - example: True # [Default: False] Set to True to denote that this image is an Example image - disable_sm_tag: False # [Default: False] Set to True to prevent SageMaker Abbreviated Tags from being attached - # to images being published. - force_release: False # [Default: False] Set to True to force images to be published even if the same image - # has already been published. Re-released image will have minor version incremented by 1. 
- 46: - framework: "pytorch" - version: "1.10.2" - arch_type: "x86" - customer_type: "ec2" - training: - device_types: [ "cpu", "gpu" ] - python_versions: [ "py38" ] - cuda_version: "cu113" - os_version: "ubuntu20.04" - example: False - disable_sm_tag: False - force_release: False - inference: - device_types: [ "cpu", "gpu" ] - python_versions: [ "py38" ] - cuda_version: "cu113" - os_version: "ubuntu20.04" - example: False - disable_sm_tag: False - force_release: False - 47: - framework: "pytorch" - version: "1.10.2" - arch_type: "x86" - customer_type: "ec2" - training: - device_types: [ "gpu" ] - python_versions: [ "py38" ] - cuda_version: "cu113" - os_version: "ubuntu20.04" - example: True - disable_sm_tag: False - force_release: False - 48: - framework: "pytorch" - version: "1.10.2" - arch_type: "x86" - customer_type: "sagemaker" - training: - device_types: [ "cpu", "gpu" ] - python_versions: [ "py38" ] - cuda_version: "cu113" - os_version: "ubuntu20.04" - example: False - disable_sm_tag: False - force_release: False - inference: - device_types: [ "cpu", "gpu" ] - python_versions: [ "py38" ] - cuda_version: "cu113" - os_version: "ubuntu20.04" - example: False - disable_sm_tag: False - force_release: False - 49: - framework: "tensorflow" - version: "2.9.1" - customer_type: "ec2" - arch_type: "x86" - training: - device_types: ["cpu", "gpu"] - python_versions: ["py39"] - os_version: "ubuntu20.04" - cuda_version: "cu112" - example: False - disable_sm_tag: False - force_release: False - 50: - framework: "tensorflow" - version: "2.9.1" - customer_type: "ec2" - arch_type: "x86" - training: - device_types: ["gpu"] - python_versions: ["py39"] - os_version: "ubuntu20.04" - cuda_version: "cu112" - example: True - disable_sm_tag: False - force_release: False - 51: - framework: "tensorflow" - version: "2.9.1" - customer_type: "sagemaker" - arch_type: "x86" - training: - device_types: ["cpu", "gpu"] - python_versions: ["py39"] - os_version: "ubuntu20.04" - cuda_version: "cu112" - example: False - disable_sm_tag: False - force_release: False - 52: - framework: "autogluon" - version: "0.3.2" - arch_type: "x86" - training: - device_types: [ "cpu", "gpu" ] - python_versions: [ "py38" ] - os_version: "ubuntu18.04" - cuda_version: "cu112" - example: False # [Default: False] Set to True to denote that this image is an Example image - disable_sm_tag: False # [Default: False] Set to True to prevent SageMaker Abbreviated Tags from being attached - # to images being published. - force_release: False # [Default: False] Set to True to force images to be published even if the same image - # has already been published. Re-released image will have minor version incremented by 1. - inference: - device_types: [ "cpu" ] - python_versions: [ "py38" ] - os_version: "ubuntu20.04" - cuda_version: "cu112" - example: False # [Default: False] Set to True to denote that this image is an Example image - disable_sm_tag: False # [Default: False] Set to True to prevent SageMaker Abbreviated Tags from being attached - # to images being published. - force_release: False # [Default: False] Set to True to force images to be published even if the same image - # has already been published. Re-released image will have minor version incremented by 1. 
- 53: - framework: "autogluon" - version: "0.4.2" - arch_type: "x86" - training: - device_types: [ "cpu", "gpu" ] - python_versions: [ "py38" ] - os_version: "ubuntu20.04" - cuda_version: "cu112" - example: False # [Default: False] Set to True to denote that this image is an Example image - disable_sm_tag: False # [Default: False] Set to True to prevent SageMaker Abbreviated Tags from being attached - # to images being published. - force_release: False # [Default: False] Set to True to force images to be published even if the same image - # has already been published. Re-released image will have minor version incremented by 1. - inference: - device_types: [ "cpu", "gpu" ] - python_versions: [ "py38" ] - os_version: "ubuntu20.04" - cuda_version: "cu112" - example: False # [Default: False] Set to True to denote that this image is an Example image - disable_sm_tag: False # [Default: False] Set to True to prevent SageMaker Abbreviated Tags from being attached - # to images being published. - force_release: False # [Default: False] Set to True to force images to be published even if the same image - # has already been published. Re-released image will have minor version incremented by 1. - 54: - framework: "tensorflow" - version: "2.6.3" - arch_type: "x86" - training: - device_types: [ "cpu", "gpu" ] - python_versions: [ "py38" ] - os_version: "ubuntu20.04" - cuda_version: "cu112" - example: False - disable_sm_tag: False - force_release: False - 55: - framework: "tensorflow" - version: "2.6.3" - arch_type: "x86" - training: - device_types: [ "gpu" ] - python_versions: [ "py38" ] - os_version: "ubuntu20.04" - cuda_version: "cu112" - example: True - disable_sm_tag: False - force_release: False - 56: - framework: "tensorflow" - version: "2.5.3" - arch_type: "x86" - training: - device_types: ["cpu", "gpu"] - python_versions: ["py37"] - os_version: "ubuntu18.04" - cuda_version: "cu112" - example: False - disable_sm_tag: False # [Default: False] This option is not used by Example images - force_release: False - 57: - framework: "tensorflow" - version: "2.5.3" - arch_type: "x86" - training: - device_types: ["gpu"] - python_versions: ["py37"] - os_version: "ubuntu18.04" - cuda_version: "cu112" - example: True - disable_sm_tag: False # [Default: False] This option is not used by Example images - force_release: False - 58: - framework: "huggingface_tensorflow" - version: "2.5.3" - arch_type: "x86" - hf_transformers: "4.12.3" - training: - device_types: ["gpu"] - python_versions: ["py37"] - os_version: "ubuntu18.04" - cuda_version: "cu112" - example: false - disable_sm_tag: false - force_release: false - inference: - device_types: ["cpu", "gpu"] - python_versions: ["py37"] - os_version: "ubuntu18.04" - cuda_version: "cu112" - example: false - disable_sm_tag: false - force_release: false - 59: - framework: "pytorch" - version: "1.12.1" - arch_type: "x86" - customer_type: "ec2" - training: - device_types: ["cpu", "gpu" ] - python_versions: [ "py38" ] - os_version: "ubuntu20.04" - cuda_version: "cu116" - example: False # [Default: False] Set to True to denote that this image is an Example image - disable_sm_tag: False # [Default: False] Set to True to prevent SageMaker Abbreviated Tags from being attached - # to images being published. - force_release: False # [Default: False] Set to True to force images to be published even if the same image - # has already been published. Re-released image will have minor version incremented by 1. 
- inference: - device_types: [ "cpu", "gpu" ] - python_versions: [ "py38" ] - os_version: "ubuntu20.04" - cuda_version: "cu116" - example: False # [Default: False] Set to True to denote that this image is an Example image - disable_sm_tag: False # [Default: False] Set to True to prevent SageMaker Abbreviated Tags from being attached - # to images being published. - force_release: False # [Default: False] Set to True to force images to be published even if the same image - # has already been published. Re-released image will have minor version incremented by 1. - 60: - framework: "pytorch" - version: "1.12.1" - arch_type: "x86" - customer_type: "ec2" - training: - device_types: [ "gpu" ] - python_versions: [ "py38" ] - os_version: "ubuntu20.04" - cuda_version: "cu116" - example: True # [Default: False] Set to True to denote that this image is an Example image - disable_sm_tag: False # [Default: False] Set to True to prevent SageMaker Abbreviated Tags from being attached - # to images being published. - force_release: False # [Default: False] Set to True to force images to be published even if the same image - # has already been published. Re-released image will have minor version incremented by 1. - 61: - framework: "pytorch" - version: "1.12.1" - arch_type: "x86" - customer_type: "sagemaker" - training: - device_types: [ "cpu", "gpu" ] - python_versions: [ "py38" ] - os_version: "ubuntu20.04" - cuda_version: "cu113" - example: False # [Default: False] Set to True to denote that this image is an Example image - disable_sm_tag: False # [Default: False] Set to True to prevent SageMaker Abbreviated Tags from being attached - # to images being published. - force_release: False # [Default: False] Set to True to force images to be published even if the same image - # has already been published. Re-released image will have minor version incremented by 1. - inference: - device_types: [ "cpu", "gpu" ] - python_versions: [ "py38" ] - os_version: "ubuntu20.04" - cuda_version: "cu113" - example: False # [Default: False] Set to True to denote that this image is an Example image - disable_sm_tag: False # [Default: False] Set to True to prevent SageMaker Abbreviated Tags from being attached - # to images being published. - force_release: False # [Default: False] Set to True to force images to be published even if the same image - # has already been published. Re-released image will have minor version incremented by 1. 
- 62: - framework: "pytorch" - version: "1.10.2" - arch_type: "x86" - inference: - device_types: ["neuron"] - python_versions: ["py37"] - os_version: "ubuntu18.04" - neuron_sdk_version: "sdk1.19.0" - example: False - disable_sm_tag: True - force_release: False - 63: - framework: "huggingface_pytorch_trcomp" - version: "1.11.0" - hf_transformers: "4.21.1" - arch_type: "x86" - training: - device_types: [ "gpu" ] - python_versions: [ "py38" ] - os_version: "ubuntu20.04" - cuda_version: "cu113" - example: False - disable_sm_tag: False # [Default: False] This option is not used by Example images - force_release: False - 64: - framework: "pytorch" - version: "1.5.1" - arch_type: "x86" - inference: - device_types: ["eia"] - python_versions: ["py38"] - os_version: "ubuntu20.04" - example: False - disable_sm_tag: True - 65: - framework: "tensorflow" - version: "2.10.0" - customer_type: "ec2" - arch_type: "x86" - training: - device_types: ["gpu"] - python_versions: ["py39"] - os_version: "ubuntu20.04" - cuda_version: "cu112" - example: True - disable_sm_tag: False - force_release: False - 66: - framework: "pytorch" - version: "1.11.0" - arch_type: "x86" - customer_type: "sagemaker" - training: - device_types: [ "cpu", "gpu" ] - python_versions: [ "py38" ] - os_version: "ubuntu20.04" - cuda_version: "cu113" - example: False # [Default: False] Set to True to denote that this image is an Example image - disable_sm_tag: False # [Default: False] Set to True to prevent SageMaker Abbreviated Tags from being attached - # to images being published. - force_release: False # [Default: False] Set to True to force images to be published even if the same image - # has already been published. Re-released image will have minor version incremented by 1. - inference: - device_types: [ "cpu", "gpu" ] - python_versions: [ "py38" ] - os_version: "ubuntu20.04" - cuda_version: "cu113" - example: False # [Default: False] Set to True to denote that this image is an Example image - disable_sm_tag: False # [Default: False] Set to True to prevent SageMaker Abbreviated Tags from being attached - # to images being published. - force_release: False # [Default: False] Set to True to force images to be published even if the same image - # has already been published. Re-released image will have minor version incremented by 1. 
- 67: - framework: "autogluon" - version: "0.5.2" - arch_type: "x86" - training: - device_types: ["cpu","gpu"] - python_versions: ["py38"] - os_version: "ubuntu20.04" - cuda_version: "cu112" - example: False - disable_sm_tag: False # [Default: False] This option is not used by Example images - force_release: False - inference: - device_types: ["cpu","gpu"] - python_versions: ["py38"] - os_version: "ubuntu20.04" - cuda_version: "cu112" - example: False - disable_sm_tag: False # [Default: False] This option is not used by Example images - force_release: False - 68: - framework: "pytorch" - version: "1.13.0" - arch_type: "x86" - customer_type: "ec2" - training: - device_types: ["cpu", "gpu"] - python_versions: [ "py39" ] - os_version: "ubuntu20.04" - cuda_version: "cu117" - example: False - disable_sm_tag: False # [Default: False] This option is not used by Example images - force_release: False - inference: - device_types: ["cpu", "gpu"] - python_versions: ["py39"] - os_version: "ubuntu20.04" - cuda_version: "cu117" - example: False - disable_sm_tag: False # [Default: False] This option is not used by Example images - force_release: False - 69: - framework: "tensorflow" - version: "2.11.0" - customer_type: "ec2" - arch_type: "x86" - training: - device_types: ["cpu", "gpu"] - python_versions: ["py39"] - os_version: "ubuntu20.04" - cuda_version: "cu112" - example: False - disable_sm_tag: False - force_release: False - inference: - device_types: ["cpu", "gpu"] - python_versions: ["py39"] - os_version: "ubuntu20.04" - cuda_version: "cu112" - example: False - disable_sm_tag: False - force_release: False - 70: - framework: "tensorflow" - version: "2.11.0" - customer_type: "ec2" - arch_type: "x86" - training: - device_types: ["gpu"] - python_versions: ["py39"] - os_version: "ubuntu20.04" - cuda_version: "cu112" - example: True - disable_sm_tag: False - force_release: False - 71: - framework: "tensorflow" - version: "2.11.0" - customer_type: "sagemaker" - arch_type: "x86" - inference: - device_types: ["cpu", "gpu"] - python_versions: ["py39"] - os_version: "ubuntu20.04" - cuda_version: "cu112" - example: False - disable_sm_tag: False - force_release: False - 72: - framework: "pytorch" - version: "1.13.1" - customer_type: "ec2" - arch_type: "x86" - training: - device_types: ["cpu", "gpu"] - python_versions: ["py39"] - os_version: "ubuntu20.04" - cuda_version: "cu117" - example: False - disable_sm_tag: False # [Default: False] This option is not used by Example images - force_release: False - 73: - framework: "pytorch" - version: "1.13.1" - customer_type: "ec2" - arch_type: "x86" - training: - device_types: ["gpu"] - python_versions: ["py39"] - os_version: "ubuntu20.04" - cuda_version: "cu117" - example: True - disable_sm_tag: False # [Default: False] This option is not used by Example images - force_release: False - 74: - framework: "tensorflow" - version: "2.9.3" - customer_type: "ec2" - arch_type: "x86" - inference: - device_types: ["cpu", "gpu"] - python_versions: ["py39"] - os_version: "ubuntu20.04" - cuda_version: "cu112" - example: False - disable_sm_tag: False - force_release: False - 75: - framework: "tensorflow" - version: "2.9.3" - customer_type: "sagemaker" - arch_type: "x86" - inference: - device_types: ["cpu", "gpu"] - python_versions: ["py39"] - os_version: "ubuntu20.04" - cuda_version: "cu112" - example: False - disable_sm_tag: False - force_release: False - 76: - framework: "pytorch_trcomp" - version: "1.13.1" - arch_type: "x86" - customer_type: "sagemaker" - training: - device_types: 
[ "gpu" ] - python_versions: [ "py39" ] - os_version: "ubuntu20.04" - cuda_version: "cu117" - example: False - disable_sm_tag: False # [Default: False] This option is not used by Example images - force_release: False - 77: - framework: "autogluon" - version: "0.6.2" - arch_type: "x86" - training: - device_types: ["cpu", "gpu"] - python_versions: ["py38"] - os_version: "ubuntu20.04" - cuda_version: "cu113" - example: False - disable_sm_tag: False # [Default: False] This option is not used by Example images - force_release: False - 78: - framework: "huggingface_pytorch" - version: "1.13.1" - arch_type: "x86" - hf_transformers: "4.26.0" - training: - device_types: [ "gpu" ] - python_versions: [ "py39" ] - os_version: "ubuntu20.04" - cuda_version: "cu117" - example: False # [Default: False] Set to True to denote that this image is an Example image - disable_sm_tag: False # [Default: False] Set to True to prevent SageMaker Abbreviated Tags from being attached - # to images being published. Hugging Face images set this to True because they're inherantly built for SageMaker, - # unlike other images which may be built for both SageMaker and EC2 and therefore need to distinguish variants. - force_release: False # [Default: False] Set to True to force images to be published even if the same image - # has already been published. Re-released image will have minor version incremented by 1. - 79: - framework: "tensorflow" - version: "2.10.1" - arch_type: "x86" - inference: - device_types: [ "neuron" ] - python_versions: [ "py38" ] - os_version: "ubuntu20.04" - neuron_sdk_version: "sdk2.9.0" - example: False # [Default: False] Set to True to denote that this image is an Example image - disable_sm_tag: True # [Default: False] Set to True to prevent SageMaker Abbreviated Tags from being attached - # to images being published. - force_release: False # [Default: False] Set to True to force images to be published even if the same image - # has already been published. Re-released image will have minor version incremented by 1. - 80: - framework: "pytorch" - version: "1.13.0" - arch_type: "x86" - inference: - device_types: [ "neuronx" ] - python_versions: [ "py38" ] - os_version: "ubuntu20.04" - neuron_sdk_version: "sdk2.9.0" - example: False # [Default: False] Set to True to denote that this image is an Example image - disable_sm_tag: True # [Default: False] Set to True to prevent SageMaker Abbreviated Tags from being attached - # to images being published. - force_release: False # [Default: False] Set to True to force images to be published even if the same image - # has already been published. Re-released image will have minor version incremented by 1. - 81: - framework: "tensorflow" - version: "2.10.1" - arch_type: "x86" - inference: - device_types: [ "neuronx" ] - python_versions: [ "py38" ] - os_version: "ubuntu20.04" - neuron_sdk_version: "sdk2.9.0" - example: False # [Default: False] Set to True to denote that this image is an Example image - disable_sm_tag: True # [Default: False] Set to True to prevent SageMaker Abbreviated Tags from being attached - # to images being published. - force_release: False # [Default: False] Set to True to force images to be published even if the same image - # has already been published. Re-released image will have minor version incremented by 1. 
- 82: - framework: "tensorflow" - version: "2.11.1" - customer_type: "ec2" - arch_type: "x86" - training: - device_types: ["cpu", "gpu"] - python_versions: ["py39"] - os_version: "ubuntu20.04" - cuda_version: "cu112" - example: False - disable_sm_tag: False - force_release: False - 83: - framework: "tensorflow" - version: "2.11.1" - customer_type: "sagemaker" - arch_type: "x86" - training: - device_types: ["cpu", "gpu"] - python_versions: ["py39"] - os_version: "ubuntu20.04" - cuda_version: "cu112" - example: False - disable_sm_tag: False - force_release: False - 84: - framework: "tensorflow" - version: "2.11.1" - customer_type: "ec2" - arch_type: "x86" - training: - device_types: ["gpu"] - python_versions: ["py39"] - os_version: "ubuntu20.04" - cuda_version: "cu112" - example: True - disable_sm_tag: False - force_release: False - 85: - framework: "tensorflow" - version: "2.12.0" - customer_type: "ec2" - arch_type: "x86" - training: - device_types: ["cpu", "gpu"] - python_versions: ["py310"] - os_version: "ubuntu20.04" - cuda_version: "cu118" - example: False - disable_sm_tag: False - force_release: False - 86: - framework: "tensorflow" - version: "2.12.0" - customer_type: "ec2" - arch_type: "x86" - training: - device_types: ["gpu"] - python_versions: ["py310"] - os_version: "ubuntu20.04" - cuda_version: "cu118" - example: True - disable_sm_tag: False - force_release: False - 87: - framework: "tensorflow" - version: "2.12.0" - customer_type: "sagemaker" - arch_type: "x86" - training: - device_types: ["cpu", "gpu"] - python_versions: ["py310"] - os_version: "ubuntu20.04" - cuda_version: "cu118" - example: False - disable_sm_tag: False - force_release: False - 88: - framework: "huggingface_pytorch" - version: "1.13.1" - arch_type: "x86" - hf_transformers: "4.34.1" - training: - device_types: [ "neuronx" ] - python_versions: [ "py310" ] - os_version: "ubuntu20.04" - neuron_sdk_version: "sdk2.15.0" - example: False - disable_sm_tag: True # [Default: False] Set to True to prevent SageMaker Abbreviated Tags from being attached - # to images being published. Hugging Face images set this to True because they're inherently built for SageMaker, - # unlike other images which may be built for both SageMaker and EC2 and therefore need to distinguish variants. - force_release: False diff --git a/.release_templates/mxnet_release_images.yml b/.release_templates/mxnet_release_images.yml deleted file mode 100644 index 9e971763ef0f..000000000000 --- a/.release_templates/mxnet_release_images.yml +++ /dev/null @@ -1,34 +0,0 @@ ---- -release_images: - 1: - framework: "mxnet" - version: "1.6.0" - training: - device_types: ["cpu", "gpu"] - python_versions: ["py27", "py36"] - os_version: "ubuntu16.04" - cuda_version: "cu101" - example: False # [Default: False] Set to True to denote that this image is an Example image - disable_sm_tag: False # [Default: False] Set to True to prevent SageMaker Abbreviated Tags from being attached - # to images being published. - force_release: False # [Default: False] Set to True to force images to be published even if the same image - # has already been published. Re-released image will have minor version incremented by 1. 
- inference: - device_types: ["cpu", "gpu"] - python_versions: ["py27", "py36"] - os_version: "ubuntu16.04" - cuda_version: "cu101" - example: False - disable_sm_tag: False - force_release: False - 2: - framework: "mxnet" - version: "1.6.0" - training: - device_types: ["gpu"] - python_versions: ["py36"] - os_version: "ubuntu16.04" - cuda_version: "cu101" - example: True - disable_sm_tag: False # [Default: False] This option is not used by Example images - force_release: False diff --git a/.release_templates/pytorch_release_images.yml b/.release_templates/pytorch_release_images.yml deleted file mode 100644 index 347eaa5aa593..000000000000 --- a/.release_templates/pytorch_release_images.yml +++ /dev/null @@ -1,34 +0,0 @@ ---- -release_images: - 1: - framework: "pytorch" - version: "1.5.1" - training: - device_types: ["cpu", "gpu"] - python_versions: ["py36"] - os_version: "ubuntu16.04" - cuda_version: "cu101" - example: False # [Default: False] Set to True to denote that this image is an Example image - disable_sm_tag: False # [Default: False] Set to True to prevent SageMaker Abbreviated Tags from being attached - # to images being published. - force_release: False # [Default: False] Set to True to force images to be published even if the same image - # has already been published. Re-released image will have minor version incremented by 1. - inference: - device_types: ["cpu", "gpu"] - python_versions: ["py36"] - os_version: "ubuntu16.04" - cuda_version: "cu101" - example: False - disable_sm_tag: False - force_release: False - 2: - framework: "pytorch" - version: "1.5.1" - training: - device_types: ["gpu"] - python_versions: ["py36"] - os_version: "ubuntu16.04" - cuda_version: "cu101" - example: True - disable_sm_tag: False # [Default: False] This option is not used by Example images - force_release: False diff --git a/.release_templates/stabilityai_release_images.yml b/.release_templates/stabilityai_release_images.yml deleted file mode 100644 index 0064ec891d95..000000000000 --- a/.release_templates/stabilityai_release_images.yml +++ /dev/null @@ -1,16 +0,0 @@ ---- -release_images: - 1: - framework: "stabilityai_pytorch" - version: "2.0.1" - arch_type: "x86" - customer_type: "sagemaker" - inference: - device_types: ["gpu"] - python_versions: ["py310"] - os_version: "ubuntu20.04" - cuda_version: "cu118" - sgm_version: "0.1.0" - example: False - disable_sm_tag: False - force_release: False diff --git a/.release_templates/tensorflow1_release_images.yml b/.release_templates/tensorflow1_release_images.yml deleted file mode 100644 index e6318e4c7f73..000000000000 --- a/.release_templates/tensorflow1_release_images.yml +++ /dev/null @@ -1,34 +0,0 @@ ---- -release_images: - 1: - framework: "tensorflow" - version: "1.15.3" - training: - device_types: ["cpu", "gpu"] - python_versions: ["py37"] - os_version: "ubuntu18.04" - cuda_version: "cu100" - example: False # [Default: False] Set to True to denote that this image is an Example image - disable_sm_tag: True # [Default: False] Set to True to prevent SageMaker Abbreviated Tags from being attached - # to images being published. - force_release: False # [Default: False] Set to True to force images to be published even if the same image - # has already been published. Re-released image will have minor version incremented by 1. 
- inference: - device_types: ["cpu", "gpu"] - python_versions: ["py36"] - os_version: "ubuntu18.04" - cuda_version: "cu100" - example: False - disable_sm_tag: False - force_release: False - 2: - framework: "tensorflow" - version: "1.15.3" - training: - device_types: ["gpu"] - python_versions: ["py37"] - os_version: "ubuntu18.04" - cuda_version: "cu100" - example: True - disable_sm_tag: False # [Default: False] This option is not used by Example images - force_release: False diff --git a/.release_templates/tensorflow2_release_images.yml b/.release_templates/tensorflow2_release_images.yml deleted file mode 100644 index 11f76403253e..000000000000 --- a/.release_templates/tensorflow2_release_images.yml +++ /dev/null @@ -1,55 +0,0 @@ ---- -release_images: - 1: - framework: "tensorflow" - version: "2.2.0" - training: - device_types: ["cpu", "gpu"] - python_versions: ["py37"] - os_version: "ubuntu18.04" - cuda_version: "cu102" - example: False # [Default: False] Set to True to denote that this image is an Example image - disable_sm_tag: True # [Default: False] Set to True to prevent SageMaker Abbreviated Tags from being attached - # to images being published. - force_release: False # [Default: False] Set to True to force images to be published even if the same image - # has already been published. Re-released image will have minor version incremented by 1. - inference: - device_types: ["cpu", "gpu"] - python_versions: ["py37"] - os_version: "ubuntu18.04" - cuda_version: "cu102" - example: False - disable_sm_tag: False - force_release: False - 2: - framework: "tensorflow" - version: "2.2.0" - training: - device_types: ["gpu"] - python_versions: ["py37"] - os_version: "ubuntu18.04" - cuda_version: "cu102" - example: True - disable_sm_tag: False # [Default: False] This option is not used by Example images - force_release: False - 3: - framework: "huggingface_tensorflow" - version: "2.6.3" - arch_type: "x86" - hf_transformers: "4.17.0" - training: - device_types: ["gpu"] - python_versions: ["py38"] - os_version: "ubuntu20.04" - cuda_version: "cu112" - example: false - disable_sm_tag: false - force_release: false - inference: - device_types: ["cpu", "gpu"] - python_versions: ["py38"] - os_version: "ubuntu20.04" - cuda_version: "cu112" - example: false - disable_sm_tag: false - force_release: false \ No newline at end of file diff --git a/autopr_buildspec.yml b/autopr_buildspec.yml index 9ab6e111bc63..e36ae238990a 100644 --- a/autopr_buildspec.yml +++ b/autopr_buildspec.yml @@ -18,7 +18,6 @@ phases: - export PYTHONPATH=$PYTHONPATH:$(pwd)/src - pip install -r src/requirements.txt - pip install -r test/requirements.txt - - pip install scheduler/. - echo Running $TEST_TYPE tests on $DLC_IMAGES... - python src/autopr_caller.py post_build: diff --git a/bjs_release_buildspec.yml b/bjs_release_buildspec.yml deleted file mode 100644 index c77f6d771c7c..000000000000 --- a/bjs_release_buildspec.yml +++ /dev/null @@ -1,9 +0,0 @@ -version: 0.2 - -phases: - pre_build: - commands: - - start-dockerd - build: - commands: - - publish_dlc_images_to_bjs diff --git a/custom_images.md b/custom_images.md deleted file mode 100644 index bacb9e2d81f6..000000000000 --- a/custom_images.md +++ /dev/null @@ -1,70 +0,0 @@ -# Building AWS Deep Learning Containers Custom Images - -## How to Build Custom Images - -We can easily customize both training and inference with Deep Learning Containers to add custom frameworks, libraries, and packages using Docker files\. 
- -### Training with TensorFlow - -In the following example Dockerfile, the resulting Docker image will have TensorFlow v1\.15\.2 optimized for GPUs and built to support Horovod and Python 3 for multi\-node distributed training\. It will also have the AWS samples GitHub repo which contains many deep learning model examples\. - -``` -# Take the base TensorFlow container -FROM 763104351884.dkr.ecr.us-east-1.amazonaws.com/tensorflow-training:1.15.2-gpu-py36-cu100-ubuntu18.04 - -# Add your custom stack of code -RUN git clone https://github.com/aws-samples/deep-learning-models -``` - -### Training with PyTorch - -In the following example Dockerfile, the resulting Docker image will have PyTorch v1\.6\.0 optimized for GPUs. It will also have the Amazon SageMaker samples GitHub repo which contains many deep learning model examples\. - -``` -# Take the base PyTorch container -FROM 763104351884.dkr.ecr.us-east-1.amazonaws.com/pytorch-training:1.6.0-gpu-py36-cu101-ubuntu16.04 - -# Add your custom stack of code -RUN git clone https://github.com/awslabs/amazon-sagemaker-examples -``` - - -### Training with MXNet - -In the following example Dockerfile, the resulting Docker image will have MXNet v1\.6\.0 optimized for GPUs and built to support Horovod and Python 3\. It will also have the MXNet GitHub repo which contains many deep learning model examples\. - -``` -# Take the base MXNet Container -FROM 763104351884.dkr.ecr.us-east-1.amazonaws.com/mxnet-training:1.6.0-gpu-py36-cu101-ubuntu16.04 - -# Add Custom stack of code -RUN git clone -b 1.6.0 https://github.com/apache/incubator-mxnet.git - -ENTRYPOINT ["python", "/incubator-mxnet/example/image-classification/train_mnist.py"] -``` -### Building the image and running the container - -Build the Docker image, pointing to your personal Docker registry \(usually your username\), with the image's custom name and custom tag\. - -``` -docker build -f Dockerfile -t <registry>/<image name>:<image tag> -``` - -Push to your personal Docker Registry: - -``` -docker push <registry>/<image name>:<image tag> -``` - -You can use the following command to run the container: - -``` -docker run -it <image name or tag> -``` - -**Important** -You may need to log in to access the Deep Learning Containers image repository\. Specify your region in the following command: - -``` -aws ecr get-login-password --region us-east-1 | docker login --username AWS --password-stdin 763104351884.dkr.ecr.us-east-1.amazonaws.com -``` diff --git a/dlc_developer_config.toml b/dlc_developer_config.toml index 8e544493d24d..ce4cad98d4e8 100644 --- a/dlc_developer_config.toml +++ b/dlc_developer_config.toml @@ -111,8 +111,6 @@ sagemaker_remote_efa_instance_type = "" # false by default nightly_pr_test_mode = false -use_scheduler = false - [buildspec_override] # Assign the path to the required buildspec file from the deep-learning-containers folder # For example: diff --git a/extended_release_buildspec.yml b/extended_release_buildspec.yml deleted file mode 100644 index c3fe608bfbde..000000000000 --- a/extended_release_buildspec.yml +++ /dev/null @@ -1,9 +0,0 @@ -version: 0.2 - -phases: - pre_build: - commands: - - start-dockerd - build: - commands: - - stage_extended_release diff --git a/image_transfer_buildspec.yml b/image_transfer_buildspec.yml index 9e4c6aa34f79..936a9c4761c3 100644 --- a/image_transfer_buildspec.yml +++ b/image_transfer_buildspec.yml @@ -18,7 +18,6 @@ phases: - export PYTHONPATH=$PYTHONPATH:$(pwd)/src - pip install -r src/requirements.txt - pip install -r test/requirements.txt - - pip install scheduler/. 
- echo Running $TEST_TYPE tests on $DLC_IMAGES... - python src/image_transfer.py post_build: diff --git a/scheduler/job_requester/__init__.py b/scheduler/job_requester/__init__.py deleted file mode 100644 index 7f0424c760f7..000000000000 --- a/scheduler/job_requester/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -from job_requester.response import Message -from job_requester.requester import JobRequester diff --git a/scheduler/job_requester/requester.py b/scheduler/job_requester/requester.py deleted file mode 100644 index c21a86a19ebf..000000000000 --- a/scheduler/job_requester/requester.py +++ /dev/null @@ -1,320 +0,0 @@ -import json -import logging -import os -import re -import sys -import time - -from datetime import datetime -from threading import Lock -from functools import cmp_to_key - -import boto3 - -from job_requester import Message - -MAX_TIMEOUT_IN_SEC = 5000 - -LOGGER = logging.getLogger(__name__) -LOGGER.setLevel(logging.DEBUG) -LOGGER.addHandler(logging.StreamHandler(sys.stdout)) - - -class JobRequester: - def __init__(self, timeout=MAX_TIMEOUT_IN_SEC): - self.s3_ticket_bucket = "dlc-test-tickets" - self.s3_ticket_bucket_folder = "request_tickets" - self.timeout_limit = min(timeout, MAX_TIMEOUT_IN_SEC) - - self.s3_client = boto3.client("s3") - self.s3_resource = boto3.resource("s3") - - self.ticket_name_counter = 0 - self.request_lock = Lock() - - def create_ticket_content(self, image, context, num_of_instances, request_time): - """ - Create content of the ticket to be sent to S3 - - :param image: ECR URI - :param context: build context (PR/MAINLINE/NIGHTLY/DEV) - :param num_of_instances: number of instances required by the test job - :param request_time: datetime timestamp of when request was made - :return: content of the request ticket - """ - content = { - "CONTEXT": context, - "TIMESTAMP": request_time, - "ECR-URI": image, - "SCHEDULING_TRIES": 0, - "INSTANCES_NUM": num_of_instances, - "TIMEOUT_LIMIT": self.timeout_limit, - "COMMIT": os.getenv("CODEBUILD_RESOLVED_SOURCE_VERSION", "default"), - } - - return content - - def get_ticket_name_prefix(self): - """ - Create a 7-character prefix for the ticket name - - :return: prefix for request ticket name - """ - source_version = os.getenv("PR_NUMBER", "default") - - if "pr/" in source_version: - # mod the PR ID by 100000 to make the prefix 7 digits - return f"pr{(int(source_version.split('/')[-1]) % 100000):05}" - else: - return source_version[:7] - - def send_ticket(self, ticket_content, framework): - """ - Send a request ticket to the S3 bucket self.s3_ticket_bucket - - May run in a multi-threaded context; a unique ticket name is generated for each thread - - :param ticket_content: content of the ticket - :param framework: framework name embedded in the ticket name - :return: name of the ticket - """ - # ticket name: {CB source version}-{framework}{ticket name counter}_(datetime string) - ticket_name_prefix = self.get_ticket_name_prefix() - request_time = ticket_content["TIMESTAMP"] - self.request_lock.acquire() - ticket_name = ( - f"{ticket_name_prefix}-{framework}{str(self.ticket_name_counter)}_{request_time}.json" - ) - self.ticket_name_counter += 1 - self.request_lock.release() - self.s3_client.put_object( - Bucket=self.s3_ticket_bucket, - Key=f"{self.s3_ticket_bucket_folder}/{ticket_name}", - ) - S3_ticket_object = self.s3_resource.Object( - self.s3_ticket_bucket, f"{self.s3_ticket_bucket_folder}/{ticket_name}" - ) - S3_ticket_object.put(Body=bytes(json.dumps(ticket_content).encode("UTF-8"))) - try: - # change object acl to make ticket accessible to dev account. 
- self.s3_client.put_object_acl( - ACL="bucket-owner-full-control", - Bucket=self.s3_ticket_bucket, - Key=f"{self.s3_ticket_bucket_folder}/{ticket_name}", - ) - except Exception as e: - raise e - LOGGER.info(f"Ticket sent successfully, ticket name: {ticket_name}") - return ticket_name - - def assign_sagemaker_instance_type(self, image): - """ - Assign the instance type that the input image needs for testing - - :param image: ECR URI - :return: type of instance used by the image - """ - return ( - "ml.g5.12xlarge" - if "gpu" in image - else "ml.c5.4xlarge" if "tensorflow" in image else "ml.c5.9xlarge" - ) - - def extract_timestamp(self, ticket_key): - """ - Extract the timestamp string from an S3 request ticket key - :param ticket_key: key of the request ticket - :return: timestamp in format "%Y-%m-%d-%H-%M-%S" that is encoded in the ticket name - """ - return re.match(r".*_(\d{4}(-\d{2}){5})\.json", ticket_key).group(1) - - def ticket_timestamp_cmp_function(self, ticket1_name, ticket2_name): - """ - Compares the timestamps of two request tickets - - :param ticket1_name, ticket2_name: keys of the two request tickets - :return: positive if ticket1 is newer, negative if it is older, 0 if the timestamps match - """ - ticket1_timestamp, ticket2_timestamp = ( - self.extract_timestamp(ticket1_name), - self.extract_timestamp(ticket2_name), - ) - # cmp_to_key expects a negative/zero/positive integer, not a bool - return (ticket1_timestamp > ticket2_timestamp) - (ticket1_timestamp < ticket2_timestamp) - - def construct_query_response(self, status, reason=None, queueNum=None): - """ - Create query response for query_status calls - - :param status: queuing/preparing/running/completed/failed/runtimeError - :param reason: maxRetries/timeout - :param queueNum: position of the request ticket in the queue - :return: response for the ticket query - """ - query_response = {"status": status} - if reason is not None: - query_response["reason"] = reason - if queueNum is not None: - query_response["queueNum"] = queueNum - - return query_response - - def search_ticket_folder(self, folder, path): - """ - Search folder/path on S3 to find the target ticket. If found, return a query response for the search. Otherwise - return None. 
- - :param folder: folder to search - :param path: path within the folder - :return: query response for the ticket if found, otherwise None - """ - objects = self.s3_client.list_objects( - Bucket=self.s3_ticket_bucket, Prefix=f"{folder}/{path}" - ) - if "Contents" in objects: - ticket_key = objects["Contents"][0]["Key"] - suffix_pattern = re.compile(".*-(.*).json") - suffix = suffix_pattern.match(ticket_key).group(1) - if folder == "dead_letter_queue" or folder == "duplicate_pr_requests": - return self.construct_query_response("failed", reason=suffix) - else: - return self.construct_query_response(suffix) - - return None - - def send_request(self, image, build_context, num_of_instances): - """ - Send a request to the test job executor (place a request ticket on S3) - - May run in a multi-threaded context - - :param num_of_instances: number of instances needed for the test - :param image: ECR uri - :param build_context: PR/MAINLINE/NIGHTLY/DEV - :return: Message identifier for the request - """ - assert ( - "training" in image or "inference" in image - ), f"Job type (training/inference) not stated in image tag: {image}" - time = datetime.now().strftime("%Y-%m-%d-%H-%M-%S") - ticket_content = self.create_ticket_content(image, build_context, num_of_instances, time) - framework = ( - "mxnet" if "mxnet" in image else "pytorch" if "pytorch" in image else "tensorflow" - ) - ticket_name = self.send_ticket(ticket_content, framework) - - instance_type = self.assign_sagemaker_instance_type(image) - job_type = "training" if "training" in image else "inference" - identifier = Message( - self.s3_ticket_bucket, ticket_name, image, instance_type, job_type, time - ) - return identifier - - def receive_logs(self, identifier): - """ - Request the test logs - - :param identifier: returned from send_request - :return: if log received, return the json log. Otherwise return None. - """ - ticket_name_without_extension = identifier.ticket_name.rstrip(".json") - objects = self.s3_client.list_objects( - Bucket=self.s3_ticket_bucket, - Prefix=f"resource_pool/{identifier.instance_type}-{identifier.job_type}/{ticket_name_without_extension}", - ) - ticket_prefix = f"resource_pool/{identifier.instance_type}-{identifier.job_type}/{ticket_name_without_extension}" - - if "Contents" in objects: - entry = objects["Contents"][0] - ticket_object = self.s3_client.get_object(Bucket="dlc-test-tickets", Key=entry["Key"]) - ticket_body = json.loads(ticket_object["Body"].read().decode("utf-8")) - - return ticket_body["LOGS"] - - return None - - def cancel_request(self, identifier): - """ - Cancel the test request by removing ticket from the queue. - If the test request is already running, do nothing. - - :param identifier: the response object returned from send_request - """ - - # check if ticket is on the queue - ticket_in_queue = self.search_ticket_folder( - "request_tickets", identifier.ticket_name.rstrip(".json") - ) - if ticket_in_queue: - self.s3_client.delete_object( - Bucket=self.s3_ticket_bucket, Key=f"request_tickets/{identifier.ticket_name}" - ) - return - - # check if ticket is a PR duplicate - ticket_in_duplicate = self.search_ticket_folder( - "duplicate_pr_requests", identifier.ticket_name.rstrip(".json") - ) - if ticket_in_duplicate: - LOGGER.info( - f"{identifier.ticket_name} is a duplicate PR test, test request will not be scheduled." - ) - return - - LOGGER.info( - f"{identifier.ticket_name} test has begun, test request could not be cancelled." 
- ) - - def query_status(self, identifier): - """ - :param identifier: unique identifier returned from call to send_request - :return: {"status": queuing/preparing/completed/failed/runtimeError, - "reason" (if status == failed): maxRetries/timeout/duplicatePR, - "queueNum" (if status == queuing): - } - """ - retries = 2 - request_ticket_name = identifier.ticket_name - ticket_without_extension = request_ticket_name.rstrip(".json") - instance_type = identifier.instance_type - job_type = identifier.job_type - - for _ in range(retries): - # check if ticket is on the queue - ticket_objects = self.s3_client.list_objects( - Bucket=self.s3_ticket_bucket, Prefix="request_tickets/" - ) - # "Contents" appears in the API response only if there are objects that satisfy the prefix - if "Contents" in ticket_objects: - ticket_name_pattern = re.compile(".*\/(.*)") - ticket_names_list = [ - ticket_name_pattern.match(ticket["Key"]).group(1) - for ticket in ticket_objects["Contents"] - if ticket["Key"].endswith(".json") - ] - # ticket is on the queue, find the queue number - if request_ticket_name in ticket_names_list: - ticket_names_list.sort(key=cmp_to_key(self.ticket_timestamp_cmp_function)) - queue_num = ticket_names_list.index(request_ticket_name) - return self.construct_query_response("queuing", queueNum=queue_num) - - # check if ticket is on the dead letter queue - ticket_in_dead_letter = self.search_ticket_folder( - "dead_letter_queue", ticket_without_extension - ) - if ticket_in_dead_letter: - return ticket_in_dead_letter - - ticket_in_duplicate = self.search_ticket_folder( - "duplicate_pr_requests", ticket_without_extension - ) - if ticket_in_duplicate: - return ticket_in_duplicate - - ticket_in_progress = self.search_ticket_folder( - "resource_pool", f"{instance_type}-{job_type}/{ticket_without_extension}" - ) - if ticket_in_progress: - return ticket_in_progress - - time.sleep(2) - - raise AssertionError(f"Request ticket name {request_ticket_name} could not be found.") diff --git a/scheduler/job_requester/response.py b/scheduler/job_requester/response.py deleted file mode 100644 index f8218cf7ff04..000000000000 --- a/scheduler/job_requester/response.py +++ /dev/null @@ -1,22 +0,0 @@ -import json - - -class Message: - def __init__(self, ticket_bucket, ticket_name, image, instance_type, job_type, request_time): - self.image = image - self.request_time = request_time - self.ticket_bucket = ticket_bucket - self.ticket_name = ticket_name - self.instance_type = instance_type - self.job_type = job_type - - data_set = { - "instance_type": instance_type, - "job_type": job_type, - "S3_bucket": ticket_bucket, - "S3_ticket_name": ticket_name, - } - self.data = json.dumps(data_set) - - def __str__(self): - return self.data diff --git a/scheduler/log_return/__init__.py b/scheduler/log_return/__init__.py deleted file mode 100644 index 75b6ef37a51c..000000000000 --- a/scheduler/log_return/__init__.py +++ /dev/null @@ -1,97 +0,0 @@ -import boto3 -import json -import logging -import os -import sys -import xml.etree.ElementTree as ET - - -LOGGER = logging.getLogger(__name__) -LOGGER.setLevel(logging.DEBUG) -LOGGER.addHandler(logging.StreamHandler(sys.stdout)) - - -def construct_log_content(report_path): - """ - Create a message that contains info allowing the user to locate the logs - - :param report_path: path to the xml test report - :return: message content used for locating the log - """ - logs_client = boto3.client("logs") - codebuild_arn = os.getenv("CODEBUILD_BUILD_ARN") - log_group_name = "/aws/codebuild/DLCTestJobExecutor" - log_stream_name = codebuild_arn.split(":")[-1] - 
log_events = logs_client.get_log_events( - logGroupName=log_group_name, logStreamName=log_stream_name - ) - log_stream = "".join([event["message"] for event in log_events["events"]]) - - try: - with open(report_path) as xml_file: - report_data = ET.parse(xml_file).getroot() - report_data_in_string = ET.tostring(report_data).decode("utf-8") - except FileNotFoundError as e: - LOGGER.error(e) - report_data_in_string = "" - - content = { - "LOG_STREAM": log_stream, - "XML_REPORT": report_data_in_string, - } - - return content - - -def update_pool(status, instance_type, num_of_instances, job_type, report_path=None): - """ - Update the S3 resource pool for usage of SageMaker resources. - Naming convention of the resource usage json: {request ticket name}#{num_of_instances}-{status}.json. - - :param job_type: training/inference - :param report_path: path to find the xml reports. Only set if status == completed/runtimeError - :param status: status of the test job, options: preparing/running/completed/runtimeError - :param instance_type: ml.g5.12xlarge/ml.c5.4xlarge/ml.c5.9xlarge - :param num_of_instances: number of instances required - """ - s3_client = boto3.client("s3") - codebuild_arn = os.getenv("CODEBUILD_BUILD_ARN") - ticket_name = os.getenv("TICKET_KEY").split("/")[-1].split(".")[0] - - if status not in {"preparing", "running", "completed", "runtimeError"}: - raise ValueError( - "Not a valid status. Test job status could be preparing, running, completed or runtimeError." - ) - - pool_ticket_content = { - "REQUEST_TICKET_KEY": os.getenv("TICKET_KEY"), - "STATUS": status, - "INSTANCE_TYPE": instance_type, - "EXECUTOR_ARN": codebuild_arn, - "INSTANCES_NUM": num_of_instances, - } - - if status == "completed" or status == "runtimeError": - pool_ticket_content["LOGS"] = construct_log_content(report_path) - - # find previous entry of the test job - response = s3_client.list_objects( - Bucket="dlc-test-tickets", - MaxKeys=1, - Prefix=f"resource_pool/{instance_type}-{job_type}/{ticket_name}", - ) - - # create the json file locally and upload it to S3 - filename = f"{ticket_name}#{num_of_instances}-{status}.json" - with open(filename, "w") as f: - json.dump(pool_ticket_content, f) - - with open(filename, "rb") as data: - s3_client.upload_fileobj( - data, "dlc-test-tickets", f"resource_pool/{instance_type}-{job_type}/{filename}" - ) - - # delete previous entry of the test job. 
Note: the deletion is performed after uploading a new ticket to avoid - # S3's eventual consistency causing any issues with finding the state of a ticket during a state transition - if "Contents" in response: - previous_entry = response["Contents"][0] - s3_client.delete_object(Bucket="dlc-test-tickets", Key=previous_entry["Key"]) diff --git a/scheduler/setup.py b/scheduler/setup.py deleted file mode 100644 index fe028ab63d39..000000000000 --- a/scheduler/setup.py +++ /dev/null @@ -1,20 +0,0 @@ -import os -import sys - -from setuptools import setup - - -def read(fname): - return open(os.path.join(os.path.dirname(__file__), fname)).read() - - -if sys.version_info.major == 2: - raise EnvironmentError("This package requires Python 3.6.9 or above.") - - -setup( - name="DLCScheduler", - version="0.1", - packages=["job_requester", "log_return"], - install_requires=["boto3", "botocore", "pytest"], -) diff --git a/scheduler/tests/jobrequester_test.py b/scheduler/tests/jobrequester_test.py deleted file mode 100644 index 54200e248e1d..000000000000 --- a/scheduler/tests/jobrequester_test.py +++ /dev/null @@ -1,93 +0,0 @@ -import concurrent.futures -import logging -import os -import sys - - -import boto3 - -import log_return - -from job_requester import JobRequester - - -LOGGER = logging.getLogger(__name__) -LOGGER.setLevel(logging.DEBUG) -LOGGER.addHandler(logging.StreamHandler(sys.stdout)) - -TEST_IMAGE = "763104351884.dkr.ecr.us-west-2.amazonaws.com/tensorflow-training:2.2.0-gpu-py37-cu101-ubuntu18.04" -SAMPLE_XML_MESSAGE = ( - "SampleXMLReportHello World!" -) -SAMPLE_CB_ARN = "arn:aws:codebuild:us-west-2:754106851545:build/DLCTestJobExecutor:894c9690-f6dc-4a15-b4b8-b9f2ddc51ea9" - - -def test_requester(): - """ - Tests the send_request and receive_logs functions of the Job Requester package. - How tests are executed: - - create one Job Requester object, and multiple threads. Perform send_request with the Job Requester object in - each of these threads. - - send messages to the SQS queue that the Job Requester object created, to imitate the response logs received back - from the Job Executor. - - In each of the threads, perform receive_logs to receive the log corresponding to the earlier send_request. 
- """ - threads = 10 - request_object = JobRequester() - identifiers_list = [] - input_list = [] - - # creating unique image names and build_context strings - for _ in range(threads): - input_list.append((TEST_IMAGE, "PR", 3)) - - # sending requests - with concurrent.futures.ThreadPoolExecutor(max_workers=threads) as executor: - futures = [ - executor.submit(request_object.send_request, x, y, z) for (x, y, z) in input_list - ] - - print("Created tickets......") - for future in futures: - res = future.result() - print(res) - identifiers_list.append(res) - print("\n") - - # create sample xml report files - image_tag = TEST_IMAGE.split(":")[-1] - report_path = os.path.join(os.getcwd(), f"{image_tag}.xml") - with open(report_path, "w") as report: - report.write(SAMPLE_XML_MESSAGE) - - os.environ["CODEBUILD_BUILD_ARN"] = SAMPLE_CB_ARN - for identifier in identifiers_list: - os.environ["TICKET_KEY"] = f"folder/{identifier.ticket_name}" - log_return.update_pool( - "completed", identifier.instance_type, 3, identifier.job_type, report_path - ) - - # receiving logs - with concurrent.futures.ThreadPoolExecutor(max_workers=threads) as executor: - logs = [ - executor.submit(request_object.receive_logs, identifier) - for identifier in identifiers_list - ] - - LOGGER.info("Receiving logs...") - for log in logs: - assert ( - "XML_REPORT" in log.result() - ), f"XML Report not found as part of the returned log message." - - # clean up test artifacts - S3 = boto3.client("s3") - ticket_names = [item.ticket_name for item in identifiers_list] - for name in ticket_names: - S3.delete_object(Bucket=request_object.s3_ticket_bucket, Key=name) - - LOGGER.info("Tests passed.") - - -if __name__ == "__main__": - test_requester() diff --git a/scheduler/tests/query_and_cancel_test.py b/scheduler/tests/query_and_cancel_test.py deleted file mode 100644 index b842ce2a27e5..000000000000 --- a/scheduler/tests/query_and_cancel_test.py +++ /dev/null @@ -1,190 +0,0 @@ -import json -import logging -import sys - -from datetime import datetime - -import boto3 - -from job_requester import JobRequester -from job_requester import Message - - -""" -How tests are executed: -- Put tickets on the request queue, in-progress pool and dead letter queue; create Message objects (request identifiers) -that correspond to these tickets. -- Create a JobRequester object and call query_status on the identifiers, check that the statuses returned are corrected. -- Call cancel_request on tickets on the request queue, check that the request tickets are removed. -- Clean up the artifacts. 
-""" - -LOGGER = logging.getLogger(__name__) -LOGGER.setLevel(logging.DEBUG) -LOGGER.addHandler(logging.StreamHandler(sys.stdout)) - -# test parameters -TEST_ECR_URI = "763104351884.dkr.ecr.us-west-2.amazonaws.com/tensorflow-training:2.2.0-gpu-py37-cu101-ubuntu18.04" -INSTANCE_TYPE = "ml.g5.12xlarge" -JOB_TYPE = "training" -SQS_RETURN_QUEUE_URL = "dummy_sqs_url" -REQUEST_TICKET_TIME = datetime.now().strftime("%Y-%m-%d-%H-%M-%S") -REQUEST_TICKET_CONTENT = { - "CONTEXT": "PR", - "TIMESTAMP": REQUEST_TICKET_TIME, - "ECR-URI": TEST_ECR_URI, - "RETURN-SQS-URL": SQS_RETURN_QUEUE_URL, - "SCHEDULING_TRIES": 0, - "INSTANCES_NUM": 1, - "TIMEOUT_LIMIT": 14400, -} -IN_PROGRESS_TICKET_CONTENT = { - "INSTANCE_TYPE": INSTANCE_TYPE, - "INSTANCES_NUM": 1, - "STATUS": "preparing", -} -DEAD_LETTER_TICKET_CONTENT = { - "INSTANCE_TYPE": INSTANCE_TYPE, - "INSTANCES_NUM": 1, - "STATUS": "failed", -} - -# S3 path to tickets -BUCKET_NAME = "dlc-test-tickets" -REQUEST_TICKETS_FOLDER = "request_tickets" -IN_PROGRESS_POOL_FOLDER = "resource_pool" -DEAD_LETTER_QUEUE_FOLDER = "dead_letter_queue" - - -def put_ticket(ticket_key, ticket_content): - """ - API calls to put and write to a ticket on S3 - :param ticket_key: S3 path to the ticket - :param ticket_content: - """ - s3_client = boto3.client("s3") - s3_resource = boto3.resource("s3") - s3_client.put_object(Bucket=BUCKET_NAME, Key=ticket_key) - S3_ticket_object = s3_resource.Object(BUCKET_NAME, ticket_key) - S3_ticket_object.put(Body=bytes(json.dumps(ticket_content).encode("UTF-8"))) - - -def clean_up_ticket(ticket_key): - """ - Clean up testing ticket artifacts - :param ticket_key: S3 path to ticket - """ - s3_client = boto3.client("s3") - s3_client.delete_object(Bucket=BUCKET_NAME, Key=ticket_key) - - -def test_query_and_cancel_queuing_tickets( - job_requester, request_queue_ticket_name, request_identifier -): - """ - test querying and cancelling tickets that are yet to be scheduled - :param job_requester: JobRequester object - :param request_queue_ticket_name: request ticket name - :param request_identifier: identifier for the request sent - """ - s3_client = boto3.client("s3") - put_ticket(f"{REQUEST_TICKETS_FOLDER}/{request_queue_ticket_name}", REQUEST_TICKET_CONTENT) - # check the response message is correct - request_queue_response = job_requester.query_status(request_identifier) - assert ( - request_queue_response["status"] == "queuing" - ), f"Returned status incorrect: {request_queue_ticket_name}, should be queuing." - assert ( - "queueNum" in request_queue_response - ), f"Queue number not found for request ticket {request_queue_ticket_name}" - - # test cancelling request for tickets on the queue - job_requester.cancel_request(request_identifier) - # check that the request ticket has been removed - list_request_ticket_response = s3_client.list_objects( - Bucket=BUCKET_NAME, Prefix=f"request_tickets/{request_queue_ticket_name}" - ) - # if there is no object that satisfies list_objects conditions, "Contents" field would not be in the API response - assert ( - "Contents" not in list_request_ticket_response - ), f"Request ticket {request_queue_ticket_name} not correctly cancelled." 
- - LOGGER.info("Tests passed for querying and cancelling tickets on the queue.") - - -def test_query_in_progress_tickets(job_requester, in_progress_ticket_name, request_identifier): - """ - test querying test jobs that are scheduled and running - :param job_requester: JobRequester object - :param request_queue_ticket_name: request ticket name - :param request_identifier: identifier for the request sent - """ - put_ticket( - f"{IN_PROGRESS_POOL_FOLDER}/{INSTANCE_TYPE}-{JOB_TYPE}/{in_progress_ticket_name}", - IN_PROGRESS_TICKET_CONTENT, - ) - - in_progress_response = job_requester.query_status(request_identifier) - assert ( - "status" in in_progress_response and in_progress_response["status"] == "running" - ), f"Returned status incorrect: {in_progress_ticket_name}, should be running." - - clean_up_ticket(f"{IN_PROGRESS_POOL_FOLDER}/{INSTANCE_TYPE}-{JOB_TYPE}/{job_requester}") - - LOGGER.info("Tests passed for querying tickets on the in-progress pool.") - - -def test_query_dead_letter_tickets(job_requester, dead_letter_ticket_name, request_identifier): - """ - test querying test jobs that are failed to be scheduled - :param job_requester: JobRequester object - :param request_queue_ticket_name: request ticket name - :param request_identifier: identifier for the request sent - """ - put_ticket(f"{DEAD_LETTER_QUEUE_FOLDER}/{dead_letter_ticket_name}", DEAD_LETTER_TICKET_CONTENT) - - dead_letter_response = job_requester.query_status(request_identifier) - assert ( - "status" in dead_letter_response and dead_letter_response["status"] == "failed" - ), f"Returned status incorrect: {dead_letter_ticket_name}, should be failed." - assert ( - "reason" in dead_letter_response and dead_letter_response["reason"] == "timeout" - ), f"Failure reason not found for request ticket {dead_letter_ticket_name}" - - clean_up_ticket(f"{DEAD_LETTER_QUEUE_FOLDER}/{dead_letter_ticket_name}") - - LOGGER.info("Tests passed for querying tickets on the dead letter queue.") - - -def main(): - job_requester_object = JobRequester() - request_ticket_prefix = f"testing-0_{REQUEST_TICKET_TIME}" - # create identifier for the request ticket - request_identifier = Message( - SQS_RETURN_QUEUE_URL, - BUCKET_NAME, - f"{request_ticket_prefix}.json", - TEST_ECR_URI, - REQUEST_TICKET_TIME, - ) - test_query_and_cancel_queuing_tickets( - job_requester_object, f"{request_ticket_prefix}.json", request_identifier - ) - - # naming convention of in-progress pool tickets: {request ticket name}#{num of instances}-{status}.json - in_progress_ticket_name = f"{request_ticket_prefix}#1-running.json" - test_query_in_progress_tickets( - job_requester_object, in_progress_ticket_name, request_identifier - ) - - # naming convention of in-progress pool tickets: {request ticket name}-{failure reason}.json - dead_letter_ticket_name = f"{request_ticket_prefix}-timeout.json" - test_query_dead_letter_tickets( - job_requester_object, dead_letter_ticket_name, request_identifier - ) - - LOGGER.info("Tests passed.") - - -if __name__ == "__main__": - main() diff --git a/src/config.py b/src/config.py index 476c9693f7e7..fbaf47daa17e 100644 --- a/src/config.py +++ b/src/config.py @@ -123,10 +123,6 @@ def is_nightly_pr_test_mode_enabled(): return parse_dlc_developer_configs("test", "nightly_pr_test_mode") -def is_scheduler_enabled(): - return parse_dlc_developer_configs("test", "use_scheduler") - - def is_safety_check_test_enabled(): return parse_dlc_developer_configs("test", "safety_check_test") diff --git a/src/start_testbuilds.py b/src/start_testbuilds.py index 
b87fb3c0b4a4..1c428c68e31c 100644 --- a/src/start_testbuilds.py +++ b/src/start_testbuilds.py @@ -85,13 +85,6 @@ def run_test_job(commit, codebuild_project, images_str=""): "value": str(config.is_nightly_pr_test_mode_enabled()), "type": "PLAINTEXT", }, - # USE_SCHEDULER is passed as an env variable here because it is more convenient to set this in - # dlc_developer_config, compared to having another config file under dlc/tests/. - { - "name": "USE_SCHEDULER", - "value": str(config.is_scheduler_enabled()), - "type": "PLAINTEXT", - }, # SM_EFA_TEST_INSTANCE_TYPE is passed to SM test job to pick a matching instance type as defined by user { "name": "SM_EFA_TEST_INSTANCE_TYPE", diff --git a/test/dlc_tests/sanity/quick_checks/test_dlc_developer_config.py b/test/dlc_tests/sanity/quick_checks/test_dlc_developer_config.py index dbd6c8119da0..1d3492265d68 100644 --- a/test/dlc_tests/sanity/quick_checks/test_dlc_developer_config.py +++ b/test/dlc_tests/sanity/quick_checks/test_dlc_developer_config.py @@ -48,7 +48,6 @@ def test_developer_configuration(): assert config.parse_dlc_developer_configs("test", "use_new_test_structure") is False assert config.parse_dlc_developer_configs("test", "ipv6_vpc_name") == "" assert config.parse_dlc_developer_configs("test", "nightly_pr_test_mode") is False - assert config.parse_dlc_developer_configs("test", "use_scheduler") is False assert config.parse_dlc_developer_configs("test", "safety_check_test") is False assert config.parse_dlc_developer_configs("test", "ecr_scan_allowlist_feature") is False @@ -81,7 +80,6 @@ def test_developer_config_wrappers_defaults(): assert config.is_ipv6_test_enabled() is False assert config.get_ipv6_vpc_name() == "" assert config.is_nightly_pr_test_mode_enabled() is False - assert config.is_scheduler_enabled() is False assert config.is_safety_check_test_enabled() is False assert config.is_ecr_scan_allowlist_feature_enabled() is False assert config.is_notify_test_failures_enabled() is False diff --git a/test/dlc_tests/sanity/quick_checks/test_generate_coverage_doc.py b/test/dlc_tests/sanity/quick_checks/test_generate_coverage_doc.py deleted file mode 100644 index 0f467bfc5b79..000000000000 --- a/test/dlc_tests/sanity/quick_checks/test_generate_coverage_doc.py +++ /dev/null @@ -1,64 +0,0 @@ -import os - -import boto3 -import pytest - -from botocore.exceptions import ClientError -from invoke.context import Context - -from test.test_utils import ( - LOGGER, - is_mainline_context, - is_graviton_architecture, - is_arm64_architecture, -) -from test.test_utils.test_reporting import get_test_coverage_file_path - - -ACCOUNT_ID = os.getenv("ACCOUNT_ID", boto3.client("sts").get_caller_identity().get("Account")) -TEST_COVERAGE_REPORT_BUCKET = f"dlc-test-coverage-reports-{ACCOUNT_ID}" - - -@pytest.mark.quick_checks -@pytest.mark.integration("Generating this coverage doc") -@pytest.mark.model("N/A") -@pytest.mark.skipif( - (is_mainline_context() and (is_graviton_architecture() or is_arm64_architecture())), - reason="Skipping the test for Graviton/ARM64 image build in mainline context as ARM64 image is used as a base", -) -def test_generate_coverage_doc(): - """ - Test generating the test coverage doc - """ - - test_coverage_file = get_test_coverage_file_path() - ctx = Context() - # Set DLC_IMAGES to 'test' to avoid image names affecting function metadata (due to parametrization) - # Set CODEBUILD_RESOLVED_SOURCE_VERSION to test for ease of running this test locally - ctx.run( - "export DLC_IMAGES='' && export CODEBUILD_RESOLVED_SOURCE_VERSION='test' 
&& export BUILD_CONTEXT=''" - "&& python -m pip install --upgrade pip --root-user-action=ignore && " - "python -m pytest -s --collect-only --generate-coverage-doc --ignore=container_tests/", - hide=True, - ) - - # Ensure that the coverage report is created - assert os.path.exists( - test_coverage_file - ), f"Cannot find test coverage report file {test_coverage_file}" - - # Write test coverage file to S3 - if is_mainline_context(): - client = boto3.client("s3") - with open(test_coverage_file, "rb") as test_file: - try: - client.put_object( - Bucket=TEST_COVERAGE_REPORT_BUCKET, - Key=os.path.basename(test_coverage_file), - Body=test_file, - ) - except ClientError as e: - LOGGER.error( - f"Unable to upload report to bucket {TEST_COVERAGE_REPORT_BUCKET}. Error: {e}" - ) - raise diff --git a/test/dlc_tests/sanity/quick_checks/test_git_secrets.py b/test/dlc_tests/sanity/quick_checks/test_git_secrets.py deleted file mode 100644 index ee4b5592bf74..000000000000 --- a/test/dlc_tests/sanity/quick_checks/test_git_secrets.py +++ /dev/null @@ -1,53 +0,0 @@ -import logging -import os -import sys - -import pytest - -from invoke.context import Context - -from test.test_utils import PR_ONLY_REASON, get_repository_local_path, is_pr_context - -LOGGER = logging.getLogger(__name__) -LOGGER.setLevel(logging.INFO) -LOGGER.addHandler(logging.StreamHandler(sys.stderr)) - - -@pytest.mark.quick_checks -@pytest.mark.skipif(not is_pr_context(), reason=PR_ONLY_REASON) -@pytest.mark.model("N/A") -def test_git_secrets(): - ctx = Context() - repository_path = os.getenv("CODEBUILD_SRC_DIR") - if not repository_path: - repository_path = get_repository_local_path() - LOGGER.info(f"repository_path = {repository_path}") - - # Replace the regex pattern below with a matching string to run test that makes scan fail: - SOME_FAKE_CREDENTIALS = "ASIA[A-Z0-9]{16}" - WHITELISTED_CREDENTIALS = "AKIAIOSFODNN7EXAMPLE" - # End of Test Section - - with ctx.cd(repository_path): - ctx.run("git clone https://github.com/awslabs/git-secrets.git") - with ctx.cd("git-secrets"): - ctx.run("make install") - ctx.run("git secrets --install") - ctx.run("git secrets --register-aws") - # Custom pattern to catch credential in the format :@ - ctx.run("git secrets --add '\w+:\w+@'") - output = ctx.run("git secrets --list") - LOGGER.info( - f"\n--COMMAND--\n{output.command}\n" - f"--STDOUT--\n{output.stdout}\n" - f"--STDERR--\n{output.stderr}\n" - f"----------" - ) - scan_results = ctx.run("git secrets --scan", hide=True, warn=True) - LOGGER.info( - f"\n--COMMAND--\n{scan_results.command}\n" - f"--STDOUT--\n{scan_results.stdout}\n" - f"--STDERR--\n{scan_results.stderr}" - f"----------" - ) - assert scan_results.ok, scan_results.stderr diff --git a/test/dlc_tests/sanity/quick_checks/test_prepare_dev_env.py b/test/dlc_tests/sanity/quick_checks/test_prepare_dev_env.py deleted file mode 100644 index 631e486000ba..000000000000 --- a/test/dlc_tests/sanity/quick_checks/test_prepare_dev_env.py +++ /dev/null @@ -1,288 +0,0 @@ -import pytest -import os - -from unittest.mock import patch, mock_open -from src import prepare_dlc_dev_environment -from test.test_utils import is_pr_context - - -@pytest.mark.quick_checks -@pytest.mark.model("N/A") -@pytest.mark.integration("build_frameworks") -@pytest.mark.skipif( - not is_pr_context(), - reason="Development environment utility only needs to be tested in PRs, and does not add functional value in other contexts.", -) -def test_build_frameworks(): - overrider = prepare_dlc_dev_environment.TomlOverrider() - 
overrider.set_build_frameworks(("pytorch", "tensorflow")) - - assert overrider.overrides["build"]["build_frameworks"] == ["pytorch", "tensorflow"] - - -@pytest.mark.quick_checks -@pytest.mark.model("N/A") -@pytest.mark.integration("job_types") -@pytest.mark.skipif( - not is_pr_context(), - reason="Development environment utility only needs to be tested in PRs, and does not add functional value in other contexts.", -) -def test_build_job_types(): - overrider = prepare_dlc_dev_environment.TomlOverrider() - overrider.set_job_type(("inference", "training")) - assert ( - overrider.overrides["build"]["build_training"] is True - and overrider.overrides["build"]["build_inference"] is True - ) - - overrider.set_job_type(["inference"]) - assert ( - overrider.overrides["build"]["build_training"] is False - and overrider.overrides["build"]["build_inference"] is True - ) - - overrider.set_job_type(["training"]) - assert ( - overrider.overrides["build"]["build_training"] is True - and overrider.overrides["build"]["build_inference"] is False - ) - - overrider.set_job_type([]) - assert ( - overrider.overrides["build"]["build_training"] is False - and overrider.overrides["build"]["build_inference"] is False - ) - - -@pytest.mark.quick_checks -@pytest.mark.model("N/A") -@pytest.mark.integration("test_types") -@pytest.mark.skipif( - not is_pr_context(), - reason="Development environment utility only needs to be tested in PRs, and does not add functional value in other contexts.", -) -def test_set_test_types(): - overrider = prepare_dlc_dev_environment.TomlOverrider() - - # Test case with a subset of test types - test_types = ["ec2_tests", "ecs_tests", "sagemaker_remote_tests"] - overrider.set_test_types(test_types) - assert overrider.overrides["test"]["sanity_tests"] is False - assert overrider.overrides["test"]["security_tests"] is False - assert overrider.overrides["test"]["ecs_tests"] is True - assert overrider.overrides["test"]["eks_tests"] is False - assert overrider.overrides["test"]["ec2_tests"] is True - assert overrider.overrides["test"]["sagemaker_local_tests"] is False - assert overrider.overrides["test"]["sagemaker_remote_tests"] is True - - # Test case with no test types (default behavior); Should not override anything - empty_overrider = prepare_dlc_dev_environment.TomlOverrider() - empty_test_types = [] - empty_overrider.set_test_types(empty_test_types) - assert not empty_overrider.overrides["test"] - - -@pytest.mark.quick_checks -@pytest.mark.model("N/A") -@pytest.mark.integration("dev_mode") -@pytest.mark.skipif( - not is_pr_context(), - reason="Development environment utility only needs to be tested in PRs, and does not add functional value in other contexts.", -) -def test_set_dev_mode(): - overrider = prepare_dlc_dev_environment.TomlOverrider() - - # test with no dev mode provided - overrider.set_dev_mode(None) - assert overrider.overrides["dev"]["graviton_mode"] is False - assert overrider.overrides["dev"]["arm64_mode"] is False - assert overrider.overrides["dev"]["neuronx_mode"] is False - assert overrider.overrides["dev"]["deep_canary_mode"] is False - - overrider.set_dev_mode("graviton_mode") - assert overrider.overrides["dev"]["graviton_mode"] is True - assert overrider.overrides["dev"]["arm64_mode"] is False - assert overrider.overrides["dev"]["neuronx_mode"] is False - assert overrider.overrides["dev"]["deep_canary_mode"] is False - - overrider.set_dev_mode("arm64_mode") - assert overrider.overrides["dev"]["graviton_mode"] is False - assert overrider.overrides["dev"]["arm64_mode"] 
is True - assert overrider.overrides["dev"]["neuronx_mode"] is False - assert overrider.overrides["dev"]["deep_canary_mode"] is False - - overrider.set_dev_mode("neuronx_mode") - assert overrider.overrides["dev"]["graviton_mode"] is False - assert overrider.overrides["dev"]["arm64_mode"] is False - assert overrider.overrides["dev"]["neuronx_mode"] is True - assert overrider.overrides["dev"]["deep_canary_mode"] is False - - overrider.set_dev_mode("deep_canary_mode") - assert overrider.overrides["dev"]["graviton_mode"] is False - assert overrider.overrides["dev"]["arm64_mode"] is False - assert overrider.overrides["dev"]["neuronx_mode"] is False - assert overrider.overrides["dev"]["deep_canary_mode"] is True - - # Test case with multiple dev modes (error) - with pytest.raises(ValueError): - overrider.set_dev_mode(["graviton_mode", "neuronx_mode"]) - - -@pytest.mark.quick_checks -@pytest.mark.model("N/A") -@pytest.mark.integration("set_buildspec") -@pytest.mark.skipif( - not is_pr_context(), - reason="Development environment utility only needs to be tested in PRs, and does not add functional value in other contexts.", -) -def test_set_buildspec_updates_buildspec_override(): - overrider = prepare_dlc_dev_environment.TomlOverrider() - - valid_buildspec_paths = [ - "pytorch/inference/buildspec-graviton.yml", - "pytorch/inference/buildspec-arm64.yml", - "tensorflow/inference/buildspec-neuronx.yml", - "huggingface/pytorch/training/buildspec.yml", - ] - - overrider.set_buildspec(valid_buildspec_paths) - - expected_buildspec_override = { - "dlc-pr-huggingface-pytorch-training": "huggingface/pytorch/training/buildspec.yml", - "dlc-pr-pytorch-graviton-inference": "pytorch/inference/buildspec-graviton.yml", - "dlc-pr-pytorch-arm64-inference": "pytorch/inference/buildspec-arm64.yml", - "dlc-pr-tensorflow-2-neuronx-inference": "tensorflow/inference/buildspec-neuronx.yml", - } - - assert overrider.overrides["buildspec_override"] == expected_buildspec_override - - -@pytest.mark.quick_checks -@pytest.mark.model("N/A") -@pytest.mark.integration("set_buildspec") -@pytest.mark.skipif( - not is_pr_context(), - reason="Development environment utility only needs to be tested in PRs, and does not add functional value in other contexts.", -) -def test_set_buildspec_invalid_path(): - overrider = prepare_dlc_dev_environment.TomlOverrider() - - invalid_buildspec_paths = [ # invalid path - "invalid/path/buildspec.yml", - "pytorch/invalid/buildspec-aws-graviton2.yml", - "tensorflow/inference/buildspec-aws-neuronx.yml", - ] - - with pytest.raises(RuntimeError): - overrider.set_buildspec(invalid_buildspec_paths) - - -@pytest.mark.quick_checks -@pytest.mark.model("N/A") -@pytest.mark.integration("set_buildspec") -@pytest.mark.skipif( - not is_pr_context(), - reason="Development environment utility only needs to be tested in PRs, and does not add functional value in other contexts.", -) -def test_set_buildspec_updates_dev_mode(): - overrider = prepare_dlc_dev_environment.TomlOverrider() - - valid_buildspec_paths = [ - "pytorch/inference/buildspec-graviton.yml", - "tensorflow/inference/buildspec-neuronx.yml", - ] - - overrider.set_buildspec(valid_buildspec_paths) - - assert overrider.overrides["dev"]["graviton_mode"] is True - # Only the first dev mode is used, so neuronx is set to False - assert overrider.overrides["dev"]["neuronx_mode"] is False - - -@pytest.mark.quick_checks -@pytest.mark.model("N/A") -@pytest.mark.integration("set_buildspec") -@pytest.mark.skipif( - not is_pr_context(), - reason="Development environment 
utility only needs to be tested in PRs, and does not add functional value in other contexts.", -) -def test_set_buildspec_updates_build_frameworks(): - overrider = prepare_dlc_dev_environment.TomlOverrider() - - valid_buildspec_paths = [ - "pytorch/inference/buildspec-graviton.yml", - "tensorflow/inference/buildspec-neuronx.yml", - "huggingface/pytorch/training/buildspec.yml", - ] - - overrider.set_buildspec(valid_buildspec_paths) - - expected_build_frameworks = ["pytorch", "tensorflow", "huggingface_pytorch"] - assert overrider.overrides["build"]["build_frameworks"] == expected_build_frameworks - - -@pytest.mark.quick_checks -@pytest.mark.model("N/A") -@pytest.mark.integration("set_buildspec") -@pytest.mark.skipif( - not is_pr_context(), - reason="Development environment utility only needs to be tested in PRs, and does not add functional value in other contexts.", -) -def test_set_buildspec_updates_build_training_only(): - overrider = prepare_dlc_dev_environment.TomlOverrider() - - buildspec_paths = [ - "pytorch/training/buildspec.yml", - "huggingface/pytorch/inference/buildspec.yml", - ] - - overrider.set_buildspec(buildspec_paths) - - assert overrider.overrides["build"]["build_training"] is True - assert overrider.overrides["build"]["build_inference"] is True - - -@pytest.mark.quick_checks -@pytest.mark.model("N/A") -@pytest.mark.integration("set_buildspec") -@pytest.mark.skipif( - not is_pr_context(), - reason="Development environment utility only needs to be tested in PRs, and does not add functional value in other contexts.", -) -def test_set_buildspec_updates_build_inference_only(): - overrider = prepare_dlc_dev_environment.TomlOverrider() - - buildspec_paths = [ - "tensorflow/inference/buildspec-neuronx.yml", - ] - - overrider.set_buildspec(buildspec_paths) - - assert overrider.overrides["build"]["build_training"] is False - assert overrider.overrides["build"]["build_inference"] is True - - -@pytest.mark.quick_checks -@pytest.mark.model("N/A") -@pytest.mark.integration("generate_new_file_content") -@pytest.mark.skipif( - not is_pr_context(), - reason="Development environment utility only needs to be tested in PRs, and does not add functional value in other contexts.", -) -def test_generate_new_file_content(): - previous_version_path = "path/to/previous/version/file" - major_version = "1" - minor_version = "14" - - mock_file_content = 'version: &VERSION 1.13.0\nshort_version: &SHORT_VERSION "1.13"\n' - - @patch("builtins.open", new_callable=mock_open, read_data=mock_file_content) - def mock_generate_new_file_content(mock_file): - expected_content = ["version: &VERSION 1.14.0\n", 'short_version: &SHORT_VERSION "1.14"\n'] - - result = prepare_dlc_dev_environment.generate_new_file_content( - previous_version_path, major_version, minor_version - ) - assert result == expected_content - - mock_generate_new_file_content() diff --git a/test/dlc_tests/sanity/quick_checks/test_required_fixture.py b/test/dlc_tests/sanity/quick_checks/test_required_fixture.py deleted file mode 100644 index 67d7e1090097..000000000000 --- a/test/dlc_tests/sanity/quick_checks/test_required_fixture.py +++ /dev/null @@ -1,143 +0,0 @@ -import logging -import os -import sys -import re - -import pytest - -from test.test_utils import PR_ONLY_REASON, get_repository_local_path, is_pr_context - -LOGGER = logging.getLogger(__name__) -LOGGER.setLevel(logging.INFO) -LOGGER.addHandler(logging.StreamHandler(sys.stderr)) - - -@pytest.mark.quick_checks -@pytest.mark.skipif(not is_pr_context(), reason=PR_ONLY_REASON) 
-@pytest.mark.model("N/A")
-def test_sanity_fixture():
-    """
-    Checks that each sanity test that run within PR or MAINLINE contexts
-    under test/dlc_tests/sanity/ directory contains either
-    `security_sanity` or `functionality_sanity` fixtures, not both.
-
-    This test assumes that each test method declare the fixtures
-    using marker pattern `@pytest.mark.usefixtures()` for regex matching.
-    """
-    repository_path = os.getenv("CODEBUILD_SRC_DIR")
-    if not repository_path:
-        repository_path = get_repository_local_path()
-
-    # Look only at test files within the sanity test directory
-    sanity_test_path = os.path.join(repository_path, "test", "dlc_tests", "sanity")
-    LOGGER.debug(f"Test directory: {sanity_test_path}")
-
-    sanity_test_fixture_mapping = {}
-    failed_assertion = ""
-
-    # Tests that do not run in PR or MAINLINE contexts do not need to have
-    # `security_sanity` or `functionality_sanity` fixtures
-    non_pr_mainline_tests = ["test_canary_integration.py::test_deep_canary_integration"]
-
-    # Navigate through files and look at test files at the top level test/dlc_tests/sanity/
-    for item in os.listdir(sanity_test_path):
-        file_path = os.path.join(sanity_test_path, item)
-        if os.path.isfile(file_path):
-            _update_test_fixtures_mapping(file_path, sanity_test_fixture_mapping)
-
-    for test_name, test_fixtures in sanity_test_fixture_mapping.items():
-        LOGGER.debug(
-            f"Checking test method: {test_name} with the following fixtures {test_fixtures}\n"
-        )
-        # Check only tests that run in PR or MAINLINE contexts
-        if test_name not in non_pr_mainline_tests:
-            # Append to failed assertion result on XOR condition that the test
-            # must have either `security_sanity` or `functionality_sanity` fixture
-            if not (
-                ("security_sanity" in test_fixtures) ^ ("functionality_sanity" in test_fixtures)
-            ):
-                failed_assertion = "\n".join(
-                    [
-                        failed_assertion,
-                        f"{test_name} must have either `security_sanity` or `functionality_sanity` fixture, current fixtures: {test_fixtures}",
-                    ]
-                )
-
-    # Throw assertion error if failed_assertion string is not empty
-    assert not failed_assertion, f"{failed_assertion}"
-
-
-@pytest.mark.quick_checks
-@pytest.mark.skipif(not is_pr_context(), reason=PR_ONLY_REASON)
-@pytest.mark.model("N/A")
-def test_telemetry_fixture():
-    """
-    Checks that each telemetry test that run within PR or MAINLINE contexts
-    under test/dlc_tests/ec2/ directory contains `telemetry` fixture.
-
-    This test assumes that each test method declare the fixtures
-    using marker pattern `@pytest.mark.usefixtures()` for regex matching.
- """ - repository_path = os.getenv("CODEBUILD_SRC_DIR") - if not repository_path: - repository_path = get_repository_local_path() - - # Look only at ec2 telemetry test file - telemetry_test_path = os.path.join( - repository_path, "test", "dlc_tests", "ec2", "test_telemetry.py" - ) - LOGGER.debug(f"Test path: {telemetry_test_path}") - - telemetry_test_fixture_mapping = {} - failed_assertion = "" - - # Look at ec2 telemetry test file - _update_test_fixtures_mapping(telemetry_test_path, telemetry_test_fixture_mapping) - - for test_name, test_fixtures in telemetry_test_fixture_mapping.items(): - LOGGER.debug( - f"Checking test method: {test_name} with the following fixtures {test_fixtures}\n" - ) - # Append to failed assertion result if ec2 telemetry tests doesn't contain a `telemetry` fixture - if "telemetry" not in test_fixtures: - failed_assertion = "\n".join( - [ - failed_assertion, - f"{test_name} must have `telemetry` fixture, current fixtures: {test_fixtures}", - ] - ) - - # Throw assertion error if failed_assertion string is not empty - assert not failed_assertion, f"{failed_assertion}" - - -def _update_test_fixtures_mapping(file_to_check, test_fixtures_mapping): - fixture_pattern = r"@pytest.mark.usefixtures\(" - test_func_pattern = r"def (test_(.*))\(" - fixture_list = [] - - with open(file_to_check, "r") as file: - for line in file: - # If sees a `usefixtures` marker, add the list of fixtures to fixture_per_test - # to collect all the fixture names used within a single test method - if re.match(fixture_pattern, line): - # If sees a multiline `usefixtures` marker, - # append line until closing `)` for fixture regex matching - while ")" not in line: - line += next(file) - # Remove quotes, newlines, tabs, spaces from string - line = re.sub(r"[\"\n\t\s]*", "", line) - # Get only the fixture names and remove `@pytest*` prefix - fixture_regex = re.match(rf"{fixture_pattern}(.*)\)", line) - current_fixtures = fixture_regex.group(1).split(",") - fixture_list += current_fixtures - - # If sees a `test_*` method, update the : dictionary - if re.match(test_func_pattern, line): - function_name = re.match(test_func_pattern, line).group(1) - # Map list of fixtures per tests - test_fixtures_mapping[f"{os.path.basename(file_to_check)}::{function_name}"] = ( - fixture_list - ) - # Empty test_fixtures list for the next test method - fixture_list = [] diff --git a/test/dlc_tests/sanity/quick_checks/test_yml_config.py b/test/dlc_tests/sanity/quick_checks/test_yml_config.py deleted file mode 100644 index be8f7f02e1aa..000000000000 --- a/test/dlc_tests/sanity/quick_checks/test_yml_config.py +++ /dev/null @@ -1,141 +0,0 @@ -import os -import re - -import pytest -import yaml - -from test.test_utils import is_pr_context, get_repository_local_path, LOGGER - - -@pytest.mark.quick_checks -@pytest.mark.model("N/A") -@pytest.mark.integration("release_images_training.yml") -@pytest.mark.skipif( - not is_pr_context(), - reason="This test is only needed to validate release_images configs in PRs.", -) -def test_release_images_training_yml(): - _release_images_yml_verifier(image_type="training", excluded_image_type="inference") - - -@pytest.mark.quick_checks -@pytest.mark.model("N/A") -@pytest.mark.integration("release_images_inference.yml") -@pytest.mark.skipif( - not is_pr_context(), - reason="This test is only needed to validate release_images configs in PRs.", -) -def test_release_images_inference_yml(): - _release_images_yml_verifier(image_type="inference", excluded_image_type="training") - - 
-
-@pytest.mark.quick_checks
-@pytest.mark.model("N/A")
-@pytest.mark.integration("release_images_patches.yml")
-@pytest.mark.skipif(
-    not is_pr_context(),
-    reason="This test is only needed to validate release_images configs in PRs.",
-)
-def test_release_images_patches_yml():
-    dlc_base_dir = get_repository_local_path()
-
-    release_images_yml_file = os.path.join(dlc_base_dir, "release_images_patches.yml")
-
-    with open(release_images_yml_file, "r") as release_imgs_handle:
-        for line in release_imgs_handle:
-            assert (
-                "force_release: True" not in line
-            ), f"Force release is not permitted in patch file {release_images_yml_file}."
-
-
-@pytest.mark.quick_checks
-@pytest.mark.model("N/A")
-@pytest.mark.integration("release_images_numbering")
-@pytest.mark.skipif(
-    not is_pr_context(),
-    reason="This test is only needed to validate release_images configs in PRs.",
-)
-def test_release_images_numbering():
-    release_yamls = [
-        "release_images_patches.yml",
-        "release_images_training.yml",
-        "release_images_inference.yml",
-    ]
-    dlc_base_dir = get_repository_local_path()
-
-    for release_yaml in release_yamls:
-        yaml_path = os.path.join(dlc_base_dir, release_yaml)
-
-        with open(yaml_path, "r") as rf:
-            contents = yaml.safe_load(rf)
-            for i, (num, imgs) in enumerate(contents["release_images"].items()):
-                if i + 1 != int(num):
-                    LOGGER.error(
-                        f"Numbering seems incorrect in {release_yaml}. Try updating to the following:"
-                    )
-                    counter = 1
-                    with open(yaml_path, "r") as debugger:
-                        for line in debugger:
-                            if re.match(r"^\s{2}\d+:", line):
-                                LOGGER.info(f"  {counter}:"),
-                                counter += 1
-                            else:
-                                LOGGER.info(line.strip("\n") if line else line),
-                    raise RuntimeError(
-                        f"Line {i+1} in {release_yaml} is numbered incorrectly as {num}. Please correct the ordering."
-                    )
-
-
-@pytest.mark.quick_checks
-@pytest.mark.model("N/A")
-@pytest.mark.integration("release_images_no_overlaps")
-@pytest.mark.skipif(
-    not is_pr_context(),
-    reason="This test is only needed to validate release_images configs in PRs.",
-)
-def test_release_images_no_overlaps():
-    release_yamls = [
-        "release_images_patches.yml",
-        "release_images_training.yml",
-        "release_images_inference.yml",
-    ]
-    dlc_base_dir = get_repository_local_path()
-
-    configs = []
-
-    for release_yaml in release_yamls:
-        yaml_path = os.path.join(dlc_base_dir, release_yaml)
-
-        with open(yaml_path, "r") as rf:
-            contents = yaml.safe_load(rf)
-            for num, imgs in contents["release_images"].items():
-                if imgs in configs:
-                    raise RuntimeError(
-                        f"Found duplicate configs for {imgs} in {release_yaml}. These could be coming from another release yaml or the same file - please double check."
-                    )
-                configs.append(imgs)
-
-
-def _release_images_yml_verifier(image_type, excluded_image_type):
-    """
-    Simple test to ensure release images yml file is loadable
-    Also test that excluded_image_type is not present in the release yml file
-    """
-    dlc_base_dir = get_repository_local_path()
-
-    release_images_yml_file = os.path.join(dlc_base_dir, f"release_images_{image_type}.yml")
-
-    # Define exclude regex
-    exclude_pattern = re.compile(rf"{excluded_image_type}", re.IGNORECASE)
-
-    with open(release_images_yml_file, "r") as release_imgs_handle:
-        for line in release_imgs_handle:
-            assert not exclude_pattern.search(
-                line
-            ), f"{exclude_pattern.pattern} found in {release_images_yml_file}. Please ensure there are not conflicting job types here."
-        try:
-            yaml.safe_load(release_imgs_handle)
-        except yaml.YAMLError as e:
-            raise RuntimeError(
-                f"Failed to load {release_images_yml_file} via pyyaml package. Please check the contents of the file, correct errors and retry."
-            ) from e
diff --git a/test/test_utils/metrics.py b/test/test_utils/metrics.py
index a6ed005cd4a3..c9cd2a6d2ad0 100644
--- a/test/test_utils/metrics.py
+++ b/test/test_utils/metrics.py
@@ -42,15 +42,9 @@ def send_test_duration_metrics(start_time):
     :param start_time: start time of the test execution
     """
     cloudwatch_client = boto3.client("cloudwatch")
-    use_scheduler = os.getenv("USE_SCHEDULER", "False").lower() == "true"
     executor_mode = os.getenv("EXECUTOR_MODE", "False").lower() == "true"
     if not executor_mode:  # metrics should only be sent by the test CB
-        if use_scheduler:
-            metric_data = construct_duration_metrics_data(start_time, "With Scheduler")
-
-        else:
-            metric_data = construct_duration_metrics_data(start_time, "Without Scheduler")
-
+        metric_data = construct_duration_metrics_data(start_time, "Without Scheduler")
     cloudwatch_client.put_metric_data(Namespace="DLCCI", MetricData=[metric_data])
 
 
@@ -60,13 +54,7 @@ def send_test_result_metrics(stdout):
     :param stdout: 0/1. 0 indicates no error during test execution, 1 indicates errors occurred
     """
     cloudwatch_client = boto3.client("cloudwatch")
-    use_scheduler = os.getenv("USE_SCHEDULER", "False").lower() == "true"
     executor_mode = os.getenv("EXECUTOR_MODE", "False").lower() == "true"
     if not executor_mode:  # metrics should only be sent by the test CB
-        if use_scheduler:
-            metric_data = construct_test_result_metrics_data(stdout, "With Scheduler")
-
-        else:
-            metric_data = construct_test_result_metrics_data(stdout, "Without Scheduler")
-
+        metric_data = construct_test_result_metrics_data(stdout, "Without Scheduler")
     cloudwatch_client.put_metric_data(Namespace="DLCCI", MetricData=[metric_data])
diff --git a/test/testrunner.py b/test/testrunner.py
index e3f0b73b8e0c..6ccef81e95cc 100644
--- a/test/testrunner.py
+++ b/test/testrunner.py
@@ -121,57 +121,11 @@ def print_log_stream(logs):
     LOGGER.info("Print log stream complete.")
 
 
-def send_scheduler_requests(requester, image):
-    """
-    Send a PR test request through the requester, and wait for the response.
-    If test completed or encountered runtime error, create local XML reports.
-    Otherwise the test failed, print the failure reason.
-
-    :param requester: JobRequester object
-    :param image: ECR URI
-    """
-    # Note: 3 is the max number of instances required for any tests. Here we schedule tests conservatively.
-    identifier = requester.send_request(image, "PR", 3)
-    image_tag = image.split(":")[-1]
-    report_path = os.path.join(os.getcwd(), "test", f"{image_tag}.xml")
-    while True:
-        query_status_response = requester.query_status(identifier)
-        test_status = query_status_response["status"]
-        if test_status == "completed":
-            LOGGER.info(f"Test for image {image} completed.")
-            logs_response = requester.receive_logs(identifier)
-            LOGGER.info(
-                f"Receive logs success for ticket {identifier.ticket_name}, report path: {report_path}"
-            )
-            print_log_stream(logs_response)
-            metrics_utils.send_test_result_metrics(0)
-            with open(report_path, "w") as xml_report:
-                xml_report.write(logs_response["XML_REPORT"])
-            break
-
-        elif test_status == "runtimeError":
-            logs_response = requester.receive_logs(identifier)
-            with open(report_path, "w") as xml_report:
-                xml_report.write(logs_response["XML_REPORT"])
-            print_log_stream(logs_response)
-            metrics_utils.send_test_result_metrics(1)
-            raise Exception(f"Test for image {image} ran into runtime error.")
-            break
-
-        elif test_status == "failed":
-            metrics_utils.send_test_result_metrics(1)
-            raise Exception(
-                f"Scheduling failed for image {image}. Reason: {query_status_response['reason']}"
-            )
-            break
-
-
 def run_sagemaker_remote_tests(images, pytest_cache_params):
     """
     Function to set up multiprocessing for SageMaker tests
     :param images: List of all images to be used in SageMaker tests
     """
-    use_scheduler = os.getenv("USE_SCHEDULER", "False").lower() == "true"
     executor_mode = os.getenv("EXECUTOR_MODE", "False").lower() == "true"
 
     if executor_mode:
@@ -197,19 +151,6 @@ def run_sagemaker_remote_tests(images, pytest_cache_params):
                 "runtimeError", instance_type, num_of_instances, job_type, test_report
             )
         return
-
-    elif use_scheduler:
-        LOGGER.info("entered scheduler mode.")
-        import concurrent.futures
-        from job_requester import JobRequester
-
-        job_requester = JobRequester()
-        with concurrent.futures.ThreadPoolExecutor(max_workers=len(images)) as executor:
-            futures = [
-                executor.submit(send_scheduler_requests, job_requester, image) for image in images
-            ]
-            for future in futures:
-                future.result()
     else:
         if not images:
             return
diff --git a/testspec.yml b/testspec.yml
index 31abd98c35dd..045feabdba2e 100644
--- a/testspec.yml
+++ b/testspec.yml
@@ -20,7 +20,6 @@ phases:
         $(aws ecr get-login --no-include-email --region $AWS_DEFAULT_REGION --registry-ids 763104351884)
         fi
       - pip install -r test/requirements.txt
-      - pip install scheduler/.
       - echo Running pytest $TEST_TYPE tests on $DLC_IMAGES...
       - export PYTHONPATH=$PYTHONPATH:$(pwd)/src
       - python test/testrunner.py