# Echo a command line, execute it, and propagate its exit code.
#
# Prints "+ <command>" first so CI logs show exactly what was run, pads the
# output with blank lines, and returns the command's own exit status.  The
# `|| retcode=$?` guard captures a failure without tripping `set -e`.
print_exec () {
  local retcode=0
  echo "+ $*"
  echo ""
  # Run the command verbatim ("$@" preserves each argument as one word).
  "$@" || retcode=$?
  echo ""
  return "$retcode"
}
1924
2025exec_with_retries () {
@@ -205,7 +210,7 @@ run_python_test () {
205210 echo " ################################################################################"
206211 fi
207212
208- if conda run -n " ${env_name} " python -m pytest -v -rsx -s -W ignore::pytest.PytestCollectionWarning " ${python_test_file} " ; then
213+ if print_exec conda run -n " ${env_name} " python -m pytest -v -rsx -s -W ignore::pytest.PytestCollectionWarning " ${python_test_file} " ; then
209214 echo " [TEST] Python test suite PASSED: ${python_test_file} "
210215 else
211216 echo " [TEST] Python test suite FAILED: ${python_test_file} "
@@ -313,7 +318,7 @@ print_ec2_info () {
313318
314319
315320# ###############################################################################
316- # Environment Setup and Install Functions
321+ # Miniconda Setup Functions
317322# ###############################################################################
318323
319324setup_miniconda () {
@@ -398,6 +403,11 @@ create_conda_environment () {
398403 echo " [SETUP] Successfully created Conda environment: ${env_name} "
399404}
400405
406+
407+ # ###############################################################################
408+ # PyTorch Setup Functions
409+ # ###############################################################################
410+
401411install_pytorch_conda () {
402412 local env_name=" $1 "
403413 local pytorch_version=" $2 "
@@ -553,6 +563,28 @@ install_pytorch_pip () {
553563 echo " [INSTALL] NOTE: The installed version is: ${installed_pytorch_version} "
554564}
555565
566+
567+ # ###############################################################################
568+ # CUDA Setup Functions
569+ # ###############################################################################
570+
# Install the latest NVIDIA DKMS drivers on a CentOS/RHEL 7 host.
#
# Registers the EPEL and NVIDIA CUDA yum repositories, expires the yum
# cache, then installs `nvidia-driver-latest-dkms` through the shared
# install_system_packages helper.  All privileged commands are echoed and
# run through print_exec.
install_nvidia_drivers_centos () {
  echo "################################################################################"
  echo "# Install NVIDIA Drivers"
  echo "#"
  echo "# [TIMESTAMP] $(date --utc +%FT%T.%3NZ)"
  echo "################################################################################"
  echo ""

  echo "[SETUP] Adding NVIDIA repos to yum ..."
  print_exec sudo yum install -y https://dl.fedoraproject.org/pub/epel/epel-release-latest-7.noarch.rpm
  print_exec sudo yum-config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel7/x86_64/cuda-rhel7.repo
  print_exec sudo yum clean expire-cache

  echo "[SETUP] Installing NVIDIA drivers ..."
  install_system_packages nvidia-driver-latest-dkms
}
587+
556588install_cuda () {
557589 local env_name=" $1 "
558590 local cuda_version=" $2 "
@@ -604,6 +636,86 @@ install_cuda () {
604636 echo " [INSTALL] Successfully installed CUDA ${cuda_version} "
605637}
606638
# Install cuDNN into a local path and point a Conda environment at it.
#
# Arguments:
#   $1 - Conda environment name (receives CUDNN_INCLUDE_DIR / CUDNN_LIBRARY)
#   $2 - install path for cuDNN (wiped and recreated)
#   $3 - CUDA version, e.g. 11.7 (selects the matching cuDNN tarball)
# Returns: 0 on success, 1 on bad usage or download/cd failure.
#
# Based on the install script in
# https://github.com/pytorch/builder/blob/main/common/install_cuda.sh
install_cudnn () {
  local env_name="$1"
  local install_path="$2"
  local cuda_version="$3"
  if [ "$cuda_version" == "" ]; then
    echo "Usage: ${FUNCNAME[0]} ENV_NAME INSTALL_PATH CUDA_VERSION"
    echo "Example:"
    echo "    ${FUNCNAME[0]} build_env \$(pwd)/cudnn_install 11.7"
    return 1
  else
    echo "################################################################################"
    echo "# Install cuDNN"
    echo "#"
    echo "# [TIMESTAMP] $(date --utc +%FT%T.%3NZ)"
    echo "################################################################################"
    echo ""
  fi

  # Map of CUDA major+minor ("117") to the cuDNN tarball URL.
  # FIX: declared with -A (associative).  The original declared an indexed
  # array with string-key syntax (SC2190/SC2191), which only worked because
  # the keys happen to be numeric.
  local -A cudnn_packages=(
    ["115"]="https://developer.download.nvidia.com/compute/redist/cudnn/v8.3.2/local_installers/11.5/cudnn-linux-x86_64-8.3.2.44_cuda11.5-archive.tar.xz"
    ["116"]="https://developer.download.nvidia.com/compute/redist/cudnn/v8.3.2/local_installers/11.5/cudnn-linux-x86_64-8.3.2.44_cuda11.5-archive.tar.xz"
    ["117"]="https://ossci-linux.s3.amazonaws.com/cudnn-linux-x86_64-8.5.0.96_cuda11-archive.tar.xz"
    ["118"]="https://developer.download.nvidia.com/compute/redist/cudnn/v8.7.0/local_installers/11.8/cudnn-linux-x86_64-8.7.0.84_cuda11-archive.tar.xz"
  )

  # Split version string by dot, i.e. 11.7.1 => [11, 7, 1], then concat the
  # major and minor components to form the lookup key.
  # shellcheck disable=SC2206
  local cuda_version_arr=(${cuda_version//./ })
  local cuda_concat_version="${cuda_version_arr[0]}${cuda_version_arr[1]}"

  # Look up the tarball URL ($ is required for associative-array subscript
  # expansion); default to the CUDA 11.7 build if no entry fits.
  local cudnn_url="${cudnn_packages[$cuda_concat_version]}"
  if [ "$cudnn_url" == "" ]; then
    echo "[INSTALL] Defaulting to cuDNN for CUDA 11.7"
    cudnn_url="${cudnn_packages[117]}"
  fi

  # Clear out and recreate the install path.
  rm -rf "$install_path"
  mkdir -p "$install_path"

  # Create a temporary working directory.  FIX: declaration is split from
  # assignment so a mktemp failure is not masked by `local` (SC2155).
  local tmp_dir
  tmp_dir=$(mktemp -d) || return 1
  cd "$tmp_dir" || return 1

  # Download cuDNN (with retries, in a subshell so a failure only fails this step).
  echo "[INSTALL] Downloading cuDNN to ${tmp_dir} ..."
  (exec_with_retries wget -q "$cudnn_url" -O cudnn.tar.xz) || return 1

  # Unpack the tarball.
  echo "[INSTALL] Unpacking cuDNN ..."
  tar -xvf cudnn.tar.xz

  # Move the includes and libs over to the install path.  The :? guard
  # aborts instead of expanding to "/include" if install_path is empty.
  echo "[INSTALL] Moving cuDNN files to ${install_path} ..."
  rm -rf "${install_path:?}/include"
  rm -rf "${install_path:?}/lib"
  mv cudnn-linux-*/include "$install_path"
  mv cudnn-linux-*/lib "$install_path"

  # Leave and delete the temporary directory.
  cd - || return 1
  rm -rf "$tmp_dir"

  # Export the environment variables to the Conda environment.
  echo "[INSTALL] Set environment variables CUDNN_INCLUDE_DIR and CUDNN_LIBRARY ..."
  print_exec conda env config vars set -n "${env_name}" CUDNN_INCLUDE_DIR="${install_path}/include" CUDNN_LIBRARY="${install_path}/lib"

  echo "[INSTALL] Successfully installed cuDNN (for CUDA ${cuda_version})"
}
714+
715+ # ###############################################################################
716+ # ROCm Setup Functions
717+ # ###############################################################################
718+
607719install_rocm_ubuntu () {
608720 local env_name=" $1 "
609721 local rocm_version=" $2 "
@@ -652,15 +764,25 @@ install_rocm_ubuntu () {
652764 (exec_with_retries amdgpu-install -y --usecase=hiplibsdk,rocm --no-dkms) || return 1
653765
654766 echo " [INSTALL] Installing HIP-relevant packages ..."
655- install_system_packages mesa-common-dev clang comgr libopenblas-dev jp intel-mkl-full locales libnuma-dev
656767 install_system_packages hipify-clang miopen-hip miopen-hip-dev
657768
769+ # There is no need to install these packages for ROCm
770+ # install_system_packages mesa-common-dev clang comgr libopenblas-dev jp intel-mkl-full locales libnuma-dev
771+
658772 echo " [INSTALL] Cleaning up ..."
659773 print_exec rm -f " ${package_name} "
660774
775+ echo " [INFO] Check ROCM GPU info ..."
776+ print_exec rocm-smi
777+
661778 echo " [INSTALL] Successfully installed ROCm ${rocm_version} "
662779}
663780
781+
782+ # ###############################################################################
783+ # Build Tools Setup Functions
784+ # ###############################################################################
785+
664786install_cxx_compiler () {
665787 local env_name=" $1 "
666788 local use_system_package_manager=" $2 "
@@ -759,82 +881,6 @@ install_build_tools () {
759881 echo " [INSTALL] Successfully installed all the build tools"
760882}
761883
762- install_cudnn () {
763- local env_name=" $1 "
764- local install_path=" $2 "
765- local cuda_version=" $3 "
766- if [ " $cuda_version " == " " ]; then
767- echo " Usage: ${FUNCNAME[0]} ENV_NAME INSTALL_PATH CUDA_VERSION"
768- echo " Example:"
769- echo " ${FUNCNAME[0]} build_env \$ (pwd)/cudnn_install 11.7"
770- return 1
771- else
772- echo " ################################################################################"
773- echo " # Install cuDNN"
774- echo " #"
775- echo " # [TIMESTAMP] $( date --utc +%FT%T.%3NZ) "
776- echo " ################################################################################"
777- echo " "
778- fi
779-
780- # Install cuDNN manually
781- # Based on install script in https://github.com/pytorch/builder/blob/main/common/install_cuda.sh
782- local cudnn_packages=(
783- [" 115" ]=" https://developer.download.nvidia.com/compute/redist/cudnn/v8.3.2/local_installers/11.5/cudnn-linux-x86_64-8.3.2.44_cuda11.5-archive.tar.xz"
784- [" 116" ]=" https://developer.download.nvidia.com/compute/redist/cudnn/v8.3.2/local_installers/11.5/cudnn-linux-x86_64-8.3.2.44_cuda11.5-archive.tar.xz"
785- [" 117" ]=" https://ossci-linux.s3.amazonaws.com/cudnn-linux-x86_64-8.5.0.96_cuda11-archive.tar.xz"
786- [" 118" ]=" https://developer.download.nvidia.com/compute/redist/cudnn/v8.7.0/local_installers/11.8/cudnn-linux-x86_64-8.7.0.84_cuda11-archive.tar.xz"
787- )
788-
789- # Split version string by dot into array, i.e. 11.7.1 => [11, 7, 1]
790- # shellcheck disable=SC2206
791- local cuda_version_arr=(${cuda_version// ./ } )
792- # Fetch the major and minor version to concat
793- local cuda_concat_version=" ${cuda_version_arr[0]}${cuda_version_arr[1]} "
794-
795- # Get the URL
796- local cudnn_url=" ${cudnn_packages[cuda_concat_version]} "
797- if [ " $cudnn_url " == " " ]; then
798- # Default to cuDNN for 11.7 if no CUDA version fits
799- echo " [INSTALL] Defaulting to cuDNN for CUDA 11.7"
800- cudnn_url=" ${cudnn_packages[117]} "
801- fi
802-
803- # Clear the install path
804- rm -rf " $install_path "
805- mkdir -p " $install_path "
806-
807- # Create temporary directory
808- # shellcheck disable=SC2155
809- local tmp_dir=$( mktemp -d)
810- cd " $tmp_dir " || return 1
811-
812- # Download cuDNN
813- echo " [INSTALL] Downloading cuDNN to ${tmp_dir} ..."
814- (exec_with_retries wget -q " $cudnn_url " -O cudnn.tar.xz) || return 1
815-
816- # Unpack the tarball
817- echo " [INSTALL] Unpacking cuDNN ..."
818- tar -xvf cudnn.tar.xz
819-
820- # Copy the includes and libs over to the install path
821- echo " [INSTALL] Moving cuDNN files to ${install_path} ..."
822- rm -rf " ${install_path:? } /include"
823- rm -rf " ${install_path:? } /lib"
824- mv cudnn-linux-* /include " $install_path "
825- mv cudnn-linux-* /lib " $install_path "
826-
827- # Delete the temporary directory
828- cd - || return 1
829- rm -rf " $tmp_dir "
830-
831- # Export the environment variables to the Conda environment
832- echo " [INSTALL] Set environment variables CUDNN_INCLUDE_DIR and CUDNN_LIBRARY ..."
833- print_exec conda env config vars set -n " ${env_name} " CUDNN_INCLUDE_DIR=" ${install_path} /include" CUDNN_LIBRARY=" ${install_path} /lib"
834-
835- echo " [INSTALL] Successfully installed cuDNN (for CUDA ${cuda_version} )"
836- }
837-
838884
839885# ###############################################################################
840886# Combination Functions
@@ -876,7 +922,7 @@ create_conda_pytorch_environment () {
876922
877923
878924# ###############################################################################
879- # Build Functions
925+ # FBGEMM_GPU Build Functions
880926# ###############################################################################
881927
882928prepare_fbgemm_gpu_build () {
@@ -895,6 +941,11 @@ prepare_fbgemm_gpu_build () {
895941 echo " "
896942 fi
897943
944+ if [[ " ${GITHUB_WORKSPACE} " ]]; then
945+ # https://github.com/actions/checkout/issues/841
946+ git config --global --add safe.directory " ${GITHUB_WORKSPACE} "
947+ fi
948+
898949 echo " [BUILD] Running git submodules update ..."
899950 git submodule sync
900951 git submodule update --init --recursive
0 commit comments