From 052b73042d2aeef25d1f7d3663d7ae38ecbd34a9 Mon Sep 17 00:00:00 2001 From: Hao Zhu Date: Mon, 2 May 2022 14:25:15 -0700 Subject: [PATCH 01/11] Update 2204 script and the way to install gpu driver for ubuntu Signed-off-by: Hao Zhu --- gpu/install_gpu_driver.sh | 22 +--------------------- 1 file changed, 1 insertion(+), 21 deletions(-) diff --git a/gpu/install_gpu_driver.sh b/gpu/install_gpu_driver.sh index 6f64bb5a3..60cfc04f7 100755 --- a/gpu/install_gpu_driver.sh +++ b/gpu/install_gpu_driver.sh @@ -68,7 +68,6 @@ readonly NVIDIA_DEBIAN_CUDA_URL # Parameters for NVIDIA-provided Ubuntu GPU driver readonly NVIDIA_UBUNTU_REPOSITORY_URL="${NVIDIA_BASE_DL_URL}/cuda/repos/ubuntu1804/x86_64" -readonly NVIDIA_UBUNTU_REPOSITORY_KEY="${NVIDIA_UBUNTU_REPOSITORY_URL}/7fa2af80.pub" readonly NVIDIA_UBUNTU_REPOSITORY_CUDA_PIN="${NVIDIA_UBUNTU_REPOSITORY_URL}/cuda-ubuntu1804.pin" # Parameter for NVIDIA-provided Rocky Linux GPU driver @@ -169,9 +168,7 @@ EOF # Install NVIDIA GPU driver provided by NVIDIA function install_nvidia_gpu_driver() { - if [[ ${OS_NAME} == debian ]]; then - curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \ - "${NVIDIA_UBUNTU_REPOSITORY_KEY}" | apt-key add - + if [[ ${OS_NAME} == debian ]] || [[ ${OS_NAME} == ubuntu ]]; then curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \ "${NVIDIA_DEBIAN_GPU_DRIVER_URL}" -o driver.run bash "./driver.run" --silent --install-libglvnd @@ -179,23 +176,6 @@ function install_nvidia_gpu_driver() { curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \ "${NVIDIA_DEBIAN_CUDA_URL}" -o cuda.run bash "./cuda.run" --silent --toolkit --no-opengl-libs - elif [[ ${OS_NAME} == ubuntu ]]; then - curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \ - "${NVIDIA_UBUNTU_REPOSITORY_KEY}" | apt-key add - - curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \ - "${NVIDIA_UBUNTU_REPOSITORY_CUDA_PIN}" -o /etc/apt/preferences.d/cuda-repository-pin-600 - - add-apt-repository "deb ${NVIDIA_UBUNTU_REPOSITORY_URL} /" - execute_with_retries "apt-get update" - - if [[ -n "${CUDA_VERSION}" ]]; then - local -r cuda_package=cuda-toolkit-${CUDA_VERSION//./-} - else - local -r cuda_package=cuda-toolkit - fi - # Without --no-install-recommends this takes a very long time. - execute_with_retries "apt-get install -y -q --no-install-recommends cuda-drivers-460" - execute_with_retries "apt-get install -y -q --no-install-recommends ${cuda_package}" elif [[ ${OS_NAME} == rocky ]]; then execute_with_retries "dnf config-manager --add-repo ${NVIDIA_ROCKY_REPOSITORY_URL}" execute_with_retries "dnf clean all" From b100d52ad6c8779dbe76d4bea17d062263882395 Mon Sep 17 00:00:00 2001 From: Hao Zhu Date: Mon, 2 May 2022 14:27:55 -0700 Subject: [PATCH 02/11] update rapids 2204 Signed-off-by: Hao Zhu --- rapids/rapids.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/rapids/rapids.sh b/rapids/rapids.sh index 82253cbd3..be0e67492 100644 --- a/rapids/rapids.sh +++ b/rapids/rapids.sh @@ -8,17 +8,17 @@ function get_metadata_attribute() { /usr/share/google/get_metadata_value "attributes/${attribute_name}" || echo -n "${default_value}" } -readonly DEFAULT_RAPIDS_VERSION="22.02" +readonly DEFAULT_RAPIDS_VERSION="22.04" readonly RAPIDS_VERSION=$(get_metadata_attribute 'rapids-version' ${DEFAULT_RAPIDS_VERSION}) readonly SPARK_VERSION_ENV=$(spark-submit --version 2>&1 | sed -n 's/.*version[[:blank:]]\+\([0-9]\+\.[0-9]\).*/\1/p' | head -n1) -readonly DEFAULT_SPARK_RAPIDS_VERSION="22.02.0" +readonly DEFAULT_SPARK_RAPIDS_VERSION="22.04.0" if [[ "${SPARK_VERSION_ENV}" == "3"* ]]; then readonly DEFAULT_CUDA_VERSION="11.0" - readonly DEFAULT_CUDF_VERSION="22.02.0" + readonly DEFAULT_CUDF_VERSION="22.04.0" readonly DEFAULT_XGBOOST_VERSION="1.4.2" - readonly DEFAULT_XGBOOST_GPU_SUB_VERSION="0.2.0" + readonly DEFAULT_XGBOOST_GPU_SUB_VERSION="0.3.0" # TODO: uncomment when Spark 3.1 jars will be released - RAPIDS work with Spark 3.1, this is just for Maven URL # readonly SPARK_VERSION="${SPARK_VERSION_ENV}" readonly SPARK_VERSION="3.0" From 74c21210ea1c4ad55c30ce5941f9fb08747146e7 Mon Sep 17 00:00:00 2001 From: Hao Zhu Date: Mon, 2 May 2022 14:25:15 -0700 Subject: [PATCH 03/11] Update 2204 script and the way to install gpu driver for ubuntu Signed-off-by: Hao Zhu --- gpu/install_gpu_driver.sh | 22 +--------------------- 1 file changed, 1 insertion(+), 21 deletions(-) diff --git a/gpu/install_gpu_driver.sh b/gpu/install_gpu_driver.sh index 6f64bb5a3..60cfc04f7 100755 --- a/gpu/install_gpu_driver.sh +++ b/gpu/install_gpu_driver.sh @@ -68,7 +68,6 @@ readonly NVIDIA_DEBIAN_CUDA_URL # Parameters for NVIDIA-provided Ubuntu GPU driver readonly NVIDIA_UBUNTU_REPOSITORY_URL="${NVIDIA_BASE_DL_URL}/cuda/repos/ubuntu1804/x86_64" -readonly NVIDIA_UBUNTU_REPOSITORY_KEY="${NVIDIA_UBUNTU_REPOSITORY_URL}/7fa2af80.pub" readonly NVIDIA_UBUNTU_REPOSITORY_CUDA_PIN="${NVIDIA_UBUNTU_REPOSITORY_URL}/cuda-ubuntu1804.pin" # Parameter for NVIDIA-provided Rocky Linux GPU driver @@ -169,9 +168,7 @@ EOF # Install NVIDIA GPU driver provided by NVIDIA function install_nvidia_gpu_driver() { - if [[ ${OS_NAME} == debian ]]; then - curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \ - "${NVIDIA_UBUNTU_REPOSITORY_KEY}" | apt-key add - + if [[ ${OS_NAME} == debian ]] || [[ ${OS_NAME} == ubuntu ]]; then curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \ "${NVIDIA_DEBIAN_GPU_DRIVER_URL}" -o driver.run bash "./driver.run" --silent --install-libglvnd @@ -179,23 +176,6 @@ function install_nvidia_gpu_driver() { curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \ "${NVIDIA_DEBIAN_CUDA_URL}" -o cuda.run bash "./cuda.run" --silent --toolkit --no-opengl-libs - elif [[ ${OS_NAME} == ubuntu ]]; then - curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \ - "${NVIDIA_UBUNTU_REPOSITORY_KEY}" | apt-key add - - curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \ - "${NVIDIA_UBUNTU_REPOSITORY_CUDA_PIN}" -o /etc/apt/preferences.d/cuda-repository-pin-600 - - add-apt-repository "deb ${NVIDIA_UBUNTU_REPOSITORY_URL} /" - execute_with_retries "apt-get update" - - if [[ -n "${CUDA_VERSION}" ]]; then - local -r cuda_package=cuda-toolkit-${CUDA_VERSION//./-} - else - local -r cuda_package=cuda-toolkit - fi - # Without --no-install-recommends this takes a very long time. - execute_with_retries "apt-get install -y -q --no-install-recommends cuda-drivers-460" - execute_with_retries "apt-get install -y -q --no-install-recommends ${cuda_package}" elif [[ ${OS_NAME} == rocky ]]; then execute_with_retries "dnf config-manager --add-repo ${NVIDIA_ROCKY_REPOSITORY_URL}" execute_with_retries "dnf clean all" From 3e4ab1f5e80905f16f06a634f66e9e2a517e6d70 Mon Sep 17 00:00:00 2001 From: Hao Zhu Date: Mon, 2 May 2022 14:27:55 -0700 Subject: [PATCH 04/11] update rapids 2204 Signed-off-by: Hao Zhu --- rapids/rapids.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/rapids/rapids.sh b/rapids/rapids.sh index 82253cbd3..be0e67492 100644 --- a/rapids/rapids.sh +++ b/rapids/rapids.sh @@ -8,17 +8,17 @@ function get_metadata_attribute() { /usr/share/google/get_metadata_value "attributes/${attribute_name}" || echo -n "${default_value}" } -readonly DEFAULT_RAPIDS_VERSION="22.02" +readonly DEFAULT_RAPIDS_VERSION="22.04" readonly RAPIDS_VERSION=$(get_metadata_attribute 'rapids-version' ${DEFAULT_RAPIDS_VERSION}) readonly SPARK_VERSION_ENV=$(spark-submit --version 2>&1 | sed -n 's/.*version[[:blank:]]\+\([0-9]\+\.[0-9]\).*/\1/p' | head -n1) -readonly DEFAULT_SPARK_RAPIDS_VERSION="22.02.0" +readonly DEFAULT_SPARK_RAPIDS_VERSION="22.04.0" if [[ "${SPARK_VERSION_ENV}" == "3"* ]]; then readonly DEFAULT_CUDA_VERSION="11.0" - readonly DEFAULT_CUDF_VERSION="22.02.0" + readonly DEFAULT_CUDF_VERSION="22.04.0" readonly DEFAULT_XGBOOST_VERSION="1.4.2" - readonly DEFAULT_XGBOOST_GPU_SUB_VERSION="0.2.0" + readonly DEFAULT_XGBOOST_GPU_SUB_VERSION="0.3.0" # TODO: uncomment when Spark 3.1 jars will be released - RAPIDS work with Spark 3.1, this is just for Maven URL # readonly SPARK_VERSION="${SPARK_VERSION_ENV}" readonly SPARK_VERSION="3.0" From 6e26e105f56976425e247b44522b391975b17fc0 Mon Sep 17 00:00:00 2001 From: Hao Zhu Date: Tue, 3 May 2022 09:42:17 -0700 Subject: [PATCH 05/11] Add key back Signed-off-by: Hao Zhu --- gpu/install_gpu_driver.sh | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/gpu/install_gpu_driver.sh b/gpu/install_gpu_driver.sh index 60cfc04f7..37af08ed8 100755 --- a/gpu/install_gpu_driver.sh +++ b/gpu/install_gpu_driver.sh @@ -45,6 +45,7 @@ readonly CUDA_VERSION readonly DEFAULT_NCCL_REPO_URL="${NVIDIA_BASE_DL_URL}/machine-learning/repos/ubuntu1804/x86_64/nvidia-machine-learning-repo-ubuntu1804_1.0.0-1_amd64.deb" NCCL_REPO_URL=$(get_metadata_attribute 'nccl-repo-url' "${DEFAULT_NCCL_REPO_URL}") readonly NCCL_REPO_URL +readonly NVIDIA_NCCL_UBUNTU_REPOSITORY_KEY="${NCCL_REPO_URL}/7fa2af80.pub" readonly DEFAULT_NCCL_VERSION="2.8.3" readonly DEFAULT_NCCL_VERSION_ROCKY="2.8.4" @@ -68,6 +69,7 @@ readonly NVIDIA_DEBIAN_CUDA_URL # Parameters for NVIDIA-provided Ubuntu GPU driver readonly NVIDIA_UBUNTU_REPOSITORY_URL="${NVIDIA_BASE_DL_URL}/cuda/repos/ubuntu1804/x86_64" +readonly NVIDIA_UBUNTU_REPOSITORY_KEY="${NVIDIA_UBUNTU_REPOSITORY_URL}/3bf863cc.pub" readonly NVIDIA_UBUNTU_REPOSITORY_CUDA_PIN="${NVIDIA_UBUNTU_REPOSITORY_URL}/cuda-ubuntu1804.pin" # Parameter for NVIDIA-provided Rocky Linux GPU driver @@ -110,6 +112,8 @@ function install_nvidia_nccl() { if [[ ${OS_NAME} == rocky ]]; then execute_with_retries "dnf -y -q install libnccl-${nccl_version} libnccl-devel-${nccl_version} libnccl-static-${nccl_version}" elif [[ ${OS_NAME} == ubuntu ]] || [[ ${OS_NAME} == debian ]]; then + curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \ + "${NVIDIA_NCCL_UBUNTU_REPOSITORY_KEY}" | apt-key add - local tmp_dir tmp_dir=$(mktemp -d -t gpu-init-action-nccl-XXXX) @@ -169,6 +173,8 @@ EOF # Install NVIDIA GPU driver provided by NVIDIA function install_nvidia_gpu_driver() { if [[ ${OS_NAME} == debian ]] || [[ ${OS_NAME} == ubuntu ]]; then + curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \ + "${NVIDIA_UBUNTU_REPOSITORY_KEY}" | apt-key add - curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \ "${NVIDIA_DEBIAN_GPU_DRIVER_URL}" -o driver.run bash "./driver.run" --silent --install-libglvnd From dae72f6c6179c93c4bba85a510949eb5cdfd4e76 Mon Sep 17 00:00:00 2001 From: Hao Zhu Date: Tue, 3 May 2022 11:53:05 -0700 Subject: [PATCH 06/11] Fix key repo Signed-off-by: Hao Zhu --- gpu/install_gpu_driver.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gpu/install_gpu_driver.sh b/gpu/install_gpu_driver.sh index 37af08ed8..b999ebbf2 100755 --- a/gpu/install_gpu_driver.sh +++ b/gpu/install_gpu_driver.sh @@ -45,7 +45,7 @@ readonly CUDA_VERSION readonly DEFAULT_NCCL_REPO_URL="${NVIDIA_BASE_DL_URL}/machine-learning/repos/ubuntu1804/x86_64/nvidia-machine-learning-repo-ubuntu1804_1.0.0-1_amd64.deb" NCCL_REPO_URL=$(get_metadata_attribute 'nccl-repo-url' "${DEFAULT_NCCL_REPO_URL}") readonly NCCL_REPO_URL -readonly NVIDIA_NCCL_UBUNTU_REPOSITORY_KEY="${NCCL_REPO_URL}/7fa2af80.pub" +readonly NVIDIA_NCCL_UBUNTU_REPOSITORY_KEY="${NVIDIA_BASE_DL_URL}/machine-learning/repos/ubuntu1804/x86_64/7fa2af80.pub" readonly DEFAULT_NCCL_VERSION="2.8.3" readonly DEFAULT_NCCL_VERSION_ROCKY="2.8.4" From a1fb6d25ee38c989f4f855caef744865e9a4147c Mon Sep 17 00:00:00 2001 From: Hao Zhu Date: Tue, 3 May 2022 19:16:15 -0700 Subject: [PATCH 07/11] Change to old way Signed-off-by: Hao Zhu --- gpu/install_gpu_driver.sh | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/gpu/install_gpu_driver.sh b/gpu/install_gpu_driver.sh index b999ebbf2..7067c31a7 100755 --- a/gpu/install_gpu_driver.sh +++ b/gpu/install_gpu_driver.sh @@ -172,7 +172,7 @@ EOF # Install NVIDIA GPU driver provided by NVIDIA function install_nvidia_gpu_driver() { - if [[ ${OS_NAME} == debian ]] || [[ ${OS_NAME} == ubuntu ]]; then + if [[ ${OS_NAME} == debian ]]; then curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \ "${NVIDIA_UBUNTU_REPOSITORY_KEY}" | apt-key add - curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \ @@ -182,6 +182,23 @@ function install_nvidia_gpu_driver() { curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \ "${NVIDIA_DEBIAN_CUDA_URL}" -o cuda.run bash "./cuda.run" --silent --toolkit --no-opengl-libs + elif [[ ${OS_NAME} == ubuntu ]]; then + curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \ + "${NVIDIA_UBUNTU_REPOSITORY_KEY}" | apt-key add - + curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \ + "${NVIDIA_UBUNTU_REPOSITORY_CUDA_PIN}" -o /etc/apt/preferences.d/cuda-repository-pin-600 + + add-apt-repository "deb ${NVIDIA_UBUNTU_REPOSITORY_URL} /" + execute_with_retries "apt-get update" + + if [[ -n "${CUDA_VERSION}" ]]; then + local -r cuda_package=cuda-toolkit-${CUDA_VERSION//./-} + else + local -r cuda_package=cuda-toolkit + fi + # Without --no-install-recommends this takes a very long time. + execute_with_retries "apt-get install -y -q --no-install-recommends cuda-drivers-460" + execute_with_retries "apt-get install -y -q --no-install-recommends ${cuda_package}" elif [[ ${OS_NAME} == rocky ]]; then execute_with_retries "dnf config-manager --add-repo ${NVIDIA_ROCKY_REPOSITORY_URL}" execute_with_retries "dnf clean all" From f86cff38bf77af27cc96a36d091574fea01f0eee Mon Sep 17 00:00:00 2001 From: Igor Dvorzhak Date: Tue, 3 May 2022 20:35:39 -0700 Subject: [PATCH 08/11] Update install_gpu_driver.sh --- gpu/install_gpu_driver.sh | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/gpu/install_gpu_driver.sh b/gpu/install_gpu_driver.sh index 7067c31a7..3c4c98c48 100755 --- a/gpu/install_gpu_driver.sh +++ b/gpu/install_gpu_driver.sh @@ -112,8 +112,11 @@ function install_nvidia_nccl() { if [[ ${OS_NAME} == rocky ]]; then execute_with_retries "dnf -y -q install libnccl-${nccl_version} libnccl-devel-${nccl_version} libnccl-static-${nccl_version}" elif [[ ${OS_NAME} == ubuntu ]] || [[ ${OS_NAME} == debian ]]; then - curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \ - "${NVIDIA_NCCL_UBUNTU_REPOSITORY_KEY}" | apt-key add - + if [[ ${OS_NAME} == ubuntu ]]; then + curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \ + "${NVIDIA_NCCL_UBUNTU_REPOSITORY_KEY}" | apt-key add - + fi + local tmp_dir tmp_dir=$(mktemp -d -t gpu-init-action-nccl-XXXX) From 73d14892d121631dff089466f936d99936f4d818 Mon Sep 17 00:00:00 2001 From: Igor Dvorzhak Date: Tue, 3 May 2022 20:37:08 -0700 Subject: [PATCH 09/11] Update install_gpu_driver.sh --- gpu/install_gpu_driver.sh | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/gpu/install_gpu_driver.sh b/gpu/install_gpu_driver.sh index 3c4c98c48..259be4c3d 100755 --- a/gpu/install_gpu_driver.sh +++ b/gpu/install_gpu_driver.sh @@ -45,7 +45,7 @@ readonly CUDA_VERSION readonly DEFAULT_NCCL_REPO_URL="${NVIDIA_BASE_DL_URL}/machine-learning/repos/ubuntu1804/x86_64/nvidia-machine-learning-repo-ubuntu1804_1.0.0-1_amd64.deb" NCCL_REPO_URL=$(get_metadata_attribute 'nccl-repo-url' "${DEFAULT_NCCL_REPO_URL}") readonly NCCL_REPO_URL -readonly NVIDIA_NCCL_UBUNTU_REPOSITORY_KEY="${NVIDIA_BASE_DL_URL}/machine-learning/repos/ubuntu1804/x86_64/7fa2af80.pub" +readonly NVIDIA_NCCL_UBUNTU_REPO_KEY="${NVIDIA_BASE_DL_URL}/machine-learning/repos/ubuntu1804/x86_64/7fa2af80.pub" readonly DEFAULT_NCCL_VERSION="2.8.3" readonly DEFAULT_NCCL_VERSION_ROCKY="2.8.4" @@ -68,12 +68,12 @@ NVIDIA_DEBIAN_CUDA_URL=$(get_metadata_attribute 'cuda-url' "${DEFAULT_NVIDIA_DEB readonly NVIDIA_DEBIAN_CUDA_URL # Parameters for NVIDIA-provided Ubuntu GPU driver -readonly NVIDIA_UBUNTU_REPOSITORY_URL="${NVIDIA_BASE_DL_URL}/cuda/repos/ubuntu1804/x86_64" -readonly NVIDIA_UBUNTU_REPOSITORY_KEY="${NVIDIA_UBUNTU_REPOSITORY_URL}/3bf863cc.pub" -readonly NVIDIA_UBUNTU_REPOSITORY_CUDA_PIN="${NVIDIA_UBUNTU_REPOSITORY_URL}/cuda-ubuntu1804.pin" +readonly NVIDIA_UBUNTU_REPO_URL="${NVIDIA_BASE_DL_URL}/cuda/repos/ubuntu1804/x86_64" +readonly NVIDIA_UBUNTU_REPO_KEY="${NVIDIA_UBUNTU_REPO_URL}/3bf863cc.pub" +readonly NVIDIA_UBUNTU_REPO_CUDA_PIN="${NVIDIA_UBUNTU_REPO_URL}/cuda-ubuntu1804.pin" # Parameter for NVIDIA-provided Rocky Linux GPU driver -readonly NVIDIA_ROCKY_REPOSITORY_URL="${NVIDIA_BASE_DL_URL}/cuda/repos/rhel8/x86_64/cuda-rhel8.repo" +readonly NVIDIA_ROCKY_REPO_URL="${NVIDIA_BASE_DL_URL}/cuda/repos/rhel8/x86_64/cuda-rhel8.repo" # Parameters for NVIDIA-provided CUDNN library readonly CUDNN_VERSION=$(get_metadata_attribute 'cudnn-version' '') @@ -114,7 +114,7 @@ function install_nvidia_nccl() { elif [[ ${OS_NAME} == ubuntu ]] || [[ ${OS_NAME} == debian ]]; then if [[ ${OS_NAME} == ubuntu ]]; then curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \ - "${NVIDIA_NCCL_UBUNTU_REPOSITORY_KEY}" | apt-key add - + "${NVIDIA_NCCL_UBUNTU_REPO_KEY}" | apt-key add - fi local tmp_dir @@ -177,7 +177,7 @@ EOF function install_nvidia_gpu_driver() { if [[ ${OS_NAME} == debian ]]; then curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \ - "${NVIDIA_UBUNTU_REPOSITORY_KEY}" | apt-key add - + "${NVIDIA_UBUNTU_REPO_KEY}" | apt-key add - curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \ "${NVIDIA_DEBIAN_GPU_DRIVER_URL}" -o driver.run bash "./driver.run" --silent --install-libglvnd @@ -187,11 +187,11 @@ function install_nvidia_gpu_driver() { bash "./cuda.run" --silent --toolkit --no-opengl-libs elif [[ ${OS_NAME} == ubuntu ]]; then curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \ - "${NVIDIA_UBUNTU_REPOSITORY_KEY}" | apt-key add - + "${NVIDIA_UBUNTU_REPO_KEY}" | apt-key add - curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \ - "${NVIDIA_UBUNTU_REPOSITORY_CUDA_PIN}" -o /etc/apt/preferences.d/cuda-repository-pin-600 + "${NVIDIA_UBUNTU_REPO_CUDA_PIN}" -o /etc/apt/preferences.d/cuda-repository-pin-600 - add-apt-repository "deb ${NVIDIA_UBUNTU_REPOSITORY_URL} /" + add-apt-repository "deb ${NVIDIA_UBUNTU_REPO_URL} /" execute_with_retries "apt-get update" if [[ -n "${CUDA_VERSION}" ]]; then @@ -203,7 +203,7 @@ function install_nvidia_gpu_driver() { execute_with_retries "apt-get install -y -q --no-install-recommends cuda-drivers-460" execute_with_retries "apt-get install -y -q --no-install-recommends ${cuda_package}" elif [[ ${OS_NAME} == rocky ]]; then - execute_with_retries "dnf config-manager --add-repo ${NVIDIA_ROCKY_REPOSITORY_URL}" + execute_with_retries "dnf config-manager --add-repo ${NVIDIA_ROCKY_REPO_URL}" execute_with_retries "dnf clean all" execute_with_retries "dnf -y -q module install nvidia-driver:460-dkms" execute_with_retries "dnf -y -q install cuda-${CUDA_VERSION//./-}" From c541cf61e6e2122f10cdf2d0c5c13b4a8357d52a Mon Sep 17 00:00:00 2001 From: Igor Dvorzhak Date: Tue, 3 May 2022 23:55:04 -0700 Subject: [PATCH 10/11] Update install_gpu_driver.sh --- gpu/install_gpu_driver.sh | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/gpu/install_gpu_driver.sh b/gpu/install_gpu_driver.sh index 259be4c3d..d3ad88733 100755 --- a/gpu/install_gpu_driver.sh +++ b/gpu/install_gpu_driver.sh @@ -45,7 +45,7 @@ readonly CUDA_VERSION readonly DEFAULT_NCCL_REPO_URL="${NVIDIA_BASE_DL_URL}/machine-learning/repos/ubuntu1804/x86_64/nvidia-machine-learning-repo-ubuntu1804_1.0.0-1_amd64.deb" NCCL_REPO_URL=$(get_metadata_attribute 'nccl-repo-url' "${DEFAULT_NCCL_REPO_URL}") readonly NCCL_REPO_URL -readonly NVIDIA_NCCL_UBUNTU_REPO_KEY="${NVIDIA_BASE_DL_URL}/machine-learning/repos/ubuntu1804/x86_64/7fa2af80.pub" +readonly NCCL_REPO_KEY="${NVIDIA_BASE_DL_URL}/machine-learning/repos/ubuntu1804/x86_64/7fa2af80.pub" readonly DEFAULT_NCCL_VERSION="2.8.3" readonly DEFAULT_NCCL_VERSION_ROCKY="2.8.4" @@ -112,10 +112,7 @@ function install_nvidia_nccl() { if [[ ${OS_NAME} == rocky ]]; then execute_with_retries "dnf -y -q install libnccl-${nccl_version} libnccl-devel-${nccl_version} libnccl-static-${nccl_version}" elif [[ ${OS_NAME} == ubuntu ]] || [[ ${OS_NAME} == debian ]]; then - if [[ ${OS_NAME} == ubuntu ]]; then - curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \ - "${NVIDIA_NCCL_UBUNTU_REPO_KEY}" | apt-key add - - fi + curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 "${NCCL_REPO_KEY}" | apt-key add - local tmp_dir tmp_dir=$(mktemp -d -t gpu-init-action-nccl-XXXX) From 232117eba4c0e0561d3032aa83d523b2fe6299fb Mon Sep 17 00:00:00 2001 From: Igor Dvorzhak Date: Wed, 4 May 2022 13:42:08 -0700 Subject: [PATCH 11/11] Fix up --- dask/dask.sh | 2 +- rapids/rapids.sh | 21 ++++----------------- 2 files changed, 5 insertions(+), 18 deletions(-) diff --git a/dask/dask.sh b/dask/dask.sh index c51fc5581..f954e11d3 100644 --- a/dask/dask.sh +++ b/dask/dask.sh @@ -36,7 +36,7 @@ readonly MASTER="$(/usr/share/google/get_metadata_value attributes/dataproc-mast readonly DASK_LAUNCHER=/usr/local/bin/dask-launcher.sh readonly DASK_SERVICE=dask-cluster -CONDA_PACKAGES=("dask=${DASK_VERSION}" 'dask-bigquery' 'dask-ml') +CONDA_PACKAGES=("dask=${DASK_VERSION}" 'dask-bigquery' 'dask-ml' 'dask-sql') if [[ "${DASK_RUNTIME}" == 'yarn' ]]; then # Pin `distributed` package version because `dask-yarn` 0.9 diff --git a/rapids/rapids.sh b/rapids/rapids.sh index be0e67492..03812a00c 100644 --- a/rapids/rapids.sh +++ b/rapids/rapids.sh @@ -15,7 +15,7 @@ readonly SPARK_VERSION_ENV=$(spark-submit --version 2>&1 | sed -n 's/.*version[[ readonly DEFAULT_SPARK_RAPIDS_VERSION="22.04.0" if [[ "${SPARK_VERSION_ENV}" == "3"* ]]; then - readonly DEFAULT_CUDA_VERSION="11.0" + readonly DEFAULT_CUDA_VERSION="11.2" readonly DEFAULT_CUDF_VERSION="22.04.0" readonly DEFAULT_XGBOOST_VERSION="1.4.2" readonly DEFAULT_XGBOOST_GPU_SUB_VERSION="0.3.0" @@ -65,22 +65,9 @@ function execute_with_retries() { } function install_dask_rapids() { - local base - base=$(conda info --base) - local -r mamba_env=mamba - - # Using mamba significantly reduces the conda solve-time. Create a separate conda - # environment with mamba installed to manage installations. - conda create -y -n ${mamba_env} -c conda-forge mamba - - # Install RAPIDS, cudatoolkit. Use mamba in new env to resolve base environment - ${base}/envs/${mamba_env}/bin/mamba install -y \ - -c "rapidsai" -c "nvidia" -c "conda-forge" -c "defaults" \ - "cudatoolkit=${CUDA_VERSION}" "rapids=${RAPIDS_VERSION}" "dask-sql" \ - -p ${base} - - # Remove mamba env - conda env remove -n ${mamba_env} + # Install RAPIDS, cudatoolkit + mamba install -y --no-channel-priority -c 'conda-forge' -c 'nvidia' -c 'rapidsai' \ + "cudatoolkit=${CUDA_VERSION}" "rapids=${RAPIDS_VERSION}" } function install_spark_rapids() {