Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[gpu][rapids] Fix GPU init action on Ubuntu and update Spark RAPIDS to 22.04 #991

Merged
merged 12 commits into from
May 5, 2022
2 changes: 1 addition & 1 deletion dask/dask.sh
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ readonly MASTER="$(/usr/share/google/get_metadata_value attributes/dataproc-mast
readonly DASK_LAUNCHER=/usr/local/bin/dask-launcher.sh
readonly DASK_SERVICE=dask-cluster

CONDA_PACKAGES=("dask=${DASK_VERSION}" 'dask-bigquery' 'dask-ml')
CONDA_PACKAGES=("dask=${DASK_VERSION}" 'dask-bigquery' 'dask-ml' 'dask-sql')

if [[ "${DASK_RUNTIME}" == 'yarn' ]]; then
# Pin `distributed` package version because `dask-yarn` 0.9
Expand Down
21 changes: 12 additions & 9 deletions gpu/install_gpu_driver.sh
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ readonly CUDA_VERSION
readonly DEFAULT_NCCL_REPO_URL="${NVIDIA_BASE_DL_URL}/machine-learning/repos/ubuntu1804/x86_64/nvidia-machine-learning-repo-ubuntu1804_1.0.0-1_amd64.deb"
NCCL_REPO_URL=$(get_metadata_attribute 'nccl-repo-url' "${DEFAULT_NCCL_REPO_URL}")
readonly NCCL_REPO_URL
readonly NCCL_REPO_KEY="${NVIDIA_BASE_DL_URL}/machine-learning/repos/ubuntu1804/x86_64/7fa2af80.pub"

readonly DEFAULT_NCCL_VERSION="2.8.3"
readonly DEFAULT_NCCL_VERSION_ROCKY="2.8.4"
Expand All @@ -67,12 +68,12 @@ NVIDIA_DEBIAN_CUDA_URL=$(get_metadata_attribute 'cuda-url' "${DEFAULT_NVIDIA_DEB
readonly NVIDIA_DEBIAN_CUDA_URL

# Parameters for NVIDIA-provided Ubuntu GPU driver
readonly NVIDIA_UBUNTU_REPOSITORY_URL="${NVIDIA_BASE_DL_URL}/cuda/repos/ubuntu1804/x86_64"
readonly NVIDIA_UBUNTU_REPOSITORY_KEY="${NVIDIA_UBUNTU_REPOSITORY_URL}/7fa2af80.pub"
readonly NVIDIA_UBUNTU_REPOSITORY_CUDA_PIN="${NVIDIA_UBUNTU_REPOSITORY_URL}/cuda-ubuntu1804.pin"
readonly NVIDIA_UBUNTU_REPO_URL="${NVIDIA_BASE_DL_URL}/cuda/repos/ubuntu1804/x86_64"
readonly NVIDIA_UBUNTU_REPO_KEY="${NVIDIA_UBUNTU_REPO_URL}/3bf863cc.pub"
readonly NVIDIA_UBUNTU_REPO_CUDA_PIN="${NVIDIA_UBUNTU_REPO_URL}/cuda-ubuntu1804.pin"

# Parameter for NVIDIA-provided Rocky Linux GPU driver
readonly NVIDIA_ROCKY_REPOSITORY_URL="${NVIDIA_BASE_DL_URL}/cuda/repos/rhel8/x86_64/cuda-rhel8.repo"
readonly NVIDIA_ROCKY_REPO_URL="${NVIDIA_BASE_DL_URL}/cuda/repos/rhel8/x86_64/cuda-rhel8.repo"

# Parameters for NVIDIA-provided CUDNN library
readonly CUDNN_VERSION=$(get_metadata_attribute 'cudnn-version' '')
Expand Down Expand Up @@ -111,6 +112,8 @@ function install_nvidia_nccl() {
if [[ ${OS_NAME} == rocky ]]; then
execute_with_retries "dnf -y -q install libnccl-${nccl_version} libnccl-devel-${nccl_version} libnccl-static-${nccl_version}"
elif [[ ${OS_NAME} == ubuntu ]] || [[ ${OS_NAME} == debian ]]; then
curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 "${NCCL_REPO_KEY}" | apt-key add -

local tmp_dir
tmp_dir=$(mktemp -d -t gpu-init-action-nccl-XXXX)

Expand Down Expand Up @@ -171,7 +174,7 @@ EOF
function install_nvidia_gpu_driver() {
if [[ ${OS_NAME} == debian ]]; then
curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \
"${NVIDIA_UBUNTU_REPOSITORY_KEY}" | apt-key add -
"${NVIDIA_UBUNTU_REPO_KEY}" | apt-key add -
curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \
"${NVIDIA_DEBIAN_GPU_DRIVER_URL}" -o driver.run
bash "./driver.run" --silent --install-libglvnd
Expand All @@ -181,11 +184,11 @@ function install_nvidia_gpu_driver() {
bash "./cuda.run" --silent --toolkit --no-opengl-libs
elif [[ ${OS_NAME} == ubuntu ]]; then
curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \
"${NVIDIA_UBUNTU_REPOSITORY_KEY}" | apt-key add -
"${NVIDIA_UBUNTU_REPO_KEY}" | apt-key add -
curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \
"${NVIDIA_UBUNTU_REPOSITORY_CUDA_PIN}" -o /etc/apt/preferences.d/cuda-repository-pin-600
"${NVIDIA_UBUNTU_REPO_CUDA_PIN}" -o /etc/apt/preferences.d/cuda-repository-pin-600

add-apt-repository "deb ${NVIDIA_UBUNTU_REPOSITORY_URL} /"
add-apt-repository "deb ${NVIDIA_UBUNTU_REPO_URL} /"
execute_with_retries "apt-get update"

if [[ -n "${CUDA_VERSION}" ]]; then
Expand All @@ -197,7 +200,7 @@ function install_nvidia_gpu_driver() {
execute_with_retries "apt-get install -y -q --no-install-recommends cuda-drivers-460"
execute_with_retries "apt-get install -y -q --no-install-recommends ${cuda_package}"
elif [[ ${OS_NAME} == rocky ]]; then
execute_with_retries "dnf config-manager --add-repo ${NVIDIA_ROCKY_REPOSITORY_URL}"
execute_with_retries "dnf config-manager --add-repo ${NVIDIA_ROCKY_REPO_URL}"
execute_with_retries "dnf clean all"
execute_with_retries "dnf -y -q module install nvidia-driver:460-dkms"
execute_with_retries "dnf -y -q install cuda-${CUDA_VERSION//./-}"
Expand Down
29 changes: 8 additions & 21 deletions rapids/rapids.sh
Original file line number Diff line number Diff line change
Expand Up @@ -8,17 +8,17 @@ function get_metadata_attribute() {
/usr/share/google/get_metadata_value "attributes/${attribute_name}" || echo -n "${default_value}"
}

readonly DEFAULT_RAPIDS_VERSION="22.02"
readonly DEFAULT_RAPIDS_VERSION="22.04"
readonly RAPIDS_VERSION=$(get_metadata_attribute 'rapids-version' ${DEFAULT_RAPIDS_VERSION})

readonly SPARK_VERSION_ENV=$(spark-submit --version 2>&1 | sed -n 's/.*version[[:blank:]]\+\([0-9]\+\.[0-9]\).*/\1/p' | head -n1)
readonly DEFAULT_SPARK_RAPIDS_VERSION="22.02.0"
readonly DEFAULT_SPARK_RAPIDS_VERSION="22.04.0"

if [[ "${SPARK_VERSION_ENV}" == "3"* ]]; then
readonly DEFAULT_CUDA_VERSION="11.0"
readonly DEFAULT_CUDF_VERSION="22.02.0"
readonly DEFAULT_CUDA_VERSION="11.2"
readonly DEFAULT_CUDF_VERSION="22.04.0"
readonly DEFAULT_XGBOOST_VERSION="1.4.2"
readonly DEFAULT_XGBOOST_GPU_SUB_VERSION="0.2.0"
readonly DEFAULT_XGBOOST_GPU_SUB_VERSION="0.3.0"
# TODO: uncomment once Spark 3.1 jars are released. RAPIDS works with Spark 3.1; the pinned version below is used only for the Maven URL.
# readonly SPARK_VERSION="${SPARK_VERSION_ENV}"
readonly SPARK_VERSION="3.0"
Expand Down Expand Up @@ -65,22 +65,9 @@ function execute_with_retries() {
}

function install_dask_rapids() {
local base
base=$(conda info --base)
local -r mamba_env=mamba

# Using mamba significantly reduces the conda solve-time. Create a separate conda
# environment with mamba installed to manage installations.
conda create -y -n ${mamba_env} -c conda-forge mamba

# Install RAPIDS, cudatoolkit. Use mamba in new env to resolve base environment
${base}/envs/${mamba_env}/bin/mamba install -y \
-c "rapidsai" -c "nvidia" -c "conda-forge" -c "defaults" \
"cudatoolkit=${CUDA_VERSION}" "rapids=${RAPIDS_VERSION}" "dask-sql" \
-p ${base}

# Remove mamba env
conda env remove -n ${mamba_env}
# Install RAPIDS, cudatoolkit
mamba install -y --no-channel-priority -c 'conda-forge' -c 'nvidia' -c 'rapidsai' \
"cudatoolkit=${CUDA_VERSION}" "rapids=${RAPIDS_VERSION}"
}

function install_spark_rapids() {
Expand Down