From bbcebf753e32193fbc007f6a3073571e9458af9e Mon Sep 17 00:00:00 2001
From: "Robert (Bobby) Evans"
Date: Thu, 11 Jun 2020 14:18:44 -0500
Subject: [PATCH 1/4] Moved cudf to 0.14 for CI

---
 integration_tests/README.md     | 4 ++--
 jenkins/Jenkinsfile.integration | 2 +-
 jenkins/Jenkinsfile.nightly     | 4 ++--
 jenkins/spark-tests.sh          | 2 +-
 4 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/integration_tests/README.md b/integration_tests/README.md
index 359e1258148..ab3a08716d9 100644
--- a/integration_tests/README.md
+++ b/integration_tests/README.md
@@ -39,7 +39,7 @@ Most clusters probably will not have the RAPIDS plugin installed in the cluster
 If just want to verify the SQL replacement is working you will need to add the `rapids-4-spark` and `cudf` jars to your `spark-submit` command.
 
 ```
-$SPARK_HOME/bin/spark-submit --jars "rapids-4-spark_2.12-0.1-SNAPSHOT.jar,cudf-0.14-SNAPSHOT.jar" ./runtests.py
+$SPARK_HOME/bin/spark-submit --jars "rapids-4-spark_2.12-0.1-SNAPSHOT.jar,cudf-0.14.jar" ./runtests.py
 ```
 
 You don't have to enable the plugin for this to work, the test framework will do that for you.
@@ -70,7 +70,7 @@ The TPCxBB, TPCH, and Mortgage tests in this framework can be enabled by providi
 As an example, here is the `spark-submit` command with the TPCxBB parameters:
 
 ```
-$SPARK_HOME/bin/spark-submit --jars "rapids-4-spark_2.12-0.1-SNAPSHOT.jar,cudf-0.14-SNAPSHOT.jar,rapids-4-spark-tests_2.12-0.1-SNAPSHOT.jar" ./runtests.py --tpcxbb_format="csv" --tpcxbb_path="/path/to/tpcxbb/csv"
+$SPARK_HOME/bin/spark-submit --jars "rapids-4-spark_2.12-0.1-SNAPSHOT.jar,cudf-0.14.jar,rapids-4-spark-tests_2.12-0.1-SNAPSHOT.jar" ./runtests.py --tpcxbb_format="csv" --tpcxbb_path="/path/to/tpcxbb/csv"
 ```
 
 ## Writing tests
diff --git a/jenkins/Jenkinsfile.integration b/jenkins/Jenkinsfile.integration
index 56159b56f63..a53afaa9a66 100644
--- a/jenkins/Jenkinsfile.integration
+++ b/jenkins/Jenkinsfile.integration
@@ -32,7 +32,7 @@ pipeline {
     }
 
     parameters {
-        string(name: 'CUDF_VER', defaultValue: '0.14-SNAPSHOT',
+        string(name: 'CUDF_VER', defaultValue: '0.14',
             description: '-Dcudf.version= \n\n Default for cudf version')
         string(name: 'CUDA_CLASSIFIER', defaultValue: '',
             description: '-Dclassifier=\n\n cuda10-1, cuda10-2, EMPTY as cuda10-1')
diff --git a/jenkins/Jenkinsfile.nightly b/jenkins/Jenkinsfile.nightly
index f8f23e06f43..f6e89a93223 100644
--- a/jenkins/Jenkinsfile.nightly
+++ b/jenkins/Jenkinsfile.nightly
@@ -63,7 +63,7 @@ pipeline {
                     -v ${HOME}/.zinc:${HOME}/.zinc:rw \
                     -v /etc/passwd:/etc/passwd -v /etc/group:/etc/group") {
                     sh "mvn -U -B clean deploy $MVN_URM_MIRROR"
-                    sh "jenkins/printJarVersion.sh 'CUDFVersion' '${HOME}/.m2/repository/ai/rapids/cudf/0.14-SNAPSHOT' 'cudf-0.14-' '-cuda10-1.jar'"
+                    sh "jenkins/printJarVersion.sh 'CUDFVersion' '${HOME}/.m2/repository/ai/rapids/cudf/0.14' 'cudf-0.14' '-cuda10-1.jar'"
                     sh "jenkins/printJarVersion.sh 'SPARKVersion' '${HOME}/.m2/repository/org/apache/spark/spark-core_2.12/3.0.1-SNAPSHOT' 'spark-core_2.12-3.0.1-' '.jar'"
                 }
             }
@@ -78,7 +78,7 @@ pipeline {
                 build(job: 'spark/rapids_integration-0.1-github',
                     propagate: false,
                     parameters: [string(name: 'REF', value: 'branch-0.1'),
-                        string(name: 'CUDF_VER', value: '0.14-SNAPSHOT'),
+                        string(name: 'CUDF_VER', value: '0.14'),
                         booleanParam(name: 'BUILD_CENTOS7', value: false),])
 
                 slack("#rapidsai-spark-cicd", "Success", color: "#33CC33")
diff --git a/jenkins/spark-tests.sh b/jenkins/spark-tests.sh
index 58777b03056..4e3a89decae 100755
--- a/jenkins/spark-tests.sh
+++ b/jenkins/spark-tests.sh
@@ -17,7 +17,7 @@
 set -ex
 
 if [ "$CUDF_VER"x == x ];then
-    CUDF_VER="0.14-SNAPSHOT"
+    CUDF_VER="0.14"
 fi
 
 if [ "$PROJECT_VER"x == x ];then
From b90cf84b063bb380bc183dc1bedf21ae85e99015 Mon Sep 17 00:00:00 2001
From: "Robert (Bobby) Evans"
Date: Thu, 11 Jun 2020 15:43:24 -0500
Subject: [PATCH 2/4] Testing

---
 jenkins/spark-premerge-build.sh | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/jenkins/spark-premerge-build.sh b/jenkins/spark-premerge-build.sh
index 1bdcae374b0..bca4ce1f34b 100755
--- a/jenkins/spark-premerge-build.sh
+++ b/jenkins/spark-premerge-build.sh
@@ -17,6 +17,8 @@
 
 set -ex
 
+nvidia-smi
+
 if [ "$SPARK_VER"x == x ];then
     SPARK_VER="3.0.1-SNAPSHOT"
 fi

From 4ff261d7ec2ada74ca2c2ce9e9ef02d9e7188be0 Mon Sep 17 00:00:00 2001
From: "Robert (Bobby) Evans"
Date: Thu, 11 Jun 2020 16:01:37 -0500
Subject: [PATCH 3/4] More tests

---
 .../src/main/scala/ai/rapids/spark/GpuDeviceManager.scala | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/sql-plugin/src/main/scala/ai/rapids/spark/GpuDeviceManager.scala b/sql-plugin/src/main/scala/ai/rapids/spark/GpuDeviceManager.scala
index c94e93c7a6e..e67d649fcfe 100644
--- a/sql-plugin/src/main/scala/ai/rapids/spark/GpuDeviceManager.scala
+++ b/sql-plugin/src/main/scala/ai/rapids/spark/GpuDeviceManager.scala
@@ -53,10 +53,12 @@ object GpuDeviceManager extends Logging {
 
   // Attempt to set and acquire the gpu, return true if acquired, false otherwise
   def tryToSetGpuDeviceAndAcquire(addr: Int): Boolean = {
+    logError(s"TRYING TO GET GPU $addr")
     try {
       GpuDeviceManager.setGpuDeviceAndAcquire(addr)
     } catch {
       case NonFatal(e) =>
+        logError("COULD NOT GET IT", e)
         // we may have lost a race trying to acquire this addr or GPU is already busy
         return false
     }
@@ -70,9 +72,10 @@
    */
   private def findGpuAndAcquire(): Int = {
     val deviceCount: Int = Cuda.getDeviceCount()
+    logError(s"FOUND $deviceCount GPUs")
     // loop multiple times to see if a GPU was released or something unexpected happened that
     // we couldn't acquire on first try
-    var numRetries = 2
+    var numRetries = 10
     val addrsToTry = ArrayBuffer.empty ++= (0 to (deviceCount - 1))
     while (numRetries > 0) {
       val addr = addrsToTry.find(tryToSetGpuDeviceAndAcquire)
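Patches 2 and 3 above are temporary instrumentation: `nvidia-smi` verifies the CI node can see a GPU at all, and the extra `logError` calls plus the retry bump from 2 to 10 trace how executors race to claim devices. For reference, the helper being instrumented follows a plain acquire-or-report pattern. Below is a minimal, self-contained Scala sketch of that pattern; the `setGpuDeviceAndAcquire` stub here is a hypothetical stand-in for the real cudf-backed call, which sets the CUDA device for the thread and throws if the GPU cannot be claimed:

```scala
import scala.util.control.NonFatal

object AcquireSketch {
  // Hypothetical stub: the real GpuDeviceManager.setGpuDeviceAndAcquire sets
  // the CUDA device and throws if the GPU is already taken.
  def setGpuDeviceAndAcquire(addr: Int): Int = addr

  // Returns true only if the GPU at `addr` was actually claimed; any
  // non-fatal failure is reported as a plain false so the caller can move on.
  def tryToSetGpuDeviceAndAcquire(addr: Int): Boolean = {
    try {
      setGpuDeviceAndAcquire(addr)
      true
    } catch {
      case NonFatal(_) =>
        // we may have lost a race for this addr, or the GPU is already busy
        false
    }
  }
}
```

Swallowing `NonFatal` here is deliberate: losing the race for one device is expected during startup, so the failure is a signal to try the next address rather than an error.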
From 2d9499bc0ee020bab03938e1c606ce7f75e202e5 Mon Sep 17 00:00:00 2001
From: "Robert (Bobby) Evans"
Date: Thu, 11 Jun 2020 16:24:05 -0500
Subject: [PATCH 4/4] Cleanup debug code

---
 .../src/main/scala/ai/rapids/spark/GpuDeviceManager.scala | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/sql-plugin/src/main/scala/ai/rapids/spark/GpuDeviceManager.scala b/sql-plugin/src/main/scala/ai/rapids/spark/GpuDeviceManager.scala
index e67d649fcfe..d6ba6d3113d 100644
--- a/sql-plugin/src/main/scala/ai/rapids/spark/GpuDeviceManager.scala
+++ b/sql-plugin/src/main/scala/ai/rapids/spark/GpuDeviceManager.scala
@@ -53,12 +53,11 @@ object GpuDeviceManager extends Logging {
 
   // Attempt to set and acquire the gpu, return true if acquired, false otherwise
   def tryToSetGpuDeviceAndAcquire(addr: Int): Boolean = {
-    logError(s"TRYING TO GET GPU $addr")
     try {
       GpuDeviceManager.setGpuDeviceAndAcquire(addr)
     } catch {
       case NonFatal(e) =>
-        logError("COULD NOT GET IT", e)
+        logInfo(s"Will not use GPU $addr because of $e")
         // we may have lost a race trying to acquire this addr or GPU is already busy
         return false
     }
@@ -72,10 +71,9 @@
    */
   private def findGpuAndAcquire(): Int = {
     val deviceCount: Int = Cuda.getDeviceCount()
-    logError(s"FOUND $deviceCount GPUs")
     // loop multiple times to see if a GPU was released or something unexpected happened that
     // we couldn't acquire on first try
-    var numRetries = 10
+    var numRetries = 2
     val addrsToTry = ArrayBuffer.empty ++= (0 to (deviceCount - 1))
     while (numRetries > 0) {
       val addr = addrsToTry.find(tryToSetGpuDeviceAndAcquire)
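With the debug output removed, device discovery settles back to a bounded retry scan over all visible addresses. A rough Scala sketch of that loop, assuming a predicate like `tryToSetGpuDeviceAndAcquire` above (the real method's behavior when every retry is exhausted is not visible in this diff, so the final exception is illustrative only):

```scala
import scala.collection.mutable.ArrayBuffer

object RetrySketch {
  // Scan every visible device, looping a couple of times in case a GPU is
  // released between passes or a first attempt failed unexpectedly.
  def findGpuAndAcquire(deviceCount: Int, tryAcquire: Int => Boolean): Int = {
    var numRetries = 2
    val addrsToTry = ArrayBuffer.empty[Int] ++= (0 until deviceCount)
    while (numRetries > 0) {
      addrsToTry.find(tryAcquire) match {
        case Some(addr) => return addr     // first device we managed to claim
        case None       => numRetries -= 1 // nothing free this pass; go around
      }
    }
    // Illustrative failure path only; not shown in the diff above.
    throw new IllegalStateException(s"Could not acquire any of $deviceCount GPUs")
  }
}
```

Restoring `numRetries` to 2 keeps executor startup fast in the common case while still tolerating one transient loss of the race.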