From bbcebf753e32193fbc007f6a3073571e9458af9e Mon Sep 17 00:00:00 2001
From: "Robert (Bobby) Evans"
Date: Thu, 11 Jun 2020 14:18:44 -0500
Subject: [PATCH 1/4] Moved cudf to 0.14 for CI

---
 integration_tests/README.md     | 4 ++--
 jenkins/Jenkinsfile.integration | 2 +-
 jenkins/Jenkinsfile.nightly     | 4 ++--
 jenkins/spark-tests.sh          | 2 +-
 4 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/integration_tests/README.md b/integration_tests/README.md
index 359e1258148..ab3a08716d9 100644
--- a/integration_tests/README.md
+++ b/integration_tests/README.md
@@ -39,7 +39,7 @@ Most clusters probably will not have the RAPIDS plugin installed in the cluster
 If just want to verify the SQL replacement is working you will need to add the `rapids-4-spark` and `cudf` jars to your `spark-submit` command.
 
 ```
-$SPARK_HOME/bin/spark-submit --jars "rapids-4-spark_2.12-0.1-SNAPSHOT.jar,cudf-0.14-SNAPSHOT.jar" ./runtests.py
+$SPARK_HOME/bin/spark-submit --jars "rapids-4-spark_2.12-0.1-SNAPSHOT.jar,cudf-0.14.jar" ./runtests.py
 ```
 
 You don't have to enable the plugin for this to work, the test framework will do that for you.
@@ -70,7 +70,7 @@ The TPCxBB, TPCH, and Mortgage tests in this framework can be enabled by providi
 As an example, here is the `spark-submit` command with the TPCxBB parameters:
 
 ```
-$SPARK_HOME/bin/spark-submit --jars "rapids-4-spark_2.12-0.1-SNAPSHOT.jar,cudf-0.14-SNAPSHOT.jar,rapids-4-spark-tests_2.12-0.1-SNAPSHOT.jar" ./runtests.py --tpcxbb_format="csv" --tpcxbb_path="/path/to/tpcxbb/csv"
+$SPARK_HOME/bin/spark-submit --jars "rapids-4-spark_2.12-0.1-SNAPSHOT.jar,cudf-0.14.jar,rapids-4-spark-tests_2.12-0.1-SNAPSHOT.jar" ./runtests.py --tpcxbb_format="csv" --tpcxbb_path="/path/to/tpcxbb/csv"
 ```
 
 ## Writing tests
diff --git a/jenkins/Jenkinsfile.integration b/jenkins/Jenkinsfile.integration
index 56159b56f63..a53afaa9a66 100644
--- a/jenkins/Jenkinsfile.integration
+++ b/jenkins/Jenkinsfile.integration
@@ -32,7 +32,7 @@ pipeline {
     }
 
     parameters {
-        string(name: 'CUDF_VER', defaultValue: '0.14-SNAPSHOT',
+        string(name: 'CUDF_VER', defaultValue: '0.14',
             description: '-Dcudf.version= \n\n Default for cudf version')
         string(name: 'CUDA_CLASSIFIER', defaultValue: '',
             description: '-Dclassifier=\n\n cuda10-1, cuda10-2, EMPTY as cuda10-1')
diff --git a/jenkins/Jenkinsfile.nightly b/jenkins/Jenkinsfile.nightly
index f8f23e06f43..f6e89a93223 100644
--- a/jenkins/Jenkinsfile.nightly
+++ b/jenkins/Jenkinsfile.nightly
@@ -63,7 +63,7 @@ pipeline {
                     -v ${HOME}/.zinc:${HOME}/.zinc:rw \
                     -v /etc/passwd:/etc/passwd -v /etc/group:/etc/group") {
                     sh "mvn -U -B clean deploy $MVN_URM_MIRROR"
-                    sh "jenkins/printJarVersion.sh 'CUDFVersion' '${HOME}/.m2/repository/ai/rapids/cudf/0.14-SNAPSHOT' 'cudf-0.14-' '-cuda10-1.jar'"
+                    sh "jenkins/printJarVersion.sh 'CUDFVersion' '${HOME}/.m2/repository/ai/rapids/cudf/0.14' 'cudf-0.14' '-cuda10-1.jar'"
                     sh "jenkins/printJarVersion.sh 'SPARKVersion' '${HOME}/.m2/repository/org/apache/spark/spark-core_2.12/3.0.1-SNAPSHOT' 'spark-core_2.12-3.0.1-' '.jar'"
                 }
             }
@@ -78,7 +78,7 @@ pipeline {
                 build(job: 'spark/rapids_integration-0.1-github',
                     propagate: false,
                     parameters: [string(name: 'REF', value: 'branch-0.1'),
-                        string(name: 'CUDF_VER', value: '0.14-SNAPSHOT'),
+                        string(name: 'CUDF_VER', value: '0.14'),
                         booleanParam(name: 'BUILD_CENTOS7', value: false),])
 
                 slack("#rapidsai-spark-cicd", "Success", color: "#33CC33")
diff --git a/jenkins/spark-tests.sh b/jenkins/spark-tests.sh
index 58777b03056..4e3a89decae 100755
--- a/jenkins/spark-tests.sh
+++ b/jenkins/spark-tests.sh
@@ -17,7 +17,7 @@
 set -ex
 
 if [ "$CUDF_VER"x == x ];then
-    CUDF_VER="0.14-SNAPSHOT"
+    CUDF_VER="0.14"
 fi
 
 if [ "$PROJECT_VER"x == x ];then
From b90cf84b063bb380bc183dc1bedf21ae85e99015 Mon Sep 17 00:00:00 2001
From: "Robert (Bobby) Evans"
Date: Thu, 11 Jun 2020 15:43:24 -0500
Subject: [PATCH 2/4] Testing

---
 jenkins/spark-premerge-build.sh | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/jenkins/spark-premerge-build.sh b/jenkins/spark-premerge-build.sh
index 1bdcae374b0..bca4ce1f34b 100755
--- a/jenkins/spark-premerge-build.sh
+++ b/jenkins/spark-premerge-build.sh
@@ -17,6 +17,8 @@
 
 set -ex
 
+nvidia-smi
+
 if [ "$SPARK_VER"x == x ];then
     SPARK_VER="3.0.1-SNAPSHOT"
 fi

From 4ff261d7ec2ada74ca2c2ce9e9ef02d9e7188be0 Mon Sep 17 00:00:00 2001
From: "Robert (Bobby) Evans"
Date: Thu, 11 Jun 2020 16:01:37 -0500
Subject: [PATCH 3/4] More tests

---
 .../src/main/scala/ai/rapids/spark/GpuDeviceManager.scala | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/sql-plugin/src/main/scala/ai/rapids/spark/GpuDeviceManager.scala b/sql-plugin/src/main/scala/ai/rapids/spark/GpuDeviceManager.scala
index c94e93c7a6e..e67d649fcfe 100644
--- a/sql-plugin/src/main/scala/ai/rapids/spark/GpuDeviceManager.scala
+++ b/sql-plugin/src/main/scala/ai/rapids/spark/GpuDeviceManager.scala
@@ -53,10 +53,12 @@ object GpuDeviceManager extends Logging {
 
   // Attempt to set and acquire the gpu, return true if acquired, false otherwise
   def tryToSetGpuDeviceAndAcquire(addr: Int): Boolean = {
+    logError(s"TRYING TO GET GPU $addr")
     try {
       GpuDeviceManager.setGpuDeviceAndAcquire(addr)
     } catch {
       case NonFatal(e) =>
+        logError("COULD NOT GET IT", e)
         // we may have lost a race trying to acquire this addr or GPU is already busy
         return false
     }
@@ -70,9 +72,10 @@
    */
   private def findGpuAndAcquire(): Int = {
     val deviceCount: Int = Cuda.getDeviceCount()
+    logError(s"FOUND $deviceCount GPUs")
     // loop multiple times to see if a GPU was released or something unexpected happened that
     // we couldn't acquire on first try
-    var numRetries = 2
+    var numRetries = 10
     val addrsToTry = ArrayBuffer.empty ++= (0 to (deviceCount - 1))
     while (numRetries > 0) {
       val addr = addrsToTry.find(tryToSetGpuDeviceAndAcquire)
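Patches 2 and 3 above are temporary instrumentation: `nvidia-smi` verifies the CI node can see a GPU at all, and the extra `logError` calls plus the retry bump from 2 to 10 trace how executors race to claim devices. For reference, the helper being instrumented follows a plain acquire-or-report pattern. Below is a minimal, self-contained Scala sketch of that pattern; the `setGpuDeviceAndAcquire` stub here is a hypothetical stand-in for the real cudf-backed call, which sets the CUDA device for the thread and throws if the GPU cannot be claimed:

```scala
import scala.util.control.NonFatal

object AcquireSketch {
  // Hypothetical stub: the real GpuDeviceManager.setGpuDeviceAndAcquire sets
  // the CUDA device and throws if the GPU is already taken.
  def setGpuDeviceAndAcquire(addr: Int): Int = addr

  // Returns true only if the GPU at `addr` was actually claimed; any
  // non-fatal failure is reported as a plain false so the caller can move on.
  def tryToSetGpuDeviceAndAcquire(addr: Int): Boolean = {
    try {
      setGpuDeviceAndAcquire(addr)
      true
    } catch {
      case NonFatal(_) =>
        // we may have lost a race for this addr, or the GPU is already busy
        false
    }
  }
}
```

Swallowing `NonFatal` here is deliberate: losing the race for one device is expected during startup, so the failure is a signal to try the next address rather than an error.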
From 2d9499bc0ee020bab03938e1c606ce7f75e202e5 Mon Sep 17 00:00:00 2001
From: "Robert (Bobby) Evans"
Date: Thu, 11 Jun 2020 16:24:05 -0500
Subject: [PATCH 4/4] Cleanup debug code

---
 .../src/main/scala/ai/rapids/spark/GpuDeviceManager.scala | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/sql-plugin/src/main/scala/ai/rapids/spark/GpuDeviceManager.scala b/sql-plugin/src/main/scala/ai/rapids/spark/GpuDeviceManager.scala
index e67d649fcfe..d6ba6d3113d 100644
--- a/sql-plugin/src/main/scala/ai/rapids/spark/GpuDeviceManager.scala
+++ b/sql-plugin/src/main/scala/ai/rapids/spark/GpuDeviceManager.scala
@@ -53,12 +53,11 @@ object GpuDeviceManager extends Logging {
 
   // Attempt to set and acquire the gpu, return true if acquired, false otherwise
   def tryToSetGpuDeviceAndAcquire(addr: Int): Boolean = {
-    logError(s"TRYING TO GET GPU $addr")
     try {
       GpuDeviceManager.setGpuDeviceAndAcquire(addr)
     } catch {
       case NonFatal(e) =>
-        logError("COULD NOT GET IT", e)
+        logInfo(s"Will not use GPU $addr because of $e")
         // we may have lost a race trying to acquire this addr or GPU is already busy
         return false
     }
@@ -72,10 +71,9 @@
    */
   private def findGpuAndAcquire(): Int = {
     val deviceCount: Int = Cuda.getDeviceCount()
-    logError(s"FOUND $deviceCount GPUs")
     // loop multiple times to see if a GPU was released or something unexpected happened that
     // we couldn't acquire on first try
-    var numRetries = 10
+    var numRetries = 2
     val addrsToTry = ArrayBuffer.empty ++= (0 to (deviceCount - 1))
     while (numRetries > 0) {
       val addr = addrsToTry.find(tryToSetGpuDeviceAndAcquire)
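With the debug output removed, device discovery settles back to a bounded retry scan over all visible addresses. A rough Scala sketch of that loop, assuming a predicate like `tryToSetGpuDeviceAndAcquire` above (the real method's behavior when every retry is exhausted is not visible in this diff, so the final exception is illustrative only):

```scala
import scala.collection.mutable.ArrayBuffer

object RetrySketch {
  // Scan every visible device, looping a couple of times in case a GPU is
  // released between passes or a first attempt failed unexpectedly.
  def findGpuAndAcquire(deviceCount: Int, tryAcquire: Int => Boolean): Int = {
    var numRetries = 2
    val addrsToTry = ArrayBuffer.empty[Int] ++= (0 until deviceCount)
    while (numRetries > 0) {
      addrsToTry.find(tryAcquire) match {
        case Some(addr) => return addr     // first device we managed to claim
        case None       => numRetries -= 1 // nothing free this pass; go around
      }
    }
    // Illustrative failure path only; not shown in the diff above.
    throw new IllegalStateException(s"Could not acquire any of $deviceCount GPUs")
  }
}
```

Restoring `numRetries` to 2 keeps executor startup fast in the common case while still tolerating one transient loss of the race.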