From 7a913e2e195681d5b34bc1da0cc414229e9f1d9c Mon Sep 17 00:00:00 2001 From: "Robert (Bobby) Evans" Date: Wed, 9 Sep 2020 09:56:45 -0500 Subject: [PATCH 1/2] Fix 3.0.1 shim to be released Signed-off-by: Robert (Bobby) Evans --- docs/configs.md | 2 +- shims/pom.xml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/configs.md b/docs/configs.md index 736760a9eee..df4e23bf317 100644 --- a/docs/configs.md +++ b/docs/configs.md @@ -57,7 +57,7 @@ Name | Description | Default Value spark.rapids.sql.format.parquet.enabled|When set to false disables all parquet input and output acceleration|true spark.rapids.sql.format.parquet.multiThreadedRead.enabled|When set to true, reads multiple small files within a partition more efficiently by reading each file in a separate thread in parallel on the CPU side before sending to the GPU. Limited by spark.rapids.sql.format.parquet.multiThreadedRead.numThreads and spark.rapids.sql.format.parquet.multiThreadedRead.maxNumFileProcessed|true spark.rapids.sql.format.parquet.multiThreadedRead.maxNumFilesParallel|A limit on the maximum number of files per task processed in parallel on the CPU side before the file is sent to the GPU. This affects the amount of host memory used when reading the files in parallel.|2147483647 -spark.rapids.sql.format.parquet.multiThreadedRead.numThreads|The maximum number of threads, on the executor, to use for reading small parquet files in parallel.|20 +spark.rapids.sql.format.parquet.multiThreadedRead.numThreads|The maximum number of threads, on the executor, to use for reading small parquet files in parallel. This can not be changed at runtime after the executor hasstarted.|20 spark.rapids.sql.format.parquet.read.enabled|When set to false disables parquet input acceleration|true spark.rapids.sql.format.parquet.write.enabled|When set to false disables parquet output acceleration|true spark.rapids.sql.hasNans|Config to indicate if your data has NaN's. Cudf doesn't currently support NaN's properly so you can get corrupt data if you have NaN's in your data and it runs on the GPU.|true diff --git a/shims/pom.xml b/shims/pom.xml index 5946987def5..fcd3957344e 100644 --- a/shims/pom.xml +++ b/shims/pom.xml @@ -45,7 +45,6 @@ true - spark301 spark302 spark310 @@ -54,6 +53,7 @@ spark300 + spark301 aggregator From 02a5817d2272156239d5fcc4f1bf7202fa67a91a Mon Sep 17 00:00:00 2001 From: "Robert (Bobby) Evans" Date: Wed, 9 Sep 2020 10:41:25 -0500 Subject: [PATCH 2/2] Fix escaped doc change and missing space --- docs/configs.md | 2 +- .../src/main/scala/com/nvidia/spark/rapids/RapidsConf.scala | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/configs.md b/docs/configs.md index df4e23bf317..c8c5ec517b0 100644 --- a/docs/configs.md +++ b/docs/configs.md @@ -57,7 +57,7 @@ Name | Description | Default Value spark.rapids.sql.format.parquet.enabled|When set to false disables all parquet input and output acceleration|true spark.rapids.sql.format.parquet.multiThreadedRead.enabled|When set to true, reads multiple small files within a partition more efficiently by reading each file in a separate thread in parallel on the CPU side before sending to the GPU. Limited by spark.rapids.sql.format.parquet.multiThreadedRead.numThreads and spark.rapids.sql.format.parquet.multiThreadedRead.maxNumFileProcessed|true spark.rapids.sql.format.parquet.multiThreadedRead.maxNumFilesParallel|A limit on the maximum number of files per task processed in parallel on the CPU side before the file is sent to the GPU. This affects the amount of host memory used when reading the files in parallel.|2147483647 -spark.rapids.sql.format.parquet.multiThreadedRead.numThreads|The maximum number of threads, on the executor, to use for reading small parquet files in parallel. This can not be changed at runtime after the executor hasstarted.|20 +spark.rapids.sql.format.parquet.multiThreadedRead.numThreads|The maximum number of threads, on the executor, to use for reading small parquet files in parallel. This can not be changed at runtime after the executor has started.|20 spark.rapids.sql.format.parquet.read.enabled|When set to false disables parquet input acceleration|true spark.rapids.sql.format.parquet.write.enabled|When set to false disables parquet output acceleration|true spark.rapids.sql.hasNans|Config to indicate if your data has NaN's. Cudf doesn't currently support NaN's properly so you can get corrupt data if you have NaN's in your data and it runs on the GPU.|true diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsConf.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsConf.scala index 4e78991fc93..f8de4941962 100644 --- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsConf.scala +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsConf.scala @@ -459,7 +459,7 @@ object RapidsConf { val PARQUET_MULTITHREAD_READ_NUM_THREADS = conf("spark.rapids.sql.format.parquet.multiThreadedRead.numThreads") .doc("The maximum number of threads, on the executor, to use for reading small " + - "parquet files in parallel. This can not be changed at runtime after the executor has" + + "parquet files in parallel. This can not be changed at runtime after the executor has " + "started.") .integerConf .createWithDefault(20)