Add Spark 3.0 EMR Shim layer #827

Merged
merged 10 commits on Sep 22, 2020
Changes from 7 commits
5 changes: 5 additions & 0 deletions integration_tests/README.md
@@ -58,6 +58,11 @@ All of the tests will run in a single application. They just enable and disable

You do need to have access to a compatible GPU with the needed CUDA drivers. The exact details of how to set this up are beyond the scope of this document, but the Spark feature for scheduling GPUs does make this very simple if you have it configured.

### Runtime Environment

`--runtime_env` is used to specify the environment you are running the tests in. Valid values are `databricks` and `emr`. This is generally used when certain environments have different behavior.
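For example, when running against EMR you would pass `--runtime_env=emr` to the test runner (e.g. appended to the `spark-submit ./runtests.py` invocation, if that is how you launch the tests).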

### timezone

The RAPIDS plugin currently only supports the UTC time zone. Spark uses the default system time zone unless explicitly set otherwise.
3 changes: 3 additions & 0 deletions integration_tests/src/main/python/conftest.py
@@ -59,6 +59,9 @@ def is_apache_runtime():
def is_databricks_runtime():
return runtime_env() == "databricks"

def is_emr_runtime():
return runtime_env() == "emr"

_limit = -1

def get_limit():
3 changes: 2 additions & 1 deletion integration_tests/src/main/python/csv_test.py
@@ -175,7 +175,8 @@ def test_csv_fallback(spark_tmp_path, read_func, disable_conf):
assert_gpu_fallback_collect(
lambda spark : reader(spark).select(f.col('*'), f.col('_c2') + f.col('_c3')),
'FileSourceScanExec',
conf={disable_conf: 'false'})
conf={disable_conf: 'false',
"spark.sql.sources.useV1SourceList": "csv"})

csv_supported_date_formats = ['yyyy-MM-dd', 'yyyy/MM/dd', 'yyyy-MM', 'yyyy/MM',
'MM-yyyy', 'MM/yyyy', 'MM-dd-yyyy', 'MM/dd/yyyy']
4 changes: 3 additions & 1 deletion integration_tests/src/main/python/join_test.py
@@ -15,7 +15,7 @@
import pytest
from pyspark.sql.functions import broadcast
from asserts import assert_gpu_and_cpu_are_equal_collect
from conftest import is_databricks_runtime
from conftest import is_databricks_runtime, is_emr_runtime
from data_gen import *
from marks import ignore_order, allow_non_gpu, incompat
from spark_session import with_spark_session, is_before_spark_310
@@ -165,6 +165,8 @@ def do_join(spark):

@ignore_order
@allow_non_gpu('DataWritingCommandExec')
@pytest.mark.xfail(condition=is_emr_runtime(),
reason='https://github.com/NVIDIA/spark-rapids/issues/821')
@pytest.mark.parametrize('repartition', ["true", "false"], ids=idfn)
def test_join_bucketed_table(repartition):
def do_join(spark):
3 changes: 2 additions & 1 deletion integration_tests/src/main/python/orc_test.py
@@ -57,7 +57,8 @@ def test_orc_fallback(spark_tmp_path, read_func, disable_conf):
assert_gpu_fallback_collect(
lambda spark : reader(spark).select(f.col('*'), f.col('_c2') + f.col('_c3')),
'FileSourceScanExec',
conf={disable_conf: 'false'})
conf={disable_conf: 'false',
"spark.sql.sources.useV1SourceList": "orc"})

@pytest.mark.parametrize('orc_gens', orc_gens_list, ids=idfn)
@pytest.mark.parametrize('read_func', [read_orc_df, read_orc_sql])
3 changes: 2 additions & 1 deletion integration_tests/src/main/python/parquet_test.py
@@ -62,7 +62,8 @@ def test_parquet_fallback(spark_tmp_path, read_func, disable_conf):
assert_gpu_fallback_collect(
lambda spark : reader(spark).select(f.col('*'), f.col('_c2') + f.col('_c3')),
'FileSourceScanExec',
conf={disable_conf: 'false'})
conf={disable_conf: 'false',
"spark.sql.sources.useV1SourceList": "parquet"})

parquet_compress_options = ['none', 'uncompressed', 'snappy', 'gzip']
# The following need extra jars 'lzo', 'lz4', 'brotli', 'zstd'
2 changes: 1 addition & 1 deletion integration_tests/src/main/python/spark_session.py
@@ -91,7 +91,7 @@ def with_gpu_session(func, conf={}):
return with_spark_session(func, conf=copy)

def is_spark_300():
return spark_version() == "3.0.0"
return (spark_version() == "3.0.0" or spark_version().startswith('3.0.0-amzn'))

def is_before_spark_310():
return spark_version() < "3.1.0"
1 change: 1 addition & 0 deletions pom.xml
@@ -168,6 +168,7 @@
<rat.consoleOutput>false</rat.consoleOutput>
<slf4j.version>1.7.30</slf4j.version>
<spark300.version>3.0.0</spark300.version>
<spark300emr.version>3.0.0-amzn</spark300emr.version>
<!--
If you update a dependendy version so it is no longer a SNAPSHOT
please update the snapshot-shims profile as well so it is accurate -->
6 changes: 6 additions & 0 deletions shims/aggregator/pom.xml
@@ -85,6 +85,12 @@
<version>${project.version}</version>
<scope>compile</scope>
</dependency>
<dependency>
<groupId>com.nvidia</groupId>
<artifactId>rapids-4-spark-shims-spark300emr_${scala.binary.version}</artifactId>
<version>${project.version}</version>
<scope>compile</scope>
</dependency>
<dependency>
<groupId>com.nvidia</groupId>
<artifactId>rapids-4-spark-shims-spark301_${scala.binary.version}</artifactId>
1 change: 1 addition & 0 deletions shims/pom.xml
@@ -53,6 +53,7 @@

<modules>
<module>spark300</module>
<module>spark300emr</module>
<module>spark301</module>
<module>aggregator</module>
</modules>
87 changes: 87 additions & 0 deletions shims/spark300emr/pom.xml
@@ -0,0 +1,87 @@
<?xml version="1.0" encoding="UTF-8"?>
<!--
Copyright (c) 2020, NVIDIA CORPORATION.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>

<parent>
<groupId>com.nvidia</groupId>
<artifactId>rapids-4-spark-shims_2.12</artifactId>
<version>0.3.0-SNAPSHOT</version>
<relativePath>../pom.xml</relativePath>
</parent>
<groupId>com.nvidia</groupId>
<artifactId>rapids-4-spark-shims-spark300emr_2.12</artifactId>
<name>RAPIDS Accelerator for Apache Spark SQL Plugin Spark 3.0.0 EMR Shim</name>
<description>The RAPIDS SQL plugin for Apache Spark 3.0.0 EMR Shim</description>
<version>0.3.0-SNAPSHOT</version>

<!-- Set 'spark.version' for the shims layer -->
<!-- Create a separate file 'SPARK_VER.properties' in the jar to save cudf & spark version info -->
<build>
<plugins>
<plugin>
<artifactId>maven-antrun-plugin</artifactId>
<executions>
<execution>
<id>dependency</id>
<phase>generate-resources</phase>
<configuration>
<target>
<mkdir dir="${project.build.directory}/extra-resources"/>
<exec executable="bash" output="${project.build.directory}/extra-resources/spark-${spark300emr.version}-info.properties">
<arg value="${user.dir}/build/dependency-info.sh"/>
<arg value="${cudf.version}"/>
<arg value="${cuda.version}"/>
<arg value="${spark300emr.version}"/>
</exec>
</target>
</configuration>
<goals>
<goal>run</goal>
</goals>
</execution>
</executions>
</plugin>
</plugins>

<resources>
<resource>
<!-- Include the properties file to provide the build information. -->
<directory>${project.build.directory}/extra-resources</directory>
</resource>
<resource>
<directory>src/main/resources</directory>
</resource>
</resources>
</build>

<dependencies>
<dependency>
<groupId>com.nvidia</groupId>
<artifactId>rapids-4-spark-shims-spark300_${scala.binary.version}</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql_${scala.binary.version}</artifactId>
<version>${spark300.version}</version>
<scope>provided</scope>
</dependency>
</dependencies>
</project>
@@ -0,0 +1 @@
com.nvidia.spark.rapids.shims.spark300emr.SparkShimServiceProvider
@@ -0,0 +1,57 @@
/*
* Copyright (c) 2020, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package com.nvidia.spark.rapids.shims.spark300emr

import com.nvidia.spark.rapids._
import com.nvidia.spark.rapids.shims.spark300.Spark300Shims
import com.nvidia.spark.rapids.spark300emr.RapidsShuffleManager

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.execution.datasources.{FilePartition, FileScanRDD, PartitionedFile}

class Spark300EMRShims extends Spark300Shims {

override def getSparkShimVersion: ShimVersion = SparkShimServiceProvider.VERSION

override def getRapidsShuffleManagerClass: String = {
classOf[RapidsShuffleManager].getCanonicalName
}

// use reflection here so we don't have to compile against their jars
override def getFileScanRDD(
sparkSession: SparkSession,
readFunction: (PartitionedFile) => Iterator[InternalRow],
filePartitions: Seq[FilePartition]): RDD[InternalRow] = {

val tclass = classOf[org.apache.spark.sql.execution.datasources.FileScanRDD]
val constructors = tclass.getConstructors()
if (constructors.size > 1) {
throw new IllegalStateException(s"Only expected 1 constructor for FileScanRDD")
}
val constructor = constructors(0)
val instance = if (constructor.getParameterCount() == 4) {
constructor.newInstance(sparkSession, readFunction, filePartitions, None)
} else if (constructor.getParameterCount() == 3) {
constructor.newInstance(sparkSession, readFunction, filePartitions)
} else {
throw new IllegalStateException("Could not find appropriate constructor for FileScan RDD")
}
instance.asInstanceOf[FileScanRDD]
}
}
@@ -0,0 +1,39 @@
/*
* Copyright (c) 2020, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package com.nvidia.spark.rapids.shims.spark300emr

import com.nvidia.spark.rapids.{EMRShimVersion, SparkShims, SparkShimVersion}

object SparkShimServiceProvider {
val VERSION = EMRShimVersion(3, 0, 0)
}

class SparkShimServiceProvider extends com.nvidia.spark.rapids.SparkShimServiceProvider {

def matchesVersion(version: String): Boolean = {
// EMR version looks like 3.0.0-amzn-0
val amznVersion = (SparkShimServiceProvider.VERSION.toString + raw"(-\d+)").r
version match {
case amznVersion(_*) => true
case _ => false
}
}

def buildShim: SparkShims = {
new Spark300EMRShims()
}
}
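To make the version matching above concrete, here is a minimal standalone sketch (illustrative only, not part of this diff: the object name is made up and the regex is inlined rather than built from `SparkShimServiceProvider.VERSION`) showing which `spark.version` strings the pattern accepts:

object EmrVersionMatchSketch {
  // Same shape as the provider above: the base version string "3.0.0-amzn"
  // followed by a "-<build>" suffix, e.g. EMR reports "3.0.0-amzn-0".
  private val amznVersion = raw"3.0.0-amzn(-\d+)".r

  def matchesVersion(version: String): Boolean = version match {
    // A Regex extractor used in a pattern match must cover the whole string.
    case amznVersion(_*) => true
    case _ => false
  }

  def main(args: Array[String]): Unit = {
    assert(matchesVersion("3.0.0-amzn-0"))  // an EMR build of Spark 3.0.0
    assert(!matchesVersion("3.0.0"))        // vanilla Apache Spark 3.0.0
    assert(!matchesVersion("3.0.1-amzn-0")) // a different base version
  }
}

Because the whole string must match and the `-\d+` suffix is required, the EMR shim only claims EMR-built versions and never shadows the plain Apache Spark 3.0.0 shim.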
@@ -0,0 +1,26 @@
/*
* Copyright (c) 2020, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package com.nvidia.spark.rapids.spark300emr

import org.apache.spark.SparkConf
import org.apache.spark.sql.rapids.shims.spark300.RapidsShuffleInternalManager

/** A shuffle manager optimized for the RAPIDS Plugin for Apache Spark. */
sealed class RapidsShuffleManager(
conf: SparkConf,
isDriver: Boolean) extends RapidsShuffleInternalManager(conf, isDriver) {
}
@@ -54,6 +54,10 @@ case class DatabricksShimVersion(major: Int, minor: Int, patch: Int) extends Shi
override def toString(): String = s"$major.$minor.$patch-databricks"
}

case class EMRShimVersion(major: Int, minor: Int, patch: Int) extends ShimVersion {
override def toString(): String = s"$major.$minor.$patch-amzn"
}

trait SparkShims {
def getSparkShimVersion: ShimVersion
def isGpuHashJoin(plan: SparkPlan): Boolean