Support comparing ORC data (#1545)

* Support comparing ORC data

  Signed-off-by: Allen Xu <allxu@nvidia.com>

* clean code

* Add 2021 copyright

  Signed-off-by: Allen Xu <allxu@nvidia.com>

* fix bug

Co-authored-by: Allen Xu <allxu@nvidia.com>
NVIDIA · Jan 22, 2021 · 7a76023
1 parent a0edf73
commit 7a76023
Show file tree

Hide file tree

Showing 2 changed files with 7 additions and 17 deletions.
diff --git a/integration_tests/src/main/scala/com/nvidia/spark/rapids/tests/common/BenchUtils.scala b/integration_tests/src/main/scala/com/nvidia/spark/rapids/tests/common/BenchUtils.scala
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2020-2021, NVIDIA CORPORATION. All rights reserved.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -520,12 +520,7 @@ object BenchUtils {
 
     val spark = df1.sparkSession
 
-    val readPathAction = inputFormat match {
-      case "csv" =>
-        path: String => spark.read.csv(path)
-      case "parquet" =>
-        path: String => spark.read.parquet(path)
-    }
+    val readPathAction = (path: String) => spark.read.format(inputFormat).load(path)
 
     val count1 = df1.count()
     val count2 = df2.count()

diff --git a/integration_tests/src/main/scala/com/nvidia/spark/rapids/tests/common/CompareResults.scala b/integration_tests/src/main/scala/com/nvidia/spark/rapids/tests/common/CompareResults.scala
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2020-2021, NVIDIA CORPORATION. All rights reserved.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -48,16 +48,11 @@ object CompareResults {
         .config("spark.rapids.sql.enabled", "false")
         .getOrCreate()
 
-    val (df1, df2) = conf.inputFormat() match {
-      case "csv" =>
-        (spark.read.csv(conf.input1()), spark.read.csv(conf.input2()))
-      case "parquet" =>
-        (spark.read.parquet(conf.input1()), spark.read.parquet(conf.input2()))
-    }
+    val dfReader = spark.read.format(conf.inputFormat())
 
     BenchUtils.compareResults(
-      df1,
-      df2,
+      dfReader.load(conf.input1()),
+      dfReader.load(conf.input2()),
       conf.inputFormat(),
       conf.ignoreOrdering(),
       conf.useIterator(),
@@ -71,7 +66,7 @@ class Conf(arguments: Seq[String]) extends ScallopConf(arguments) {
   val input1 = opt[String](required = true)
   /** Path to second data set */
   val input2 = opt[String](required = true)
-  /** Input format (csv or parquet) */
+  /** Input format (csv, parquet or orc) */
   val inputFormat = opt[String](required = true)
   /** Sort the data collected from the DataFrames before comparing them. */
   val ignoreOrdering = opt[Boolean](required = false, default = Some(false))