Skip to content

Commit

Permalink
Support comparing ORC data (#1545)
Browse files (browse the repository at this point in the history)
* Support comparing ORC data

Signed-off-by: Allen Xu <allxu@nvidia.com>

* clean code

* Add 2021 copyright

Signed-off-by: Allen Xu <allxu@nvidia.com>

* fix bug

Co-authored-by: Allen Xu <allxu@nvidia.com>
  • Loading branch information
wjxiz1992 and wjxiz1992 authored Jan 22, 2021
1 parent a0edf73 commit 7a76023
Show file tree
Hide file tree
Showing 2 changed files with 7 additions and 17 deletions.
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2020-2021, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -520,12 +520,7 @@ object BenchUtils {

val spark = df1.sparkSession

val readPathAction = inputFormat match {
case "csv" =>
path: String => spark.read.csv(path)
case "parquet" =>
path: String => spark.read.parquet(path)
}
val readPathAction = (path: String) => spark.read.format(inputFormat).load(path)

val count1 = df1.count()
val count2 = df2.count()
Expand Down
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2020-2021, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -48,16 +48,11 @@ object CompareResults {
.config("spark.rapids.sql.enabled", "false")
.getOrCreate()

val (df1, df2) = conf.inputFormat() match {
case "csv" =>
(spark.read.csv(conf.input1()), spark.read.csv(conf.input2()))
case "parquet" =>
(spark.read.parquet(conf.input1()), spark.read.parquet(conf.input2()))
}
val dfReader = spark.read.format(conf.inputFormat())

BenchUtils.compareResults(
df1,
df2,
dfReader.load(conf.input1()),
dfReader.load(conf.input2()),
conf.inputFormat(),
conf.ignoreOrdering(),
conf.useIterator(),
Expand All @@ -71,7 +66,7 @@ class Conf(arguments: Seq[String]) extends ScallopConf(arguments) {
val input1 = opt[String](required = true)
/** Path to second data set */
val input2 = opt[String](required = true)
/** Input format (csv or parquet) */
/** Input format (csv, parquet or orc) */
val inputFormat = opt[String](required = true)
/** Sort the data collected from the DataFrames before comparing them. */
val ignoreOrdering = opt[Boolean](required = false, default = Some(false))
Expand Down

0 comments on commit 7a76023

Please sign in to comment.