forked from lakesoul-io/LakeSoul
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request lakesoul-io#149 from meta-soul/spark_native_io_pac…
…kaging [NativeIO][Spark] Package native lib in lakesoul-spark jar
- Loading branch information
Showing
8 changed files
with
271 additions
and
61 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
86 changes: 86 additions & 0 deletions
86
...park/src/test/scala/org/apache/spark/sql/lakesoul/benchmark/io/ParquetScanBenchmark.scala
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,86 @@ | ||
package org.apache.spark.sql.lakesoul.benchmark.io | ||
|
||
import com.dmetasoul.lakesoul.tables.LakeSoulTable | ||
import org.apache.spark.sql.SparkSession | ||
import org.apache.spark.sql.internal.SQLConf | ||
import org.apache.spark.sql.lakesoul.sources.LakeSoulSQLConf | ||
|
||
/** | ||
* Run with following commands with local minio env: | ||
* | ||
* mvn package -pl lakesoul-spark -am -DskipTests | ||
* docker run --rm -ti --net host -v /opt/spark/work-dir/data:/opt/spark/work-dir/data -v $PWD/lakesoul-spark/target:/opt/spark/work-dir/jars bitnami/spark:3.3.1 spark-submit --driver-memory 4g --jars /opt/spark/work-dir/jars/lakesoul-spark-2.2.0-spark-3.3-SNAPSHOT.jar --class org.apache.spark.sql.lakesoul.benchmark.io.ParquetScanBenchmark /opt/spark/work-dir/jars/lakesoul-spark-2.2.0-spark-3.3-SNAPSHOT-tests.jar --localtest | ||
*/ | ||
/**
 * Benchmark comparing Spark's built-in parquet-mr reader against LakeSoul's
 * native IO reader on the same LakeSoul table.
 *
 * Flow: build a local SparkSession wired for S3A (optionally pointed at a local
 * MinIO via `--localtest`), bootstrap the benchmark table from a local parquet
 * file if it does not exist yet, then time a full scan-and-discard read
 * (`noop` sink) once with NATIVE_IO_ENABLE=false and once with it enabled.
 *
 * Args:
 *   --localtest        use local MinIO endpoint/credentials
 *   --bucketname NAME  read from s3://NAME instead of the default bucket
 */
object ParquetScanBenchmark {
  def main(args: Array[String]): Unit = {
    import scala.util.Try

    val builder = SparkSession.builder()
      .appName("ParquetScanBenchmark")
      .master("local[1]")
      .config("spark.hadoop.fs.s3.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
      .config("hadoop.fs.s3a.committer.name", "directory")
      .config("spark.hadoop.fs.s3a.committer.staging.conflict-mode", "append")
      .config("spark.hadoop.fs.s3a.committer.staging.tmp.path", "/opt/spark/work-dir/s3a_staging")
      .config("spark.hadoop.mapreduce.outputcommitter.factory.scheme.s3a", "org.apache.hadoop.fs.s3a.commit.S3ACommitterFactory")
      .config("spark.hadoop.fs.s3a.path.style.access", "true")
      .config("spark.hadoop.fs.s3.buffer.dir", "/tmp")
      .config("spark.hadoop.fs.s3a.buffer.dir", "/tmp")
      .config("spark.hadoop.fs.s3a.fast.upload.buffer", "disk")
      .config("spark.hadoop.fs.s3a.fast.upload", value = true)
      .config("spark.hadoop.fs.s3a.multipart.size", 33554432)
      // single partition / parallelism so the scan time is not skewed by task scheduling
      .config("spark.sql.shuffle.partitions", 1)
      .config("spark.sql.files.maxPartitionBytes", "2g")
      .config("spark.default.parallelism", 1)
      .config("spark.sql.parquet.mergeSchema", value = false)
      .config("spark.sql.parquet.filterPushdown", value = true)
      .config("spark.hadoop.mapred.output.committer.class", "org.apache.hadoop.mapred.FileOutputCommitter")
      .config("spark.sql.warehouse.dir", "s3://lakesoul-test-bucket/data/benchmark")
      .config("spark.sql.extensions", "com.dmetasoul.lakesoul.sql.LakeSoulSparkSessionExtension")
      .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.lakesoul.catalog.LakeSoulCatalog")

    var bucketName = "lakesoul-test-bucket"
    if (args.length >= 1 && args(0) == "--localtest") {
      // Point S3A at a local MinIO instance with its default-style credentials.
      builder.config("spark.hadoop.fs.s3a.endpoint", "http://localhost:9000")
        .config("spark.hadoop.fs.s3a.endpoint.region", "us-east-1")
        .config("spark.hadoop.fs.s3a.access.key", "minioadmin1")
        .config("spark.hadoop.fs.s3a.secret.key", "minioadmin1")
    } else if (args.length >= 2 && args(0) == "--bucketname") {
      // Fix: original accessed args(1) after checking only args.length >= 1,
      // throwing ArrayIndexOutOfBoundsException when the bucket name was omitted.
      bucketName = args(1)
    }

    val spark = builder.getOrCreate()
    spark.sparkContext.setLogLevel("ERROR")

    val dataPath0 = "/opt/spark/work-dir/data/base-0.parquet"
    val tablePath = s"s3://$bucketName/data/benchmark/parquet-scan"
    println(s"tablePath: $tablePath")

    // Existence probe: LakeSoulTable.forPath throws when the table is missing.
    // Try catches only NonFatal errors, unlike the original `case _: Throwable`
    // which would also swallow OOM and other fatal errors.
    val tableExist = Try(LakeSoulTable.forPath(tablePath)).isSuccess

    if (!tableExist) {
      println(s"LakeSoul table not exist, upload from local file")
      // repartition(1) keeps the table as a single file so both readers scan
      // identical data with one task.
      val df = spark.read.format("parquet").load(dataPath0).repartition(1)
      df.write.format("lakesoul")
        .mode("Overwrite").save(tablePath)
    }

    println(s"Reading with parquet-mr")
    // spark parquet-mr read: write to the `noop` sink to time a pure scan.
    SQLConf.get.setConfString(LakeSoulSQLConf.NATIVE_IO_ENABLE.key, "false")
    spark.time({
      spark.read.format("lakesoul").load(tablePath).write.format("noop").mode("Overwrite").save()
    })
    println(s"Reading with native io")
    SQLConf.get.setConfString(LakeSoulSQLConf.NATIVE_IO_ENABLE.key, "true")
    spark.time({
      spark.read.format("lakesoul").load(tablePath).write.format("noop").mode("Overwrite").save()
    })
  }
}
35 changes: 35 additions & 0 deletions
35
lakesoul-spark/src/test/scala/org/apache/spark/sql/lakesoul/benchmark/io/README.md
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,35 @@ | ||
# Native IO Performance Comparison Results

## 1. Parquet Scan

### Settings

Code: lakesoul-spark/src/test/scala/org/apache/spark/sql/lakesoul/benchmark/io/ParquetScanBenchmark.scala

Tested on Spark 3.3.1 with Parquet-mr 1.12.2, Arrow-rs (parquet) 31.0.0.
Parquet file size: 894.3MB, compressed with snappy. Metadata:

```
############ file meta data ############
created_by: parquet-mr version 1.12.2 (build 77e30c8093386ec52c3cfa6c34b7ef3321322c94)
num_columns: 8
num_rows: 10000000
num_row_groups: 7
format_version: 1.0
serialized_size: 7688
```

The file is read with only one parallelism in Spark.

### Results

1. MinIO

|          | Parquet-mr | Native-IO | Improvement |
|----------|------------|-----------|-------------|
| Time(ms) | 11417      | 4381      | 2.61x       |

2. AWS S3

|          | Parquet-mr | Native-IO | Improvement |
|----------|------------|-----------|-------------|
| Time(ms) | 25190      | 6965      | 3.62x       |
Oops, something went wrong.