diff --git a/docs/spark-profiling-tool.md b/docs/spark-profiling-tool.md index c2c175e3f20..890659ef241 100644 --- a/docs/spark-profiling-tool.md +++ b/docs/spark-profiling-tool.md @@ -406,8 +406,8 @@ SQL Duration and Executor CPU Time Percent +--------+-------------------+-----+------------+--------------------------+------------+---------------------------+-------------------------+ |appIndex|App ID             |sqlID|SQL Duration|Contains Dataset or RDD Op|App Duration|Potential Problems         |Executor CPU Time Percent| +--------+-------------------+-----+------------+--------------------------+------------+---------------------------+-------------------------+ -|1       |local-1626104300434|0    |1260        |false                     |131104      |DECIMAL:NESTED COMPLEX TYPE|92.65                    | -|1       |local-1626104300434|1    |259         |false                     |131104      |DECIMAL:NESTED COMPLEX TYPE|76.79                    | +|1       |local-1626104300434|0    |1260        |false                     |131104      |NESTED COMPLEX TYPE        |92.65                    | +|1       |local-1626104300434|1    |259         |false                     |131104      |NESTED COMPLEX TYPE        |76.79                    | ``` - Shuffle Skew Check: diff --git a/docs/spark-qualification-tool.md b/docs/spark-qualification-tool.md index 11d11f6180b..62bc60b44da 100644 --- a/docs/spark-qualification-tool.md +++ b/docs/spark-qualification-tool.md @@ -318,8 +318,7 @@ Its summary report outputs the following information: 2. Application duration 3. SQL/DF duration 4. Problematic Duration, which indicates potential issues for acceleration. - Some of the potential issues include unsupported data formats such as Decimal 128-bit - or User Defined Function (UDF) or any Dataset APIs. + Some of the potential issues include User Defined Functions (UDFs) or any Dataset APIs. Note: the duration(s) reported are in milli-seconds. Sample output in text: @@ -335,13 +334,11 @@ In the above example, two application event logs were analyzed. “app-202105071 than the “app-20210507174503-1704” because the score(in the csv output) for “app-20210507174503-2538” is higher than “app-20210507174503-1704”. 
Here the `Problematic Duration` is zero but please keep in mind that we are only able to detect certain issues. -This currently includes some UDFs, some decimal operations and nested complex types. +This currently includes some UDFs and nested complex types. The tool won't catch all UDFs, and some of the UDFs can be handled with additional steps. Please refer to [supported_ops.md](./supported_ops.md) for more details on UDF. -For decimals, the tool tries to parse for decimal operations but it may not capture all of the decimal operations -if they aren’t in the event logs. The second output is a more detailed output. Here is a sample output requesting csv style output: @@ -358,7 +355,7 @@ Here is a brief description of each of column that is in the CSV: 2. App ID: Spark Application ID. 3. Score : A score calculated based on SQL Dataframe Task Duration and gets negatively affected for any unsupported operators. Please refer to [Qualification tool score algorithm](#Qualification-tool-score-algorithm) for more details. -4. Potential Problems : Some UDFs, some decimal operations and nested complex types. +4. Potential Problems : Some UDFs and nested complex types. 5. SQL DF Duration: Time duration that includes only SQL/Dataframe queries. 6. SQL Dataframe Task Duration: Amount of time spent in tasks of SQL Dataframe operations. 7. App Duration: Total Application time. diff --git a/tools/src/main/scala/org/apache/spark/sql/rapids/tool/AppBase.scala b/tools/src/main/scala/org/apache/spark/sql/rapids/tool/AppBase.scala index 8e92119d1b2..baea0931ec9 100644 --- a/tools/src/main/scala/org/apache/spark/sql/rapids/tool/AppBase.scala +++ b/tools/src/main/scala/org/apache/spark/sql/rapids/tool/AppBase.scala @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -128,17 +128,10 @@ abstract class AppBase( } } - // Decimal support on the GPU is limited to less than 18 digits and decimals - // are configured off by default for now. It would be nice to have this - // based off of what plugin supports at some point. - private val decimalKeyWords = Map(".*promote_precision\\(.*" -> "DECIMAL", - ".*decimal\\([0-9]+,[0-9]+\\).*" -> "DECIMAL", - ".*DecimalType\\([0-9]+,[0-9]+\\).*" -> "DECIMAL") - private val UDFKeywords = Map(".*UDF.*" -> "UDF") protected def findPotentialIssues(desc: String): Set[String] = { - val potentialIssuesRegexs = UDFKeywords ++ decimalKeyWords + val potentialIssuesRegexs = UDFKeywords val issues = potentialIssuesRegexs.filterKeys(desc.matches(_)) issues.values.toSet } diff --git a/tools/src/test/resources/ProfilingExpectations/rapids_duration_and_cpu_expectation.csv b/tools/src/test/resources/ProfilingExpectations/rapids_duration_and_cpu_expectation.csv index a6751bc4509..fa0ddeb1127 100644 --- a/tools/src/test/resources/ProfilingExpectations/rapids_duration_and_cpu_expectation.csv +++ b/tools/src/test/resources/ProfilingExpectations/rapids_duration_and_cpu_expectation.csv @@ -1,9 +1,9 @@ appIndex,App ID,sqlID,SQL Duration,Contains Dataset or RDD Op,App Duration,Potential Problems,Executor CPU Time Percent -1,local-1626104300434,0,1260,false,131104,DECIMAL:NESTED COMPLEX TYPE,92.65 -1,local-1626104300434,1,259,false,131104,DECIMAL:NESTED COMPLEX TYPE,76.79 +1,local-1626104300434,0,1260,false,131104,NESTED COMPLEX TYPE,92.65 +1,local-1626104300434,1,259,false,131104,NESTED COMPLEX TYPE,76.79 1,local-1626104300434,2,130,false,131104,NESTED COMPLEX TYPE,90.48 -1,local-1626104300434,3,76,false,131104,DECIMAL:NESTED COMPLEX TYPE,97.56 +1,local-1626104300434,3,76,false,131104,NESTED COMPLEX TYPE,97.56 1,local-1626104300434,4,65,false,131104,NESTED COMPLEX TYPE,100.0 1,local-1626104300434,5,479,false,131104,NESTED COMPLEX TYPE,87.32 -1,local-1626104300434,6,95,false,131104,DECIMAL:NESTED COMPLEX TYPE,96.3 
-1,local-1626104300434,7,65,false,131104,DECIMAL:NESTED COMPLEX TYPE,95.24 +1,local-1626104300434,6,95,false,131104,NESTED COMPLEX TYPE,96.3 +1,local-1626104300434,7,65,false,131104,NESTED COMPLEX TYPE,95.24 diff --git a/tools/src/test/resources/QualificationExpectations/complex_dec_expectation.csv b/tools/src/test/resources/QualificationExpectations/complex_dec_expectation.csv index 76f4c1abfed..006544fe10e 100644 --- a/tools/src/test/resources/QualificationExpectations/complex_dec_expectation.csv +++ b/tools/src/test/resources/QualificationExpectations/complex_dec_expectation.csv @@ -1,2 +1,2 @@ App Name,App ID,Score,Potential Problems,SQL DF Duration,SQL Dataframe Task Duration,App Duration,Executor CPU Time Percent,App Duration Estimated,SQL Duration with Potential Problems,SQL Ids with Failures,Read Score Percent,Read File Format Score,Unsupported Read File Formats and Types,Unsupported Write Data Format,Complex Types,Nested Complex Types -Spark shell,local-1626104300434,1469.0,DECIMAL:NESTED COMPLEX TYPE,2429,1469,131104,88.35,false,160,"",20,100.0,"","",struct;lastname:string>;struct;previous:struct;city:string>>;array>;map;map>;map>;array>;array,struct;lastname:string>;struct;previous:struct;city:string>>;array>;map>;map>;array> +Spark shell,local-1626104300434,1469.0,NESTED COMPLEX TYPE,2429,1469,131104,88.35,false,0,"",20,100.0,"","",struct;lastname:string>;struct;previous:struct;city:string>>;array>;map;map>;map>;array>;array,struct;lastname:string>;struct;previous:struct;city:string>>;array>;map>;map>;array> diff --git a/tools/src/test/resources/QualificationExpectations/decimal_part_expectation.csv b/tools/src/test/resources/QualificationExpectations/decimal_part_expectation.csv deleted file mode 100644 index b3ca37780b5..00000000000 --- a/tools/src/test/resources/QualificationExpectations/decimal_part_expectation.csv +++ /dev/null @@ -1,2 +0,0 @@ -App Name,App ID,Score,Potential Problems,SQL DF Duration,SQL Dataframe Task Duration,App 
Duration,Executor CPU Time Percent,App Duration Estimated,SQL Duration with Potential Problems,SQL Ids with Failures,Read Score Percent,Read File Format Score,Unsupported Read File Formats and Types,Unsupported Write Data Format,Complex Types,Nested Complex Types -Spark shell,local-1626189209260,1238.0,DECIMAL:NESTED COMPLEX TYPE,1314,1238,106033,57.21,false,1023,"",20,100.0,"","",array>;map;map>;map>,array>;map>;map> diff --git a/tools/src/test/resources/QualificationExpectations/write_format_expectation.csv b/tools/src/test/resources/QualificationExpectations/write_format_expectation.csv index dca68f57f55..e6c10162e72 100644 --- a/tools/src/test/resources/QualificationExpectations/write_format_expectation.csv +++ b/tools/src/test/resources/QualificationExpectations/write_format_expectation.csv @@ -1,2 +1,2 @@ -App Name,App ID,Score,Potential Problems,SQL Dataframe Duration,SQL Dataframe Task Duration,App Duration,Executor CPU Time Percent,App Duration Estimated,SQL Duration with Potential Problems,SQL Ids with Failures,Read Score Percent,ReadFileFormat Score,Unsupported Read File Formats and Types,Unsupported Write Data Format,Complex Types,Nested Complex Types -Spark shell,local-1629442299891,920.0,DECIMAL,1992,920,19554,91.72,false,1992,"",20,100.0,"",CSV;JSON,"","" +App Name,App ID,Score,Potential Problems,SQL DF Duration,SQL Dataframe Task Duration,App Duration,Executor CPU Time Percent,App Duration Estimated,SQL Duration with Potential Problems,SQL Ids with Failures,Read Score Percent,Read File Format Score,Unsupported Read File Formats and Types,Unsupported Write Data Format,Complex Types,Nested Complex Types +Spark shell,local-1629442299891,920.0,"",1992,920,19554,91.72,false,0,"",20,100.0,"",CSV;JSON,"","" diff --git a/tools/src/test/resources/spark-events-qualification/decimal_part_eventlog.zstd b/tools/src/test/resources/spark-events-qualification/decimal_part_eventlog.zstd deleted file mode 100644 index d963f5af3af..00000000000 Binary files 
a/tools/src/test/resources/spark-events-qualification/decimal_part_eventlog.zstd and /dev/null differ diff --git a/tools/src/test/scala/com/nvidia/spark/rapids/tool/qualification/QualificationSuite.scala b/tools/src/test/scala/com/nvidia/spark/rapids/tool/qualification/QualificationSuite.scala index 284ec4eb88b..d917bd59a75 100644 --- a/tools/src/test/scala/com/nvidia/spark/rapids/tool/qualification/QualificationSuite.scala +++ b/tools/src/test/scala/com/nvidia/spark/rapids/tool/qualification/QualificationSuite.scala @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -401,13 +401,6 @@ class QualificationSuite extends FunSuite with BeforeAndAfterEach with Logging { } } - // this event log has both decimal and non-decimal so comes out partial - // it has both reading decimal, multiplication and join on decimal - test("test decimal problematic") { - val logFiles = Array(s"$logDir/decimal_part_eventlog.zstd") - runQualificationTest(logFiles, "decimal_part_expectation.csv") - } - test("test jdbc problematic") { val logFiles = Array(s"$logDir/jdbc_eventlog.zstd") runQualificationTest(logFiles, "jdbc_expectation.csv") @@ -420,7 +413,7 @@ class QualificationSuite extends FunSuite with BeforeAndAfterEach with Logging { dfGen.write.parquet(dir) } - test("test decimal generate udf same") { + test("test generate udf same") { TrampolineUtil.withTempDir { outpath => TrampolineUtil.withTempDir { eventLogDir => val tmpParquet = s"$outpath/decparquet" @@ -444,14 +437,13 @@ class QualificationSuite extends FunSuite with BeforeAndAfterEach with Logging { assert(exit == 0) assert(appSum.size == 1) val probApp = appSum.head - assert(probApp.potentialProblems.contains("UDF") && - probApp.potentialProblems.contains("DECIMAL")) + assert(probApp.potentialProblems.contains("UDF")) 
assert(probApp.sqlDataFrameDuration == probApp.sqlDurationForProblematic) } } } - test("test decimal generate udf different sql ops") { + test("test generate udf different sql ops") { TrampolineUtil.withTempDir { outpath => TrampolineUtil.withTempDir { eventLogDir => @@ -483,8 +475,7 @@ class QualificationSuite extends FunSuite with BeforeAndAfterEach with Logging { assert(exit == 0) assert(appSum.size == 1) val probApp = appSum.head - assert(probApp.potentialProblems.contains("UDF") && - probApp.potentialProblems.contains("DECIMAL")) + assert(probApp.potentialProblems.contains("UDF")) assert(probApp.sqlDurationForProblematic > 0) assert(probApp.sqlDataFrameDuration > probApp.sqlDurationForProblematic) } @@ -503,7 +494,7 @@ class QualificationSuite extends FunSuite with BeforeAndAfterEach with Logging { runQualificationTest(logFiles, "read_dsv2_expectation.csv") } - test("test dsv1 complex and decimal") { + test("test dsv1 complex") { val logFiles = Array(s"$logDir/complex_dec_eventlog.zstd") runQualificationTest(logFiles, "complex_dec_expectation.csv") }