diff --git a/docs/compatibility.md b/docs/compatibility.md
index e14bc66cd222..560002f4d5ed 100644
--- a/docs/compatibility.md
+++ b/docs/compatibility.md
@@ -309,15 +309,6 @@
 Also, the GPU does not support casting from strings containing hex values.
 
 To enable this operation on the GPU, set
 [`spark.rapids.sql.castStringToFloat.enabled`](configs.md#sql.castStringToFloat.enabled) to `true`.
-
-### String to Integral Types
-
-The GPU will return incorrect results for strings representing values greater than Long.MaxValue or
-less than Long.MinValue. The correct behavior would be to return null for these values, but the GPU
-currently overflows and returns an incorrect integer value.
-
-To enable this operation on the GPU, set
-[`spark.rapids.sql.castStringToInteger.enabled`](configs.md#sql.castStringToInteger.enabled) to `true`.
 
 ### String to Date
diff --git a/docs/configs.md b/docs/configs.md
index 1ea28bcfc962..66170a7f5c2b 100644
--- a/docs/configs.md
+++ b/docs/configs.md
@@ -56,7 +56,6 @@ Name | Description | Default Value
 spark.rapids.sql.castFloatToString.enabled|Casting from floating point types to string on the GPU returns results that have a different precision than the default results of Spark.|false
 spark.rapids.sql.castStringToDecimal.enabled|When set to true, enables casting from strings to decimal type on the GPU. Currently string to decimal type on the GPU might produce results which slightly differed from the correct results when the string represents any number exceeding the max precision that CAST_STRING_TO_FLOAT can keep. For instance, the GPU returns 99999999999999987 given input string "99999999999999999". The cause of divergence is that we can not cast strings containing scientific notation to decimal directly. So, we have to cast strings to floats firstly. Then, cast floats to decimals. The first step may lead to precision loss.|false
 spark.rapids.sql.castStringToFloat.enabled|When set to true, enables casting from strings to float types (float, double) on the GPU. Currently hex values aren't supported on the GPU. Also note that casting from string to float types on the GPU returns incorrect results when the string represents any number "1.7976931348623158E308" <= x < "1.7976931348623159E308" and "-1.7976931348623158E308" >= x > "-1.7976931348623159E308" in both these cases the GPU returns Double.MaxValue while CPU returns "+Infinity" and "-Infinity" respectively|false
-spark.rapids.sql.castStringToInteger.enabled|When set to true, enables casting from strings to integer types (byte, short, int, long) on the GPU. Casting from string to integer types on the GPU returns incorrect results when the string represents a number larger than Long.MaxValue or smaller than Long.MinValue.|false
 spark.rapids.sql.castStringToTimestamp.enabled|When set to true, casting from string to timestamp is supported on the GPU. The GPU only supports a subset of formats when casting strings to timestamps. Refer to the CAST documentation for more details.|false
 spark.rapids.sql.concurrentGpuTasks|Set the number of tasks that can execute concurrently per GPU. Tasks may temporarily block when the number of concurrent tasks in the executor exceeds this amount. Allowing too many concurrent tasks on the same GPU may lead to GPU out of memory errors.|1
 spark.rapids.sql.csvTimestamps.enabled|When set to true, enables the CSV parser to read timestamps. The default output format for Spark includes a timezone at the end. Anything except the UTC timezone is not supported. Timestamps after 2038 and before 1902 are also not supported.|false
diff --git a/docs/tuning-guide.md b/docs/tuning-guide.md
index 835e57987fc0..ffd23e78107e 100644
--- a/docs/tuning-guide.md
+++ b/docs/tuning-guide.md
@@ -209,5 +209,4 @@ performance.
 - [`spark.rapids.sql.variableFloatAgg.enabled`](configs.md#sql.variableFloatAgg.enabled)
 - [`spark.rapids.sql.hasNans`](configs.md#sql.hasNans)
 - [`spark.rapids.sql.castFloatToString.enabled`](configs.md#sql.castFloatToString.enabled)
-- [`spark.rapids.sql.castStringToInteger.enabled`](configs.md#sql.castStringToInteger.enabled)
 - [`spark.rapids.sql.castStringToFloat.enabled`](configs.md#sql.castStringToFloat.enabled)
diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuCast.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuCast.scala
index f7b9883d4e72..f2a24a15a7ab 100644
--- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuCast.scala
+++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuCast.scala
@@ -69,15 +69,7 @@ class CastExprMeta[INPUT <: CastBase](
         "CPU returns \"+Infinity\" and \"-Infinity\" respectively. To enable this operation on " +
         "the GPU, set" + s" ${RapidsConf.ENABLE_CAST_STRING_TO_FLOAT} to true.")
     }
-    if (!conf.isCastStringToIntegerEnabled && cast.child.dataType == DataTypes.StringType &&
-      Seq(DataTypes.ByteType, DataTypes.ShortType, DataTypes.IntegerType, DataTypes.LongType)
-        .contains(cast.dataType)) {
-      willNotWorkOnGpu("the GPU will return incorrect results for strings representing" +
-        "values greater than Long.MaxValue or less than Long.MinValue. To enable this " +
-        "operation on the GPU, set" +
-        s" ${RapidsConf.ENABLE_CAST_STRING_TO_INTEGER} to true.")
-    }
-    if (!conf.isCastStringToTimestampEnabled && fromDataType == DataTypes.StringType
+    if (!conf.isCastStringToTimestampEnabled && fromType == DataTypes.StringType
         && toType == DataTypes.TimestampType) {
       willNotWorkOnGpu("the GPU only supports a subset of formats " +
         "when casting strings to timestamps. Refer to the CAST documentation " +
@@ -133,18 +125,6 @@ object GpuCast {
    */
   private val FULL_TIMESTAMP_LENGTH = 27
 
-  /**
-   * Regex for identifying strings that contain numeric values that can be casted to integral
-   * types. This includes floating point numbers but not numbers containing exponents.
-   */
-  private val CASTABLE_TO_INT_REGEX = "\\s*[+\\-]?[0-9]*(\\.)?[0-9]+\\s*$"
-
-  /**
-   * Regex for identifying strings that contain numeric values that can be casted to integral
-   * types when ansi is enabled.
-   */
-  private val ANSI_CASTABLE_TO_INT_REGEX = "\\s*[+\\-]?[0-9]+\\s*$"
-
   /**
   * Regex to match timestamps with or without trailing zeros.
   */
@@ -398,44 +378,8 @@
           castStringToFloats(trimmed, ansiMode,
             GpuColumnVector.getNonNestedRapidsType(dataType))
         case ByteType | ShortType | IntegerType | LongType =>
-          // filter out values that are not valid longs or nulls
-          val regex = if (ansiMode) {
-            GpuCast.ANSI_CASTABLE_TO_INT_REGEX
-          } else {
-            GpuCast.CASTABLE_TO_INT_REGEX
-          }
-          val longStrings = withResource(trimmed.matchesRe(regex)) { regexMatches =>
-            if (ansiMode) {
-              withResource(regexMatches.all()) { allRegexMatches =>
-                if (!allRegexMatches.getBoolean) {
-                  throw new NumberFormatException(GpuCast.INVALID_INPUT_MESSAGE)
-                }
-              }
-            }
-            withResource(Scalar.fromNull(DType.STRING)) { nullString =>
-              regexMatches.ifElse(trimmed, nullString)
-            }
-          }
-          // cast to specific integral type after filtering out values that are not in range
-          // for that type. Note that the scalar values here are named parameters so are not
-          // created until they are needed
-          withResource(longStrings) { longStrings =>
-            GpuColumnVector.getNonNestedRapidsType(dataType) match {
-              case DType.INT8 =>
-                castStringToIntegralType(longStrings, DType.INT8,
-                  Scalar.fromInt(Byte.MinValue), Scalar.fromInt(Byte.MaxValue))
-              case DType.INT16 =>
-                castStringToIntegralType(longStrings, DType.INT16,
-                  Scalar.fromInt(Short.MinValue), Scalar.fromInt(Short.MaxValue))
-              case DType.INT32 =>
-                castStringToIntegralType(longStrings, DType.INT32,
-                  Scalar.fromInt(Int.MinValue), Scalar.fromInt(Int.MaxValue))
-              case DType.INT64 =>
-                longStrings.castTo(DType.INT64)
-              case _ =>
-                throw new IllegalStateException("Invalid integral type")
-            }
-          }
+          castStringToInts(trimmed, ansiMode,
+            GpuColumnVector.getNonNestedRapidsType(dataType))
       }
     }
     case (StringType, dt: DecimalType) =>
@@ -733,6 +677,52 @@
     }
   }
 
+  def castStringToInts(
+      input: ColumnVector,
+      ansiEnabled: Boolean,
+      dType: DType): ColumnVector = {
+    val cleaned = if (!ansiEnabled) {
+      // TODO would be great to get rid of this regex, but the overflow checks don't work
+      // on the more lenient pattern.
+      // To avoid doing the expensive regex all the time, we will first check to see if we need
+      // to do it. The only time we do need to do it is when we have a '.' in any of the strings.
+      val data = input.getData
+      val hasDot = withResource(
+        ColumnView.fromDeviceBuffer(data, 0, DType.INT8, data.getLength.toInt)) { childData =>
+        withResource(GpuScalar.from('.'.toByte, ByteType)) { dot =>
+          childData.contains(dot)
+        }
+      }
+      if (hasDot) {
+        withResource(input.extractRe("^([+\\-]?[0-9]+)(?:\\.[0-9]*)?$")) { table =>
+          table.getColumn(0).incRefCount()
+        }
+      } else {
+        input.incRefCount()
+      }
+    } else {
+      input.incRefCount()
+    }
+    withResource(cleaned) { cleaned =>
+      withResource(cleaned.isInteger(dType)) { isInt =>
+        if (ansiEnabled) {
+          withResource(isInt.all()) { allInts =>
+            if (!allInts.getBoolean) {
+              throw new NumberFormatException(GpuCast.INVALID_INPUT_MESSAGE)
+            }
+          }
+          cleaned.castTo(dType)
+        } else {
+          withResource(cleaned.castTo(dType)) { parsedInt =>
+            withResource(GpuScalar.from(null, dataType)) { nullVal =>
+              isInt.ifElse(parsedInt, nullVal)
+            }
+          }
+        }
+      }
+    }
+  }
+
   def castStringToFloats(
       input: ColumnVector,
       ansiEnabled: Boolean,
@@ -1109,63 +1099,6 @@
     }
   }
 
-  /**
-   * Cast column of long values to a smaller integral type (bytes, short, int).
-   *
-   * @param longStrings Long values in string format
-   * @param castToType Type to cast to
-   * @param minValue Named parameter for function to create Scalar representing range minimum value
-   * @param maxValue Named parameter for function to create Scalar representing range maximum value
-   * @return Values cast to specified integral type
-   */
-  private def castStringToIntegralType(longStrings: ColumnVector,
-      castToType: DType,
-      minValue: => Scalar,
-      maxValue: => Scalar): ColumnVector = {
-
-    // evaluate min and max named parameters once since they are used in multiple places
-    withResource(minValue) { minValue: Scalar =>
-      withResource(maxValue) { maxValue: Scalar =>
-        withResource(Scalar.fromNull(DType.INT64)) { nulls =>
-          withResource(longStrings.castTo(DType.INT64)) { values =>
-
-            // replace values less than minValue with null
-            val gtEqMinOrNull = withResource(values.greaterOrEqualTo(minValue)) { isGtEqMin =>
-              if (ansiMode) {
-                withResource(isGtEqMin.all()) { all =>
-                  if (!all.getBoolean) {
-                    throw new NumberFormatException(GpuCast.INVALID_INPUT_MESSAGE)
-                  }
-                }
-              }
-              isGtEqMin.ifElse(values, nulls)
-            }
-
-            // replace values greater than maxValue with null
-            val ltEqMaxOrNull = withResource(gtEqMinOrNull) { gtEqMinOrNull =>
-              withResource(gtEqMinOrNull.lessOrEqualTo(maxValue)) { isLtEqMax =>
-                if (ansiMode) {
-                  withResource(isLtEqMax.all()) { all =>
-                    if (!all.getBoolean) {
-                      throw new NumberFormatException(GpuCast.INVALID_INPUT_MESSAGE)
-                    }
-                  }
-                }
-                isLtEqMax.ifElse(gtEqMinOrNull, nulls)
-              }
-            }
-
-            // cast the final values
-            withResource(ltEqMaxOrNull) { ltEqMaxOrNull =>
-              ltEqMaxOrNull.castTo(castToType)
-            }
-          }
-        }
-      }
-
-    }
-  }
-
   private def castIntegralsToDecimal(input: ColumnVector, dt: DecimalType): ColumnVector = {
 
     // Use INT64 bounds instead of FLOAT64 bounds, which enables precise comparison.
diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsConf.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsConf.scala
index fb2611edeeae..03be5a2b2001 100644
--- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsConf.scala
+++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsConf.scala
@@ -582,14 +582,6 @@ object RapidsConf {
     .booleanConf
     .createWithDefault(false)
 
-  val ENABLE_CAST_STRING_TO_INTEGER = conf("spark.rapids.sql.castStringToInteger.enabled")
-    .doc("When set to true, enables casting from strings to integer types (byte, short, " +
-      "int, long) on the GPU. Casting from string to integer types on the GPU returns incorrect " +
-      "results when the string represents a number larger than Long.MaxValue or smaller than " +
-      "Long.MinValue.")
-    .booleanConf
-    .createWithDefault(false)
-
   val ENABLE_CSV_TIMESTAMPS = conf("spark.rapids.sql.csvTimestamps.enabled")
     .doc("When set to true, enables the CSV parser to read timestamps. The default output " +
       "format for Spark includes a timezone at the end. Anything except the UTC timezone is not " +
@@ -1191,8 +1183,6 @@ class RapidsConf(conf: Map[String, String]) extends Logging {
 
   lazy val isCastStringToTimestampEnabled: Boolean = get(ENABLE_CAST_STRING_TO_TIMESTAMP)
 
-  lazy val isCastStringToIntegerEnabled: Boolean = get(ENABLE_CAST_STRING_TO_INTEGER)
-
   lazy val isCastStringToFloatEnabled: Boolean = get(ENABLE_CAST_STRING_TO_FLOAT)
 
   lazy val isCastStringToDecimalEnabled: Boolean = get(ENABLE_CAST_STRING_TO_DECIMAL)
diff --git a/tests/src/test/scala/com/nvidia/spark/rapids/AdaptiveQueryExecSuite.scala b/tests/src/test/scala/com/nvidia/spark/rapids/AdaptiveQueryExecSuite.scala
index eb589dd277c6..59960c751a9e 100644
--- a/tests/src/test/scala/com/nvidia/spark/rapids/AdaptiveQueryExecSuite.scala
+++ b/tests/src/test/scala/com/nvidia/spark/rapids/AdaptiveQueryExecSuite.scala
@@ -335,7 +335,6 @@
       .set(SQLConf.ADAPTIVE_EXECUTION_ENABLED.key, "true")
       .set(SQLConf.LOCAL_SHUFFLE_READER_ENABLED.key, "true")
       .set(SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key, "400")
-      .set(RapidsConf.ENABLE_CAST_STRING_TO_INTEGER.key, "true")
       .set(SQLConf.ADVISORY_PARTITION_SIZE_IN_BYTES.key, "50")
       // disable DemoteBroadcastHashJoin rule from removing BHJ due to empty partitions
       .set(SQLConf.NON_EMPTY_PARTITION_RATIO_FOR_BROADCAST_JOIN.key, "0")
@@ -370,7 +369,6 @@
       // disable DemoteBroadcastHashJoin rule from removing BHJ due to empty partitions
       .set(SQLConf.NON_EMPTY_PARTITION_RATIO_FOR_BROADCAST_JOIN.key, "0")
       .set(SQLConf.SHUFFLE_PARTITIONS.key, "5")
-      .set(RapidsConf.ENABLE_CAST_STRING_TO_INTEGER.key, "true")
       .set(RapidsConf.DECIMAL_TYPE_ENABLED.key, "true")
       .set(RapidsConf.TEST_ALLOWED_NONGPU.key, "DataWritingCommandExec")
diff --git a/tests/src/test/scala/com/nvidia/spark/rapids/AnsiCastOpSuite.scala b/tests/src/test/scala/com/nvidia/spark/rapids/AnsiCastOpSuite.scala
index 38f113602e9c..eaa6224dfbec 100644
--- a/tests/src/test/scala/com/nvidia/spark/rapids/AnsiCastOpSuite.scala
+++ b/tests/src/test/scala/com/nvidia/spark/rapids/AnsiCastOpSuite.scala
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -37,7 +37,6 @@ class AnsiCastOpSuite extends GpuExpressionTestSuite {
     .set("spark.sql.storeAssignmentPolicy", "ANSI") // note this is the default in 3.0.0
     .set(RapidsConf.ENABLE_CAST_FLOAT_TO_INTEGRAL_TYPES.key, "true")
     .set(RapidsConf.ENABLE_CAST_FLOAT_TO_STRING.key, "true")
-    .set(RapidsConf.ENABLE_CAST_STRING_TO_INTEGER.key, "true")
     .set(RapidsConf.ENABLE_CAST_STRING_TO_FLOAT.key, "true")
     .set(RapidsConf.ENABLE_CAST_STRING_TO_TIMESTAMP.key, "true")
 
diff --git a/tests/src/test/scala/com/nvidia/spark/rapids/CastOpSuite.scala b/tests/src/test/scala/com/nvidia/spark/rapids/CastOpSuite.scala
index b86c16da5f7e..e89f27bb07bc 100644
--- a/tests/src/test/scala/com/nvidia/spark/rapids/CastOpSuite.scala
+++ b/tests/src/test/scala/com/nvidia/spark/rapids/CastOpSuite.scala
@@ -63,7 +63,6 @@ class CastOpSuite extends GpuExpressionTestSuite {
     .set(RapidsConf.ENABLE_CAST_FLOAT_TO_INTEGRAL_TYPES.key, "true")
     .set(RapidsConf.ENABLE_CAST_FLOAT_TO_STRING.key, "true")
     .set(RapidsConf.ENABLE_CAST_STRING_TO_TIMESTAMP.key, "true")
-    .set(RapidsConf.ENABLE_CAST_STRING_TO_INTEGER.key, "true")
     .set(RapidsConf.ENABLE_CAST_STRING_TO_FLOAT.key, "true")
     .set("spark.sql.ansi.enabled", String.valueOf(ansiEnabled))
 
@@ -362,6 +361,15 @@ class CastOpSuite extends GpuExpressionTestSuite {
       col("doubles").cast(TimestampType))
   }
 
+  testSparkResultsAreEqual("Test cast from strings to int", doublesAsStrings,
+    conf = sparkConf) {
+    frame => frame.select(
+      col("c0").cast(LongType),
+      col("c0").cast(IntegerType),
+      col("c0").cast(ShortType),
+      col("c0").cast(ByteType))
+  }
+
   testSparkResultsAreEqual("Test cast from strings to doubles", doublesAsStrings,
     conf = sparkConf, maxFloatDiff = 0.0001) {
     frame => frame.select(
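
As a rough illustration of the non-ANSI semantics the new castStringToInts path implements
(a minimal, CPU-only Scala sketch; the CastStringToIntSketch object, castToInt helper, and
its regex are hypothetical illustrations, not plugin code): input is trimmed, the integral
part of a decimal string is kept, and anything invalid or out of range for the target type
becomes null instead of wrapping around.

// Hypothetical sketch of the non-ANSI cast semantics; not plugin or cudf code.
object CastStringToIntSketch {
  // Mirrors the extractRe pattern used on the GPU when a '.' is present:
  // optional sign, digits, optional fractional part that is discarded.
  private val IntegralWithFraction = "^([+-]?[0-9]+)(?:\\.[0-9]*)?$".r

  // Returns Some(value) when the trimmed string is an in-range integer,
  // or None, which corresponds to a null row on the GPU.
  def castToInt(s: String): Option[Int] = s.trim match {
    case IntegralWithFraction(intPart) =>
      // BigInt comparison stands in for cudf's type-aware isInteger range
      // check: out-of-range values become null rather than overflowing.
      val v = BigInt(intPart)
      if (v >= Int.MinValue && v <= Int.MaxValue) Some(v.toInt) else None
    case _ => None
  }

  def main(args: Array[String]): Unit = {
    println(castToInt("123"))        // Some(123)
    println(castToInt("  -45.9 "))   // Some(-45): fractional part dropped
    println(castToInt("9999999999")) // None: exceeds Int.MaxValue, no wraparound
    println(castToInt("1e3"))        // None: exponents are not accepted
  }
}

Under ANSI mode the GPU path instead throws a NumberFormatException as soon as any row
fails the isInteger check, matching the all()-based guard in castStringToInts above.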