Struct to string casting functionality #1814

Merged: 24 commits, Mar 30, 2021
Changes from 16 commits
4 changes: 2 additions & 2 deletions docs/supported_ops.md
@@ -17991,7 +17991,7 @@ and the accelerator produces the same result.
<td> </td>
<td> </td>
<td> </td>
-<td><b>NS</b></td>
+<td>S</td>
<td> </td>
<td> </td>
<td> </td>
@@ -18395,7 +18395,7 @@ and the accelerator produces the same result.
<td> </td>
<td> </td>
<td> </td>
-<td><b>NS</b></td>
+<td>S</td>
<td> </td>
<td> </td>
<td> </td>
17 changes: 17 additions & 0 deletions integration_tests/src/main/python/struct_test.py
@@ -57,3 +57,20 @@ def test_orderby_struct_2(data_gen):
        lambda spark : append_unique_int_col_to_df(spark, unary_op_df(spark, data_gen)),
        'struct_table',
        'select struct_table.a, struct_table.uniq_int from struct_table order by uniq_int')

# conf with legacy cast to string on
legacy_complex_types_to_string = {'spark.sql.legacy.castComplexTypesToString.enabled': 'true'}
# @pytest.mark.parametrize('data_gen', [StructGen([["first", boolean_gen], ["second", byte_gen], ["third", short_gen], ["fourth", int_gen], ["fifth", long_gen], ["sixth", string_gen], ["seventh", date_gen], ["eighth", float_gen], ["ninth", double_gen], ["tenth", timestamp_gen]])], ids=idfn)
@pytest.mark.parametrize('data_gen', [StructGen([["first", boolean_gen], ["second", byte_gen], ["third", short_gen], ["fourth", int_gen], ["fifth", long_gen], ["sixth", string_gen], ["seventh", date_gen]])], ids=idfn)
def test_legacy_cast_struct_to_string(data_gen):
    assert_gpu_and_cpu_are_equal_collect(
        lambda spark : unary_op_df(spark, data_gen).select(
            f.col('a').cast("STRING")),
        conf = legacy_complex_types_to_string)

# @pytest.mark.parametrize('data_gen', [StructGen([["first", boolean_gen], ["second", byte_gen], ["third", short_gen], ["fourth", int_gen], ["fifth", long_gen], ["sixth", string_gen], ["seventh", date_gen], ["eighth", float_gen], ["ninth", double_gen], ["tenth", timestamp_gen]])], ids=idfn)
@pytest.mark.parametrize('data_gen', [StructGen([["first", boolean_gen], ["second", byte_gen], ["third", short_gen], ["fourth", int_gen], ["fifth", long_gen], ["sixth", string_gen], ["seventh", date_gen]])], ids=idfn)
def test_cast_struct_to_string(data_gen):
    assert_gpu_and_cpu_are_equal_collect(
        lambda spark : unary_op_df(spark, data_gen).select(
            f.col('a').cast("STRING")))
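For reference, a minimal PySpark sketch (an illustration, not part of this PR; the expected strings follow the Spark 3.1 format change that the legacy conf reverts) of the two formats these tests compare between CPU and GPU:

    from pyspark.sql import SparkSession
    from pyspark.sql import functions as f

    spark = SparkSession.builder.getOrCreate()
    df = spark.createDataFrame([(1, "a"), (2, None)], "first INT, second STRING") \
        .select(f.struct("first", "second").alias("a"))

    # Default Spark 3.1+ format: curly braces and an explicit "null" for null
    # fields, e.g. {1, a} and {2, null}
    df.select(f.col("a").cast("STRING")).show(truncate=False)

    # Legacy format: square brackets and empty text for null fields,
    # e.g. [1, a] and [2,]
    spark.conf.set("spark.sql.legacy.castComplexTypesToString.enabled", "true")
    df.select(f.col("a").cast("STRING")).show(truncate=False)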
@@ -478,6 +478,8 @@ class Spark300Shims extends SparkShims {
InMemoryFileIndex.shouldFilterOut(path)
}

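// Spark 3.0.x predates the Spark 3.1 change to the complex-type-to-string cast
// format, so the legacy format is the only behavior on these versions.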
override def getLegacyComplexTypeToString(): Boolean = true

// Arrow version changed between Spark versions
override def getArrowDataBuf(vec: ValueVector): (ByteBuffer, ReferenceManager) = {
val arrowBuf = vec.getDataBuffer()
@@ -424,6 +424,10 @@ class Spark311Shims extends Spark301Shims {
HadoopFSUtilsShim.shouldIgnorePath(path)
}

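// Spark 3.1.1 exposes spark.sql.legacy.castComplexTypesToString.enabled, so defer
// to the user's setting instead of hard-coding the legacy behavior.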
override def getLegacyComplexTypeToString(): Boolean = {
SQLConf.get.getConf(SQLConf.LEGACY_COMPLEX_TYPES_TO_STRING)
}

// Arrow version changed between Spark versions
override def getArrowDataBuf(vec: ValueVector): (ByteBuffer, ReferenceManager) = {
val arrowBuf = vec.getDataBuffer()
137 changes: 130 additions & 7 deletions sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuCast.scala
@@ -19,10 +19,12 @@ package com.nvidia.spark.rapids
import java.text.SimpleDateFormat
import java.time.DateTimeException

-import ai.rapids.cudf.{ColumnVector, DType, Scalar}
+import scala.collection.mutable.ArrayBuffer
+
+import ai.rapids.cudf.{BinaryOp, ColumnVector, ColumnView, DType, Scalar}

import org.apache.spark.sql.catalyst.analysis.TypeCheckResult
-import org.apache.spark.sql.catalyst.expressions.{Cast, CastBase, Expression, NullIntolerant, TimeZoneAwareExpression}
+import org.apache.spark.sql.catalyst.expressions.{AnsiCast, Cast, CastBase, Expression, NullIntolerant, TimeZoneAwareExpression}
import org.apache.spark.sql.types._

/** Meta-data for cast and ansi_cast. */
@@ -37,17 +39,22 @@ class CastExprMeta[INPUT <: CastBase](
private val castExpr = if (ansiEnabled) "ansi_cast" else "cast"
val fromType = cast.child.dataType
val toType = cast.dataType
var legacyCastToString = ShimLoader.getSparkShims.getLegacyComplexTypeToString()

override def tagExprForGpu(): Unit = {
recursiveTagExprForGpuCheck(fromType)
}

def recursiveTagExprForGpuCheck(fromDataType: DataType) {
if (!conf.isCastFloatToDecimalEnabled && toType.isInstanceOf[DecimalType] &&
-(fromType == DataTypes.FloatType || fromType == DataTypes.DoubleType)) {
+(fromDataType == DataTypes.FloatType || fromDataType == DataTypes.DoubleType)) {
willNotWorkOnGpu("the GPU will use a different strategy from Java's BigDecimal to convert " +
"floating point data types to decimals and this can produce results that slightly " +
"differ from the default behavior in Spark. To enable this operation on the GPU, set " +
s"${RapidsConf.ENABLE_CAST_FLOAT_TO_DECIMAL} to true.")
}
if (!conf.isCastFloatToStringEnabled && toType == DataTypes.StringType &&
-(fromType == DataTypes.FloatType || fromType == DataTypes.DoubleType)) {
+(fromDataType == DataTypes.FloatType || fromDataType == DataTypes.DoubleType)) {
willNotWorkOnGpu("the GPU will use different precision than Java's toString method when " +
"converting floating point data types to strings and this can produce results that " +
"differ from the default behavior in Spark. To enable this operation on the GPU, set" +
@@ -71,21 +78,35 @@ class CastExprMeta[INPUT <: CastBase](
"operation on the GPU, set" +
s" ${RapidsConf.ENABLE_CAST_STRING_TO_INTEGER} to true.")
}
-if (!conf.isCastStringToTimestampEnabled && fromType == DataTypes.StringType
+if (!conf.isCastStringToTimestampEnabled && fromDataType == DataTypes.StringType
&& toType == DataTypes.TimestampType) {
willNotWorkOnGpu("the GPU only supports a subset of formats " +
"when casting strings to timestamps. Refer to the CAST documentation " +
"for more details. To enable this operation on the GPU, set" +
s" ${RapidsConf.ENABLE_CAST_STRING_TO_TIMESTAMP} to true.")
}
if (fromDataType.isInstanceOf[StructType]) {
val key = if (ansiEnabled) classOf[AnsiCast] else classOf[Cast]
val checks = GpuOverrides.expressions(key).getChecks.get.asInstanceOf[CastChecks]
fromDataType.asInstanceOf[StructType].foreach { field =>
recursiveTagExprForGpuCheck(field.dataType)
if (toType == StringType) {
if (!checks.gpuCanCast(field.dataType, toType)) {
willNotWorkOnGpu(s"Unsupported type ${field.dataType} found in Struct column. " +
s"Casting ${field.dataType} to ${toType} not currently supported. Refer to " +
"CAST documentation for more details.")
}
}
}
}
}

def buildTagMessage(entry: ConfEntry[_]): String = {
s"${entry.doc}. To enable this operation on the GPU, set ${entry.key} to true."
}

override def convertToGpu(child: Expression): GpuExpression =
-GpuCast(child, toType, ansiEnabled, cast.timeZoneId)
+GpuCast(child, toType, ansiEnabled, cast.timeZoneId, legacyCastToString)
}

object GpuCast {
@@ -134,7 +155,8 @@ case class GpuCast(
child: Expression,
dataType: DataType,
ansiMode: Boolean = false,
-timeZoneId: Option[String] = None)
+timeZoneId: Option[String] = None,
+legacyCastToString: Boolean = false)
extends GpuUnaryExpression with TimeZoneAwareExpression with NullIntolerant {

import GpuCast._
@@ -240,6 +262,8 @@ case class GpuCast(
}
case (TimestampType, StringType) =>
castTimestampToString(input)
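// Struct columns are cast to string by concatenating the casted child columns
// column-wise; see castStructToString below.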
case (StructType(fields), StringType) =>
castStructToString(input, legacyCastToString, fields)

// ansi cast from larger-than-integer integral types, to integer
case (LongType, IntegerType) if ansiMode =>
@@ -485,6 +509,105 @@ case class GpuCast(
}
}

private def castStructToString(input: GpuColumnVector,
legacyCastToString: Boolean, inputSchema: Array[StructField]): ColumnVector = {
// The brackets that are used in casting structs and maps to strings
val (leftBracket, rightBracket) = if (legacyCastToString) ("[", "]") else ("{", "}")
var separatorColumn: ColumnVector = null
var spaceColumn: ColumnVector = null
val columns: ArrayBuffer[ColumnVector] = new ArrayBuffer[ColumnVector]()
// coreColumns tracks the casted child columns
val coreColumns: ArrayBuffer[ColumnVector] = new ArrayBuffer[ColumnVector]()

try {
withResource(GpuScalar.from(leftBracket, StringType)) { bracketScalar =>
columns += ColumnVector.fromScalar(bracketScalar, input.getRowCount().toInt)
}
withResource(input.getBase().getChildColumnView(0)) { childView =>
withResource(childView.copyToColumnVector()) { childVector =>
[Review thread on the copyToColumnVector call above]

revans2 (Collaborator):
Why do we need to copy this to a column vector? I understand because of the APIs we have right now it is needed, but it would be nice to avoid the copy if possible.

rwlee (Collaborator, Author):
Ideally we would not copy this column vector, which is why I had been using toString in earlier iterations. How might I avoid this copy?

revans2 (Collaborator):
The signature of doColumnar takes a GpuColumnVector. It uses the GpuColumnVector to extract its Spark data type so it can do the cast. A ColumnVector is a ColumnView. As such I would propose that we insert another API in between that we could use for recursive calls:

    override def doColumnar(input: GpuColumnVector): ColumnVector =
        doColumnar(input.getBase, input.dataType())

    private def doColumnar(input: ColumnView, sparkType: DataType): ColumnVector = {
      (sparkType, dataType) match {
        ...
      }
    }

Then you can call the new doColumnar without making a copy of the data:

    withResource(input.getBase().getChildColumnView(0)) { childView =>
      columns += doColumnar(childView, inputSchema(0).dataType)
    }

rwlee (Collaborator, Author):
There are a few things around decimal casting that didn't translate nicely. In some cases they want to do a zero-copy, so I've filtered them out in the GpuColumnVector wrapper API. When those cases come up during a ColumnView cast, the view is copied to a ColumnVector. Reworking the decimal support to be ColumnView-friendly can be done in a follow-up PR if necessary, but it seemed a bit out of scope and was non-trivial.

revans2 (Collaborator):
Could we have a follow-on issue to clean this up? The issue appears to be because of incRefCount, and there should be ways for us to work around that without too much difficulty.

[End review thread]
columns += doColumnar(GpuColumnVector.from(childVector, inputSchema(0).dataType))
}
}

if (legacyCastToString) {
coreColumns += columns.last
withResource(GpuScalar.from(",", StringType)) { separatorScalar =>
separatorColumn = ColumnVector.fromScalar(separatorScalar, input.getRowCount().toInt)
}
withResource(GpuScalar.from(" ", StringType)) { separatorScalar =>
spaceColumn = ColumnVector.fromScalar(separatorScalar, input.getRowCount().toInt)
}
for (childIndex <- 1 until input.getBase().getNumChildren()) {
withResource(input.getBase().getChildColumnView(childIndex)) { childView =>
columns += separatorColumn
// Merge the whitespace column's validity with the current child column's validity.
// This mimics Spark's null behavior of consecutive commas with no space between them.
columns += spaceColumn.mergeAndSetValidity(BinaryOp.BITWISE_AND, childView)
withResource(childView.copyToColumnVector()) { childVector =>
columns += doColumnar(GpuColumnVector.from(childVector,
inputSchema(childIndex).dataType))
coreColumns += columns.last
}
}
}
withResource(GpuScalar.from(rightBracket, StringType)) { bracketScalar =>
columns += ColumnVector.fromScalar(bracketScalar, input.getRowCount().toInt)
}

// Merge casted child columns
withResource(GpuScalar.from("", StringType)) { emptyStrScalar =>
withResource(ColumnVector.stringConcatenate(emptyStrScalar, emptyStrScalar,
columns.toArray[ColumnView])) { fullResult =>
// Merge the validity of all child columns, fully null rows are null in the result
withResource(fullResult.mergeAndSetValidity(BinaryOp.BITWISE_OR,
coreColumns: _*)) { nulledResult =>
// Reflect the struct column's validity vector in the result
nulledResult.mergeAndSetValidity(BinaryOp.BITWISE_AND, input.getBase(), nulledResult)
}
}
}
} else {
withResource(GpuScalar.from(", ", StringType)) { separatorScalar =>
separatorColumn = ColumnVector.fromScalar(separatorScalar, input.getRowCount().toInt)
}
for (childIndex <- 1 until input.getBase().getNumChildren()) {
withResource(input.getBase().getChildColumnView(childIndex)) { childView =>
columns += separatorColumn
withResource(childView.copyToColumnVector()) { childVector =>
columns += doColumnar(GpuColumnVector.from(childVector,
inputSchema(childIndex).dataType))
}
}
}
withResource(GpuScalar.from(rightBracket, StringType)) { bracketScalar =>
columns += ColumnVector.fromScalar(bracketScalar, input.getRowCount().toInt)
}

// Merge casted child columns
withResource(GpuScalar.from("", StringType)) { emptyStrScalar =>
withResource(GpuScalar.from("null", StringType)) { nullStringScalar =>
withResource(ColumnVector.stringConcatenate(emptyStrScalar, nullStringScalar,
columns.toArray[ColumnView])) { fullResult =>
// Reflect the struct column's validity vector in the result
fullResult.mergeAndSetValidity(BinaryOp.BITWISE_AND, input.getBase())
}
}
}
}
} finally {
if (separatorColumn != null) {
columns.foreach(col =>
if (col.getNativeView() != separatorColumn.getNativeView()) {
col.close()
})
separatorColumn.close()
}
if (spaceColumn != null) {
spaceColumn.close()
}
}
}

private def castFloatingTypeToString(input: GpuColumnVector): ColumnVector = {
withResource(input.getBase.castTo(DType.STRING)) { cudfCast =>

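As a cross-check on castStructToString above, here is a hypothetical row-wise Python reference (a sketch inferred from the column-wise code, not part of the PR; the function name and the exact null handling are assumptions) of the string layout the concatenation is intended to produce:

    def struct_to_string(fields, legacy):
        """Row-wise reference for the format built column-wise in castStructToString.

        fields: the casted-to-string child values of one struct row, None for null.
        """
        lb, rb = ("[", "]") if legacy else ("{", "}")
        parts = []
        for i, v in enumerate(fields):
            sep = "" if i == 0 else ","
            if v is not None:
                # Non-first fields get a separator and a space before the value.
                parts.append(v if i == 0 else sep + " " + v)
            elif legacy:
                # Legacy: a null field leaves a bare comma with no space and no text.
                parts.append(sep)
            else:
                # Non-legacy: null fields print as "null".
                parts.append("null" if i == 0 else sep + " null")
        return lb + "".join(parts) + rb

    assert struct_to_string(["1", "a"], legacy=True) == "[1, a]"
    assert struct_to_string(["1", None], legacy=True) == "[1,]"
    assert struct_to_string(["1", None], legacy=False) == "{1, null}"

Rows where the struct value itself is null (and, in the legacy path, rows where every child is null) come back as null rather than as a bracketed string; that is what the mergeAndSetValidity calls on the concatenated result implement.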
@@ -187,6 +187,8 @@ trait SparkShims {

def shouldIgnorePath(path: String): Boolean

def getLegacyComplexTypeToString(): Boolean

def getArrowDataBuf(vec: ValueVector): (ByteBuffer, ReferenceManager)
def getArrowValidityBuf(vec: ValueVector): (ByteBuffer, ReferenceManager)
def getArrowOffsetsBuf(vec: ValueVector): (ByteBuffer, ReferenceManager)
@@ -766,7 +766,7 @@ class CastChecks extends ExprChecks {
val mapChecks: TypeSig = none
val sparkMapSig: TypeSig = STRING + MAP.nested(all)

-val structChecks: TypeSig = none
+val structChecks: TypeSig = STRING
val sparkStructSig: TypeSig = STRING + STRUCT.nested(all)

val udtChecks: TypeSig = none
@@ -840,8 +840,8 @@
}

def gpuCanCast(from: DataType, to: DataType, allowDecimal: Boolean = true): Boolean = {
-val (_, sparkSig) = getChecksAndSigs(from)
-sparkSig.isSupportedByPlugin(to, allowDecimal)
+val (checks, _) = getChecksAndSigs(from)
+checks.isSupportedByPlugin(to, allowDecimal)
}
}
