Skip to content

Commit

Permalink
Add support for ignoreNullFields=false in to_json (#9640)
Browse files Browse the repository at this point in the history
* Add support for ignoreNullFields=false in to_json

* add comment

* signoff

Signed-off-by: Andy Grove <andygrove@nvidia.com>

---------

Signed-off-by: Andy Grove <andygrove@nvidia.com>
  • Loading branch information
andygrove authored Nov 6, 2023
1 parent 52bc980 commit f25fa42
Show file tree
Hide file tree
Showing 4 changed files with 59 additions and 29 deletions.
5 changes: 1 addition & 4 deletions integration_tests/src/main/python/json_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -611,10 +611,7 @@ def test_read_case_col_name(spark_tmp_path, v1_enabled_list, col_name):
StringGen('[A-Za-z0-9]{0,10}', nullable=True),
pytest.param(StringGen(nullable=True), marks=pytest.mark.xfail(reason='https://github.com/NVIDIA/spark-rapids/issues/9514')),
], ids=idfn)
@pytest.mark.parametrize('ignore_null_fields', [
True,
pytest.param(False, marks=pytest.mark.xfail(reason='https://github.com/NVIDIA/spark-rapids/issues/9516'))
])
@pytest.mark.parametrize('ignore_null_fields', [True, False])
@pytest.mark.parametrize('pretty', [
pytest.param(True, marks=pytest.mark.xfail(reason='https://github.com/NVIDIA/spark-rapids/issues/9517')),
False
Expand Down
73 changes: 52 additions & 21 deletions sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuCast.scala
Original file line number Diff line number Diff line change
Expand Up @@ -217,7 +217,8 @@ class CastOptions(
legacyCastComplexTypesToString: Boolean,
ansiMode: Boolean,
stringToDateAnsiMode: Boolean,
val castToJsonString: Boolean = false) extends Serializable {
val castToJsonString: Boolean = false,
val ignoreNullFieldsInStructs: Boolean = true) extends Serializable {

/**
* Retuns the left bracket to use when surrounding brackets when converting
Expand Down Expand Up @@ -1086,7 +1087,7 @@ object GpuCast {
* The main differences are:
*
* - Struct field names are included
* - Null fields are omitted
* - Null fields are optionally omitted
*/
def castStructToJsonString(input: ColumnView,
inputSchema: Array[StructField],
Expand All @@ -1098,36 +1099,66 @@ object GpuCast {
colon: ColumnVector,
quote: ColumnVector): ColumnVector = {
val jsonName = StringEscapeUtils.escapeJson(inputSchema(fieldIndex).name)
val needsQuoting = inputSchema(fieldIndex).dataType == DataTypes.StringType
val dataType = inputSchema(fieldIndex).dataType
val needsQuoting = dataType == DataTypes.StringType
withResource(input.getChildColumnView(fieldIndex)) { cv =>
withResource(ArrayBuffer.empty[ColumnVector]) { attrColumns =>
// prefix with quoted column name followed by colon
withResource(Scalar.fromString("\"" + jsonName + "\"")) { name =>
attrColumns += ColumnVector.fromScalar(name, rowCount)
attrColumns += colon.incRefCount()
}
// write the value
val attrValue = castToString(cv, inputSchema(fieldIndex).dataType, options)
if (needsQuoting) {
attrColumns += quote.incRefCount()
attrColumns += escapeJsonString(attrValue)
attrColumns += quote.incRefCount()
if (options.ignoreNullFieldsInStructs) {
// write the value
val attrValue = castToString(cv, inputSchema(fieldIndex).dataType, options)
if (needsQuoting) {
attrColumns += quote.incRefCount()
attrColumns += escapeJsonString(attrValue)
attrColumns += quote.incRefCount()
} else {
attrColumns += attrValue
}
// now concatenate
val jsonAttr = withResource(Scalar.fromString("")) { emptyString =>
ColumnVector.stringConcatenate(emptyString, emptyString, attrColumns.toArray)
}
// add an empty string or the attribute
withResource(jsonAttr) { _ =>
withResource(cv.isNull) { isNull =>
withResource(Scalar.fromNull(DType.STRING)) { nullScalar =>
isNull.ifElse(nullScalar, jsonAttr)
}
}
}
} else {
attrColumns += attrValue
}
// now concatenate
val jsonAttr = withResource(Scalar.fromString("")) { emptyString =>
ColumnVector.stringConcatenate(emptyString, emptyString, attrColumns.toArray)
}
// add an empty string or the attribute
val jsonAttrOrEmptyString = withResource(jsonAttr) { _ =>
withResource(cv.isNull) { isNull =>
withResource(Scalar.fromNull(DType.STRING)) { nullScalar =>
isNull.ifElse(nullScalar, jsonAttr)
val jsonAttr = withResource(ArrayBuffer.empty[ColumnVector]) { attrValues =>
withResource(castToString(cv, inputSchema(fieldIndex).dataType, options)) {
attrValue =>
if (needsQuoting) {
attrValues += quote.incRefCount()
attrValues += escapeJsonString(attrValue.incRefCount())
attrValues += quote.incRefCount()
withResource(Scalar.fromString("")) { emptyString =>
ColumnVector.stringConcatenate(emptyString, emptyString, attrValues.toArray)
}
} else {
attrValue.incRefCount()
}
}
}
// add attribute value, or null literal string if value is null
attrColumns += withResource(jsonAttr) { _ =>
withResource(cv.isNull) { isNull =>
withResource(Scalar.fromString("null")) { nullScalar =>
isNull.ifElse(nullScalar, jsonAttr)
}
}
}
// now concatenate
withResource(Scalar.fromString("")) { emptyString =>
ColumnVector.stringConcatenate(emptyString, emptyString, attrColumns.toArray)
}
}
jsonAttrOrEmptyString
}
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3604,9 +3604,6 @@ object GpuOverrides extends Logging {
))),
(a, conf, p, r) => new UnaryExprMeta[StructsToJson](a, conf, p, r) {
override def tagExprForGpu(): Unit = {
if (a.options.get("ignoreNullFields").exists(_.equalsIgnoreCase("false"))) {
willNotWorkOnGpu("to_json option ignore_null_fields=false is not supported")
}
if (a.options.get("pretty").exists(_.equalsIgnoreCase("true"))) {
willNotWorkOnGpu("to_json option pretty=true is not supported")
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,15 +20,20 @@ import ai.rapids.cudf.ColumnVector
import com.nvidia.spark.rapids.{CastOptions, GpuCast, GpuColumnVector, GpuUnaryExpression}

import org.apache.spark.sql.catalyst.expressions.Expression
import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.types.{DataType, StringType, StructType}

case class GpuStructsToJson(
options: Map[String, String],
child: Expression,
timeZoneId: Option[String] = None) extends GpuUnaryExpression {
override protected def doColumnar(input: GpuColumnVector): ColumnVector = {
val ignoreNullFields = options.getOrElse("ignoreNullFields", SQLConf.get.getConfString(
SQLConf.JSON_GENERATOR_IGNORE_NULL_FIELDS.key)).toBoolean
GpuCast.castStructToJsonString(input.getBase, child.dataType.asInstanceOf[StructType].fields,
new CastOptions(false, false, false, true))
new CastOptions(legacyCastComplexTypesToString = false, ansiMode = false,
stringToDateAnsiMode = false, castToJsonString = true,
ignoreNullFieldsInStructs = ignoreNullFields))
}

override def dataType: DataType = StringType
Expand Down

0 comments on commit f25fa42

Please sign in to comment.