Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

WIP: Add support for reading structs in GpuJsonScan #10325

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
55 changes: 52 additions & 3 deletions integration_tests/src/main/python/json_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -390,6 +390,53 @@ def test_read_valid_json(spark_tmp_table_factory, std_input_path, read_func, fil
{}),
conf=conf)

@pytest.mark.parametrize('filename', ['nested-structs.ndjson'])
@pytest.mark.parametrize('schema', [
StructType([StructField('teacher', StringType())]),
StructType([
StructField('student', StructType([
StructField('name', StringType()),
StructField('age', IntegerType())
]))
]),
StructType([
StructField('teacher', StringType()),
StructField('student', StructType([
StructField('name', StringType()),
StructField('age', IntegerType())
]))
]),
])
@pytest.mark.parametrize('read_func', [read_json_df, read_json_sql])
@pytest.mark.parametrize('v1_enabled_list', ["", "json"])
def test_read_nested_struct(spark_tmp_table_factory, std_input_path, read_func, filename, schema, v1_enabled_list):
conf = copy_and_update(_enable_all_types_conf, {'spark.sql.sources.useV1SourceList': v1_enabled_list})
assert_gpu_and_cpu_are_equal_collect(
read_func(std_input_path + '/' + filename,
schema,
spark_tmp_table_factory,
{}),
conf=conf)

@pytest.mark.parametrize('filename', ['optional-fields.ndjson'])
@pytest.mark.parametrize('schema', [
StructType([StructField('teacher', StringType())]),
StructType([StructField('student', StringType())]),
StructType([
StructField('teacher', StringType()),
StructField('student', StringType())
]),
])
@pytest.mark.parametrize('read_func', [read_json_df, read_json_sql])
@pytest.mark.parametrize('v1_enabled_list', ["", "json"])
def test_read_optional_fields(spark_tmp_table_factory, std_input_path, read_func, filename, schema, v1_enabled_list):
conf = copy_and_update(_enable_all_types_conf, {'spark.sql.sources.useV1SourceList': v1_enabled_list})
assert_gpu_and_cpu_are_equal_collect(
read_func(std_input_path + '/' + filename,
schema,
spark_tmp_table_factory,
{}),
conf=conf)

# allow non gpu when time zone is non-UTC because of https://github.com/NVIDIA/spark-rapids/issues/9653'
not_utc_json_scan_allow=['FileSourceScanExec'] if is_not_utc() else []
Expand Down Expand Up @@ -794,9 +841,11 @@ def test_from_json_struct_timestamp_fallback_non_default_format(timestamp_gen, t
conf={"spark.rapids.sql.expression.JsonToStructs": True,
'spark.sql.legacy.timeParserPolicy': 'CORRECTED'})

@pytest.mark.parametrize('schema', ['struct<teacher:string>',
'struct<student:struct<name:string,age:int>>',
'struct<teacher:string,student:struct<name:string,age:int>>'])
@pytest.mark.parametrize('schema', [
'struct<teacher:string>',
'struct<student:struct<name:string,age:int>>',
'struct<teacher:string,student:struct<name:string,age:int>>'
])
@allow_non_gpu(*non_utc_allow)
def test_from_json_struct_of_struct(schema):
json_string_gen = StringGen(r'{"teacher": "[A-Z]{1}[a-z]{2,5}",' \
Expand Down
3 changes: 3 additions & 0 deletions integration_tests/src/test/resources/nested-structs.ndjson
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
{ "teacher": "Bob" }
{ "student": { "name": "Carol", "age": 21 } }
{ "teacher": "Bob", "student": { "name": "Carol", "age": 21 } }
3 changes: 3 additions & 0 deletions integration_tests/src/test/resources/optional-fields.ndjson
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
{ "teacher": "Bob" }
{ "student": "Carol" }
{ "teacher": "Bob", "student": "Carol" }
Original file line number Diff line number Diff line change
Expand Up @@ -894,7 +894,8 @@ object GpuOverrides extends Logging {
sparkSig = (TypeSig.cpuAtomics + TypeSig.STRUCT + TypeSig.ARRAY + TypeSig.MAP +
TypeSig.UDT).nested())),
(JsonFormatType, FileFormatChecks(
cudfRead = TypeSig.commonCudfTypes + TypeSig.DECIMAL_128,
cudfRead = (TypeSig.commonCudfTypes + TypeSig.DECIMAL_128 +
TypeSig.STRUCT + TypeSig.ARRAY + TypeSig.MAP).nested(),
cudfWrite = TypeSig.none,
sparkSig = (TypeSig.cpuAtomics + TypeSig.STRUCT + TypeSig.ARRAY + TypeSig.MAP +
TypeSig.UDT).nested())),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -316,7 +316,18 @@ abstract class GpuTextBasedPartitionReader[BUFF <: LineBufferer, FACT <: LineBuf
f
}
}))
val cudfSchema = GpuColumnVector.from(dataSchemaWithStrings)

val builder = Schema.builder
for (field <- dataSchemaWithStrings.fields) {
val dt = field.dataType match {
case _: StructType =>
// note we cannot specify to read primitives in the struct as strings yet
DType.STRUCT
case _ => GpuColumnVector.getNonNestedRapidsType(field.dataType)
}
builder.column(dt, field.name)
}
val cudfSchema = builder.build

// about to start using the GPU
GpuSemaphore.acquireIfNecessary(TaskContext.get())
Expand Down
Loading