Skip to content

Commit

Permalink
Fix parsing of Parquet legacy list-of-struct format
Browse files: browse the repository at this point in the history
  • Loading branch information
jlowe committed Oct 4, 2023
1 parent 8eef296 commit 3a0a595
Show file tree
Hide file tree
Showing 2 changed files with 20 additions and 20 deletions.
1 change: 0 additions & 1 deletion integration_tests/src/main/python/parquet_testing_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,6 @@
"hadoop_lz4_compressed.parquet": "cudf does not support Hadoop LZ4 format",
"hadoop_lz4_compressed_larger.parquet": "cudf does not support Hadoop LZ4 format",
"nested_structs.rust.parquet": "PySpark cannot handle year 52951",
"repeated_no_annotation.parquet": "https://github.com/NVIDIA/spark-rapids/issues/8631",
}
if is_before_spark_330():
_xfail_files["rle_boolean_encoding.parquet"] = "Spark CPU cannot decode V2 style RLE before 3.3.x"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -890,26 +890,27 @@ private case class GpuParquetFileFilterHandler(
}
} else {
val fileGroupType = fileType.asGroupType()
if (fileGroupType.getFieldCount != 1 ||
!fileGroupType.getType(0).isRepetition(Type.Repetition.REPEATED)) {
// LIST column must have a single, REPEATED child field.
// Otherwise, signal error.
errorCallback(fileType, readType)
if (fileGroupType.getFieldCount > 1 &&
fileGroupType.isRepetition(Type.Repetition.REPEATED)) {
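// Legacy array format: the file group is itself REPEATED and has more than one field,
// so the group is treated as the struct element and checked directly against the
// array's element type.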
checkSchemaCompat(fileGroupType, array.elementType, errorCallback, isCaseSensitive,
useFieldId, rootFileType, rootReadType)
} else {
val repeatedType = fileGroupType.getType(0)
val childType =
if (isElementType(repeatedType, fileType.getName)) {
// Legacy element, per Parquet LogicalType backward compatibility rules.
// Retain the child as the element type.
repeatedType
}
else {
// Conforms to current Parquet LogicalType rules.
// Unwrap child group layer, and use grandchild's element type.
repeatedType.asGroupType().getType(0)
}
checkSchemaCompat(childType, array.elementType, errorCallback, isCaseSensitive,
useFieldId, rootFileType, rootReadType)
}
val repeatedType = fileGroupType.getType(0)
val childType =
if (isElementType(repeatedType, fileType.getName)) {
// Legacy element, per Parquet LogicalType backward compatibility rules.
// Retain the child as the element type.
repeatedType
}
else {
// Conforms to current Parquet LogicalType rules.
// Unwrap child group layer, and use grandchild's element type.
repeatedType.asGroupType().getType(0)
}
checkSchemaCompat(childType, array.elementType, errorCallback, isCaseSensitive,
useFieldId, rootFileType, rootReadType)
}

case map: MapType =>
Expand Down

0 comments on commit 3a0a595

Please sign in to comment.