-
Notifications
You must be signed in to change notification settings - Fork 2.2k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Arrow: Add support for TimeType / UUIDType #2739
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -30,6 +30,7 @@ | |
import org.apache.arrow.vector.Float4Vector; | ||
import org.apache.arrow.vector.Float8Vector; | ||
import org.apache.arrow.vector.IntVector; | ||
import org.apache.arrow.vector.TimeMicroVector; | ||
import org.apache.arrow.vector.TimeStampMicroTZVector; | ||
import org.apache.arrow.vector.TimeStampMicroVector; | ||
import org.apache.arrow.vector.types.FloatingPointPrecision; | ||
|
@@ -108,6 +109,8 @@ private enum ReadType { | |
FLOAT, | ||
DOUBLE, | ||
TIMESTAMP_MILLIS, | ||
TIME_MICROS, | ||
UUID, | ||
DICTIONARY | ||
} | ||
|
||
|
@@ -169,6 +172,9 @@ public VectorHolder read(VectorHolder reuse, int numValsToRead) { | |
case TIMESTAMP_MILLIS: | ||
vectorizedColumnIterator.nextBatchTimestampMillis(vec, typeWidth, nullabilityHolder); | ||
break; | ||
case UUID: | ||
vectorizedColumnIterator.nextBatchFixedSizeBinary(vec, typeWidth, nullabilityHolder); | ||
break; | ||
} | ||
} | ||
} | ||
|
@@ -178,6 +184,7 @@ public VectorHolder read(VectorHolder reuse, int numValsToRead) { | |
nullabilityHolder, icebergField.type()); | ||
} | ||
|
||
@SuppressWarnings("MethodLength") | ||
private void allocateFieldVector(boolean dictionaryEncodedVector) { | ||
if (dictionaryEncodedVector) { | ||
Field field = new Field( | ||
|
@@ -240,6 +247,12 @@ private void allocateFieldVector(boolean dictionaryEncodedVector) { | |
this.readType = ReadType.LONG; | ||
this.typeWidth = (int) BigIntVector.TYPE_WIDTH; | ||
break; | ||
case TIME_MICROS: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. as discussed offline do we want to add support for MILLIs? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It looks like Parquet's There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I will look into supporting There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. after some investigation, supporting |
||
this.vec = arrowField.createVector(rootAlloc); | ||
((TimeMicroVector) vec).allocateNew(batchSize); | ||
this.readType = ReadType.LONG; | ||
this.typeWidth = 8; | ||
break; | ||
case DECIMAL: | ||
this.vec = arrowField.createVector(rootAlloc); | ||
((DecimalVector) vec).allocateNew(batchSize); | ||
|
@@ -269,11 +282,17 @@ private void allocateFieldVector(boolean dictionaryEncodedVector) { | |
} else { | ||
switch (primitive.getPrimitiveTypeName()) { | ||
case FIXED_LEN_BYTE_ARRAY: | ||
int len = ((Types.FixedType) icebergField.type()).length(); | ||
int len; | ||
if (icebergField.type() instanceof Types.UUIDType) { | ||
len = 16; | ||
this.readType = ReadType.UUID; | ||
} else { | ||
len = ((Types.FixedType) icebergField.type()).length(); | ||
this.readType = ReadType.FIXED_WIDTH_BINARY; | ||
} | ||
this.vec = arrowField.createVector(rootAlloc); | ||
vec.setInitialCapacity(batchSize * len); | ||
vec.allocateNew(); | ||
this.readType = ReadType.FIXED_WIDTH_BINARY; | ||
this.typeWidth = len; | ||
break; | ||
case BINARY: | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
How come only UUID was added to this switch?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
TIME_MICROS
andTIMESTAMP_MICROS
are evaluated asLONG
: https://github.com/nastra/iceberg/blob/21af7bcd73f450e997d1af085634567a734a16b9/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/VectorizedArrowReader.java#L240-L255so both are basically handled in the
LONG
part of that switch statement. OnlyUUID
needs to be handled differently