Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Java] Added Java bindings for Parquet options for binary read #11410

Merged
merged 6 commits into from
Aug 3, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
55 changes: 52 additions & 3 deletions java/src/main/java/ai/rapids/cudf/ParquetOptions.java
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,10 @@

package ai.rapids.cudf;

import java.util.ArrayList;
import java.util.Collection;
import java.util.List;

/**
* Options for reading a parquet file
*/
Expand All @@ -26,24 +30,32 @@ public class ParquetOptions extends ColumnFilterOptions {
public static ParquetOptions DEFAULT = new ParquetOptions(new Builder());

private final DType unit;


private final boolean[] readBinaryAsString;

private ParquetOptions(Builder builder) {
super(builder);
unit = builder.unit;
readBinaryAsString = new boolean[builder.binaryAsStringColumns.size()];
for (int i = 0 ; i < builder.binaryAsStringColumns.size() ; i++) {
readBinaryAsString[i] = builder.binaryAsStringColumns.get(i);
}
}

DType timeUnit() {
return unit;
}

public static Builder builder() {
boolean[] getReadBinaryAsString() {
return readBinaryAsString;
}

public static ParquetOptions.Builder builder() {
return new Builder();
}

public static class Builder extends ColumnFilterOptions.Builder<Builder> {
private DType unit = DType.EMPTY;
final List<Boolean> binaryAsStringColumns = new ArrayList<>();

/**
* Specify the time unit to use when returning timestamps.
Expand All @@ -56,6 +68,43 @@ public Builder withTimeUnit(DType unit) {
return this;
}

/**
* Include one or more specific columns. Any column not included will not be read.
* @param names the name of the column, or more than one if you want.
*/
@Override
public Builder includeColumn(String... names) {
razajafri marked this conversation as resolved.
Show resolved Hide resolved
super.includeColumn(names);
for (int i = 0 ; i < names.length ; i++) {
binaryAsStringColumns.add(true);
}
return this;
}

/**
* Include this column.
* @param name the name of the column
* @param isBinary whether this column is to be read in as binary
*/
public Builder includeColumn(String name, boolean isBinary) {
includeColumnNames.add(name);
razajafri marked this conversation as resolved.
Show resolved Hide resolved
binaryAsStringColumns.add(!isBinary);
return this;
}

/**
* Include one or more specific columns. Any column not included will not be read.
* @param names the name of the column, or more than one if you want.
*/
@Override
public Builder includeColumn(Collection<String> names) {
razajafri marked this conversation as resolved.
Show resolved Hide resolved
super.includeColumn(names);
for (int i = 0 ; i < names.size() ; i++) {
binaryAsStringColumns.add(true);
}
return this;
}

public ParquetOptions build() {
return new ParquetOptions(this);
}
Expand Down
7 changes: 4 additions & 3 deletions java/src/main/java/ai/rapids/cudf/Table.java
Original file line number Diff line number Diff line change
Expand Up @@ -243,12 +243,13 @@ private static native long[] readJSON(String[] columnNames,
* Read in Parquet formatted data.
* @param filterColumnNames name of the columns to read, or an empty array if we want to read
* all of them
* @param binaryToString whether to convert this column to String if binary
* @param filePath the path of the file to read, or null if no path should be read.
* @param address the address of the buffer to read from or 0 if we should not.
* @param length the length of the buffer to read from.
* @param timeUnit return type of TimeStamp in units
*/
private static native long[] readParquet(String[] filterColumnNames, String filePath,
private static native long[] readParquet(String[] filterColumnNames, boolean[] binaryToString, String filePath,
long address, long length, int timeUnit) throws CudfException;

/**
Expand Down Expand Up @@ -956,7 +957,7 @@ public static Table readParquet(File path) {
* @return the file parsed as a table on the GPU.
*/
public static Table readParquet(ParquetOptions opts, File path) {
return new Table(readParquet(opts.getIncludeColumnNames(),
return new Table(readParquet(opts.getIncludeColumnNames(), opts.getReadBinaryAsString(),
path.getAbsolutePath(), 0, 0, opts.timeUnit().typeId.getNativeId()));
}

Expand Down Expand Up @@ -1016,7 +1017,7 @@ public static Table readParquet(ParquetOptions opts, HostMemoryBuffer buffer,
assert len > 0;
assert len <= buffer.getLength() - offset;
assert offset >= 0 && offset < buffer.length;
return new Table(readParquet(opts.getIncludeColumnNames(),
return new Table(readParquet(opts.getIncludeColumnNames(), opts.getReadBinaryAsString(),
null, buffer.getAddress() + offset, len, opts.timeUnit().typeId.getNativeId()));
}

Expand Down
14 changes: 8 additions & 6 deletions java/src/main/native/src/TableJni.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1428,11 +1428,11 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_readJSON(
CATCH_STD(env, NULL);
}

JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_readParquet(JNIEnv *env, jclass,
jobjectArray filter_col_names,
jstring inputfilepath,
jlong buffer,
jlong buffer_length, jint unit) {
JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_readParquet(
JNIEnv *env, jclass, jobjectArray filter_col_names, jbooleanArray j_col_binary_read,
jstring inputfilepath, jlong buffer, jlong buffer_length, jint unit) {

JNI_NULL_CHECK(env, j_col_binary_read, "null col_binary_read", 0);
bool read_buffer = true;
if (buffer == 0) {
JNI_NULL_CHECK(env, inputfilepath, "input file or buffer must be supplied", NULL);
Expand All @@ -1454,14 +1454,16 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_readParquet(JNIEnv *env,
}

cudf::jni::native_jstringArray n_filter_col_names(env, filter_col_names);
cudf::jni::native_jbooleanArray n_col_binary_read(env, j_col_binary_read);

auto source = read_buffer ? cudf::io::source_info(reinterpret_cast<char *>(buffer),
static_cast<std::size_t>(buffer_length)) :
cudf::io::source_info(filename.get());

auto builder = cudf::io::parquet_reader_options::builder(source);
if (n_filter_col_names.size() > 0) {
builder = builder.columns(n_filter_col_names.as_cpp_vector());
builder = builder.columns(n_filter_col_names.as_cpp_vector())
.convert_binary_to_strings(n_col_binary_read.to_vector<bool>());
}

cudf::io::parquet_reader_options opts =
Expand Down
14 changes: 14 additions & 0 deletions java/src/test/java/ai/rapids/cudf/TableTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,7 @@

public class TableTest extends CudfTestBase {
private static final File TEST_PARQUET_FILE = TestUtils.getResourceAsFile("acq.parquet");
private static final File TEST_PARQUET_FILE_BINARY = TestUtils.getResourceAsFile("binary.parquet");
private static final File TEST_ORC_FILE = TestUtils.getResourceAsFile("TestOrcFile.orc");
private static final File TEST_ORC_TIMESTAMP_DATE_FILE = TestUtils.getResourceAsFile("timestamp-date-test.orc");
private static final File TEST_DECIMAL_PARQUET_FILE = TestUtils.getResourceAsFile("decimal.parquet");
Expand Down Expand Up @@ -566,6 +567,19 @@ void testReadParquet() {
}
}

@Test
void testReadParquetBinary() {
ParquetOptions opts = ParquetOptions.builder()
.includeColumn("value1", true)
.includeColumn("value2", false)
.build();
try (Table table = Table.readParquet(opts, TEST_PARQUET_FILE_BINARY)) {
assertTableTypes(new DType[]{DType.LIST, DType.STRING}, table);
ColumnView columnView = table.getColumn(0);
assertEquals(DType.INT8, columnView.getChildColumnView(0).getType());
}
}

@Test
void testReadParquetBuffer() throws IOException {
ParquetOptions opts = ParquetOptions.builder()
Expand Down
Binary file added java/src/test/resources/binary.parquet
Binary file not shown.