From 3e883e3e42c0674e0b823f05ffffd1e868a02143 Mon Sep 17 00:00:00 2001 From: Raza Jafri Date: Wed, 27 Jul 2022 17:02:26 -0700 Subject: [PATCH 1/6] Added Parquet options for binary read --- .../ai/rapids/cudf/ColumnFilterOptions.java | 25 ++++++++++++++++++- .../java/ai/rapids/cudf/ParquetOptions.java | 2 -- java/src/main/java/ai/rapids/cudf/Table.java | 7 +++--- java/src/main/native/src/TableJni.cpp | 12 ++++----- 4 files changed, 34 insertions(+), 12 deletions(-) diff --git a/java/src/main/java/ai/rapids/cudf/ColumnFilterOptions.java b/java/src/main/java/ai/rapids/cudf/ColumnFilterOptions.java index 371fe9defa5..e9d261a31b9 100644 --- a/java/src/main/java/ai/rapids/cudf/ColumnFilterOptions.java +++ b/java/src/main/java/ai/rapids/cudf/ColumnFilterOptions.java @@ -29,18 +29,26 @@ public abstract class ColumnFilterOptions { // Names of the columns to be returned (other columns are skipped) // If empty all columns are returned. private final String[] includeColumnNames; + private final Boolean[] binaryRead; protected ColumnFilterOptions(Builder builder) { includeColumnNames = builder.includeColumnNames.toArray( new String[builder.includeColumnNames.size()]); + binaryRead = builder.binaryColumns.toArray( + new Boolean[builder.binaryColumns.size()]); } String[] getIncludeColumnNames() { return includeColumnNames; } + Boolean[] getIsBinaryRead() { + return binaryRead; + } + public static class Builder { final List includeColumnNames = new ArrayList<>(); + final List binaryColumns = new ArrayList<>(); /** * Include one or more specific columns. Any column not included will not be read. @@ -49,16 +57,31 @@ public static class Builder { public T includeColumn(String... names) { for (String name : names) { includeColumnNames.add(name); + binaryColumns.add(false); } return (T) this; } + /** + * Include this column. + * @param name the name of the column + * @param isBinary whether this column is to be read in as binary + */ + public T includeColumn(String name, boolean isBinary) { + includeColumnNames.add(name); + binaryColumns.add(isBinary); + return (T) this; + } + /** * Include one or more specific columns. Any column not included will not be read. * @param names the name of the column, or more than one if you want. */ public T includeColumn(Collection names) { - includeColumnNames.addAll(names); + for (String name: names) { + includeColumnNames.add(name); + binaryColumns.add(false); + } return (T) this; } } diff --git a/java/src/main/java/ai/rapids/cudf/ParquetOptions.java b/java/src/main/java/ai/rapids/cudf/ParquetOptions.java index dd771cab7ea..68b95336d29 100644 --- a/java/src/main/java/ai/rapids/cudf/ParquetOptions.java +++ b/java/src/main/java/ai/rapids/cudf/ParquetOptions.java @@ -27,8 +27,6 @@ public class ParquetOptions extends ColumnFilterOptions { private final DType unit; - - private ParquetOptions(Builder builder) { super(builder); unit = builder.unit; diff --git a/java/src/main/java/ai/rapids/cudf/Table.java b/java/src/main/java/ai/rapids/cudf/Table.java index c8f842fcc63..bc8b01f5637 100644 --- a/java/src/main/java/ai/rapids/cudf/Table.java +++ b/java/src/main/java/ai/rapids/cudf/Table.java @@ -243,12 +243,13 @@ private static native long[] readJSON(String[] columnNames, * Read in Parquet formatted data. * @param filterColumnNames name of the columns to read, or an empty array if we want to read * all of them + * @param binary whether to read this column as binary * @param filePath the path of the file to read, or null if no path should be read. * @param address the address of the buffer to read from or 0 if we should not. * @param length the length of the buffer to read from. * @param timeUnit return type of TimeStamp in units */ - private static native long[] readParquet(String[] filterColumnNames, String filePath, + private static native long[] readParquet(String[] filterColumnNames, Boolean[] binary, String filePath, long address, long length, int timeUnit) throws CudfException; /** @@ -956,7 +957,7 @@ public static Table readParquet(File path) { * @return the file parsed as a table on the GPU. */ public static Table readParquet(ParquetOptions opts, File path) { - return new Table(readParquet(opts.getIncludeColumnNames(), + return new Table(readParquet(opts.getIncludeColumnNames(), opts.getIsBinaryRead(), path.getAbsolutePath(), 0, 0, opts.timeUnit().typeId.getNativeId())); } @@ -1016,7 +1017,7 @@ public static Table readParquet(ParquetOptions opts, HostMemoryBuffer buffer, assert len > 0; assert len <= buffer.getLength() - offset; assert offset >= 0 && offset < buffer.length; - return new Table(readParquet(opts.getIncludeColumnNames(), + return new Table(readParquet(opts.getIncludeColumnNames(), opts.getIsBinaryRead(), null, buffer.getAddress() + offset, len, opts.timeUnit().typeId.getNativeId())); } diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp index d511512431b..82e55a68acc 100644 --- a/java/src/main/native/src/TableJni.cpp +++ b/java/src/main/native/src/TableJni.cpp @@ -1428,11 +1428,9 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_readJSON( CATCH_STD(env, NULL); } -JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_readParquet(JNIEnv *env, jclass, - jobjectArray filter_col_names, - jstring inputfilepath, - jlong buffer, - jlong buffer_length, jint unit) { +JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_readParquet( + JNIEnv *env, jclass, jobjectArray filter_col_names, jbooleanArray col_binary_read, + jstring inputfilepath, jlong buffer, jlong buffer_length, jint unit) { bool read_buffer = true; if (buffer == 0) { JNI_NULL_CHECK(env, inputfilepath, "input file or buffer must be supplied", NULL); @@ -1454,6 +1452,7 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_readParquet(JNIEnv *env, } cudf::jni::native_jstringArray n_filter_col_names(env, filter_col_names); + cudf::jni::native_jbooleanArray n_col_binary_read(env, col_binary_read); auto source = read_buffer ? cudf::io::source_info(reinterpret_cast(buffer), static_cast(buffer_length)) : @@ -1461,7 +1460,8 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_readParquet(JNIEnv *env, auto builder = cudf::io::parquet_reader_options::builder(source); if (n_filter_col_names.size() > 0) { - builder = builder.columns(n_filter_col_names.as_cpp_vector()); + builder = builder.columns(n_filter_col_names.as_cpp_vector()) + .convert_binary_to_strings(n_col_binary_read.to_vector()); } cudf::io::parquet_reader_options opts = From a009c02455da236ffc5920dceeeb5bde42f7623a Mon Sep 17 00:00:00 2001 From: Raza Jafri Date: Mon, 1 Aug 2022 23:16:09 -0700 Subject: [PATCH 2/6] return primitive boolean --- .../ai/rapids/cudf/ColumnFilterOptions.java | 20 ++++++++++-------- java/src/main/java/ai/rapids/cudf/Table.java | 8 +++---- java/src/main/native/src/TableJni.cpp | 6 ++++-- .../test/java/ai/rapids/cudf/TableTest.java | 13 ++++++++++++ java/src/test/resources/binary.parquet | Bin 0 -> 467 bytes 5 files changed, 32 insertions(+), 15 deletions(-) create mode 100644 java/src/test/resources/binary.parquet diff --git a/java/src/main/java/ai/rapids/cudf/ColumnFilterOptions.java b/java/src/main/java/ai/rapids/cudf/ColumnFilterOptions.java index e9d261a31b9..2f4c058e6d3 100644 --- a/java/src/main/java/ai/rapids/cudf/ColumnFilterOptions.java +++ b/java/src/main/java/ai/rapids/cudf/ColumnFilterOptions.java @@ -29,26 +29,28 @@ public abstract class ColumnFilterOptions { // Names of the columns to be returned (other columns are skipped) // If empty all columns are returned. private final String[] includeColumnNames; - private final Boolean[] binaryRead; + private final boolean[] readBinaryAsString; protected ColumnFilterOptions(Builder builder) { includeColumnNames = builder.includeColumnNames.toArray( new String[builder.includeColumnNames.size()]); - binaryRead = builder.binaryColumns.toArray( - new Boolean[builder.binaryColumns.size()]); + readBinaryAsString = new boolean[builder.binaryAsStringColumns.size()]; + for (int i = 0 ; i < builder.binaryAsStringColumns.size() ; i++) { + readBinaryAsString[i] = builder.binaryAsStringColumns.get(i); + } } String[] getIncludeColumnNames() { return includeColumnNames; } - Boolean[] getIsBinaryRead() { - return binaryRead; + boolean[] getConvertToBinaryRead() { + return readBinaryAsString; } public static class Builder { final List includeColumnNames = new ArrayList<>(); - final List binaryColumns = new ArrayList<>(); + final List binaryAsStringColumns = new ArrayList<>(); /** * Include one or more specific columns. Any column not included will not be read. @@ -57,7 +59,7 @@ public static class Builder { public T includeColumn(String... names) { for (String name : names) { includeColumnNames.add(name); - binaryColumns.add(false); + binaryAsStringColumns.add(true); } return (T) this; } @@ -69,7 +71,7 @@ public T includeColumn(String... names) { */ public T includeColumn(String name, boolean isBinary) { includeColumnNames.add(name); - binaryColumns.add(isBinary); + binaryAsStringColumns.add(!isBinary); return (T) this; } @@ -80,7 +82,7 @@ public T includeColumn(String name, boolean isBinary) { public T includeColumn(Collection names) { for (String name: names) { includeColumnNames.add(name); - binaryColumns.add(false); + binaryAsStringColumns.add(true); } return (T) this; } diff --git a/java/src/main/java/ai/rapids/cudf/Table.java b/java/src/main/java/ai/rapids/cudf/Table.java index bc8b01f5637..f2c9bffba5c 100644 --- a/java/src/main/java/ai/rapids/cudf/Table.java +++ b/java/src/main/java/ai/rapids/cudf/Table.java @@ -243,13 +243,13 @@ private static native long[] readJSON(String[] columnNames, * Read in Parquet formatted data. * @param filterColumnNames name of the columns to read, or an empty array if we want to read * all of them - * @param binary whether to read this column as binary + * @param binaryToString whether to convert this column to String if binary * @param filePath the path of the file to read, or null if no path should be read. * @param address the address of the buffer to read from or 0 if we should not. * @param length the length of the buffer to read from. * @param timeUnit return type of TimeStamp in units */ - private static native long[] readParquet(String[] filterColumnNames, Boolean[] binary, String filePath, + private static native long[] readParquet(String[] filterColumnNames, boolean[] binaryToString, String filePath, long address, long length, int timeUnit) throws CudfException; /** @@ -957,7 +957,7 @@ public static Table readParquet(File path) { * @return the file parsed as a table on the GPU. */ public static Table readParquet(ParquetOptions opts, File path) { - return new Table(readParquet(opts.getIncludeColumnNames(), opts.getIsBinaryRead(), + return new Table(readParquet(opts.getIncludeColumnNames(), opts.getConvertToBinaryRead(), path.getAbsolutePath(), 0, 0, opts.timeUnit().typeId.getNativeId())); } @@ -1017,7 +1017,7 @@ public static Table readParquet(ParquetOptions opts, HostMemoryBuffer buffer, assert len > 0; assert len <= buffer.getLength() - offset; assert offset >= 0 && offset < buffer.length; - return new Table(readParquet(opts.getIncludeColumnNames(), opts.getIsBinaryRead(), + return new Table(readParquet(opts.getIncludeColumnNames(), opts.getConvertToBinaryRead(), null, buffer.getAddress() + offset, len, opts.timeUnit().typeId.getNativeId())); } diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp index 82e55a68acc..4623f1e0ce2 100644 --- a/java/src/main/native/src/TableJni.cpp +++ b/java/src/main/native/src/TableJni.cpp @@ -1429,8 +1429,10 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_readJSON( } JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_readParquet( - JNIEnv *env, jclass, jobjectArray filter_col_names, jbooleanArray col_binary_read, + JNIEnv *env, jclass, jobjectArray filter_col_names, jbooleanArray j_col_binary_read, jstring inputfilepath, jlong buffer, jlong buffer_length, jint unit) { + + JNI_NULL_CHECK(env, j_col_binary_read, "null col_binary_read", 0); bool read_buffer = true; if (buffer == 0) { JNI_NULL_CHECK(env, inputfilepath, "input file or buffer must be supplied", NULL); @@ -1452,7 +1454,7 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_readParquet( } cudf::jni::native_jstringArray n_filter_col_names(env, filter_col_names); - cudf::jni::native_jbooleanArray n_col_binary_read(env, col_binary_read); + cudf::jni::native_jbooleanArray n_col_binary_read(env, j_col_binary_read); auto source = read_buffer ? cudf::io::source_info(reinterpret_cast(buffer), static_cast(buffer_length)) : diff --git a/java/src/test/java/ai/rapids/cudf/TableTest.java b/java/src/test/java/ai/rapids/cudf/TableTest.java index 7ef47d6a7cc..1bd67562863 100644 --- a/java/src/test/java/ai/rapids/cudf/TableTest.java +++ b/java/src/test/java/ai/rapids/cudf/TableTest.java @@ -78,6 +78,7 @@ public class TableTest extends CudfTestBase { private static final File TEST_PARQUET_FILE = TestUtils.getResourceAsFile("acq.parquet"); + private static final File TEST_PARQUET_FILE_BINARY = TestUtils.getResourceAsFile("binary.parquet"); private static final File TEST_ORC_FILE = TestUtils.getResourceAsFile("TestOrcFile.orc"); private static final File TEST_ORC_TIMESTAMP_DATE_FILE = TestUtils.getResourceAsFile("timestamp-date-test.orc"); private static final File TEST_DECIMAL_PARQUET_FILE = TestUtils.getResourceAsFile("decimal.parquet"); @@ -566,6 +567,18 @@ void testReadParquet() { } } + @Test + void testReadParquetBinary() { + ParquetOptions opts = ParquetOptions.builder() + .includeColumn("value", true) + .build(); + try (Table table = Table.readParquet(opts, TEST_PARQUET_FILE_BINARY)) { + assertTableTypes(new DType[]{DType.LIST}, table); + ColumnView columnView = table.getColumn(0); + assertEquals(DType.INT8, columnView.getChildColumnView(0).getType()); + } + } + @Test void testReadParquetBuffer() throws IOException { ParquetOptions opts = ParquetOptions.builder() diff --git a/java/src/test/resources/binary.parquet b/java/src/test/resources/binary.parquet new file mode 100644 index 0000000000000000000000000000000000000000..5cf3f1117978ff653664e915e230763b0402490d GIT binary patch literal 467 zcmZ`$T}#6-6iutvGK%0sN(&WAz_CFMt!3RheDQ7gG<;M<(k0buKencwjIsaZPw=OC zH@YYBAvZVooO{nnGQWQeFv1`B$-^2X8rGc%4SG!SbiX9CbjBs@JVgkNW-7#!a~SuX8tJVx6;?|{&izJ;1|J2g!W_&*`8%p^&4eN25R&mOxL28SyftRAq{ANtl=Mq zT;&uzu3FL1y85DpXc%V=v!1u4*(?Q_64})4q9GnhGm&#irT~?!*h7b#0Map&tYq~D z&SjhDEXjcay}~2Msbsp{?Ot}sdgwH7irVc=y5P{KfjdarEaxtp#5|p(aU9+zkw~K` g47nfqQ7lIOB2LEf*iS^lgQ4v>@CcQH&>X(=7Zv?}=Kufz literal 0 HcmV?d00001 From 8295600eeedbc6715e384b46830a96a1da428019 Mon Sep 17 00:00:00 2001 From: Raza Jafri Date: Tue, 2 Aug 2022 11:59:13 -0700 Subject: [PATCH 3/6] addressed review comments --- .../ai/rapids/cudf/ColumnFilterOptions.java | 2 +- java/src/main/java/ai/rapids/cudf/Table.java | 4 ++-- .../src/test/java/ai/rapids/cudf/TableTest.java | 5 +++-- java/src/test/resources/binary.parquet | Bin 467 -> 653 bytes 4 files changed, 6 insertions(+), 5 deletions(-) diff --git a/java/src/main/java/ai/rapids/cudf/ColumnFilterOptions.java b/java/src/main/java/ai/rapids/cudf/ColumnFilterOptions.java index 2f4c058e6d3..b42cf2ad3ef 100644 --- a/java/src/main/java/ai/rapids/cudf/ColumnFilterOptions.java +++ b/java/src/main/java/ai/rapids/cudf/ColumnFilterOptions.java @@ -44,7 +44,7 @@ String[] getIncludeColumnNames() { return includeColumnNames; } - boolean[] getConvertToBinaryRead() { + boolean[] getReadBinaryAsString() { return readBinaryAsString; } diff --git a/java/src/main/java/ai/rapids/cudf/Table.java b/java/src/main/java/ai/rapids/cudf/Table.java index f2c9bffba5c..5fe9b064323 100644 --- a/java/src/main/java/ai/rapids/cudf/Table.java +++ b/java/src/main/java/ai/rapids/cudf/Table.java @@ -957,7 +957,7 @@ public static Table readParquet(File path) { * @return the file parsed as a table on the GPU. */ public static Table readParquet(ParquetOptions opts, File path) { - return new Table(readParquet(opts.getIncludeColumnNames(), opts.getConvertToBinaryRead(), + return new Table(readParquet(opts.getIncludeColumnNames(), opts.getReadBinaryAsString(), path.getAbsolutePath(), 0, 0, opts.timeUnit().typeId.getNativeId())); } @@ -1017,7 +1017,7 @@ public static Table readParquet(ParquetOptions opts, HostMemoryBuffer buffer, assert len > 0; assert len <= buffer.getLength() - offset; assert offset >= 0 && offset < buffer.length; - return new Table(readParquet(opts.getIncludeColumnNames(), opts.getConvertToBinaryRead(), + return new Table(readParquet(opts.getIncludeColumnNames(), opts.getReadBinaryAsString(), null, buffer.getAddress() + offset, len, opts.timeUnit().typeId.getNativeId())); } diff --git a/java/src/test/java/ai/rapids/cudf/TableTest.java b/java/src/test/java/ai/rapids/cudf/TableTest.java index 1bd67562863..c7e6fecea26 100644 --- a/java/src/test/java/ai/rapids/cudf/TableTest.java +++ b/java/src/test/java/ai/rapids/cudf/TableTest.java @@ -570,10 +570,11 @@ void testReadParquet() { @Test void testReadParquetBinary() { ParquetOptions opts = ParquetOptions.builder() - .includeColumn("value", true) + .includeColumn("value1", true) + .includeColumn("value2", false) .build(); try (Table table = Table.readParquet(opts, TEST_PARQUET_FILE_BINARY)) { - assertTableTypes(new DType[]{DType.LIST}, table); + assertTableTypes(new DType[]{DType.LIST, DType.STRING}, table); ColumnView columnView = table.getColumn(0); assertEquals(DType.INT8, columnView.getChildColumnView(0).getType()); } diff --git a/java/src/test/resources/binary.parquet b/java/src/test/resources/binary.parquet index 5cf3f1117978ff653664e915e230763b0402490d..b72be9f36cc6e286aaee5927b6730b64e89240aa 100644 GIT binary patch literal 653 zcmb_aO>4qH6x`TqQxJOEkU$P$p`ws3iDES3rMKEs@LEdQm`x4cM9qgq#J^B_^x&WD zKj_mcXiE=;!t&U+Z)TWTo_9O6F~S}mnplQQ1g9aQQ?5z_giyU(f_ipUhC1d# zOr%S1H10SBrM*JzWxD5d#E2JEL}jK6hbgx00zk1BnMs6Tdr2-DZlww{^JU%_j}hj zacb;i8j#VVwo&(sNCXD!x$r5XmBMKr=)>!aiRoAVoEWPPDx{6U@-My z0+P(^96%B#%*DvV0aVGz!cm+kD9fh+)0!yCB&p-UQ(TZ(lpSB3oROLf6k!nMQDu@~ zElbQPO=S>c^O2O1lu_f55#^CIWnandKnF7b&17Hz gDQ27Oz!=CFJ$W*t=H$PO4ZJckK#6=t1_qEH0pLqC&j0`b From 36886e2d66f98b4622b9f8ea1b9b796db4a239e5 Mon Sep 17 00:00:00 2001 From: Raza Jafri Date: Tue, 2 Aug 2022 15:01:06 -0700 Subject: [PATCH 4/6] addressed review comments --- .../ai/rapids/cudf/ColumnFilterOptions.java | 14 +----- .../java/ai/rapids/cudf/ParquetOptions.java | 49 +++++++++++++++++++ 2 files changed, 50 insertions(+), 13 deletions(-) diff --git a/java/src/main/java/ai/rapids/cudf/ColumnFilterOptions.java b/java/src/main/java/ai/rapids/cudf/ColumnFilterOptions.java index b42cf2ad3ef..6f9d4d43e4a 100644 --- a/java/src/main/java/ai/rapids/cudf/ColumnFilterOptions.java +++ b/java/src/main/java/ai/rapids/cudf/ColumnFilterOptions.java @@ -29,28 +29,19 @@ public abstract class ColumnFilterOptions { // Names of the columns to be returned (other columns are skipped) // If empty all columns are returned. private final String[] includeColumnNames; - private final boolean[] readBinaryAsString; protected ColumnFilterOptions(Builder builder) { includeColumnNames = builder.includeColumnNames.toArray( new String[builder.includeColumnNames.size()]); - readBinaryAsString = new boolean[builder.binaryAsStringColumns.size()]; - for (int i = 0 ; i < builder.binaryAsStringColumns.size() ; i++) { - readBinaryAsString[i] = builder.binaryAsStringColumns.get(i); - } + } String[] getIncludeColumnNames() { return includeColumnNames; } - boolean[] getReadBinaryAsString() { - return readBinaryAsString; - } - public static class Builder { final List includeColumnNames = new ArrayList<>(); - final List binaryAsStringColumns = new ArrayList<>(); /** * Include one or more specific columns. Any column not included will not be read. @@ -59,7 +50,6 @@ public static class Builder { public T includeColumn(String... names) { for (String name : names) { includeColumnNames.add(name); - binaryAsStringColumns.add(true); } return (T) this; } @@ -71,7 +61,6 @@ public T includeColumn(String... names) { */ public T includeColumn(String name, boolean isBinary) { includeColumnNames.add(name); - binaryAsStringColumns.add(!isBinary); return (T) this; } @@ -82,7 +71,6 @@ public T includeColumn(String name, boolean isBinary) { public T includeColumn(Collection names) { for (String name: names) { includeColumnNames.add(name); - binaryAsStringColumns.add(true); } return (T) this; } diff --git a/java/src/main/java/ai/rapids/cudf/ParquetOptions.java b/java/src/main/java/ai/rapids/cudf/ParquetOptions.java index 68b95336d29..8c12d3fba71 100644 --- a/java/src/main/java/ai/rapids/cudf/ParquetOptions.java +++ b/java/src/main/java/ai/rapids/cudf/ParquetOptions.java @@ -18,6 +18,10 @@ package ai.rapids.cudf; +import java.util.ArrayList; +import java.util.Collection; +import java.util.List; + /** * Options for reading a parquet file */ @@ -26,22 +30,32 @@ public class ParquetOptions extends ColumnFilterOptions { public static ParquetOptions DEFAULT = new ParquetOptions(new Builder()); private final DType unit; + private final boolean[] readBinaryAsString; private ParquetOptions(Builder builder) { super(builder); unit = builder.unit; + readBinaryAsString = new boolean[builder.binaryAsStringColumns.size()]; + for (int i = 0 ; i < builder.binaryAsStringColumns.size() ; i++) { + readBinaryAsString[i] = builder.binaryAsStringColumns.get(i); + } } DType timeUnit() { return unit; } + boolean[] getReadBinaryAsString() { + return readBinaryAsString; + } + public static Builder builder() { return new Builder(); } public static class Builder extends ColumnFilterOptions.Builder { private DType unit = DType.EMPTY; + final List binaryAsStringColumns = new ArrayList<>(); /** * Specify the time unit to use when returning timestamps. @@ -54,6 +68,41 @@ public Builder withTimeUnit(DType unit) { return this; } + /** + * Include one or more specific columns. Any column not included will not be read. + * @param names the name of the column, or more than one if you want. + */ + public Builder includeColumn(String... names) { + for (String name : names) { + includeColumnNames.add(name); + binaryAsStringColumns.add(true); + } + return this; + } + + /** + * Include this column. + * @param name the name of the column + * @param isBinary whether this column is to be read in as binary + */ + public Builder includeColumn(String name, boolean isBinary) { + includeColumnNames.add(name); + binaryAsStringColumns.add(!isBinary); + return this; + } + + /** + * Include one or more specific columns. Any column not included will not be read. + * @param names the name of the column, or more than one if you want. + */ + public Builder includeColumn(Collection names) { + for (String name: names) { + includeColumnNames.add(name); + binaryAsStringColumns.add(true); + } + return this; + } + public ParquetOptions build() { return new ParquetOptions(this); } From c8a2ee697b38167901ee808655e2394a6a0dfce1 Mon Sep 17 00:00:00 2001 From: Raza Jafri Date: Tue, 2 Aug 2022 15:12:00 -0700 Subject: [PATCH 5/6] removed unnecessary changes --- .../java/ai/rapids/cudf/ColumnFilterOptions.java | 15 +-------------- .../main/java/ai/rapids/cudf/ParquetOptions.java | 2 +- 2 files changed, 2 insertions(+), 15 deletions(-) diff --git a/java/src/main/java/ai/rapids/cudf/ColumnFilterOptions.java b/java/src/main/java/ai/rapids/cudf/ColumnFilterOptions.java index 6f9d4d43e4a..371fe9defa5 100644 --- a/java/src/main/java/ai/rapids/cudf/ColumnFilterOptions.java +++ b/java/src/main/java/ai/rapids/cudf/ColumnFilterOptions.java @@ -33,7 +33,6 @@ public abstract class ColumnFilterOptions { protected ColumnFilterOptions(Builder builder) { includeColumnNames = builder.includeColumnNames.toArray( new String[builder.includeColumnNames.size()]); - } String[] getIncludeColumnNames() { @@ -54,24 +53,12 @@ public T includeColumn(String... names) { return (T) this; } - /** - * Include this column. - * @param name the name of the column - * @param isBinary whether this column is to be read in as binary - */ - public T includeColumn(String name, boolean isBinary) { - includeColumnNames.add(name); - return (T) this; - } - /** * Include one or more specific columns. Any column not included will not be read. * @param names the name of the column, or more than one if you want. */ public T includeColumn(Collection names) { - for (String name: names) { - includeColumnNames.add(name); - } + includeColumnNames.addAll(names); return (T) this; } } diff --git a/java/src/main/java/ai/rapids/cudf/ParquetOptions.java b/java/src/main/java/ai/rapids/cudf/ParquetOptions.java index 8c12d3fba71..cfd4a34d756 100644 --- a/java/src/main/java/ai/rapids/cudf/ParquetOptions.java +++ b/java/src/main/java/ai/rapids/cudf/ParquetOptions.java @@ -49,7 +49,7 @@ boolean[] getReadBinaryAsString() { return readBinaryAsString; } - public static Builder builder() { + public static ParquetOptions.Builder builder() { return new Builder(); } From 8c0ea07d8297f9ff368271d21098e642177071ce Mon Sep 17 00:00:00 2001 From: Raza Jafri Date: Tue, 2 Aug 2022 15:48:43 -0700 Subject: [PATCH 6/6] addressed more review comments --- java/src/main/java/ai/rapids/cudf/ParquetOptions.java | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/java/src/main/java/ai/rapids/cudf/ParquetOptions.java b/java/src/main/java/ai/rapids/cudf/ParquetOptions.java index cfd4a34d756..1ae1b91b962 100644 --- a/java/src/main/java/ai/rapids/cudf/ParquetOptions.java +++ b/java/src/main/java/ai/rapids/cudf/ParquetOptions.java @@ -72,9 +72,10 @@ public Builder withTimeUnit(DType unit) { * Include one or more specific columns. Any column not included will not be read. * @param names the name of the column, or more than one if you want. */ + @Override public Builder includeColumn(String... names) { - for (String name : names) { - includeColumnNames.add(name); + super.includeColumn(names); + for (int i = 0 ; i < names.length ; i++) { binaryAsStringColumns.add(true); } return this; @@ -95,9 +96,10 @@ public Builder includeColumn(String name, boolean isBinary) { * Include one or more specific columns. Any column not included will not be read. * @param names the name of the column, or more than one if you want. */ + @Override public Builder includeColumn(Collection names) { - for (String name: names) { - includeColumnNames.add(name); + super.includeColumn(names); + for (int i = 0 ; i < names.size() ; i++) { binaryAsStringColumns.add(true); } return this;