Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve columnarCopy for HostColumnarToGpu #4770

Merged
merged 3 commits into from
Feb 16, 2022
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,196 @@
/*
* Copyright (c) 2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package com.nvidia.spark.rapids;

import ai.rapids.cudf.HostColumnVector.ColumnBuilder;

import org.apache.spark.sql.vectorized.ColumnVector;

/**
* A helper class which efficiently transfers different types of host columnar data into cuDF.
*/
public class ColumnarCopyHelper {

/**
 * Appends {@code rows} null entries to the builder.
 *
 * @param b    builder that receives the null entries
 * @param rows number of nulls to append; nothing is appended when it is not positive
 */
public static void nullCopy(ColumnBuilder b, int rows) {
  int remaining = rows;
  while (remaining > 0) {
    b.appendNull();
    remaining--;
  }
}

public static void booleanCopy(ColumnVector cv, ColumnBuilder b, int rows) {
if (!cv.hasNull()) {
for (int i = 0; i < rows; i++) b.append(cv.getBoolean(i));
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: I would prefer to split this line up for formatting purposes.

for (int i = 0; i < rows; i++) {
  b.append(cv.getBoolean(i));
}

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It looks like we could benefit from an efficient bulk API in HostMemoryBuffer

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This ColumnVector is the Spark class, not the CUDF class. In some cases we can know what the layout is, but not all and in many of those cases the underlying data is not exposed, so there is no good way to do a bulk data copy. We could make changes to Spark to try and improve that, but this only really shows up in cases when we are reading an input format like Parquet or ORC.

Copy link
Collaborator Author

@sperlingxx sperlingxx Feb 15, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done.

return;
revans2 marked this conversation as resolved.
Show resolved Hide resolved
}
for (int i = 0; i < rows; i++) {
if (cv.isNullAt(i)) {
b.appendNull();
continue;
revans2 marked this conversation as resolved.
Show resolved Hide resolved
}
b.append(cv.getBoolean(i));
}
}

/**
 * Appends the first {@code rows} entries of {@code cv}, read with {@code getByte}, to the
 * builder. Rows reported as null by {@code cv} are appended as nulls.
 *
 * @param cv   source column
 * @param b    builder that receives the values
 * @param rows number of rows to copy, starting at row 0
 */
public static void byteCopy(ColumnVector cv, ColumnBuilder b, int rows) {
  if (cv.hasNull()) {
    // At least one null is reported, so every row is checked before it is read.
    for (int row = 0; row < rows; row++) {
      if (cv.isNullAt(row)) {
        b.appendNull();
      } else {
        b.append(cv.getByte(row));
      }
    }
  } else {
    // No nulls reported: skip the per-row null check entirely.
    for (int row = 0; row < rows; row++) {
      b.append(cv.getByte(row));
    }
  }
}

/**
 * Appends the first {@code rows} entries of {@code cv}, read with {@code getShort}, to the
 * builder. Rows reported as null by {@code cv} are appended as nulls.
 *
 * @param cv   source column
 * @param b    builder that receives the values
 * @param rows number of rows to copy, starting at row 0
 */
public static void shortCopy(ColumnVector cv, ColumnBuilder b, int rows) {
  if (cv.hasNull()) {
    // At least one null is reported, so every row is checked before it is read.
    for (int row = 0; row < rows; row++) {
      if (cv.isNullAt(row)) {
        b.appendNull();
      } else {
        b.append(cv.getShort(row));
      }
    }
  } else {
    // No nulls reported: skip the per-row null check entirely.
    for (int row = 0; row < rows; row++) {
      b.append(cv.getShort(row));
    }
  }
}

/**
 * Appends the first {@code rows} entries of {@code cv}, read with {@code getInt}, to the
 * builder. Rows reported as null by {@code cv} are appended as nulls.
 *
 * @param cv   source column
 * @param b    builder that receives the values
 * @param rows number of rows to copy, starting at row 0
 */
public static void intCopy(ColumnVector cv, ColumnBuilder b, int rows) {
  if (cv.hasNull()) {
    // At least one null is reported, so every row is checked before it is read.
    for (int row = 0; row < rows; row++) {
      if (cv.isNullAt(row)) {
        b.appendNull();
      } else {
        b.append(cv.getInt(row));
      }
    }
  } else {
    // No nulls reported: skip the per-row null check entirely.
    for (int row = 0; row < rows; row++) {
      b.append(cv.getInt(row));
    }
  }
}

/**
 * Appends the first {@code rows} entries of {@code cv}, read with {@code getLong}, to the
 * builder. Rows reported as null by {@code cv} are appended as nulls.
 *
 * @param cv   source column
 * @param b    builder that receives the values
 * @param rows number of rows to copy, starting at row 0
 */
public static void longCopy(ColumnVector cv, ColumnBuilder b, int rows) {
  if (cv.hasNull()) {
    // At least one null is reported, so every row is checked before it is read.
    for (int row = 0; row < rows; row++) {
      if (cv.isNullAt(row)) {
        b.appendNull();
      } else {
        b.append(cv.getLong(row));
      }
    }
  } else {
    // No nulls reported: skip the per-row null check entirely.
    for (int row = 0; row < rows; row++) {
      b.append(cv.getLong(row));
    }
  }
}

/**
 * Appends the first {@code rows} entries of {@code cv}, read with {@code getFloat}, to the
 * builder. Rows reported as null by {@code cv} are appended as nulls.
 *
 * @param cv   source column
 * @param b    builder that receives the values
 * @param rows number of rows to copy, starting at row 0
 */
public static void floatCopy(ColumnVector cv, ColumnBuilder b, int rows) {
  if (cv.hasNull()) {
    // At least one null is reported, so every row is checked before it is read.
    for (int row = 0; row < rows; row++) {
      if (cv.isNullAt(row)) {
        b.appendNull();
      } else {
        b.append(cv.getFloat(row));
      }
    }
  } else {
    // No nulls reported: skip the per-row null check entirely.
    for (int row = 0; row < rows; row++) {
      b.append(cv.getFloat(row));
    }
  }
}

/**
 * Appends the first {@code rows} entries of {@code cv}, read with {@code getDouble}, to the
 * builder. Rows reported as null by {@code cv} are appended as nulls.
 *
 * @param cv   source column
 * @param b    builder that receives the values
 * @param rows number of rows to copy, starting at row 0
 */
public static void doubleCopy(ColumnVector cv, ColumnBuilder b, int rows) {
  if (cv.hasNull()) {
    // At least one null is reported, so every row is checked before it is read.
    for (int row = 0; row < rows; row++) {
      if (cv.isNullAt(row)) {
        b.appendNull();
      } else {
        b.append(cv.getDouble(row));
      }
    }
  } else {
    // No nulls reported: skip the per-row null check entirely.
    for (int row = 0; row < rows; row++) {
      b.append(cv.getDouble(row));
    }
  }
}

/**
 * Appends the first {@code rows} string entries of {@code cv} to the builder. Each value is
 * read with {@code getUTF8String}, converted with {@code getBytes()}, and handed to
 * {@code appendUTF8String}. Rows reported as null by {@code cv} are appended as nulls.
 *
 * @param cv   source column
 * @param b    builder that receives the values
 * @param rows number of rows to copy, starting at row 0
 */
public static void stringCopy(ColumnVector cv, ColumnBuilder b, int rows) {
  if (cv.hasNull()) {
    // At least one null is reported, so every row is checked before it is read.
    for (int row = 0; row < rows; row++) {
      if (cv.isNullAt(row)) {
        b.appendNull();
      } else {
        byte[] utf8 = cv.getUTF8String(row).getBytes();
        b.appendUTF8String(utf8);
      }
    }
  } else {
    // No nulls reported: skip the per-row null check entirely.
    for (int row = 0; row < rows; row++) {
      byte[] utf8 = cv.getUTF8String(row).getBytes();
      b.appendUTF8String(utf8);
    }
  }
}

public static void decimal32Copy(ColumnVector cv, ColumnBuilder b, int rows,
int precision, int scale) {
if (!cv.hasNull()) {
for (int i = 0; i < rows; i++) {
b.append((int) cv.getDecimal(i, precision, scale).toUnscaledLong());
}
return;
}
for (int i = 0; i < rows; i++) {
if (cv.isNullAt(i)) {
b.appendNull();
continue;
}
b.append((int) cv.getDecimal(i, precision, scale).toUnscaledLong());
revans2 marked this conversation as resolved.
Show resolved Hide resolved
}
}

/**
 * Appends the first {@code rows} decimal entries of {@code cv} to the builder as {@code long}
 * values. Each value is read with {@code getDecimal(row, precision, scale)} and its unscaled
 * long is appended. Rows reported as null by {@code cv} are appended as nulls.
 *
 * @param cv        source column
 * @param b         builder that receives the values
 * @param rows      number of rows to copy, starting at row 0
 * @param precision precision passed through to {@code getDecimal}
 * @param scale     scale passed through to {@code getDecimal}
 */
public static void decimal64Copy(ColumnVector cv, ColumnBuilder b, int rows,
    int precision, int scale) {
  if (cv.hasNull()) {
    // At least one null is reported, so every row is checked before it is read.
    for (int row = 0; row < rows; row++) {
      if (cv.isNullAt(row)) {
        b.appendNull();
      } else {
        long unscaled = cv.getDecimal(row, precision, scale).toUnscaledLong();
        b.append(unscaled);
      }
    }
  } else {
    // No nulls reported: skip the per-row null check entirely.
    for (int row = 0; row < rows; row++) {
      long unscaled = cv.getDecimal(row, precision, scale).toUnscaledLong();
      b.append(unscaled);
    }
  }
}

/**
 * Appends the first {@code rows} decimal entries of {@code cv} to the builder. Each value is
 * read with {@code getDecimal(row, precision, scale)} and appended as the result of
 * {@code toJavaBigDecimal()}. Rows reported as null by {@code cv} are appended as nulls.
 *
 * @param cv        source column
 * @param b         builder that receives the values
 * @param rows      number of rows to copy, starting at row 0
 * @param precision precision passed through to {@code getDecimal}
 * @param scale     scale passed through to {@code getDecimal}
 */
public static void decimal128Copy(ColumnVector cv, ColumnBuilder b, int rows,
    int precision, int scale) {
  if (cv.hasNull()) {
    // At least one null is reported, so every row is checked before it is read.
    for (int row = 0; row < rows; row++) {
      if (cv.isNullAt(row)) {
        b.appendNull();
      } else {
        b.append(cv.getDecimal(row, precision, scale).toJavaBigDecimal());
      }
    }
  } else {
    // No nulls reported: skip the per-row null check entirely.
    for (int row = 0; row < rows; row++) {
      b.append(cv.getDecimal(row, precision, scale).toJavaBigDecimal());
    }
  }
}
}
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019-2021, NVIDIA CORPORATION.
* Copyright (c) 2019-2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -387,7 +387,7 @@ public GpuColumnarBatchBuilder(StructType schema, int rows) {
}

public void copyColumnar(ColumnVector cv, int colNum, boolean nullable, int rows) {
HostColumnarToGpu.columnarCopy(cv, builder(colNum), nullable, rows);
HostColumnarToGpu.columnarCopy(cv, builder(colNum), rows);
}

public ai.rapids.cudf.HostColumnVector.ColumnBuilder builder(int i) {
Expand Down
Loading