Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ARRAY_STRING #76

Closed
wants to merge 3 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 15 additions & 1 deletion ci/docker/centos-7-cpp.dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -17,11 +17,25 @@

FROM centos:centos7

# Point the yum repositories at vault.centos.org because CentOS 7
# has been EOL since 2024-06-30
RUN sed -i \
-e 's/^mirrorlist/#mirrorlist/' \
-e 's/^#baseurl/baseurl/' \
-e 's/mirror\.centos\.org/vault.centos.org/' \
/etc/yum.repos.d/*.repo

# devtoolset is required for C++17
RUN \
yum install -y \
centos-release-scl \
epel-release && \
sed -i \
-e 's/^mirrorlist/#mirrorlist/' \
-e 's/^#baseurl/baseurl/' \
-e 's/^# baseurl/baseurl/' \
-e 's/mirror\.centos\.org/vault.centos.org/' \
/etc/yum.repos.d/CentOS-SCLo-scl*.repo && \
yum install -y \
cmake3 \
curl \
Expand All @@ -40,4 +54,4 @@ RUN bash /arrow/ci/scripts/install_sccache.sh unknown-linux-musl /usr/local/bin

ENV \
ARROW_R_DEV=TRUE \
CMAKE=/usr/bin/cmake3
CMAKE=/usr/bin/cmake3
27 changes: 20 additions & 7 deletions ci/docker/python-wheel-manylinux.dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -25,10 +25,22 @@ ARG manylinux
ENV MANYLINUX_VERSION=${manylinux}

# Ensure dnf is installed, especially for the manylinux2014 base
RUN if [ "${MANYLINUX_VERSION}" = "2014" ]; then \
sed -i \
-e 's/^mirrorlist/#mirrorlist/' \
-e 's/^#baseurl/baseurl/' \
-e 's/mirror\.centos\.org/vault.centos.org/' \
/etc/yum.repos.d/*.repo; \
if [ "${arch}" != "amd64" ]; then \
sed -i \
-e 's,vault\.centos\.org/centos,vault.centos.org/altarch,' \
/etc/yum.repos.d/CentOS-SCLo-scl-rh.repo; \
fi; \
fi
RUN yum install -y dnf

# Install basic dependencies
RUN dnf install -y git flex curl autoconf zip perl-IPC-Cmd wget kernel-headers
RUN dnf install -y git flex curl autoconf zip perl-IPC-Cmd wget

# A system Python is required for ninja and vcpkg in this Dockerfile.
# On manylinux2014 base images, system Python is 2.7.5, while
Expand All @@ -39,8 +51,7 @@ ENV CPYTHON_VERSION=cp38
ENV PATH=/opt/python/${CPYTHON_VERSION}-${CPYTHON_VERSION}/bin:${PATH}

# Install CMake
# AWS SDK doesn't work with CMake=3.22 due to https://gitlab.kitware.com/cmake/cmake/-/issues/22524
ARG cmake=3.21.4
ARG cmake=3.29.2
COPY ci/scripts/install_cmake.sh arrow/ci/scripts/
RUN /arrow/ci/scripts/install_cmake.sh ${arch} linux ${cmake} /usr/local

Expand All @@ -62,15 +73,16 @@ COPY ci/vcpkg/*.patch \
COPY ci/scripts/install_vcpkg.sh \
arrow/ci/scripts/
ENV VCPKG_ROOT=/opt/vcpkg
RUN arrow/ci/scripts/install_vcpkg.sh ${VCPKG_ROOT} ${vcpkg}
ENV PATH="${PATH}:${VCPKG_ROOT}"

ARG build_type=release
ENV CMAKE_BUILD_TYPE=${build_type} \
VCPKG_FORCE_SYSTEM_BINARIES=1 \
VCPKG_OVERLAY_TRIPLETS=/arrow/ci/vcpkg \
VCPKG_DEFAULT_TRIPLET=${arch_short}-linux-static-${build_type} \
VCPKG_FEATURE_FLAGS="manifests"

RUN arrow/ci/scripts/install_vcpkg.sh ${VCPKG_ROOT} ${vcpkg}
ENV PATH="${PATH}:${VCPKG_ROOT}"

COPY ci/vcpkg/vcpkg.json arrow/ci/vcpkg/
# cannot use the S3 feature here because while aws-sdk-cpp=1.9.160 contains
# ssl related fixes as well as we can patch the vcpkg portfile to support
Expand All @@ -81,6 +93,7 @@ RUN vcpkg install \
--clean-after-build \
--x-install-root=${VCPKG_ROOT}/installed \
--x-manifest-root=/arrow/ci/vcpkg \
--x-feature=azure \
--x-feature=flight \
--x-feature=gcs \
--x-feature=json \
Expand All @@ -97,4 +110,4 @@ SHELL ["/bin/bash", "-i", "-c"]
ENTRYPOINT ["/bin/bash", "-i", "-c"]

COPY python/requirements-wheel-build.txt /arrow/python/
RUN pip install -r /arrow/python/requirements-wheel-build.txt
RUN pip install -r /arrow/python/requirements-wheel-build.txt
108 changes: 108 additions & 0 deletions cpp/src/gandiva/array_ops.cc
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,36 @@ bool array_contains_template(const Type* entry_buf,
return false;
}

// Joins the valid elements of a numeric list entry into one delimiter-separated,
// NUL-terminated C string.
//
// Elements whose validity bit is clear are skipped. Every remaining element is
// rendered with std::to_string, and `delimiter` is inserted between consecutive
// rendered elements (never before the first or after the last).
//
// entry_buf             pointer to the first element of the entry
// entry_len             number of elements to visit
// entry_validity        validity bitmap pointer as supplied by the caller
// combined_row_validity not consulted by this helper
// delimiter             NUL-terminated separator; a null pointer is treated as
//                       an empty separator instead of being dereferenced
//
// Returns a buffer obtained from new[] holding the joined text.
//
// NOTE(review): nothing in this helper releases the returned buffer and it has
// no access to the execution context, so ownership must be handled by whoever
// consumes the pointer - confirm that a matching delete[] exists, or switch to
// a context-owned allocation.
template <typename Type>
char* array_to_string_template(const Type* entry_buf,
                               int32_t entry_len, const int32_t* entry_validity,
                               bool combined_row_validity, const char* delimiter) {
  // Appending a null const char* to std::string is undefined behaviour, so fall
  // back to an empty separator.
  const char* separator = (delimiter != nullptr) ? delimiter : "";

  // NOTE(review): the bitmap base is moved back by entry_len int32 slots and is
  // then indexed with bit positions -entry_len .. -1. This addressing is kept
  // exactly as it was written; confirm it matches the layout the caller passes.
  const uint8_t* validity_bits =
      reinterpret_cast<const uint8_t*>(entry_validity - entry_len);
  int64_t validity_bit_base = -entry_len;

  std::string joined;
  bool wrote_any = false;

  for (int i = 0; i < entry_len; i++) {
    if (!arrow::bit_util::GetBit(validity_bits, validity_bit_base + i)) {
      continue;  // null element: contributes neither text nor a separator
    }

    if (wrote_any) {
      joined += separator;
    }

    joined += std::to_string(entry_buf[i]);
    wrote_any = true;
  }

  // Copy the text including its terminating NUL; the length is already known,
  // so there is no need to re-scan the string.
  char* out = new char[joined.size() + 1];
  std::memcpy(out, joined.c_str(), joined.size() + 1);
  return out;
}

extern "C" {

bool array_int32_contains_int32(int64_t context_ptr, const int32_t* entry_buf,
Expand Down Expand Up @@ -208,6 +238,35 @@ double* array_float64_remove(int64_t context_ptr, const double* entry_buf,
loop_var, validity_index_var,
valid_row, out_len, valid_ptr);
}

// Exported entry point that renders a list<int32> entry as a delimited string.
//
// Forwards the entry buffer, its length, the validity pointer, the row validity
// flag and the delimiter to array_to_string_template<int32_t>. context_ptr,
// contains_data and contains_data_valid are accepted but not used.
//
// NOTE(review): the int64/float32/float64 variants in this file take no
// contains_data / contains_data_valid parameters, and the argument list
// registered for "array_int32_to_string" has six entries - confirm whether
// these two parameters are intended here.
char* array_int32_to_string(int64_t context_ptr, const int32_t* entry_buf,
                            int32_t entry_len, const int32_t* entry_validity,
                            bool combined_row_validity, int32_t contains_data,
                            bool contains_data_valid, const char* delimiter) {
  char* joined = array_to_string_template<int32_t>(
      entry_buf, entry_len, entry_validity, combined_row_validity, delimiter);
  return joined;
}

// Exported entry point that renders a list<int64> entry as a delimited string.
//
// Forwards everything except context_ptr (which is not used) to
// array_to_string_template<int64_t> and returns the buffer it produces.
char* array_int64_to_string(int64_t context_ptr, const int64_t* entry_buf,
                            int32_t entry_len, const int32_t* entry_validity,
                            bool combined_row_validity, const char* delimiter) {
  char* joined = array_to_string_template<int64_t>(
      entry_buf, entry_len, entry_validity, combined_row_validity, delimiter);
  return joined;
}

// Exported entry point that renders a list<float32> entry as a delimited string.
//
// Forwards everything except context_ptr (which is not used) to
// array_to_string_template<float> and returns the buffer it produces.
char* array_float32_to_string(int64_t context_ptr, const float* entry_buf,
                              int32_t entry_len, const int32_t* entry_validity,
                              bool combined_row_validity, const char* delimiter) {
  char* joined = array_to_string_template<float>(
      entry_buf, entry_len, entry_validity, combined_row_validity, delimiter);
  return joined;
}

// Exported entry point that renders a list<float64> entry as a delimited string.
//
// Forwards everything except context_ptr (which is not used) to
// array_to_string_template<double> and returns the buffer it produces.
char* array_float64_to_string(int64_t context_ptr, const double* entry_buf,
                              int32_t entry_len, const int32_t* entry_validity,
                              bool combined_row_validity, const char* delimiter) {
  char* joined = array_to_string_template<double>(
      entry_buf, entry_len, entry_validity, combined_row_validity, delimiter);
  return joined;
}
}

namespace gandiva {
Expand Down Expand Up @@ -353,6 +412,55 @@ arrow::Status ExportedArrayFunctions::AddMappings(Engine* engine) const {
engine->AddGlobalMappingForFunc("array_float64_remove",
types->double_ptr_type(), args,
reinterpret_cast<void*>(array_float64_remove));


// array_to_string: register one exported C function per list element type.
// Each mapping takes (context, data ptr, length, validity ptr, row validity,
// delimiter).
// NOTE(review): every mapping below registers the element pointer type
// (i32/i64/float/double pointer) as the return type, while the mapped functions
// are declared as returning char* - confirm whether i8_ptr_type() was intended.
// NOTE(review): array_int32_to_string is declared with eight parameters
// (it also takes contains_data and contains_data_valid), but only six argument
// types are registered for it here - confirm which side is correct.
args = {types->i64_type(), // int64_t execution_context
types->i32_ptr_type(), // const int32_t* input data ptr
types->i32_type(), // int32_t input length
types->i32_ptr_type(), // input validity buffer
types->i1_type(), // bool input row validity
types->i8_ptr_type() // const char* delimiter
};
engine->AddGlobalMappingForFunc("array_int32_to_string",
types->i32_ptr_type(), args,
reinterpret_cast<void*>(array_int32_to_string));

args = {types->i64_type(), // int64_t execution_context
types->i64_ptr_type(), // const int64_t* input data ptr
types->i32_type(), // int32_t input length
types->i32_ptr_type(), // input validity buffer
types->i1_type(), // bool input row validity
types->i8_ptr_type() // const char* delimiter
};

engine->AddGlobalMappingForFunc("array_int64_to_string",
types->i64_ptr_type(), args,
reinterpret_cast<void*>(array_int64_to_string));

args = {types->i64_type(), // int64_t execution_context
types->float_ptr_type(), // const float* input data ptr
types->i32_type(), // int32_t input length
types->i32_ptr_type(), // input validity buffer
types->i1_type(), // bool input row validity
types->i8_ptr_type() // const char* delimiter
};

engine->AddGlobalMappingForFunc("array_float32_to_string",
types->float_ptr_type(), args,
reinterpret_cast<void*>(array_float32_to_string));

args = {types->i64_type(), // int64_t execution_context
types->double_ptr_type(), // const double* input data ptr
types->i32_type(), // int32_t input length
types->i32_ptr_type(), // input validity buffer
types->i1_type(), // bool input row validity
types->i8_ptr_type() // const char* delimiter
};

engine->AddGlobalMappingForFunc("array_float64_to_string",
types->double_ptr_type(), args,
reinterpret_cast<void*>(array_float64_to_string));
return arrow::Status::OK();
}
} // namespace gandiva
18 changes: 17 additions & 1 deletion cpp/src/gandiva/array_ops.h
Original file line number Diff line number Diff line change
Expand Up @@ -82,5 +82,21 @@ double* array_float64_remove(int64_t context_ptr, const double* entry_buf,
double remove_data, bool entry_validWhat,
int64_t loop_var, int64_t validity_index_var,
bool* valid_row, int32_t* out_len, int32_t** valid_ptr);

// array_to_string entry points, one per list element type. Each takes the
// execution context, the entry's data pointer and length, its validity pointer,
// the row validity flag and a delimiter, and returns a char* to the joined text.
// NOTE(review): only the int32 variant additionally declares contains_data and
// contains_data_valid; the other three variants do not - confirm this
// difference is intended.
GANDIVA_EXPORT
char* array_int32_to_string(int64_t context_ptr, const int32_t* entry_buf,
int32_t entry_len, const int32_t* entry_validity, bool combined_row_validity,
int32_t contains_data, bool contains_data_valid,
const char* delimiter);
GANDIVA_EXPORT
char* array_int64_to_string(int64_t context_ptr, const int64_t* entry_buf,
int32_t entry_len, const int32_t* entry_validity, bool combined_row_validity,
const char* delimiter);
GANDIVA_EXPORT
char* array_float32_to_string(int64_t context_ptr, const float* entry_buf,
int32_t entry_len, const int32_t* entry_validity, bool combined_row_validity,
const char* delimiter);
GANDIVA_EXPORT
char* array_float64_to_string(int64_t context_ptr, const double* entry_buf,
int32_t entry_len, const int32_t* entry_validity, bool combined_row_validity,
const char* delimiter);
}
13 changes: 13 additions & 0 deletions cpp/src/gandiva/function_registry_array.cc
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,19 @@ std::vector<NativeFunction> GetArrayFunctionRegistry() {
NativeFunction("array_remove", {}, DataTypeVector{list(float64()), float64()},
list(float64()), kResultNullInternal, "array_float64_remove",
NativeFunction::kNeedsContext),

// array_to_string(list<T>, utf8) -> utf8 for T in {int32, int64, float32,
// float64}; each overload is bound to the exported C function named in its
// entry and is registered with kResultNullInternal and kNeedsContext.
NativeFunction("array_to_string", {}, DataTypeVector{list(int32()), utf8()},
utf8(), kResultNullInternal, "array_int32_to_string",
NativeFunction::kNeedsContext),
NativeFunction("array_to_string", {}, DataTypeVector{list(int64()), utf8()},
utf8(), kResultNullInternal, "array_int64_to_string",
NativeFunction::kNeedsContext),
NativeFunction("array_to_string", {}, DataTypeVector{list(float32()), utf8()},
utf8(), kResultNullInternal, "array_float32_to_string",
NativeFunction::kNeedsContext),
NativeFunction("array_to_string", {}, DataTypeVector{list(float64()), utf8()},
utf8(), kResultNullInternal, "array_float64_to_string",
NativeFunction::kNeedsContext),
};
return array_fn_registry_;
}
Expand Down
Loading