Skip to content

Add support for collect_list Spark aggregate function #1265

Add support for collect_list Spark aggregate function

Add support for collect_list Spark aggregate function #1265

Workflow file for this run

# Copyright (c) Facebook, Inc. and its affiliates.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
name: "Fuzzer Jobs"

# Triggers: pull requests that touch build-relevant paths, a daily schedule,
# and manual dispatch with tunable build/run parameters.
on:
  pull_request:
    # The negated pattern excludes changes under velox/docs from the match.
    paths:
      - "velox/**"
      - "!velox/docs/**"
      - "CMakeLists.txt"
      - "CMake/**"
      - "third_party/**"
      - "scripts/setup-ubuntu.sh"
      - "scripts/setup-helper-functions.sh"
      - ".github/workflows/linux-build.yml"
      - ".github/workflows/scheduled.yml"

  schedule:
    # Daily at 03:00 (GitHub evaluates cron schedules in UTC).
    - cron: '0 3 * * *'

  workflow_dispatch:
    # All inputs are optional; jobs apply their own fallbacks when an input
    # is absent (e.g. on pull_request and schedule events).
    inputs:
      ref:
        description: 'Ref to check out'
        default: 'main'
      numThreads:
        description: 'Number of threads'
        default: 16
      maxHighMemJobs:
        description: 'Number of high memory jobs'
        default: 8
      maxLinkJobs:
        description: 'Maximum number of link jobs'
        default: 4
      extraCMakeFlags:
        description: 'Additional CMake flags'
        default: ''
      duration:
        description: 'Duration of fuzzer run in seconds'
        default: 1800

defaults:
  run:
    shell: bash

permissions:
  contents: read

# One active run per workflow/repository/branch (or commit SHA when there is
# no head ref); a newer run cancels the one still in progress.
concurrency:
  group: ${{ github.workflow }}-${{ github.repository }}-${{ github.head_ref || github.sha }}
  cancel-in-progress: true

env:
  # Fuzzer run time in seconds: the `duration` input when provided, otherwise
  # 900 (15 minutes) on pull requests and 1800 (30 minutes) for other events.
  DURATION: "${{ inputs.duration || ( github.event_name == 'pull_request' && 900 || 1800 )}}"
  # Artifact retention in days: keep it minimal for PRs, a bit longer for
  # nightly runs.
  RETENTION: "${{ github.event_name == 'pull_request' && 1 || 3 }}"
jobs:
# Builds the debug tree once and publishes each fuzzer binary as a separate
# artifact; the fuzzer run jobs declare `needs: compile` and download them.
compile:
  name: Build
  runs-on: 8-core
  timeout-minutes: 120
  env:
    CCACHE_DIR: "${{ github.workspace }}/.ccache/"
    CCACHE_BASEDIR: "${{ github.workspace }}"
    LINUX_DISTRO: "ubuntu"
  steps:
    - name: "Restore ccache"
      uses: assignUser/stash/restore@v1
      with:
        path: "${{ env.CCACHE_DIR }}"
        key: ccache-fuzzer

    # The repository is checked out into ./velox, so later steps `cd velox`.
    - name: "Checkout Repo"
      uses: actions/checkout@v4
      with:
        path: velox
        submodules: 'recursive'
        ref: "${{ inputs.ref }}"

    - name: "Install dependencies"
      run: |
        cd velox
        source ./scripts/setup-ubuntu.sh
        ccache -vsz

    # Dispatch inputs override the thread/job counts; otherwise the literal
    # fallbacks after `||` are used.
    - name: Build
      run: |
        cd velox
        make debug NUM_THREADS="${{ inputs.numThreads || 8 }}" MAX_HIGH_MEM_JOBS="${{ inputs.maxHighMemJobs || 8 }}" MAX_LINK_JOBS="${{ inputs.maxLinkJobs || 4 }}" EXTRA_CMAKE_FLAGS="-DVELOX_ENABLE_ARROW=ON ${{ inputs.extraCMakeFlags }}"

    - name: Ccache after
      run: ccache -vs

    - name: "Save ccache"
      uses: assignUser/stash/save@v1
      with:
        path: "${{ env.CCACHE_DIR }}"
        key: ccache-fuzzer
        retention-days: "${{ env.RETENTION }}"

    # One artifact per fuzzer binary; the artifact names are what the run
    # jobs pass to actions/download-artifact.
    - name: Upload presto fuzzer
      uses: actions/upload-artifact@v4
      with:
        name: presto
        path: velox/_build/debug/velox/expression/tests/velox_expression_fuzzer_test
        retention-days: "${{ env.RETENTION }}"

    - name: Upload spark expression fuzzer
      uses: actions/upload-artifact@v4
      with:
        name: spark_expression_fuzzer
        path: velox/_build/debug/velox/expression/tests/spark_expression_fuzzer_test
        retention-days: "${{ env.RETENTION }}"

    - name: Upload spark aggregation fuzzer
      uses: actions/upload-artifact@v4
      with:
        name: spark_aggregation_fuzzer
        path: velox/_build/debug/velox/functions/sparksql/fuzzer/spark_aggregation_fuzzer_test
        retention-days: "${{ env.RETENTION }}"

    - name: Upload aggregation fuzzer
      uses: actions/upload-artifact@v4
      with:
        name: aggregation
        path: velox/_build/debug/velox/functions/prestosql/fuzzer/velox_aggregation_fuzzer_test
        retention-days: "${{ env.RETENTION }}"

    - name: Upload join fuzzer
      uses: actions/upload-artifact@v4
      with:
        name: join
        path: velox/_build/debug/velox/exec/tests/velox_join_fuzzer_test
        retention-days: "${{ env.RETENTION }}"

    - name: Upload exchange fuzzer
      uses: actions/upload-artifact@v4
      with:
        name: exchange
        path: velox/_build/debug/velox/exec/tests/velox_exchange_fuzzer_test
        retention-days: "${{ env.RETENTION }}"
# Presto expression fuzzer: fetches the `presto` artifact produced by the
# `compile` job and runs velox_expression_fuzzer_test.
linux-presto-fuzzer-run:
  name: Presto Fuzzer
  needs: compile
  runs-on: ubuntu-latest
  timeout-minutes: 120
  steps:
    - name: Checkout Repo
      uses: actions/checkout@v4
      with:
        ref: ${{ inputs.ref }}

    - name: Install dependencies
      run: source ./scripts/setup-ubuntu.sh

    # Path filter, evaluated on pull requests only: the `presto` output is
    # read by the next step to pick the run length.
    - id: changes
      if: github.event_name == 'pull_request'
      uses: dorny/paths-filter@v3
      with:
        filters: |
          presto:
            - 'velox/expression/!(test)**'
            - 'velox/exec/!(test)**'
            - 'velox/common/!(test)**'
            - 'velox/core/!(test)**'
            - 'velox/vector/!(test)**'

    # Writes DURATION to GITHUB_ENV so the following steps in this job see
    # the job-specific value.
    - name: Set presto specific fuzzer duration
      env:
        is_pr: ${{ github.event_name == 'pull_request' }}
        # Pull requests: 1800 s (30 minutes) when the filter above matched
        # presto-relevant files, 900 s (15 minutes) otherwise.
        pr_duration: ${{ steps.changes.outputs.presto == 'true' && 1800 || 900 }}
        # Any other event: the `duration` input if given, else 3600 s
        # (60 minutes).
        other_duration: ${{ inputs.duration || 3600 }}
      run: |
        if [ "$is_pr" == "true" ]; then
          duration=$pr_duration
        else
          duration=$other_duration
        fi
        echo "DURATION=$duration" >> $GITHUB_ENV

    - name: Download presto fuzzer
      uses: actions/download-artifact@v4
      with:
        name: presto

    - name: Run Presto Fuzzer
      run: |
        mkdir -p /tmp/fuzzer_repro/
        chmod -R 777 /tmp/fuzzer_repro
        chmod +x velox_expression_fuzzer_test
        ./velox_expression_fuzzer_test \
          --seed ${RANDOM} \
          --enable_variadic_signatures \
          --velox_fuzzer_enable_complex_types \
          --lazy_vector_generation_ratio 0.2 \
          --velox_fuzzer_enable_column_reuse \
          --velox_fuzzer_enable_expression_reuse \
          --max_expression_trees_per_step 2 \
          --retry_with_try \
          --enable_dereference \
          --duration_sec $DURATION \
          --logtostderr=1 \
          --minloglevel=0 \
          --repro_persist_path=/tmp/fuzzer_repro \
          && echo -e "\n\nFuzzer run finished successfully."

    # Still runs after a failed fuzzer step; skipped only when the run is
    # cancelled.
    - name: Archive production artifacts
      if: ${{ !cancelled() }}
      uses: actions/upload-artifact@v4
      with:
        name: presto-fuzzer-failure-artifacts
        path: |
          /tmp/fuzzer_repro
# Spark aggregation fuzzer: fetches the `spark_aggregation_fuzzer` artifact
# produced by the `compile` job and runs spark_aggregation_fuzzer_test.
linux-spark-aggregate-fuzzer-run:
  name: Spark Aggregate Fuzzer
  needs: compile
  runs-on: ubuntu-latest
  timeout-minutes: 60
  steps:
    - name: Checkout Repo
      uses: actions/checkout@v4
      with:
        ref: ${{ inputs.ref }}

    - name: Install dependencies
      run: source ./scripts/setup-ubuntu.sh

    - name: Download spark aggregation fuzzer
      uses: actions/download-artifact@v4
      with:
        name: spark_aggregation_fuzzer

    # The run length comes from the workflow-level DURATION variable.
    - name: Run Spark Aggregate Fuzzer
      run: |
        mkdir -p /tmp/spark_aggregate_fuzzer_repro/
        chmod -R 777 /tmp/spark_aggregate_fuzzer_repro
        chmod +x spark_aggregation_fuzzer_test
        ./spark_aggregation_fuzzer_test \
          --seed ${RANDOM} \
          --duration_sec $DURATION \
          --logtostderr=1 \
          --minloglevel=0 \
          --repro_persist_path=/tmp/spark_aggregate_fuzzer_repro \
          && echo -e "\n\nSpark Aggregation Fuzzer run finished successfully."

    # Still runs after a failed fuzzer step; skipped only when the run is
    # cancelled.
    - name: Archive Spark aggregate production artifacts
      if: ${{ !cancelled() }}
      uses: actions/upload-artifact@v4
      with:
        name: spark-agg-fuzzer-failure-artifacts
        path: |
          /tmp/spark_aggregate_fuzzer_repro
# Spark expression fuzzer: fetches the `spark_expression_fuzzer` artifact
# produced by the `compile` job and runs spark_expression_fuzzer_test.
linux-spark-fuzzer-run:
  name: Spark Fuzzer
  needs: compile
  runs-on: ubuntu-latest
  timeout-minutes: 120
  steps:
    - name: Checkout Repo
      uses: actions/checkout@v4
      with:
        ref: ${{ inputs.ref }}

    - name: Install dependencies
      run: source ./scripts/setup-ubuntu.sh

    - name: Download spark expression fuzzer
      uses: actions/download-artifact@v4
      with:
        name: spark_expression_fuzzer

    # The run length comes from the workflow-level DURATION variable.
    - name: Run Spark Expression Fuzzer
      run: |
        mkdir -p /tmp/spark_fuzzer_repro/
        chmod -R 777 /tmp/spark_fuzzer_repro
        chmod +x spark_expression_fuzzer_test
        ./spark_expression_fuzzer_test \
          --seed ${RANDOM} \
          --enable_variadic_signatures \
          --lazy_vector_generation_ratio 0.2 \
          --velox_fuzzer_enable_column_reuse \
          --velox_fuzzer_enable_expression_reuse \
          --max_expression_trees_per_step 2 \
          --retry_with_try \
          --enable_dereference \
          --duration_sec $DURATION \
          --logtostderr=1 \
          --minloglevel=0 \
          --repro_persist_path=/tmp/spark_fuzzer_repro \
          && echo -e "\n\nSpark Fuzzer run finished successfully."

    # Still runs after a failed fuzzer step; skipped only when the run is
    # cancelled.
    - name: Archive Spark expression production artifacts
      if: ${{ !cancelled() }}
      uses: actions/upload-artifact@v4
      with:
        name: spark-fuzzer-failure-artifacts
        path: |
          /tmp/spark_fuzzer_repro
# Aggregation fuzzer: fetches the `aggregation` artifact produced by the
# `compile` job and runs velox_aggregation_fuzzer_test.
linux-aggregate-fuzzer-run:
  name: Aggregate Fuzzer
  needs: compile
  runs-on: ubuntu-latest
  timeout-minutes: 120
  steps:
    - name: Checkout Repo
      uses: actions/checkout@v4
      with:
        ref: ${{ inputs.ref }}

    - name: Install dependencies
      run: source ./scripts/setup-ubuntu.sh

    - name: Download aggregation fuzzer
      uses: actions/download-artifact@v4
      with:
        name: aggregation

    # The repro directory is created and emptied before the run; the run
    # length comes from the workflow-level DURATION variable.
    - name: Run Aggregate Fuzzer
      run: |
        mkdir -p /tmp/aggregate_fuzzer_repro/
        rm -rfv /tmp/aggregate_fuzzer_repro/*
        chmod -R 777 /tmp/aggregate_fuzzer_repro
        chmod +x velox_aggregation_fuzzer_test
        ./velox_aggregation_fuzzer_test \
          --seed ${RANDOM} \
          --duration_sec $DURATION \
          --logtostderr=1 \
          --minloglevel=0 \
          --repro_persist_path=/tmp/aggregate_fuzzer_repro \
          && echo -e "\n\nAggregation fuzzer run finished successfully."

    # Still runs after a failed fuzzer step; skipped only when the run is
    # cancelled.
    - name: Archive aggregate production artifacts
      if: ${{ !cancelled() }}
      uses: actions/upload-artifact@v4
      with:
        name: aggregate-fuzzer-failure-artifacts
        path: |
          /tmp/aggregate_fuzzer_repro
# Join fuzzer: downloads the `join` artifact produced by the `compile` job and
# runs velox_join_fuzzer_test.
linux-join-fuzzer-run:
  name: "Join Fuzzer"
  runs-on: ubuntu-latest
  needs: compile
  timeout-minutes: 120
  steps:
    - name: "Checkout Repo"
      uses: actions/checkout@v4
      with:
        ref: "${{ inputs.ref }}"

    - name: "Install dependencies"
      run: source ./scripts/setup-ubuntu.sh

    - name: Download join fuzzer
      uses: actions/download-artifact@v4
      with:
        name: join

    # The run length comes from the workflow-level DURATION variable.
    # NOTE(review): unlike the other fuzzer jobs, no repro path flag is passed
    # to the binary here, so /tmp/join_fuzzer_repro may be empty when it is
    # archived below - confirm whether the join fuzzer supports such a flag.
    - name: "Run Join Fuzzer"
      run: |
        mkdir -p /tmp/join_fuzzer_repro/
        rm -rfv /tmp/join_fuzzer_repro/*
        chmod -R 777 /tmp/join_fuzzer_repro
        chmod +x velox_join_fuzzer_test
        ./velox_join_fuzzer_test \
          --seed ${RANDOM} \
          --duration_sec $DURATION \
          --logtostderr=1 \
          --minloglevel=0 \
          && echo -e "\n\nJoin fuzzer run finished successfully."

    # Still runs after a failed fuzzer step; skipped only when the run is
    # cancelled.
    - name: Archive join production artifacts
      if: ${{ !cancelled() }}
      uses: actions/upload-artifact@v4
      with:
        name: join-fuzzer-failure-artifacts
        path: |
          /tmp/join_fuzzer_repro
# Exchange fuzzer: downloads the `exchange` artifact produced by the `compile`
# job and runs velox_exchange_fuzzer_test.
linux-exchange-fuzzer-run:
  runs-on: ubuntu-latest
  needs: compile
  timeout-minutes: 120
  steps:
    - name: Checkout Repo
      uses: actions/checkout@v4
      with:
        ref: "${{ inputs.ref }}"

    - name: Install dependencies
      run: source ./scripts/setup-ubuntu.sh

    - name: Download exchange fuzzer
      uses: actions/download-artifact@v4
      with:
        name: exchange

    # Sets vm.max_map_count via sysctl and prints the resulting value before
    # starting the fuzzer. The run length comes from the workflow-level
    # DURATION variable. The success message uses "\n\n" followed by the
    # word; with `echo -e`, a backslash directly before "E" would be read by
    # bash as an escape character.
    - name: Run exchange Fuzzer
      run: |
        sudo sysctl -w vm.max_map_count=67108864
        cat /proc/sys/vm/max_map_count
        mkdir -p /tmp/exchange_fuzzer_repro/
        rm -rfv /tmp/exchange_fuzzer_repro/*
        chmod -R 777 /tmp/exchange_fuzzer_repro
        chmod +x velox_exchange_fuzzer_test
        ./velox_exchange_fuzzer_test \
          --seed ${RANDOM} \
          --duration_sec $DURATION \
          --logtostderr=1 \
          --minloglevel=0 \
          --repro_path=/tmp/exchange_fuzzer_repro \
          && echo -e "\n\nExchange fuzzer run finished successfully."

    # Still runs after a failed fuzzer step; skipped only when the run is
    # cancelled.
    - name: Archive Exchange production artifacts
      if: ${{ !cancelled() }}
      uses: actions/upload-artifact@v4
      with:
        name: exchange-fuzzer-failure-artifacts
        path: |
          /tmp/exchange_fuzzer_repro
# Aggregation fuzzer checked against a Presto server started inside the job's
# container. Skipped on pull requests. Builds its own binaries instead of
# depending on the `compile` job.
presto-java-aggregation-fuzzer-run:
  name: 'Aggregation Fuzzer with Presto as source of truth'
  if: ${{ github.event_name != 'pull_request' }}
  runs-on: 8-core
  container: ghcr.io/facebookincubator/velox-dev:presto-java
  timeout-minutes: 120
  env:
    LINUX_DISTRO: centos
    CCACHE_BASEDIR: ${{ github.workspace }}
    CCACHE_DIR: ${{ github.workspace }}/.ccache/
  steps:
    - name: Restore ccache
      uses: assignUser/stash/restore@v1
      with:
        key: ccache-presto-java-fuzzer
        path: ${{ env.CCACHE_DIR }}

    # The repository is checked out into ./velox, so later steps `cd velox`.
    - name: Checkout Repo
      uses: actions/checkout@v4
      with:
        ref: ${{ inputs.ref }}
        path: velox
        submodules: recursive

    # Usually actions/checkout does this, but it doesn't work when the job
    # runs in a container.
    - name: Fix git permissions
      run: git config --global --add safe.directory /__w/velox/velox/velox

    - name: Zero Ccache Statistics
      run: ccache -sz

    # NOTE(review): EXTRA_CMAKE_FLAGS is set in this step's environment and
    # the same value is also passed explicitly on the make command line.
    - name: Build
      env:
        EXTRA_CMAKE_FLAGS: '-DVELOX_ENABLE_ARROW=ON ${{ inputs.extraCMakeFlags }}'
      run: |
        cd velox
        make debug NUM_THREADS="${{ inputs.numThreads || 8 }}" MAX_HIGH_MEM_JOBS="${{ inputs.maxHighMemJobs || 8 }}" MAX_LINK_JOBS="${{ inputs.maxLinkJobs || 4 }}" EXTRA_CMAKE_FLAGS="-DVELOX_ENABLE_ARROW=ON ${{ inputs.extraCMakeFlags }}"
        ccache -s

    - name: Save ccache
      uses: assignUser/stash/save@v1
      with:
        key: ccache-presto-java-fuzzer
        path: ${{ env.CCACHE_DIR }}

    # Launches the Presto server in the background with its output sent to
    # /tmp/server.log, waits a fixed 60 seconds, creates the hive.tpch schema
    # through the CLI, then runs the fuzzer against http://127.0.0.1:8080.
    # PRESTO_HOME is not set in this workflow - presumably the container
    # image provides it.
    - name: Run Aggregate Fuzzer
      run: |
        cd velox
        cp ./scripts/etc/hive.properties $PRESTO_HOME/etc/catalog
        ls -lR $PRESTO_HOME/etc
        $PRESTO_HOME/bin/launcher run -v > /tmp/server.log 2>&1 &
        # Sleep for 60 seconds to allow Presto server to start.
        sleep 60
        /opt/presto-cli --server 127.0.0.1:8080 --execute 'CREATE SCHEMA hive.tpch;'
        mkdir -p /tmp/aggregate_fuzzer_repro/
        rm -rfv /tmp/aggregate_fuzzer_repro/*
        chmod -R 777 /tmp/aggregate_fuzzer_repro
        _build/debug/velox/functions/prestosql/fuzzer/velox_aggregation_fuzzer_test \
          --seed ${RANDOM} \
          --duration_sec $DURATION \
          --logtostderr=1 \
          --minloglevel=0 \
          --repro_persist_path=/tmp/aggregate_fuzzer_repro \
          --enable_sorted_aggregations=true \
          --presto_url=http://127.0.0.1:8080 \
          && echo -e "\n\nAggregation fuzzer run finished successfully."

    # Still runs after a failed fuzzer step; skipped only when the run is
    # cancelled. The Presto server log is archived with the repro files.
    - name: Archive aggregate production artifacts
      if: ${{ !cancelled() }}
      uses: actions/upload-artifact@v4
      with:
        name: presto-sot-aggregate-fuzzer-failure-artifacts
        path: |
          /tmp/aggregate_fuzzer_repro
          /tmp/server.log