Add "Extended" clickbench queries #8861

Merged
merged 1 commit on Jan 16, 2024
17 changes: 17 additions & 0 deletions benchmarks/bench.sh
@@ -74,6 +74,7 @@ parquet: Benchmark of parquet reader's filtering speed
sort: Benchmark of sorting speed
clickbench_1: ClickBench queries against a single parquet file
clickbench_partitioned: ClickBench queries against a partitioned (100 files) parquet
clickbench_extended: ClickBench "inspired" queries against a single parquet (DataFusion specific)

**********
* Supported Configuration (Environment Variables)
@@ -155,6 +156,9 @@ main() {
        clickbench_partitioned)
            data_clickbench_partitioned
            ;;
        clickbench_extended)
            data_clickbench_1
            ;;
        *)
            echo "Error: unknown benchmark '$BENCHMARK' for data generation"
            usage
@@ -193,6 +197,7 @@ main() {
            run_sort
            run_clickbench_1
            run_clickbench_partitioned
            run_clickbench_extended
            ;;
        tpch)
            run_tpch "1"
@@ -218,6 +223,9 @@ main() {
        clickbench_partitioned)
            run_clickbench_partitioned
            ;;
        clickbench_extended)
            run_clickbench_extended
            ;;
        *)
            echo "Error: unknown benchmark '$BENCHMARK' for run"
            usage
@@ -401,6 +409,15 @@ run_clickbench_partitioned() {
    $CARGO_COMMAND --bin dfbench -- clickbench --iterations 5 --path "${DATA_DIR}/hits_partitioned" --queries-path "${SCRIPT_DIR}/queries/clickbench/queries.sql" -o ${RESULTS_FILE}
}

# Runs the clickbench "extended" benchmark with a single large parquet file
run_clickbench_extended() {
    RESULTS_FILE="${RESULTS_DIR}/clickbench_extended.json"
    echo "RESULTS_FILE: ${RESULTS_FILE}"
    echo "Running clickbench (1 file) extended benchmark..."
    $CARGO_COMMAND --bin dfbench -- clickbench --iterations 5 --path "${DATA_DIR}/hits.parquet" --queries-path "${SCRIPT_DIR}/queries/clickbench/extended.sql" -o ${RESULTS_FILE}
}


compare_benchmarks() {
    BASE_RESULTS_DIR="${SCRIPT_DIR}/results"
    BRANCH1="${ARG2}"
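Taken together, the `bench.sh` changes wire the new benchmark into the existing data/run dispatch: data generation reuses `data_clickbench_1` (the same single `hits.parquet` file as `clickbench_1`), and the run step calls the new `run_clickbench_extended`. A minimal usage sketch, assuming the script's usual `bench.sh <command> <benchmark>` calling convention (the usage banner is only partially visible in this diff):

```bash
# Prepare the single-file ClickBench dataset; clickbench_extended
# reuses the same data as clickbench_1 (hits.parquet)
./bench.sh data clickbench_extended

# Run the DataFusion-specific "extended" queries and write the results
# to ${RESULTS_DIR}/clickbench_extended.json
./bench.sh run clickbench_extended
```

Because the dataset is shared with `clickbench_1`, no new download is needed if that benchmark's data is already present.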
33 changes: 33 additions & 0 deletions benchmarks/queries/clickbench/README.md
@@ -0,0 +1,33 @@
# ClickBench queries

This directory contains queries for the ClickBench benchmark: https://benchmark.clickhouse.com/

ClickBench is focused on aggregation and filtering performance (though it has no joins).

## Files:
* `queries.sql` - Actual ClickBench queries, downloaded from the [ClickBench repository]
* `extended.sql` - "Extended" DataFusion-specific queries.

[ClickBench repository]: https://github.com/ClickHouse/ClickBench/blob/main/datafusion/queries.sql

## "Extended" Queries
The "extended" queries are not part of the official ClickBench benchmark.
Instead they are used to test other DataFusion features that are not
covered by the standard benchmark

Each description below is for the corresponding line in `extended.sql` (line 1
is `Q0`, line 2 is `Q1`, etc.).

### Q0
Models initial data exploration to understand some statistics of the data.
Important Query Properties: multiple `COUNT DISTINCT` on strings

```sql
SELECT
COUNT(DISTINCT "SearchPhrase"), COUNT(DISTINCT "MobilePhone"), COUNT(DISTINCT "MobilePhoneModel")
FROM hits;
```
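The `bench.sh` wrapper ultimately forwards to the `dfbench` binary, so the extended queries can also be run directly. A sketch of that invocation, mirroring `run_clickbench_extended`, with illustrative paths standing in for `${DATA_DIR}` and `${SCRIPT_DIR}`, and assuming `$CARGO_COMMAND` is roughly `cargo run --release` (its exact definition is outside this diff):

```bash
# Each line of extended.sql is one query (line 1 = Q0, line 2 = Q1, ...);
# dfbench runs them all, 5 iterations each, against the single parquet file.
cargo run --release --bin dfbench -- clickbench \
    --iterations 5 \
    --path benchmarks/data/hits.parquet \
    --queries-path benchmarks/queries/clickbench/extended.sql \
    -o clickbench_extended.json
```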




1 change: 0 additions & 1 deletion benchmarks/queries/clickbench/README.txt

This file was deleted.

1 change: 1 addition & 0 deletions benchmarks/queries/clickbench/extended.sql
@@ -0,0 +1 @@
SELECT COUNT(DISTINCT "SearchPhrase"), COUNT(DISTINCT "MobilePhone"), COUNT(DISTINCT "MobilePhoneModel") FROM hits;
66 changes: 41 additions & 25 deletions benchmarks/src/clickbench.rs
@@ -15,13 +15,14 @@
// specific language governing permissions and limitations
// under the License.

use std::path::Path;
use std::{path::PathBuf, time::Instant};

use datafusion::{
    common::exec_err,
    error::{DataFusionError, Result},
    prelude::SessionContext,
};
use datafusion_common::exec_datafusion_err;
use structopt::StructOpt;

use crate::{BenchmarkRun, CommonOpt};
@@ -69,15 +70,49 @@ pub struct RunOpt {
    output_path: Option<PathBuf>,
}

const CLICKBENCH_QUERY_START_ID: usize = 0;
const CLICKBENCH_QUERY_END_ID: usize = 42;
struct AllQueries {
    queries: Vec<String>,
}

impl AllQueries {
    fn try_new(path: &Path) -> Result<Self> {
        // ClickBench has all queries in a single file identified by line number
        let all_queries = std::fs::read_to_string(path)
            .map_err(|e| exec_datafusion_err!("Could not open {path:?}: {e}"))?;
        Ok(Self {
            queries: all_queries.lines().map(|s| s.to_string()).collect(),
        })
    }

    /// Returns the text of query `query_id`
    fn get_query(&self, query_id: usize) -> Result<&str> {
        self.queries
            .get(query_id)
            .ok_or_else(|| {
                let min_id = self.min_query_id();
                let max_id = self.max_query_id();
                exec_datafusion_err!(
                    "Invalid query id {query_id}. Must be between {min_id} and {max_id}"
                )
            })
            .map(|s| s.as_str())
    }

    fn min_query_id(&self) -> usize {
        0
    }

    fn max_query_id(&self) -> usize {
        self.queries.len() - 1
    }
}
impl RunOpt {
    pub async fn run(self) -> Result<()> {
        println!("Running benchmarks with the following options: {self:?}");
        let queries = AllQueries::try_new(self.queries_path.as_path())?;
        let query_range = match self.query {
            Some(query_id) => query_id..=query_id,
            None => CLICKBENCH_QUERY_START_ID..=CLICKBENCH_QUERY_END_ID,
            None => queries.min_query_id()..=queries.max_query_id(),
        };

        let config = self.common.config();
@@ -88,12 +123,12 @@ impl RunOpt {
        let mut benchmark_run = BenchmarkRun::new();
        for query_id in query_range {
            benchmark_run.start_new_case(&format!("Query {query_id}"));
            let sql = self.get_query(query_id)?;
            let sql = queries.get_query(query_id)?;
            println!("Q{query_id}: {sql}");

            for i in 0..iterations {
                let start = Instant::now();
                let results = ctx.sql(&sql).await?.collect().await?;
                let results = ctx.sql(sql).await?.collect().await?;
                let elapsed = start.elapsed();
                let ms = elapsed.as_secs_f64() * 1000.0;
                let row_count: usize = results.iter().map(|b| b.num_rows()).sum();
@@ -120,23 +155,4 @@ impl RunOpt {
            )
        })
    }

    /// Returns the text of query `query_id`
    fn get_query(&self, query_id: usize) -> Result<String> {
        if query_id > CLICKBENCH_QUERY_END_ID {
            return exec_err!(
                "Invalid query id {query_id}. Must be between {CLICKBENCH_QUERY_START_ID} and {CLICKBENCH_QUERY_END_ID}"
            );
        }

        let path = self.queries_path.as_path();

        // ClickBench has all queries in a single file identified by line number
        let all_queries = std::fs::read_to_string(path).map_err(|e| {
            DataFusionError::Execution(format!("Could not open {path:?}: {e}"))
        })?;
        let all_queries: Vec<_> = all_queries.lines().collect();

        Ok(all_queries.get(query_id).map(|s| s.to_string()).unwrap())
    }
}
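Since `AllQueries` treats line `N+1` of the queries file as query `N` (queries are 0-indexed, lines are 1-indexed), the mapping can be spot-checked from the shell. A small sketch, independent of the Rust code above:

```bash
# Q0 is line 1, Q1 is line 2, ...; print the text of Q2:
sed -n '3p' benchmarks/queries/clickbench/queries.sql
```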