Skip to content

Commit 26adc8f

Browse files
acking-youKontinuationtimsauceralamb
authored
Add short circuit evaluation for AND and OR (apache#15462)
* [draft] add shot circuit in BinaryExpr * refactor: add check_short_circuit function * refactor: change if condition to match * feat: Add support for --mem-pool-type and --memory-limit options to multiple benchmarks (apache#14642) * Add support --mem-pool-type and --memory-limit options for all benchmarks * Add --sort-spill-reservation-bytes option * Chore/Add additional FFI unit tests (apache#14802) * Add unit tests to FFI_ExecutionPlan * Add unit tests for FFI table source * Add round trip tests for volatility * Add unit tests for FFI insert op * Simplify string generation in unit test Co-authored-by: Andrew Lamb <andrew@nerdnetworks.org> * Fix drop of borrowed value --------- Co-authored-by: Andrew Lamb <andrew@nerdnetworks.org> * Improve feature flag CI coverage `datafusion` and `datafusion-functions` (apache#15203) * add extend sql & docs * feat: Add support for --mem-pool-type and --memory-limit options to multiple benchmarks (apache#14642) * Add support --mem-pool-type and --memory-limit options for all benchmarks * Add --sort-spill-reservation-bytes option * Chore/Add additional FFI unit tests (apache#14802) * Add unit tests to FFI_ExecutionPlan * Add unit tests for FFI table source * Add round trip tests for volatility * Add unit tests for FFI insert op * Simplify string generation in unit test Co-authored-by: Andrew Lamb <andrew@nerdnetworks.org> * Fix drop of borrowed value --------- Co-authored-by: Andrew Lamb <andrew@nerdnetworks.org> * Improve feature flag CI coverage `datafusion` and `datafusion-functions` (apache#15203) * fix: incorrect false judgment * add test * separate q6 to new PR * feat: Add support for --mem-pool-type and --memory-limit options to multiple benchmarks (apache#14642) * Add support --mem-pool-type and --memory-limit options for all benchmarks * Add --sort-spill-reservation-bytes option * Chore/Add additional FFI unit tests (apache#14802) * Add unit tests to FFI_ExecutionPlan * Add unit tests for FFI table source * Add round trip tests for volatility * Add unit tests for FFI insert op * Simplify string generation in unit test Co-authored-by: Andrew Lamb <andrew@nerdnetworks.org> * Fix drop of borrowed value --------- Co-authored-by: Andrew Lamb <andrew@nerdnetworks.org> * Improve feature flag CI coverage `datafusion` and `datafusion-functions` (apache#15203) * feat: Add support for --mem-pool-type and --memory-limit options to multiple benchmarks (apache#14642) * Add support --mem-pool-type and --memory-limit options for all benchmarks * Add --sort-spill-reservation-bytes option * Chore/Add additional FFI unit tests (apache#14802) * Add unit tests to FFI_ExecutionPlan * Add unit tests for FFI table source * Add round trip tests for volatility * Add unit tests for FFI insert op * Simplify string generation in unit test Co-authored-by: Andrew Lamb <andrew@nerdnetworks.org> * Fix drop of borrowed value --------- Co-authored-by: Andrew Lamb <andrew@nerdnetworks.org> * Improve feature flag CI coverage `datafusion` and `datafusion-functions` (apache#15203) * add benchmark for boolean_op * fix cargo doc * add binary_op bench * Better comments --------- Co-authored-by: Kristin Cowalcijk <bo@wherobots.com> Co-authored-by: Tim Saucer <timsaucer@gmail.com> Co-authored-by: Andrew Lamb <andrew@nerdnetworks.org>
1 parent c188f46 commit 26adc8f

File tree

3 files changed

+472
-0
lines changed

3 files changed

+472
-0
lines changed

datafusion/physical-expr/Cargo.toml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -71,3 +71,7 @@ name = "case_when"
7171
[[bench]]
7272
harness = false
7373
name = "is_null"
74+
75+
[[bench]]
76+
harness = false
77+
name = "binary_op"
Lines changed: 373 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,373 @@
1+
// Licensed to the Apache Software Foundation (ASF) under one
2+
// or more contributor license agreements. See the NOTICE file
3+
// distributed with this work for additional information
4+
// regarding copyright ownership. The ASF licenses this file
5+
// to you under the Apache License, Version 2.0 (the
6+
// "License"); you may not use this file except in compliance
7+
// with the License. You may obtain a copy of the License at
8+
//
9+
// http://www.apache.org/licenses/LICENSE-2.0
10+
//
11+
// Unless required by applicable law or agreed to in writing,
12+
// software distributed under the License is distributed on an
13+
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
// KIND, either express or implied. See the License for the
15+
// specific language governing permissions and limitations
16+
// under the License.
17+
18+
use arrow::{
19+
array::BooleanArray,
20+
compute::{bool_and, bool_or},
21+
datatypes::{DataType, Field, Schema},
22+
};
23+
use arrow::{array::StringArray, record_batch::RecordBatch};
24+
use criterion::{black_box, criterion_group, criterion_main, Criterion};
25+
use datafusion_expr::{and, binary_expr, col, lit, or, Operator};
26+
use datafusion_physical_expr::{
27+
expressions::{BinaryExpr, Column},
28+
planner::logical2physical,
29+
PhysicalExpr,
30+
};
31+
use std::sync::{Arc, LazyLock};
32+
33+
/// Generates BooleanArrays with different true/false distributions for benchmarking.
34+
///
35+
/// Returns a vector of tuples containing scenario name and corresponding BooleanArray.
36+
///
37+
/// # Arguments
38+
/// - `TEST_ALL_FALSE` - Used to generate what kind of test data
39+
/// - `len` - Length of the BooleanArray to generate
40+
fn generate_boolean_cases<const TEST_ALL_FALSE: bool>(
41+
len: usize,
42+
) -> Vec<(String, BooleanArray)> {
43+
let mut cases = Vec::with_capacity(6);
44+
45+
// Scenario 1: All elements false or all elements true
46+
if TEST_ALL_FALSE {
47+
let all_false = BooleanArray::from(vec![false; len]);
48+
cases.push(("all_false".to_string(), all_false));
49+
} else {
50+
let all_true = BooleanArray::from(vec![true; len]);
51+
cases.push(("all_true".to_string(), all_true));
52+
}
53+
54+
// Scenario 2: Single true at first position or single false at first position
55+
if TEST_ALL_FALSE {
56+
let mut first_true = vec![false; len];
57+
first_true[0] = true;
58+
cases.push(("one_true_first".to_string(), BooleanArray::from(first_true)));
59+
} else {
60+
let mut first_false = vec![true; len];
61+
first_false[0] = false;
62+
cases.push((
63+
"one_false_first".to_string(),
64+
BooleanArray::from(first_false),
65+
));
66+
}
67+
68+
// Scenario 3: Single true at last position or single false at last position
69+
if TEST_ALL_FALSE {
70+
let mut last_true = vec![false; len];
71+
last_true[len - 1] = true;
72+
cases.push(("one_true_last".to_string(), BooleanArray::from(last_true)));
73+
} else {
74+
let mut last_false = vec![true; len];
75+
last_false[len - 1] = false;
76+
cases.push(("one_false_last".to_string(), BooleanArray::from(last_false)));
77+
}
78+
79+
// Scenario 4: Single true at exact middle or single false at exact middle
80+
let mid = len / 2;
81+
if TEST_ALL_FALSE {
82+
let mut mid_true = vec![false; len];
83+
mid_true[mid] = true;
84+
cases.push(("one_true_middle".to_string(), BooleanArray::from(mid_true)));
85+
} else {
86+
let mut mid_false = vec![true; len];
87+
mid_false[mid] = false;
88+
cases.push((
89+
"one_false_middle".to_string(),
90+
BooleanArray::from(mid_false),
91+
));
92+
}
93+
94+
// Scenario 5: Single true at 25% position or single false at 25% position
95+
let mid_left = len / 4;
96+
if TEST_ALL_FALSE {
97+
let mut mid_left_true = vec![false; len];
98+
mid_left_true[mid_left] = true;
99+
cases.push((
100+
"one_true_middle_left".to_string(),
101+
BooleanArray::from(mid_left_true),
102+
));
103+
} else {
104+
let mut mid_left_false = vec![true; len];
105+
mid_left_false[mid_left] = false;
106+
cases.push((
107+
"one_false_middle_left".to_string(),
108+
BooleanArray::from(mid_left_false),
109+
));
110+
}
111+
112+
// Scenario 6: Single true at 75% position or single false at 75% position
113+
let mid_right = (3 * len) / 4;
114+
if TEST_ALL_FALSE {
115+
let mut mid_right_true = vec![false; len];
116+
mid_right_true[mid_right] = true;
117+
cases.push((
118+
"one_true_middle_right".to_string(),
119+
BooleanArray::from(mid_right_true),
120+
));
121+
} else {
122+
let mut mid_right_false = vec![true; len];
123+
mid_right_false[mid_right] = false;
124+
cases.push((
125+
"one_false_middle_right".to_string(),
126+
BooleanArray::from(mid_right_false),
127+
));
128+
}
129+
130+
cases
131+
}
132+
133+
/// Benchmarks boolean operations `false_count/bool_or` and `true_count/bool_and` on [`BooleanArray`]
134+
/// You can run this benchmark with:
135+
/// ```sh
136+
/// # test true_count/false_count
137+
/// TEST_BOOL_COUNT=1 cargo bench --bench binary_op -- boolean_ops
138+
/// # test bool_or/bool_and
139+
/// cargo bench --bench binary_op -- boolean_ops
140+
/// ```
141+
fn benchmark_boolean_ops(c: &mut Criterion) {
142+
let len = 1_000_000; // Use one million elements for clear performance differentiation
143+
static TEST_BOOL_COUNT: LazyLock<bool> =
144+
LazyLock::new(|| match std::env::var("TEST_BOOL_COUNT") {
145+
Ok(_) => {
146+
println!("TEST_BOOL_COUNT=ON");
147+
true
148+
}
149+
Err(_) => {
150+
println!("TEST_BOOL_COUNT=OFF");
151+
false
152+
}
153+
});
154+
155+
// Determine the test function to be executed based on the ENV `TEST_BOOL_COUNT`
156+
fn test_func<const TEST_ALL_FALSE: bool>(array: &BooleanArray) -> bool {
157+
// Use false_count for all false and true_count for all true
158+
if *TEST_BOOL_COUNT {
159+
if TEST_ALL_FALSE {
160+
array.false_count() == array.len()
161+
} else {
162+
array.true_count() == array.len()
163+
}
164+
}
165+
// Use bool_or for all false and bool_and for all true
166+
else if TEST_ALL_FALSE {
167+
match bool_or(array) {
168+
Some(v) => !v,
169+
None => false,
170+
}
171+
} else {
172+
bool_and(array).unwrap_or(false)
173+
}
174+
}
175+
176+
// Test cases for false_count and bool_or
177+
{
178+
let test_cases = generate_boolean_cases::<true>(len);
179+
for (scenario, array) in test_cases {
180+
let arr_ref = Arc::new(array);
181+
182+
// Benchmark test_func across different scenarios
183+
c.bench_function(&format!("boolean_ops/or/{}", scenario), |b| {
184+
b.iter(|| test_func::<true>(black_box(&arr_ref)))
185+
});
186+
}
187+
}
188+
// Test cases for true_count and bool_and
189+
{
190+
let test_cases = generate_boolean_cases::<false>(len);
191+
for (scenario, array) in test_cases {
192+
let arr_ref = Arc::new(array);
193+
194+
// Benchmark test_func across different scenarios
195+
c.bench_function(&format!("boolean_ops/and/{}", scenario), |b| {
196+
b.iter(|| test_func::<false>(black_box(&arr_ref)))
197+
});
198+
}
199+
}
200+
}
201+
202+
/// Benchmarks AND/OR operator short-circuiting by evaluating complex regex conditions.
203+
///
204+
/// Creates 6 test scenarios per operator:
205+
/// 1. All values enable short-circuit (all_true/all_false)
206+
/// 2. 2-6 Single true/false value at different positions to measure early exit
207+
///
208+
/// You can run this benchmark with:
209+
/// ```sh
210+
/// cargo bench --bench binary_op -- short_circuit
211+
/// ```
212+
fn benchmark_binary_op_in_short_circuit(c: &mut Criterion) {
213+
// Create schema with three columns
214+
let schema = Arc::new(Schema::new(vec![
215+
Field::new("a", DataType::Boolean, false),
216+
Field::new("b", DataType::Utf8, false),
217+
Field::new("c", DataType::Utf8, false),
218+
]));
219+
220+
// Generate test data with extended content
221+
let (b_values, c_values) = generate_test_strings(8192);
222+
223+
let batches_and =
224+
create_record_batch::<true>(schema.clone(), &b_values, &c_values).unwrap();
225+
let batches_or =
226+
create_record_batch::<false>(schema.clone(), &b_values, &c_values).unwrap();
227+
228+
// Build complex string matching conditions
229+
let right_condition_and = and(
230+
// Check for API endpoint pattern in URLs
231+
binary_expr(
232+
col("b"),
233+
Operator::RegexMatch,
234+
lit(r#"^https://(\w+\.)?example\.(com|org)/"#),
235+
),
236+
// Check for markdown code blocks and summary section
237+
binary_expr(
238+
col("c"),
239+
Operator::RegexMatch,
240+
lit("```(rust|python|go)\nfn? main$$"),
241+
),
242+
);
243+
244+
let right_condition_or = or(
245+
// Check for secure HTTPS protocol
246+
binary_expr(
247+
col("b"),
248+
Operator::RegexMatch,
249+
lit(r#"^https://(\w+\.)?example\.(com|org)/"#),
250+
),
251+
// Check for Rust code examples
252+
binary_expr(
253+
col("c"),
254+
Operator::RegexMatch,
255+
lit("```(rust|python|go)\nfn? main$$"),
256+
),
257+
);
258+
259+
// Create physical binary expressions
260+
let expr_and = BinaryExpr::new(
261+
Arc::new(Column::new("a", 0)),
262+
Operator::And,
263+
logical2physical(&right_condition_and, &schema),
264+
);
265+
266+
let expr_or = BinaryExpr::new(
267+
Arc::new(Column::new("a", 0)),
268+
Operator::Or,
269+
logical2physical(&right_condition_or, &schema),
270+
);
271+
272+
// Each scenario when the test operator is `and`
273+
{
274+
for (name, batch) in batches_and {
275+
c.bench_function(&format!("short_circuit/and/{}", name), |b| {
276+
b.iter(|| expr_and.evaluate(black_box(&batch)).unwrap())
277+
});
278+
}
279+
}
280+
// Each scenario when the test operator is `or`
281+
{
282+
for (name, batch) in batches_or {
283+
c.bench_function(&format!("short_circuit/or/{}", name), |b| {
284+
b.iter(|| expr_or.evaluate(black_box(&batch)).unwrap())
285+
});
286+
}
287+
}
288+
}
289+
290+
/// Generate test data with computationally expensive patterns
291+
fn generate_test_strings(num_rows: usize) -> (Vec<String>, Vec<String>) {
292+
// Extended URL patterns with query parameters and paths
293+
let base_urls = [
294+
"https://api.example.com/v2/users/12345/posts?category=tech&sort=date&lang=en-US",
295+
"https://cdn.example.net/assets/images/2023/08/15/sample-image-highres.jpg?width=1920&quality=85",
296+
"http://service.demo.org:8080/api/data/transactions/20230815123456.csv",
297+
"ftp://legacy.archive.example/backups/2023/Q3/database-dump.sql.gz",
298+
"https://docs.example.co.uk/reference/advanced-topics/concurrency/parallel-processing.md#implementation-details",
299+
];
300+
301+
// Extended markdown content with code blocks and structure
302+
let base_markdowns = [
303+
concat!(
304+
"# Advanced Topics in Computer Science\n\n",
305+
"## Summary\nThis article explores complex system design patterns and...\n\n",
306+
"```rust\nfn process_data(data: &mut [i32]) {\n // Parallel processing example\n data.par_iter_mut().for_each(|x| *x *= 2);\n}\n```\n\n",
307+
"## Performance Considerations\nWhen implementing concurrent systems...\n"
308+
),
309+
concat!(
310+
"## API Documentation\n\n",
311+
"```json\n{\n \"endpoint\": \"/api/v2/users\",\n \"methods\": [\"GET\", \"POST\"],\n \"parameters\": {\n \"page\": \"number\"\n }\n}\n```\n\n",
312+
"# Authentication Guide\nSecure your API access using OAuth 2.0...\n"
313+
),
314+
concat!(
315+
"# Data Processing Pipeline\n\n",
316+
"```python\nfrom multiprocessing import Pool\n\ndef main():\n with Pool(8) as p:\n results = p.map(process_item, data)\n```\n\n",
317+
"## Summary of Optimizations\n1. Batch processing\n2. Memory pooling\n3. Concurrent I/O operations\n"
318+
),
319+
concat!(
320+
"# System Architecture Overview\n\n",
321+
"## Components\n- Load Balancer\n- Database Cluster\n- Cache Service\n\n",
322+
"```go\nfunc main() {\n router := gin.Default()\n router.GET(\"/api/health\", healthCheck)\n router.Run(\":8080\")\n}\n```\n"
323+
),
324+
concat!(
325+
"## Configuration Reference\n\n",
326+
"```yaml\nserver:\n port: 8080\n max_threads: 32\n\ndatabase:\n url: postgres://user@prod-db:5432/main\n```\n\n",
327+
"# Deployment Strategies\nBlue-green deployment patterns with...\n"
328+
),
329+
];
330+
331+
let mut urls = Vec::with_capacity(num_rows);
332+
let mut markdowns = Vec::with_capacity(num_rows);
333+
334+
for i in 0..num_rows {
335+
urls.push(base_urls[i % 5].to_string());
336+
markdowns.push(base_markdowns[i % 5].to_string());
337+
}
338+
339+
(urls, markdowns)
340+
}
341+
342+
/// Creates record batches with boolean arrays that test different short-circuit scenarios.
343+
/// When TEST_ALL_FALSE = true: creates data for AND operator benchmarks (needs early false exit)
344+
/// When TEST_ALL_FALSE = false: creates data for OR operator benchmarks (needs early true exit)
345+
fn create_record_batch<const TEST_ALL_FALSE: bool>(
346+
schema: Arc<Schema>,
347+
b_values: &[String],
348+
c_values: &[String],
349+
) -> arrow::error::Result<Vec<(String, RecordBatch)>> {
350+
// Generate data for six scenarios, but only the data for the "all_false" and "all_true" cases can be optimized through short-circuiting
351+
let boolean_array = generate_boolean_cases::<TEST_ALL_FALSE>(b_values.len());
352+
let mut rbs = Vec::with_capacity(boolean_array.len());
353+
for (name, a_array) in boolean_array {
354+
let b_array = StringArray::from(b_values.to_vec());
355+
let c_array = StringArray::from(c_values.to_vec());
356+
rbs.push((
357+
name,
358+
RecordBatch::try_new(
359+
schema.clone(),
360+
vec![Arc::new(a_array), Arc::new(b_array), Arc::new(c_array)],
361+
)?,
362+
));
363+
}
364+
Ok(rbs)
365+
}
366+
367+
criterion_group!(
368+
benches,
369+
benchmark_boolean_ops,
370+
benchmark_binary_op_in_short_circuit
371+
);
372+
373+
criterion_main!(benches);

0 commit comments

Comments
 (0)