Skip to content

Commit 3292cef

Browse files
lbhm and zachschuermann authored
misc: Create expression benchmark for default engine (#1220)
## What changes are proposed in this pull request?

#1192 added a makeshift benchmark test to measure the performance of the new `ToJson` expression. This PR migrates that test to a dedicated `expression_bench` for the default engine. For now, `expression_bench` covers only `ToJson`, but it can be extended as necessary in future PRs.

## How was this change tested?

N/A - this PR adds a performance benchmark.

---------

Co-authored-by: Zach Schuermann <[email protected]>
1 parent 5bdc186 commit 3292cef

File tree

3 files changed

+183
-76
lines changed

3 files changed

+183
-76
lines changed

kernel/Cargo.toml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -152,3 +152,7 @@ tracing-subscriber = { version = "0.3", default-features = false, features = [
152152
[[bench]]
153153
name = "metadata_bench"
154154
harness = false
155+
156+
[[bench]]
157+
name = "expression_bench"
158+
harness = false

kernel/benches/expression_bench.rs

Lines changed: 177 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,177 @@
1+
//! Benchmark for expression evaluation performance with the default engine.
2+
//!
3+
//! You can run this benchmark with `cargo bench --bench expression_bench`.
4+
//!
5+
//! To compare your changes vs. latest main, you can:
6+
//! ```bash
7+
//! # checkout baseline branch (upstream/main) and save as baseline
8+
//! git checkout main # or upstream/main, another branch, etc.
9+
//! cargo bench --bench expression_bench -- --save-baseline main
10+
//!
11+
//! # switch back to your changes, and compare against baseline
12+
//! git checkout your-branch
13+
//! cargo bench --bench expression_bench -- --baseline main
14+
//! ```
15+
16+
use std::hint::black_box;
17+
use std::sync::Arc;
18+
19+
use delta_kernel::arrow::array::{
20+
ArrayRef, BooleanBuilder, Float64Builder, Int32Builder, StringBuilder, StructArray,
21+
};
22+
use delta_kernel::arrow::datatypes::{DataType, Field, Fields};
23+
use delta_kernel::engine::arrow_expression::evaluate_expression::to_json;
24+
25+
use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion, Throughput};
26+
27+
/// Creates a test struct array with realistic data for benchmarking.
28+
fn create_test_struct_array(num_rows: usize) -> StructArray {
29+
let mut id_builder = Int32Builder::with_capacity(num_rows);
30+
let mut name_builder = StringBuilder::with_capacity(num_rows, num_rows * 20);
31+
let mut score_builder = Float64Builder::with_capacity(num_rows);
32+
let mut active_builder = BooleanBuilder::with_capacity(num_rows);
33+
34+
for i in 0..num_rows {
35+
id_builder.append_value(i as i32);
36+
name_builder.append_value(format!("user_{i}"));
37+
score_builder.append_value((i as f64) * 0.1 + 100.0);
38+
active_builder.append_value(i % 3 != 0);
39+
}
40+
41+
let fields = Fields::from(vec![
42+
Arc::new(Field::new("id", DataType::Int32, false)),
43+
Arc::new(Field::new("name", DataType::Utf8, false)),
44+
Arc::new(Field::new("score", DataType::Float64, false)),
45+
Arc::new(Field::new("active", DataType::Boolean, false)),
46+
]);
47+
48+
let arrays: Vec<ArrayRef> = vec![
49+
Arc::new(id_builder.finish()),
50+
Arc::new(name_builder.finish()),
51+
Arc::new(score_builder.finish()),
52+
Arc::new(active_builder.finish()),
53+
];
54+
55+
StructArray::new(fields, arrays, None)
56+
}
57+
58+
/// Creates a simple struct array with fewer fields for lightweight benchmarking.
59+
fn create_simple_struct_array(num_rows: usize) -> StructArray {
60+
let mut id_builder = Int32Builder::with_capacity(num_rows);
61+
let mut name_builder = StringBuilder::with_capacity(num_rows, num_rows * 10);
62+
63+
for i in 0..num_rows {
64+
id_builder.append_value(i as i32);
65+
name_builder.append_value(format!("item_{i}"));
66+
}
67+
68+
let fields = Fields::from(vec![
69+
Arc::new(Field::new("id", DataType::Int32, false)),
70+
Arc::new(Field::new("name", DataType::Utf8, false)),
71+
]);
72+
73+
let arrays: Vec<ArrayRef> = vec![
74+
Arc::new(id_builder.finish()),
75+
Arc::new(name_builder.finish()),
76+
];
77+
78+
StructArray::new(fields, arrays, None)
79+
}
80+
81+
/// Creates a nested struct array for complex JSON benchmarking.
82+
fn create_nested_struct_array(num_rows: usize) -> StructArray {
83+
// Create inner struct
84+
let mut inner_int_builder = Int32Builder::with_capacity(num_rows);
85+
let mut inner_string_builder = StringBuilder::with_capacity(num_rows, num_rows * 15);
86+
87+
for i in 0..num_rows {
88+
inner_int_builder.append_value(i as i32 * 10);
89+
inner_string_builder.append_value(format!("inner_{i}"));
90+
}
91+
92+
let inner_fields = Fields::from(vec![
93+
Arc::new(Field::new("inner_int", DataType::Int32, true)),
94+
Arc::new(Field::new("inner_string", DataType::Utf8, true)),
95+
]);
96+
97+
let inner_arrays: Vec<ArrayRef> = vec![
98+
Arc::new(inner_int_builder.finish()),
99+
Arc::new(inner_string_builder.finish()),
100+
];
101+
102+
let inner_struct = Arc::new(StructArray::new(inner_fields.clone(), inner_arrays, None));
103+
104+
// Create outer struct
105+
let mut outer_id_builder = Int32Builder::with_capacity(num_rows);
106+
for i in 0..num_rows {
107+
outer_id_builder.append_value(i as i32);
108+
}
109+
110+
let fields = Fields::from(vec![
111+
Arc::new(Field::new("outer_id", DataType::Int32, false)),
112+
Arc::new(Field::new(
113+
"nested_struct",
114+
DataType::Struct(inner_fields),
115+
true,
116+
)),
117+
]);
118+
119+
let arrays: Vec<ArrayRef> = vec![Arc::new(outer_id_builder.finish()), inner_struct];
120+
121+
StructArray::new(fields, arrays, None)
122+
}
123+
124+
fn to_json_benchmark(c: &mut Criterion) {
125+
let mut group = c.benchmark_group("to_json");
126+
127+
// Test different sizes for scalability analysis
128+
let test_sizes = [100, 1_000, 10_000, 100_000, 1_000_000];
129+
130+
for &size in &test_sizes {
131+
group.throughput(Throughput::Elements(size as u64));
132+
133+
// Benchmark simple struct array
134+
let simple_struct = create_simple_struct_array(size);
135+
group.bench_with_input(
136+
BenchmarkId::new("simple_struct", size),
137+
&simple_struct,
138+
|b, struct_array| {
139+
b.iter(|| {
140+
let result = to_json(black_box(struct_array));
141+
black_box(result).unwrap()
142+
})
143+
},
144+
);
145+
146+
// Benchmark complex struct array
147+
let complex_struct = create_test_struct_array(size);
148+
group.bench_with_input(
149+
BenchmarkId::new("complex_struct", size),
150+
&complex_struct,
151+
|b, struct_array| {
152+
b.iter(|| {
153+
let result = to_json(black_box(struct_array));
154+
black_box(result).unwrap()
155+
})
156+
},
157+
);
158+
159+
// Benchmark nested struct array
160+
let nested_struct = create_nested_struct_array(size);
161+
group.bench_with_input(
162+
BenchmarkId::new("nested_struct", size),
163+
&nested_struct,
164+
|b, struct_array| {
165+
b.iter(|| {
166+
let result = to_json(black_box(struct_array));
167+
black_box(result).unwrap()
168+
})
169+
},
170+
);
171+
}
172+
173+
group.finish();
174+
}
175+
176+
criterion_group!(benches, to_json_benchmark);
177+
criterion_main!(benches);

kernel/src/engine/arrow_expression/tests.rs

Lines changed: 2 additions & 76 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,8 @@
11
use std::ops::{Add, Div, Mul, Sub};
22

33
use crate::arrow::array::{
4-
create_array, Array, ArrayRef, AsArray, BooleanArray, GenericStringArray, Int32Array,
5-
Int32Builder, ListArray, MapArray, MapBuilder, MapFieldNames, StringArray, StringBuilder,
6-
StructArray,
4+
create_array, Array, ArrayRef, BooleanArray, GenericStringArray, Int32Array, Int32Builder,
5+
ListArray, MapArray, MapBuilder, MapFieldNames, StringArray, StringBuilder, StructArray,
76
};
87
use crate::arrow::buffer::{BooleanBuffer, NullBuffer, OffsetBuffer, ScalarBuffer};
98
use crate::arrow::compute::kernels::cmp::{gt_eq, lt};
@@ -1109,76 +1108,3 @@ fn test_to_json_with_nested_struct() {
11091108
r#"{"outer_int":200,"nested_struct":{"inner_string":"value"}}"#
11101109
);
11111110
}
1112-
1113-
#[test]
#[ignore]
fn benchmark_to_json_performance() {
    use std::time::Instant;

    /// Builds `num_rows` rows of `{id, name, score, active}` data with no nulls.
    fn create_large_test_struct_array(num_rows: usize) -> StructArray {
        let mut ids = Int32Builder::with_capacity(num_rows);
        let mut names = StringBuilder::with_capacity(num_rows, num_rows * 20);
        let mut scores = crate::arrow::array::Float64Builder::with_capacity(num_rows);
        let mut actives = crate::arrow::array::BooleanBuilder::with_capacity(num_rows);

        (0..num_rows).for_each(|row| {
            ids.append_value(row as i32);
            names.append_value(format!("user_{row}"));
            scores.append_value((row as f64) * 0.1 + 100.0);
            actives.append_value(row % 3 != 0);
        });

        // Column order must match the field order declared below.
        let columns: Vec<ArrayRef> = vec![
            Arc::new(ids.finish()),
            Arc::new(names.finish()),
            Arc::new(scores.finish()),
            Arc::new(actives.finish()),
        ];

        let schema = Fields::from(vec![
            Arc::new(Field::new("id", DataType::Int32, false)),
            Arc::new(Field::new("name", DataType::Utf8, false)),
            Arc::new(Field::new("score", DataType::Float64, false)),
            Arc::new(Field::new("active", DataType::Boolean, false)),
        ]);

        StructArray::new(schema, columns, None)
    }

    // Time a single conversion at several sizes and print the per-row cost.
    for &size in &[100, 1000, 5000, 100000] {
        let input = create_large_test_struct_array(size);

        let timer = Instant::now();
        let result = to_json(&input).unwrap();
        let elapsed = timer.elapsed();

        println!(
            "to_json processed {} rows in {:?} ({:.2} μs/row)",
            size,
            elapsed,
            elapsed.as_micros() as f64 / size as f64
        );

        // Sanity-check the output: one JSON string per input row.
        assert_eq!(result.len(), size);
        let json_rows = result.as_string::<i32>();

        // Spot-check the first row (id 0 is a multiple of 3, so `active` is false).
        if size > 0 {
            let row0 = json_rows.value(0);
            assert!(row0.contains("\"id\":0"));
            assert!(row0.contains("\"name\":\"user_0\""));
            assert!(row0.contains("\"score\":100"));
            assert!(row0.contains("\"active\":false"));
        }

        // Spot-check the second row.
        if size > 1 {
            let row1 = json_rows.value(1);
            assert!(row1.contains("\"id\":1"));
            assert!(row1.contains("\"name\":\"user_1\""));
        }
    }
}

0 commit comments

Comments
 (0)