Skip to content

Commit b7f70a0

Browse files
committed
feat: Encode stats for Binary and FixedLengthBinary
This now writes binary stats as escaped string sequences according the delta lake protocol instead of logging a warning. The encoding logic was all there, just needed to call it add add tests. Signed-off-by: Michael R. Maletich <[email protected]>
1 parent 2846081 commit b7f70a0

File tree

1 file changed

+52
-21
lines changed

1 file changed

+52
-21
lines changed

crates/core/src/writer/stats.rs

Lines changed: 52 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,13 @@
11
use std::cmp::min;
2-
use std::ops::Not;
2+
33
use std::sync::Arc;
44
use std::time::{SystemTime, UNIX_EPOCH};
55
use std::{collections::HashMap, ops::AddAssign};
66

77
use delta_kernel::expressions::Scalar;
88
use indexmap::IndexMap;
99
use itertools::Itertools;
10-
use parquet::basic::Type;
10+
1111
use parquet::file::metadata::ParquetMetaData;
1212
use parquet::format::FileMetaData;
1313
use parquet::schema::types::{ColumnDescriptor, SchemaDescriptor};
@@ -16,7 +16,6 @@ use parquet::{
1616
file::{metadata::RowGroupMetaData, statistics::Statistics},
1717
format::TimeUnit,
1818
};
19-
use tracing::warn;
2019

2120
use super::*;
2221
use crate::kernel::{scalars::ScalarExt, Add};
@@ -190,19 +189,10 @@ fn stats_from_metadata(
190189
let maybe_stats: Option<AggregatedStats> = row_group_metadata
191190
.iter()
192191
.flat_map(|g| {
193-
g.column(idx).statistics().into_iter().filter_map(|s| {
194-
let is_binary = matches!(&column_descr.physical_type(), Type::BYTE_ARRAY)
195-
&& matches!(column_descr.logical_type(), Some(LogicalType::String)).not();
196-
if is_binary {
197-
warn!(
198-
"Skipping column {} because it's a binary field.",
199-
&column_descr.name().to_string()
200-
);
201-
None
202-
} else {
203-
Some(AggregatedStats::from((s, &column_descr.logical_type())))
204-
}
205-
})
192+
g.column(idx)
193+
.statistics()
194+
.into_iter()
195+
.map(|s| AggregatedStats::from((s, &column_descr.logical_type())))
206196
})
207197
.reduce(|mut left, right| {
208198
left += right;
@@ -316,7 +306,6 @@ impl StatsScalar {
316306
}
317307
.unwrap_or_default();
318308
match logical_type {
319-
None => Ok(Self::Bytes(bytes.to_vec())),
320309
Some(LogicalType::String) => {
321310
Ok(Self::String(String::from_utf8(bytes.to_vec()).map_err(
322311
|_| DeltaWriterError::StatsParsingFailed {
@@ -325,10 +314,7 @@ impl StatsScalar {
325314
},
326315
)?))
327316
}
328-
_ => Err(DeltaWriterError::StatsParsingFailed {
329-
debug_value: format!("{bytes:?}"),
330-
logical_type: logical_type.clone(),
331-
}),
317+
_ => Ok(Self::Bytes(bytes.to_vec())),
332318
}
333319
}
334320
(Statistics::FixedLenByteArray(v), Some(LogicalType::Decimal { scale, precision })) => {
@@ -384,6 +370,17 @@ impl StatsScalar {
384370
let val = uuid::Uuid::from_bytes(bytes);
385371
Ok(Self::Uuid(val))
386372
}
373+
(Statistics::FixedLenByteArray(v), None) => {
374+
let bytes = if use_min {
375+
v.min_bytes_opt()
376+
} else {
377+
v.max_bytes_opt()
378+
}
379+
.unwrap_or_default();
380+
381+
Ok(Self::Bytes(bytes.to_vec()))
382+
}
383+
// TODO other fixed binary column types
387384
(stats, _) => Err(DeltaWriterError::StatsParsingFailed {
388385
debug_value: format!("{stats:?}"),
389386
logical_type: logical_type.clone(),
@@ -798,6 +795,22 @@ mod tests {
798795
Some(LogicalType::Uuid),
799796
Value::from("c2e8c7f7-d1f9-4b49-a5d9-4bfe75c317e2"),
800797
),
798+
(
799+
simple_parquet_stat!(
800+
Statistics::ByteArray,
801+
ByteArray::from(b"\x00\x00\x01\x02".to_vec())
802+
),
803+
None,
804+
Value::from("\\x00\\x00\\x01\\x02"),
805+
),
806+
(
807+
simple_parquet_stat!(
808+
Statistics::FixedLenByteArray,
809+
FixedLenByteArray::from(b"\x00\x00\x01\x02".to_vec())
810+
),
811+
None,
812+
Value::from("\\x00\\x00\\x01\\x02"),
813+
),
801814
];
802815

803816
for (stats, logical_type, expected) in cases {
@@ -880,6 +893,12 @@ mod tests {
880893
("uuid", ColumnValueStat::Value(v)) => {
881894
assert_eq!("176c770d-92af-4a21-bf76-5d8c5261d659", v.as_str().unwrap())
882895
}
896+
("binary", ColumnValueStat::Value(v)) => {
897+
assert_eq!("\\x00\\x00\\x01\\x02\\x03\\x04", v.as_str().unwrap())
898+
}
899+
("fixed_binary", ColumnValueStat::Value(v)) => {
900+
assert_eq!("\\x00\\x00\\x01\\x02\\x03", v.as_str().unwrap())
901+
}
883902
k => panic!("Key {k:?} should not be present in min_values"),
884903
}
885904
}
@@ -911,6 +930,12 @@ mod tests {
911930
("uuid", ColumnValueStat::Value(v)) => {
912931
assert_eq!("a98bea04-d119-4f21-8edc-eb218b5849af", v.as_str().unwrap())
913932
}
933+
("binary", ColumnValueStat::Value(v)) => {
934+
assert_eq!("\\x00\\x00\\x01\\x02\\x03\\x05", v.as_str().unwrap())
935+
}
936+
("fixed_binary", ColumnValueStat::Value(v)) => {
937+
assert_eq!("\\x00\\x00\\x01\\x02\\x04", v.as_str().unwrap())
938+
}
914939
k => panic!("Key {k:?} should not be present in max_values"),
915940
}
916941
}
@@ -938,6 +963,8 @@ mod tests {
938963
("some_nested_list", ColumnCountStat::Value(v)) => assert_eq!(100, *v),
939964
("date", ColumnCountStat::Value(v)) => assert_eq!(0, *v),
940965
("uuid", ColumnCountStat::Value(v)) => assert_eq!(0, *v),
966+
("binary", ColumnCountStat::Value(v)) => assert_eq!(100, *v),
967+
("fixed_binary", ColumnCountStat::Value(v)) => assert_eq!(100, *v),
941968
k => panic!("Key {k:?} should not be present in null_count"),
942969
}
943970
}
@@ -1089,6 +1116,8 @@ mod tests {
10891116
"some_nested_list": [[42], [84]],
10901117
"date": "2021-06-22",
10911118
"uuid": "176c770d-92af-4a21-bf76-5d8c5261d659",
1119+
"binary": "\\x00\\x00\\x01\\x02\\x03\\x04",
1120+
"fixed_binary": "\\x00\\x00\\x01\\x02\\x03",
10921121
}),
10931122
100,
10941123
)
@@ -1111,6 +1140,8 @@ mod tests {
11111140
"some_nested_list": [[42], [84]],
11121141
"date": "2021-06-22",
11131142
"uuid": "54f3e867-3f7b-4122-a452-9d74fb4fe1ba",
1143+
"binary": "\\x00\\x00\\x01\\x02\\x03\\x05",
1144+
"fixed_binary": "\\x00\\x00\\x01\\x02\\x04",
11141145
}),
11151146
100,
11161147
))

0 commit comments

Comments
 (0)