1
1
use std:: cmp:: min;
2
- use std :: ops :: Not ;
2
+
3
3
use std:: sync:: Arc ;
4
4
use std:: time:: { SystemTime , UNIX_EPOCH } ;
5
5
use std:: { collections:: HashMap , ops:: AddAssign } ;
6
6
7
7
use delta_kernel:: expressions:: Scalar ;
8
8
use indexmap:: IndexMap ;
9
9
use itertools:: Itertools ;
10
- use parquet :: basic :: Type ;
10
+
11
11
use parquet:: file:: metadata:: ParquetMetaData ;
12
12
use parquet:: format:: FileMetaData ;
13
13
use parquet:: schema:: types:: { ColumnDescriptor , SchemaDescriptor } ;
@@ -16,7 +16,6 @@ use parquet::{
16
16
file:: { metadata:: RowGroupMetaData , statistics:: Statistics } ,
17
17
format:: TimeUnit ,
18
18
} ;
19
- use tracing:: warn;
20
19
21
20
use super :: * ;
22
21
use crate :: kernel:: { scalars:: ScalarExt , Add } ;
@@ -190,19 +189,10 @@ fn stats_from_metadata(
190
189
let maybe_stats: Option < AggregatedStats > = row_group_metadata
191
190
. iter ( )
192
191
. flat_map ( |g| {
193
- g. column ( idx) . statistics ( ) . into_iter ( ) . filter_map ( |s| {
194
- let is_binary = matches ! ( & column_descr. physical_type( ) , Type :: BYTE_ARRAY )
195
- && matches ! ( column_descr. logical_type( ) , Some ( LogicalType :: String ) ) . not ( ) ;
196
- if is_binary {
197
- warn ! (
198
- "Skipping column {} because it's a binary field." ,
199
- & column_descr. name( ) . to_string( )
200
- ) ;
201
- None
202
- } else {
203
- Some ( AggregatedStats :: from ( ( s, & column_descr. logical_type ( ) ) ) )
204
- }
205
- } )
192
+ g. column ( idx)
193
+ . statistics ( )
194
+ . into_iter ( )
195
+ . map ( |s| AggregatedStats :: from ( ( s, & column_descr. logical_type ( ) ) ) )
206
196
} )
207
197
. reduce ( |mut left, right| {
208
198
left += right;
@@ -316,7 +306,6 @@ impl StatsScalar {
316
306
}
317
307
. unwrap_or_default ( ) ;
318
308
match logical_type {
319
- None => Ok ( Self :: Bytes ( bytes. to_vec ( ) ) ) ,
320
309
Some ( LogicalType :: String ) => {
321
310
Ok ( Self :: String ( String :: from_utf8 ( bytes. to_vec ( ) ) . map_err (
322
311
|_| DeltaWriterError :: StatsParsingFailed {
@@ -325,10 +314,7 @@ impl StatsScalar {
325
314
} ,
326
315
) ?) )
327
316
}
328
- _ => Err ( DeltaWriterError :: StatsParsingFailed {
329
- debug_value : format ! ( "{bytes:?}" ) ,
330
- logical_type : logical_type. clone ( ) ,
331
- } ) ,
317
+ _ => Ok ( Self :: Bytes ( bytes. to_vec ( ) ) ) ,
332
318
}
333
319
}
334
320
( Statistics :: FixedLenByteArray ( v) , Some ( LogicalType :: Decimal { scale, precision } ) ) => {
@@ -384,6 +370,17 @@ impl StatsScalar {
384
370
let val = uuid:: Uuid :: from_bytes ( bytes) ;
385
371
Ok ( Self :: Uuid ( val) )
386
372
}
373
+ ( Statistics :: FixedLenByteArray ( v) , None ) => {
374
+ let bytes = if use_min {
375
+ v. min_bytes_opt ( )
376
+ } else {
377
+ v. max_bytes_opt ( )
378
+ }
379
+ . unwrap_or_default ( ) ;
380
+
381
+ Ok ( Self :: Bytes ( bytes. to_vec ( ) ) )
382
+ }
383
+ // TODO other fixed binary column types
387
384
( stats, _) => Err ( DeltaWriterError :: StatsParsingFailed {
388
385
debug_value : format ! ( "{stats:?}" ) ,
389
386
logical_type : logical_type. clone ( ) ,
@@ -798,6 +795,22 @@ mod tests {
798
795
Some ( LogicalType :: Uuid ) ,
799
796
Value :: from ( "c2e8c7f7-d1f9-4b49-a5d9-4bfe75c317e2" ) ,
800
797
) ,
798
+ (
799
+ simple_parquet_stat ! (
800
+ Statistics :: ByteArray ,
801
+ ByteArray :: from( b"\x00 \x00 \x01 \x02 " . to_vec( ) )
802
+ ) ,
803
+ None ,
804
+ Value :: from ( "\\ x00\\ x00\\ x01\\ x02" ) ,
805
+ ) ,
806
+ (
807
+ simple_parquet_stat ! (
808
+ Statistics :: FixedLenByteArray ,
809
+ FixedLenByteArray :: from( b"\x00 \x00 \x01 \x02 " . to_vec( ) )
810
+ ) ,
811
+ None ,
812
+ Value :: from ( "\\ x00\\ x00\\ x01\\ x02" ) ,
813
+ ) ,
801
814
] ;
802
815
803
816
for ( stats, logical_type, expected) in cases {
@@ -880,6 +893,12 @@ mod tests {
880
893
( "uuid" , ColumnValueStat :: Value ( v) ) => {
881
894
assert_eq ! ( "176c770d-92af-4a21-bf76-5d8c5261d659" , v. as_str( ) . unwrap( ) )
882
895
}
896
+ ( "binary" , ColumnValueStat :: Value ( v) ) => {
897
+ assert_eq ! ( "\\ x00\\ x00\\ x01\\ x02\\ x03\\ x04" , v. as_str( ) . unwrap( ) )
898
+ }
899
+ ( "fixed_binary" , ColumnValueStat :: Value ( v) ) => {
900
+ assert_eq ! ( "\\ x00\\ x00\\ x01\\ x02\\ x03" , v. as_str( ) . unwrap( ) )
901
+ }
883
902
k => panic ! ( "Key {k:?} should not be present in min_values" ) ,
884
903
}
885
904
}
@@ -911,6 +930,12 @@ mod tests {
911
930
( "uuid" , ColumnValueStat :: Value ( v) ) => {
912
931
assert_eq ! ( "a98bea04-d119-4f21-8edc-eb218b5849af" , v. as_str( ) . unwrap( ) )
913
932
}
933
+ ( "binary" , ColumnValueStat :: Value ( v) ) => {
934
+ assert_eq ! ( "\\ x00\\ x00\\ x01\\ x02\\ x03\\ x05" , v. as_str( ) . unwrap( ) )
935
+ }
936
+ ( "fixed_binary" , ColumnValueStat :: Value ( v) ) => {
937
+ assert_eq ! ( "\\ x00\\ x00\\ x01\\ x02\\ x04" , v. as_str( ) . unwrap( ) )
938
+ }
914
939
k => panic ! ( "Key {k:?} should not be present in max_values" ) ,
915
940
}
916
941
}
@@ -938,6 +963,8 @@ mod tests {
938
963
( "some_nested_list" , ColumnCountStat :: Value ( v) ) => assert_eq ! ( 100 , * v) ,
939
964
( "date" , ColumnCountStat :: Value ( v) ) => assert_eq ! ( 0 , * v) ,
940
965
( "uuid" , ColumnCountStat :: Value ( v) ) => assert_eq ! ( 0 , * v) ,
966
+ ( "binary" , ColumnCountStat :: Value ( v) ) => assert_eq ! ( 100 , * v) ,
967
+ ( "fixed_binary" , ColumnCountStat :: Value ( v) ) => assert_eq ! ( 100 , * v) ,
941
968
k => panic ! ( "Key {k:?} should not be present in null_count" ) ,
942
969
}
943
970
}
@@ -1089,6 +1116,8 @@ mod tests {
1089
1116
"some_nested_list" : [ [ 42 ] , [ 84 ] ] ,
1090
1117
"date" : "2021-06-22" ,
1091
1118
"uuid" : "176c770d-92af-4a21-bf76-5d8c5261d659" ,
1119
+ "binary" : "\\ x00\\ x00\\ x01\\ x02\\ x03\\ x04" ,
1120
+ "fixed_binary" : "\\ x00\\ x00\\ x01\\ x02\\ x03" ,
1092
1121
} ) ,
1093
1122
100 ,
1094
1123
)
@@ -1111,6 +1140,8 @@ mod tests {
1111
1140
"some_nested_list" : [ [ 42 ] , [ 84 ] ] ,
1112
1141
"date" : "2021-06-22" ,
1113
1142
"uuid" : "54f3e867-3f7b-4122-a452-9d74fb4fe1ba" ,
1143
+ "binary" : "\\ x00\\ x00\\ x01\\ x02\\ x03\\ x05" ,
1144
+ "fixed_binary" : "\\ x00\\ x00\\ x01\\ x02\\ x04" ,
1114
1145
} ) ,
1115
1146
100 ,
1116
1147
) )
0 commit comments