@@ -81,8 +81,8 @@ pub(crate) struct RowTrackingVisitor {
81
81
/// High water mark for row IDs
82
82
pub ( crate ) row_id_high_water_mark : i64 ,
83
83
84
- /// Computed base row IDs of the visited actions
85
- pub ( crate ) base_row_ids : Vec < i64 > ,
84
+ /// Computed base row IDs of the visited actions, organized by batch
85
+ pub ( crate ) base_row_id_batches : Vec < Vec < i64 > > ,
86
86
}
87
87
88
88
impl RowTrackingVisitor {
@@ -94,11 +94,11 @@ impl RowTrackingVisitor {
94
94
/// We verify this hard-coded index in a test.
95
95
const NUM_RECORDS_FIELD_INDEX : usize = 5 ;
96
96
97
- pub ( crate ) fn new ( row_id_high_water_mark : Option < i64 > ) -> Self {
97
+ pub ( crate ) fn new ( row_id_high_water_mark : Option < i64 > , num_batches : Option < usize > ) -> Self {
98
98
// A table might not have a row ID high water mark yet, so we model the input as an Option<i64>
99
99
Self {
100
100
row_id_high_water_mark : row_id_high_water_mark. unwrap_or ( Self :: DEFAULT_HIGH_WATER_MARK ) ,
101
- base_row_ids : vec ! [ ] ,
101
+ base_row_id_batches : Vec :: with_capacity ( num_batches . unwrap_or ( 0 ) ) ,
102
102
}
103
103
}
104
104
}
@@ -119,9 +119,8 @@ impl RowVisitor for RowTrackingVisitor {
119
119
) )
120
120
) ;
121
121
122
- // Reset base row ID vector and allocate the necessary capacity
123
- self . base_row_ids . clear ( ) ;
124
- self . base_row_ids . reserve ( row_count) ;
122
+ // Create a new batch for this visit
123
+ let mut batch_base_row_ids = Vec :: with_capacity ( row_count) ;
125
124
126
125
let mut current_hwm = self . row_id_high_water_mark ;
127
126
for i in 0 ..row_count {
@@ -133,10 +132,11 @@ impl RowVisitor for RowTrackingVisitor {
133
132
. to_string ( ) ,
134
133
)
135
134
} ) ?;
136
- self . base_row_ids . push ( current_hwm + 1 ) ;
135
+ batch_base_row_ids . push ( current_hwm + 1 ) ;
137
136
current_hwm += num_records;
138
137
}
139
138
139
+ self . base_row_id_batches . push ( batch_base_row_ids) ;
140
140
self . row_id_high_water_mark = current_hwm;
141
141
Ok ( ( ) )
142
142
}
@@ -205,15 +205,16 @@ mod tests {
205
205
206
206
#[ test]
207
207
fn test_visit_basic_functionality ( ) -> DeltaResult < ( ) > {
208
- let mut visitor = RowTrackingVisitor :: new ( None ) ;
208
+ let mut visitor = RowTrackingVisitor :: new ( None , Some ( 1 ) ) ;
209
209
let num_records_mock = MockGetData :: new ( vec ! [ Some ( 10 ) , Some ( 5 ) , Some ( 20 ) ] ) ;
210
210
let unit_mock = ( ) ;
211
211
let getters = create_getters ( & num_records_mock, & unit_mock) ;
212
212
213
213
visitor. visit ( 3 , & getters) ?;
214
214
215
215
// Check that base row IDs are calculated correctly
216
- assert_eq ! ( visitor. base_row_ids, vec![ 0 , 10 , 15 ] ) ;
216
+ assert_eq ! ( visitor. base_row_id_batches. len( ) , 1 ) ;
217
+ assert_eq ! ( visitor. base_row_id_batches[ 0 ] , vec![ 0 , 10 , 15 ] ) ;
217
218
218
219
// Check that high water mark is updated correctly
219
220
assert_eq ! ( visitor. row_id_high_water_mark, 34 ) ; // -1 + 10 + 5 + 20
@@ -223,15 +224,16 @@ mod tests {
223
224
224
225
#[ test]
225
226
fn test_visit_with_negative_high_water_mark ( ) -> DeltaResult < ( ) > {
226
- let mut visitor = RowTrackingVisitor :: new ( Some ( -5 ) ) ;
227
+ let mut visitor = RowTrackingVisitor :: new ( Some ( -5 ) , Some ( 1 ) ) ;
227
228
let num_records_mock = MockGetData :: new ( vec ! [ Some ( 3 ) , Some ( 2 ) ] ) ;
228
229
let unit_mock = ( ) ;
229
230
let getters = create_getters ( & num_records_mock, & unit_mock) ;
230
231
231
232
visitor. visit ( 2 , & getters) ?;
232
233
233
234
// Base row IDs should start from high_water_mark + 1
234
- assert_eq ! ( visitor. base_row_ids, vec![ -4 , -1 ] ) ; // -5+1=-4, then -4+3=-1
235
+ assert_eq ! ( visitor. base_row_id_batches. len( ) , 1 ) ;
236
+ assert_eq ! ( visitor. base_row_id_batches[ 0 ] , vec![ -4 , -1 ] ) ; // -5+1=-4, then -4+3=-1
235
237
236
238
// High water mark should be updated
237
239
assert_eq ! ( visitor. row_id_high_water_mark, 0 ) ; // -5 + 3 + 2 = 0
@@ -241,15 +243,16 @@ mod tests {
241
243
242
244
#[ test]
243
245
fn test_visit_with_zero_records ( ) -> DeltaResult < ( ) > {
244
- let mut visitor = RowTrackingVisitor :: new ( Some ( 10 ) ) ;
246
+ let mut visitor = RowTrackingVisitor :: new ( Some ( 10 ) , Some ( 1 ) ) ;
245
247
let num_records_mock = MockGetData :: new ( vec ! [ Some ( 0 ) , Some ( 0 ) , Some ( 5 ) ] ) ;
246
248
let unit_mock = ( ) ;
247
249
let getters = create_getters ( & num_records_mock, & unit_mock) ;
248
250
249
251
visitor. visit ( 3 , & getters) ?;
250
252
251
253
// Base row IDs should still be assigned even for zero-record files
252
- assert_eq ! ( visitor. base_row_ids, vec![ 11 , 11 , 11 ] ) ;
254
+ assert_eq ! ( visitor. base_row_id_batches. len( ) , 1 ) ;
255
+ assert_eq ! ( visitor. base_row_id_batches[ 0 ] , vec![ 11 , 11 , 11 ] ) ;
253
256
254
257
// High water mark should only increase by non-zero records
255
258
assert_eq ! ( visitor. row_id_high_water_mark, 15 ) ; // 10 + 0 + 0 + 5
@@ -259,23 +262,54 @@ mod tests {
259
262
260
263
#[ test]
261
264
fn test_visit_empty_batch ( ) -> DeltaResult < ( ) > {
262
- let mut visitor = RowTrackingVisitor :: new ( Some ( 42 ) ) ;
265
+ let mut visitor = RowTrackingVisitor :: new ( Some ( 42 ) , None ) ;
263
266
let num_records_mock = MockGetData :: new ( vec ! [ ] ) ;
264
267
let unit_mock = ( ) ;
265
268
let getters = create_getters ( & num_records_mock, & unit_mock) ;
266
269
267
270
visitor. visit ( 0 , & getters) ?;
268
271
269
272
// Should handle empty batch gracefully
270
- assert ! ( visitor. base_row_ids. is_empty( ) ) ;
273
+ assert_eq ! ( visitor. base_row_id_batches. len( ) , 1 ) ;
274
+ assert ! ( visitor. base_row_id_batches[ 0 ] . is_empty( ) ) ;
271
275
assert_eq ! ( visitor. row_id_high_water_mark, 42 ) ; // Should remain unchanged
272
276
273
277
Ok ( ( ) )
274
278
}
275
279
280
+ #[ test]
281
+ fn test_visit_multiple_batches ( ) -> DeltaResult < ( ) > {
282
+ let mut visitor = RowTrackingVisitor :: new ( Some ( 0 ) , Some ( 2 ) ) ;
283
+ let unit_mock = ( ) ;
284
+
285
+ // First batch
286
+ let num_records_mock1 = MockGetData :: new ( vec ! [ Some ( 10 ) , Some ( 5 ) ] ) ;
287
+ let getters1 = create_getters ( & num_records_mock1, & unit_mock) ;
288
+ visitor. visit ( 2 , & getters1) ?;
289
+
290
+ // Second batch
291
+ let num_records_mock2 = MockGetData :: new ( vec ! [ Some ( 3 ) , Some ( 7 ) , Some ( 2 ) ] ) ;
292
+ let getters2 = create_getters ( & num_records_mock2, & unit_mock) ;
293
+ visitor. visit ( 3 , & getters2) ?;
294
+
295
+ // Check that we have two batches
296
+ assert_eq ! ( visitor. base_row_id_batches. len( ) , 2 ) ;
297
+
298
+ // Check first batch: starts at 1, then 11
299
+ assert_eq ! ( visitor. base_row_id_batches[ 0 ] , vec![ 1 , 11 ] ) ;
300
+
301
+ // Check second batch: starts at 16, then 19, then 26
302
+ assert_eq ! ( visitor. base_row_id_batches[ 1 ] , vec![ 16 , 19 , 26 ] ) ;
303
+
304
+ // Check final high water mark: 0 + 10 + 5 + 3 + 7 + 2 = 27
305
+ assert_eq ! ( visitor. row_id_high_water_mark, 27 ) ;
306
+
307
+ Ok ( ( ) )
308
+ }
309
+
276
310
#[ test]
277
311
fn test_visit_wrong_getter_count ( ) -> DeltaResult < ( ) > {
278
- let mut visitor = RowTrackingVisitor :: new ( Some ( 0 ) ) ;
312
+ let mut visitor = RowTrackingVisitor :: new ( Some ( 0 ) , None ) ;
279
313
let unit_mock = ( ) ;
280
314
let wrong_getters: Vec < & dyn GetData < ' _ > > = vec ! [ & unit_mock] ; // Only one getter instead of expected count
281
315
@@ -287,7 +321,7 @@ mod tests {
287
321
288
322
#[ test]
289
323
fn test_visit_missing_num_records ( ) -> DeltaResult < ( ) > {
290
- let mut visitor = RowTrackingVisitor :: new ( Some ( 0 ) ) ;
324
+ let mut visitor = RowTrackingVisitor :: new ( Some ( 0 ) , None ) ;
291
325
let num_records_mock = MockGetData :: new ( vec ! [ None ] ) ; // Missing numRecords
292
326
let unit_mock = ( ) ;
293
327
let getters = create_getters ( & num_records_mock, & unit_mock) ;
@@ -303,7 +337,7 @@ mod tests {
303
337
304
338
#[ test]
305
339
fn test_selected_column_names_and_types ( ) {
306
- let visitor = RowTrackingVisitor :: new ( Some ( 0 ) ) ;
340
+ let visitor = RowTrackingVisitor :: new ( Some ( 0 ) , None ) ;
307
341
let ( names, types) = visitor. selected_column_names_and_types ( ) ;
308
342
309
343
// Should return the same as add_files_schema().leaves(None)
0 commit comments