@@ -82,20 +82,23 @@ impl BitSequence {
82
82
pub fn new ( bits : u32 , bit_len : BitLen ) -> Self {
83
83
Self { bits, bit_len }
84
84
}
85
+
85
86
pub fn bits ( & self ) -> u32 {
86
87
self . bits
87
88
}
89
+
88
90
/// The number of bits of `bits` to use.
89
91
pub fn bit_len ( & self ) -> BitLen {
90
92
self . bit_len
91
93
}
94
+
92
95
/// Split the bits into a prefix of `bit_len` bits and a suffix of `self.bit_len - bit_len`
93
96
/// bits.
94
97
///
95
98
/// # Failure
96
99
///
97
100
/// This function panics if `bit_len > self.bit_len`.
98
- pub fn split ( & self , bit_len : BitLen ) -> ( u32 , u32 ) {
101
+ pub fn split_bits ( & self , bit_len : BitLen ) -> ( u32 , u32 ) {
99
102
let shift = self . bit_len - bit_len;
100
103
match shift. into ( ) {
101
104
0u8 => ( self . bits , 0 ) , // Special case: cannot >> 32
@@ -106,6 +109,25 @@ impl BitSequence {
106
109
) ,
107
110
}
108
111
}
112
+
113
+ /// Split the bits into a prefix of `bit_len` bits and a suffix of `self.bit_len - bit_len`
114
+ /// bits.
115
+ ///
116
+ /// # Failure
117
+ ///
118
+ /// This function panics if `bit_len > self.bit_len`.
119
+ pub fn split ( & self , bit_len : BitLen ) -> ( BitSequence , BitSequence ) {
120
+ let ( prefix, suffix) = self . split_bits ( bit_len) ;
121
+ (
122
+ BitSequence :: new ( prefix, bit_len) ,
123
+ BitSequence :: new ( suffix, self . bit_len - bit_len) ,
124
+ )
125
+ }
126
+
127
+ /// Add lowest-weight bits to this bit sequence until it reaches
128
+ /// a sufficient bit length.
129
+ ///
130
+ /// Does nothing if the bit sequence already has a sufficient bitlength.
109
131
pub fn pad_lowest_to ( & self , total_bit_len : BitLen ) -> Cow < BitSequence > {
110
132
assert ! ( total_bit_len. 0 <= 32u8 ) ;
111
133
if total_bit_len <= self . bit_len {
@@ -117,21 +139,29 @@ impl BitSequence {
117
139
}
118
140
Cow :: Owned ( BitSequence :: new ( self . bits << shift, total_bit_len) )
119
141
}
142
+
143
+ /// Prepend a sequence of bits to a sequencce.s
144
+ pub fn with_prefix ( & self , prefix : & BitSequence ) -> Self {
145
+ assert ! ( ( prefix. bit_len( ) + self . bit_len( ) ) . as_u8( ) <= 32 ) ;
146
+ let bits = self . bits | ( prefix. bits ( ) << self . bit_len ) ;
147
+ let bit_len = self . bit_len + prefix. bit_len ;
148
+ BitSequence :: new ( bits, bit_len)
149
+ }
120
150
}
121
151
122
152
#[test]
fn test_bit_sequence_split() {
    // A full 32-bit sequence whose upper 16 bits are set.
    let full = 0b11111111_11111111_00000000_00000000;
    let sequence = BitSequence::new(full, BitLen(32));
    let cases = [
        (0, (0, full)),
        (32, (full, 0)),
        (16, (0b11111111_11111111, 0)),
    ];
    for &(prefix_len, expected) in cases.iter() {
        assert_eq!(sequence.split_bits(BitLen(prefix_len)), expected);
    }

    // A 16-bit sequence whose lower 8 bits are set.
    let low = 0b00000000_00000000_00000000_11111111;
    let sequence = BitSequence::new(low, BitLen(16));
    let cases = [
        (0, (0, low)),
        (16, (low, 0)),
        (8, (0, 0b11111111)),
    ];
    for &(prefix_len, expected) in cases.iter() {
        assert_eq!(sequence.split_bits(BitLen(prefix_len)), expected);
    }
}
136
166
137
167
/// A Huffman key
@@ -159,6 +189,10 @@ impl Key {
159
189
Key ( BitSequence { bits, bit_len } )
160
190
}
161
191
192
+ pub fn from_bit_sequence ( sequence : BitSequence ) -> Self {
193
+ Self :: new ( sequence. bits , sequence. bit_len )
194
+ }
195
+
162
196
/// The bits in this Key.
163
197
///
164
198
/// # Invariant
@@ -176,6 +210,11 @@ impl Key {
176
210
pub fn as_bit_sequence ( & self ) -> & BitSequence {
177
211
& self . 0
178
212
}
213
+
214
+ pub fn with_prefix ( & self , prefix : & BitSequence ) -> Self {
215
+ let sequence = self . 0 . with_prefix ( prefix) ;
216
+ Key :: from_bit_sequence ( sequence)
217
+ }
179
218
}
180
219
181
220
/// A node in the Huffman tree.
@@ -219,43 +258,46 @@ impl<T> PartialEq for Node<T> {
219
258
}
220
259
impl < T > Eq for Node < T > { }
221
260
222
- /// Keys associated to a sequence of values.
261
+ /// Codebook associated to a sequence of values.
223
262
#[ derive( Clone , Debug ) ]
224
- pub struct Keys < T > {
225
- /// The longest bit length that actually appears in `keys `.
263
+ pub struct Codebook < T > {
264
+ /// The longest bit length that actually appears in `mappings `.
226
265
highest_bit_len : BitLen ,
227
266
228
267
/// The sequence of keys.
229
268
///
230
269
/// Order is meaningful.
231
- keys : Vec < ( T , Key ) > ,
270
+ mappings : Vec < ( T , Key ) > ,
232
271
}
233
272
234
- impl < T > Keys < T > {
273
+ impl < T > Codebook < T > {
274
+ /// The number of elements in this Codebook.
235
275
pub fn len ( & self ) -> usize {
236
- self . keys . len ( )
276
+ self . mappings . len ( )
237
277
}
278
+
279
+ /// The longest bit length that acctually appears in this Codebook.
238
280
pub fn highest_bit_len ( & self ) -> BitLen {
239
281
self . highest_bit_len
240
282
}
241
283
}
242
284
243
- impl < T > IntoIterator for Keys < T > {
285
+ impl < T > IntoIterator for Codebook < T > {
244
286
type Item = ( T , Key ) ;
245
287
type IntoIter = std:: vec:: IntoIter < ( T , Key ) > ;
246
288
fn into_iter ( self ) -> Self :: IntoIter {
247
- self . keys . into_iter ( )
289
+ self . mappings . into_iter ( )
248
290
}
249
291
}
250
292
251
- impl < T > Keys < T >
293
+ impl < T > Codebook < T >
252
294
where
253
295
T : Ord + Clone ,
254
296
{
255
- /// Compute a `Keys ` from a sequence of values.
297
+ /// Compute a `Codebook ` from a sequence of values.
256
298
///
257
299
/// Optionally, `max_bit_len` may specify a largest acceptable bit length.
258
- /// If `Keys ` may not be computed without exceeding this bit length,
300
+ /// If the `Codebook ` may not be computed without exceeding this bit length,
259
301
/// fail with `Err(problematic_bit_len)`.
260
302
///
261
303
/// The current implementation only attempts to produce the best compression
@@ -278,11 +320,11 @@ where
278
320
let counter = map. entry ( item) . or_insert ( 0 . into ( ) ) ;
279
321
* counter += 1 . into ( ) ;
280
322
}
281
- // Then compute the `Keys `.
323
+ // Then compute the `Codebook `.
282
324
Self :: from_instances ( map, max_bit_len)
283
325
}
284
326
285
- /// Compute a `Keys ` from a sequence of values
327
+ /// Compute a `Codebook ` from a sequence of values
286
328
/// with a number of instances already attached.
287
329
///
288
330
/// The current implementation only attempts to produce the best compression
@@ -305,27 +347,27 @@ where
305
347
306
348
// The bits associated to the next value.
307
349
let mut bits = 0 ;
308
- let mut keys = Vec :: with_capacity ( bit_lengths. len ( ) ) ;
350
+ let mut mappings = Vec :: with_capacity ( bit_lengths. len ( ) ) ;
309
351
310
352
for i in 0 ..bit_lengths. len ( ) - 1 {
311
353
let ( bit_len, symbol, next_bit_len) = (
312
354
bit_lengths[ i] . 1 ,
313
355
bit_lengths[ i] . 0 . clone ( ) ,
314
356
bit_lengths[ i + 1 ] . 1 ,
315
357
) ;
316
- keys . push ( ( symbol. clone ( ) , Key :: new ( bits, bit_len) ) ) ;
358
+ mappings . push ( ( symbol. clone ( ) , Key :: new ( bits, bit_len) ) ) ;
317
359
bits = ( bits + 1 ) << ( next_bit_len - bit_len) ;
318
360
if bit_len > highest_bit_len {
319
361
highest_bit_len = bit_len;
320
362
}
321
363
}
322
364
// Handle the last element.
323
365
let ( ref symbol, bit_len) = bit_lengths[ bit_lengths. len ( ) - 1 ] ;
324
- keys . push ( ( symbol. clone ( ) , Key :: new ( bits, bit_len) ) ) ;
366
+ mappings . push ( ( symbol. clone ( ) , Key :: new ( bits, bit_len) ) ) ;
325
367
326
368
return Ok ( Self {
327
369
highest_bit_len,
328
- keys ,
370
+ mappings ,
329
371
} ) ;
330
372
}
331
373
@@ -412,26 +454,106 @@ where
412
454
#[test]
fn test_coded_from_sequence() {
    let sample = "appl";
    let coded = Codebook::from_sequence(sample.chars(), std::u8::MAX).unwrap();

    // Symbol 'p' appears twice, we should see 3 codes.
    assert_eq!(coded.mappings.len(), 3);

    // Expected (symbol, bit length, code) triples, in codebook order.
    let expected = [('p', 1, 0b00), ('a', 2, 0b10), ('l', 2, 0b11)];

    // Check order of symbols.
    for (index, &(symbol, _, _)) in expected.iter().enumerate() {
        assert_eq!(coded.mappings[index].0, symbol);
    }

    // Check bit length of symbols.
    for (index, &(_, bit_len, _)) in expected.iter().enumerate() {
        assert_eq!(coded.mappings[index].1.bit_len(), BitLen(bit_len));
    }

    // Check code of symbols.
    for (index, &(_, _, bits)) in expected.iter().enumerate() {
        assert_eq!(coded.mappings[index].1.bits(), bits);
    }

    // Let's try again with a limit to 1 bit paths.
    assert_eq!(Codebook::from_sequence(sample.chars(), 1).unwrap_err(), 2);
}
480
+
481
+ impl < T > Codebook < T > {
482
+ /// Return the mappings of a Codebook.
483
+ pub fn mappings ( self ) -> Vec < ( T , Key ) > {
484
+ self . mappings
485
+ }
486
+
487
+ /// Split a Codebook into several Codebooks grouped by a common prefix.
488
+ ///
489
+ /// For instance, if `prefix_len` is 2, the result will be a vector of size 2^2
490
+ /// containing:
491
+ ///
492
+ /// - at index 0 (= 0b00), all the keys starting with 0b00, minus the prefix 0b00;
493
+ /// - at index 1 (= 0b01), all the keys starting with 0b01, minus the prefix 0b01;
494
+ /// - at index 2 (= 0b10), all the keys starting with 0b10, minus the prefix 0b10;
495
+ /// - at index 3 (= 0b11), all the keys starting with 0b11, minus the prefix 0b11.
496
+ ///
497
+ /// ```
498
+ /// let sample = "appl";
499
+ /// let coded = Codebook::from_sequence(sample.chars(), std::u8::MAX).unwrap();
500
+ /// // 0b0 => p
501
+ /// // 0b10 => a
502
+ /// // 0b11 => l
503
+ ///
504
+ /// let buckets = coded.bucket_by_prefix(1);
505
+ /// assert_eq!(buckets.len(), 2);
506
+ ///
507
+ /// // `buckets[0]` contains keys that start with `0`.
508
+ /// assert_eq!(buckets[0].len(), 1);
509
+ /// let bucket: Vec<_> = buckets[0].iter().collect();
510
+ /// assert_eq!(bucket[0].0, 'p');
511
+ /// assert_eq!(bucket[0].1, Key::new(0, BitLen(0))); // Key was 0b0, now empty.
512
+ ///
513
+ /// // `buckets[1]` contains keys that start with `1`.
514
+ /// assert_eq!(buckets[0].len(), 2);
515
+ /// let bucket: Vec<_> = buckets[0].iter().sorted_by_key(|(c, )| c);
516
+ /// assert_eq!(bucket[0].0, 'a');
517
+ /// assert_eq!(bucket[0].1, Key::new(1, BitLen(1))); // Key was 0b11, now 0b1
518
+ /// assert_eq!(bucket[1].0, 'l');
519
+ /// assert_eq!(bucket[1].1, Key::new(0, BitLen(1))); // Key was 0b10, now 0b0
520
+ /// ```
521
+ pub fn bucket_by_prefix ( self , prefix_len : BitLen ) -> Vec < Codebook < T > > {
522
+ assert ! ( prefix_len < self . highest_bit_len) ;
523
+
524
+ // Prepare empty buckets.
525
+ let mut result = Vec :: with_capacity ( 1usize << prefix_len) ;
526
+ result. resize_with ( 1usize << prefix_len, || Codebook {
527
+ highest_bit_len : 0 . into ( ) ,
528
+ mappings : vec ! [ ] ,
529
+ } ) ;
530
+
531
+ // Dispatch each (value, key) to its bucket.
532
+ for ( value, key) in self {
533
+ let ( prefix, suffix) = key. as_bit_sequence ( ) . split ( prefix_len) ;
534
+ let ref mut bucket = result[ prefix. bits ( ) as usize ] ;
535
+ if suffix. bit_len ( ) > bucket. highest_bit_len {
536
+ bucket. highest_bit_len = suffix. bit_len ( ) ;
537
+ }
538
+ bucket
539
+ . mappings
540
+ . push ( ( value, Key :: from_bit_sequence ( suffix) ) ) ;
541
+ }
542
+
543
+ result
544
+ }
545
+
546
+ pub fn map < F , U > ( self , mut f : F ) -> Codebook < U >
547
+ where
548
+ F : FnMut ( T ) -> U ,
549
+ {
550
+ Codebook {
551
+ highest_bit_len : self . highest_bit_len ,
552
+ mappings : self
553
+ . mappings
554
+ . into_iter ( )
555
+ . map ( |( value, key) | ( f ( value) , key) )
556
+ . collect ( ) ,
557
+ }
558
+ }
437
559
}
0 commit comments