Skip to content

Commit 9d85128

Browse files
committed
Context 0.1: MultiLookupHuffmanTable
1 parent 7e8d52f commit 9d85128

File tree

2 files changed

+441
-68
lines changed

2 files changed

+441
-68
lines changed

crates/binjs_io/src/context/huffman/mod.rs

Lines changed: 158 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -82,20 +82,23 @@ impl BitSequence {
8282
pub fn new(bits: u32, bit_len: BitLen) -> Self {
8383
Self { bits, bit_len }
8484
}
85+
8586
pub fn bits(&self) -> u32 {
8687
self.bits
8788
}
89+
8890
/// The number of bits of `bits` to use.
8991
pub fn bit_len(&self) -> BitLen {
9092
self.bit_len
9193
}
94+
9295
/// Split the bits into a prefix of `bit_len` bits and a suffix of `self.bit_len - bit_len`
9396
/// bits.
9497
///
9598
/// # Failure
9699
///
97100
/// This function panics if `bit_len > self.bit_len`.
98-
pub fn split(&self, bit_len: BitLen) -> (u32, u32) {
101+
pub fn split_bits(&self, bit_len: BitLen) -> (u32, u32) {
99102
let shift = self.bit_len - bit_len;
100103
match shift.into() {
101104
0u8 => (self.bits, 0), // Special case: cannot >> 32
@@ -106,6 +109,25 @@ impl BitSequence {
106109
),
107110
}
108111
}
112+
113+
/// Split the bits into a prefix of `bit_len` bits and a suffix of `self.bit_len - bit_len`
114+
/// bits.
115+
///
116+
/// # Failure
117+
///
118+
/// This function panics if `bit_len > self.bit_len`.
119+
pub fn split(&self, bit_len: BitLen) -> (BitSequence, BitSequence) {
120+
let (prefix, suffix) = self.split_bits(bit_len);
121+
(
122+
BitSequence::new(prefix, bit_len),
123+
BitSequence::new(suffix, self.bit_len - bit_len),
124+
)
125+
}
126+
127+
/// Add lowest-weight bits to this bit sequence until it reaches
128+
/// a sufficient bit length.
129+
///
130+
/// Does nothing if the bit sequence already has a sufficient bitlength.
109131
pub fn pad_lowest_to(&self, total_bit_len: BitLen) -> Cow<BitSequence> {
110132
assert!(total_bit_len.0 <= 32u8);
111133
if total_bit_len <= self.bit_len {
@@ -117,21 +139,29 @@ impl BitSequence {
117139
}
118140
Cow::Owned(BitSequence::new(self.bits << shift, total_bit_len))
119141
}
142+
143+
/// Prepend a sequence of bits to this sequence.
144+
pub fn with_prefix(&self, prefix: &BitSequence) -> Self {
145+
assert!((prefix.bit_len() + self.bit_len()).as_u8() <= 32);
146+
let bits = self.bits | (prefix.bits() << self.bit_len);
147+
let bit_len = self.bit_len + prefix.bit_len;
148+
BitSequence::new(bits, bit_len)
149+
}
120150
}
121151

122152
#[test]
123153
fn test_bit_sequence_split() {
124154
let bits = 0b11111111_11111111_00000000_00000000;
125155
let key = BitSequence::new(bits, BitLen(32));
126-
assert_eq!(key.split(BitLen(0)), (0, bits));
127-
assert_eq!(key.split(BitLen(32)), (bits, 0));
128-
assert_eq!(key.split(BitLen(16)), (0b11111111_11111111, 0));
156+
assert_eq!(key.split_bits(BitLen(0)), (0, bits));
157+
assert_eq!(key.split_bits(BitLen(32)), (bits, 0));
158+
assert_eq!(key.split_bits(BitLen(16)), (0b11111111_11111111, 0));
129159

130160
let bits = 0b00000000_00000000_00000000_11111111;
131161
let key = BitSequence::new(bits, BitLen(16));
132-
assert_eq!(key.split(BitLen(0)), (0, bits));
133-
assert_eq!(key.split(BitLen(16)), (bits, 0));
134-
assert_eq!(key.split(BitLen(8)), (0, 0b11111111));
162+
assert_eq!(key.split_bits(BitLen(0)), (0, bits));
163+
assert_eq!(key.split_bits(BitLen(16)), (bits, 0));
164+
assert_eq!(key.split_bits(BitLen(8)), (0, 0b11111111));
135165
}
136166

137167
/// A Huffman key
@@ -159,6 +189,10 @@ impl Key {
159189
Key(BitSequence { bits, bit_len })
160190
}
161191

192+
pub fn from_bit_sequence(sequence: BitSequence) -> Self {
193+
Self::new(sequence.bits, sequence.bit_len)
194+
}
195+
162196
/// The bits in this Key.
163197
///
164198
/// # Invariant
@@ -176,6 +210,11 @@ impl Key {
176210
pub fn as_bit_sequence(&self) -> &BitSequence {
177211
&self.0
178212
}
213+
214+
pub fn with_prefix(&self, prefix: &BitSequence) -> Self {
215+
let sequence = self.0.with_prefix(prefix);
216+
Key::from_bit_sequence(sequence)
217+
}
179218
}
180219

181220
/// A node in the Huffman tree.
@@ -219,43 +258,46 @@ impl<T> PartialEq for Node<T> {
219258
}
220259
impl<T> Eq for Node<T> {}
221260

222-
/// Keys associated to a sequence of values.
261+
/// Codebook associated to a sequence of values.
223262
#[derive(Clone, Debug)]
224-
pub struct Keys<T> {
225-
/// The longest bit length that actually appears in `keys`.
263+
pub struct Codebook<T> {
264+
/// The longest bit length that actually appears in `mappings`.
226265
highest_bit_len: BitLen,
227266

228267
/// The sequence of (value, key) mappings.
229268
///
230269
/// Order is meaningful.
231-
keys: Vec<(T, Key)>,
270+
mappings: Vec<(T, Key)>,
232271
}
233272

234-
impl<T> Keys<T> {
273+
impl<T> Codebook<T> {
274+
/// The number of elements in this Codebook.
235275
pub fn len(&self) -> usize {
236-
self.keys.len()
276+
self.mappings.len()
237277
}
278+
279+
/// The longest bit length that actually appears in this Codebook.
238280
pub fn highest_bit_len(&self) -> BitLen {
239281
self.highest_bit_len
240282
}
241283
}
242284

243-
impl<T> IntoIterator for Keys<T> {
285+
impl<T> IntoIterator for Codebook<T> {
244286
type Item = (T, Key);
245287
type IntoIter = std::vec::IntoIter<(T, Key)>;
246288
fn into_iter(self) -> Self::IntoIter {
247-
self.keys.into_iter()
289+
self.mappings.into_iter()
248290
}
249291
}
250292

251-
impl<T> Keys<T>
293+
impl<T> Codebook<T>
252294
where
253295
T: Ord + Clone,
254296
{
255-
/// Compute a `Keys` from a sequence of values.
297+
/// Compute a `Codebook` from a sequence of values.
256298
///
257299
/// Optionally, `max_bit_len` may specify a largest acceptable bit length.
258-
/// If `Keys` may not be computed without exceeding this bit length,
300+
/// If the `Codebook` may not be computed without exceeding this bit length,
259301
/// fail with `Err(problematic_bit_len)`.
260302
///
261303
/// The current implementation only attempts to produce the best compression
@@ -278,11 +320,11 @@ where
278320
let counter = map.entry(item).or_insert(0.into());
279321
*counter += 1.into();
280322
}
281-
// Then compute the `Keys`.
323+
// Then compute the `Codebook`.
282324
Self::from_instances(map, max_bit_len)
283325
}
284326

285-
/// Compute a `Keys` from a sequence of values
327+
/// Compute a `Codebook` from a sequence of values
286328
/// with a number of instances already attached.
287329
///
288330
/// The current implementation only attempts to produce the best compression
@@ -305,27 +347,27 @@ where
305347

306348
// The bits associated to the next value.
307349
let mut bits = 0;
308-
let mut keys = Vec::with_capacity(bit_lengths.len());
350+
let mut mappings = Vec::with_capacity(bit_lengths.len());
309351

310352
for i in 0..bit_lengths.len() - 1 {
311353
let (bit_len, symbol, next_bit_len) = (
312354
bit_lengths[i].1,
313355
bit_lengths[i].0.clone(),
314356
bit_lengths[i + 1].1,
315357
);
316-
keys.push((symbol.clone(), Key::new(bits, bit_len)));
358+
mappings.push((symbol.clone(), Key::new(bits, bit_len)));
317359
bits = (bits + 1) << (next_bit_len - bit_len);
318360
if bit_len > highest_bit_len {
319361
highest_bit_len = bit_len;
320362
}
321363
}
322364
// Handle the last element.
323365
let (ref symbol, bit_len) = bit_lengths[bit_lengths.len() - 1];
324-
keys.push((symbol.clone(), Key::new(bits, bit_len)));
366+
mappings.push((symbol.clone(), Key::new(bits, bit_len)));
325367

326368
return Ok(Self {
327369
highest_bit_len,
328-
keys,
370+
mappings,
329371
});
330372
}
331373

@@ -412,26 +454,106 @@ where
412454
#[test]
413455
fn test_coded_from_sequence() {
414456
let sample = "appl";
415-
let coded = Keys::from_sequence(sample.chars(), std::u8::MAX).unwrap();
457+
let coded = Codebook::from_sequence(sample.chars(), std::u8::MAX).unwrap();
416458

417459
// Symbol 'p' appears twice, we should see 3 codes.
418-
assert_eq!(coded.keys.len(), 3);
460+
assert_eq!(coded.mappings.len(), 3);
419461

420462
// Check order of symbols.
421-
assert_eq!(coded.keys[0].0, 'p');
422-
assert_eq!(coded.keys[1].0, 'a');
423-
assert_eq!(coded.keys[2].0, 'l');
463+
assert_eq!(coded.mappings[0].0, 'p');
464+
assert_eq!(coded.mappings[1].0, 'a');
465+
assert_eq!(coded.mappings[2].0, 'l');
424466

425467
// Check bit length of symbols.
426-
assert_eq!(coded.keys[0].1.bit_len(), 1.into());
427-
assert_eq!(coded.keys[1].1.bit_len(), 2.into());
428-
assert_eq!(coded.keys[2].1.bit_len(), 2.into());
468+
assert_eq!(coded.mappings[0].1.bit_len(), 1.into());
469+
assert_eq!(coded.mappings[1].1.bit_len(), 2.into());
470+
assert_eq!(coded.mappings[2].1.bit_len(), 2.into());
429471

430472
// Check code of symbols.
431-
assert_eq!(coded.keys[0].1.bits(), 0b00);
432-
assert_eq!(coded.keys[1].1.bits(), 0b10);
433-
assert_eq!(coded.keys[2].1.bits(), 0b11);
473+
assert_eq!(coded.mappings[0].1.bits(), 0b00);
474+
assert_eq!(coded.mappings[1].1.bits(), 0b10);
475+
assert_eq!(coded.mappings[2].1.bits(), 0b11);
434476

435477
// Let's try again with a limit to 1 bit paths.
436-
assert_eq!(Keys::from_sequence(sample.chars(), 1).unwrap_err(), 2);
478+
assert_eq!(Codebook::from_sequence(sample.chars(), 1).unwrap_err(), 2);
479+
}
480+
481+
impl<T> Codebook<T> {
482+
/// Return the mappings of a Codebook.
483+
pub fn mappings(self) -> Vec<(T, Key)> {
484+
self.mappings
485+
}
486+
487+
/// Split a Codebook into several Codebooks grouped by a common prefix.
488+
///
489+
/// For instance, if `prefix_len` is 2, the result will be a vector of size 2^2
490+
/// containing:
491+
///
492+
/// - at index 0 (= 0b00), all the keys starting with 0b00, minus the prefix 0b00;
493+
/// - at index 1 (= 0b01), all the keys starting with 0b01, minus the prefix 0b01;
494+
/// - at index 2 (= 0b10), all the keys starting with 0b10, minus the prefix 0b10;
495+
/// - at index 3 (= 0b11), all the keys starting with 0b11, minus the prefix 0b11.
496+
///
497+
/// ```
498+
/// let sample = "appl";
499+
/// let coded = Codebook::from_sequence(sample.chars(), std::u8::MAX).unwrap();
500+
/// // 0b0 => p
501+
/// // 0b10 => a
502+
/// // 0b11 => l
503+
///
504+
/// let buckets = coded.bucket_by_prefix(BitLen(1));
505+
/// assert_eq!(buckets.len(), 2);
506+
///
507+
/// // `buckets[0]` contains keys that start with `0`.
508+
/// assert_eq!(buckets[0].len(), 1);
509+
/// let bucket: Vec<_> = buckets[0].iter().collect();
510+
/// assert_eq!(bucket[0].0, 'p');
511+
/// assert_eq!(bucket[0].1, Key::new(0, BitLen(0))); // Key was 0b0, now empty.
512+
///
513+
/// // `buckets[1]` contains keys that start with `1`.
514+
/// assert_eq!(buckets[1].len(), 2);
515+
/// let bucket: Vec<_> = buckets[1].iter().sorted_by_key(|(c, _)| c);
516+
/// assert_eq!(bucket[0].0, 'a');
517+
/// assert_eq!(bucket[0].1, Key::new(0, BitLen(1))); // Key was 0b10, now 0b0
518+
/// assert_eq!(bucket[1].0, 'l');
519+
/// assert_eq!(bucket[1].1, Key::new(1, BitLen(1))); // Key was 0b11, now 0b1
520+
/// ```
521+
pub fn bucket_by_prefix(self, prefix_len: BitLen) -> Vec<Codebook<T>> {
522+
assert!(prefix_len < self.highest_bit_len);
523+
524+
// Prepare empty buckets.
525+
let mut result = Vec::with_capacity(1usize << prefix_len);
526+
result.resize_with(1usize << prefix_len, || Codebook {
527+
highest_bit_len: 0.into(),
528+
mappings: vec![],
529+
});
530+
531+
// Dispatch each (value, key) to its bucket.
532+
for (value, key) in self {
533+
let (prefix, suffix) = key.as_bit_sequence().split(prefix_len);
534+
let ref mut bucket = result[prefix.bits() as usize];
535+
if suffix.bit_len() > bucket.highest_bit_len {
536+
bucket.highest_bit_len = suffix.bit_len();
537+
}
538+
bucket
539+
.mappings
540+
.push((value, Key::from_bit_sequence(suffix)));
541+
}
542+
543+
result
544+
}
545+
546+
pub fn map<F, U>(self, mut f: F) -> Codebook<U>
547+
where
548+
F: FnMut(T) -> U,
549+
{
550+
Codebook {
551+
highest_bit_len: self.highest_bit_len,
552+
mappings: self
553+
.mappings
554+
.into_iter()
555+
.map(|(value, key)| (f(value), key))
556+
.collect(),
557+
}
558+
}
437559
}

0 commit comments

Comments
 (0)