
Commit 971be68

feat: whitespace optimizer

1 parent fc7be52

File tree

3 files changed: 437 additions & 0 deletions

Lines changed: 116 additions & 0 deletions
@@ -0,0 +1,116 @@
use std::time::Instant;
use tokenizers::pre_tokenizers::whitespace::{Whitespace, WhitespaceOptimized};
use tokenizers::{OffsetReferential, OffsetType, PreTokenizer, PreTokenizedString};

fn main() {
    println!("Whitespace Pre-Tokenizer Optimization Demo");
    println!("==========================================\n");

    // Test cases with different characteristics
    let test_cases = vec![
        ("Simple text", "Hello world! How are you doing?"),
        ("Mixed content", "This is a test with numbers 123 and symbols @#$% and unicode: café résumé"),
        ("Whitespace heavy", "Multiple spaces\tand\nnewlines\r\nhere"),
        ("Symbol heavy", "Hello!@#$%^&*()world?><>{}[]|\\"),
        ("Word heavy", "This is a very long sentence with many words that should be tokenized properly"),
        ("Unicode heavy", "αβγ δέζ ηθι κλμ νξο πρσ τυφ χψω"),
        ("Mixed unicode", "Hello 123 αβγ !@# world δέζ ηθι"),
    ];

    for (name, text) in test_cases {
        println!("Test case: {}", name);
        println!("Input: '{}'", text);

        // Time the original regex-based implementation
        let start = Instant::now();
        let mut original = PreTokenizedString::from(text);
        let original_pretok = Whitespace {};
        original_pretok.pre_tokenize(&mut original).unwrap();
        let original_duration = start.elapsed();

        let original_splits = original
            .get_splits(OffsetReferential::Original, OffsetType::Byte)
            .into_iter()
            .map(|(s, o, _)| (s, o))
            .collect::<Vec<_>>();

        // Time the optimized implementation
        let start = Instant::now();
        let mut optimized = PreTokenizedString::from(text);
        let optimized_pretok = WhitespaceOptimized {};
        optimized_pretok.pre_tokenize(&mut optimized).unwrap();
        let optimized_duration = start.elapsed();

        let optimized_splits = optimized
            .get_splits(OffsetReferential::Original, OffsetType::Byte)
            .into_iter()
            .map(|(s, o, _)| (s, o))
            .collect::<Vec<_>>();

        // Verify that both implementations produce identical splits
        let compatible = original_splits == optimized_splits;

        println!(" Original tokens: {:?}", original_splits);
        println!(" Optimized tokens: {:?}", optimized_splits);
        println!(" Compatible: {}", compatible);
        println!(" Original time: {:?}", original_duration);
        println!(" Optimized time: {:?}", optimized_duration);

        if original_duration > optimized_duration {
            let speedup = original_duration.as_nanos() as f64 / optimized_duration.as_nanos() as f64;
            println!(" Speedup: {:.2}x", speedup);
        } else {
            let slowdown = optimized_duration.as_nanos() as f64 / original_duration.as_nanos() as f64;
            println!(" Slowdown: {:.2}x", slowdown);
        }
        println!();
    }

    // Performance test with large text
    println!("Large text performance test:");
    let base_text = "Hello world! This is a test with numbers 123 and symbols @#$% and unicode: café résumé. ";
    let large_text: String = base_text.repeat(1000); // ~90KB of text
    println!("Text size: {} bytes", large_text.len());

    // Warm up allocation paths; pre_tokenize is deliberately not called
    // here so the warm-up does not affect the timed sections below
    for _ in 0..10 {
        let _warmup = PreTokenizedString::from(large_text.as_str());
        let _pretok = Whitespace {};
    }

    // Benchmark original
    let iterations = 100;
    let start = Instant::now();
    for _ in 0..iterations {
        let mut pretokenized = PreTokenizedString::from(large_text.as_str());
        let pretok = Whitespace {};
        pretok.pre_tokenize(&mut pretokenized).unwrap();
    }
    let original_total = start.elapsed();
    let original_avg = original_total / iterations;

    // Benchmark optimized
    let start = Instant::now();
    for _ in 0..iterations {
        let mut pretokenized = PreTokenizedString::from(large_text.as_str());
        let pretok = WhitespaceOptimized {};
        pretok.pre_tokenize(&mut pretokenized).unwrap();
    }
    let optimized_total = start.elapsed();
    let optimized_avg = optimized_total / iterations;

    println!(" Original average: {:?}", original_avg);
    println!(" Optimized average: {:?}", optimized_avg);

    if original_avg > optimized_avg {
        let speedup = original_avg.as_nanos() as f64 / optimized_avg.as_nanos() as f64;
        println!(" Overall speedup: {:.2}x", speedup);
    } else {
        let slowdown = optimized_avg.as_nanos() as f64 / original_avg.as_nanos() as f64;
        println!(" Overall slowdown: {:.2}x", slowdown);
    }

    println!("\nNote: Performance results may vary depending on hardware and system load.");
    println!("For accurate benchmarks, run: cargo bench --bench whitespace_benchmark");
}
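
The WhitespaceOptimized implementation itself is the third file in this commit (roughly 218 of the 437 added lines) and is not reproduced in this diff. For orientation only, here is a minimal standalone sketch of the byte-level idea the benchmark comments describe: a single pass over char classes that splits with the same semantics as the `\w+|[^\w\s]+` regex used by Whitespace. All names below are hypothetical; this is not the code from the commit.

    // Hypothetical sketch -- not the WhitespaceOptimized shipped in this commit.
    // One pass over char classes with the same split semantics as the regex
    // `\w+|[^\w\s]+`: word runs and symbol runs are kept (with byte offsets),
    // whitespace runs are dropped.
    fn split_whitespace_like(text: &str) -> Vec<(&str, (usize, usize))> {
        #[derive(PartialEq, Clone, Copy)]
        enum Kind {
            Word,
            Symbol,
            Space,
        }

        // Approximation of the regex classes: `\w` ~ alphanumeric or `_`,
        // `\s` ~ Unicode whitespace, everything else is a symbol.
        fn kind(c: char) -> Kind {
            if c.is_whitespace() {
                Kind::Space
            } else if c.is_alphanumeric() || c == '_' {
                Kind::Word
            } else {
                Kind::Symbol
            }
        }

        let mut out = Vec::new();
        let mut run: Option<(usize, Kind)> = None;
        for (i, c) in text.char_indices() {
            let k = kind(c);
            match run {
                // Same class as the current run: keep extending it.
                Some((_, rk)) if rk == k => {}
                // Class changed: emit the finished run (unless whitespace).
                Some((s, rk)) => {
                    if rk != Kind::Space {
                        out.push((&text[s..i], (s, i)));
                    }
                    run = Some((i, k));
                }
                None => run = Some((i, k)),
            }
        }
        // Flush the trailing run.
        if let Some((s, rk)) = run {
            if rk != Kind::Space {
                out.push((&text[s..], (s, text.len())));
            }
        }
        out
    }

The demo's `compatible` check above is exactly this contract: whatever the optimized splitter does internally, it must reproduce the regex output, offsets included.
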
Lines changed: 103 additions & 0 deletions
@@ -0,0 +1,103 @@
#[macro_use]
extern crate criterion;

use criterion::{Criterion, Throughput};
use tokenizers::pre_tokenizers::whitespace::{Whitespace, WhitespaceOptimized};
use tokenizers::{OffsetReferential, OffsetType, PreTokenizer, PreTokenizedString};

fn bench_whitespace_comparison(c: &mut Criterion) {
    let mut group = c.benchmark_group("whitespace-pre-tokenizers");

    // Test data with various characteristics
    let test_cases = vec![
        ("simple", "Hello world! How are you doing?"),
        ("mixed", "This is a test with numbers 123 and symbols @#$% and unicode: café résumé"),
        ("whitespace_heavy", "Multiple spaces\tand\nnewlines\r\nhere"),
        ("symbol_heavy", "Hello!@#$%^&*()world?><>{}[]|\\"),
        ("word_heavy", "This is a very long sentence with many words that should be tokenized properly"),
        ("unicode_heavy", "αβγ δέζ ηθι κλμ νξο πρσ τυφ χψω"),
        ("mixed_unicode", "Hello 123 αβγ !@# world δέζ ηθι"),
    ];

    for (name, text) in test_cases {
        let data_len = text.len() as u64;
        group.throughput(Throughput::Bytes(data_len));

        // Benchmark the original regex-based implementation
        group.bench_function(&format!("{}-original", name), |b| {
            b.iter(|| {
                let mut pretokenized = PreTokenizedString::from(text);
                let pretok = Whitespace {};
                pretok.pre_tokenize(&mut pretokenized).unwrap();
                let _result = pretokenized
                    .get_splits(OffsetReferential::Original, OffsetType::Byte)
                    .into_iter()
                    .map(|(s, o, _)| (s, o))
                    .collect::<Vec<_>>();
            })
        });

        // Benchmark the optimized byte-level implementation
        group.bench_function(&format!("{}-optimized", name), |b| {
            b.iter(|| {
                let mut pretokenized = PreTokenizedString::from(text);
                let pretok = WhitespaceOptimized {};
                pretok.pre_tokenize(&mut pretokenized).unwrap();
                let _result = pretokenized
                    .get_splits(OffsetReferential::Original, OffsetType::Byte)
                    .into_iter()
                    .map(|(s, o, _)| (s, o))
                    .collect::<Vec<_>>();
            })
        });
    }

    group.finish();
}

fn bench_large_text(c: &mut Criterion) {
    let mut group = c.benchmark_group("whitespace-large-text");

    // Create a large text by repeating patterns
    let base_text = "Hello world! This is a test with numbers 123 and symbols @#$% and unicode: café résumé. ";
    let large_text: String = base_text.repeat(1000); // ~90KB of text
    let data_len = large_text.len() as u64;

    group.throughput(Throughput::Bytes(data_len));

    group.bench_function("large-original", |b| {
        b.iter(|| {
            let mut pretokenized = PreTokenizedString::from(large_text.as_str());
            let pretok = Whitespace {};
            pretok.pre_tokenize(&mut pretokenized).unwrap();
            let _result = pretokenized
                .get_splits(OffsetReferential::Original, OffsetType::Byte)
                .into_iter()
                .map(|(s, o, _)| (s, o))
                .collect::<Vec<_>>();
        })
    });

    group.bench_function("large-optimized", |b| {
        b.iter(|| {
            let mut pretokenized = PreTokenizedString::from(large_text.as_str());
            let pretok = WhitespaceOptimized {};
            pretok.pre_tokenize(&mut pretokenized).unwrap();
            let _result = pretokenized
                .get_splits(OffsetReferential::Original, OffsetType::Byte)
                .into_iter()
                .map(|(s, o, _)| (s, o))
                .collect::<Vec<_>>();
        })
    });

    group.finish();
}

criterion_group! {
    name = whitespace_benches;
    config = Criterion::default().sample_size(20);
    targets = bench_whitespace_comparison, bench_large_text
}

criterion_main!(whitespace_benches);
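
Criterion benchmarks bypass Rust's built-in test harness, so for the `cargo bench --bench whitespace_benchmark` command referenced in the demo to work, the crate's Cargo.toml needs a bench target with the default harness disabled. A minimal sketch, assuming this file lands at benches/whitespace_benchmark.rs (the path is implied by the command above, not shown in this diff):

    [[bench]]
    name = "whitespace_benchmark"
    harness = false
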
