use std::time::{Duration, Instant};

use tokenizers::pre_tokenizers::whitespace::{Whitespace, WhitespaceOptimized};
use tokenizers::{OffsetReferential, OffsetType, PreTokenizedString, PreTokenizer};
+ fn main ( ) {
6
+ println ! ( "Whitespace Pre-Tokenizer Optimization Demo" ) ;
7
+ println ! ( "==========================================\n " ) ;
8
+
9
+ // Test cases with different characteristics
10
+ let test_cases = vec ! [
11
+ ( "Simple text" , "Hello world! How are you doing?" ) ,
12
+ ( "Mixed content" , "This is a test with numbers 123 and symbols @#$% and unicode: café résumé" ) ,
13
+ ( "Whitespace heavy" , "Multiple spaces\t and\n newlines\r \n here" ) ,
14
+ ( "Symbol heavy" , "Hello!@#$%^&*()world?><>{}[]|\\ " ) ,
15
+ ( "Word heavy" , "This is a very long sentence with many words that should be tokenized properly" ) ,
16
+ ( "Unicode heavy" , "αβγ δέζ ηθι κλμ νξο πρσ τυφ χψω" ) ,
17
+ ( "Mixed unicode" , "Hello 123 αβγ !@# world δέζ ηθι" ) ,
18
+ ] ;
19
+
20
+ for ( name, text) in test_cases {
21
+ println ! ( "Test case: {}" , name) ;
22
+ println ! ( "Input: '{}'" , text) ;
23
+
24
+ // Test original implementation
25
+ let start = Instant :: now ( ) ;
26
+ let mut original = PreTokenizedString :: from ( text) ;
27
+ let original_pretok = Whitespace { } ;
28
+ original_pretok. pre_tokenize ( & mut original) . unwrap ( ) ;
29
+ let original_duration = start. elapsed ( ) ;
30
+
31
+ let original_splits = original
32
+ . get_splits ( OffsetReferential :: Original , OffsetType :: Byte )
33
+ . into_iter ( )
34
+ . map ( |( s, o, _) | ( s, o) )
35
+ . collect :: < Vec < _ > > ( ) ;
36
+
37
+ // Test optimized implementation
38
+ let start = Instant :: now ( ) ;
39
+ let mut optimized = PreTokenizedString :: from ( text) ;
40
+ let optimized_pretok = WhitespaceOptimized { } ;
41
+ optimized_pretok. pre_tokenize ( & mut optimized) . unwrap ( ) ;
42
+ let optimized_duration = start. elapsed ( ) ;
43
+
44
+ let optimized_splits = optimized
45
+ . get_splits ( OffsetReferential :: Original , OffsetType :: Byte )
46
+ . into_iter ( )
47
+ . map ( |( s, o, _) | ( s, o) )
48
+ . collect :: < Vec < _ > > ( ) ;
49
+
50
+ // Verify compatibility
51
+ let compatible = original_splits == optimized_splits;
52
+
53
+ println ! ( " Original tokens: {:?}" , original_splits) ;
54
+ println ! ( " Optimized tokens: {:?}" , optimized_splits) ;
55
+ println ! ( " Compatible: {}" , compatible) ;
56
+ println ! ( " Original time: {:?}" , original_duration) ;
57
+ println ! ( " Optimized time: {:?}" , optimized_duration) ;
58
+
59
+ if original_duration > optimized_duration {
60
+ let speedup = original_duration. as_nanos ( ) as f64 / optimized_duration. as_nanos ( ) as f64 ;
61
+ println ! ( " Speedup: {:.2}x" , speedup) ;
62
+ } else {
63
+ let slowdown = optimized_duration. as_nanos ( ) as f64 / original_duration. as_nanos ( ) as f64 ;
64
+ println ! ( " Slowdown: {:.2}x" , slowdown) ;
65
+ }
66
+ println ! ( ) ;
67
+ }
68
+
69
+ // Performance test with large text
70
+ println ! ( "Large text performance test:" ) ;
71
+ let base_text = "Hello world! This is a test with numbers 123 and symbols @#$% and unicode: café résumé. " ;
72
+ let large_text: String = base_text. repeat ( 1000 ) ; // ~50KB of text
73
+ println ! ( "Text size: {} bytes" , large_text. len( ) ) ;
74
+
75
+ // Warm up
76
+ for _ in 0 ..10 {
77
+ let mut _warmup = PreTokenizedString :: from ( & large_text) ;
78
+ let _pretok = Whitespace { } ;
79
+ // Don't actually call pre_tokenize to avoid affecting results
80
+ }
81
+
82
+ // Benchmark original
83
+ let iterations = 100 ;
84
+ let start = Instant :: now ( ) ;
85
+ for _ in 0 ..iterations {
86
+ let mut pretokenized = PreTokenizedString :: from ( & large_text) ;
87
+ let pretok = Whitespace { } ;
88
+ pretok. pre_tokenize ( & mut pretokenized) . unwrap ( ) ;
89
+ }
90
+ let original_total = start. elapsed ( ) ;
91
+ let original_avg = original_total / iterations;
92
+
93
+ // Benchmark optimized
94
+ let start = Instant :: now ( ) ;
95
+ for _ in 0 ..iterations {
96
+ let mut pretokenized = PreTokenizedString :: from ( & large_text) ;
97
+ let pretok = WhitespaceOptimized { } ;
98
+ pretok. pre_tokenize ( & mut pretokenized) . unwrap ( ) ;
99
+ }
100
+ let optimized_total = start. elapsed ( ) ;
101
+ let optimized_avg = optimized_total / iterations;
102
+
103
+ println ! ( " Original average: {:?}" , original_avg) ;
104
+ println ! ( " Optimized average: {:?}" , optimized_avg) ;
105
+
106
+ if original_avg > optimized_avg {
107
+ let speedup = original_avg. as_nanos ( ) as f64 / optimized_avg. as_nanos ( ) as f64 ;
108
+ println ! ( " Overall speedup: {:.2}x" , speedup) ;
109
+ } else {
110
+ let slowdown = optimized_avg. as_nanos ( ) as f64 / original_avg. as_nanos ( ) as f64 ;
111
+ println ! ( " Overall slowdown: {:.2}x" , slowdown) ;
112
+ }
113
+
114
+ println ! ( "\n Note: Performance results may vary depending on hardware and system load." ) ;
115
+ println ! ( "For accurate benchmarks, run: cargo bench --bench whitespace_benchmark" ) ;
116
+ }
0 commit comments