From 9ddc80533c43dcc0a2d3a8701cadc6777ad267c4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cl=C3=A9ment=20Michaud?=
Date: Fri, 26 Sep 2025 15:36:17 +0200
Subject: [PATCH] expose NewRawInputSequence to prevent costly calls to the reflect library

In most cases, users of the API will simply pass a string to the
tokenizer to get a list of tokens to pass to the model. In that case,
there is no need to call the reflect library at all, which might save
significant CPU cycles and consequently improve the throughput of the
services using the tokenizer library.
---
 tokenizer.go | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/tokenizer.go b/tokenizer.go
index d4d4990..16f4cca 100644
--- a/tokenizer.go
+++ b/tokenizer.go
@@ -199,10 +199,7 @@ type InputSequence struct {
 func NewInputSequence(input interface{}) (retVal InputSequence) {
 	switch reflect.TypeOf(input).Kind().String() {
 	case "string":
-		return InputSequence{
-			input:     []string{input.(string)},
-			inputType: RawInput,
-		}
+		return NewRawInputSequence(input.(string))
 	case "slice":
 		if reflect.TypeOf(input).Elem().Name() != "string" {
 			log.Fatalf("Invalid input type: Expected type of 'string' or '[]string', got %v\n", reflect.TypeOf(input).Kind().String())
@@ -218,6 +215,17 @@ func NewInputSequence(input interface{}) (retVal InputSequence) {
 	return
 }
 
+// NewRawInputSequence creates a new InputSequence from a raw string input.
+// Using this method instead of NewInputSequence avoids calling the reflect library
+// to inspect the type of the input when the caller knows exactly what the input is.
+// It is a performance optimization because inspecting the type is not needed in most cases.
+func NewRawInputSequence(input string) InputSequence {
+	return InputSequence{
+		input:     []string{input},
+		inputType: RawInput,
+	}
+}
+
 type Single struct {
 	Sentence InputSequence
 }