Skip to content

Commit f339867

Browse files
committed
Fixing the public API of the Rust trainer.
1 parent ee2c570 commit f339867

File tree

5 files changed

+15
-8
lines changed

5 files changed

+15
-8
lines changed

bindings/node/Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
 authors = ["Nicolas Patry <[email protected]>"]
 edition = "2021"
 name = "node"
-version = "0.21.3-dev.0"
+version = "0.21.4-dev.0"

 # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

bindings/python/Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
 [package]
 name = "tokenizers-python"
-version = "0.21.3-dev.0"
+version = "0.21.4-dev.0"
 authors = ["Anthony MOI <[email protected]>"]
 edition = "2021"

tokenizers/Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
 authors = ["Anthony MOI <[email protected]>", "Nicolas Patry <[email protected]>"]
 edition = "2018"
 name = "tokenizers"
-version = "0.21.3-dev.0"
+version = "0.21.4-dev.0"
 homepage = "https://github.com/huggingface/tokenizers"
 repository = "https://github.com/huggingface/tokenizers"
 documentation = "https://docs.rs/tokenizers/"

tokenizers/src/models/bpe/trainer.rs

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ use compact_str::CompactString;
 use dary_heap::OctonaryHeap;
 use serde::{Deserialize, Serialize};
 use std::cmp::Ordering;
+use std::collections::HashSet;

 #[derive(Debug, Eq)]
 struct Merge {
@@ -116,8 +117,10 @@ impl BpeTrainerBuilder {

     /// Set the initial alphabet
     #[must_use]
-    pub fn initial_alphabet(mut self, alphabet: AHashSet<char>) -> Self {
-        self.config.initial_alphabet = alphabet;
+    pub fn initial_alphabet(mut self, alphabet: HashSet<char>) -> Self {
+        let mut initial_alphabet = AHashSet::with_capacity(alphabet.len());
+        initial_alphabet.extend(alphabet);
+        self.config.initial_alphabet = initial_alphabet;
         self
     }

tokenizers/src/models/wordpiece/trainer.rs

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
+use std::collections::HashSet;
+
 use super::WordPiece;
 use crate::models::bpe::{BpeTrainer, BpeTrainerBuilder, BPE};
 use crate::tokenizer::{AddedToken, Result, Trainer};
@@ -61,7 +63,7 @@ impl WordPieceTrainerBuilder {

     /// Set the initial alphabet
     #[must_use]
-    pub fn initial_alphabet(mut self, alphabet: AHashSet<char>) -> Self {
+    pub fn initial_alphabet(mut self, alphabet: HashSet<char>) -> Self {
         self.bpe_trainer_builder = self.bpe_trainer_builder.initial_alphabet(alphabet);
         self
     }
@@ -138,8 +140,10 @@ impl WordPieceTrainer {
         &self.bpe_trainer.initial_alphabet
     }

-    pub fn set_initial_alphabet(&mut self, alphabet: AHashSet<char>) {
-        self.bpe_trainer.initial_alphabet = alphabet;
+    pub fn set_initial_alphabet(&mut self, alphabet: HashSet<char>) {
+        let mut initial_alphabet = AHashSet::with_capacity(alphabet.len());
+        initial_alphabet.extend(alphabet);
+        self.bpe_trainer.initial_alphabet = initial_alphabet;
     }

     pub fn continuing_subword_prefix(&self) -> &Option<String> {

0 commit comments

Comments
 (0)