diff --git a/onnxruntime_extensions/_hf_cvt.py b/onnxruntime_extensions/_hf_cvt.py
index 4ba6bdc5f..5ac80c96c 100644
--- a/onnxruntime_extensions/_hf_cvt.py
+++ b/onnxruntime_extensions/_hf_cvt.py
@@ -54,8 +54,17 @@ def convert_json_vocab(hf_tokenizer):
         # get vocab object from json file
         vocab = tokenizer_json.get("model", {}).get("vocab", {})
         sorted_merges = tokenizer_json.get("model", {}).get("merges", [])
-        sorted_merges = [v_.replace("\n", "<0x0A>") for v_ in sorted_merges]
+
         attrs = {"vocab": json.dumps(vocab, separators=(",", ":"))}
+
+        # merges can be a list of "a b" strings or a list of [a, b] pairs;
+        # both are normalised to "a b" strings with newlines written as <0x0A>.
+        # all() must wrap the whole generator: a bare generator is always truthy.
+        if all(isinstance(v_, (list, tuple)) for v_ in sorted_merges):
+            sorted_merges = [" ".join(v if v != "\n" else "<0x0A>" for v in v_) for v_ in sorted_merges]
+        else:
+            sorted_merges = [v_.replace("\n", "<0x0A>") for v_ in sorted_merges]
+
         attrs["merges"] = "\n".join(sorted_merges)
         if hf_tokenizer.added_tokens_encoder:
             token_map = [f"{_k}={_v}" for _k,
diff --git a/test/test_autotokenizer.py b/test/test_autotokenizer.py
index fc48edd23..ee5a77ed0 100644
--- a/test/test_autotokenizer.py
+++ b/test/test_autotokenizer.py
@@ -2,6 +2,7 @@
 # Licensed under the MIT License.
 
 import unittest
+import os
 import numpy as np
 from transformers import AutoTokenizer, GPT2Tokenizer
 from onnxruntime_extensions import OrtPyFunction, gen_processing_models, ort_inference, util
@@ -128,6 +129,20 @@ def print_prime(n):
         self.assertEqual(len(ids["input_ids"].shape), len(actual_ids.shape))
         np.testing.assert_array_equal(ids["input_ids"], actual_ids)
 
+    def test_microsoft_phi4(self):
+        """Compare ORT token ids with Hugging Face ids for the phi-4-mini-reasoning fixture."""
+        script_dir = os.path.dirname(os.path.abspath(__file__))
+        tokenizer_dir = os.path.join(script_dir, 'data', "phi-4-mini-reasoning")
+
+        tokenizer = AutoTokenizer.from_pretrained(
+            tokenizer_dir, torch_dtype="auto")
+        code = 'This is a sample Code'
+
+        ids = tokenizer(code, return_tensors="np")
+        ort_tok, _ = gen_processing_models(tokenizer, pre_kwargs={})
+        actual_ids, *_ = ort_inference(ort_tok, [code])
+        self.assertEqual(len(ids["input_ids"].shape), len(actual_ids.shape))
+        np.testing.assert_array_equal(ids["input_ids"], actual_ids)
 
 if __name__ == '__main__':
     unittest.main()