Skip to content
9 changes: 8 additions & 1 deletion onnxruntime_extensions/_hf_cvt.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,8 +54,15 @@ def convert_json_vocab(hf_tokenizer):
# Get the vocab object and the ordered merge rules from the tokenizer JSON;
# a missing "model", "vocab" or "merges" key falls back to an empty container.
vocab = tokenizer_json.get("model", {}).get("vocab", {})
sorted_merges = tokenizer_json.get("model", {}).get("merges", [])

attrs = {"vocab": json.dumps(vocab, separators=(",", ":"))}

# Merges data can be a list of strings ("a b") or a list of token sequences
# (["a", "b"]). Normalize both shapes to one space-separated string per rule.
# A newline is rewritten as "<0x0A>" because the rules are joined with "\n"
# below, so a raw newline inside a rule would split it in two.
# NOTE: the generator must be the argument *of* all(); wrapping all() inside
# the generator yields an always-truthy generator object and the string
# branch would never run.
if all(isinstance(v_, (list, tuple)) for v_ in sorted_merges):
    sorted_merges = [
        " ".join("<0x0A>" if v == "\n" else v for v in v_)
        for v_ in sorted_merges
    ]
else:
    sorted_merges = [v_.replace("\n", "<0x0A>") for v_ in sorted_merges]

attrs["merges"] = "\n".join(sorted_merges)
if hf_tokenizer.added_tokens_encoder:
token_map = [f"{_k}={_v}" for _k,
Expand Down
Loading