[feature] lazyimport-and-tokenizer #2462
base: develop
Changes from all commits: b59a7ac, d39b505, 55f5f0f, 2d036cf, 717f219, eb0f4e6, 2fb77e4, 5d58f21, ea633e9, 80d8c46
File 1:
@@ -37,12 +37,6 @@
         "AddedToken",
         "normalize_chars",
         "tokenize_special_chars,convert_to_unicode,",
-        "PreTrainedTokenizer",

Review comment: Why were these removed?

     ],
-    "tokenizer_utils_base": [
-        "PaddingStrategy",
-        "TextInput",
-        "TensorType",
-    ],
     "attention_utils": ["create_bigbird_rand_mask_idx_list"],
     "tensor_parallel_utils": [],
@@ -88,6 +82,11 @@
         "AutoDiscriminator",
         "AutoModelForConditionalGeneration",
     ],
+    "tokenizer_utils_base": [
+        "PaddingStrategy",
+        "TextInput",
+        "TensorType",
+    ],
     "auto.processing": ["AutoProcessor"],
     "auto.tokenizer": ["AutoTokenizer"],
     "deepseek_v2.configuration": ["DeepseekV2Config"],
@@ -320,6 +319,8 @@
         "Qwen3MoePretrainingCriterion",
     ],
     "qwen3_moe.modeling_pp": ["Qwen3MoeForCausalLMPipe"],
+    "ernie4_5vl.tokenizer": ["Ernie4_5_VLTokenizer"],
+    "ernie4_5vl": [],
     "bert": [],
     "llama": [],
     "qwen2": [],
@@ -346,6 +347,7 @@
         tokenize_special_chars,
         convert_to_unicode,
     )
+    from .tokenizer_utils_fast import PretrainedTokenizerFast
     from .processing_utils import ProcessorMixin
     from .feature_extraction_utils import BatchFeature, FeatureExtractionMixin
     from .image_processing_utils import ImageProcessingMixin
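For context on why entries move between these `import_structure` tables: names listed there are not imported at package load time; they are resolved on first attribute access. The sketch below is a minimal, simplified stand-in for the `_LazyModule` pattern (the real `paddleformers.utils.lazy_import._LazyModule` is assumed to carry more machinery, e.g. `TYPE_CHECKING` support and error handling):

```python
# Minimal sketch of a lazy-import module (simplified stand-in, not the real
# paddleformers.utils.lazy_import._LazyModule).
import importlib
import types


class LazyModule(types.ModuleType):
    def __init__(self, name, import_structure):
        super().__init__(name)
        # Map each exported symbol to the submodule that defines it.
        self._symbol_to_module = {
            symbol: submodule
            for submodule, symbols in import_structure.items()
            for symbol in symbols
        }

    def __getattr__(self, name):
        # Called only when normal lookup fails: import the owning submodule
        # on first access, then cache the resolved symbol on the module.
        if name in self._symbol_to_module:
            module = importlib.import_module(
                f"{self.__name__}.{self._symbol_to_module[name]}"
            )
            value = getattr(module, name)
            setattr(self, name, value)  # subsequent lookups skip __getattr__
            return value
        raise AttributeError(f"module {self.__name__!r} has no attribute {name!r}")
```

Under this scheme, moving `tokenizer_utils_base` to a different table only changes which registry owns the names; user-facing imports stay the same.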
File 2:
@@ -13,6 +13,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import json
+
+# import logging
 import os
 import warnings
 from typing import Dict, Optional, Union
@@ -140,13 +142,36 @@ def get_paddleformers_tokenizer_config(
     return result


-class AutoTokenizer(hf.AutoTokenizer):
+def _bind_paddle_mixin_if_available(tokenizer_class):

Review comment: I don't quite understand what this is for?

     """
-    Adapted from transformers.AutoTokenizer.from_pretrained with modifications:
-    1. Added get_paddleformers_tokenizer_config() to extend tokenizer_config.json download source
-    2. Explicitly binds PaddleTokenizerMixin to the tokenizer class before final instantiation
+    Bind the PaddleTokenizerMixin if Paddle is available; otherwise, return the original class.

-    Note: This extends HuggingFace's standard tokenizer loading logic with PaddlePaddle integration.
+    Args:
+        tokenizer_class: The original tokenizer class.
+
+    Returns:
+        The tokenizer class bound with PaddleTokenizerMixin, or the original class.
     """
+    try:
+        return type(tokenizer_class.__name__, (PaddleTokenizerMixin, tokenizer_class), {})
+    except:
+        return tokenizer_class
+
+
+class AutoTokenizer(hf.AutoTokenizer):
+    """
+    Smart AutoTokenizer that automatically adapts based on available dependencies:
+
+    1. **Multi-source support**: Supports HuggingFace, PaddleFormers, and other download sources
+    2. **Conditional Paddle integration**: Automatically detects PaddlePaddle availability
+    3. **Fallback compatibility**: Works seamlessly with or without Paddle dependencies
+    4. **Enhanced functionality**: Extends HuggingFace's standard tokenizer loading logic
+
+    Features:
+    - Automatically binds PaddleTokenizerMixin when PaddlePaddle is available
+    - Falls back to pure Transformers mode when PaddlePaddle is not available
+    - Maintains full compatibility with all HuggingFace tokenizers
+    - Supports custom download sources through environment variables
+    """

     @classmethod
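To answer the reviewer's question in spirit: the three-argument `type()` call manufactures a new subclass on the fly whose MRO places `PaddleTokenizerMixin` ahead of the original tokenizer class, so mixin methods override same-named base methods. A toy illustration of the mechanism (hypothetical classes, not the real mixin):

```python
# Toy classes illustrating the dynamic mixin binding; names are hypothetical.
class PaddleTokenizerMixin:
    def save_pretrained(self, path):
        # Paddle-specific behavior layered over the base tokenizer.
        print(f"paddle-aware save to {path}")


class BaseTokenizer:
    def save_pretrained(self, path):
        print(f"plain save to {path}")


# Equivalent to: class BaseTokenizer(PaddleTokenizerMixin, BaseTokenizer): ...
Bound = type(BaseTokenizer.__name__, (PaddleTokenizerMixin, BaseTokenizer), {})

Bound().save_pretrained("/tmp/tok")  # prints "paddle-aware save ..." (mixin wins by MRO)
print([cls.__name__ for cls in Bound.__mro__])
# ['BaseTokenizer', 'PaddleTokenizerMixin', 'BaseTokenizer', 'object']
```

Reusing the original `__name__` keeps error messages and repr output looking like the unmodified class, while the broad `except` in the helper falls back to the original class when the mixin cannot be bound.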
@@ -201,7 +226,9 @@ def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs):

         if tokenizer_class is None:
             raise ValueError(f"Tokenizer class {tokenizer_class_name} is not currently imported.")
-        tokenizer_class = type(tokenizer_class.__name__, (PaddleTokenizerMixin, tokenizer_class), {})
+
+        # Bind PaddleTokenizerMixin
+        tokenizer_class = _bind_paddle_mixin_if_available(tokenizer_class)

Review comment: ?

         return tokenizer_class.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)

     # Next, let's try to use the tokenizer_config file to get the tokenizer class.
@@ -268,6 +295,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs):
                 or tokenizer_class_from_name(config_tokenizer_class + "Fast") is not None
             )
         )
+
         if has_remote_code:
             if use_fast and tokenizer_auto_map[1] is not None:
                 class_ref = tokenizer_auto_map[1]
@@ -285,11 +313,14 @@ def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs):
             tokenizer_class = get_class_from_dynamic_module(class_ref, pretrained_model_name_or_path, **kwargs)
             _ = kwargs.pop("code_revision", None)
             tokenizer_class.register_for_auto_class()
-            tokenizer_class = type(tokenizer_class.__name__, (PaddleTokenizerMixin, tokenizer_class), {})
+
+            # Bind PaddleTokenizerMixin
+            tokenizer_class = _bind_paddle_mixin_if_available(tokenizer_class)

Review comment: Same as above?

             return tokenizer_class.from_pretrained(
                 pretrained_model_name_or_path, *inputs, trust_remote_code=trust_remote_code, **kwargs
             )
         elif config_tokenizer_class is not None:
+
             tokenizer_class = None
             if use_fast and not config_tokenizer_class.endswith("Fast"):
                 tokenizer_class_candidate = f"{config_tokenizer_class}Fast"
@@ -301,7 +332,9 @@ def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs):
                 raise ValueError(
                     f"Tokenizer class {tokenizer_class_candidate} does not exist or is not currently imported."
                 )
-            tokenizer_class = type(tokenizer_class.__name__, (PaddleTokenizerMixin, tokenizer_class), {})
+
+            # Bind PaddleTokenizerMixin
+            tokenizer_class = _bind_paddle_mixin_if_available(tokenizer_class)
             return tokenizer_class.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)

         # Otherwise we have to be creative.
@@ -321,15 +354,13 @@ def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs):
         tokenizer_class_py, tokenizer_class_fast = TOKENIZER_MAPPING[type(config)]

         if tokenizer_class_fast and (use_fast or tokenizer_class_py is None):
-            tokenizer_class_fast = type(
-                tokenizer_class_fast.__name__, (PaddleTokenizerMixin, tokenizer_class_fast), {}
-            )
+            # Bind PaddleTokenizerMixin
+            tokenizer_class_fast = _bind_paddle_mixin_if_available(tokenizer_class_fast)
             return tokenizer_class_fast.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
         else:
             if tokenizer_class_py is not None:
-                tokenizer_class_py = type(
-                    tokenizer_class_py.__name__, (PaddleTokenizerMixin, tokenizer_class_py), {}
-                )
+                # Bind PaddleTokenizerMixin
+                tokenizer_class_py = _bind_paddle_mixin_if_available(tokenizer_class_py)
                 return tokenizer_class_py.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
             else:
                 raise ValueError(
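From the caller's perspective the binding is transparent: whichever branch runs, `from_pretrained` returns an instance of the (possibly mixin-extended) class. A hedged usage sketch; the import path and model id are assumptions for illustration:

```python
# Hypothetical caller-side view; import path and model id are assumptions.
from paddleformers.transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("bert-base-uncased")

# With PaddlePaddle installed, PaddleTokenizerMixin appears in the MRO of the
# returned tokenizer's class; without it, this is the plain HF tokenizer class.
print([cls.__name__ for cls in type(tok).__mro__])
```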
File 3 (new file):
@@ -0,0 +1,35 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import sys
+from typing import TYPE_CHECKING
+
+from ...utils.lazy_import import _LazyModule
+
+import_structure = {
+    "tokenizer": ["Ernie4_5_VLTokenizer"],
+    "configuration": [
+        "Ernie4_5_VLMoEConfig",
+    ],
+}
+
+if TYPE_CHECKING:
+    from .configuration import *
+    from .tokenizer import Ernie4_5_VLTokenizer
+else:
+    sys.modules[__name__] = _LazyModule(
+        __name__,
+        globals()["__file__"],
+        import_structure,
+        module_spec=__spec__,
+    )
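With this `__init__.py` in place, importing the package is cheap: the module object is swapped for a `_LazyModule`, and the tokenizer submodule is only executed when one of its names is first touched. A consumer-side sketch (the package path follows the PR; behavior described is the intended lazy-load semantics):

```python
# Importing the subpackage does not execute tokenizer.py or configuration.py.
from paddleformers.transformers import ernie4_5vl

# First attribute access triggers the real import through _LazyModule.
tokenizer_cls = ernie4_5vl.Ernie4_5_VLTokenizer
```

The `if TYPE_CHECKING` branch keeps static analyzers and IDEs working, since they see ordinary imports instead of the runtime module swap.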
Review comment: We need to handle the case where `_get_arch_info` fails to import, e.g. it ends up as `_get_arch_info = None`; the check should then be guarded first, as in `if _get_arch_info is not None and (...)`.
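A minimal sketch of the guard the reviewer is asking for. The import location is an assumption (the diff does not show where `_get_arch_info` comes from), and the second clause stands in for the reviewer's unspecified "(...)" condition:

```python
# Sketch only: import path is hypothetical; adjust to the real module.
try:
    from paddle.device import _get_arch_info  # assumed location
except ImportError:
    _get_arch_info = None  # tolerate environments where the import fails

# Guard before use, per the review comment; short-circuits when unavailable.
if _get_arch_info is not None and _get_arch_info():
    pass  # arch-specific branch goes here
```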