Immediately return the initialized tiktokenizer instance and remove dead code in usage of tiktokenizer (#17957)

2025-04-30 16:07:20 +08:00
parent bcc95e520b
commit 12c96b93d9
5 changed files with 18 additions and 73 deletions
--- a/api/core/model_runtime/model_providers/__base/ai_model.py
+++ b/api/core/model_runtime/model_providers/__base/ai_model.py
@@ -24,7 +24,6 @@ from core.model_runtime.errors.invoke import (
    InvokeRateLimitError,
    InvokeServerUnavailableError,
 )
-from core.model_runtime.model_providers.__base.tokenizers.gpt2_tokenzier import GPT2Tokenizer
 from core.plugin.entities.plugin_daemon import PluginDaemonInnerError, PluginModelProviderEntity
 from core.plugin.impl.model import PluginModelClient

@@ -253,15 +252,3 @@ class AIModel(BaseModel):
            raise Exception(f"Invalid model parameter rule name {name}")

        return default_parameter_rule
-
-    def _get_num_tokens_by_gpt2(self, text: str) -> int:
-        """
-        Get number of tokens for given prompt messages by gpt2
-        Some provider models do not provide an interface for obtaining the number of tokens.
-        Here, the gpt2 tokenizer is used to calculate the number of tokens.
-        This method can be executed offline, and the gpt2 tokenizer has been cached in the project.
-
-        :param text: plain text of prompt. You need to convert the original message to plain text
-        :return: number of tokens
-        """
-        return GPT2Tokenizer.get_num_tokens(text)
--- a/api/core/model_runtime/model_providers/__base/tokenizers/gpt2_tokenzier.py
+++ b/api/core/model_runtime/model_providers/__base/tokenizers/gpt2_tokenzier.py
@@ -30,6 +30,8 @@ class GPT2Tokenizer:
    @staticmethod
    def get_encoder() -> Any:
        global _tokenizer, _lock
+        if _tokenizer is not None:
+            return _tokenizer
        with _lock:
            if _tokenizer is None:
                # Try to use tiktoken to get the tokenizer because it is faster
--- a/api/core/rag/splitter/text_splitter.py
+++ b/api/core/rag/splitter/text_splitter.py
@@ -159,50 +159,6 @@ class TextSplitter(BaseDocumentTransformer, ABC):
            )
        return cls(length_function=lambda x: [_huggingface_tokenizer_length(text) for text in x], **kwargs)

-    @classmethod
-    def from_tiktoken_encoder(
-        cls: type[TS],
-        encoding_name: str = "gpt2",
-        model_name: Optional[str] = None,
-        allowed_special: Union[Literal["all"], Set[str]] = set(),
-        disallowed_special: Union[Literal["all"], Collection[str]] = "all",
-        **kwargs: Any,
-    ) -> TS:
-        """Text splitter that uses tiktoken encoder to count length."""
-        try:
-            import tiktoken
-        except ImportError:
-            raise ImportError(
-                "Could not import tiktoken python package. "
-                "This is needed in order to calculate max_tokens_for_prompt. "
-                "Please install it with `pip install tiktoken`."
-            )
-
-        if model_name is not None:
-            enc = tiktoken.encoding_for_model(model_name)
-        else:
-            enc = tiktoken.get_encoding(encoding_name)
-
-        def _tiktoken_encoder(text: str) -> int:
-            return len(
-                enc.encode(
-                    text,
-                    allowed_special=allowed_special,
-                    disallowed_special=disallowed_special,
-                )
-            )
-
-        if issubclass(cls, TokenTextSplitter):
-            extra_kwargs = {
-                "encoding_name": encoding_name,
-                "model_name": model_name,
-                "allowed_special": allowed_special,
-                "disallowed_special": disallowed_special,
-            }
-            kwargs = {**kwargs, **extra_kwargs}
-
-        return cls(length_function=lambda x: [_tiktoken_encoder(text) for text in x], **kwargs)
-
    def transform_documents(self, documents: Sequence[Document], **kwargs: Any) -> Sequence[Document]:
        """Transform sequence of documents by splitting them."""
        return self.split_documents(list(documents))