text splitter length method uses default embedding model tokenizer (#2011)

Co-authored-by: jyong <jyong@dify.ai>
2024-01-12 18:45:34 +08:00
parent 1779cea6e3
commit a63a9c7d45
2 changed files with 69 additions and 24 deletions
--- a/api/core/spiltter/fixed_text_splitter.py
+++ b/api/core/spiltter/fixed_text_splitter.py
@@ -1,8 +1,10 @@
 """Functionality for splitting text."""
 from __future__ import annotations

-from typing import Any, List, Optional
+from typing import Any, List, Optional, cast

+from core.model_manager import ModelInstance
+from core.model_runtime.model_providers.__base.text_embedding_model import TextEmbeddingModel
 from core.model_runtime.model_providers.__base.tokenizers.gpt2_tokenzier import GPT2Tokenizer
 from langchain.text_splitter import (TS, AbstractSet, Collection, Literal, RecursiveCharacterTextSplitter,
                                     TokenTextSplitter, Type, Union)
@@ -12,22 +14,30 @@ class EnhanceRecursiveCharacterTextSplitter(RecursiveCharacterTextSplitter):
    """
        This class is used to implement from_gpt2_encoder, to prevent using of tiktoken
    """
+
    @classmethod
-    def from_gpt2_encoder(
-        cls: Type[TS],
-        encoding_name: str = "gpt2",
-        model_name: Optional[str] = None,
-        allowed_special: Union[Literal["all"], AbstractSet[str]] = set(),
-        disallowed_special: Union[Literal["all"], Collection[str]] = "all",
-        **kwargs: Any,
+    def from_encoder(
+            cls: Type[TS],
+            embedding_model_instance: Optional[ModelInstance],
+            allowed_special: Union[Literal["all"], AbstractSet[str]] = set(),
+            disallowed_special: Union[Literal["all"], Collection[str]] = "all",
+            **kwargs: Any,
    ):
        def _token_encoder(text: str) -> int:
-            return GPT2Tokenizer.get_num_tokens(text)
+            if embedding_model_instance:
+                embedding_model_type_instance = embedding_model_instance.model_type_instance
+                embedding_model_type_instance = cast(TextEmbeddingModel, embedding_model_type_instance)
+                return embedding_model_type_instance.get_num_tokens(
+                    model=embedding_model_instance.model,
+                    credentials=embedding_model_instance.credentials,
+                    texts=[text]
+                )
+            else:
+                return GPT2Tokenizer.get_num_tokens(text)

        if issubclass(cls, TokenTextSplitter):
            extra_kwargs = {
-                "encoding_name": encoding_name,
-                "model_name": model_name,
+                "model_name": embedding_model_instance.model if embedding_model_instance else 'gpt2',
                "allowed_special": allowed_special,
                "disallowed_special": disallowed_special,
            }
@@ -35,6 +45,7 @@ class EnhanceRecursiveCharacterTextSplitter(RecursiveCharacterTextSplitter):

        return cls(length_function=_token_encoder, **kwargs)

+
 class FixedRecursiveCharacterTextSplitter(EnhanceRecursiveCharacterTextSplitter):
    def __init__(self, fixed_separator: str = "\n\n", separators: Optional[List[str]] = None, **kwargs: Any):
        """Create a new TextSplitter."""
@@ -90,4 +101,4 @@ class FixedRecursiveCharacterTextSplitter(EnhanceRecursiveCharacterTextSplitter)
        if _good_splits:
            merged_text = self._merge_splits(_good_splits, separator)
            final_chunks.extend(merged_text)
-        return final_chunks
+        return final_chunks