From d3157b46eea7de9a4beef00bcd7dbbbb9dd8f544 Mon Sep 17 00:00:00 2001
From: -LAN-
Date: Wed, 9 Apr 2025 21:52:58 +0900
Subject: [PATCH] feat(large_language_model): Adds plugin-based token counting configuration option (#17706)

Signed-off-by: -LAN-
Co-authored-by: Yeuoly
---
 api/.env.example                            |  1 +
 api/configs/feature/__init__.py             |  7 ++++-
 api/core/app/apps/agent_chat/app_runner.py  | 14 ----------
 api/core/app/apps/chat/app_runner.py        | 14 ----------
 api/core/app/apps/completion/app_runner.py  | 14 ----------
 .../en_US/customizable_model_scale_out.md   |  2 +-
 .../zh_Hans/customizable_model_scale_out.md |  2 +-
 .../__base/large_language_model.py          | 26 ++++++++++---------
 docker/.env.example                         | 11 +++++---
 docker/docker-compose.yaml                  |  1 +
 10 files changed, 32 insertions(+), 60 deletions(-)

diff --git a/api/.env.example b/api/.env.example
index ba76274c3..3bbea44f2 100644
--- a/api/.env.example
+++ b/api/.env.example
@@ -326,6 +326,7 @@ UPLOAD_AUDIO_FILE_SIZE_LIMIT=50
 MULTIMODAL_SEND_FORMAT=base64
 PROMPT_GENERATION_MAX_TOKENS=512
 CODE_GENERATION_MAX_TOKENS=1024
+PLUGIN_BASED_TOKEN_COUNTING_ENABLED=false
 
 # Mail configuration, support: resend, smtp
 MAIL_TYPE=
diff --git a/api/configs/feature/__init__.py b/api/configs/feature/__init__.py
index fa8e8c2bf..d35a74e3e 100644
--- a/api/configs/feature/__init__.py
+++ b/api/configs/feature/__init__.py
@@ -442,7 +442,7 @@ class LoggingConfig(BaseSettings):
 
 class ModelLoadBalanceConfig(BaseSettings):
     """
-    Configuration for model load balancing
+    Configuration for model load balancing and token counting
     """
 
     MODEL_LB_ENABLED: bool = Field(
@@ -450,6 +450,11 @@ class ModelLoadBalanceConfig(BaseSettings):
         default=False,
     )
 
+    PLUGIN_BASED_TOKEN_COUNTING_ENABLED: bool = Field(
+        description="Enable or disable plugin-based token counting. If disabled, token counting will return 0.",
+        default=False,
+    )
+
 
 class BillingConfig(BaseSettings):
     """
diff --git a/api/core/app/apps/agent_chat/app_runner.py b/api/core/app/apps/agent_chat/app_runner.py
index 72a171711..71328f6d1 100644
--- a/api/core/app/apps/agent_chat/app_runner.py
+++ b/api/core/app/apps/agent_chat/app_runner.py
@@ -53,20 +53,6 @@ class AgentChatAppRunner(AppRunner):
         query = application_generate_entity.query
         files = application_generate_entity.files
 
-        # Pre-calculate the number of tokens of the prompt messages,
-        # and return the rest number of tokens by model context token size limit and max token size limit.
-        # If the rest number of tokens is not enough, raise exception.
-        # Include: prompt template, inputs, query(optional), files(optional)
-        # Not Include: memory, external data, dataset context
-        self.get_pre_calculate_rest_tokens(
-            app_record=app_record,
-            model_config=application_generate_entity.model_conf,
-            prompt_template_entity=app_config.prompt_template,
-            inputs=dict(inputs),
-            files=list(files),
-            query=query,
-        )
-
         memory = None
         if application_generate_entity.conversation_id:
             # get memory of conversation (read-only)
diff --git a/api/core/app/apps/chat/app_runner.py b/api/core/app/apps/chat/app_runner.py
index 8641f188f..39597fc03 100644
--- a/api/core/app/apps/chat/app_runner.py
+++ b/api/core/app/apps/chat/app_runner.py
@@ -61,20 +61,6 @@ class ChatAppRunner(AppRunner):
         )
         image_detail_config = image_detail_config or ImagePromptMessageContent.DETAIL.LOW
 
-        # Pre-calculate the number of tokens of the prompt messages,
-        # and return the rest number of tokens by model context token size limit and max token size limit.
-        # If the rest number of tokens is not enough, raise exception.
-        # Include: prompt template, inputs, query(optional), files(optional)
-        # Not Include: memory, external data, dataset context
-        self.get_pre_calculate_rest_tokens(
-            app_record=app_record,
-            model_config=application_generate_entity.model_conf,
-            prompt_template_entity=app_config.prompt_template,
-            inputs=inputs,
-            files=files,
-            query=query,
-        )
-
         memory = None
         if application_generate_entity.conversation_id:
             # get memory of conversation (read-only)
diff --git a/api/core/app/apps/completion/app_runner.py b/api/core/app/apps/completion/app_runner.py
index 4f1624731..80fdd0b80 100644
--- a/api/core/app/apps/completion/app_runner.py
+++ b/api/core/app/apps/completion/app_runner.py
@@ -54,20 +54,6 @@ class CompletionAppRunner(AppRunner):
         )
         image_detail_config = image_detail_config or ImagePromptMessageContent.DETAIL.LOW
 
-        # Pre-calculate the number of tokens of the prompt messages,
-        # and return the rest number of tokens by model context token size limit and max token size limit.
-        # If the rest number of tokens is not enough, raise exception.
-        # Include: prompt template, inputs, query(optional), files(optional)
-        # Not Include: memory, external data, dataset context
-        self.get_pre_calculate_rest_tokens(
-            app_record=app_record,
-            model_config=application_generate_entity.model_conf,
-            prompt_template_entity=app_config.prompt_template,
-            inputs=inputs,
-            files=files,
-            query=query,
-        )
-
         # organize all inputs and template to prompt messages
         # Include: prompt template, inputs, query(optional), files(optional)
         prompt_messages, stop = self.organize_prompt_messages(
diff --git a/api/core/model_runtime/docs/en_US/customizable_model_scale_out.md b/api/core/model_runtime/docs/en_US/customizable_model_scale_out.md
index f050919d8..b5a714a17 100644
--- a/api/core/model_runtime/docs/en_US/customizable_model_scale_out.md
+++ b/api/core/model_runtime/docs/en_US/customizable_model_scale_out.md
@@ -192,7 +192,7 @@ def get_num_tokens(self, model: str, credentials: dict, prompt_messages: list[Pr
         """
 ```
 
-Sometimes, you might not want to return 0 directly. In such cases, you can use `self._get_num_tokens_by_gpt2(text: str)` to get pre-computed tokens. This method is provided by the `AIModel` base class, and it uses GPT2's Tokenizer for calculation. However, it should be noted that this is only a substitute and may not be fully accurate.
+Sometimes, you might not want to return 0 directly. In such cases, you can use `self._get_num_tokens_by_gpt2(text: str)` to get an estimated token count, and make sure the environment variable `PLUGIN_BASED_TOKEN_COUNTING_ENABLED` is set to `true`; otherwise the plugin is never asked for a count and 0 is returned. This method is provided by the `AIModel` base class, and it uses GPT2's Tokenizer for calculation. However, it should be noted that this is only a substitute and may not be fully accurate.
 - Model Credentials Validation
 
diff --git a/api/core/model_runtime/docs/zh_Hans/customizable_model_scale_out.md b/api/core/model_runtime/docs/zh_Hans/customizable_model_scale_out.md
index 240f65802..c36575b9a 100644
--- a/api/core/model_runtime/docs/zh_Hans/customizable_model_scale_out.md
+++ b/api/core/model_runtime/docs/zh_Hans/customizable_model_scale_out.md
@@ -179,7 +179,7 @@ provider_credential_schema:
         """
 ```
 
- 有时候,也许你不需要直接返回0,所以你可以使用`self._get_num_tokens_by_gpt2(text: str)`来获取预计算的tokens,这个方法位于`AIModel`基类中,它会使用GPT2的Tokenizer进行计算,但是只能作为替代方法,并不完全准确。
+ 有时候,也许你不需要直接返回0,所以你可以使用`self._get_num_tokens_by_gpt2(text: str)`来获取预估的token数,并确保环境变量`PLUGIN_BASED_TOKEN_COUNTING_ENABLED`设置为`true`,否则将跳过插件计数并直接返回0。这个方法位于`AIModel`基类中,它会使用GPT2的Tokenizer进行计算,但是只能作为替代方法,并不完全准确。
 
 - 模型凭据校验
 
diff --git a/api/core/model_runtime/model_providers/__base/large_language_model.py b/api/core/model_runtime/model_providers/__base/large_language_model.py
index ed67fef76..b81ccafc1 100644
--- a/api/core/model_runtime/model_providers/__base/large_language_model.py
+++ b/api/core/model_runtime/model_providers/__base/large_language_model.py
@@ -295,18 +295,20 @@ class LargeLanguageModel(AIModel):
         :param tools: tools for tool calling
         :return:
         """
-        plugin_model_manager = PluginModelManager()
-        return plugin_model_manager.get_llm_num_tokens(
-            tenant_id=self.tenant_id,
-            user_id="unknown",
-            plugin_id=self.plugin_id,
-            provider=self.provider_name,
-            model_type=self.model_type.value,
-            model=model,
-            credentials=credentials,
-            prompt_messages=prompt_messages,
-            tools=tools,
-        )
+        if dify_config.PLUGIN_BASED_TOKEN_COUNTING_ENABLED:
+            plugin_model_manager = PluginModelManager()
+            return plugin_model_manager.get_llm_num_tokens(
+                tenant_id=self.tenant_id,
+                user_id="unknown",
+                plugin_id=self.plugin_id,
+                provider=self.provider_name,
+                model_type=self.model_type.value,
+                model=model,
+                credentials=credentials,
+                prompt_messages=prompt_messages,
+                tools=tools,
+            )
+        return 0
 
     def _calc_response_usage(
         self, model: str, credentials: dict, prompt_tokens: int, completion_tokens: int
diff --git a/docker/.env.example b/docker/.env.example
index 0dc25fb5f..6e2c4d3d9 100644
--- a/docker/.env.example
+++ b/docker/.env.example
@@ -75,7 +75,7 @@ SECRET_KEY=sk-9f73s3ljTXVcMT3Blb3ljTqtsKiGHXVcMT3BlbkFJLK7U
 
 # Password for admin user initialization.
 # If left unset, admin user will not be prompted for a password
-# when creating the initial admin account. 
+# when creating the initial admin account.
 # The length of the password cannot exceed 30 characters.
 INIT_PASSWORD=
 
@@ -605,17 +605,22 @@ SCARF_NO_ANALYTICS=true
 # ------------------------------
 
 # The maximum number of tokens allowed for prompt generation.
-# This setting controls the upper limit of tokens that can be used by the LLM 
+# This setting controls the upper limit of tokens that can be used by the LLM
 # when generating a prompt in the prompt generation tool.
 # Default: 512 tokens.
 PROMPT_GENERATION_MAX_TOKENS=512
 
 # The maximum number of tokens allowed for code generation.
-# This setting controls the upper limit of tokens that can be used by the LLM 
+# This setting controls the upper limit of tokens that can be used by the LLM
 # when generating code in the code generation tool.
 # Default: 1024 tokens.
 CODE_GENERATION_MAX_TOKENS=1024
 
+# Enable or disable plugin-based token counting. If disabled, token counting will return 0.
+# Disabling it can improve performance by skipping token counting operations.
+# Default: false (disabled).
+PLUGIN_BASED_TOKEN_COUNTING_ENABLED=false
+
 # ------------------------------
 # Multi-modal Configuration
 # ------------------------------
diff --git a/docker/docker-compose.yaml b/docker/docker-compose.yaml
index e6fafd2dd..b317fffc0 100644
--- a/docker/docker-compose.yaml
+++ b/docker/docker-compose.yaml
@@ -276,6 +276,7 @@ x-shared-env: &shared-api-worker-env
   SCARF_NO_ANALYTICS: ${SCARF_NO_ANALYTICS:-true}
   PROMPT_GENERATION_MAX_TOKENS: ${PROMPT_GENERATION_MAX_TOKENS:-512}
   CODE_GENERATION_MAX_TOKENS: ${CODE_GENERATION_MAX_TOKENS:-1024}
+  PLUGIN_BASED_TOKEN_COUNTING_ENABLED: ${PLUGIN_BASED_TOKEN_COUNTING_ENABLED:-false}
   MULTIMODAL_SEND_FORMAT: ${MULTIMODAL_SEND_FORMAT:-base64}
   UPLOAD_IMAGE_FILE_SIZE_LIMIT: ${UPLOAD_IMAGE_FILE_SIZE_LIMIT:-10}
   UPLOAD_VIDEO_FILE_SIZE_LIMIT: ${UPLOAD_VIDEO_FILE_SIZE_LIMIT:-100}
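
Note for plugin authors, following the docs change above: a minimal sketch of what a `get_num_tokens` override using the GPT-2 fallback could look like. The class name is hypothetical, the import paths mirror the `api/core` tree touched by this patch rather than the plugin SDK, and flattening message contents to plain text is a deliberate simplification — treat this as an illustration of the technique, not the canonical implementation.

```python
# Hypothetical example, not part of this patch: a model implementation that
# estimates token usage via the GPT-2 tokenizer helper from the AIModel base
# class instead of returning 0. Import paths are assumed from api/core above.
from core.model_runtime.entities.message_entities import PromptMessage, PromptMessageTool
from core.model_runtime.model_providers.__base.large_language_model import LargeLanguageModel


class ExampleLargeLanguageModel(LargeLanguageModel):
    def get_num_tokens(
        self,
        model: str,
        credentials: dict,
        prompt_messages: list[PromptMessage],
        tools: list[PromptMessageTool] | None = None,
    ) -> int:
        # Flatten all message contents into one string and count GPT-2 tokens.
        # This hook is only reached when PLUGIN_BASED_TOKEN_COUNTING_ENABLED is
        # true; with the flag off, the framework-side gate shown in the
        # large_language_model.py hunk returns 0 before the plugin is consulted.
        text = "".join(str(message.content) for message in prompt_messages)
        return self._get_num_tokens_by_gpt2(text)
```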
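Operationally, the new flag is an ordinary Pydantic settings field, so it is flipped per deployment through the environment, which is what the docker/.env.example and docker-compose.yaml hunks wire up. A self-contained sketch of that mechanism, using a stand-in `DemoConfig` class rather than the real `dify_config`:

```python
# Standalone demonstration of how a pydantic-settings boolean field picks up
# its value from the environment, defaulting to False when the variable is unset.
import os

from pydantic import Field
from pydantic_settings import BaseSettings


class DemoConfig(BaseSettings):
    PLUGIN_BASED_TOKEN_COUNTING_ENABLED: bool = Field(
        description="Enable or disable plugin-based token counting.",
        default=False,
    )


print(DemoConfig().PLUGIN_BASED_TOKEN_COUNTING_ENABLED)  # False: the default

os.environ["PLUGIN_BASED_TOKEN_COUNTING_ENABLED"] = "true"
print(DemoConfig().PLUGIN_BASED_TOKEN_COUNTING_ENABLED)  # True: parsed from env
```

Keeping the default at `false` means existing deployments skip the round trip to the plugin daemon and simply record 0 tokens, which is the performance trade-off the docker/.env.example comment describes.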