feat/enhance the multi-modal support (#8818)

-LAN-
2024-10-21 10:43:49 +08:00
committed by GitHub
parent 7a1d6fe509
commit e61752bd3a
267 changed files with 6263 additions and 3523 deletions

View File

@@ -1,5 +1,4 @@
 import logging
-import os
 import re
 import time
 from abc import abstractmethod
@@ -8,6 +7,7 @@ from typing import Optional, Union
 from pydantic import ConfigDict
 
+from configs import dify_config
 from core.model_runtime.callbacks.base_callback import Callback
 from core.model_runtime.callbacks.logging_callback import LoggingCallback
 from core.model_runtime.entities.llm_entities import LLMMode, LLMResult, LLMResultChunk, LLMResultChunkDelta, LLMUsage
@@ -77,7 +77,7 @@ class LargeLanguageModel(AIModel):
         callbacks = callbacks or []
 
-        if bool(os.environ.get("DEBUG", "False").lower() == "true"):
+        if dify_config.DEBUG:
             callbacks.append(LoggingCallback())
 
         # trigger before invoke callbacks
@@ -107,7 +107,16 @@ class LargeLanguageModel(AIModel):
                     callbacks=callbacks,
                 )
             else:
-                result = self._invoke(model, credentials, prompt_messages, model_parameters, tools, stop, stream, user)
+                result = self._invoke(
+                    model=model,
+                    credentials=credentials,
+                    prompt_messages=prompt_messages,
+                    model_parameters=model_parameters,
+                    tools=tools,
+                    stop=stop,
+                    stream=stream,
+                    user=user,
+                )
         except Exception as e:
             self._trigger_invoke_error_callbacks(
                 model=model,
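
Two things change in large_language_model.py: the debug gate now reads the typed dify_config.DEBUG flag instead of string-comparing an environment variable, and the fallback _invoke call spells out its keyword arguments. A minimal sketch of why the typed flag is less error-prone, assuming dify_config is a pydantic-settings object (the real class in configs/ has many more fields; DifyConfig here is a stand-in):

import os

from pydantic_settings import BaseSettings


class DifyConfig(BaseSettings):
    # Stand-in for the real settings class in configs/; pydantic-settings
    # reads DEBUG from the environment and validates it into a real bool.
    DEBUG: bool = False


os.environ["DEBUG"] = "1"
assert DifyConfig().DEBUG is True
# The old check, os.environ.get("DEBUG", "False").lower() == "true",
# would have evaluated to False for "1".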

View File

@@ -1,6 +1,7 @@
 import logging
 import re
 from abc import abstractmethod
+from collections.abc import Iterable
 from typing import Any, Optional
 
 from pydantic import ConfigDict
@@ -22,8 +23,14 @@ class TTSModel(AIModel):
     model_config = ConfigDict(protected_namespaces=())
 
     def invoke(
-        self, model: str, tenant_id: str, credentials: dict, content_text: str, voice: str, user: Optional[str] = None
-    ):
+        self,
+        model: str,
+        tenant_id: str,
+        credentials: dict,
+        content_text: str,
+        voice: str,
+        user: Optional[str] = None,
+    ) -> Iterable[bytes]:
         """
         Invoke text-to-speech model
@@ -50,8 +57,14 @@ class TTSModel(AIModel):
 
     @abstractmethod
     def _invoke(
-        self, model: str, tenant_id: str, credentials: dict, content_text: str, voice: str, user: Optional[str] = None
-    ):
+        self,
+        model: str,
+        tenant_id: str,
+        credentials: dict,
+        content_text: str,
+        voice: str,
+        user: Optional[str] = None,
+    ) -> Iterable[bytes]:
         """
         Invoke text-to-speech model
@@ -68,25 +81,25 @@ class TTSModel(AIModel):
     def get_tts_model_voices(self, model: str, credentials: dict, language: Optional[str] = None) -> list:
         """
-        Get voice for given tts model voices
+        Retrieves the list of voices supported by a given text-to-speech (TTS) model.
 
-        :param language: tts language
-        :param model: model name
-        :param credentials: model credentials
-        :return: voices lists
+        :param language: The language for which the voices are requested.
+        :param model: The name of the TTS model.
+        :param credentials: The credentials required to access the TTS model.
+        :return: A list of voices supported by the TTS model.
         """
         model_schema = self.get_model_schema(model, credentials)
 
-        if model_schema and ModelPropertyKey.VOICES in model_schema.model_properties:
-            voices = model_schema.model_properties[ModelPropertyKey.VOICES]
-            if language:
-                return [
-                    {"name": d["name"], "value": d["mode"]}
-                    for d in voices
-                    if language and language in d.get("language")
-                ]
-            else:
-                return [{"name": d["name"], "value": d["mode"]} for d in voices]
+        if not model_schema or ModelPropertyKey.VOICES not in model_schema.model_properties:
+            raise ValueError("this model does not support voice")
+
+        voices = model_schema.model_properties[ModelPropertyKey.VOICES]
+        if language:
+            return [
+                {"name": d["name"], "value": d["mode"]} for d in voices if language and language in d.get("language")
+            ]
+        else:
+            return [{"name": d["name"], "value": d["mode"]} for d in voices]
 
     def _get_model_default_voice(self, model: str, credentials: dict) -> Any:
         """
@@ -111,8 +124,10 @@ class TTSModel(AIModel):
         """
         model_schema = self.get_model_schema(model, credentials)
 
-        if model_schema and ModelPropertyKey.AUDIO_TYPE in model_schema.model_properties:
-            return model_schema.model_properties[ModelPropertyKey.AUDIO_TYPE]
+        if not model_schema or ModelPropertyKey.AUDIO_TYPE not in model_schema.model_properties:
+            raise ValueError("this model does not support audio type")
+
+        return model_schema.model_properties[ModelPropertyKey.AUDIO_TYPE]
 
     def _get_model_word_limit(self, model: str, credentials: dict) -> int:
         """
@@ -121,8 +136,10 @@ class TTSModel(AIModel):
         """
         model_schema = self.get_model_schema(model, credentials)
 
-        if model_schema and ModelPropertyKey.WORD_LIMIT in model_schema.model_properties:
-            return model_schema.model_properties[ModelPropertyKey.WORD_LIMIT]
+        if not model_schema or ModelPropertyKey.WORD_LIMIT not in model_schema.model_properties:
+            raise ValueError("this model does not support word limit")
+
+        return model_schema.model_properties[ModelPropertyKey.WORD_LIMIT]
 
     def _get_model_workers_limit(self, model: str, credentials: dict) -> int:
         """
@@ -131,8 +148,10 @@ class TTSModel(AIModel):
         """
         model_schema = self.get_model_schema(model, credentials)
 
-        if model_schema and ModelPropertyKey.MAX_WORKERS in model_schema.model_properties:
-            return model_schema.model_properties[ModelPropertyKey.MAX_WORKERS]
+        if not model_schema or ModelPropertyKey.MAX_WORKERS not in model_schema.model_properties:
+            raise ValueError("this model does not support max workers")
+
+        return model_schema.model_properties[ModelPropertyKey.MAX_WORKERS]
 
     @staticmethod
     def _split_text_into_sentences(org_text, max_length=2000, pattern=r"[。.!?]"):
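
With Iterable[bytes] on the signatures and ValueError replacing silent None returns, callers of TTSModel get an explicit contract. A hedged sketch of a consumer under those contracts (tts_model, model, and credentials stand in for a concrete subclass and real values):

from collections.abc import Iterable
from typing import Optional


def save_speech(chunks: Iterable[bytes], path: str) -> None:
    # invoke() now promises an iterable of audio byte chunks, so they can be
    # streamed straight to disk without inspecting the return type.
    with open(path, "wb") as f:
        for chunk in chunks:
            f.write(chunk)


def pick_voice(tts_model, model: str, credentials: dict, language: str) -> Optional[str]:
    # get_tts_model_voices() now raises ValueError when the model schema
    # declares no voices, instead of falling through and returning None.
    try:
        voices = tts_model.get_tts_model_voices(model, credentials, language)
    except ValueError:
        return None
    return voices[0]["value"] if voices else None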

View File

@@ -1,3 +1,4 @@
+- gpt-4o-audio-preview
 - gpt-4
 - gpt-4o
 - gpt-4o-2024-05-13

View File

@@ -0,0 +1,44 @@
+model: gpt-4o-audio-preview
+label:
+  zh_Hans: gpt-4o-audio-preview
+  en_US: gpt-4o-audio-preview
+model_type: llm
+features:
+  - multi-tool-call
+  - agent-thought
+  - stream-tool-call
+  - vision
+model_properties:
+  mode: chat
+  context_size: 128000
+parameter_rules:
+  - name: temperature
+    use_template: temperature
+  - name: top_p
+    use_template: top_p
+  - name: presence_penalty
+    use_template: presence_penalty
+  - name: frequency_penalty
+    use_template: frequency_penalty
+  - name: max_tokens
+    use_template: max_tokens
+    default: 512
+    min: 1
+    max: 4096
+  - name: response_format
+    label:
+      zh_Hans: 回复格式
+      en_US: Response Format
+    type: string
+    help:
+      zh_Hans: 指定模型必须输出的格式
+      en_US: specifying the format that the model must output
+    required: false
+    options:
+      - text
+      - json_object
+pricing:
+  input: '5.00'
+  output: '15.00'
+  unit: '0.000001'
+  currency: USD
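
The new gpt-4o-audio-preview schema follows the same layout as the provider's other model YAML files. A quick sanity check of such a file, assuming PyYAML for this sketch (the model runtime ships its own schema loader):

import yaml  # PyYAML; an assumption for this sketch, not the runtime's loader

with open("gpt-4o-audio-preview.yaml") as f:
    schema = yaml.safe_load(f)

assert schema["model_properties"]["context_size"] == 128000
assert "vision" in schema["features"]

# max_tokens defaults to 512 and is capped at 4096 by the parameter rules.
rule = next(r for r in schema["parameter_rules"] if r["name"] == "max_tokens")
assert rule["default"] == 512 and rule["max"] == 4096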

View File

@@ -1,7 +1,7 @@
 import json
 import logging
 from collections.abc import Generator
-from typing import Optional, Union, cast
+from typing import Any, Optional, Union, cast
 
 import tiktoken
 from openai import OpenAI, Stream
@@ -11,9 +11,9 @@ from openai.types.chat.chat_completion_chunk import ChoiceDeltaFunctionCall, Cho
 from openai.types.chat.chat_completion_message import FunctionCall
 
 from core.model_runtime.callbacks.base_callback import Callback
-from core.model_runtime.entities.llm_entities import LLMMode, LLMResult, LLMResultChunk, LLMResultChunkDelta
-from core.model_runtime.entities.message_entities import (
+from core.model_runtime.entities import (
     AssistantPromptMessage,
+    AudioPromptMessageContent,
     ImagePromptMessageContent,
     PromptMessage,
     PromptMessageContentType,
@@ -23,6 +23,7 @@ from core.model_runtime.entities.message_entities import (
     ToolPromptMessage,
     UserPromptMessage,
 )
+from core.model_runtime.entities.llm_entities import LLMMode, LLMResult, LLMResultChunk, LLMResultChunkDelta
 from core.model_runtime.entities.model_entities import AIModelEntity, FetchFrom, I18nObject, ModelType, PriceConfig
 from core.model_runtime.errors.validate import CredentialsValidateFailedError
 from core.model_runtime.model_providers.__base.large_language_model import LargeLanguageModel
@@ -613,6 +614,7 @@ class OpenAILargeLanguageModel(_CommonOpenAI, LargeLanguageModel):
         # clear illegal prompt messages
         prompt_messages = self._clear_illegal_prompt_messages(model, prompt_messages)
 
+        # o1 compatibility
         block_as_stream = False
         if model.startswith("o1"):
             if stream:
@@ -626,8 +628,9 @@ class OpenAILargeLanguageModel(_CommonOpenAI, LargeLanguageModel):
                 del extra_model_kwargs["stop"]
 
         # chat model
+        messages: Any = [self._convert_prompt_message_to_dict(m) for m in prompt_messages]
         response = client.chat.completions.create(
-            messages=[self._convert_prompt_message_to_dict(m) for m in prompt_messages],
+            messages=messages,
             model=model,
             stream=stream,
             **model_parameters,
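
Hoisting the comprehension into a messages: Any local keeps the call site readable and sidesteps the OpenAI SDK's strictly typed messages parameter, which this code feeds plain dicts. A sketch of the pattern (the API key is a placeholder):

from typing import Any

from openai import OpenAI

client = OpenAI(api_key="sk-placeholder")

# Annotated as Any because the SDK's overloads expect its own typed
# message params while this code builds plain dicts.
messages: Any = [{"role": "user", "content": "ping"}]
response = client.chat.completions.create(messages=messages, model="gpt-4o", stream=False)
print(response.choices[0].message.content)
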
@@ -946,23 +949,29 @@ class OpenAILargeLanguageModel(_CommonOpenAI, LargeLanguageModel):
         Convert PromptMessage to dict for OpenAI API
         """
         if isinstance(message, UserPromptMessage):
-            message = cast(UserPromptMessage, message)
             if isinstance(message.content, str):
                 message_dict = {"role": "user", "content": message.content}
-            else:
+            elif isinstance(message.content, list):
                 sub_messages = []
                 for message_content in message.content:
-                    if message_content.type == PromptMessageContentType.TEXT:
-                        message_content = cast(TextPromptMessageContent, message_content)
+                    if isinstance(message_content, TextPromptMessageContent):
                         sub_message_dict = {"type": "text", "text": message_content.data}
                         sub_messages.append(sub_message_dict)
-                    elif message_content.type == PromptMessageContentType.IMAGE:
-                        message_content = cast(ImagePromptMessageContent, message_content)
+                    elif isinstance(message_content, ImagePromptMessageContent):
                         sub_message_dict = {
                             "type": "image_url",
                             "image_url": {"url": message_content.data, "detail": message_content.detail.value},
                         }
                         sub_messages.append(sub_message_dict)
+                    elif isinstance(message_content, AudioPromptMessageContent):
+                        sub_message_dict = {
+                            "type": "input_audio",
+                            "input_audio": {
+                                "data": message_content.data,
+                                "format": message_content.format,
+                            },
+                        }
+                        sub_messages.append(sub_message_dict)
 
                 message_dict = {"role": "user", "content": sub_messages}
         elif isinstance(message, AssistantPromptMessage):
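
The converter now dispatches on isinstance rather than comparing type tags and casting, and the new AudioPromptMessageContent branch emits OpenAI's input_audio content part. A standalone sketch of the payload this produces (the audio bytes and format are placeholders):

import base64

# Placeholder audio; a real caller base64-encodes the bytes of a wav/mp3 file.
audio_b64 = base64.b64encode(b"\x00\x01\x02").decode()

message_dict = {
    "role": "user",
    "content": [
        {"type": "text", "text": "Transcribe this clip and answer the question in it."},
        {
            "type": "input_audio",
            # The chat completions API expects base64 data plus the container format.
            "input_audio": {"data": audio_b64, "format": "wav"},
        },
    ],
}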