Files
dify/api/core/plugin/utils/chunk_merger.py
-LAN- a2e0f80c01 [Chore/Refactor] Improve type checking configuration (#25185)
Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com>
2025-09-05 08:34:18 +08:00

93 lines
3.5 KiB
Python

from collections.abc import Generator
from dataclasses import dataclass, field
from typing import TypeVar, Union, cast
from core.agent.entities import AgentInvokeMessage
from core.tools.entities.tool_entities import ToolInvokeMessage
# Type variable bounded by the two invoke-message types; merge_blob_chunks
# uses it so the yielded messages are typed the same as the incoming ones.
MessageType = TypeVar("MessageType", bound=Union[ToolInvokeMessage, AgentInvokeMessage])
@dataclass
class FileChunk:
    """
    Pre-sized accumulation buffer for one streamed file.

    ``total_length`` is the size the buffer is allocated with, ``data`` is the
    zero-filled buffer itself, and ``bytes_written`` starts at 0 and is meant
    to be advanced by whoever fills the buffer.
    """

    total_length: int
    bytes_written: int = field(init=False, default=0)
    data: bytearray = field(init=False)

    def __post_init__(self) -> None:
        # The buffer size depends on another field, so it has to be built
        # after __init__ has stored total_length.
        size = self.total_length
        self.data = bytearray(size)
def merge_blob_chunks(
    response: Generator[MessageType, None, None],
    max_file_size: int = 30 * 1024 * 1024,
    max_chunk_size: int = 8192,
) -> Generator[MessageType, None, None]:
    """
    Merge streaming blob chunks into complete blob messages.

    This function processes a stream of plugin invoke messages, accumulating
    BLOB_CHUNK messages by their ID until the final chunk is received,
    then yielding a single complete BLOB message. Every other message is
    passed through unchanged, in order.

    The per-file buffer is pre-allocated from the chunk's declared
    ``total_length``, but never with more than ``max_file_size`` bytes, so a
    stream that declares an oversized length cannot force a large allocation
    before the size limit is enforced. The buffer still grows on demand if
    more data arrives than was pre-allocated.

    Args:
        response: Generator yielding messages that may include blob chunks
        max_file_size: Maximum allowed file size in bytes (default: 30MB)
        max_chunk_size: Maximum allowed chunk size in bytes (default: 8KB)

    Yields:
        Messages from the response stream, with blob chunks merged into complete blobs

    Raises:
        ValueError: If file size exceeds max_file_size or chunk size exceeds max_chunk_size
    """
    files: dict[str, FileChunk] = {}

    for resp in response:
        # Anything that is not a blob chunk is forwarded untouched.
        if resp.type != ToolInvokeMessage.MessageType.BLOB_CHUNK:
            yield resp
            continue

        assert isinstance(resp.message, ToolInvokeMessage.BlobChunkMessage)
        chunk = resp.message

        # Get blob chunk information
        chunk_id = chunk.id
        total_length = chunk.total_length
        blob_data = chunk.blob
        is_end = chunk.end
        chunk_len = len(blob_data)

        # Initialize the buffer for this file on its first chunk. The declared
        # length is only a sizing hint, so cap it at the allowed maximum
        # instead of trusting it for the allocation.
        buffer = files.get(chunk_id)
        if buffer is None:
            buffer = FileChunk(min(total_length, max_file_size))
            files[chunk_id] = buffer

        # Check if file is too large (before appending)
        if buffer.bytes_written + chunk_len > max_file_size:
            # Drop the partial file before reporting the error
            del files[chunk_id]
            raise ValueError(f"File is too large which reached the limit of {max_file_size / 1024 / 1024}MB")

        # Check if single chunk is too large
        if chunk_len > max_chunk_size:
            raise ValueError(f"File chunk is too large which reached the limit of {max_chunk_size / 1024}KB")

        # Write the chunk at the current offset. Slice assignment overwrites the
        # pre-allocated region and extends the bytearray when writing past its end.
        start = buffer.bytes_written
        buffer.data[start : start + chunk_len] = blob_data
        buffer.bytes_written = start + chunk_len

        # If this is the final chunk, yield a complete blob message
        if is_end:
            # Build the merged message with the same class as the incoming one
            message_class = type(resp)
            merged_message = message_class(
                type=ToolInvokeMessage.MessageType.BLOB,
                # Only the bytes actually received are emitted, not the padding
                message=ToolInvokeMessage.BlobMessage(blob=buffer.data[: buffer.bytes_written]),
                meta=resp.meta,
            )
            yield cast(MessageType, merged_message)

            # Clean up the buffer
            del files[chunk_id]