From 5b3cc560d5b059f6a037d0da5365d6ac53230594 Mon Sep 17 00:00:00 2001 From: Frederick2313072 <163711933+Frederick2313072@users.noreply.github.com> Date: Mon, 1 Sep 2025 15:46:37 +0800 Subject: [PATCH] fix:hard-coded top-k fallback issue. (#24879) --- api/core/rag/datasource/retrieval_service.py | 2 +- api/core/rag/datasource/vdb/couchbase/couchbase_vector.py | 2 +- api/core/rag/retrieval/dataset_retrieval.py | 6 +++--- .../utils/dataset_retriever/dataset_multi_retriever_tool.py | 4 ++-- .../utils/dataset_retriever/dataset_retriever_base_tool.py | 2 +- .../nodes/knowledge_retrieval/knowledge_retrieval_node.py | 2 +- api/services/dataset_service.py | 4 ++-- api/services/hit_testing_service.py | 4 ++-- .../datasets/external-knowledge-base/create/index.tsx | 2 +- web/app/components/datasets/hit-testing/textarea.tsx | 2 +- web/context/debug-configuration.ts | 2 +- 11 files changed, 16 insertions(+), 16 deletions(-) diff --git a/api/core/rag/datasource/retrieval_service.py b/api/core/rag/datasource/retrieval_service.py index e872a4e37..2912a48d9 100644 --- a/api/core/rag/datasource/retrieval_service.py +++ b/api/core/rag/datasource/retrieval_service.py @@ -24,7 +24,7 @@ default_retrieval_model = { "search_method": RetrievalMethod.SEMANTIC_SEARCH.value, "reranking_enable": False, "reranking_model": {"reranking_provider_name": "", "reranking_model_name": ""}, - "top_k": 2, + "top_k": 4, "score_threshold_enabled": False, } diff --git a/api/core/rag/datasource/vdb/couchbase/couchbase_vector.py b/api/core/rag/datasource/vdb/couchbase/couchbase_vector.py index bd986393d..d22a7e4fd 100644 --- a/api/core/rag/datasource/vdb/couchbase/couchbase_vector.py +++ b/api/core/rag/datasource/vdb/couchbase/couchbase_vector.py @@ -304,7 +304,7 @@ class CouchbaseVector(BaseVector): return docs def search_by_full_text(self, query: str, **kwargs: Any) -> list[Document]: - top_k = kwargs.get("top_k", 2) + top_k = kwargs.get("top_k", 4) try: CBrequest = search.SearchRequest.create(search.QueryStringQuery("text:" + query)) search_iter = self._scope.search( diff --git a/api/core/rag/retrieval/dataset_retrieval.py b/api/core/rag/retrieval/dataset_retrieval.py index cd4af7283..49c72b4ba 100644 --- a/api/core/rag/retrieval/dataset_retrieval.py +++ b/api/core/rag/retrieval/dataset_retrieval.py @@ -65,7 +65,7 @@ default_retrieval_model: dict[str, Any] = { "search_method": RetrievalMethod.SEMANTIC_SEARCH.value, "reranking_enable": False, "reranking_model": {"reranking_provider_name": "", "reranking_model_name": ""}, - "top_k": 2, + "top_k": 4, "score_threshold_enabled": False, } @@ -647,7 +647,7 @@ class DatasetRetrieval: retrieval_method=retrieval_model["search_method"], dataset_id=dataset.id, query=query, - top_k=retrieval_model.get("top_k") or 2, + top_k=retrieval_model.get("top_k") or 4, score_threshold=retrieval_model.get("score_threshold", 0.0) if retrieval_model["score_threshold_enabled"] else 0.0, @@ -743,7 +743,7 @@ class DatasetRetrieval: tool = DatasetMultiRetrieverTool.from_dataset( dataset_ids=[dataset.id for dataset in available_datasets], tenant_id=tenant_id, - top_k=retrieve_config.top_k or 2, + top_k=retrieve_config.top_k or 4, score_threshold=retrieve_config.score_threshold, hit_callbacks=[hit_callback], return_resource=return_resource, diff --git a/api/core/tools/utils/dataset_retriever/dataset_multi_retriever_tool.py b/api/core/tools/utils/dataset_retriever/dataset_multi_retriever_tool.py index 7eb4bc017..56c6a9fbe 100644 --- a/api/core/tools/utils/dataset_retriever/dataset_multi_retriever_tool.py +++ b/api/core/tools/utils/dataset_retriever/dataset_multi_retriever_tool.py @@ -181,7 +181,7 @@ class DatasetMultiRetrieverTool(DatasetRetrieverBaseTool): retrieval_method="keyword_search", dataset_id=dataset.id, query=query, - top_k=retrieval_model.get("top_k") or 2, + top_k=retrieval_model.get("top_k") or 4, ) if documents: all_documents.extend(documents) @@ -192,7 +192,7 @@ class DatasetMultiRetrieverTool(DatasetRetrieverBaseTool): retrieval_method=retrieval_model["search_method"], dataset_id=dataset.id, query=query, - top_k=retrieval_model.get("top_k") or 2, + top_k=retrieval_model.get("top_k") or 4, score_threshold=retrieval_model.get("score_threshold", 0.0) if retrieval_model["score_threshold_enabled"] else 0.0, diff --git a/api/core/tools/utils/dataset_retriever/dataset_retriever_base_tool.py b/api/core/tools/utils/dataset_retriever/dataset_retriever_base_tool.py index 567275531..4f489e00f 100644 --- a/api/core/tools/utils/dataset_retriever/dataset_retriever_base_tool.py +++ b/api/core/tools/utils/dataset_retriever/dataset_retriever_base_tool.py @@ -13,7 +13,7 @@ class DatasetRetrieverBaseTool(BaseModel, ABC): name: str = "dataset" description: str = "use this to retrieve a dataset. " tenant_id: str - top_k: int = 2 + top_k: int = 4 score_threshold: Optional[float] = None hit_callbacks: list[DatasetIndexToolCallbackHandler] = [] return_resource: bool diff --git a/api/core/workflow/nodes/knowledge_retrieval/knowledge_retrieval_node.py b/api/core/workflow/nodes/knowledge_retrieval/knowledge_retrieval_node.py index 5e5c9f520..a44f15f87 100644 --- a/api/core/workflow/nodes/knowledge_retrieval/knowledge_retrieval_node.py +++ b/api/core/workflow/nodes/knowledge_retrieval/knowledge_retrieval_node.py @@ -78,7 +78,7 @@ default_retrieval_model = { "search_method": RetrievalMethod.SEMANTIC_SEARCH.value, "reranking_enable": False, "reranking_model": {"reranking_provider_name": "", "reranking_model_name": ""}, - "top_k": 2, + "top_k": 4, "score_threshold_enabled": False, } diff --git a/api/services/dataset_service.py b/api/services/dataset_service.py index bbebb7a92..d3a98bf0a 100644 --- a/api/services/dataset_service.py +++ b/api/services/dataset_service.py @@ -1149,7 +1149,7 @@ class DocumentService: "search_method": RetrievalMethod.SEMANTIC_SEARCH.value, "reranking_enable": False, "reranking_model": {"reranking_provider_name": "", "reranking_model_name": ""}, - "top_k": 2, + "top_k": 4, "score_threshold_enabled": False, } @@ -1612,7 +1612,7 @@ class DocumentService: search_method=RetrievalMethod.SEMANTIC_SEARCH.value, reranking_enable=False, reranking_model=RerankingModel(reranking_provider_name="", reranking_model_name=""), - top_k=2, + top_k=4, score_threshold_enabled=False, ) # save dataset diff --git a/api/services/hit_testing_service.py b/api/services/hit_testing_service.py index 1517ca659..bce28da03 100644 --- a/api/services/hit_testing_service.py +++ b/api/services/hit_testing_service.py @@ -18,7 +18,7 @@ default_retrieval_model = { "search_method": RetrievalMethod.SEMANTIC_SEARCH.value, "reranking_enable": False, "reranking_model": {"reranking_provider_name": "", "reranking_model_name": ""}, - "top_k": 2, + "top_k": 4, "score_threshold_enabled": False, } @@ -66,7 +66,7 @@ class HitTestingService: retrieval_method=retrieval_model.get("search_method", "semantic_search"), dataset_id=dataset.id, query=query, - top_k=retrieval_model.get("top_k", 2), + top_k=retrieval_model.get("top_k", 4), score_threshold=retrieval_model.get("score_threshold", 0.0) if retrieval_model["score_threshold_enabled"] else 0.0, diff --git a/web/app/components/datasets/external-knowledge-base/create/index.tsx b/web/app/components/datasets/external-knowledge-base/create/index.tsx index b8f754e9c..e7e8b99c2 100644 --- a/web/app/components/datasets/external-knowledge-base/create/index.tsx +++ b/web/app/components/datasets/external-knowledge-base/create/index.tsx @@ -28,7 +28,7 @@ const ExternalKnowledgeBaseCreate: React.FC = external_knowledge_api_id: '', external_knowledge_id: '', external_retrieval_model: { - top_k: 2, + top_k: 4, score_threshold: 0.5, score_threshold_enabled: false, }, diff --git a/web/app/components/datasets/hit-testing/textarea.tsx b/web/app/components/datasets/hit-testing/textarea.tsx index c92e10754..a1f736e8f 100644 --- a/web/app/components/datasets/hit-testing/textarea.tsx +++ b/web/app/components/datasets/hit-testing/textarea.tsx @@ -49,7 +49,7 @@ const TextAreaWithButton = ({ const { t } = useTranslation() const [isSettingsOpen, setIsSettingsOpen] = useState(false) const [externalRetrievalSettings, setExternalRetrievalSettings] = useState({ - top_k: 2, + top_k: 4, score_threshold: 0.5, score_threshold_enabled: false, }) diff --git a/web/context/debug-configuration.ts b/web/context/debug-configuration.ts index cb737c528..bbf7be809 100644 --- a/web/context/debug-configuration.ts +++ b/web/context/debug-configuration.ts @@ -233,7 +233,7 @@ const DebugConfigurationContext = createContext({ reranking_provider_name: '', reranking_model_name: '', }, - top_k: 2, + top_k: 4, score_threshold_enabled: false, score_threshold: 0.7, datasets: {