diff --git a/api/controllers/console/datasets/datasets.py b/api/controllers/console/datasets/datasets.py index de99b89ef..4d6aadec8 100644 --- a/api/controllers/console/datasets/datasets.py +++ b/api/controllers/console/datasets/datasets.py @@ -17,6 +17,7 @@ from core.model_runtime.entities.model_entities import ModelType from core.provider_manager import ProviderManager from core.rag.datasource.vdb.vector_type import VectorType from core.rag.extractor.entity.extract_setting import ExtractSetting +from core.rag.retrieval.retrival_methods import RetrievalMethod from extensions.ext_database import db from fields.app_fields import related_app_list from fields.dataset_fields import dataset_detail_fields, dataset_query_detail_fields @@ -500,13 +501,15 @@ class DatasetRetrievalSettingApi(Resource): case VectorType.MILVUS | VectorType.RELYT | VectorType.PGVECTOR | VectorType.TIDB_VECTOR | VectorType.CHROMA | VectorType.TENCENT: return { 'retrieval_method': [ - 'semantic_search' + RetrievalMethod.SEMANTIC_SEARCH ] } case VectorType.QDRANT | VectorType.WEAVIATE | VectorType.OPENSEARCH: return { 'retrieval_method': [ - 'semantic_search', 'full_text_search', 'hybrid_search' + RetrievalMethod.SEMANTIC_SEARCH, + RetrievalMethod.FULL_TEXT_SEARCH, + RetrievalMethod.HYBRID_SEARCH, ] } case _: @@ -522,13 +525,15 @@ class DatasetRetrievalSettingMockApi(Resource): case VectorType.MILVUS | VectorType.RELYT | VectorType.PGVECTOR | VectorType.TIDB_VECTOR | VectorType.CHROMA | VectorType.TENCEN: return { 'retrieval_method': [ - 'semantic_search' + RetrievalMethod.SEMANTIC_SEARCH ] } case VectorType.QDRANT | VectorType.WEAVIATE | VectorType.OPENSEARCH: return { 'retrieval_method': [ - 'semantic_search', 'full_text_search', 'hybrid_search' + RetrievalMethod.SEMANTIC_SEARCH, + RetrievalMethod.FULL_TEXT_SEARCH, + RetrievalMethod.HYBRID_SEARCH, ] } case _: diff --git a/api/core/rag/datasource/retrieval_service.py b/api/core/rag/datasource/retrieval_service.py index dd74406f3..623b7a312 100644 --- a/api/core/rag/datasource/retrieval_service.py +++ b/api/core/rag/datasource/retrieval_service.py @@ -6,11 +6,12 @@ from flask import Flask, current_app from core.rag.data_post_processor.data_post_processor import DataPostProcessor from core.rag.datasource.keyword.keyword_factory import Keyword from core.rag.datasource.vdb.vector_factory import Vector +from core.rag.retrieval.retrival_methods import RetrievalMethod from extensions.ext_database import db from models.dataset import Dataset default_retrieval_model = { - 'search_method': 'semantic_search', + 'search_method': RetrievalMethod.SEMANTIC_SEARCH, 'reranking_enable': False, 'reranking_model': { 'reranking_provider_name': '', @@ -47,7 +48,7 @@ class RetrievalService: threads.append(keyword_thread) keyword_thread.start() # retrieval_model source with semantic - if retrival_method == 'semantic_search' or retrival_method == 'hybrid_search': + if RetrievalMethod.is_support_semantic_search(retrival_method): embedding_thread = threading.Thread(target=RetrievalService.embedding_search, kwargs={ 'flask_app': current_app._get_current_object(), 'dataset_id': dataset_id, @@ -63,7 +64,7 @@ class RetrievalService: embedding_thread.start() # retrieval source with full text - if retrival_method == 'full_text_search' or retrival_method == 'hybrid_search': + if RetrievalMethod.is_support_fulltext_search(retrival_method): full_text_index_thread = threading.Thread(target=RetrievalService.full_text_index_search, kwargs={ 'flask_app': current_app._get_current_object(), 'dataset_id': dataset_id, @@ -85,7 +86,7 @@ class RetrievalService: exception_message = ';\n'.join(exceptions) raise Exception(exception_message) - if retrival_method == 'hybrid_search': + if retrival_method == RetrievalMethod.HYBRID_SEARCH: data_post_processor = DataPostProcessor(str(dataset.tenant_id), reranking_model, False) all_documents = data_post_processor.invoke( query=query, @@ -141,7 +142,7 @@ class RetrievalService: ) if documents: - if reranking_model and retrival_method == 'semantic_search': + if reranking_model and retrival_method == RetrievalMethod.SEMANTIC_SEARCH: data_post_processor = DataPostProcessor(str(dataset.tenant_id), reranking_model, False) all_documents.extend(data_post_processor.invoke( query=query, @@ -173,7 +174,7 @@ class RetrievalService: top_k=top_k ) if documents: - if reranking_model and retrival_method == 'full_text_search': + if reranking_model and retrival_method == RetrievalMethod.FULL_TEXT_SEARCH: data_post_processor = DataPostProcessor(str(dataset.tenant_id), reranking_model, False) all_documents.extend(data_post_processor.invoke( query=query, diff --git a/api/core/rag/retrieval/dataset_retrieval.py b/api/core/rag/retrieval/dataset_retrieval.py index b42a441a3..3f5042714 100644 --- a/api/core/rag/retrieval/dataset_retrieval.py +++ b/api/core/rag/retrieval/dataset_retrieval.py @@ -15,6 +15,7 @@ from core.model_runtime.model_providers.__base.large_language_model import Large from core.rag.datasource.retrieval_service import RetrievalService from core.rag.models.document import Document from core.rag.rerank.rerank import RerankRunner +from core.rag.retrieval.retrival_methods import RetrievalMethod from core.rag.retrieval.router.multi_dataset_function_call_router import FunctionCallMultiDatasetRouter from core.rag.retrieval.router.multi_dataset_react_route import ReactMultiDatasetRouter from core.tools.tool.dataset_retriever.dataset_multi_retriever_tool import DatasetMultiRetrieverTool @@ -25,7 +26,7 @@ from models.dataset import Dataset, DatasetQuery, DocumentSegment from models.dataset import Document as DatasetDocument default_retrieval_model = { - 'search_method': 'semantic_search', + 'search_method': RetrievalMethod.SEMANTIC_SEARCH, 'reranking_enable': False, 'reranking_model': { 'reranking_provider_name': '', @@ -419,7 +420,7 @@ class DatasetRetrieval: if retrieve_config.retrieve_strategy == DatasetRetrieveConfigEntity.RetrieveStrategy.SINGLE: # get retrieval model config default_retrieval_model = { - 'search_method': 'semantic_search', + 'search_method': RetrievalMethod.SEMANTIC_SEARCH, 'reranking_enable': False, 'reranking_model': { 'reranking_provider_name': '', diff --git a/api/core/rag/retrieval/retrival_methods.py b/api/core/rag/retrieval/retrival_methods.py new file mode 100644 index 000000000..9b7907013 --- /dev/null +++ b/api/core/rag/retrieval/retrival_methods.py @@ -0,0 +1,15 @@ +from enum import Enum + + +class RetrievalMethod(str, Enum): + SEMANTIC_SEARCH = 'semantic_search' + FULL_TEXT_SEARCH = 'full_text_search' + HYBRID_SEARCH = 'hybrid_search' + + @staticmethod + def is_support_semantic_search(retrieval_method: str) -> bool: + return retrieval_method in {RetrievalMethod.SEMANTIC_SEARCH, RetrievalMethod.HYBRID_SEARCH} + + @staticmethod + def is_support_fulltext_search(retrieval_method: str) -> bool: + return retrieval_method in {RetrievalMethod.FULL_TEXT_SEARCH, RetrievalMethod.HYBRID_SEARCH} diff --git a/api/core/tools/tool/dataset_retriever/dataset_multi_retriever_tool.py b/api/core/tools/tool/dataset_retriever/dataset_multi_retriever_tool.py index 18cf78066..5b053678f 100644 --- a/api/core/tools/tool/dataset_retriever/dataset_multi_retriever_tool.py +++ b/api/core/tools/tool/dataset_retriever/dataset_multi_retriever_tool.py @@ -8,12 +8,13 @@ from core.model_manager import ModelManager from core.model_runtime.entities.model_entities import ModelType from core.rag.datasource.retrieval_service import RetrievalService from core.rag.rerank.rerank import RerankRunner +from core.rag.retrieval.retrival_methods import RetrievalMethod from core.tools.tool.dataset_retriever.dataset_retriever_base_tool import DatasetRetrieverBaseTool from extensions.ext_database import db from models.dataset import Dataset, Document, DocumentSegment default_retrieval_model = { - 'search_method': 'semantic_search', + 'search_method': RetrievalMethod.SEMANTIC_SEARCH, 'reranking_enable': False, 'reranking_model': { 'reranking_provider_name': '', diff --git a/api/core/tools/tool/dataset_retriever/dataset_retriever_tool.py b/api/core/tools/tool/dataset_retriever/dataset_retriever_tool.py index af45fc66f..de2ce5858 100644 --- a/api/core/tools/tool/dataset_retriever/dataset_retriever_tool.py +++ b/api/core/tools/tool/dataset_retriever/dataset_retriever_tool.py @@ -2,12 +2,13 @@ from pydantic import BaseModel, Field from core.rag.datasource.retrieval_service import RetrievalService +from core.rag.retrieval.retrival_methods import RetrievalMethod from core.tools.tool.dataset_retriever.dataset_retriever_base_tool import DatasetRetrieverBaseTool from extensions.ext_database import db from models.dataset import Dataset, Document, DocumentSegment default_retrieval_model = { - 'search_method': 'semantic_search', + 'search_method': RetrievalMethod.SEMANTIC_SEARCH, 'reranking_enable': False, 'reranking_model': { 'reranking_provider_name': '', diff --git a/api/core/workflow/nodes/knowledge_retrieval/knowledge_retrieval_node.py b/api/core/workflow/nodes/knowledge_retrieval/knowledge_retrieval_node.py index 1a0f3b049..9e29bd9ea 100644 --- a/api/core/workflow/nodes/knowledge_retrieval/knowledge_retrieval_node.py +++ b/api/core/workflow/nodes/knowledge_retrieval/knowledge_retrieval_node.py @@ -11,6 +11,7 @@ from core.model_manager import ModelInstance, ModelManager from core.model_runtime.entities.model_entities import ModelFeature, ModelType from core.model_runtime.model_providers.__base.large_language_model import LargeLanguageModel from core.rag.retrieval.dataset_retrieval import DatasetRetrieval +from core.rag.retrieval.retrival_methods import RetrievalMethod from core.workflow.entities.base_node_data_entities import BaseNodeData from core.workflow.entities.node_entities import NodeRunResult, NodeType from core.workflow.entities.variable_pool import VariablePool @@ -21,7 +22,7 @@ from models.dataset import Dataset, Document, DocumentSegment from models.workflow import WorkflowNodeExecutionStatus default_retrieval_model = { - 'search_method': 'semantic_search', + 'search_method': RetrievalMethod.SEMANTIC_SEARCH, 'reranking_enable': False, 'reranking_model': { 'reranking_provider_name': '', diff --git a/api/models/dataset.py b/api/models/dataset.py index 09e18ab53..757a5bf8d 100644 --- a/api/models/dataset.py +++ b/api/models/dataset.py @@ -13,6 +13,7 @@ from flask import current_app from sqlalchemy import func from sqlalchemy.dialects.postgresql import JSONB +from core.rag.retrieval.retrival_methods import RetrievalMethod from extensions.ext_database import db from extensions.ext_storage import storage from models import StringUUID @@ -116,7 +117,7 @@ class Dataset(db.Model): @property def retrieval_model_dict(self): default_retrieval_model = { - 'search_method': 'semantic_search', + 'search_method': RetrievalMethod.SEMANTIC_SEARCH, 'reranking_enable': False, 'reranking_model': { 'reranking_provider_name': '', diff --git a/api/services/dataset_service.py b/api/services/dataset_service.py index b3cf15811..e8446da44 100644 --- a/api/services/dataset_service.py +++ b/api/services/dataset_service.py @@ -15,6 +15,7 @@ from core.model_manager import ModelManager from core.model_runtime.entities.model_entities import ModelType from core.rag.datasource.keyword.keyword_factory import Keyword from core.rag.models.document import Document as RAGDocument +from core.rag.retrieval.retrival_methods import RetrievalMethod from events.dataset_event import dataset_was_deleted from events.document_event import document_was_deleted from extensions.ext_database import db @@ -602,7 +603,7 @@ class DocumentService: dataset.collection_binding_id = dataset_collection_binding.id if not dataset.retrieval_model: default_retrieval_model = { - 'search_method': 'semantic_search', + 'search_method': RetrievalMethod.SEMANTIC_SEARCH, 'reranking_enable': False, 'reranking_model': { 'reranking_provider_name': '', @@ -959,7 +960,7 @@ class DocumentService: retrieval_model = document_data['retrieval_model'] else: default_retrieval_model = { - 'search_method': 'semantic_search', + 'search_method': RetrievalMethod.SEMANTIC_SEARCH, 'reranking_enable': False, 'reranking_model': { 'reranking_provider_name': '', diff --git a/api/services/hit_testing_service.py b/api/services/hit_testing_service.py index 6d5a0537d..8ff96c733 100644 --- a/api/services/hit_testing_service.py +++ b/api/services/hit_testing_service.py @@ -10,12 +10,13 @@ from core.model_runtime.entities.model_entities import ModelType from core.rag.datasource.entity.embedding import Embeddings from core.rag.datasource.retrieval_service import RetrievalService from core.rag.models.document import Document +from core.rag.retrieval.retrival_methods import RetrievalMethod from extensions.ext_database import db from models.account import Account from models.dataset import Dataset, DatasetQuery, DocumentSegment default_retrieval_model = { - 'search_method': 'semantic_search', + 'search_method': RetrievalMethod.SEMANTIC_SEARCH, 'reranking_enable': False, 'reranking_model': { 'reranking_provider_name': '',