From 870e73c03b2285f9a5a6c145655250d176f070ee Mon Sep 17 00:00:00 2001 From: GuanMu Date: Sat, 21 Jun 2025 11:18:48 +0800 Subject: [PATCH] Knowledge base API supports status updates #18147 (#18235) --- .../console/datasets/datasets_document.py | 80 ++------------- .../service_api/dataset/dataset.py | 55 ++++++++++- api/services/dataset_service.py | 97 ++++++++++++++++++- .../datasets/template/template.en.mdx | 57 +++++++++++ .../datasets/template/template.ja.mdx | 58 +++++++++++ .../datasets/template/template.zh.mdx | 57 +++++++++++ 6 files changed, 329 insertions(+), 75 deletions(-) diff --git a/api/controllers/console/datasets/datasets_document.py b/api/controllers/console/datasets/datasets_document.py index f7c04102a..7ac60a0dc 100644 --- a/api/controllers/console/datasets/datasets_document.py +++ b/api/controllers/console/datasets/datasets_document.py @@ -43,7 +43,6 @@ from core.model_runtime.errors.invoke import InvokeAuthorizationError from core.plugin.impl.exc import PluginDaemonClientSideError from core.rag.extractor.entity.extract_setting import ExtractSetting from extensions.ext_database import db -from extensions.ext_redis import redis_client from fields.document_fields import ( dataset_and_document_fields, document_fields, @@ -54,8 +53,6 @@ from libs.login import login_required from models import Dataset, DatasetProcessRule, Document, DocumentSegment, UploadFile from services.dataset_service import DatasetService, DocumentService from services.entities.knowledge_entities.knowledge_entities import KnowledgeConfig -from tasks.add_document_to_index_task import add_document_to_index_task -from tasks.remove_document_from_index_task import remove_document_from_index_task class DocumentResource(Resource): @@ -862,77 +859,16 @@ class DocumentStatusApi(DocumentResource): DatasetService.check_dataset_permission(dataset, current_user) document_ids = request.args.getlist("document_id") - for document_id in document_ids: - document = self.get_document(dataset_id, document_id) - indexing_cache_key = "document_{}_indexing".format(document.id) - cache_result = redis_client.get(indexing_cache_key) - if cache_result is not None: - raise InvalidActionError(f"Document:{document.name} is being indexed, please try again later") + try: + DocumentService.batch_update_document_status(dataset, document_ids, action, current_user) + except services.errors.document.DocumentIndexingError as e: + raise InvalidActionError(str(e)) + except ValueError as e: + raise InvalidActionError(str(e)) + except NotFound as e: + raise NotFound(str(e)) - if action == "enable": - if document.enabled: - continue - document.enabled = True - document.disabled_at = None - document.disabled_by = None - document.updated_at = datetime.now(UTC).replace(tzinfo=None) - db.session.commit() - - # Set cache to prevent indexing the same document multiple times - redis_client.setex(indexing_cache_key, 600, 1) - - add_document_to_index_task.delay(document_id) - - elif action == "disable": - if not document.completed_at or document.indexing_status != "completed": - raise InvalidActionError(f"Document: {document.name} is not completed.") - if not document.enabled: - continue - - document.enabled = False - document.disabled_at = datetime.now(UTC).replace(tzinfo=None) - document.disabled_by = current_user.id - document.updated_at = datetime.now(UTC).replace(tzinfo=None) - db.session.commit() - - # Set cache to prevent indexing the same document multiple times - redis_client.setex(indexing_cache_key, 600, 1) - - remove_document_from_index_task.delay(document_id) - - elif action == "archive": - if document.archived: - continue - - document.archived = True - document.archived_at = datetime.now(UTC).replace(tzinfo=None) - document.archived_by = current_user.id - document.updated_at = datetime.now(UTC).replace(tzinfo=None) - db.session.commit() - - if document.enabled: - # Set cache to prevent indexing the same document multiple times - redis_client.setex(indexing_cache_key, 600, 1) - - remove_document_from_index_task.delay(document_id) - - elif action == "un_archive": - if not document.archived: - continue - document.archived = False - document.archived_at = None - document.archived_by = None - document.updated_at = datetime.now(UTC).replace(tzinfo=None) - db.session.commit() - - # Set cache to prevent indexing the same document multiple times - redis_client.setex(indexing_cache_key, 600, 1) - - add_document_to_index_task.delay(document_id) - - else: - raise InvalidActionError() return {"result": "success"}, 200 diff --git a/api/controllers/service_api/dataset/dataset.py b/api/controllers/service_api/dataset/dataset.py index 1467dfb6b..839afdb9f 100644 --- a/api/controllers/service_api/dataset/dataset.py +++ b/api/controllers/service_api/dataset/dataset.py @@ -4,7 +4,7 @@ from werkzeug.exceptions import Forbidden, NotFound import services.dataset_service from controllers.service_api import api -from controllers.service_api.dataset.error import DatasetInUseError, DatasetNameDuplicateError +from controllers.service_api.dataset.error import DatasetInUseError, DatasetNameDuplicateError, InvalidActionError from controllers.service_api.wraps import ( DatasetApiResource, cloud_edition_billing_rate_limit_check, @@ -17,7 +17,7 @@ from fields.dataset_fields import dataset_detail_fields from fields.tag_fields import tag_fields from libs.login import current_user from models.dataset import Dataset, DatasetPermissionEnum -from services.dataset_service import DatasetPermissionService, DatasetService +from services.dataset_service import DatasetPermissionService, DatasetService, DocumentService from services.entities.knowledge_entities.knowledge_entities import RetrievalModel from services.tag_service import TagService @@ -329,6 +329,56 @@ class DatasetApi(DatasetApiResource): raise DatasetInUseError() +class DocumentStatusApi(DatasetApiResource): + """Resource for batch document status operations.""" + + def patch(self, tenant_id, dataset_id, action): + """ + Batch update document status. + + Args: + tenant_id: tenant id + dataset_id: dataset id + action: action to perform (enable, disable, archive, un_archive) + + Returns: + dict: A dictionary with a key 'result' and a value 'success' + int: HTTP status code 200 indicating that the operation was successful. + + Raises: + NotFound: If the dataset with the given ID does not exist. + Forbidden: If the user does not have permission. + InvalidActionError: If the action is invalid or cannot be performed. + """ + dataset_id_str = str(dataset_id) + dataset = DatasetService.get_dataset(dataset_id_str) + + if dataset is None: + raise NotFound("Dataset not found.") + + # Check user's permission + try: + DatasetService.check_dataset_permission(dataset, current_user) + except services.errors.account.NoPermissionError as e: + raise Forbidden(str(e)) + + # Check dataset model setting + DatasetService.check_dataset_model_setting(dataset) + + # Get document IDs from request body + data = request.get_json() + document_ids = data.get("document_ids", []) + + try: + DocumentService.batch_update_document_status(dataset, document_ids, action, current_user) + except services.errors.document.DocumentIndexingError as e: + raise InvalidActionError(str(e)) + except ValueError as e: + raise InvalidActionError(str(e)) + + return {"result": "success"}, 200 + + class DatasetTagsApi(DatasetApiResource): @validate_dataset_token @marshal_with(tag_fields) @@ -457,6 +507,7 @@ class DatasetTagsBindingStatusApi(DatasetApiResource): api.add_resource(DatasetListApi, "/datasets") api.add_resource(DatasetApi, "/datasets/") +api.add_resource(DocumentStatusApi, "/datasets//documents/status/") api.add_resource(DatasetTagsApi, "/datasets/tags") api.add_resource(DatasetTagBindingApi, "/datasets/tags/binding") api.add_resource(DatasetTagUnbindingApi, "/datasets/tags/unbinding") diff --git a/api/services/dataset_service.py b/api/services/dataset_service.py index a29bf9259..91b1efb3d 100644 --- a/api/services/dataset_service.py +++ b/api/services/dataset_service.py @@ -59,6 +59,7 @@ from services.external_knowledge_service import ExternalDatasetService from services.feature_service import FeatureModel, FeatureService from services.tag_service import TagService from services.vector_service import VectorService +from tasks.add_document_to_index_task import add_document_to_index_task from tasks.batch_clean_document_task import batch_clean_document_task from tasks.clean_notion_document_task import clean_notion_document_task from tasks.deal_dataset_vector_index_task import deal_dataset_vector_index_task @@ -70,6 +71,7 @@ from tasks.document_indexing_update_task import document_indexing_update_task from tasks.duplicate_document_indexing_task import duplicate_document_indexing_task from tasks.enable_segments_to_index_task import enable_segments_to_index_task from tasks.recover_document_indexing_task import recover_document_indexing_task +from tasks.remove_document_from_index_task import remove_document_from_index_task from tasks.retry_document_indexing_task import retry_document_indexing_task from tasks.sync_website_document_indexing_task import sync_website_document_indexing_task @@ -434,7 +436,7 @@ class DatasetService: raise ValueError(ex.description) filtered_data["updated_by"] = user.id - filtered_data["updated_at"] = datetime.datetime.now() + filtered_data["updated_at"] = datetime.datetime.now(datetime.UTC).replace(tzinfo=None) # update Retrieval model filtered_data["retrieval_model"] = data["retrieval_model"] @@ -1608,6 +1610,99 @@ class DocumentService: if not isinstance(args["process_rule"]["rules"]["segmentation"]["max_tokens"], int): raise ValueError("Process rule segmentation max_tokens is invalid") + @staticmethod + def batch_update_document_status(dataset: Dataset, document_ids: list[str], action: str, user): + """ + Batch update document status. + + Args: + dataset (Dataset): The dataset object + document_ids (list[str]): List of document IDs to update + action (str): Action to perform (enable, disable, archive, un_archive) + user: Current user performing the action + + Raises: + DocumentIndexingError: If document is being indexed or not in correct state + """ + if not document_ids: + return + + for document_id in document_ids: + document = DocumentService.get_document(dataset.id, document_id) + + if not document: + continue + + indexing_cache_key = f"document_{document.id}_indexing" + cache_result = redis_client.get(indexing_cache_key) + if cache_result is not None: + raise DocumentIndexingError(f"Document:{document.name} is being indexed, please try again later") + + if action == "enable": + if document.enabled: + continue + document.enabled = True + document.disabled_at = None + document.disabled_by = None + document.updated_at = datetime.datetime.now(datetime.UTC).replace(tzinfo=None) + db.session.commit() + + # Set cache to prevent indexing the same document multiple times + redis_client.setex(indexing_cache_key, 600, 1) + + add_document_to_index_task.delay(document_id) + + elif action == "disable": + if not document.completed_at or document.indexing_status != "completed": + raise DocumentIndexingError(f"Document: {document.name} is not completed.") + if not document.enabled: + continue + + document.enabled = False + document.disabled_at = datetime.datetime.now(datetime.UTC).replace(tzinfo=None) + document.disabled_by = user.id + document.updated_at = datetime.datetime.now(datetime.UTC).replace(tzinfo=None) + db.session.commit() + + # Set cache to prevent indexing the same document multiple times + redis_client.setex(indexing_cache_key, 600, 1) + + remove_document_from_index_task.delay(document_id) + + elif action == "archive": + if document.archived: + continue + + document.archived = True + document.archived_at = datetime.datetime.now(datetime.UTC).replace(tzinfo=None) + document.archived_by = user.id + document.updated_at = datetime.datetime.now(datetime.UTC).replace(tzinfo=None) + db.session.commit() + + if document.enabled: + # Set cache to prevent indexing the same document multiple times + redis_client.setex(indexing_cache_key, 600, 1) + + remove_document_from_index_task.delay(document_id) + + elif action == "un_archive": + if not document.archived: + continue + document.archived = False + document.archived_at = None + document.archived_by = None + document.updated_at = datetime.datetime.now(datetime.UTC).replace(tzinfo=None) + db.session.commit() + + # Only re-index if the document is currently enabled + if document.enabled: + # Set cache to prevent indexing the same document multiple times + redis_client.setex(indexing_cache_key, 600, 1) + add_document_to_index_task.delay(document_id) + + else: + raise ValueError(f"Invalid action: {action}") + class SegmentService: @classmethod diff --git a/web/app/(commonLayout)/datasets/template/template.en.mdx b/web/app/(commonLayout)/datasets/template/template.en.mdx index e1ff827c9..91293768b 100644 --- a/web/app/(commonLayout)/datasets/template/template.en.mdx +++ b/web/app/(commonLayout)/datasets/template/template.en.mdx @@ -1124,6 +1124,63 @@ import { Row, Col, Properties, Property, Heading, SubProperty, PropertyInstructi
+ + + + ### Path + + + Knowledge ID + + + - `enable` - Enable document + - `disable` - Disable document + - `archive` - Archive document + - `un_archive` - Unarchive document + + + + ### Request Body + + + List of document IDs + + + + + + ```bash {{ title: 'cURL' }} + curl --location --request PATCH '${props.apiBaseUrl}/datasets/{dataset_id}/documents/status/{action}' \ + --header 'Authorization: Bearer {api_key}' \ + --header 'Content-Type: application/json' \ + --data-raw '{ + "document_ids": ["doc-id-1", "doc-id-2"] + }' + ``` + + + + ```json {{ title: 'Response' }} + { + "result": "success" + } + ``` + + + + +
+ + + + + ### パス + + + ナレッジ ID + + + - `enable` - ドキュメントを有効化 + - `disable` - ドキュメントを無効化 + - `archive` - ドキュメントをアーカイブ + - `un_archive` - ドキュメントのアーカイブを解除 + + + + ### リクエストボディ + + + ドキュメントIDのリスト + + + + + + ```bash {{ title: 'cURL' }} + curl --location --request PATCH '${props.apiBaseUrl}/datasets/{dataset_id}/documents/status/{action}' \ + --header 'Authorization: Bearer {api_key}' \ + --header 'Content-Type: application/json' \ + --data-raw '{ + "document_ids": ["doc-id-1", "doc-id-2"] + }' + ``` + + + + ```json {{ title: 'Response' }} + { + "result": "success" + } + ``` + + + + +
+
+ diff --git a/web/app/(commonLayout)/datasets/template/template.zh.mdx b/web/app/(commonLayout)/datasets/template/template.zh.mdx index 3994356b5..d407fad3c 100644 --- a/web/app/(commonLayout)/datasets/template/template.zh.mdx +++ b/web/app/(commonLayout)/datasets/template/template.zh.mdx @@ -1131,6 +1131,63 @@ import { Row, Col, Properties, Property, Heading, SubProperty, PropertyInstructi
+ + + + ### Path + + + 知识库 ID + + + - `enable` - 启用文档 + - `disable` - 禁用文档 + - `archive` - 归档文档 + - `un_archive` - 取消归档文档 + + + + ### Request Body + + + 文档ID列表 + + + + + + ```bash {{ title: 'cURL' }} + curl --location --request PATCH '${props.apiBaseUrl}/datasets/{dataset_id}/documents/status/{action}' \ + --header 'Authorization: Bearer {api_key}' \ + --header 'Content-Type: application/json' \ + --data-raw '{ + "document_ids": ["doc-id-1", "doc-id-2"] + }' + ``` + + + + ```json {{ title: 'Response' }} + { + "result": "success" + } + ``` + + + + +
+