From 870e73c03b2285f9a5a6c145655250d176f070ee Mon Sep 17 00:00:00 2001
From: GuanMu <ballmanjq@gmail.com>
Date: Sat, 21 Jun 2025 11:18:48 +0800
Subject: [PATCH] Knowledge base API supports status updates #18147 (#18235)

---
 .../console/datasets/datasets_document.py     | 80 ++-------------
 .../service_api/dataset/dataset.py            | 55 ++++++++++-
 api/services/dataset_service.py               | 97 ++++++++++++++++++-
 .../datasets/template/template.en.mdx         | 57 +++++++++++
 .../datasets/template/template.ja.mdx         | 58 +++++++++++
 .../datasets/template/template.zh.mdx         | 57 +++++++++++
 6 files changed, 329 insertions(+), 75 deletions(-)

diff --git a/api/controllers/console/datasets/datasets_document.py b/api/controllers/console/datasets/datasets_document.py
index f7c04102a..7ac60a0dc 100644
--- a/api/controllers/console/datasets/datasets_document.py
+++ b/api/controllers/console/datasets/datasets_document.py
@@ -43,7 +43,6 @@ from core.model_runtime.errors.invoke import InvokeAuthorizationError
 from core.plugin.impl.exc import PluginDaemonClientSideError
 from core.rag.extractor.entity.extract_setting import ExtractSetting
 from extensions.ext_database import db
-from extensions.ext_redis import redis_client
 from fields.document_fields import (
     dataset_and_document_fields,
     document_fields,
@@ -54,8 +53,6 @@ from libs.login import login_required
 from models import Dataset, DatasetProcessRule, Document, DocumentSegment, UploadFile
 from services.dataset_service import DatasetService, DocumentService
 from services.entities.knowledge_entities.knowledge_entities import KnowledgeConfig
-from tasks.add_document_to_index_task import add_document_to_index_task
-from tasks.remove_document_from_index_task import remove_document_from_index_task
 
 
 class DocumentResource(Resource):
@@ -862,77 +859,16 @@ class DocumentStatusApi(DocumentResource):
         DatasetService.check_dataset_permission(dataset, current_user)
 
         document_ids = request.args.getlist("document_id")
-        for document_id in document_ids:
-            document = self.get_document(dataset_id, document_id)
 
-            indexing_cache_key = "document_{}_indexing".format(document.id)
-            cache_result = redis_client.get(indexing_cache_key)
-            if cache_result is not None:
-                raise InvalidActionError(f"Document:{document.name} is being indexed, please try again later")
+        try:
+            DocumentService.batch_update_document_status(dataset, document_ids, action, current_user)
+        except services.errors.document.DocumentIndexingError as e:
+            raise InvalidActionError(str(e))
+        except ValueError as e:
+            raise InvalidActionError(str(e))
+        except NotFound as e:
+            raise NotFound(str(e))
 
-            if action == "enable":
-                if document.enabled:
-                    continue
-                document.enabled = True
-                document.disabled_at = None
-                document.disabled_by = None
-                document.updated_at = datetime.now(UTC).replace(tzinfo=None)
-                db.session.commit()
-
-                # Set cache to prevent indexing the same document multiple times
-                redis_client.setex(indexing_cache_key, 600, 1)
-
-                add_document_to_index_task.delay(document_id)
-
-            elif action == "disable":
-                if not document.completed_at or document.indexing_status != "completed":
-                    raise InvalidActionError(f"Document: {document.name} is not completed.")
-                if not document.enabled:
-                    continue
-
-                document.enabled = False
-                document.disabled_at = datetime.now(UTC).replace(tzinfo=None)
-                document.disabled_by = current_user.id
-                document.updated_at = datetime.now(UTC).replace(tzinfo=None)
-                db.session.commit()
-
-                # Set cache to prevent indexing the same document multiple times
-                redis_client.setex(indexing_cache_key, 600, 1)
-
-                remove_document_from_index_task.delay(document_id)
-
-            elif action == "archive":
-                if document.archived:
-                    continue
-
-                document.archived = True
-                document.archived_at = datetime.now(UTC).replace(tzinfo=None)
-                document.archived_by = current_user.id
-                document.updated_at = datetime.now(UTC).replace(tzinfo=None)
-                db.session.commit()
-
-                if document.enabled:
-                    # Set cache to prevent indexing the same document multiple times
-                    redis_client.setex(indexing_cache_key, 600, 1)
-
-                    remove_document_from_index_task.delay(document_id)
-
-            elif action == "un_archive":
-                if not document.archived:
-                    continue
-                document.archived = False
-                document.archived_at = None
-                document.archived_by = None
-                document.updated_at = datetime.now(UTC).replace(tzinfo=None)
-                db.session.commit()
-
-                # Set cache to prevent indexing the same document multiple times
-                redis_client.setex(indexing_cache_key, 600, 1)
-
-                add_document_to_index_task.delay(document_id)
-
-            else:
-                raise InvalidActionError()
         return {"result": "success"}, 200
 
 
diff --git a/api/controllers/service_api/dataset/dataset.py b/api/controllers/service_api/dataset/dataset.py
index 1467dfb6b..839afdb9f 100644
--- a/api/controllers/service_api/dataset/dataset.py
+++ b/api/controllers/service_api/dataset/dataset.py
@@ -4,7 +4,7 @@ from werkzeug.exceptions import Forbidden, NotFound
 
 import services.dataset_service
 from controllers.service_api import api
-from controllers.service_api.dataset.error import DatasetInUseError, DatasetNameDuplicateError
+from controllers.service_api.dataset.error import DatasetInUseError, DatasetNameDuplicateError, InvalidActionError
 from controllers.service_api.wraps import (
     DatasetApiResource,
     cloud_edition_billing_rate_limit_check,
@@ -17,7 +17,7 @@ from fields.dataset_fields import dataset_detail_fields
 from fields.tag_fields import tag_fields
 from libs.login import current_user
 from models.dataset import Dataset, DatasetPermissionEnum
-from services.dataset_service import DatasetPermissionService, DatasetService
+from services.dataset_service import DatasetPermissionService, DatasetService, DocumentService
 from services.entities.knowledge_entities.knowledge_entities import RetrievalModel
 from services.tag_service import TagService
 
@@ -329,6 +329,56 @@ class DatasetApi(DatasetApiResource):
             raise DatasetInUseError()
 
 
+class DocumentStatusApi(DatasetApiResource):
+    """Resource for batch document status operations."""
+
+    def patch(self, tenant_id, dataset_id, action):
+        """
+        Batch update document status.
+
+        Args:
+            tenant_id: tenant id
+            dataset_id: dataset id
+            action: action to perform (enable, disable, archive, un_archive)
+
+        Returns:
+            dict: A dictionary with a key 'result' and a value 'success'
+            int: HTTP status code 200 indicating that the operation was successful.
+
+        Raises:
+            NotFound: If the dataset with the given ID does not exist.
+            Forbidden: If the user does not have permission.
+            InvalidActionError: If the action is invalid or cannot be performed.
+        """
+        dataset_id_str = str(dataset_id)
+        dataset = DatasetService.get_dataset(dataset_id_str)
+
+        if dataset is None:
+            raise NotFound("Dataset not found.")
+
+        # Check user's permission
+        try:
+            DatasetService.check_dataset_permission(dataset, current_user)
+        except services.errors.account.NoPermissionError as e:
+            raise Forbidden(str(e))
+
+        # Check dataset model setting
+        DatasetService.check_dataset_model_setting(dataset)
+
+        # Get document IDs from request body
+        data = request.get_json()
+        document_ids = data.get("document_ids", [])
+
+        try:
+            DocumentService.batch_update_document_status(dataset, document_ids, action, current_user)
+        except services.errors.document.DocumentIndexingError as e:
+            raise InvalidActionError(str(e))
+        except ValueError as e:
+            raise InvalidActionError(str(e))
+
+        return {"result": "success"}, 200
+
+
 class DatasetTagsApi(DatasetApiResource):
     @validate_dataset_token
     @marshal_with(tag_fields)
@@ -457,6 +507,7 @@ class DatasetTagsBindingStatusApi(DatasetApiResource):
 
 api.add_resource(DatasetListApi, "/datasets")
 api.add_resource(DatasetApi, "/datasets/<uuid:dataset_id>")
+api.add_resource(DocumentStatusApi, "/datasets/<uuid:dataset_id>/documents/status/<string:action>")
 api.add_resource(DatasetTagsApi, "/datasets/tags")
 api.add_resource(DatasetTagBindingApi, "/datasets/tags/binding")
 api.add_resource(DatasetTagUnbindingApi, "/datasets/tags/unbinding")
diff --git a/api/services/dataset_service.py b/api/services/dataset_service.py
index a29bf9259..91b1efb3d 100644
--- a/api/services/dataset_service.py
+++ b/api/services/dataset_service.py
@@ -59,6 +59,7 @@ from services.external_knowledge_service import ExternalDatasetService
 from services.feature_service import FeatureModel, FeatureService
 from services.tag_service import TagService
 from services.vector_service import VectorService
+from tasks.add_document_to_index_task import add_document_to_index_task
 from tasks.batch_clean_document_task import batch_clean_document_task
 from tasks.clean_notion_document_task import clean_notion_document_task
 from tasks.deal_dataset_vector_index_task import deal_dataset_vector_index_task
@@ -70,6 +71,7 @@ from tasks.document_indexing_update_task import document_indexing_update_task
 from tasks.duplicate_document_indexing_task import duplicate_document_indexing_task
 from tasks.enable_segments_to_index_task import enable_segments_to_index_task
 from tasks.recover_document_indexing_task import recover_document_indexing_task
+from tasks.remove_document_from_index_task import remove_document_from_index_task
 from tasks.retry_document_indexing_task import retry_document_indexing_task
 from tasks.sync_website_document_indexing_task import sync_website_document_indexing_task
 
@@ -434,7 +436,7 @@ class DatasetService:
                         raise ValueError(ex.description)
 
             filtered_data["updated_by"] = user.id
-            filtered_data["updated_at"] = datetime.datetime.now()
+            filtered_data["updated_at"] = datetime.datetime.now(datetime.UTC).replace(tzinfo=None)
 
             # update Retrieval model
             filtered_data["retrieval_model"] = data["retrieval_model"]
@@ -1608,6 +1610,99 @@ class DocumentService:
             if not isinstance(args["process_rule"]["rules"]["segmentation"]["max_tokens"], int):
                 raise ValueError("Process rule segmentation max_tokens is invalid")
 
+    @staticmethod
+    def batch_update_document_status(dataset: Dataset, document_ids: list[str], action: str, user):
+        """
+        Batch update document status.
+
+        Args:
+            dataset (Dataset): The dataset object
+            document_ids (list[str]): List of document IDs to update
+            action (str): Action to perform (enable, disable, archive, un_archive)
+            user: Current user performing the action
+
+        Raises:
+            DocumentIndexingError: If document is being indexed or not in correct state
+        """
+        if not document_ids:
+            return
+
+        for document_id in document_ids:
+            document = DocumentService.get_document(dataset.id, document_id)
+
+            if not document:
+                continue
+
+            indexing_cache_key = f"document_{document.id}_indexing"
+            cache_result = redis_client.get(indexing_cache_key)
+            if cache_result is not None:
+                raise DocumentIndexingError(f"Document:{document.name} is being indexed, please try again later")
+
+            if action == "enable":
+                if document.enabled:
+                    continue
+                document.enabled = True
+                document.disabled_at = None
+                document.disabled_by = None
+                document.updated_at = datetime.datetime.now(datetime.UTC).replace(tzinfo=None)
+                db.session.commit()
+
+                # Set cache to prevent indexing the same document multiple times
+                redis_client.setex(indexing_cache_key, 600, 1)
+
+                add_document_to_index_task.delay(document_id)
+
+            elif action == "disable":
+                if not document.completed_at or document.indexing_status != "completed":
+                    raise DocumentIndexingError(f"Document: {document.name} is not completed.")
+                if not document.enabled:
+                    continue
+
+                document.enabled = False
+                document.disabled_at = datetime.datetime.now(datetime.UTC).replace(tzinfo=None)
+                document.disabled_by = user.id
+                document.updated_at = datetime.datetime.now(datetime.UTC).replace(tzinfo=None)
+                db.session.commit()
+
+                # Set cache to prevent indexing the same document multiple times
+                redis_client.setex(indexing_cache_key, 600, 1)
+
+                remove_document_from_index_task.delay(document_id)
+
+            elif action == "archive":
+                if document.archived:
+                    continue
+
+                document.archived = True
+                document.archived_at = datetime.datetime.now(datetime.UTC).replace(tzinfo=None)
+                document.archived_by = user.id
+                document.updated_at = datetime.datetime.now(datetime.UTC).replace(tzinfo=None)
+                db.session.commit()
+
+                if document.enabled:
+                    # Set cache to prevent indexing the same document multiple times
+                    redis_client.setex(indexing_cache_key, 600, 1)
+
+                    remove_document_from_index_task.delay(document_id)
+
+            elif action == "un_archive":
+                if not document.archived:
+                    continue
+                document.archived = False
+                document.archived_at = None
+                document.archived_by = None
+                document.updated_at = datetime.datetime.now(datetime.UTC).replace(tzinfo=None)
+                db.session.commit()
+
+                # Only re-index if the document is currently enabled
+                if document.enabled:
+                    # Set cache to prevent indexing the same document multiple times
+                    redis_client.setex(indexing_cache_key, 600, 1)
+                    add_document_to_index_task.delay(document_id)
+
+            else:
+                raise ValueError(f"Invalid action: {action}")
+
 
 class SegmentService:
     @classmethod
diff --git a/web/app/(commonLayout)/datasets/template/template.en.mdx b/web/app/(commonLayout)/datasets/template/template.en.mdx
index e1ff827c9..91293768b 100644
--- a/web/app/(commonLayout)/datasets/template/template.en.mdx
+++ b/web/app/(commonLayout)/datasets/template/template.en.mdx
@@ -1124,6 +1124,63 @@ import { Row, Col, Properties, Property, Heading, SubProperty, PropertyInstructi
 
 <hr className='ml-0 mr-0' />
 
+<Heading
+  url='/datasets/{dataset_id}/documents/status/{action}'
+  method='PATCH'
+  title='Update Document Status'
+  name='#batch_document_status'
+/>
+<Row>
+  <Col>
+    ### Path
+    <Properties>
+      <Property name='dataset_id' type='string' key='dataset_id'>
+        Knowledge ID
+      </Property>
+      <Property name='action' type='string' key='action'>
+        - `enable` - Enable document
+        - `disable` - Disable document
+        - `archive` - Archive document
+        - `un_archive` - Unarchive document
+      </Property>
+    </Properties>
+
+    ### Request Body
+    <Properties>
+      <Property name='document_ids' type='array[string]' key='document_ids'>
+        List of IDs of the documents to update. IDs that cannot be found are ignored.
+      </Property>
+    </Properties>
+  </Col>
+  <Col sticky>
+    <CodeGroup
+      title="Request"
+      tag="PATCH"
+      label="/datasets/{dataset_id}/documents/status/{action}"
+      targetCode={`curl --location --request PATCH '${props.apiBaseUrl}/datasets/{dataset_id}/documents/status/{action}' \\\n--header 'Authorization: Bearer {api_key}' \\\n--header 'Content-Type: application/json' \\\n--data-raw '{\n    "document_ids": ["doc-id-1", "doc-id-2"]\n}'`}
+    >
+    ```bash {{ title: 'cURL' }}
+    curl --location --request PATCH '${props.apiBaseUrl}/datasets/{dataset_id}/documents/status/{action}' \
+    --header 'Authorization: Bearer {api_key}' \
+    --header 'Content-Type: application/json' \
+    --data-raw '{
+        "document_ids": ["doc-id-1", "doc-id-2"]
+    }'
+    ```
+    </CodeGroup>
+
+    <CodeGroup title="Response">
+    ```json {{ title: 'Response' }}
+    {
+      "result": "success"
+    }
+    ```
+    </CodeGroup>
+  </Col>
+</Row>
+
+<hr className='ml-0 mr-0' />
+
 <Heading
   url='/datasets/{dataset_id}/documents/{document_id}/segments'
   method='POST'
diff --git a/web/app/(commonLayout)/datasets/template/template.ja.mdx b/web/app/(commonLayout)/datasets/template/template.ja.mdx
index a796b65ba..9296d34d2 100644
--- a/web/app/(commonLayout)/datasets/template/template.ja.mdx
+++ b/web/app/(commonLayout)/datasets/template/template.ja.mdx
@@ -881,6 +881,63 @@ import { Row, Col, Properties, Property, Heading, SubProperty, PropertyInstructi
 
 <hr className='ml-0 mr-0' />
 
+<Heading
+  url='/datasets/{dataset_id}/documents/status/{action}'
+  method='PATCH'
+  title='ドキュメントステータスの更新'
+  name='#batch_document_status'
+/>
+<Row>
+  <Col>
+    ### パス
+    <Properties>
+      <Property name='dataset_id' type='string' key='dataset_id'>
+        ナレッジ ID
+      </Property>
+      <Property name='action' type='string' key='action'>
+        - `enable` - ドキュメントを有効化
+        - `disable` - ドキュメントを無効化
+        - `archive` - ドキュメントをアーカイブ
+        - `un_archive` - ドキュメントのアーカイブを解除
+      </Property>
+    </Properties>
+
+    ### リクエストボディ
+    <Properties>
+      <Property name='document_ids' type='array[string]' key='document_ids'>
+        更新対象のドキュメントIDのリスト。見つからないIDは無視されます。
+      </Property>
+    </Properties>
+  </Col>
+  <Col sticky>
+    <CodeGroup
+      title="リクエスト"
+      tag="PATCH"
+      label="/datasets/{dataset_id}/documents/status/{action}"
+      targetCode={`curl --location --request PATCH '${props.apiBaseUrl}/datasets/{dataset_id}/documents/status/{action}' \\\n--header 'Authorization: Bearer {api_key}' \\\n--header 'Content-Type: application/json' \\\n--data-raw '{\n    "document_ids": ["doc-id-1", "doc-id-2"]\n}'`}
+    >
+    ```bash {{ title: 'cURL' }}
+    curl --location --request PATCH '${props.apiBaseUrl}/datasets/{dataset_id}/documents/status/{action}' \
+    --header 'Authorization: Bearer {api_key}' \
+    --header 'Content-Type: application/json' \
+    --data-raw '{
+        "document_ids": ["doc-id-1", "doc-id-2"]
+    }'
+    ```
+    </CodeGroup>
+
+    <CodeGroup title="レスポンス">
+    ```json {{ title: 'Response' }}
+    {
+      "result": "success"
+    }
+    ```
+    </CodeGroup>
+  </Col>
+</Row>
+
+<hr className='ml-0 mr-0' />
+
 <Heading
   url='/datasets/{dataset_id}/documents/{document_id}/segments'
   method='POST'
@@ -2413,3 +2470,4 @@ import { Row, Col, Properties, Property, Heading, SubProperty, PropertyInstructi
   </tbody>
 </table>
 <div className="pb-4" />
+
diff --git a/web/app/(commonLayout)/datasets/template/template.zh.mdx b/web/app/(commonLayout)/datasets/template/template.zh.mdx
index 3994356b5..d407fad3c 100644
--- a/web/app/(commonLayout)/datasets/template/template.zh.mdx
+++ b/web/app/(commonLayout)/datasets/template/template.zh.mdx
@@ -1131,6 +1131,63 @@ import { Row, Col, Properties, Property, Heading, SubProperty, PropertyInstructi
 
 <hr className='ml-0 mr-0' />
 
+<Heading
+  url='/datasets/{dataset_id}/documents/status/{action}'
+  method='PATCH'
+  title='更新文档状态'
+  name='#batch_document_status'
+/>
+<Row>
+  <Col>
+    ### Path
+    <Properties>
+      <Property name='dataset_id' type='string' key='dataset_id'>
+        知识库 ID
+      </Property>
+      <Property name='action' type='string' key='action'>
+        - `enable` - 启用文档
+        - `disable` - 禁用文档
+        - `archive` - 归档文档
+        - `un_archive` - 取消归档文档
+      </Property>
+    </Properties>
+
+    ### Request Body
+    <Properties>
+      <Property name='document_ids' type='array[string]' key='document_ids'>
+        需要更新状态的文档 ID 列表；未找到的 ID 会被忽略。
+      </Property>
+    </Properties>
+  </Col>
+  <Col sticky>
+    <CodeGroup
+      title="Request"
+      tag="PATCH"
+      label="/datasets/{dataset_id}/documents/status/{action}"
+      targetCode={`curl --location --request PATCH '${props.apiBaseUrl}/datasets/{dataset_id}/documents/status/{action}' \\\n--header 'Authorization: Bearer {api_key}' \\\n--header 'Content-Type: application/json' \\\n--data-raw '{\n    "document_ids": ["doc-id-1", "doc-id-2"]\n}'`}
+    >
+    ```bash {{ title: 'cURL' }}
+    curl --location --request PATCH '${props.apiBaseUrl}/datasets/{dataset_id}/documents/status/{action}' \
+    --header 'Authorization: Bearer {api_key}' \
+    --header 'Content-Type: application/json' \
+    --data-raw '{
+        "document_ids": ["doc-id-1", "doc-id-2"]
+    }'
+    ```
+    </CodeGroup>
+
+    <CodeGroup title="Response">
+    ```json {{ title: 'Response' }}
+    {
+      "result": "success"
+    }
+    ```
+    </CodeGroup>
+  </Col>
+</Row>
+
+<hr className='ml-0 mr-0' />
+
 <Heading
   url='/datasets/{dataset_id}/documents/{document_id}/segments'
   method='POST'