Add get document detail service api (#21700)

Co-authored-by: lizb <lizb@sugon.com>
2025-06-30 22:13:56 +08:00
parent 96d27d7087
commit 55a6b330ec
5 changed files with 479 additions and 3 deletions
--- a/api/controllers/service_api/dataset/document.py
+++ b/api/controllers/service_api/dataset/document.py
@@ -3,7 +3,7 @@ import json
 from flask import request
 from flask_restful import marshal, reqparse
 from sqlalchemy import desc, select
-from werkzeug.exceptions import NotFound
+from werkzeug.exceptions import Forbidden, NotFound

 import services
 from controllers.common.errors import FilenameNotExistsError
@@ -18,6 +18,7 @@ from controllers.service_api.app.error import (
 from controllers.service_api.dataset.error import (
    ArchivedDocumentImmutableError,
    DocumentIndexingError,
+    InvalidMetadataError,
 )
 from controllers.service_api.wraps import (
    DatasetApiResource,
@@ -466,6 +467,101 @@ class DocumentIndexingStatusApi(DatasetApiResource):
        return data


+class DocumentDetailApi(DatasetApiResource):
+    """Service API resource returning the detail of a single dataset document.
+
+    The ``metadata`` query argument selects the fields in the response:
+
+    * ``all`` (default): document fields plus ``doc_type`` and ``doc_metadata``.
+    * ``only``: just ``id``, ``doc_type`` and ``doc_metadata``.
+    * ``without``: document fields without ``doc_type`` and ``doc_metadata``.
+    """
+
+    # Accepted values for the ``metadata`` query argument.
+    METADATA_CHOICES = {"all", "only", "without"}
+
+    @staticmethod
+    def _to_timestamp(value):
+        """Return ``value`` as integer epoch seconds, or ``None`` when it is falsy."""
+        return int(value.timestamp()) if value else None
+
+    def get(self, tenant_id, dataset_id, document_id):
+        """Return the document's detail as a dict.
+
+        Args:
+            tenant_id: Tenant used for the dataset lookup and the ownership check.
+            dataset_id: ID of the dataset the document is looked up in.
+            document_id: ID of the document to describe.
+
+        Raises:
+            NotFound: If the dataset or the document does not exist.
+            Forbidden: If the document's tenant differs from ``tenant_id``.
+            InvalidMetadataError: If ``metadata`` is not in ``METADATA_CHOICES``.
+        """
+        dataset_id = str(dataset_id)
+        document_id = str(document_id)
+
+        dataset = self.get_dataset(dataset_id, tenant_id)
+        document = DocumentService.get_document(dataset.id, document_id)
+        if not document:
+            raise NotFound("Document not found.")
+        if document.tenant_id != str(tenant_id):
+            raise Forbidden("No permission.")
+
+        metadata = request.args.get("metadata", "all")
+        if metadata not in self.METADATA_CHOICES:
+            raise InvalidMetadataError(f"Invalid metadata value: {metadata}")
+
+        if metadata == "only":
+            return {"id": document.id, "doc_type": document.doc_type, "doc_metadata": document.doc_metadata_details}
+
+        # "all" and "without" share every field except the two metadata keys.
+        dataset_process_rules = DatasetService.get_process_rules(dataset_id)
+        # A document without a process rule yields {} instead of an AttributeError.
+        process_rule = document.dataset_process_rule
+        document_process_rules = process_rule.to_dict() if process_rule else {}
+        response = {
+            "id": document.id,
+            "position": document.position,
+            "data_source_type": document.data_source_type,
+            "data_source_info": document.data_source_detail_dict,
+            "dataset_process_rule_id": document.dataset_process_rule_id,
+            "dataset_process_rule": dataset_process_rules,
+            "document_process_rule": document_process_rules,
+            "name": document.name,
+            "created_from": document.created_from,
+            "created_by": document.created_by,
+            # Integer seconds, like the other timestamp fields (was a float).
+            "created_at": self._to_timestamp(document.created_at),
+            "tokens": document.tokens,
+            "indexing_status": document.indexing_status,
+            "completed_at": self._to_timestamp(document.completed_at),
+            "updated_at": self._to_timestamp(document.updated_at),
+            "indexing_latency": document.indexing_latency,
+            "error": document.error,
+            "enabled": document.enabled,
+            "disabled_at": self._to_timestamp(document.disabled_at),
+            "disabled_by": document.disabled_by,
+            "archived": document.archived,
+        }
+        if metadata == "all":
+            # Inserted here so the key order matches the previous "all" response.
+            response["doc_type"] = document.doc_type
+            response["doc_metadata"] = document.doc_metadata_details
+        response.update(
+            {
+                "segment_count": document.segment_count,
+                "average_segment_length": document.average_segment_length,
+                "hit_count": document.hit_count,
+                "display_status": document.display_status,
+                "doc_form": document.doc_form,
+                "doc_language": document.doc_language,
+            }
+        )
+
+        return response
+
+
 api.add_resource(
    DocumentAddByTextApi,
    "/datasets/<uuid:dataset_id>/document/create_by_text",
@@ -489,3 +585,4 @@ api.add_resource(
 api.add_resource(DocumentDeleteApi, "/datasets/<uuid:dataset_id>/documents/<uuid:document_id>")
 api.add_resource(DocumentListApi, "/datasets/<uuid:dataset_id>/documents")
 api.add_resource(DocumentIndexingStatusApi, "/datasets/<uuid:dataset_id>/documents/<string:batch>/indexing-status")
+api.add_resource(DocumentDetailApi, "/datasets/<uuid:dataset_id>/documents/<uuid:document_id>")
--- a/api/controllers/service_api/wraps.py
+++ b/api/controllers/service_api/wraps.py
@@ -11,13 +11,13 @@ from flask_restful import Resource
 from pydantic import BaseModel
 from sqlalchemy import select, update
 from sqlalchemy.orm import Session
-from werkzeug.exceptions import Forbidden, Unauthorized
+from werkzeug.exceptions import Forbidden, NotFound, Unauthorized

 from extensions.ext_database import db
 from extensions.ext_redis import redis_client
 from libs.login import _get_user
 from models.account import Account, Tenant, TenantAccountJoin, TenantStatus
-from models.dataset import RateLimitLog
+from models.dataset import Dataset, RateLimitLog
 from models.model import ApiToken, App, EndUser
 from services.feature_service import FeatureService

@@ -317,3 +317,11 @@ def create_or_update_end_user_for_user_id(app_model: App, user_id: Optional[str]

 class DatasetApiResource(Resource):
    method_decorators = [validate_dataset_token]
+
+    def get_dataset(self, dataset_id: str, tenant_id: str) -> Dataset:
+        """Return the dataset matching ``dataset_id`` and ``tenant_id``; raise ``NotFound`` if none is found."""
+        query = db.session.query(Dataset).filter(Dataset.id == dataset_id, Dataset.tenant_id == tenant_id)
+        found = query.first()
+        if not found:
+            raise NotFound("Dataset not found.")
+        return found
--- a/web/app/(commonLayout)/datasets/template/template.en.mdx
+++ b/web/app/(commonLayout)/datasets/template/template.en.mdx
@@ -1124,6 +1124,129 @@ import { Row, Col, Properties, Property, Heading, SubProperty, PropertyInstructi

 <hr className='ml-0 mr-0' />

+<Heading
+  url='/datasets/{dataset_id}/documents/{document_id}'
+  method='GET'
+  title='Get Document Detail'
+  name='#get-document-detail'
+/>
+<Row>
+  <Col>
+  Get the details of a document.
+  ### Path
+  - `dataset_id` (string) Dataset ID
+  - `document_id` (string) Document ID
+
+  ### Query
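+  - `metadata` (string) Metadata filter: `all` returns the document fields together with `doc_type` and `doc_metadata`, `only` returns just `id`, `doc_type`, and `doc_metadata`, and `without` omits `doc_type` and `doc_metadata`. Default is `all`.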
+
+  ### Response
+  Returns the document's details.
+  </Col>
+  <Col sticky>
+  ### Request Example
+  <CodeGroup title="Request" tag="GET" label="/datasets/{dataset_id}/documents/{document_id}" targetCode={`curl -X GET '${props.apiBaseUrl}/datasets/{dataset_id}/documents/{document_id}' \\\n-H 'Authorization: Bearer {api_key}'`}>
+    ```bash {{ title: 'cURL' }}
+    curl -X GET '${props.apiBaseUrl}/datasets/{dataset_id}/documents/{document_id}' \
+    -H 'Authorization: Bearer {api_key}'
+    ```
+    </CodeGroup>
+
+    ### Response Example
+    <CodeGroup title="Response">
+    ```json {{ title: 'Response' }}
+    {
+    "id": "f46ae30c-5c11-471b-96d0-464f5f32a7b2", 
+    "position": 1, 
+    "data_source_type": "upload_file", 
+    "data_source_info": {
+        "upload_file": {
+            ...
+        }
+    }, 
+    "dataset_process_rule_id": "24b99906-845e-499f-9e3c-d5565dd6962c", 
+    "dataset_process_rule": {
+        "mode": "hierarchical", 
+        "rules": {
+            "pre_processing_rules": [
+                {
+                    "id": "remove_extra_spaces", 
+                    "enabled": true
+                }, 
+                {
+                    "id": "remove_urls_emails", 
+                    "enabled": false
+                }
+            ], 
+            "segmentation": {
+                "separator": "**********page_ending**********", 
+                "max_tokens": 1024, 
+                "chunk_overlap": 0
+            }, 
+            "parent_mode": "paragraph", 
+            "subchunk_segmentation": {
+                "separator": "\n", 
+                "max_tokens": 512, 
+                "chunk_overlap": 0
+            }
+        }
+    }, 
+    "document_process_rule": {
+        "id": "24b99906-845e-499f-9e3c-d5565dd6962c", 
+        "dataset_id": "48a0db76-d1a9-46c1-ae35-2baaa919a8a9", 
+        "mode": "hierarchical", 
+        "rules": {
+            "pre_processing_rules": [
+                {
+                    "id": "remove_extra_spaces", 
+                    "enabled": true
+                }, 
+                {
+                    "id": "remove_urls_emails", 
+                    "enabled": false
+                }
+            ], 
+            "segmentation": {
+                "separator": "**********page_ending**********", 
+                "max_tokens": 1024, 
+                "chunk_overlap": 0
+            }, 
+            "parent_mode": "paragraph", 
+            "subchunk_segmentation": {
+                "separator": "\n", 
+                "max_tokens": 512, 
+                "chunk_overlap": 0
+            }
+        }
+    }, 
+    "name": "xxxx", 
+    "created_from": "web", 
+    "created_by": "17f71940-a7b5-4c77-b60f-2bd645c1ffa0", 
+    "created_at": 1750464191, 
+    "tokens": null, 
+    "indexing_status": "waiting", 
+    "completed_at": null, 
+    "updated_at": 1750464191, 
+    "indexing_latency": null, 
+    "error": null, 
+    "enabled": true, 
+    "disabled_at": null, 
+    "disabled_by": null, 
+    "archived": false, 
+    "segment_count": 0, 
+    "average_segment_length": 0, 
+    "hit_count": null, 
+    "display_status": "queuing", 
+    "doc_form": "hierarchical_model", 
+    "doc_language": "Chinese Simplified"
+    }
+    ```
+    </CodeGroup>
+  </Col>
+</Row>
+___
+<hr className='ml-0 mr-0' />
+
 <Heading
  url='/datasets/{dataset_id}/documents/status/{action}'
  method='PATCH'
--- a/web/app/(commonLayout)/datasets/template/template.ja.mdx
+++ b/web/app/(commonLayout)/datasets/template/template.ja.mdx
@@ -881,6 +881,130 @@ import { Row, Col, Properties, Property, Heading, SubProperty, PropertyInstructi

 <hr className='ml-0 mr-0' />

+<Heading
+  url='/datasets/{dataset_id}/documents/{document_id}'
+  method='GET'
+  title='ドキュメントの詳細を取得'
+  name='#get-document-detail'
+/>
+<Row>
+  <Col>
+  ドキュメントの詳細を取得します。
+  ### Path
+  - `dataset_id` (string) ナレッジベースID
+  - `document_id` (string) ドキュメントID
+
+  ### Query
+  - `metadata` (string) メタデータのフィルター条件。`all`、`only`、`without` のいずれかを指定します。デフォルトは `all` です。
+
+  ### Response
+  ナレッジベースドキュメントの詳細を返します。
+  </Col>
+  <Col sticky>
+  ### Request Example
+  <CodeGroup title="Request" tag="GET" label="/datasets/{dataset_id}/documents/{document_id}" targetCode={`curl -X GET '${props.apiBaseUrl}/datasets/{dataset_id}/documents/{document_id}' \\\n-H 'Authorization: Bearer {api_key}'`}>
+    ```bash {{ title: 'cURL' }}
+    curl -X GET '${props.apiBaseUrl}/datasets/{dataset_id}/documents/{document_id}' \
+    -H 'Authorization: Bearer {api_key}'
+    ```
+    </CodeGroup>
+
+    ### Response Example
+    <CodeGroup title="Response">
+    ```json {{ title: 'Response' }}
+    {
+    "id": "f46ae30c-5c11-471b-96d0-464f5f32a7b2", 
+    "position": 1, 
+    "data_source_type": "upload_file", 
+    "data_source_info": {
+        "upload_file": {
+            ...
+        }
+    }, 
+    "dataset_process_rule_id": "24b99906-845e-499f-9e3c-d5565dd6962c", 
+    "dataset_process_rule": {
+        "mode": "hierarchical", 
+        "rules": {
+            "pre_processing_rules": [
+                {
+                    "id": "remove_extra_spaces", 
+                    "enabled": true
+                }, 
+                {
+                    "id": "remove_urls_emails", 
+                    "enabled": false
+                }
+            ], 
+            "segmentation": {
+                "separator": "**********page_ending**********", 
+                "max_tokens": 1024, 
+                "chunk_overlap": 0
+            }, 
+            "parent_mode": "paragraph", 
+            "subchunk_segmentation": {
+                "separator": "\n", 
+                "max_tokens": 512, 
+                "chunk_overlap": 0
+            }
+        }
+    }, 
+    "document_process_rule": {
+        "id": "24b99906-845e-499f-9e3c-d5565dd6962c", 
+        "dataset_id": "48a0db76-d1a9-46c1-ae35-2baaa919a8a9", 
+        "mode": "hierarchical", 
+        "rules": {
+            "pre_processing_rules": [
+                {
+                    "id": "remove_extra_spaces", 
+                    "enabled": true
+                }, 
+                {
+                    "id": "remove_urls_emails", 
+                    "enabled": false
+                }
+            ], 
+            "segmentation": {
+                "separator": "**********page_ending**********", 
+                "max_tokens": 1024, 
+                "chunk_overlap": 0
+            }, 
+            "parent_mode": "paragraph", 
+            "subchunk_segmentation": {
+                "separator": "\n", 
+                "max_tokens": 512, 
+                "chunk_overlap": 0
+            }
+        }
+    }, 
+    "name": "xxxx", 
+    "created_from": "web", 
+    "created_by": "17f71940-a7b5-4c77-b60f-2bd645c1ffa0", 
+    "created_at": 1750464191, 
+    "tokens": null, 
+    "indexing_status": "waiting", 
+    "completed_at": null, 
+    "updated_at": 1750464191, 
+    "indexing_latency": null, 
+    "error": null, 
+    "enabled": true, 
+    "disabled_at": null, 
+    "disabled_by": null, 
+    "archived": false, 
+    "segment_count": 0, 
+    "average_segment_length": 0, 
+    "hit_count": null, 
+    "display_status": "queuing", 
+    "doc_form": "hierarchical_model", 
+    "doc_language": "Chinese Simplified"
+    }
+    ```
+    </CodeGroup>
+  </Col>
+</Row>
+___
+<hr className='ml-0 mr-0' />
+
+
 <Heading
  url='/datasets/{dataset_id}/documents/status/{action}'
  method='PATCH'
--- a/web/app/(commonLayout)/datasets/template/template.zh.mdx
+++ b/web/app/(commonLayout)/datasets/template/template.zh.mdx
@@ -1131,6 +1131,130 @@ import { Row, Col, Properties, Property, Heading, SubProperty, PropertyInstructi

 <hr className='ml-0 mr-0' />

+<Heading
+  url='/datasets/{dataset_id}/documents/{document_id}'
+  method='GET'
+  title='获取文档详情'
+  name='#get-document-detail'
+/>
+<Row>
+  <Col>
+  获取文档详情。
+  ### Path
+  - `dataset_id` (string) 知识库 ID
+  - `document_id` (string) 文档 ID
+
+  ### Query
+  - `metadata` (string) metadata 过滤条件，可选值为 `all`、`only` 或 `without`，默认为 `all`。
+
+  ### Response
+  返回知识库文档的详情。
+  </Col>
+  <Col sticky>
+  ### Request Example
+  <CodeGroup title="Request" tag="GET" label="/datasets/{dataset_id}/documents/{document_id}" targetCode={`curl -X GET '${props.apiBaseUrl}/datasets/{dataset_id}/documents/{document_id}' \\\n-H 'Authorization: Bearer {api_key}'`}>
+    ```bash {{ title: 'cURL' }}
+    curl -X GET '${props.apiBaseUrl}/datasets/{dataset_id}/documents/{document_id}' \
+    -H 'Authorization: Bearer {api_key}'
+    ```
+    </CodeGroup>
+
+    ### Response Example
+    <CodeGroup title="Response">
+    ```json {{ title: 'Response' }}
+    {
+    "id": "f46ae30c-5c11-471b-96d0-464f5f32a7b2", 
+    "position": 1, 
+    "data_source_type": "upload_file", 
+    "data_source_info": {
+        "upload_file": {
+            ...
+        }
+    }, 
+    "dataset_process_rule_id": "24b99906-845e-499f-9e3c-d5565dd6962c", 
+    "dataset_process_rule": {
+        "mode": "hierarchical", 
+        "rules": {
+            "pre_processing_rules": [
+                {
+                    "id": "remove_extra_spaces", 
+                    "enabled": true
+                }, 
+                {
+                    "id": "remove_urls_emails", 
+                    "enabled": false
+                }
+            ], 
+            "segmentation": {
+                "separator": "**********page_ending**********", 
+                "max_tokens": 1024, 
+                "chunk_overlap": 0
+            }, 
+            "parent_mode": "paragraph", 
+            "subchunk_segmentation": {
+                "separator": "\n", 
+                "max_tokens": 512, 
+                "chunk_overlap": 0
+            }
+        }
+    }, 
+    "document_process_rule": {
+        "id": "24b99906-845e-499f-9e3c-d5565dd6962c", 
+        "dataset_id": "48a0db76-d1a9-46c1-ae35-2baaa919a8a9", 
+        "mode": "hierarchical", 
+        "rules": {
+            "pre_processing_rules": [
+                {
+                    "id": "remove_extra_spaces", 
+                    "enabled": true
+                }, 
+                {
+                    "id": "remove_urls_emails", 
+                    "enabled": false
+                }
+            ], 
+            "segmentation": {
+                "separator": "**********page_ending**********", 
+                "max_tokens": 1024, 
+                "chunk_overlap": 0
+            }, 
+            "parent_mode": "paragraph", 
+            "subchunk_segmentation": {
+                "separator": "\n", 
+                "max_tokens": 512, 
+                "chunk_overlap": 0
+            }
+        }
+    }, 
+    "name": "xxxx", 
+    "created_from": "web", 
+    "created_by": "17f71940-a7b5-4c77-b60f-2bd645c1ffa0", 
+    "created_at": 1750464191, 
+    "tokens": null, 
+    "indexing_status": "waiting", 
+    "completed_at": null, 
+    "updated_at": 1750464191, 
+    "indexing_latency": null, 
+    "error": null, 
+    "enabled": true, 
+    "disabled_at": null, 
+    "disabled_by": null, 
+    "archived": false, 
+    "segment_count": 0, 
+    "average_segment_length": 0, 
+    "hit_count": null, 
+    "display_status": "queuing", 
+    "doc_form": "hierarchical_model", 
+    "doc_language": "Chinese Simplified"
+    }
+    ```
+    </CodeGroup>
+  </Col>
+</Row>
+___
+<hr className='ml-0 mr-0' />
+
+
 <Heading
  url='/datasets/{dataset_id}/documents/status/{action}'
  method='PATCH'