Add get document detail service api (#21700)

Co-authored-by: lizb <lizb@sugon.com>
This commit is contained in:
Ganondorf
2025-06-30 22:13:56 +08:00
committed by GitHub
parent 96d27d7087
commit 55a6b330ec
5 changed files with 479 additions and 3 deletions

View File

@@ -3,7 +3,7 @@ import json
from flask import request
from flask_restful import marshal, reqparse
from sqlalchemy import desc, select
from werkzeug.exceptions import NotFound
from werkzeug.exceptions import Forbidden, NotFound
import services
from controllers.common.errors import FilenameNotExistsError
@@ -18,6 +18,7 @@ from controllers.service_api.app.error import (
from controllers.service_api.dataset.error import (
ArchivedDocumentImmutableError,
DocumentIndexingError,
InvalidMetadataError,
)
from controllers.service_api.wraps import (
DatasetApiResource,
@@ -466,6 +467,101 @@ class DocumentIndexingStatusApi(DatasetApiResource):
return data
class DocumentDetailApi(DatasetApiResource):
METADATA_CHOICES = {"all", "only", "without"}
def get(self, tenant_id, dataset_id, document_id):
dataset_id = str(dataset_id)
document_id = str(document_id)
dataset = self.get_dataset(dataset_id, tenant_id)
document = DocumentService.get_document(dataset.id, document_id)
if not document:
raise NotFound("Document not found.")
if document.tenant_id != str(tenant_id):
raise Forbidden("No permission.")
metadata = request.args.get("metadata", "all")
if metadata not in self.METADATA_CHOICES:
raise InvalidMetadataError(f"Invalid metadata value: {metadata}")
if metadata == "only":
response = {"id": document.id, "doc_type": document.doc_type, "doc_metadata": document.doc_metadata_details}
elif metadata == "without":
dataset_process_rules = DatasetService.get_process_rules(dataset_id)
document_process_rules = document.dataset_process_rule.to_dict()
data_source_info = document.data_source_detail_dict
response = {
"id": document.id,
"position": document.position,
"data_source_type": document.data_source_type,
"data_source_info": data_source_info,
"dataset_process_rule_id": document.dataset_process_rule_id,
"dataset_process_rule": dataset_process_rules,
"document_process_rule": document_process_rules,
"name": document.name,
"created_from": document.created_from,
"created_by": document.created_by,
"created_at": document.created_at.timestamp(),
"tokens": document.tokens,
"indexing_status": document.indexing_status,
"completed_at": int(document.completed_at.timestamp()) if document.completed_at else None,
"updated_at": int(document.updated_at.timestamp()) if document.updated_at else None,
"indexing_latency": document.indexing_latency,
"error": document.error,
"enabled": document.enabled,
"disabled_at": int(document.disabled_at.timestamp()) if document.disabled_at else None,
"disabled_by": document.disabled_by,
"archived": document.archived,
"segment_count": document.segment_count,
"average_segment_length": document.average_segment_length,
"hit_count": document.hit_count,
"display_status": document.display_status,
"doc_form": document.doc_form,
"doc_language": document.doc_language,
}
else:
dataset_process_rules = DatasetService.get_process_rules(dataset_id)
document_process_rules = document.dataset_process_rule.to_dict()
data_source_info = document.data_source_detail_dict
response = {
"id": document.id,
"position": document.position,
"data_source_type": document.data_source_type,
"data_source_info": data_source_info,
"dataset_process_rule_id": document.dataset_process_rule_id,
"dataset_process_rule": dataset_process_rules,
"document_process_rule": document_process_rules,
"name": document.name,
"created_from": document.created_from,
"created_by": document.created_by,
"created_at": document.created_at.timestamp(),
"tokens": document.tokens,
"indexing_status": document.indexing_status,
"completed_at": int(document.completed_at.timestamp()) if document.completed_at else None,
"updated_at": int(document.updated_at.timestamp()) if document.updated_at else None,
"indexing_latency": document.indexing_latency,
"error": document.error,
"enabled": document.enabled,
"disabled_at": int(document.disabled_at.timestamp()) if document.disabled_at else None,
"disabled_by": document.disabled_by,
"archived": document.archived,
"doc_type": document.doc_type,
"doc_metadata": document.doc_metadata_details,
"segment_count": document.segment_count,
"average_segment_length": document.average_segment_length,
"hit_count": document.hit_count,
"display_status": document.display_status,
"doc_form": document.doc_form,
"doc_language": document.doc_language,
}
return response
api.add_resource(
DocumentAddByTextApi,
"/datasets/<uuid:dataset_id>/document/create_by_text",
@@ -489,3 +585,4 @@ api.add_resource(
api.add_resource(DocumentDeleteApi, "/datasets/<uuid:dataset_id>/documents/<uuid:document_id>")
api.add_resource(DocumentListApi, "/datasets/<uuid:dataset_id>/documents")
api.add_resource(DocumentIndexingStatusApi, "/datasets/<uuid:dataset_id>/documents/<string:batch>/indexing-status")
api.add_resource(DocumentDetailApi, "/datasets/<uuid:dataset_id>/documents/<uuid:document_id>")

View File

@@ -11,13 +11,13 @@ from flask_restful import Resource
from pydantic import BaseModel
from sqlalchemy import select, update
from sqlalchemy.orm import Session
from werkzeug.exceptions import Forbidden, Unauthorized
from werkzeug.exceptions import Forbidden, NotFound, Unauthorized
from extensions.ext_database import db
from extensions.ext_redis import redis_client
from libs.login import _get_user
from models.account import Account, Tenant, TenantAccountJoin, TenantStatus
from models.dataset import RateLimitLog
from models.dataset import Dataset, RateLimitLog
from models.model import ApiToken, App, EndUser
from services.feature_service import FeatureService
@@ -317,3 +317,11 @@ def create_or_update_end_user_for_user_id(app_model: App, user_id: Optional[str]
class DatasetApiResource(Resource):
method_decorators = [validate_dataset_token]
def get_dataset(self, dataset_id: str, tenant_id: str) -> Dataset:
dataset = db.session.query(Dataset).filter(Dataset.id == dataset_id, Dataset.tenant_id == tenant_id).first()
if not dataset:
raise NotFound("Dataset not found.")
return dataset

View File

@@ -1124,6 +1124,129 @@ import { Row, Col, Properties, Property, Heading, SubProperty, PropertyInstructi
<hr className='ml-0 mr-0' />
<Heading
url='/datasets/{dataset_id}/documents/{document_id}'
method='GET'
title='Get Document Detail'
name='#get-document-detail'
/>
<Row>
<Col>
Get a document's detail.
### Path
- `dataset_id` (string) Dataset ID
- `document_id` (string) Document ID
### Query
- `metadata` (string) Metadata filter, can be `all`, `only`, or `without`. Default is `all`.
### Response
Returns the document's detail.
</Col>
<Col sticky>
### Request Example
<CodeGroup title="Request" tag="GET" label="/datasets/{dataset_id}/documents/{document_id}" targetCode={`curl -X GET '${props.apiBaseUrl}/datasets/{dataset_id}/documents/{document_id}' \\\n-H 'Authorization: Bearer {api_key}'`}>
```bash {{ title: 'cURL' }}
curl -X GET '${props.apiBaseUrl}/datasets/{dataset_id}/documents/{document_id}' \
-H 'Authorization: Bearer {api_key}'
```
</CodeGroup>
### Response Example
<CodeGroup title="Response">
```json {{ title: 'Response' }}
{
"id": "f46ae30c-5c11-471b-96d0-464f5f32a7b2",
"position": 1,
"data_source_type": "upload_file",
"data_source_info": {
"upload_file": {
...
}
},
"dataset_process_rule_id": "24b99906-845e-499f-9e3c-d5565dd6962c",
"dataset_process_rule": {
"mode": "hierarchical",
"rules": {
"pre_processing_rules": [
{
"id": "remove_extra_spaces",
"enabled": true
},
{
"id": "remove_urls_emails",
"enabled": false
}
],
"segmentation": {
"separator": "**********page_ending**********",
"max_tokens": 1024,
"chunk_overlap": 0
},
"parent_mode": "paragraph",
"subchunk_segmentation": {
"separator": "\n",
"max_tokens": 512,
"chunk_overlap": 0
}
}
},
"document_process_rule": {
"id": "24b99906-845e-499f-9e3c-d5565dd6962c",
"dataset_id": "48a0db76-d1a9-46c1-ae35-2baaa919a8a9",
"mode": "hierarchical",
"rules": {
"pre_processing_rules": [
{
"id": "remove_extra_spaces",
"enabled": true
},
{
"id": "remove_urls_emails",
"enabled": false
}
],
"segmentation": {
"separator": "**********page_ending**********",
"max_tokens": 1024,
"chunk_overlap": 0
},
"parent_mode": "paragraph",
"subchunk_segmentation": {
"separator": "\n",
"max_tokens": 512,
"chunk_overlap": 0
}
}
},
"name": "xxxx",
"created_from": "web",
"created_by": "17f71940-a7b5-4c77-b60f-2bd645c1ffa0",
"created_at": 1750464191,
"tokens": null,
"indexing_status": "waiting",
"completed_at": null,
"updated_at": 1750464191,
"indexing_latency": null,
"error": null,
"enabled": true,
"disabled_at": null,
"disabled_by": null,
"archived": false,
"segment_count": 0,
"average_segment_length": 0,
"hit_count": null,
"display_status": "queuing",
"doc_form": "hierarchical_model",
"doc_language": "Chinese Simplified"
}
```
</CodeGroup>
</Col>
</Row>
___
<hr className='ml-0 mr-0' />
<Heading
url='/datasets/{dataset_id}/documents/status/{action}'
method='PATCH'

View File

@@ -881,6 +881,130 @@ import { Row, Col, Properties, Property, Heading, SubProperty, PropertyInstructi
<hr className='ml-0 mr-0' />
<Heading
url='/datasets/{dataset_id}/documents/{document_id}'
method='GET'
title='ドキュメントの詳細を取得'
name='#get-document-detail'
/>
<Row>
<Col>
ドキュメントの詳細を取得.
### Path
- `dataset_id` (string) ナレッジベースID
- `document_id` (string) ドキュメントID
### Query
- `metadata` (string) metadataのフィルター条件 `all`、`only`、または`without`。デフォルトは `all`。
### Response
ナレッジベースドキュメントの詳細を返す.
</Col>
<Col sticky>
### Request Example
<CodeGroup title="Request" tag="GET" label="/datasets/{dataset_id}/documents/{document_id}" targetCode={`curl -X GET '${props.apiBaseUrl}/datasets/{dataset_id}/documents/{document_id}' \\\n-H 'Authorization: Bearer {api_key}'`}>
```bash {{ title: 'cURL' }}
curl -X GET '${props.apiBaseUrl}/datasets/{dataset_id}/documents/{document_id}' \
-H 'Authorization: Bearer {api_key}'
```
</CodeGroup>
### Response Example
<CodeGroup title="Response">
```json {{ title: 'Response' }}
{
"id": "f46ae30c-5c11-471b-96d0-464f5f32a7b2",
"position": 1,
"data_source_type": "upload_file",
"data_source_info": {
"upload_file": {
...
}
},
"dataset_process_rule_id": "24b99906-845e-499f-9e3c-d5565dd6962c",
"dataset_process_rule": {
"mode": "hierarchical",
"rules": {
"pre_processing_rules": [
{
"id": "remove_extra_spaces",
"enabled": true
},
{
"id": "remove_urls_emails",
"enabled": false
}
],
"segmentation": {
"separator": "**********page_ending**********",
"max_tokens": 1024,
"chunk_overlap": 0
},
"parent_mode": "paragraph",
"subchunk_segmentation": {
"separator": "\n",
"max_tokens": 512,
"chunk_overlap": 0
}
}
},
"document_process_rule": {
"id": "24b99906-845e-499f-9e3c-d5565dd6962c",
"dataset_id": "48a0db76-d1a9-46c1-ae35-2baaa919a8a9",
"mode": "hierarchical",
"rules": {
"pre_processing_rules": [
{
"id": "remove_extra_spaces",
"enabled": true
},
{
"id": "remove_urls_emails",
"enabled": false
}
],
"segmentation": {
"separator": "**********page_ending**********",
"max_tokens": 1024,
"chunk_overlap": 0
},
"parent_mode": "paragraph",
"subchunk_segmentation": {
"separator": "\n",
"max_tokens": 512,
"chunk_overlap": 0
}
}
},
"name": "xxxx",
"created_from": "web",
"created_by": "17f71940-a7b5-4c77-b60f-2bd645c1ffa0",
"created_at": 1750464191,
"tokens": null,
"indexing_status": "waiting",
"completed_at": null,
"updated_at": 1750464191,
"indexing_latency": null,
"error": null,
"enabled": true,
"disabled_at": null,
"disabled_by": null,
"archived": false,
"segment_count": 0,
"average_segment_length": 0,
"hit_count": null,
"display_status": "queuing",
"doc_form": "hierarchical_model",
"doc_language": "Chinese Simplified"
}
```
</CodeGroup>
</Col>
</Row>
___
<hr className='ml-0 mr-0' />
<Heading
url='/datasets/{dataset_id}/documents/status/{action}'
method='PATCH'

View File

@@ -1131,6 +1131,130 @@ import { Row, Col, Properties, Property, Heading, SubProperty, PropertyInstructi
<hr className='ml-0 mr-0' />
<Heading
url='/datasets/{dataset_id}/documents/{document_id}'
method='GET'
title='获取文档详情'
name='#get-document-detail'
/>
<Row>
<Col>
获取文档详情.
### Path
- `dataset_id` (string) 知识库 ID
- `document_id` (string) 文档 ID
### Query
- `metadata` (string) metadata 过滤条件 `all`, `only`, 或者 `without`. 默认是 `all`.
### Response
返回知识库文档的详情.
</Col>
<Col sticky>
### Request Example
<CodeGroup title="Request" tag="GET" label="/datasets/{dataset_id}/documents/{document_id}" targetCode={`curl -X GET '${props.apiBaseUrl}/datasets/{dataset_id}/documents/{document_id}' \\\n-H 'Authorization: Bearer {api_key}'`}>
```bash {{ title: 'cURL' }}
curl -X GET '${props.apiBaseUrl}/datasets/{dataset_id}/documents/{document_id}' \
-H 'Authorization: Bearer {api_key}'
```
</CodeGroup>
### Response Example
<CodeGroup title="Response">
```json {{ title: 'Response' }}
{
"id": "f46ae30c-5c11-471b-96d0-464f5f32a7b2",
"position": 1,
"data_source_type": "upload_file",
"data_source_info": {
"upload_file": {
...
}
},
"dataset_process_rule_id": "24b99906-845e-499f-9e3c-d5565dd6962c",
"dataset_process_rule": {
"mode": "hierarchical",
"rules": {
"pre_processing_rules": [
{
"id": "remove_extra_spaces",
"enabled": true
},
{
"id": "remove_urls_emails",
"enabled": false
}
],
"segmentation": {
"separator": "**********page_ending**********",
"max_tokens": 1024,
"chunk_overlap": 0
},
"parent_mode": "paragraph",
"subchunk_segmentation": {
"separator": "\n",
"max_tokens": 512,
"chunk_overlap": 0
}
}
},
"document_process_rule": {
"id": "24b99906-845e-499f-9e3c-d5565dd6962c",
"dataset_id": "48a0db76-d1a9-46c1-ae35-2baaa919a8a9",
"mode": "hierarchical",
"rules": {
"pre_processing_rules": [
{
"id": "remove_extra_spaces",
"enabled": true
},
{
"id": "remove_urls_emails",
"enabled": false
}
],
"segmentation": {
"separator": "**********page_ending**********",
"max_tokens": 1024,
"chunk_overlap": 0
},
"parent_mode": "paragraph",
"subchunk_segmentation": {
"separator": "\n",
"max_tokens": 512,
"chunk_overlap": 0
}
}
},
"name": "xxxx",
"created_from": "web",
"created_by": "17f71940-a7b5-4c77-b60f-2bd645c1ffa0",
"created_at": 1750464191,
"tokens": null,
"indexing_status": "waiting",
"completed_at": null,
"updated_at": 1750464191,
"indexing_latency": null,
"error": null,
"enabled": true,
"disabled_at": null,
"disabled_by": null,
"archived": false,
"segment_count": 0,
"average_segment_length": 0,
"hit_count": null,
"display_status": "queuing",
"doc_form": "hierarchical_model",
"doc_language": "Chinese Simplified"
}
```
</CodeGroup>
</Col>
</Row>
___
<hr className='ml-0 mr-0' />
<Heading
url='/datasets/{dataset_id}/documents/status/{action}'
method='PATCH'