Add get document detail service api (#21700)

Co-authored-by: lizb <lizb@sugon.com>
This commit is contained in:
Ganondorf
2025-06-30 22:13:56 +08:00
committed by GitHub
parent 96d27d7087
commit 55a6b330ec
5 changed files with 479 additions and 3 deletions

View File

@@ -3,7 +3,7 @@ import json
from flask import request from flask import request
from flask_restful import marshal, reqparse from flask_restful import marshal, reqparse
from sqlalchemy import desc, select from sqlalchemy import desc, select
from werkzeug.exceptions import NotFound from werkzeug.exceptions import Forbidden, NotFound
import services import services
from controllers.common.errors import FilenameNotExistsError from controllers.common.errors import FilenameNotExistsError
@@ -18,6 +18,7 @@ from controllers.service_api.app.error import (
from controllers.service_api.dataset.error import ( from controllers.service_api.dataset.error import (
ArchivedDocumentImmutableError, ArchivedDocumentImmutableError,
DocumentIndexingError, DocumentIndexingError,
InvalidMetadataError,
) )
from controllers.service_api.wraps import ( from controllers.service_api.wraps import (
DatasetApiResource, DatasetApiResource,
@@ -466,6 +467,101 @@ class DocumentIndexingStatusApi(DatasetApiResource):
return data return data
class DocumentDetailApi(DatasetApiResource):
    """Service API resource that returns the detail of one dataset document."""

    # Accepted values of the ``metadata`` query parameter.
    METADATA_CHOICES = {"all", "only", "without"}

    def get(self, tenant_id, dataset_id, document_id):
        """Return the detail of a document, shaped by the ``metadata`` query arg.

        Query parameter ``metadata`` (default ``"all"``):
            * ``"only"``    - just ``id``, ``doc_type`` and ``doc_metadata``.
            * ``"without"`` - every document field except ``doc_type`` and
              ``doc_metadata``.
            * ``"all"``     - every document field including the metadata ones.

        Raises:
            NotFound: the dataset (via ``get_dataset``) or the document does
                not exist.
            Forbidden: the document belongs to a different tenant.
            InvalidMetadataError: ``metadata`` is not one of METADATA_CHOICES.
        """
        dataset_id = str(dataset_id)
        document_id = str(document_id)

        dataset = self.get_dataset(dataset_id, tenant_id)

        document = DocumentService.get_document(dataset.id, document_id)
        if not document:
            raise NotFound("Document not found.")

        if document.tenant_id != str(tenant_id):
            raise Forbidden("No permission.")

        metadata = request.args.get("metadata", "all")
        if metadata not in self.METADATA_CHOICES:
            raise InvalidMetadataError(f"Invalid metadata value: {metadata}")

        if metadata == "only":
            # Metadata-only view: the process rules and data source detail are
            # deliberately not loaded here.
            return {
                "id": document.id,
                "doc_type": document.doc_type,
                "doc_metadata": document.doc_metadata_details,
            }

        return self._build_detail(document, dataset_id, include_metadata=(metadata == "all"))

    @staticmethod
    def _build_detail(document, dataset_id, *, include_metadata):
        """Build the full detail payload, optionally with the metadata fields."""
        dataset_process_rules = DatasetService.get_process_rules(dataset_id)

        # NOTE(review): dataset_process_rule is presumably None when the
        # document has no process rule attached - confirm against the model.
        # Guarding here avoids an AttributeError (HTTP 500) in that case.
        process_rule = document.dataset_process_rule
        document_process_rules = process_rule.to_dict() if process_rule else {}

        data_source_info = document.data_source_detail_dict

        response = {
            "id": document.id,
            "position": document.position,
            "data_source_type": document.data_source_type,
            "data_source_info": data_source_info,
            "dataset_process_rule_id": document.dataset_process_rule_id,
            "dataset_process_rule": dataset_process_rules,
            "document_process_rule": document_process_rules,
            "name": document.name,
            "created_from": document.created_from,
            "created_by": document.created_by,
            # Whole seconds, like every other timestamp in this payload.
            "created_at": int(document.created_at.timestamp()),
            "tokens": document.tokens,
            "indexing_status": document.indexing_status,
            "completed_at": int(document.completed_at.timestamp()) if document.completed_at else None,
            "updated_at": int(document.updated_at.timestamp()) if document.updated_at else None,
            "indexing_latency": document.indexing_latency,
            "error": document.error,
            "enabled": document.enabled,
            "disabled_at": int(document.disabled_at.timestamp()) if document.disabled_at else None,
            "disabled_by": document.disabled_by,
            "archived": document.archived,
        }

        if include_metadata:
            # Inserted here so the key order matches the previous "all" payload.
            response["doc_type"] = document.doc_type
            response["doc_metadata"] = document.doc_metadata_details

        response.update(
            {
                "segment_count": document.segment_count,
                "average_segment_length": document.average_segment_length,
                "hit_count": document.hit_count,
                "display_status": document.display_status,
                "doc_form": document.doc_form,
                "doc_language": document.doc_language,
            }
        )
        return response
api.add_resource( api.add_resource(
DocumentAddByTextApi, DocumentAddByTextApi,
"/datasets/<uuid:dataset_id>/document/create_by_text", "/datasets/<uuid:dataset_id>/document/create_by_text",
@@ -489,3 +585,4 @@ api.add_resource(
api.add_resource(DocumentDeleteApi, "/datasets/<uuid:dataset_id>/documents/<uuid:document_id>") api.add_resource(DocumentDeleteApi, "/datasets/<uuid:dataset_id>/documents/<uuid:document_id>")
api.add_resource(DocumentListApi, "/datasets/<uuid:dataset_id>/documents") api.add_resource(DocumentListApi, "/datasets/<uuid:dataset_id>/documents")
api.add_resource(DocumentIndexingStatusApi, "/datasets/<uuid:dataset_id>/documents/<string:batch>/indexing-status") api.add_resource(DocumentIndexingStatusApi, "/datasets/<uuid:dataset_id>/documents/<string:batch>/indexing-status")
api.add_resource(DocumentDetailApi, "/datasets/<uuid:dataset_id>/documents/<uuid:document_id>")

View File

@@ -11,13 +11,13 @@ from flask_restful import Resource
from pydantic import BaseModel from pydantic import BaseModel
from sqlalchemy import select, update from sqlalchemy import select, update
from sqlalchemy.orm import Session from sqlalchemy.orm import Session
from werkzeug.exceptions import Forbidden, Unauthorized from werkzeug.exceptions import Forbidden, NotFound, Unauthorized
from extensions.ext_database import db from extensions.ext_database import db
from extensions.ext_redis import redis_client from extensions.ext_redis import redis_client
from libs.login import _get_user from libs.login import _get_user
from models.account import Account, Tenant, TenantAccountJoin, TenantStatus from models.account import Account, Tenant, TenantAccountJoin, TenantStatus
from models.dataset import RateLimitLog from models.dataset import Dataset, RateLimitLog
from models.model import ApiToken, App, EndUser from models.model import ApiToken, App, EndUser
from services.feature_service import FeatureService from services.feature_service import FeatureService
@@ -317,3 +317,11 @@ def create_or_update_end_user_for_user_id(app_model: App, user_id: Optional[str]
class DatasetApiResource(Resource): class DatasetApiResource(Resource):
method_decorators = [validate_dataset_token] method_decorators = [validate_dataset_token]
def get_dataset(self, dataset_id: str, tenant_id: str) -> Dataset:
    """Look up a dataset by ID, restricted to the given tenant.

    Returns:
        The first ``Dataset`` row whose ``id`` and ``tenant_id`` both match.

    Raises:
        NotFound: no dataset matches both the ID and the tenant.
    """
    tenant_scoped_query = db.session.query(Dataset).filter(
        Dataset.id == dataset_id,
        Dataset.tenant_id == tenant_id,
    )
    found = tenant_scoped_query.first()
    if found:
        return found
    raise NotFound("Dataset not found.")

View File

@@ -1124,6 +1124,129 @@ import { Row, Col, Properties, Property, Heading, SubProperty, PropertyInstructi
<hr className='ml-0 mr-0' /> <hr className='ml-0 mr-0' />
<Heading
url='/datasets/{dataset_id}/documents/{document_id}'
method='GET'
title='Get Document Detail'
name='#get-document-detail'
/>
<Row>
<Col>
Get a document's detail.
### Path
- `dataset_id` (string) Dataset ID
- `document_id` (string) Document ID
### Query
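- `metadata` (string) Metadata filter, can be `all`, `only`, or `without`. Default is `all`. `only` returns just `id`, `doc_type`, and `doc_metadata`; `without` returns the document fields without `doc_type` and `doc_metadata`; `all` returns the document fields together with `doc_type` and `doc_metadata`.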
### Response
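Returns the document's detail. The response example omits `doc_type` and `doc_metadata`, which are also returned when `metadata` is `all`.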
</Col>
<Col sticky>
### Request Example
<CodeGroup title="Request" tag="GET" label="/datasets/{dataset_id}/documents/{document_id}" targetCode={`curl -X GET '${props.apiBaseUrl}/datasets/{dataset_id}/documents/{document_id}' \\\n-H 'Authorization: Bearer {api_key}'`}>
```bash {{ title: 'cURL' }}
curl -X GET '${props.apiBaseUrl}/datasets/{dataset_id}/documents/{document_id}' \
-H 'Authorization: Bearer {api_key}'
```
</CodeGroup>
### Response Example
<CodeGroup title="Response">
```json {{ title: 'Response' }}
{
"id": "f46ae30c-5c11-471b-96d0-464f5f32a7b2",
"position": 1,
"data_source_type": "upload_file",
"data_source_info": {
"upload_file": {
...
}
},
"dataset_process_rule_id": "24b99906-845e-499f-9e3c-d5565dd6962c",
"dataset_process_rule": {
"mode": "hierarchical",
"rules": {
"pre_processing_rules": [
{
"id": "remove_extra_spaces",
"enabled": true
},
{
"id": "remove_urls_emails",
"enabled": false
}
],
"segmentation": {
"separator": "**********page_ending**********",
"max_tokens": 1024,
"chunk_overlap": 0
},
"parent_mode": "paragraph",
"subchunk_segmentation": {
"separator": "\n",
"max_tokens": 512,
"chunk_overlap": 0
}
}
},
"document_process_rule": {
"id": "24b99906-845e-499f-9e3c-d5565dd6962c",
"dataset_id": "48a0db76-d1a9-46c1-ae35-2baaa919a8a9",
"mode": "hierarchical",
"rules": {
"pre_processing_rules": [
{
"id": "remove_extra_spaces",
"enabled": true
},
{
"id": "remove_urls_emails",
"enabled": false
}
],
"segmentation": {
"separator": "**********page_ending**********",
"max_tokens": 1024,
"chunk_overlap": 0
},
"parent_mode": "paragraph",
"subchunk_segmentation": {
"separator": "\n",
"max_tokens": 512,
"chunk_overlap": 0
}
}
},
"name": "xxxx",
"created_from": "web",
"created_by": "17f71940-a7b5-4c77-b60f-2bd645c1ffa0",
"created_at": 1750464191,
"tokens": null,
"indexing_status": "waiting",
"completed_at": null,
"updated_at": 1750464191,
"indexing_latency": null,
"error": null,
"enabled": true,
"disabled_at": null,
"disabled_by": null,
"archived": false,
"segment_count": 0,
"average_segment_length": 0,
"hit_count": null,
"display_status": "queuing",
"doc_form": "hierarchical_model",
"doc_language": "Chinese Simplified"
}
```
</CodeGroup>
</Col>
</Row>
___
<hr className='ml-0 mr-0' />
<Heading <Heading
url='/datasets/{dataset_id}/documents/status/{action}' url='/datasets/{dataset_id}/documents/status/{action}'
method='PATCH' method='PATCH'

View File

@@ -881,6 +881,130 @@ import { Row, Col, Properties, Property, Heading, SubProperty, PropertyInstructi
<hr className='ml-0 mr-0' /> <hr className='ml-0 mr-0' />
<Heading
url='/datasets/{dataset_id}/documents/{document_id}'
method='GET'
title='ドキュメントの詳細を取得'
name='#get-document-detail'
/>
<Row>
<Col>
ドキュメントの詳細を取得します。
### Path
- `dataset_id` (string) ナレッジベースID
- `document_id` (string) ドキュメントID
### Query
- `metadata` (string) メタデータのフィルター条件。`all`、`only`、`without` のいずれかを指定します。デフォルトは `all` です。
### Response
ナレッジベースドキュメントの詳細を返します。
</Col>
<Col sticky>
### Request Example
<CodeGroup title="Request" tag="GET" label="/datasets/{dataset_id}/documents/{document_id}" targetCode={`curl -X GET '${props.apiBaseUrl}/datasets/{dataset_id}/documents/{document_id}' \\\n-H 'Authorization: Bearer {api_key}'`}>
```bash {{ title: 'cURL' }}
curl -X GET '${props.apiBaseUrl}/datasets/{dataset_id}/documents/{document_id}' \
-H 'Authorization: Bearer {api_key}'
```
</CodeGroup>
### Response Example
<CodeGroup title="Response">
```json {{ title: 'Response' }}
{
"id": "f46ae30c-5c11-471b-96d0-464f5f32a7b2",
"position": 1,
"data_source_type": "upload_file",
"data_source_info": {
"upload_file": {
...
}
},
"dataset_process_rule_id": "24b99906-845e-499f-9e3c-d5565dd6962c",
"dataset_process_rule": {
"mode": "hierarchical",
"rules": {
"pre_processing_rules": [
{
"id": "remove_extra_spaces",
"enabled": true
},
{
"id": "remove_urls_emails",
"enabled": false
}
],
"segmentation": {
"separator": "**********page_ending**********",
"max_tokens": 1024,
"chunk_overlap": 0
},
"parent_mode": "paragraph",
"subchunk_segmentation": {
"separator": "\n",
"max_tokens": 512,
"chunk_overlap": 0
}
}
},
"document_process_rule": {
"id": "24b99906-845e-499f-9e3c-d5565dd6962c",
"dataset_id": "48a0db76-d1a9-46c1-ae35-2baaa919a8a9",
"mode": "hierarchical",
"rules": {
"pre_processing_rules": [
{
"id": "remove_extra_spaces",
"enabled": true
},
{
"id": "remove_urls_emails",
"enabled": false
}
],
"segmentation": {
"separator": "**********page_ending**********",
"max_tokens": 1024,
"chunk_overlap": 0
},
"parent_mode": "paragraph",
"subchunk_segmentation": {
"separator": "\n",
"max_tokens": 512,
"chunk_overlap": 0
}
}
},
"name": "xxxx",
"created_from": "web",
"created_by": "17f71940-a7b5-4c77-b60f-2bd645c1ffa0",
"created_at": 1750464191,
"tokens": null,
"indexing_status": "waiting",
"completed_at": null,
"updated_at": 1750464191,
"indexing_latency": null,
"error": null,
"enabled": true,
"disabled_at": null,
"disabled_by": null,
"archived": false,
"segment_count": 0,
"average_segment_length": 0,
"hit_count": null,
"display_status": "queuing",
"doc_form": "hierarchical_model",
"doc_language": "Chinese Simplified"
}
```
</CodeGroup>
</Col>
</Row>
___
<hr className='ml-0 mr-0' />
<Heading <Heading
url='/datasets/{dataset_id}/documents/status/{action}' url='/datasets/{dataset_id}/documents/status/{action}'
method='PATCH' method='PATCH'

View File

@@ -1131,6 +1131,130 @@ import { Row, Col, Properties, Property, Heading, SubProperty, PropertyInstructi
<hr className='ml-0 mr-0' /> <hr className='ml-0 mr-0' />
<Heading
url='/datasets/{dataset_id}/documents/{document_id}'
method='GET'
title='获取文档详情'
name='#get-document-detail'
/>
<Row>
<Col>
获取文档详情。
### Path
- `dataset_id` (string) 知识库 ID
- `document_id` (string) 文档 ID
### Query
- `metadata` (string) metadata 过滤条件,可选值为 `all`、`only` 或 `without`,默认为 `all`。
### Response
返回知识库文档的详情。
</Col>
<Col sticky>
### Request Example
<CodeGroup title="Request" tag="GET" label="/datasets/{dataset_id}/documents/{document_id}" targetCode={`curl -X GET '${props.apiBaseUrl}/datasets/{dataset_id}/documents/{document_id}' \\\n-H 'Authorization: Bearer {api_key}'`}>
```bash {{ title: 'cURL' }}
curl -X GET '${props.apiBaseUrl}/datasets/{dataset_id}/documents/{document_id}' \
-H 'Authorization: Bearer {api_key}'
```
</CodeGroup>
### Response Example
<CodeGroup title="Response">
```json {{ title: 'Response' }}
{
"id": "f46ae30c-5c11-471b-96d0-464f5f32a7b2",
"position": 1,
"data_source_type": "upload_file",
"data_source_info": {
"upload_file": {
...
}
},
"dataset_process_rule_id": "24b99906-845e-499f-9e3c-d5565dd6962c",
"dataset_process_rule": {
"mode": "hierarchical",
"rules": {
"pre_processing_rules": [
{
"id": "remove_extra_spaces",
"enabled": true
},
{
"id": "remove_urls_emails",
"enabled": false
}
],
"segmentation": {
"separator": "**********page_ending**********",
"max_tokens": 1024,
"chunk_overlap": 0
},
"parent_mode": "paragraph",
"subchunk_segmentation": {
"separator": "\n",
"max_tokens": 512,
"chunk_overlap": 0
}
}
},
"document_process_rule": {
"id": "24b99906-845e-499f-9e3c-d5565dd6962c",
"dataset_id": "48a0db76-d1a9-46c1-ae35-2baaa919a8a9",
"mode": "hierarchical",
"rules": {
"pre_processing_rules": [
{
"id": "remove_extra_spaces",
"enabled": true
},
{
"id": "remove_urls_emails",
"enabled": false
}
],
"segmentation": {
"separator": "**********page_ending**********",
"max_tokens": 1024,
"chunk_overlap": 0
},
"parent_mode": "paragraph",
"subchunk_segmentation": {
"separator": "\n",
"max_tokens": 512,
"chunk_overlap": 0
}
}
},
"name": "xxxx",
"created_from": "web",
"created_by": "17f71940-a7b5-4c77-b60f-2bd645c1ffa0",
"created_at": 1750464191,
"tokens": null,
"indexing_status": "waiting",
"completed_at": null,
"updated_at": 1750464191,
"indexing_latency": null,
"error": null,
"enabled": true,
"disabled_at": null,
"disabled_by": null,
"archived": false,
"segment_count": 0,
"average_segment_length": 0,
"hit_count": null,
"display_status": "queuing",
"doc_form": "hierarchical_model",
"doc_language": "Chinese Simplified"
}
```
</CodeGroup>
</Col>
</Row>
___
<hr className='ml-0 mr-0' />
<Heading <Heading
url='/datasets/{dataset_id}/documents/status/{action}' url='/datasets/{dataset_id}/documents/status/{action}'
method='PATCH' method='PATCH'