class DocumentDetailApi(DatasetApiResource):
    """Service API resource returning the detail of one document in a dataset.

    The ``metadata`` query parameter selects the shape of the response:

    * ``all`` (default) - every document field, including ``doc_type`` and
      ``doc_metadata``.
    * ``only`` - just ``id``, ``doc_type`` and ``doc_metadata``.
    * ``without`` - every document field except ``doc_type`` and
      ``doc_metadata``.
    """

    # Accepted values of the ``metadata`` query parameter.
    METADATA_CHOICES = {"all", "only", "without"}

    @staticmethod
    def _epoch_seconds(moment):
        """Return ``moment`` as whole epoch seconds, or ``None`` when it is unset."""
        return int(moment.timestamp()) if moment else None

    @staticmethod
    def _metadata_fields(document):
        """Return the metadata-only part of the response for ``document``."""
        return {
            "doc_type": document.doc_type,
            "doc_metadata": document.doc_metadata_details,
        }

    @classmethod
    def _document_fields(cls, document, dataset_id, *, include_metadata):
        """Build the full detail payload, optionally with the metadata fields.

        Key order matches the order the endpoint has always emitted: the
        metadata fields, when requested, sit between ``archived`` and
        ``segment_count``.
        """
        process_rule = document.dataset_process_rule
        detail = {
            "id": document.id,
            "position": document.position,
            "data_source_type": document.data_source_type,
            "data_source_info": document.data_source_detail_dict,
            "dataset_process_rule_id": document.dataset_process_rule_id,
            "dataset_process_rule": DatasetService.get_process_rules(dataset_id),
            # A document may have no process rule attached; report an empty
            # object instead of failing with AttributeError on ``None``.
            "document_process_rule": process_rule.to_dict() if process_rule else {},
            "name": document.name,
            "created_from": document.created_from,
            "created_by": document.created_by,
            # Whole seconds, like every other timestamp in this payload.
            "created_at": cls._epoch_seconds(document.created_at),
            "tokens": document.tokens,
            "indexing_status": document.indexing_status,
            "completed_at": cls._epoch_seconds(document.completed_at),
            "updated_at": cls._epoch_seconds(document.updated_at),
            "indexing_latency": document.indexing_latency,
            "error": document.error,
            "enabled": document.enabled,
            "disabled_at": cls._epoch_seconds(document.disabled_at),
            "disabled_by": document.disabled_by,
            "archived": document.archived,
        }
        if include_metadata:
            detail.update(cls._metadata_fields(document))
        detail.update(
            {
                "segment_count": document.segment_count,
                "average_segment_length": document.average_segment_length,
                "hit_count": document.hit_count,
                "display_status": document.display_status,
                "doc_form": document.doc_form,
                "doc_language": document.doc_language,
            }
        )
        return detail

    def get(self, tenant_id, dataset_id, document_id):
        """Return the detail of ``document_id`` inside ``dataset_id``.

        Args:
            tenant_id: Tenant the request is made for.
            dataset_id: ID of the dataset that owns the document.
            document_id: ID of the document to describe.

        Returns:
            A dict whose shape depends on the ``metadata`` query parameter
            (see the class docstring).

        Raises:
            NotFound: The dataset or the document does not exist.
            Forbidden: The document belongs to a different tenant.
            InvalidMetadataError: ``metadata`` is not one of
                ``METADATA_CHOICES``.
        """
        dataset_id = str(dataset_id)
        document_id = str(document_id)

        # get_dataset raises NotFound when the tenant has no such dataset.
        dataset = self.get_dataset(dataset_id, tenant_id)

        document = DocumentService.get_document(dataset.id, document_id)
        if not document:
            raise NotFound("Document not found.")

        if document.tenant_id != str(tenant_id):
            raise Forbidden("No permission.")

        metadata = request.args.get("metadata", "all")
        if metadata not in self.METADATA_CHOICES:
            raise InvalidMetadataError(f"Invalid metadata value: {metadata}")

        if metadata == "only":
            return {"id": document.id, **self._metadata_fields(document)}

        return self._document_fields(
            document,
            dataset_id,
            include_metadata=(metadata == "all"),
        )
class DatasetApiResource(Resource):
    """Base resource for dataset service-API endpoints.

    Every HTTP method handler is wrapped with ``validate_dataset_token``.
    """

    method_decorators = [validate_dataset_token]

    def get_dataset(self, dataset_id: str, tenant_id: str) -> Dataset:
        """Fetch the dataset with ``dataset_id`` that belongs to ``tenant_id``.

        Args:
            dataset_id: ID of the dataset to look up.
            tenant_id: Tenant the dataset must belong to.

        Returns:
            The first matching ``Dataset`` row.

        Raises:
            NotFound: No dataset matches both the ID and the tenant.
        """
        # Both conditions must hold, so a dataset of another tenant is
        # reported the same way as a missing one.
        owned_by_tenant = (Dataset.id == dataset_id, Dataset.tenant_id == tenant_id)
        match = db.session.query(Dataset).filter(*owned_by_tenant).first()

        if match:
            return match
        raise NotFound("Dataset not found.")
+ + + + Get the details of a single document. + ### Path + - `dataset_id` (string) Dataset ID + - `document_id` (string) Document ID + + ### Query + - `metadata` (string) Metadata filter, one of `all`, `only`, or `without`. `all` returns every field including `doc_type` and `doc_metadata`; `only` returns just `id`, `doc_type` and `doc_metadata`; `without` returns every field except `doc_type` and `doc_metadata`. Default is `all`. + + ### Response + Returns the document's details. + + + ### Request Example + + ```bash {{ title: 'cURL' }} + curl -X GET '${props.apiBaseUrl}/datasets/{dataset_id}/documents/{document_id}' \ + -H 'Authorization: Bearer {api_key}' + ``` + + + ### Response Example + + ```json {{ title: 'Response' }} + { + "id": "f46ae30c-5c11-471b-96d0-464f5f32a7b2", + "position": 1, + "data_source_type": "upload_file", + "data_source_info": { + "upload_file": { + ... + } + }, + "dataset_process_rule_id": "24b99906-845e-499f-9e3c-d5565dd6962c", + "dataset_process_rule": { + "mode": "hierarchical", + "rules": { + "pre_processing_rules": [ + { + "id": "remove_extra_spaces", + "enabled": true + }, + { + "id": "remove_urls_emails", + "enabled": false + } + ], + "segmentation": { + "separator": "**********page_ending**********", + "max_tokens": 1024, + "chunk_overlap": 0 + }, + "parent_mode": "paragraph", + "subchunk_segmentation": { + "separator": "\n", + "max_tokens": 512, + "chunk_overlap": 0 + } + } + }, + "document_process_rule": { + "id": "24b99906-845e-499f-9e3c-d5565dd6962c", + "dataset_id": "48a0db76-d1a9-46c1-ae35-2baaa919a8a9", + "mode": "hierarchical", + "rules": { + "pre_processing_rules": [ + { + "id": "remove_extra_spaces", + "enabled": true + }, + { + "id": "remove_urls_emails", + "enabled": false + } + ], + "segmentation": { + "separator": "**********page_ending**********", + "max_tokens": 1024, + "chunk_overlap": 0 + }, + "parent_mode": "paragraph", + "subchunk_segmentation": { + "separator": "\n", + "max_tokens": 512, + "chunk_overlap": 0 + } + } + }, + "name": "xxxx", + "created_from": "web", + "created_by": "17f71940-a7b5-4c77-b60f-2bd645c1ffa0", + "created_at": 1750464191, + "tokens": null, + "indexing_status": "waiting", + "completed_at":
null, + "updated_at": 1750464191, + "indexing_latency": null, + "error": null, + "enabled": true, + "disabled_at": null, + "disabled_by": null, + "archived": false, + "segment_count": 0, + "average_segment_length": 0, + "hit_count": null, + "display_status": "queuing", + "doc_form": "hierarchical_model", + "doc_language": "Chinese Simplified" + } + ``` + + + +___ +
+ + + + + ドキュメントの詳細を取得. + ### Path + - `dataset_id` (string) ナレッジベースID + - `document_id` (string) ドキュメントID + + ### Query + - `metadata` (string) metadataのフィルター条件 `all`、`only`、または`without`。デフォルトは `all`。 + + ### Response + ナレッジベースドキュメントの詳細を返す. + + + ### Request Example + + ```bash {{ title: 'cURL' }} + curl -X GET '${props.apiBaseUrl}/datasets/{dataset_id}/documents/{document_id}' \ + -H 'Authorization: Bearer {api_key}' + ``` + + + ### Response Example + + ```json {{ title: 'Response' }} + { + "id": "f46ae30c-5c11-471b-96d0-464f5f32a7b2", + "position": 1, + "data_source_type": "upload_file", + "data_source_info": { + "upload_file": { + ... + } + }, + "dataset_process_rule_id": "24b99906-845e-499f-9e3c-d5565dd6962c", + "dataset_process_rule": { + "mode": "hierarchical", + "rules": { + "pre_processing_rules": [ + { + "id": "remove_extra_spaces", + "enabled": true + }, + { + "id": "remove_urls_emails", + "enabled": false + } + ], + "segmentation": { + "separator": "**********page_ending**********", + "max_tokens": 1024, + "chunk_overlap": 0 + }, + "parent_mode": "paragraph", + "subchunk_segmentation": { + "separator": "\n", + "max_tokens": 512, + "chunk_overlap": 0 + } + } + }, + "document_process_rule": { + "id": "24b99906-845e-499f-9e3c-d5565dd6962c", + "dataset_id": "48a0db76-d1a9-46c1-ae35-2baaa919a8a9", + "mode": "hierarchical", + "rules": { + "pre_processing_rules": [ + { + "id": "remove_extra_spaces", + "enabled": true + }, + { + "id": "remove_urls_emails", + "enabled": false + } + ], + "segmentation": { + "separator": "**********page_ending**********", + "max_tokens": 1024, + "chunk_overlap": 0 + }, + "parent_mode": "paragraph", + "subchunk_segmentation": { + "separator": "\n", + "max_tokens": 512, + "chunk_overlap": 0 + } + } + }, + "name": "xxxx", + "created_from": "web", + "created_by": "17f71940-a7b5-4c77-b60f-2bd645c1ffa0", + "created_at": 1750464191, + "tokens": null, + "indexing_status": "waiting", + "completed_at": null, + "updated_at": 1750464191, + 
"indexing_latency": null, + "error": null, + "enabled": true, + "disabled_at": null, + "disabled_by": null, + "archived": false, + "segment_count": 0, + "average_segment_length": 0, + "hit_count": null, + "display_status": "queuing", + "doc_form": "hierarchical_model", + "doc_language": "Chinese Simplified" + } + ``` + + + +___ +
+ + + + + + 获取文档详情. + ### Path + - `dataset_id` (string) 知识库 ID + - `document_id` (string) 文档 ID + + ### Query + - `metadata` (string) metadata 过滤条件 `all`, `only`, 或者 `without`. 默认是 `all`. + + ### Response + 返回知识库文档的详情. + + + ### Request Example + + ```bash {{ title: 'cURL' }} + curl -X GET '${props.apiBaseUrl}/datasets/{dataset_id}/documents/{document_id}' \ + -H 'Authorization: Bearer {api_key}' + ``` + + + ### Response Example + + ```json {{ title: 'Response' }} + { + "id": "f46ae30c-5c11-471b-96d0-464f5f32a7b2", + "position": 1, + "data_source_type": "upload_file", + "data_source_info": { + "upload_file": { + ... + } + }, + "dataset_process_rule_id": "24b99906-845e-499f-9e3c-d5565dd6962c", + "dataset_process_rule": { + "mode": "hierarchical", + "rules": { + "pre_processing_rules": [ + { + "id": "remove_extra_spaces", + "enabled": true + }, + { + "id": "remove_urls_emails", + "enabled": false + } + ], + "segmentation": { + "separator": "**********page_ending**********", + "max_tokens": 1024, + "chunk_overlap": 0 + }, + "parent_mode": "paragraph", + "subchunk_segmentation": { + "separator": "\n", + "max_tokens": 512, + "chunk_overlap": 0 + } + } + }, + "document_process_rule": { + "id": "24b99906-845e-499f-9e3c-d5565dd6962c", + "dataset_id": "48a0db76-d1a9-46c1-ae35-2baaa919a8a9", + "mode": "hierarchical", + "rules": { + "pre_processing_rules": [ + { + "id": "remove_extra_spaces", + "enabled": true + }, + { + "id": "remove_urls_emails", + "enabled": false + } + ], + "segmentation": { + "separator": "**********page_ending**********", + "max_tokens": 1024, + "chunk_overlap": 0 + }, + "parent_mode": "paragraph", + "subchunk_segmentation": { + "separator": "\n", + "max_tokens": 512, + "chunk_overlap": 0 + } + } + }, + "name": "xxxx", + "created_from": "web", + "created_by": "17f71940-a7b5-4c77-b60f-2bd645c1ffa0", + "created_at": 1750464191, + "tokens": null, + "indexing_status": "waiting", + "completed_at": null, + "updated_at": 1750464191, + "indexing_latency": 
null, + "error": null, + "enabled": true, + "disabled_at": null, + "disabled_by": null, + "archived": false, + "segment_count": 0, + "average_segment_length": 0, + "hit_count": null, + "display_status": "queuing", + "doc_form": "hierarchical_model", + "doc_language": "Chinese Simplified" + } + ``` + + + +___ +
+ +