Add get document detail service api (#21700)
Co-authored-by: lizb <lizb@sugon.com>
This commit is contained in:
@@ -3,7 +3,7 @@ import json
|
|||||||
from flask import request
|
from flask import request
|
||||||
from flask_restful import marshal, reqparse
|
from flask_restful import marshal, reqparse
|
||||||
from sqlalchemy import desc, select
|
from sqlalchemy import desc, select
|
||||||
from werkzeug.exceptions import NotFound
|
from werkzeug.exceptions import Forbidden, NotFound
|
||||||
|
|
||||||
import services
|
import services
|
||||||
from controllers.common.errors import FilenameNotExistsError
|
from controllers.common.errors import FilenameNotExistsError
|
||||||
@@ -18,6 +18,7 @@ from controllers.service_api.app.error import (
|
|||||||
from controllers.service_api.dataset.error import (
|
from controllers.service_api.dataset.error import (
|
||||||
ArchivedDocumentImmutableError,
|
ArchivedDocumentImmutableError,
|
||||||
DocumentIndexingError,
|
DocumentIndexingError,
|
||||||
|
InvalidMetadataError,
|
||||||
)
|
)
|
||||||
from controllers.service_api.wraps import (
|
from controllers.service_api.wraps import (
|
||||||
DatasetApiResource,
|
DatasetApiResource,
|
||||||
@@ -466,6 +467,101 @@ class DocumentIndexingStatusApi(DatasetApiResource):
|
|||||||
return data
|
return data
|
||||||
|
|
||||||
|
|
||||||
|
class DocumentDetailApi(DatasetApiResource):
    """Service API resource that returns the detail of a single dataset document."""

    # Accepted values of the ``metadata`` query parameter.
    METADATA_CHOICES = {"all", "only", "without"}

    def get(self, tenant_id, dataset_id, document_id):
        """Return the detail of one document, shaped by the ``metadata`` query arg.

        Args:
            tenant_id: Tenant the caller is acting for.
            dataset_id: ID of the dataset that owns the document.
            document_id: ID of the document to describe.

        Returns:
            A dict. With ``metadata=only`` it holds just ``id``, ``doc_type`` and
            ``doc_metadata``; with ``metadata=without`` it holds the document
            detail minus those two metadata keys; with ``metadata=all`` (the
            default) it holds the detail including them.

        Raises:
            NotFound: If the dataset lookup fails (raised by ``get_dataset``) or
                the document service returns no document.
            Forbidden: If the document's tenant differs from ``tenant_id``.
            InvalidMetadataError: If ``metadata`` is not in ``METADATA_CHOICES``.
        """
        dataset_id = str(dataset_id)
        document_id = str(document_id)

        dataset = self.get_dataset(dataset_id, tenant_id)

        document = DocumentService.get_document(dataset.id, document_id)
        if not document:
            raise NotFound("Document not found.")
        if document.tenant_id != str(tenant_id):
            raise Forbidden("No permission.")

        metadata = request.args.get("metadata", "all")
        if metadata not in self.METADATA_CHOICES:
            raise InvalidMetadataError(f"Invalid metadata value: {metadata}")

        if metadata == "only":
            return {
                "id": document.id,
                "doc_type": document.doc_type,
                "doc_metadata": document.doc_metadata_details,
            }

        # "without" and "all" share one payload; only the two metadata keys differ.
        return self._build_detail(document, dataset_id, include_metadata=(metadata == "all"))

    @staticmethod
    def _epoch_seconds(moment):
        """Convert an optional datetime to whole epoch seconds (``None`` stays ``None``)."""
        return int(moment.timestamp()) if moment else None

    def _build_detail(self, document, dataset_id, *, include_metadata):
        """Assemble the detail payload, adding the metadata keys only on request."""
        dataset_process_rules = DatasetService.get_process_rules(dataset_id)

        # Read the rule once and guard against a missing one so a document
        # without an attached rule yields an empty dict instead of an
        # AttributeError (HTTP 500).
        process_rule = document.dataset_process_rule
        document_process_rules = process_rule.to_dict() if process_rule else {}

        data_source_info = document.data_source_detail_dict

        response = {
            "id": document.id,
            "position": document.position,
            "data_source_type": document.data_source_type,
            "data_source_info": data_source_info,
            "dataset_process_rule_id": document.dataset_process_rule_id,
            "dataset_process_rule": dataset_process_rules,
            "document_process_rule": document_process_rules,
            "name": document.name,
            "created_from": document.created_from,
            "created_by": document.created_by,
            # Whole seconds, consistent with the other timestamp fields below.
            "created_at": int(document.created_at.timestamp()),
            "tokens": document.tokens,
            "indexing_status": document.indexing_status,
            "completed_at": self._epoch_seconds(document.completed_at),
            "updated_at": self._epoch_seconds(document.updated_at),
            "indexing_latency": document.indexing_latency,
            "error": document.error,
            "enabled": document.enabled,
            "disabled_at": self._epoch_seconds(document.disabled_at),
            "disabled_by": document.disabled_by,
            "archived": document.archived,
        }

        # Inserted here so the key order matches the original "all" payload.
        if include_metadata:
            response["doc_type"] = document.doc_type
            response["doc_metadata"] = document.doc_metadata_details

        response.update(
            {
                "segment_count": document.segment_count,
                "average_segment_length": document.average_segment_length,
                "hit_count": document.hit_count,
                "display_status": document.display_status,
                "doc_form": document.doc_form,
                "doc_language": document.doc_language,
            }
        )
        return response
|
||||||
|
|
||||||
|
|
||||||
api.add_resource(
|
api.add_resource(
|
||||||
DocumentAddByTextApi,
|
DocumentAddByTextApi,
|
||||||
"/datasets/<uuid:dataset_id>/document/create_by_text",
|
"/datasets/<uuid:dataset_id>/document/create_by_text",
|
||||||
@@ -489,3 +585,4 @@ api.add_resource(
|
|||||||
api.add_resource(DocumentDeleteApi, "/datasets/<uuid:dataset_id>/documents/<uuid:document_id>")
|
api.add_resource(DocumentDeleteApi, "/datasets/<uuid:dataset_id>/documents/<uuid:document_id>")
|
||||||
api.add_resource(DocumentListApi, "/datasets/<uuid:dataset_id>/documents")
|
api.add_resource(DocumentListApi, "/datasets/<uuid:dataset_id>/documents")
|
||||||
api.add_resource(DocumentIndexingStatusApi, "/datasets/<uuid:dataset_id>/documents/<string:batch>/indexing-status")
|
api.add_resource(DocumentIndexingStatusApi, "/datasets/<uuid:dataset_id>/documents/<string:batch>/indexing-status")
|
||||||
|
# NOTE(review): this URL rule is identical to the one registered for
# DocumentDeleteApi above, so the two resources are presumably told apart by
# HTTP method (DocumentDetailApi defines GET) — confirm they expose no
# overlapping methods.
api.add_resource(DocumentDetailApi, "/datasets/<uuid:dataset_id>/documents/<uuid:document_id>")
|
||||||
|
@@ -11,13 +11,13 @@ from flask_restful import Resource
|
|||||||
from pydantic import BaseModel
|
from pydantic import BaseModel
|
||||||
from sqlalchemy import select, update
|
from sqlalchemy import select, update
|
||||||
from sqlalchemy.orm import Session
|
from sqlalchemy.orm import Session
|
||||||
from werkzeug.exceptions import Forbidden, Unauthorized
|
from werkzeug.exceptions import Forbidden, NotFound, Unauthorized
|
||||||
|
|
||||||
from extensions.ext_database import db
|
from extensions.ext_database import db
|
||||||
from extensions.ext_redis import redis_client
|
from extensions.ext_redis import redis_client
|
||||||
from libs.login import _get_user
|
from libs.login import _get_user
|
||||||
from models.account import Account, Tenant, TenantAccountJoin, TenantStatus
|
from models.account import Account, Tenant, TenantAccountJoin, TenantStatus
|
||||||
from models.dataset import RateLimitLog
|
from models.dataset import Dataset, RateLimitLog
|
||||||
from models.model import ApiToken, App, EndUser
|
from models.model import ApiToken, App, EndUser
|
||||||
from services.feature_service import FeatureService
|
from services.feature_service import FeatureService
|
||||||
|
|
||||||
@@ -317,3 +317,11 @@ def create_or_update_end_user_for_user_id(app_model: App, user_id: Optional[str]
|
|||||||
|
|
||||||
class DatasetApiResource(Resource):
|
class DatasetApiResource(Resource):
|
||||||
method_decorators = [validate_dataset_token]
|
method_decorators = [validate_dataset_token]
|
||||||
|
|
||||||
|
def get_dataset(self, dataset_id: str, tenant_id: str) -> Dataset:
    """Look up a dataset by ID, restricted to the given tenant.

    Args:
        dataset_id: ID of the dataset to load.
        tenant_id: Tenant the dataset must belong to.

    Returns:
        The first dataset row matching both the ID and the tenant.

    Raises:
        NotFound: If no dataset matches both conditions.
    """
    # Both conditions go into one query, so a dataset owned by another tenant
    # is reported exactly like a missing one.
    matches = db.session.query(Dataset).filter(
        Dataset.id == dataset_id,
        Dataset.tenant_id == tenant_id,
    )
    found = matches.first()
    if found:
        return found
    raise NotFound("Dataset not found.")
|
||||||
|
@@ -1124,6 +1124,129 @@ import { Row, Col, Properties, Property, Heading, SubProperty, PropertyInstructi
|
|||||||
|
|
||||||
<hr className='ml-0 mr-0' />
|
<hr className='ml-0 mr-0' />
|
||||||
|
|
||||||
|
<Heading
|
||||||
|
url='/datasets/{dataset_id}/documents/{document_id}'
|
||||||
|
method='GET'
|
||||||
|
title='Get Document Detail'
|
||||||
|
name='#get-document-detail'
|
||||||
|
/>
|
||||||
|
<Row>
|
||||||
|
<Col>
|
||||||
|
Get a document's detail.
|
||||||
|
### Path
|
||||||
|
- `dataset_id` (string) Dataset ID
|
||||||
|
- `document_id` (string) Document ID
|
||||||
|
|
||||||
|
### Query
|
||||||
|
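- `metadata` (string) Metadata filter; one of `all`, `only`, or `without`. `all` (default) returns the document detail including `doc_type` and `doc_metadata`; `only` returns just `id`, `doc_type`, and `doc_metadata`; `without` returns the document detail with `doc_type` and `doc_metadata` omitted.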
|
||||||
|
|
||||||
|
### Response
|
||||||
|
Returns the document's detail.
|
||||||
|
</Col>
|
||||||
|
<Col sticky>
|
||||||
|
### Request Example
|
||||||
|
<CodeGroup title="Request" tag="GET" label="/datasets/{dataset_id}/documents/{document_id}" targetCode={`curl -X GET '${props.apiBaseUrl}/datasets/{dataset_id}/documents/{document_id}' \\\n-H 'Authorization: Bearer {api_key}'`}>
|
||||||
|
```bash {{ title: 'cURL' }}
|
||||||
|
curl -X GET '${props.apiBaseUrl}/datasets/{dataset_id}/documents/{document_id}' \
|
||||||
|
-H 'Authorization: Bearer {api_key}'
|
||||||
|
```
|
||||||
|
</CodeGroup>
|
||||||
|
|
||||||
|
### Response Example
|
||||||
|
<CodeGroup title="Response">
|
||||||
|
```json {{ title: 'Response' }}
|
||||||
|
{
|
||||||
|
"id": "f46ae30c-5c11-471b-96d0-464f5f32a7b2",
|
||||||
|
"position": 1,
|
||||||
|
"data_source_type": "upload_file",
|
||||||
|
"data_source_info": {
|
||||||
|
"upload_file": {
|
||||||
|
...
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"dataset_process_rule_id": "24b99906-845e-499f-9e3c-d5565dd6962c",
|
||||||
|
"dataset_process_rule": {
|
||||||
|
"mode": "hierarchical",
|
||||||
|
"rules": {
|
||||||
|
"pre_processing_rules": [
|
||||||
|
{
|
||||||
|
"id": "remove_extra_spaces",
|
||||||
|
"enabled": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "remove_urls_emails",
|
||||||
|
"enabled": false
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"segmentation": {
|
||||||
|
"separator": "**********page_ending**********",
|
||||||
|
"max_tokens": 1024,
|
||||||
|
"chunk_overlap": 0
|
||||||
|
},
|
||||||
|
"parent_mode": "paragraph",
|
||||||
|
"subchunk_segmentation": {
|
||||||
|
"separator": "\n",
|
||||||
|
"max_tokens": 512,
|
||||||
|
"chunk_overlap": 0
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"document_process_rule": {
|
||||||
|
"id": "24b99906-845e-499f-9e3c-d5565dd6962c",
|
||||||
|
"dataset_id": "48a0db76-d1a9-46c1-ae35-2baaa919a8a9",
|
||||||
|
"mode": "hierarchical",
|
||||||
|
"rules": {
|
||||||
|
"pre_processing_rules": [
|
||||||
|
{
|
||||||
|
"id": "remove_extra_spaces",
|
||||||
|
"enabled": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "remove_urls_emails",
|
||||||
|
"enabled": false
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"segmentation": {
|
||||||
|
"separator": "**********page_ending**********",
|
||||||
|
"max_tokens": 1024,
|
||||||
|
"chunk_overlap": 0
|
||||||
|
},
|
||||||
|
"parent_mode": "paragraph",
|
||||||
|
"subchunk_segmentation": {
|
||||||
|
"separator": "\n",
|
||||||
|
"max_tokens": 512,
|
||||||
|
"chunk_overlap": 0
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"name": "xxxx",
|
||||||
|
"created_from": "web",
|
||||||
|
"created_by": "17f71940-a7b5-4c77-b60f-2bd645c1ffa0",
|
||||||
|
"created_at": 1750464191,
|
||||||
|
"tokens": null,
|
||||||
|
"indexing_status": "waiting",
|
||||||
|
"completed_at": null,
|
||||||
|
"updated_at": 1750464191,
|
||||||
|
"indexing_latency": null,
|
||||||
|
"error": null,
|
||||||
|
"enabled": true,
|
||||||
|
"disabled_at": null,
|
||||||
|
"disabled_by": null,
|
||||||
|
"archived": false,
|
||||||
|
"segment_count": 0,
|
||||||
|
"average_segment_length": 0,
|
||||||
|
"hit_count": null,
|
||||||
|
"display_status": "queuing",
|
||||||
|
"doc_form": "hierarchical_model",
|
||||||
|
"doc_language": "Chinese Simplified"
|
||||||
|
}
|
||||||
|
```
|
||||||
|
</CodeGroup>
|
||||||
|
</Col>
|
||||||
|
</Row>
|
||||||
|
___
|
||||||
|
<hr className='ml-0 mr-0' />
|
||||||
|
|
||||||
<Heading
|
<Heading
|
||||||
url='/datasets/{dataset_id}/documents/status/{action}'
|
url='/datasets/{dataset_id}/documents/status/{action}'
|
||||||
method='PATCH'
|
method='PATCH'
|
||||||
|
@@ -881,6 +881,130 @@ import { Row, Col, Properties, Property, Heading, SubProperty, PropertyInstructi
|
|||||||
|
|
||||||
<hr className='ml-0 mr-0' />
|
<hr className='ml-0 mr-0' />
|
||||||
|
|
||||||
|
<Heading
|
||||||
|
url='/datasets/{dataset_id}/documents/{document_id}'
|
||||||
|
method='GET'
|
||||||
|
title='ドキュメントの詳細を取得'
|
||||||
|
name='#get-document-detail'
|
||||||
|
/>
|
||||||
|
<Row>
|
||||||
|
<Col>
|
||||||
|
ドキュメントの詳細を取得します。
|
||||||
|
### Path
|
||||||
|
- `dataset_id` (string) ナレッジベースID
|
||||||
|
- `document_id` (string) ドキュメントID
|
||||||
|
|
||||||
|
### Query
|
||||||
|
- `metadata` (string) メタデータのフィルター条件。`all`、`only`、または `without` を指定できます。デフォルトは `all` です。
|
||||||
|
|
||||||
|
### Response
|
||||||
|
ナレッジベースドキュメントの詳細を返します。
|
||||||
|
</Col>
|
||||||
|
<Col sticky>
|
||||||
|
### Request Example
|
||||||
|
<CodeGroup title="Request" tag="GET" label="/datasets/{dataset_id}/documents/{document_id}" targetCode={`curl -X GET '${props.apiBaseUrl}/datasets/{dataset_id}/documents/{document_id}' \\\n-H 'Authorization: Bearer {api_key}'`}>
|
||||||
|
```bash {{ title: 'cURL' }}
|
||||||
|
curl -X GET '${props.apiBaseUrl}/datasets/{dataset_id}/documents/{document_id}' \
|
||||||
|
-H 'Authorization: Bearer {api_key}'
|
||||||
|
```
|
||||||
|
</CodeGroup>
|
||||||
|
|
||||||
|
### Response Example
|
||||||
|
<CodeGroup title="Response">
|
||||||
|
```json {{ title: 'Response' }}
|
||||||
|
{
|
||||||
|
"id": "f46ae30c-5c11-471b-96d0-464f5f32a7b2",
|
||||||
|
"position": 1,
|
||||||
|
"data_source_type": "upload_file",
|
||||||
|
"data_source_info": {
|
||||||
|
"upload_file": {
|
||||||
|
...
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"dataset_process_rule_id": "24b99906-845e-499f-9e3c-d5565dd6962c",
|
||||||
|
"dataset_process_rule": {
|
||||||
|
"mode": "hierarchical",
|
||||||
|
"rules": {
|
||||||
|
"pre_processing_rules": [
|
||||||
|
{
|
||||||
|
"id": "remove_extra_spaces",
|
||||||
|
"enabled": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "remove_urls_emails",
|
||||||
|
"enabled": false
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"segmentation": {
|
||||||
|
"separator": "**********page_ending**********",
|
||||||
|
"max_tokens": 1024,
|
||||||
|
"chunk_overlap": 0
|
||||||
|
},
|
||||||
|
"parent_mode": "paragraph",
|
||||||
|
"subchunk_segmentation": {
|
||||||
|
"separator": "\n",
|
||||||
|
"max_tokens": 512,
|
||||||
|
"chunk_overlap": 0
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"document_process_rule": {
|
||||||
|
"id": "24b99906-845e-499f-9e3c-d5565dd6962c",
|
||||||
|
"dataset_id": "48a0db76-d1a9-46c1-ae35-2baaa919a8a9",
|
||||||
|
"mode": "hierarchical",
|
||||||
|
"rules": {
|
||||||
|
"pre_processing_rules": [
|
||||||
|
{
|
||||||
|
"id": "remove_extra_spaces",
|
||||||
|
"enabled": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "remove_urls_emails",
|
||||||
|
"enabled": false
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"segmentation": {
|
||||||
|
"separator": "**********page_ending**********",
|
||||||
|
"max_tokens": 1024,
|
||||||
|
"chunk_overlap": 0
|
||||||
|
},
|
||||||
|
"parent_mode": "paragraph",
|
||||||
|
"subchunk_segmentation": {
|
||||||
|
"separator": "\n",
|
||||||
|
"max_tokens": 512,
|
||||||
|
"chunk_overlap": 0
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"name": "xxxx",
|
||||||
|
"created_from": "web",
|
||||||
|
"created_by": "17f71940-a7b5-4c77-b60f-2bd645c1ffa0",
|
||||||
|
"created_at": 1750464191,
|
||||||
|
"tokens": null,
|
||||||
|
"indexing_status": "waiting",
|
||||||
|
"completed_at": null,
|
||||||
|
"updated_at": 1750464191,
|
||||||
|
"indexing_latency": null,
|
||||||
|
"error": null,
|
||||||
|
"enabled": true,
|
||||||
|
"disabled_at": null,
|
||||||
|
"disabled_by": null,
|
||||||
|
"archived": false,
|
||||||
|
"segment_count": 0,
|
||||||
|
"average_segment_length": 0,
|
||||||
|
"hit_count": null,
|
||||||
|
"display_status": "queuing",
|
||||||
|
"doc_form": "hierarchical_model",
|
||||||
|
"doc_language": "Chinese Simplified"
|
||||||
|
}
|
||||||
|
```
|
||||||
|
</CodeGroup>
|
||||||
|
</Col>
|
||||||
|
</Row>
|
||||||
|
___
|
||||||
|
<hr className='ml-0 mr-0' />
|
||||||
|
|
||||||
|
|
||||||
<Heading
|
<Heading
|
||||||
url='/datasets/{dataset_id}/documents/status/{action}'
|
url='/datasets/{dataset_id}/documents/status/{action}'
|
||||||
method='PATCH'
|
method='PATCH'
|
||||||
|
@@ -1131,6 +1131,130 @@ import { Row, Col, Properties, Property, Heading, SubProperty, PropertyInstructi
|
|||||||
|
|
||||||
<hr className='ml-0 mr-0' />
|
<hr className='ml-0 mr-0' />
|
||||||
|
|
||||||
|
<Heading
|
||||||
|
url='/datasets/{dataset_id}/documents/{document_id}'
|
||||||
|
method='GET'
|
||||||
|
title='获取文档详情'
|
||||||
|
name='#get-document-detail'
|
||||||
|
/>
|
||||||
|
<Row>
|
||||||
|
<Col>
|
||||||
|
获取文档详情。
|
||||||
|
### Path
|
||||||
|
- `dataset_id` (string) 知识库 ID
|
||||||
|
- `document_id` (string) 文档 ID
|
||||||
|
|
||||||
|
### Query
|
||||||
|
- `metadata` (string) metadata 过滤条件，可选值为 `all`、`only` 或 `without`，默认为 `all`。
|
||||||
|
|
||||||
|
### Response
|
||||||
|
返回知识库文档的详情。
|
||||||
|
</Col>
|
||||||
|
<Col sticky>
|
||||||
|
### Request Example
|
||||||
|
<CodeGroup title="Request" tag="GET" label="/datasets/{dataset_id}/documents/{document_id}" targetCode={`curl -X GET '${props.apiBaseUrl}/datasets/{dataset_id}/documents/{document_id}' \\\n-H 'Authorization: Bearer {api_key}'`}>
|
||||||
|
```bash {{ title: 'cURL' }}
|
||||||
|
curl -X GET '${props.apiBaseUrl}/datasets/{dataset_id}/documents/{document_id}' \
|
||||||
|
-H 'Authorization: Bearer {api_key}'
|
||||||
|
```
|
||||||
|
</CodeGroup>
|
||||||
|
|
||||||
|
### Response Example
|
||||||
|
<CodeGroup title="Response">
|
||||||
|
```json {{ title: 'Response' }}
|
||||||
|
{
|
||||||
|
"id": "f46ae30c-5c11-471b-96d0-464f5f32a7b2",
|
||||||
|
"position": 1,
|
||||||
|
"data_source_type": "upload_file",
|
||||||
|
"data_source_info": {
|
||||||
|
"upload_file": {
|
||||||
|
...
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"dataset_process_rule_id": "24b99906-845e-499f-9e3c-d5565dd6962c",
|
||||||
|
"dataset_process_rule": {
|
||||||
|
"mode": "hierarchical",
|
||||||
|
"rules": {
|
||||||
|
"pre_processing_rules": [
|
||||||
|
{
|
||||||
|
"id": "remove_extra_spaces",
|
||||||
|
"enabled": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "remove_urls_emails",
|
||||||
|
"enabled": false
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"segmentation": {
|
||||||
|
"separator": "**********page_ending**********",
|
||||||
|
"max_tokens": 1024,
|
||||||
|
"chunk_overlap": 0
|
||||||
|
},
|
||||||
|
"parent_mode": "paragraph",
|
||||||
|
"subchunk_segmentation": {
|
||||||
|
"separator": "\n",
|
||||||
|
"max_tokens": 512,
|
||||||
|
"chunk_overlap": 0
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"document_process_rule": {
|
||||||
|
"id": "24b99906-845e-499f-9e3c-d5565dd6962c",
|
||||||
|
"dataset_id": "48a0db76-d1a9-46c1-ae35-2baaa919a8a9",
|
||||||
|
"mode": "hierarchical",
|
||||||
|
"rules": {
|
||||||
|
"pre_processing_rules": [
|
||||||
|
{
|
||||||
|
"id": "remove_extra_spaces",
|
||||||
|
"enabled": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "remove_urls_emails",
|
||||||
|
"enabled": false
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"segmentation": {
|
||||||
|
"separator": "**********page_ending**********",
|
||||||
|
"max_tokens": 1024,
|
||||||
|
"chunk_overlap": 0
|
||||||
|
},
|
||||||
|
"parent_mode": "paragraph",
|
||||||
|
"subchunk_segmentation": {
|
||||||
|
"separator": "\n",
|
||||||
|
"max_tokens": 512,
|
||||||
|
"chunk_overlap": 0
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"name": "xxxx",
|
||||||
|
"created_from": "web",
|
||||||
|
"created_by": "17f71940-a7b5-4c77-b60f-2bd645c1ffa0",
|
||||||
|
"created_at": 1750464191,
|
||||||
|
"tokens": null,
|
||||||
|
"indexing_status": "waiting",
|
||||||
|
"completed_at": null,
|
||||||
|
"updated_at": 1750464191,
|
||||||
|
"indexing_latency": null,
|
||||||
|
"error": null,
|
||||||
|
"enabled": true,
|
||||||
|
"disabled_at": null,
|
||||||
|
"disabled_by": null,
|
||||||
|
"archived": false,
|
||||||
|
"segment_count": 0,
|
||||||
|
"average_segment_length": 0,
|
||||||
|
"hit_count": null,
|
||||||
|
"display_status": "queuing",
|
||||||
|
"doc_form": "hierarchical_model",
|
||||||
|
"doc_language": "Chinese Simplified"
|
||||||
|
}
|
||||||
|
```
|
||||||
|
</CodeGroup>
|
||||||
|
</Col>
|
||||||
|
</Row>
|
||||||
|
___
|
||||||
|
<hr className='ml-0 mr-0' />
|
||||||
|
|
||||||
|
|
||||||
<Heading
|
<Heading
|
||||||
url='/datasets/{dataset_id}/documents/status/{action}'
|
url='/datasets/{dataset_id}/documents/status/{action}'
|
||||||
method='PATCH'
|
method='PATCH'
|
||||||
|
Reference in New Issue
Block a user