From 1ea4459d9f741c849324924d7148bf412da3b0cb Mon Sep 17 00:00:00 2001 From: Dongyu Li <544104925@qq.com> Date: Fri, 30 May 2025 14:45:30 +0800 Subject: [PATCH] update knowledge base api (#20426) --- .../service_api/dataset/segment.py | 22 +++ api/core/rag/retrieval/dataset_retrieval.py | 3 + api/services/hit_testing_service.py | 28 +++- .../datasets/template/template.en.mdx | 139 +++++++++++++++--- .../datasets/template/template.ja.mdx | 135 +++++++++++++++-- .../datasets/template/template.zh.mdx | 106 ++++++++++++- 6 files changed, 397 insertions(+), 36 deletions(-) diff --git a/api/controllers/service_api/dataset/segment.py b/api/controllers/service_api/dataset/segment.py index ea4be4e51..337752275 100644 --- a/api/controllers/service_api/dataset/segment.py +++ b/api/controllers/service_api/dataset/segment.py @@ -208,6 +208,28 @@ class DatasetSegmentApi(DatasetApiResource): ) return {"data": marshal(updated_segment, segment_fields), "doc_form": document.doc_form}, 200 + def get(self, tenant_id, dataset_id, document_id, segment_id): + # check dataset + dataset_id = str(dataset_id) + tenant_id = str(tenant_id) + dataset = db.session.query(Dataset).filter(Dataset.tenant_id == tenant_id, Dataset.id == dataset_id).first() + if not dataset: + raise NotFound("Dataset not found.") + # check user's model setting + DatasetService.check_dataset_model_setting(dataset) + # check document + document_id = str(document_id) + document = DocumentService.get_document(dataset_id, document_id) + if not document: + raise NotFound("Document not found.") + # check segment + segment_id = str(segment_id) + segment = SegmentService.get_segment_by_id(segment_id=segment_id, tenant_id=current_user.current_tenant_id) + if not segment: + raise NotFound("Segment not found.") + + return {"data": marshal(segment, segment_fields), "doc_form": document.doc_form}, 200 + class ChildChunkApi(DatasetApiResource): """Resource for child chunks.""" diff --git a/api/core/rag/retrieval/dataset_retrieval.py b/api/core/rag/retrieval/dataset_retrieval.py index f66814842..697886052 100644 --- a/api/core/rag/retrieval/dataset_retrieval.py +++ b/api/core/rag/retrieval/dataset_retrieval.py @@ -937,6 +937,9 @@ class DatasetRetrieval: return metadata_filter_document_ids, metadata_condition def _replace_metadata_filter_value(self, text: str, inputs: dict) -> str: + if not inputs: + return text + def replacer(match): key = match.group(1) return str(inputs.get(key, f"{{{{{key}}}}}")) diff --git a/api/services/hit_testing_service.py b/api/services/hit_testing_service.py index 56e06cc33..519d5abca 100644 --- a/api/services/hit_testing_service.py +++ b/api/services/hit_testing_service.py @@ -2,8 +2,11 @@ import logging import time from typing import Any +from core.app.app_config.entities import ModelConfig +from core.model_runtime.entities import LLMMode from core.rag.datasource.retrieval_service import RetrievalService from core.rag.models.document import Document +from core.rag.retrieval.dataset_retrieval import DatasetRetrieval from core.rag.retrieval.retrieval_methods import RetrievalMethod from extensions.ext_database import db from models.account import Account @@ -34,7 +37,29 @@ class HitTestingService: # get retrieval model , if the model is not setting , using default if not retrieval_model: retrieval_model = dataset.retrieval_model or default_retrieval_model + document_ids_filter = None + metadata_filtering_conditions = retrieval_model.get("metadata_filtering_conditions", {}) + if metadata_filtering_conditions: + dataset_retrieval = DatasetRetrieval() + from core.app.app_config.entities import MetadataFilteringCondition + + metadata_filtering_conditions = MetadataFilteringCondition(**metadata_filtering_conditions) + + metadata_filter_document_ids, metadata_condition = dataset_retrieval.get_metadata_filter_condition( + dataset_ids=[dataset.id], + query=query, + metadata_filtering_mode="manual", + metadata_filtering_conditions=metadata_filtering_conditions, + inputs={}, + tenant_id="", + user_id="", + metadata_model_config=ModelConfig(provider="", name="", mode=LLMMode.CHAT, completion_params={}), + ) + if metadata_filter_document_ids: + document_ids_filter = metadata_filter_document_ids.get(dataset.id, []) + if metadata_condition and not document_ids_filter: + return cls.compact_retrieve_response(query, []) all_documents = RetrievalService.retrieve( retrieval_method=retrieval_model.get("search_method", "semantic_search"), dataset_id=dataset.id, @@ -48,6 +73,7 @@ class HitTestingService: else None, reranking_mode=retrieval_model.get("reranking_mode") or "reranking_model", weights=retrieval_model.get("weights", None), + document_ids_filter=document_ids_filter, ) end = time.perf_counter() @@ -99,7 +125,7 @@ class HitTestingService: return dict(cls.compact_external_retrieve_response(dataset, query, all_documents)) @classmethod - def compact_retrieve_response(cls, query: str, documents: list[Document]): + def compact_retrieve_response(cls, query: str, documents: list[Document]) -> dict[Any, Any]: records = RetrievalService.format_retrieval_documents(documents) return { diff --git a/web/app/(commonLayout)/datasets/template/template.en.mdx b/web/app/(commonLayout)/datasets/template/template.en.mdx index 3393c636c..1a0979b41 100644 --- a/web/app/(commonLayout)/datasets/template/template.en.mdx +++ b/web/app/(commonLayout)/datasets/template/template.en.mdx @@ -1298,6 +1298,76 @@ import { Row, Col, Properties, Property, Heading, SubProperty, PropertyInstructi
+ + + + Get details of a specific document segment in the specified knowledge base + + ### Path + + + Knowledge Base ID + + + Document ID + + + Segment ID + + + + + + ```bash {{ title: 'cURL' }} + curl --location --request GET '${props.apiBaseUrl}/datasets/{dataset_id}/documents/{document_id}/segments/{segment_id}' \ + --header 'Authorization: Bearer {api_key}' + ``` + + + ```json {{ title: 'Response' }} + { + "data": { + "id": "chunk_id", + "position": 2, + "document_id": "document_id", + "content": "Segment content text", + "sign_content": "Signature content text", + "answer": "Answer content (if in Q&A mode)", + "word_count": 470, + "tokens": 382, + "keywords": ["keyword1", "keyword2"], + "index_node_id": "index_node_id", + "index_node_hash": "index_node_hash", + "hit_count": 0, + "enabled": true, + "status": "completed", + "created_by": "creator_id", + "created_at": creation_timestamp, + "updated_at": update_timestamp, + "indexing_at": indexing_timestamp, + "completed_at": completion_timestamp, + "error": null, + "child_chunks": [] + }, + "doc_form": "text_model" + } + ``` + + + + +
+ - Retrieval model (optional, if not filled, it will be recalled according to the default method) - - search_method (text) Search method: One of the following four keywords is required - - keyword_search Keyword search - - semantic_search Semantic search - - full_text_search Full-text search - - hybrid_search Hybrid search - - reranking_enable (bool) Whether to enable reranking, required if the search mode is semantic_search or hybrid_search (optional) - - reranking_mode (object) Rerank model configuration, required if reranking is enabled - - reranking_provider_name (string) Rerank model provider - - reranking_model_name (string) Rerank model name - - weights (float) Semantic search weight setting in hybrid search mode - - top_k (integer) Number of results to return (optional) - - score_threshold_enabled (bool) Whether to enable score threshold - - score_threshold (float) Score threshold + Retrieval parameters (optional, if not filled, it will be recalled according to the default method) + - search_method (text) Search method: One of the following four keywords is required + - keyword_search Keyword search + - semantic_search Semantic search + - full_text_search Full-text search + - hybrid_search Hybrid search + - reranking_enable (bool) Whether to enable reranking, required if the search mode is semantic_search or hybrid_search (optional) + - reranking_mode (object) Rerank model configuration, required if reranking is enabled + - reranking_provider_name (string) Rerank model provider + - reranking_model_name (string) Rerank model name + - weights (float) Semantic search weight setting in hybrid search mode + - top_k (integer) Number of results to return (optional) + - score_threshold_enabled (bool) Whether to enable score threshold + - score_threshold (float) Score threshold + - metadata_filtering_conditions (object) Metadata filtering conditions + - logical_operator (string) Logical operator: and | or + - conditions (array[object]) Conditions list + - name (string) Metadata field name + - comparison_operator (string) Comparison operator, allowed values: + - String comparison: + - contains: Contains + - not contains: Does not contain + - start with: Starts with + - end with: Ends with + - is: Equals + - is not: Does not equal + - empty: Is empty + - not empty: Is not empty + - Numeric comparison: + - =: Equals + - : Does not equal + - >: Greater than + - < : Less than + - : Greater than or equal + - : Less than or equal + - Time comparison: + - before: Before + - after: After + - value (string|number|null) Comparison value Unused field @@ -1809,7 +1904,17 @@ import { Row, Col, Properties, Property, Heading, SubProperty, PropertyInstructi "weights": null, "top_k": 1, "score_threshold_enabled": false, - "score_threshold": null + "score_threshold": null, + "metadata_filtering_conditions": { + "logical_operator": "and", + "conditions": [ + { + "name": "document_name", + "comparison_operator": "contains", + "value": "test" + } + ] + } } }'`} > @@ -2089,9 +2194,7 @@ import { Row, Col, Properties, Property, Heading, SubProperty, PropertyInstructi label="/datasets/{dataset_id}/documents/metadata" targetCode={`curl --location --request POST '${props.apiBaseUrl}/datasets/{dataset_id}/documents/metadata' \\\n--header 'Authorization: Bearer {api_key}' \\\n--header 'Content-Type: application/json'\\\n--data-raw '{"operation_data": [{"document_id": "document_id", "metadata_list": [{"id": "id", "value": "value", "name": "name"}]}]}'`} > - ```bash {{ title: 'cURL' }} - ``` - + ```bash {{ title: 'cURL' }} diff --git a/web/app/(commonLayout)/datasets/template/template.ja.mdx b/web/app/(commonLayout)/datasets/template/template.ja.mdx index defd48816..f194c8012 100644 --- a/web/app/(commonLayout)/datasets/template/template.ja.mdx +++ b/web/app/(commonLayout)/datasets/template/template.ja.mdx @@ -1057,6 +1057,75 @@ import { Row, Col, Properties, Property, Heading, SubProperty, PropertyInstructi + + + 指定されたナレッジベース内の特定のドキュメントセグメントの詳細を表示します + + ### パス + + + ナレッジベースID + + + ドキュメントID + + + セグメントID + + + + + + ```bash {{ title: 'cURL' }} + curl --location --request GET '${props.apiBaseUrl}/datasets/{dataset_id}/documents/{document_id}/segments/{segment_id}' \ + --header 'Authorization: Bearer {api_key}' + ``` + + + ```json {{ title: 'Response' }} + { + "data": { + "id": "セグメントID", + "position": 2, + "document_id": "ドキュメントID", + "content": "セグメント内容テキスト", + "sign_content": "署名内容テキスト", + "answer": "回答内容(Q&Aモードの場合)", + "word_count": 470, + "tokens": 382, + "keywords": ["キーワード1", "キーワード2"], + "index_node_id": "インデックスノードID", + "index_node_hash": "インデックスノードハッシュ", + "hit_count": 0, + "enabled": true, + "status": "completed", + "created_by": "作成者ID", + "created_at": 作成タイムスタンプ, + "updated_at": 更新タイムスタンプ, + "indexing_at": インデックス作成タイムスタンプ, + "completed_at": 完了タイムスタンプ, + "error": null, + "child_chunks": [] + }, + "doc_form": "text_model" + } + ``` + + + + +
+ + - 検索モデル (オプション、入力されない場合はデフォルトの方法でリコールされます) - - search_method (text) 検索方法: 以下の 4 つのキーワードのいずれかが必要です - - keyword_search キーワード検索 - - semantic_search セマンティック検索 - - full_text_search 全文検索 - - hybrid_search ハイブリッド検索 - - reranking_enable (bool) 再ランキングを有効にするかどうか、検索モードが semantic_search または hybrid_search の場合に必須 (オプション) - - reranking_mode (object) 再ランキングモデル構成、再ランキングが有効な場合に必須 - - reranking_provider_name (string) 再ランキングモデルプロバイダー - - reranking_model_name (string) 再ランキングモデル名 - - weights (float) ハイブリッド検索モードでのセマンティック検索の重み設定 - - top_k (integer) 返される結果の数 (オプション) - - score_threshold_enabled (bool) スコア閾値を有効にするかどうか - - score_threshold (float) スコア閾値 + 検索パラメータ(オプション、入力されない場合はデフォルトの方法でリコールされます) + - search_method (text) 検索方法: 以下の4つのキーワードのいずれかが必要です + - keyword_search キーワード検索 + - semantic_search セマンティック検索 + - full_text_search 全文検索 + - hybrid_search ハイブリッド検索 + - reranking_enable (bool) 再ランキングを有効にするかどうか、検索モードがsemantic_searchまたはhybrid_searchの場合に必須(オプション) + - reranking_mode (object) 再ランキングモデル構成、再ランキングが有効な場合に必須 + - reranking_provider_name (string) 再ランキングモデルプロバイダー + - reranking_model_name (string) 再ランキングモデル名 + - weights (float) ハイブリッド検索モードでのセマンティック検索の重み設定 + - top_k (integer) 返される結果の数(オプション) + - score_threshold_enabled (bool) スコア閾値を有効にするかどうか + - score_threshold (float) スコア閾値 + - metadata_filtering_conditions (object) メタデータフィルタリング条件 + - logical_operator (string) 論理演算子: and | or + - conditions (array[object]) 条件リスト + - name (string) メタデータフィールド名 + - comparison_operator (string) 比較演算子、許可される値: + - 文字列比較: + - contains: 含む + - not contains: 含まない + - start with: で始まる + - end with: で終わる + - is: 等しい + - is not: 等しくない + - empty: 空 + - not empty: 空でない + - 数値比較: + - =: 等しい + - : 等しくない + - >: より大きい + - < : より小さい + - : 以上 + - : 以下 + - 時間比較: + - before: より前 + - after: より後 + - value (string|number|null) 比較値 未使用フィールド @@ -1566,7 +1659,17 @@ import { Row, Col, Properties, Property, Heading, SubProperty, PropertyInstructi "weights": null, "top_k": 1, "score_threshold_enabled": false, - "score_threshold": null + "score_threshold": null, + "metadata_filtering_conditions": { + "logical_operator": "and", + "conditions": [ + { + "name": "document_name", + "comparison_operator": "contains", + "value": "test" + } + ] + } } }'`} > diff --git a/web/app/(commonLayout)/datasets/template/template.zh.mdx b/web/app/(commonLayout)/datasets/template/template.zh.mdx index 04b583765..95bc4d6aa 100644 --- a/web/app/(commonLayout)/datasets/template/template.zh.mdx +++ b/web/app/(commonLayout)/datasets/template/template.zh.mdx @@ -1351,6 +1351,75 @@ import { Row, Col, Properties, Property, Heading, SubProperty, PropertyInstructi + + + 查看指定知识库中特定文档的分段详情 + + ### Path + + + 知识库 ID + + + 文档 ID + + + 分段 ID + + + + + + ```bash {{ title: 'cURL' }} + curl --location --request GET '${props.apiBaseUrl}/datasets/{dataset_id}/documents/{document_id}/segments/{segment_id}' \ + --header 'Authorization: Bearer {api_key}' + ``` + + + ```json {{ title: 'Response' }} + { + "data": { + "id": "分段唯一ID", + "position": 2, + "document_id": "所属文档ID", + "content": "分段内容文本", + "sign_content": "签名内容文本", + "answer": "答案内容(如果有)", + "word_count": 470, + "tokens": 382, + "keywords": ["关键词1", "关键词2"], + "index_node_id": "索引节点ID", + "index_node_hash": "索引节点哈希值", + "hit_count": 0, + "enabled": true, + "status": "completed", + "created_by": "创建者ID", + "created_at": 创建时间戳, + "updated_at": 更新时间戳, + "indexing_at": 索引时间戳, + "completed_at": 完成时间戳, + "error": null, + "child_chunks": [] + }, + "doc_form": "text_model" + } + ``` + + + + +
+ +top_k (integer) 返回结果数量,非必填 - score_threshold_enabled (bool) 是否开启 score 阈值 - score_threshold (float) Score 阈值 + - metadata_filtering_conditions (object) 元数据过滤条件 + - logical_operator (string) 逻辑运算符: and | or + - conditions (array[object]) 条件列表 + - name (string) 元数据字段名 + - comparison_operator (string) 比较运算符,可选值: + - 字符串比较: + - contains: 包含 + - not contains: 不包含 + - start with: 以...开头 + - end with: 以...结尾 + - is: 等于 + - is not: 不等于 + - empty: 为空 + - not empty: 不为空 + - 数值比较: + - =: 等于 + - : 不等于 + - >: 大于 + - < : 小于 + - : 大于等于 + - : 小于等于 + - 时间比较: + - before: 早于 + - after: 晚于 + - value (string|number|null) 比较值
未启用字段 @@ -1851,7 +1945,17 @@ import { Row, Col, Properties, Property, Heading, SubProperty, PropertyInstructi "weights": null, "top_k": 1, "score_threshold_enabled": false, - "score_threshold": null + "score_threshold": null, + "metadata_filtering_conditions": { + "logical_operator": "and", + "conditions": [ + { + "name": "document_name", + "comparison_operator": "contains", + "value": "test" + } + ] + } } }'`} >