From 1ea4459d9f741c849324924d7148bf412da3b0cb Mon Sep 17 00:00:00 2001 From: Dongyu Li <544104925@qq.com> Date: Fri, 30 May 2025 14:45:30 +0800 Subject: [PATCH] update knowledge base api (#20426) --- .../service_api/dataset/segment.py | 22 +++ api/core/rag/retrieval/dataset_retrieval.py | 3 + api/services/hit_testing_service.py | 28 +++- .../datasets/template/template.en.mdx | 139 +++++++++++++++--- .../datasets/template/template.ja.mdx | 135 +++++++++++++++-- .../datasets/template/template.zh.mdx | 106 ++++++++++++- 6 files changed, 397 insertions(+), 36 deletions(-) diff --git a/api/controllers/service_api/dataset/segment.py b/api/controllers/service_api/dataset/segment.py index ea4be4e51..337752275 100644 --- a/api/controllers/service_api/dataset/segment.py +++ b/api/controllers/service_api/dataset/segment.py @@ -208,6 +208,28 @@ class DatasetSegmentApi(DatasetApiResource): ) return {"data": marshal(updated_segment, segment_fields), "doc_form": document.doc_form}, 200 + def get(self, tenant_id, dataset_id, document_id, segment_id): + # check dataset + dataset_id = str(dataset_id) + tenant_id = str(tenant_id) + dataset = db.session.query(Dataset).filter(Dataset.tenant_id == tenant_id, Dataset.id == dataset_id).first() + if not dataset: + raise NotFound("Dataset not found.") + # check user's model setting + DatasetService.check_dataset_model_setting(dataset) + # check document + document_id = str(document_id) + document = DocumentService.get_document(dataset_id, document_id) + if not document: + raise NotFound("Document not found.") + # check segment + segment_id = str(segment_id) + segment = SegmentService.get_segment_by_id(segment_id=segment_id, tenant_id=current_user.current_tenant_id) + if not segment: + raise NotFound("Segment not found.") + + return {"data": marshal(segment, segment_fields), "doc_form": document.doc_form}, 200 + class ChildChunkApi(DatasetApiResource): """Resource for child chunks.""" diff --git a/api/core/rag/retrieval/dataset_retrieval.py 
b/api/core/rag/retrieval/dataset_retrieval.py index f66814842..697886052 100644 --- a/api/core/rag/retrieval/dataset_retrieval.py +++ b/api/core/rag/retrieval/dataset_retrieval.py @@ -937,6 +937,9 @@ class DatasetRetrieval: return metadata_filter_document_ids, metadata_condition def _replace_metadata_filter_value(self, text: str, inputs: dict) -> str: + if not inputs: + return text + def replacer(match): key = match.group(1) return str(inputs.get(key, f"{{{{{key}}}}}")) diff --git a/api/services/hit_testing_service.py b/api/services/hit_testing_service.py index 56e06cc33..519d5abca 100644 --- a/api/services/hit_testing_service.py +++ b/api/services/hit_testing_service.py @@ -2,8 +2,11 @@ import logging import time from typing import Any +from core.app.app_config.entities import ModelConfig +from core.model_runtime.entities import LLMMode from core.rag.datasource.retrieval_service import RetrievalService from core.rag.models.document import Document +from core.rag.retrieval.dataset_retrieval import DatasetRetrieval from core.rag.retrieval.retrieval_methods import RetrievalMethod from extensions.ext_database import db from models.account import Account @@ -34,7 +37,29 @@ class HitTestingService: # get retrieval model , if the model is not setting , using default if not retrieval_model: retrieval_model = dataset.retrieval_model or default_retrieval_model + document_ids_filter = None + metadata_filtering_conditions = retrieval_model.get("metadata_filtering_conditions", {}) + if metadata_filtering_conditions: + dataset_retrieval = DatasetRetrieval() + from core.app.app_config.entities import MetadataFilteringCondition + + metadata_filtering_conditions = MetadataFilteringCondition(**metadata_filtering_conditions) + + metadata_filter_document_ids, metadata_condition = dataset_retrieval.get_metadata_filter_condition( + dataset_ids=[dataset.id], + query=query, + metadata_filtering_mode="manual", + metadata_filtering_conditions=metadata_filtering_conditions, + inputs={}, + 
tenant_id="", + user_id="", + metadata_model_config=ModelConfig(provider="", name="", mode=LLMMode.CHAT, completion_params={}), + ) + if metadata_filter_document_ids: + document_ids_filter = metadata_filter_document_ids.get(dataset.id, []) + if metadata_condition and not document_ids_filter: + return cls.compact_retrieve_response(query, []) all_documents = RetrievalService.retrieve( retrieval_method=retrieval_model.get("search_method", "semantic_search"), dataset_id=dataset.id, @@ -48,6 +73,7 @@ class HitTestingService: else None, reranking_mode=retrieval_model.get("reranking_mode") or "reranking_model", weights=retrieval_model.get("weights", None), + document_ids_filter=document_ids_filter, ) end = time.perf_counter() @@ -99,7 +125,7 @@ class HitTestingService: return dict(cls.compact_external_retrieve_response(dataset, query, all_documents)) @classmethod - def compact_retrieve_response(cls, query: str, documents: list[Document]): + def compact_retrieve_response(cls, query: str, documents: list[Document]) -> dict[Any, Any]: records = RetrievalService.format_retrieval_documents(documents) return { diff --git a/web/app/(commonLayout)/datasets/template/template.en.mdx b/web/app/(commonLayout)/datasets/template/template.en.mdx index 3393c636c..1a0979b41 100644 --- a/web/app/(commonLayout)/datasets/template/template.en.mdx +++ b/web/app/(commonLayout)/datasets/template/template.en.mdx @@ -1298,6 +1298,76 @@ import { Row, Col, Properties, Property, Heading, SubProperty, PropertyInstructi
search_method
(text) Search method: One of the following four keywords is required
- - keyword_search
Keyword search
- - semantic_search
Semantic search
- - full_text_search
Full-text search
- - hybrid_search
Hybrid search
- - reranking_enable
(bool) Whether to enable reranking, required if the search mode is semantic_search or hybrid_search (optional)
- - reranking_mode
(object) Rerank model configuration, required if reranking is enabled
- - reranking_provider_name
(string) Rerank model provider
- - reranking_model_name
(string) Rerank model name
- - weights
(float) Semantic search weight setting in hybrid search mode
- - top_k
(integer) Number of results to return (optional)
- - score_threshold_enabled
(bool) Whether to enable score threshold
- - score_threshold
(float) Score threshold
+ Retrieval parameters (optional; if omitted, the default retrieval method is used)
+ - search_method
(text) Search method: One of the following four keywords is required
+ - keyword_search
Keyword search
+ - semantic_search
Semantic search
+ - full_text_search
Full-text search
+ - hybrid_search
Hybrid search
+ - reranking_enable
(bool) Whether to enable reranking, required if the search mode is semantic_search or hybrid_search (optional)
+ - reranking_mode
(object) Rerank model configuration, required if reranking is enabled
+ - reranking_provider_name
(string) Rerank model provider
+ - reranking_model_name
(string) Rerank model name
+ - weights
(float) Semantic search weight setting in hybrid search mode
+ - top_k
(integer) Number of results to return (optional)
+ - score_threshold_enabled
(bool) Whether to enable score threshold
+ - score_threshold
(float) Score threshold
+ - metadata_filtering_conditions
(object) Metadata filtering conditions
+ - logical_operator
(string) Logical operator: and
| or
+ - conditions
(array[object]) Conditions list
+ - name
(string) Metadata field name
+ - comparison_operator
(string) Comparison operator, allowed values:
+ - String comparison:
+ - contains
: Contains
+ - not contains
: Does not contain
+ - start with
: Starts with
+ - end with
: Ends with
+ - is
: Equals
+ - is not
: Does not equal
+ - empty
: Is empty
+ - not empty
: Is not empty
+ - Numeric comparison:
+ - =
: Equals
+ - ≠
: Does not equal
+ - >
: Greater than
+ - <
: Less than
+ - ≥
: Greater than or equal to
+ - ≤
: Less than or equal to
+ - Time comparison:
+ - before
: Before
+ - after
: After
+ - value
(string|number|null) Comparison value
search_method
(text) 検索方法: 以下の 4 つのキーワードのいずれかが必要です
- - keyword_search
キーワード検索
- - semantic_search
セマンティック検索
- - full_text_search
全文検索
- - hybrid_search
ハイブリッド検索
- - reranking_enable
(bool) 再ランキングを有効にするかどうか、検索モードが semantic_search または hybrid_search の場合に必須 (オプション)
- - reranking_mode
(object) 再ランキングモデル構成、再ランキングが有効な場合に必須
- - reranking_provider_name
(string) 再ランキングモデルプロバイダー
- - reranking_model_name
(string) 再ランキングモデル名
- - weights
(float) ハイブリッド検索モードでのセマンティック検索の重み設定
- - top_k
(integer) 返される結果の数 (オプション)
- - score_threshold_enabled
(bool) スコア閾値を有効にするかどうか
- - score_threshold
(float) スコア閾値
+ 検索パラメータ(オプション、指定しない場合はデフォルトの方法で検索されます)
+ - search_method
(text) 検索方法: 以下の4つのキーワードのいずれかが必要です
+ - keyword_search
キーワード検索
+ - semantic_search
セマンティック検索
+ - full_text_search
全文検索
+ - hybrid_search
ハイブリッド検索
+ - reranking_enable
(bool) 再ランキングを有効にするかどうか、検索モードがsemantic_searchまたはhybrid_searchの場合に必須(オプション)
+ - reranking_mode
(object) 再ランキングモデル構成、再ランキングが有効な場合に必須
+ - reranking_provider_name
(string) 再ランキングモデルプロバイダー
+ - reranking_model_name
(string) 再ランキングモデル名
+ - weights
(float) ハイブリッド検索モードでのセマンティック検索の重み設定
+ - top_k
(integer) 返される結果の数(オプション)
+ - score_threshold_enabled
(bool) スコア閾値を有効にするかどうか
+ - score_threshold
(float) スコア閾値
+ - metadata_filtering_conditions
(object) メタデータフィルタリング条件
+ - logical_operator
(string) 論理演算子: and
| or
+ - conditions
(array[object]) 条件リスト
+ - name
(string) メタデータフィールド名
+ - comparison_operator
(string) 比較演算子、許可される値:
+ - 文字列比較:
+ - contains
: 含む
+ - not contains
: 含まない
+ - start with
: で始まる
+ - end with
: で終わる
+ - is
: 等しい
+ - is not
: 等しくない
+ - empty
: 空
+ - not empty
: 空でない
+ - 数値比較:
+ - =
: 等しい
+ - ≠
: 等しくない
+ - >
: より大きい
+ - <
: より小さい
+ - ≥
: 以上
+ - ≤
: 以下
+ - 時間比較:
+ - before
: より前
+ - after
: より後
+ - value
(string|number|null) 比較値
score_threshold_enabled
(bool) 是否开启 score 阈值
- score_threshold
(float) Score 阈值
+ - metadata_filtering_conditions
(object) 元数据过滤条件
+ - logical_operator
(string) 逻辑运算符: and
| or
+ - conditions
(array[object]) 条件列表
+ - name
(string) 元数据字段名
+ - comparison_operator
(string) 比较运算符,可选值:
+ - 字符串比较:
+ - contains
: 包含
+ - not contains
: 不包含
+ - start with
: 以...开头
+ - end with
: 以...结尾
+ - is
: 等于
+ - is not
: 不等于
+ - empty
: 为空
+ - not empty
: 不为空
+ - 数值比较:
+ - =
: 等于
+ - ≠
: 不等于
+ - >
: 大于
+ - <
: 小于
+ - ≥
: 大于等于
+ - ≤
: 小于等于
+ - 时间比较:
+ - before
: 早于
+ - after
: 晚于
+ - value
(string|number|null) 比较值