external knowledge api (#8913)
Co-authored-by: Yi <yxiaoisme@gmail.com>
This commit is contained in:
@@ -32,6 +32,7 @@ from models.dataset import (
|
||||
DatasetQuery,
|
||||
Document,
|
||||
DocumentSegment,
|
||||
ExternalKnowledgeBindings,
|
||||
)
|
||||
from models.model import UploadFile
|
||||
from models.source import DataSourceOauthBinding
|
||||
@@ -39,6 +40,7 @@ from services.errors.account import NoPermissionError
|
||||
from services.errors.dataset import DatasetNameDuplicateError
|
||||
from services.errors.document import DocumentIndexingError
|
||||
from services.errors.file import FileNotExistsError
|
||||
from services.external_knowledge_service import ExternalDatasetService
|
||||
from services.feature_service import FeatureModel, FeatureService
|
||||
from services.tag_service import TagService
|
||||
from services.vector_service import VectorService
|
||||
@@ -56,10 +58,8 @@ from tasks.sync_website_document_indexing_task import sync_website_document_inde
|
||||
|
||||
class DatasetService:
|
||||
@staticmethod
|
||||
def get_datasets(page, per_page, provider="vendor", tenant_id=None, user=None, search=None, tag_ids=None):
|
||||
query = Dataset.query.filter(Dataset.provider == provider, Dataset.tenant_id == tenant_id).order_by(
|
||||
Dataset.created_at.desc()
|
||||
)
|
||||
def get_datasets(page, per_page, tenant_id=None, user=None, search=None, tag_ids=None):
|
||||
query = Dataset.query.filter(Dataset.tenant_id == tenant_id).order_by(Dataset.created_at.desc())
|
||||
|
||||
if user:
|
||||
# get permitted dataset ids
|
||||
@@ -137,7 +137,14 @@ class DatasetService:
|
||||
|
||||
@staticmethod
|
||||
def create_empty_dataset(
|
||||
tenant_id: str, name: str, indexing_technique: Optional[str], account: Account, permission: Optional[str] = None
|
||||
tenant_id: str,
|
||||
name: str,
|
||||
indexing_technique: Optional[str],
|
||||
account: Account,
|
||||
permission: Optional[str] = None,
|
||||
provider: str = "vendor",
|
||||
external_knowledge_api_id: Optional[str] = None,
|
||||
external_knowledge_id: Optional[str] = None,
|
||||
):
|
||||
# check if dataset name already exists
|
||||
if Dataset.query.filter_by(name=name, tenant_id=tenant_id).first():
|
||||
@@ -156,12 +163,28 @@ class DatasetService:
|
||||
dataset.embedding_model_provider = embedding_model.provider if embedding_model else None
|
||||
dataset.embedding_model = embedding_model.model if embedding_model else None
|
||||
dataset.permission = permission or DatasetPermissionEnum.ONLY_ME
|
||||
dataset.provider = provider
|
||||
db.session.add(dataset)
|
||||
db.session.flush()
|
||||
|
||||
if provider == "external" and external_knowledge_api_id:
|
||||
external_knowledge_api = ExternalDatasetService.get_external_knowledge_api(external_knowledge_api_id)
|
||||
if not external_knowledge_api:
|
||||
raise ValueError("External API template not found.")
|
||||
external_knowledge_binding = ExternalKnowledgeBindings(
|
||||
tenant_id=tenant_id,
|
||||
dataset_id=dataset.id,
|
||||
external_knowledge_api_id=external_knowledge_api_id,
|
||||
external_knowledge_id=external_knowledge_id,
|
||||
created_by=account.id,
|
||||
)
|
||||
db.session.add(external_knowledge_binding)
|
||||
|
||||
db.session.commit()
|
||||
return dataset
|
||||
|
||||
@staticmethod
|
||||
def get_dataset(dataset_id):
|
||||
def get_dataset(dataset_id) -> Dataset:
|
||||
return Dataset.query.filter_by(id=dataset_id).first()
|
||||
|
||||
@staticmethod
|
||||
@@ -202,81 +225,103 @@ class DatasetService:
|
||||
|
||||
@staticmethod
|
||||
def update_dataset(dataset_id, data, user):
|
||||
data.pop("partial_member_list", None)
|
||||
filtered_data = {k: v for k, v in data.items() if v is not None or k == "description"}
|
||||
dataset = DatasetService.get_dataset(dataset_id)
|
||||
|
||||
DatasetService.check_dataset_permission(dataset, user)
|
||||
action = None
|
||||
if dataset.indexing_technique != data["indexing_technique"]:
|
||||
# if update indexing_technique
|
||||
if data["indexing_technique"] == "economy":
|
||||
action = "remove"
|
||||
filtered_data["embedding_model"] = None
|
||||
filtered_data["embedding_model_provider"] = None
|
||||
filtered_data["collection_binding_id"] = None
|
||||
elif data["indexing_technique"] == "high_quality":
|
||||
action = "add"
|
||||
# get embedding model setting
|
||||
try:
|
||||
model_manager = ModelManager()
|
||||
embedding_model = model_manager.get_model_instance(
|
||||
tenant_id=current_user.current_tenant_id,
|
||||
provider=data["embedding_model_provider"],
|
||||
model_type=ModelType.TEXT_EMBEDDING,
|
||||
model=data["embedding_model"],
|
||||
)
|
||||
filtered_data["embedding_model"] = embedding_model.model
|
||||
filtered_data["embedding_model_provider"] = embedding_model.provider
|
||||
dataset_collection_binding = DatasetCollectionBindingService.get_dataset_collection_binding(
|
||||
embedding_model.provider, embedding_model.model
|
||||
)
|
||||
filtered_data["collection_binding_id"] = dataset_collection_binding.id
|
||||
except LLMBadRequestError:
|
||||
raise ValueError(
|
||||
"No Embedding Model available. Please configure a valid provider "
|
||||
"in the Settings -> Model Provider."
|
||||
)
|
||||
except ProviderTokenNotInitError as ex:
|
||||
raise ValueError(ex.description)
|
||||
else:
|
||||
if dataset.provider == "external":
|
||||
dataset.retrieval_model = data.get("external_retrieval_model", None)
|
||||
dataset.name = data.get("name", dataset.name)
|
||||
dataset.description = data.get("description", "")
|
||||
external_knowledge_id = data.get("external_knowledge_id", None)
|
||||
db.session.add(dataset)
|
||||
if not external_knowledge_id:
|
||||
raise ValueError("External knowledge id is required.")
|
||||
external_knowledge_api_id = data.get("external_knowledge_api_id", None)
|
||||
if not external_knowledge_api_id:
|
||||
raise ValueError("External knowledge api id is required.")
|
||||
external_knowledge_binding = ExternalKnowledgeBindings.query.filter_by(dataset_id=dataset_id).first()
|
||||
if (
|
||||
data["embedding_model_provider"] != dataset.embedding_model_provider
|
||||
or data["embedding_model"] != dataset.embedding_model
|
||||
external_knowledge_binding.external_knowledge_id != external_knowledge_id
|
||||
or external_knowledge_binding.external_knowledge_api_id != external_knowledge_api_id
|
||||
):
|
||||
action = "update"
|
||||
try:
|
||||
model_manager = ModelManager()
|
||||
embedding_model = model_manager.get_model_instance(
|
||||
tenant_id=current_user.current_tenant_id,
|
||||
provider=data["embedding_model_provider"],
|
||||
model_type=ModelType.TEXT_EMBEDDING,
|
||||
model=data["embedding_model"],
|
||||
)
|
||||
filtered_data["embedding_model"] = embedding_model.model
|
||||
filtered_data["embedding_model_provider"] = embedding_model.provider
|
||||
dataset_collection_binding = DatasetCollectionBindingService.get_dataset_collection_binding(
|
||||
embedding_model.provider, embedding_model.model
|
||||
)
|
||||
filtered_data["collection_binding_id"] = dataset_collection_binding.id
|
||||
except LLMBadRequestError:
|
||||
raise ValueError(
|
||||
"No Embedding Model available. Please configure a valid provider "
|
||||
"in the Settings -> Model Provider."
|
||||
)
|
||||
except ProviderTokenNotInitError as ex:
|
||||
raise ValueError(ex.description)
|
||||
external_knowledge_binding.external_knowledge_id = external_knowledge_id
|
||||
external_knowledge_binding.external_knowledge_api_id = external_knowledge_api_id
|
||||
db.session.add(external_knowledge_binding)
|
||||
db.session.commit()
|
||||
else:
|
||||
data.pop("partial_member_list", None)
|
||||
filtered_data = {k: v for k, v in data.items() if v is not None or k == "description"}
|
||||
action = None
|
||||
if dataset.indexing_technique != data["indexing_technique"]:
|
||||
# if update indexing_technique
|
||||
if data["indexing_technique"] == "economy":
|
||||
action = "remove"
|
||||
filtered_data["embedding_model"] = None
|
||||
filtered_data["embedding_model_provider"] = None
|
||||
filtered_data["collection_binding_id"] = None
|
||||
elif data["indexing_technique"] == "high_quality":
|
||||
action = "add"
|
||||
# get embedding model setting
|
||||
try:
|
||||
model_manager = ModelManager()
|
||||
embedding_model = model_manager.get_model_instance(
|
||||
tenant_id=current_user.current_tenant_id,
|
||||
provider=data["embedding_model_provider"],
|
||||
model_type=ModelType.TEXT_EMBEDDING,
|
||||
model=data["embedding_model"],
|
||||
)
|
||||
filtered_data["embedding_model"] = embedding_model.model
|
||||
filtered_data["embedding_model_provider"] = embedding_model.provider
|
||||
dataset_collection_binding = DatasetCollectionBindingService.get_dataset_collection_binding(
|
||||
embedding_model.provider, embedding_model.model
|
||||
)
|
||||
filtered_data["collection_binding_id"] = dataset_collection_binding.id
|
||||
except LLMBadRequestError:
|
||||
raise ValueError(
|
||||
"No Embedding Model available. Please configure a valid provider "
|
||||
"in the Settings -> Model Provider."
|
||||
)
|
||||
except ProviderTokenNotInitError as ex:
|
||||
raise ValueError(ex.description)
|
||||
else:
|
||||
if (
|
||||
data["embedding_model_provider"] != dataset.embedding_model_provider
|
||||
or data["embedding_model"] != dataset.embedding_model
|
||||
):
|
||||
action = "update"
|
||||
try:
|
||||
model_manager = ModelManager()
|
||||
embedding_model = model_manager.get_model_instance(
|
||||
tenant_id=current_user.current_tenant_id,
|
||||
provider=data["embedding_model_provider"],
|
||||
model_type=ModelType.TEXT_EMBEDDING,
|
||||
model=data["embedding_model"],
|
||||
)
|
||||
filtered_data["embedding_model"] = embedding_model.model
|
||||
filtered_data["embedding_model_provider"] = embedding_model.provider
|
||||
dataset_collection_binding = DatasetCollectionBindingService.get_dataset_collection_binding(
|
||||
embedding_model.provider, embedding_model.model
|
||||
)
|
||||
filtered_data["collection_binding_id"] = dataset_collection_binding.id
|
||||
except LLMBadRequestError:
|
||||
raise ValueError(
|
||||
"No Embedding Model available. Please configure a valid provider "
|
||||
"in the Settings -> Model Provider."
|
||||
)
|
||||
except ProviderTokenNotInitError as ex:
|
||||
raise ValueError(ex.description)
|
||||
|
||||
filtered_data["updated_by"] = user.id
|
||||
filtered_data["updated_at"] = datetime.datetime.now()
|
||||
filtered_data["updated_by"] = user.id
|
||||
filtered_data["updated_at"] = datetime.datetime.now()
|
||||
|
||||
# update Retrieval model
|
||||
filtered_data["retrieval_model"] = data["retrieval_model"]
|
||||
# update Retrieval model
|
||||
filtered_data["retrieval_model"] = data["retrieval_model"]
|
||||
|
||||
dataset.query.filter_by(id=dataset_id).update(filtered_data)
|
||||
dataset.query.filter_by(id=dataset_id).update(filtered_data)
|
||||
|
||||
db.session.commit()
|
||||
if action:
|
||||
deal_dataset_vector_index_task.delay(dataset_id, action)
|
||||
db.session.commit()
|
||||
if action:
|
||||
deal_dataset_vector_index_task.delay(dataset_id, action)
|
||||
return dataset
|
||||
|
||||
@staticmethod
|
||||
|
@@ -0,0 +1,26 @@
|
||||
from typing import Literal, Optional, Union
|
||||
|
||||
from pydantic import BaseModel
|
||||
|
||||
|
||||
class AuthorizationConfig(BaseModel):
    # Credential scheme; restricted to None, "basic", "bearer" or "custom".
    type: Literal[None, "basic", "bearer", "custom"]
    # Credential value; optional, defaults to None.
    api_key: Optional[str] = None
    # Name of the header that carries the credential; optional, defaults to None.
    header: Optional[str] = None
|
||||
|
||||
|
||||
class Authorization(BaseModel):
    # Authorization mode; restricted to "no-auth" or "api-key".
    type: Literal["no-auth", "api-key"]
    # Credential details; optional, defaults to None.
    config: Union[AuthorizationConfig, None] = None
|
||||
|
||||
|
||||
class ProcessStatusSetting(BaseModel):
    # Request method, kept as a plain string (required).
    request_method: str
    # Target URL, kept as a plain string (required).
    url: str
|
||||
|
||||
|
||||
class ExternalKnowledgeApiSetting(BaseModel):
    # Target URL of the external knowledge API call (required).
    url: str
    # Request method as a plain string (required).
    request_method: str
    # Optional request headers; defaults to None.
    headers: Optional[dict] = None
    # Optional request parameters; defaults to None.
    params: Optional[dict] = None
|
274
api/services/external_knowledge_service.py
Normal file
274
api/services/external_knowledge_service.py
Normal file
@@ -0,0 +1,274 @@
|
||||
import json
|
||||
from copy import deepcopy
|
||||
from datetime import datetime, timezone
|
||||
from typing import Any, Optional, Union
|
||||
|
||||
import httpx
|
||||
import validators
|
||||
|
||||
# from tasks.external_document_indexing_task import external_document_indexing_task
|
||||
from core.helper import ssrf_proxy
|
||||
from extensions.ext_database import db
|
||||
from models.dataset import (
|
||||
Dataset,
|
||||
ExternalKnowledgeApis,
|
||||
ExternalKnowledgeBindings,
|
||||
)
|
||||
from services.entities.external_knowledge_entities.external_knowledge_entities import (
|
||||
Authorization,
|
||||
ExternalKnowledgeApiSetting,
|
||||
)
|
||||
from services.errors.dataset import DatasetNameDuplicateError
|
||||
|
||||
|
||||
class ExternalDatasetService:
|
||||
@staticmethod
def get_external_knowledge_apis(page, per_page, tenant_id, search=None) -> tuple[list[ExternalKnowledgeApis], int]:
    """Return one page of a tenant's external knowledge APIs and the total match count.

    Rows are restricted to ``tenant_id`` and ordered by ``created_at`` descending.
    When ``search`` is truthy, only rows whose name contains it (case-insensitive
    ``ILIKE``) are kept. Pagination caps ``per_page`` at 100 and does not raise
    for out-of-range pages (``error_out=False``).

    Returns:
        A ``(items, total)`` tuple taken from the pagination result.
    """
    belongs_to_tenant = ExternalKnowledgeApis.tenant_id == tenant_id
    newest_first = ExternalKnowledgeApis.created_at.desc()
    api_query = ExternalKnowledgeApis.query.filter(belongs_to_tenant).order_by(newest_first)

    if search:
        name_pattern = f"%{search}%"
        api_query = api_query.filter(ExternalKnowledgeApis.name.ilike(name_pattern))

    page_result = api_query.paginate(page=page, per_page=per_page, max_per_page=100, error_out=False)
    return page_result.items, page_result.total
|
||||
|
||||
@classmethod
def validate_api_list(cls, api_settings: dict):
    """Check that the API settings contain a non-empty endpoint and api_key.

    Args:
        api_settings: Mapping expected to hold ``"endpoint"`` and ``"api_key"``.

    Raises:
        ValueError: If the mapping is empty/None, or if either key is missing
            or maps to a falsy value.
    """
    if not api_settings:
        raise ValueError("api list is empty")
    # A key must be both present AND non-empty. The previous `and` form raised
    # KeyError for a missing key and let an empty value pass silently.
    if "endpoint" not in api_settings or not api_settings["endpoint"]:
        raise ValueError("endpoint is required")
    if "api_key" not in api_settings or not api_settings["api_key"]:
        raise ValueError("api_key is required")
|
||||
|
||||
@staticmethod
def create_external_knowledge_api(tenant_id: str, user_id: str, args: dict) -> ExternalKnowledgeApis:
    """Validate the supplied settings, then persist a new external knowledge API row.

    Args:
        tenant_id: Tenant that owns the new record.
        user_id: Stored as both ``created_by`` and ``updated_by``.
        args: Request payload; ``name``, ``description`` (default ``""``) and
            ``settings`` are read from it. ``settings`` is stored as JSON text.

    Returns:
        The committed ``ExternalKnowledgeApis`` instance.

    Raises:
        ValueError: Propagated from ``check_endpoint_and_api_key``.
    """
    api_settings = args.get("settings")
    ExternalDatasetService.check_endpoint_and_api_key(api_settings)

    new_api = ExternalKnowledgeApis(
        tenant_id=tenant_id,
        created_by=user_id,
        updated_by=user_id,
        name=args.get("name"),
        description=args.get("description", ""),
        settings=json.dumps(api_settings, ensure_ascii=False),
    )
    db.session.add(new_api)
    db.session.commit()
    return new_api
|
||||
|
||||
@staticmethod
def check_endpoint_and_api_key(settings: dict):
    """Validate external knowledge API settings and probe the retrieval endpoint.

    The probe is a POST to ``<endpoint>/retrieval`` with the api key sent as a
    bearer token. Only the 502, 404 and 403 status codes are treated as
    failures; any other response is accepted.

    Args:
        settings: Mapping with ``"endpoint"`` and ``"api_key"``. ``None`` is
            tolerated and reported as a missing endpoint.

    Raises:
        ValueError: If a required value is missing/empty, the URL is invalid,
            the request cannot be made, or the endpoint answers 502/404/403.
    """
    # Callers pass `args.get("settings")`, which may be None; report that as a
    # validation error instead of a TypeError from the membership test.
    if not settings or "endpoint" not in settings or not settings["endpoint"]:
        raise ValueError("endpoint is required")
    if "api_key" not in settings or not settings["api_key"]:
        raise ValueError("api_key is required")

    endpoint = f"{settings['endpoint']}/retrieval"
    api_key = settings["api_key"]
    if not validators.url(endpoint):
        raise ValueError(f"invalid endpoint: {endpoint}")

    try:
        # NOTE(review): this probe calls httpx directly, whereas
        # process_external_api in this file goes through ssrf_proxy; confirm
        # whether the probe should be routed the same way.
        response = httpx.post(endpoint, headers={"Authorization": f"Bearer {api_key}"})
    except Exception as e:
        # Keep the original failure attached as the cause for debugging.
        raise ValueError(f"failed to connect to the endpoint: {endpoint}") from e

    if response.status_code == 502:
        raise ValueError(f"Bad Gateway: failed to connect to the endpoint: {endpoint}")
    if response.status_code == 404:
        raise ValueError(f"Not Found: failed to connect to the endpoint: {endpoint}")
    if response.status_code == 403:
        raise ValueError(f"Forbidden: Authorization failed with api_key: {api_key}")
|
||||
|
||||
@staticmethod
def get_external_knowledge_api(external_knowledge_api_id: str) -> ExternalKnowledgeApis:
    """Return the first external knowledge API row with the given id.

    The lookup is by id only (no tenant filter) and yields whatever
    ``Query.first()`` returns for the filtered query.
    """
    matching_apis = ExternalKnowledgeApis.query.filter_by(id=external_knowledge_api_id)
    return matching_apis.first()
|
||||
|
||||
@staticmethod
def update_external_knowledge_api(tenant_id, user_id, external_knowledge_api_id, args) -> ExternalKnowledgeApis:
    """Overwrite name, description and settings of a tenant's external knowledge API.

    ``description`` falls back to ``""`` when absent from ``args``; ``settings``
    is stored as JSON text. ``updated_at`` is set to the current UTC time with
    the tzinfo stripped (a naive datetime).

    Returns:
        The updated, committed record.

    Raises:
        ValueError: If no record matches both the id and the tenant.
    """
    lookup = ExternalKnowledgeApis.query.filter_by(id=external_knowledge_api_id, tenant_id=tenant_id)
    api_record = lookup.first()
    if api_record is None:
        raise ValueError("api template not found")

    api_record.name = args.get("name")
    api_record.description = args.get("description", "")
    api_record.settings = json.dumps(args.get("settings"), ensure_ascii=False)
    api_record.updated_by = user_id
    api_record.updated_at = datetime.now(timezone.utc).replace(tzinfo=None)
    db.session.commit()
    return api_record
|
||||
|
||||
@staticmethod
def delete_external_knowledge_api(tenant_id: str, external_knowledge_api_id: str):
    """Delete a tenant's external knowledge API record and commit.

    Raises:
        ValueError: If no record matches both the id and the tenant.
    """
    lookup = ExternalKnowledgeApis.query.filter_by(id=external_knowledge_api_id, tenant_id=tenant_id)
    api_record = lookup.first()
    if api_record is None:
        raise ValueError("api template not found")

    db.session.delete(api_record)
    db.session.commit()
|
||||
|
||||
@staticmethod
def external_knowledge_api_use_check(external_knowledge_api_id: str) -> tuple[bool, int]:
    """Report whether any binding references the given external knowledge API.

    Returns:
        ``(True, count)`` when at least one binding row exists, otherwise
        ``(False, 0)``.
    """
    bindings = ExternalKnowledgeBindings.query.filter_by(external_knowledge_api_id=external_knowledge_api_id)
    usage_count = bindings.count()
    return (True, usage_count) if usage_count > 0 else (False, 0)
|
||||
|
||||
@staticmethod
def get_external_knowledge_binding_with_dataset_id(tenant_id: str, dataset_id: str) -> ExternalKnowledgeBindings:
    """Return the first binding that matches the dataset and tenant.

    Raises:
        ValueError: If no such binding exists.
    """
    lookup = ExternalKnowledgeBindings.query.filter_by(dataset_id=dataset_id, tenant_id=tenant_id)
    binding = lookup.first()
    if binding:
        return binding
    raise ValueError("external knowledge binding not found")
|
||||
|
||||
@staticmethod
def document_create_args_validate(tenant_id: str, external_knowledge_api_id: str, process_parameter: dict):
    """Ensure every required document-process parameter has a value.

    Loads the API record's JSON settings and, for each setting object that has
    a ``document_process_setting`` list, checks that each entry flagged
    ``required`` has a truthy value under its ``name`` in ``process_parameter``.

    Raises:
        ValueError: If the API record is not found for this tenant, or a
            required parameter is missing/empty.
    """
    external_knowledge_api = ExternalKnowledgeApis.query.filter_by(
        id=external_knowledge_api_id, tenant_id=tenant_id
    ).first()
    if external_knowledge_api is None:
        raise ValueError("api template not found")

    settings = json.loads(external_knowledge_api.settings)
    # Elsewhere in this file the settings are written and read as a single JSON
    # object. Iterating that object directly yields its string keys, and
    # `.get` on a str raises AttributeError, so wrap it as a one-element list.
    # JSON null settings mean there is nothing to validate.
    setting_items = [settings] if isinstance(settings, dict) else (settings or [])

    for setting in setting_items:
        custom_parameters = setting.get("document_process_setting")
        if not custom_parameters:
            continue
        for parameter in custom_parameters:
            if parameter.get("required", False) and not process_parameter.get(parameter.get("name")):
                raise ValueError(f'{parameter.get("name")} is required')
|
||||
|
||||
@staticmethod
def process_external_api(
    settings: ExternalKnowledgeApiSetting, files: Union[None, dict[str, Any]]
) -> httpx.Response:
    """
    Send the HTTP request described by ``settings`` through ``ssrf_proxy``.

    The callable is looked up on ``ssrf_proxy`` by ``settings.request_method``.
    ``settings.params`` is JSON-encoded and sent as ``data``; redirects are
    followed. The response is returned as-is, with no status checking.
    """
    send_request = getattr(ssrf_proxy, settings.request_method)
    encoded_params = json.dumps(settings.params)

    return send_request(
        data=encoded_params,
        files=files,
        url=settings.url,
        headers=settings.headers,
        follow_redirects=True,
    )
|
||||
|
||||
@staticmethod
def assembling_headers(authorization: Authorization, headers: Optional[dict] = None) -> dict[str, Any]:
    """Build request headers, adding a credential header for ``api-key`` authorization.

    Both arguments are deep-copied first, so neither input is mutated. For any
    authorization type other than ``"api-key"`` the copied headers are returned
    unchanged. The credential header name defaults to ``"Authorization"`` when
    the config has none; its value is ``"Bearer <key>"``, ``"Basic <key>"`` or
    the raw key for the ``bearer``/``basic``/``custom`` config types. Any other
    config type adds no header.

    Raises:
        ValueError: For ``api-key`` authorization without a config or api_key.
    """
    auth = deepcopy(authorization)
    assembled = deepcopy(headers) if headers else {}

    if auth.type != "api-key":
        return assembled

    config = auth.config
    if config is None:
        raise ValueError("authorization config is required")
    if config.api_key is None:
        raise ValueError("api_key is required")

    # Only the private copy is touched here; the caller's object keeps its header.
    if not config.header:
        config.header = "Authorization"

    scheme = config.type
    if scheme == "bearer":
        assembled[config.header] = f"Bearer {config.api_key}"
    elif scheme == "basic":
        assembled[config.header] = f"Basic {config.api_key}"
    elif scheme == "custom":
        assembled[config.header] = config.api_key

    return assembled
|
||||
|
||||
@staticmethod
def get_external_knowledge_api_settings(settings: dict) -> ExternalKnowledgeApiSetting:
    """Parse a raw settings mapping into an ``ExternalKnowledgeApiSetting`` model."""
    # NOTE(review): `parse_obj` is the pydantic v1 spelling (deprecated in v2 in
    # favour of `model_validate`); confirm the project's pydantic version.
    parsed_setting = ExternalKnowledgeApiSetting.parse_obj(settings)
    return parsed_setting
|
||||
|
||||
@staticmethod
def create_external_dataset(tenant_id: str, user_id: str, args: dict) -> Dataset:
    """Create an ``external`` dataset and bind it to an external knowledge API.

    The dataset is added and flushed first so its id is available for the
    binding row; both rows are then committed together.

    Args:
        tenant_id: Owning tenant.
        user_id: Stored as ``created_by`` on both rows.
        args: Payload; reads ``name``, ``description`` (default ``""``),
            ``external_retrieval_model``, ``external_knowledge_api_id`` and
            ``external_knowledge_id``.

    Returns:
        The committed ``Dataset``.

    Raises:
        DatasetNameDuplicateError: If the tenant already has a dataset with this name.
        ValueError: If the referenced API record is not found for this tenant.
    """
    dataset_name = args.get("name")
    api_id = args.get("external_knowledge_api_id")

    # check if dataset name already exists
    if Dataset.query.filter_by(name=dataset_name, tenant_id=tenant_id).first():
        raise DatasetNameDuplicateError(f"Dataset with name {dataset_name} already exists.")

    api_record = ExternalKnowledgeApis.query.filter_by(id=api_id, tenant_id=tenant_id).first()
    if api_record is None:
        raise ValueError("api template not found")

    dataset = Dataset(
        tenant_id=tenant_id,
        name=dataset_name,
        description=args.get("description", ""),
        provider="external",
        retrieval_model=args.get("external_retrieval_model"),
        created_by=user_id,
    )
    db.session.add(dataset)
    db.session.flush()

    binding = ExternalKnowledgeBindings(
        tenant_id=tenant_id,
        dataset_id=dataset.id,
        external_knowledge_api_id=api_id,
        external_knowledge_id=args.get("external_knowledge_id"),
        created_by=user_id,
    )
    db.session.add(binding)
    db.session.commit()

    return dataset
|
||||
|
||||
@staticmethod
def fetch_external_knowledge_retrieval(
    tenant_id: str, dataset_id: str, query: str, external_retrieval_parameters: dict
) -> list:
    """Query the external knowledge API bound to a dataset and return its records.

    Resolves the dataset's binding and API record, then POSTs to
    ``<endpoint>/retrieval`` with the query, the bound knowledge id and the
    retrieval settings. ``score_threshold`` is only forwarded when
    ``score_threshold_enabled`` is truthy; otherwise ``0.0`` is sent.

    Returns:
        The ``records`` list from a 200 response (``[]`` if the key is
        absent); ``[]`` for any other status code.

    Raises:
        ValueError: If the binding or the API record cannot be found.
    """
    binding = ExternalKnowledgeBindings.query.filter_by(dataset_id=dataset_id, tenant_id=tenant_id).first()
    if not binding:
        raise ValueError("external knowledge binding not found")

    api_record = ExternalKnowledgeApis.query.filter_by(id=binding.external_knowledge_api_id).first()
    if not api_record:
        raise ValueError("external api template not found")

    settings = json.loads(api_record.settings)

    headers = {"Content-Type": "application/json"}
    api_key = settings.get("api_key")
    if api_key:
        headers["Authorization"] = f"Bearer {api_key}"

    if external_retrieval_parameters.get("score_threshold_enabled"):
        score_threshold = external_retrieval_parameters.get("score_threshold", 0.0)
    else:
        score_threshold = 0.0

    request_params = {
        "retrieval_setting": {
            "top_k": external_retrieval_parameters.get("top_k"),
            "score_threshold": score_threshold,
        },
        "query": query,
        "knowledge_id": binding.external_knowledge_id,
    }

    api_setting = ExternalKnowledgeApiSetting(
        url=f"{settings.get('endpoint')}/retrieval",
        request_method="post",
        headers=headers,
        params=request_params,
    )
    response = ExternalDatasetService.process_external_api(api_setting, None)

    # Any non-200 answer is treated as "no results" rather than an error.
    if response.status_code != 200:
        return []
    return response.json().get("records", [])
|
@@ -19,7 +19,15 @@ default_retrieval_model = {
|
||||
|
||||
class HitTestingService:
|
||||
@classmethod
|
||||
def retrieve(cls, dataset: Dataset, query: str, account: Account, retrieval_model: dict, limit: int = 10) -> dict:
|
||||
def retrieve(
|
||||
cls,
|
||||
dataset: Dataset,
|
||||
query: str,
|
||||
account: Account,
|
||||
retrieval_model: dict,
|
||||
external_retrieval_model: dict,
|
||||
limit: int = 10,
|
||||
) -> dict:
|
||||
if dataset.available_document_count == 0 or dataset.available_segment_count == 0:
|
||||
return {
|
||||
"query": {
|
||||
@@ -62,10 +70,44 @@ class HitTestingService:
|
||||
|
||||
return cls.compact_retrieve_response(dataset, query, all_documents)
|
||||
|
||||
@classmethod
def external_retrieve(
    cls,
    dataset: Dataset,
    query: str,
    account: Account,
    external_retrieval_model: dict,
) -> dict:
    """Run a hit-testing query against an external dataset and log the query.

    Datasets whose provider is not ``"external"`` short-circuit to an empty
    result without touching the retrieval service or the database. Otherwise
    the escaped query is sent to ``RetrievalService.external_retrieve``, the
    elapsed time is debug-logged, a ``DatasetQuery`` row is committed, and the
    documents are shaped by ``compact_external_retrieve_response``.
    """
    if dataset.provider != "external":
        return {
            "query": {"content": query},
            "records": [],
        }

    started_at = time.perf_counter()
    all_documents = RetrievalService.external_retrieve(
        dataset_id=dataset.id,
        query=cls.escape_query_for_search(query),
        external_retrieval_model=external_retrieval_model,
    )
    elapsed = time.perf_counter() - started_at
    logging.debug(f"External knowledge hit testing retrieve in {elapsed:0.4f} seconds")

    # The stored query keeps the caller's original text, not the escaped one.
    query_log = DatasetQuery(
        dataset_id=dataset.id, content=query, source="hit_testing", created_by_role="account", created_by=account.id
    )
    db.session.add(query_log)
    db.session.commit()

    return cls.compact_external_retrieve_response(dataset, query, all_documents)
|
||||
|
||||
@classmethod
|
||||
def compact_retrieve_response(cls, dataset: Dataset, query: str, documents: list[Document]):
|
||||
i = 0
|
||||
records = []
|
||||
|
||||
for document in documents:
|
||||
index_node_id = document.metadata["doc_id"]
|
||||
|
||||
@@ -81,7 +123,6 @@ class HitTestingService:
|
||||
)
|
||||
|
||||
if not segment:
|
||||
i += 1
|
||||
continue
|
||||
|
||||
record = {
|
||||
@@ -91,8 +132,6 @@ class HitTestingService:
|
||||
|
||||
records.append(record)
|
||||
|
||||
i += 1
|
||||
|
||||
return {
|
||||
"query": {
|
||||
"content": query,
|
||||
@@ -100,6 +139,25 @@ class HitTestingService:
|
||||
"records": records,
|
||||
}
|
||||
|
||||
@classmethod
def compact_external_retrieve_response(cls, dataset: Dataset, query: str, documents: list):
    """Shape external retrieval results into the hit-testing response format.

    For an ``"external"`` dataset each document contributes a record with its
    ``content``, ``title``, ``score`` and ``metadata`` (``None`` for any
    missing key). For any other provider the documents are ignored and
    ``records`` is empty.
    """
    record_fields = ("content", "title", "score", "metadata")

    records = []
    if dataset.provider == "external":
        records = [{field: document.get(field, None) for field in record_fields} for document in documents]

    return {
        "query": {
            "content": query,
        },
        "records": records,
    }
|
||||
|
||||
@classmethod
|
||||
def hit_testing_args_check(cls, args):
|
||||
query = args["query"]
|
||||
|
Reference in New Issue
Block a user