Feat add childchunk api (#16094)

This commit is contained in:
GuanMu
2025-03-24 20:58:28 +08:00
committed by GitHub
parent 1be0d26c1f
commit 360986f38d
4 changed files with 869 additions and 31 deletions

View File

@@ -2140,6 +2140,88 @@ class SegmentService:
query = query.where(ChildChunk.content.ilike(f"%{keyword}%"))
return query.paginate(page=page, per_page=limit, max_per_page=100, error_out=False)
@classmethod
def get_child_chunk_by_id(cls, child_chunk_id: str, tenant_id: str) -> Optional[ChildChunk]:
"""Get a child chunk by its ID."""
result = ChildChunk.query.filter(ChildChunk.id == child_chunk_id, ChildChunk.tenant_id == tenant_id).first()
return result if isinstance(result, ChildChunk) else None
@classmethod
def get_segments(
cls, document_id: str, tenant_id: str, status_list: list[str] | None = None, keyword: str | None = None
):
"""Get segments for a document with optional filtering."""
query = DocumentSegment.query.filter(
DocumentSegment.document_id == document_id, DocumentSegment.tenant_id == tenant_id
)
if status_list:
query = query.filter(DocumentSegment.status.in_(status_list))
if keyword:
query = query.filter(DocumentSegment.content.ilike(f"%{keyword}%"))
segments = query.order_by(DocumentSegment.position.asc()).all()
total = len(segments)
return segments, total
@classmethod
def update_segment_by_id(
cls, tenant_id: str, dataset_id: str, document_id: str, segment_id: str, segment_data: dict, user_id: str
) -> tuple[DocumentSegment, Document]:
"""Update a segment by its ID with validation and checks."""
# check dataset
dataset = db.session.query(Dataset).filter(Dataset.tenant_id == tenant_id, Dataset.id == dataset_id).first()
if not dataset:
raise NotFound("Dataset not found.")
# check user's model setting
DatasetService.check_dataset_model_setting(dataset)
# check document
document = DocumentService.get_document(dataset_id, document_id)
if not document:
raise NotFound("Document not found.")
# check embedding model setting if high quality
if dataset.indexing_technique == "high_quality":
try:
model_manager = ModelManager()
model_manager.get_model_instance(
tenant_id=user_id,
provider=dataset.embedding_model_provider,
model_type=ModelType.TEXT_EMBEDDING,
model=dataset.embedding_model,
)
except LLMBadRequestError:
raise ValueError(
"No Embedding Model available. Please configure a valid provider in the Settings -> Model Provider."
)
except ProviderTokenNotInitError as ex:
raise ValueError(ex.description)
# check segment
segment = DocumentSegment.query.filter(
DocumentSegment.id == segment_id, DocumentSegment.tenant_id == user_id
).first()
if not segment:
raise NotFound("Segment not found.")
# validate and update segment
cls.segment_create_args_validate(segment_data, document)
updated_segment = cls.update_segment(SegmentUpdateArgs(**segment_data), segment, document, dataset)
return updated_segment, document
@classmethod
def get_segment_by_id(cls, segment_id: str, tenant_id: str) -> Optional[DocumentSegment]:
"""Get a segment by its ID."""
result = DocumentSegment.query.filter(
DocumentSegment.id == segment_id, DocumentSegment.tenant_id == tenant_id
).first()
return result if isinstance(result, DocumentSegment) else None
class DatasetCollectionBindingService:
@classmethod