Feat add childchunk api (#16094)

2025-03-24 20:58:28 +08:00
parent 1be0d26c1f
commit 360986f38d
4 changed files with 869 additions and 31 deletions
--- a/api/services/dataset_service.py
+++ b/api/services/dataset_service.py
@@ -2140,6 +2140,88 @@ class SegmentService:
            query = query.where(ChildChunk.content.ilike(f"%{keyword}%"))
        return query.paginate(page=page, per_page=limit, max_per_page=100, error_out=False)

+    @classmethod
+    def get_child_chunk_by_id(cls, child_chunk_id: str, tenant_id: str) -> Optional[ChildChunk]:
+        """Get a child chunk by its ID."""
+        result = ChildChunk.query.filter(ChildChunk.id == child_chunk_id, ChildChunk.tenant_id == tenant_id).first()
+        return result if isinstance(result, ChildChunk) else None
+
+    @classmethod
+    def get_segments(
+        cls, document_id: str, tenant_id: str, status_list: list[str] | None = None, keyword: str | None = None
+    ):
+        """Get segments for a document with optional filtering."""
+        query = DocumentSegment.query.filter(
+            DocumentSegment.document_id == document_id, DocumentSegment.tenant_id == tenant_id
+        )
+
+        if status_list:
+            query = query.filter(DocumentSegment.status.in_(status_list))
+
+        if keyword:
+            query = query.filter(DocumentSegment.content.ilike(f"%{keyword}%"))
+
+        segments = query.order_by(DocumentSegment.position.asc()).all()
+        total = len(segments)
+
+        return segments, total
+
+    @classmethod
+    def update_segment_by_id(
+        cls, tenant_id: str, dataset_id: str, document_id: str, segment_id: str, segment_data: dict, user_id: str
+    ) -> tuple[DocumentSegment, Document]:
+        """Update a segment by its ID with validation and checks."""
+        # check dataset
+        dataset = db.session.query(Dataset).filter(Dataset.tenant_id == tenant_id, Dataset.id == dataset_id).first()
+        if not dataset:
+            raise NotFound("Dataset not found.")
+
+        # check user's model setting
+        DatasetService.check_dataset_model_setting(dataset)
+
+        # check document
+        document = DocumentService.get_document(dataset_id, document_id)
+        if not document:
+            raise NotFound("Document not found.")
+
+        # check embedding model setting if high quality
+        if dataset.indexing_technique == "high_quality":
+            try:
+                model_manager = ModelManager()
+                model_manager.get_model_instance(
+                    tenant_id=user_id,
+                    provider=dataset.embedding_model_provider,
+                    model_type=ModelType.TEXT_EMBEDDING,
+                    model=dataset.embedding_model,
+                )
+            except LLMBadRequestError:
+                raise ValueError(
+                    "No Embedding Model available. Please configure a valid provider in the Settings -> Model Provider."
+                )
+            except ProviderTokenNotInitError as ex:
+                raise ValueError(ex.description)
+
+        # check segment
+        segment = DocumentSegment.query.filter(
+            DocumentSegment.id == segment_id, DocumentSegment.tenant_id == user_id
+        ).first()
+        if not segment:
+            raise NotFound("Segment not found.")
+
+        # validate and update segment
+        cls.segment_create_args_validate(segment_data, document)
+        updated_segment = cls.update_segment(SegmentUpdateArgs(**segment_data), segment, document, dataset)
+
+        return updated_segment, document
+
+    @classmethod
+    def get_segment_by_id(cls, segment_id: str, tenant_id: str) -> Optional[DocumentSegment]:
+        """Get a segment by its ID."""
+        result = DocumentSegment.query.filter(
+            DocumentSegment.id == segment_id, DocumentSegment.tenant_id == tenant_id
+        ).first()
+        return result if isinstance(result, DocumentSegment) else None
+

 class DatasetCollectionBindingService:
    @classmethod