diff --git a/api/core/indexing_runner.py b/api/core/indexing_runner.py
index 7479bb39b..37eb3eab6 100644
--- a/api/core/indexing_runner.py
+++ b/api/core/indexing_runner.py
@@ -270,7 +270,9 @@ class IndexingRunner:
                     tenant_id=tenant_id,
                     model_type=ModelType.TEXT_EMBEDDING,
                 )
 
-        preview_texts = []  # type: ignore
+        # Keep QA previews in a separate list so neither list needs a union element type.
+        preview_texts: list[PreviewDetail] = []
+        qa_preview_texts: list[QAPreviewDetail] = []
         total_segments = 0
         index_type = doc_form
@@ -293,14 +295,14 @@ class IndexingRunner:
             for document in documents:
                 if len(preview_texts) < 10:
                     if doc_form and doc_form == "qa_model":
-                        preview_detail = QAPreviewDetail(
+                        qa_detail = QAPreviewDetail(
                             question=document.page_content, answer=document.metadata.get("answer") or ""
                         )
-                        preview_texts.append(preview_detail)
+                        qa_preview_texts.append(qa_detail)
                     else:
-                        preview_detail = PreviewDetail(content=document.page_content)  # type: ignore
+                        preview_detail = PreviewDetail(content=document.page_content)
                         if document.children:
-                            preview_detail.child_chunks = [child.page_content for child in document.children]  # type: ignore
+                            preview_detail.child_chunks = [child.page_content for child in document.children]
                         preview_texts.append(preview_detail)
 
             # delete image files and related db records
@@ -321,8 +323,8 @@ class IndexingRunner:
                 db.session.delete(image_file)
 
         if doc_form and doc_form == "qa_model":
-            return IndexingEstimate(total_segments=total_segments * 20, qa_preview=preview_texts, preview=[])
-        return IndexingEstimate(total_segments=total_segments, preview=preview_texts)  # type: ignore
+            return IndexingEstimate(total_segments=total_segments * 20, qa_preview=qa_preview_texts, preview=[])
+        return IndexingEstimate(total_segments=total_segments, preview=preview_texts)
 
     def _extract(
         self, index_processor: BaseIndexProcessor, dataset_document: DatasetDocument, process_rule: dict
@@ -424,6 +426,7 @@ class IndexingRunner:
         """
         Get the NodeParser object according to the processing rule.
         """
+        character_splitter: TextSplitter
         if processing_rule_mode in ["custom", "hierarchical"]:
             # The user-defined segmentation rule
             max_segmentation_tokens_length = dify_config.INDEXING_MAX_SEGMENTATION_TOKENS_LENGTH
@@ -450,7 +453,7 @@ class IndexingRunner:
                 embedding_model_instance=embedding_model_instance,
             )
 
-        return character_splitter  # type: ignore
+        return character_splitter
 
     def _split_to_documents_for_estimate(
         self, text_docs: list[Document], splitter: TextSplitter, processing_rule: DatasetProcessRule
diff --git a/api/core/rag/index_processor/processor/parent_child_index_processor.py b/api/core/rag/index_processor/processor/parent_child_index_processor.py
index cb7f6ab57..d1088af85 100644
--- a/api/core/rag/index_processor/processor/parent_child_index_processor.py
+++ b/api/core/rag/index_processor/processor/parent_child_index_processor.py
@@ -36,7 +36,7 @@ class ParentChildIndexProcessor(BaseIndexProcessor):
         if not process_rule.get("rules"):
             raise ValueError("No rules found in process rule.")
         rules = Rule(**process_rule.get("rules"))
-        all_documents = []  # type: ignore
+        all_documents: list[Document] = []
         if rules.parent_mode == ParentMode.PARAGRAPH:
             # Split the text documents into nodes.
             if not rules.segmentation: