diff --git a/api/tasks/clean_dataset_task.py b/api/tasks/clean_dataset_task.py
index 69e5df025..9a45115b0 100644
--- a/api/tasks/clean_dataset_task.py
+++ b/api/tasks/clean_dataset_task.py
@@ -56,19 +56,29 @@ def clean_dataset_task(
         documents = db.session.query(Document).where(Document.dataset_id == dataset_id).all()
         segments = db.session.query(DocumentSegment).where(DocumentSegment.dataset_id == dataset_id).all()
 
-        # Fix: Always clean vector database resources regardless of document existence
-        # This ensures all 33 vector databases properly drop tables/collections/indices
-        if doc_form is None:
-            # Use default paragraph index type for empty datasets to enable vector database cleanup
+        # Enhanced validation: Check if doc_form is None, empty string, or contains only whitespace
+        # This ensures all invalid doc_form values are properly handled
+        if doc_form is None or (isinstance(doc_form, str) and not doc_form.strip()):
+            # Use default paragraph index type for empty/invalid datasets to enable vector database cleanup
             from core.rag.index_processor.constant.index_type import IndexType
 
             doc_form = IndexType.PARAGRAPH_INDEX
             logging.info(
-                click.style(f"No documents found, using default index type for cleanup: {doc_form}", fg="yellow")
+                click.style(f"Invalid doc_form detected, using default index type for cleanup: {doc_form}", fg="yellow")
             )
 
-        index_processor = IndexProcessorFactory(doc_form).init_index_processor()
-        index_processor.clean(dataset, None, with_keywords=True, delete_child_chunks=True)
+        # Add exception handling around IndexProcessorFactory.clean() to prevent single point of failure
+        # This ensures Document/Segment deletion can continue even if vector database cleanup fails
+        try:
+            index_processor = IndexProcessorFactory(doc_form).init_index_processor()
+            index_processor.clean(dataset, None, with_keywords=True, delete_child_chunks=True)
+            logging.info(click.style(f"Successfully cleaned vector database for dataset: {dataset_id}", fg="green"))
+        except Exception as index_cleanup_error:
+            logging.exception(click.style(f"Failed to clean vector database for dataset {dataset_id}", fg="red"))
+            # Continue with document and segment deletion even if vector cleanup fails
+            logging.info(
+                click.style(f"Continuing with document and segment deletion for dataset: {dataset_id}", fg="yellow")
+            )
 
         if documents is None or len(documents) == 0:
             logging.info(click.style(f"No documents found for dataset: {dataset_id}", fg="green"))
@@ -128,6 +138,14 @@ def clean_dataset_task(
             click.style(f"Cleaned dataset when dataset deleted: {dataset_id} latency: {end_at - start_at}", fg="green")
         )
     except Exception:
+        # Add rollback to prevent dirty session state in case of exceptions
+        # This ensures the database session is properly cleaned up
+        try:
+            db.session.rollback()
+            logging.info(click.style(f"Rolled back database session for dataset: {dataset_id}", fg="yellow"))
+        except Exception as rollback_error:
+            logging.exception("Failed to rollback database session")
+
         logging.exception("Cleaned dataset when dataset deleted failed")
     finally:
         db.session.close()