Fix ClickZetta stability and reduce logging noise (#23632)
Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com>
@@ -185,6 +185,6 @@ Clickzetta supports advanced full-text search with multiple analyzers:
 
 ## References
 
-- [Clickzetta Vector Search Documentation](../../../../../../../yunqidoc/cn_markdown_20250526/vector-search.md)
-- [Clickzetta Inverted Index Documentation](../../../../../../../yunqidoc/cn_markdown_20250526/inverted-index.md)
-- [Clickzetta SQL Functions](../../../../../../../yunqidoc/cn_markdown_20250526/sql_functions/)
+- [Clickzetta Vector Search Documentation](https://yunqi.tech/documents/vector-search)
+- [Clickzetta Inverted Index Documentation](https://yunqi.tech/documents/inverted-index)
+- [Clickzetta SQL Functions](https://yunqi.tech/documents/sql-reference)
@@ -1,7 +1,9 @@
 import json
 import logging
 import queue
+import re
 import threading
+import time
 import uuid
 from typing import TYPE_CHECKING, Any, Optional
 
@@ -67,51 +69,84 @@ class ClickzettaConfig(BaseModel):
         return values
 
 
-class ClickzettaVector(BaseVector):
+class ClickzettaConnectionPool:
     """
-    Clickzetta vector storage implementation.
+    Global connection pool for ClickZetta connections.
+
+    Manages connection reuse across ClickzettaVector instances.
     """
 
-    # Class-level write queue and lock for serializing writes
-    _write_queue: Optional[queue.Queue] = None
-    _write_thread: Optional[threading.Thread] = None
-    _write_lock = threading.Lock()
-    _shutdown = False
+    _instance: Optional["ClickzettaConnectionPool"] = None
+    _lock = threading.Lock()
 
-    def __init__(self, collection_name: str, config: ClickzettaConfig):
-        super().__init__(collection_name)
-        self._config = config
-        self._table_name = collection_name.replace("-", "_").lower()  # Ensure valid table name
-        self._connection: Optional[Connection] = None
-        self._init_connection()
-        self._init_write_queue()
+    def __init__(self):
+        self._pools: dict[str, list[tuple[Connection, float]]] = {}  # config_key -> [(connection, last_used_time)]
+        self._pool_locks: dict[str, threading.Lock] = {}
+        self._max_pool_size = 5  # Maximum connections per configuration
+        self._connection_timeout = 300  # 5 minutes timeout
+        self._cleanup_thread: Optional[threading.Thread] = None
+        self._shutdown = False
+        self._start_cleanup_thread()
 
-    def _init_connection(self):
-        """Initialize Clickzetta connection."""
-        self._connection = clickzetta.connect(
-            username=self._config.username,
-            password=self._config.password,
-            instance=self._config.instance,
-            service=self._config.service,
-            workspace=self._config.workspace,
-            vcluster=self._config.vcluster,
-            schema=self._config.schema_name,
-        )
-
-        # Set session parameters for better string handling and performance optimization
-        if self._connection is not None:
-            with self._connection.cursor() as cursor:
-                # Use quote mode for string literal escaping to handle quotes better
-                cursor.execute("SET cz.sql.string.literal.escape.mode = 'quote'")
-                logger.info("Set string literal escape mode to 'quote' for better quote handling")
-
-            # Performance optimization hints for vector operations
-            self._set_performance_hints(cursor)
-
-    def _set_performance_hints(self, cursor):
-        """Set ClickZetta performance optimization hints for vector operations."""
-        try:
-            # Performance optimization hints for vector operations and query processing
+    @classmethod
+    def get_instance(cls) -> "ClickzettaConnectionPool":
+        """Get singleton instance of connection pool."""
+        if cls._instance is None:
+            with cls._lock:
+                if cls._instance is None:
+                    cls._instance = cls()
+        return cls._instance
+
+    def _get_config_key(self, config: ClickzettaConfig) -> str:
+        """Generate unique key for connection configuration."""
+        return (
+            f"{config.username}:{config.instance}:{config.service}:"
+            f"{config.workspace}:{config.vcluster}:{config.schema_name}"
+        )
+
+    def _create_connection(self, config: ClickzettaConfig) -> "Connection":
+        """Create a new ClickZetta connection."""
+        max_retries = 3
+        retry_delay = 1.0
+
+        for attempt in range(max_retries):
+            try:
+                connection = clickzetta.connect(
+                    username=config.username,
+                    password=config.password,
+                    instance=config.instance,
+                    service=config.service,
+                    workspace=config.workspace,
+                    vcluster=config.vcluster,
+                    schema=config.schema_name,
+                )
+
+                # Configure connection session settings
+                self._configure_connection(connection)
+                logger.debug("Created new ClickZetta connection (attempt %d/%d)", attempt + 1, max_retries)
+                return connection
+            except Exception:
+                logger.exception("ClickZetta connection attempt %d/%d failed", attempt + 1, max_retries)
+                if attempt < max_retries - 1:
+                    time.sleep(retry_delay * (2**attempt))
+                else:
+                    raise
+
+        raise RuntimeError(f"Failed to create ClickZetta connection after {max_retries} attempts")
+
+    def _configure_connection(self, connection: "Connection") -> None:
+        """Configure connection session settings."""
+        try:
+            with connection.cursor() as cursor:
+                # Temporarily suppress ClickZetta client logging to reduce noise
+                clickzetta_logger = logging.getLogger("clickzetta")
+                original_level = clickzetta_logger.level
+                clickzetta_logger.setLevel(logging.WARNING)
+
+                try:
+                    # Use quote mode for string literal escaping
+                    cursor.execute("SET cz.sql.string.literal.escape.mode = 'quote'")
+
+                    # Apply performance optimization hints
                     performance_hints = [
                         # Vector index optimization
                         "SET cz.storage.parquet.vector.index.read.memory.cache = true",
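
The hunk above introduces two standard patterns: a double-checked-locking singleton for the pool (`get_instance`) and exponential backoff (1s, 2s, 4s) around `clickzetta.connect`. A minimal, self-contained sketch of both, with illustrative names that are not part of the diff:

```python
import threading
import time


class PoolSingleton:
    """Double-checked locking: check, lock, check again, then create."""

    _instance = None
    _lock = threading.Lock()

    @classmethod
    def get_instance(cls) -> "PoolSingleton":
        if cls._instance is None:  # fast path, no lock taken
            with cls._lock:
                if cls._instance is None:  # re-check under the lock
                    cls._instance = cls()
        return cls._instance


def connect_with_backoff(connect, max_retries: int = 3, base_delay: float = 1.0):
    """Retry a connect callable, sleeping base_delay * 2**attempt between tries."""
    for attempt in range(max_retries):
        try:
            return connect()
        except Exception:
            if attempt < max_retries - 1:
                time.sleep(base_delay * (2**attempt))
            else:
                raise


assert PoolSingleton.get_instance() is PoolSingleton.get_instance()
print(connect_with_backoff(lambda: "connected"))  # -> connected
```

The first lock-free check keeps the hot path cheap; the second check prevents two threads that both saw `None` from creating two pools.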
@@ -138,14 +173,222 @@ class ClickzettaVector(BaseVector):
 
                     for hint in performance_hints:
                         cursor.execute(hint)
-            logger.info(
-                "Applied %d performance optimization hints for ClickZetta vector operations", len(performance_hints)
-            )
+                finally:
+                    # Restore original logging level
+                    clickzetta_logger.setLevel(original_level)
 
         except Exception:
-            # Catch any errors setting performance hints but continue with defaults
-            logger.exception("Failed to set some performance hints, continuing with default settings")
+            logger.exception("Failed to configure connection, continuing with defaults")
+
+    def _is_connection_valid(self, connection: "Connection") -> bool:
+        """Check if connection is still valid."""
+        try:
+            with connection.cursor() as cursor:
+                cursor.execute("SELECT 1")
+            return True
+        except Exception:
+            return False
+
+    def get_connection(self, config: ClickzettaConfig) -> "Connection":
+        """Get a connection from the pool or create a new one."""
+        config_key = self._get_config_key(config)
+
+        # Ensure pool lock exists
+        if config_key not in self._pool_locks:
+            with self._lock:
+                if config_key not in self._pool_locks:
+                    self._pool_locks[config_key] = threading.Lock()
+                    self._pools[config_key] = []
+
+        with self._pool_locks[config_key]:
+            pool = self._pools[config_key]
+            current_time = time.time()
+
+            # Try to reuse existing connection
+            while pool:
+                connection, last_used = pool.pop(0)
+
+                # Check if connection is not expired and still valid
+                if current_time - last_used < self._connection_timeout and self._is_connection_valid(connection):
+                    logger.debug("Reusing ClickZetta connection from pool")
+                    return connection
+                else:
+                    # Connection expired or invalid, close it
+                    try:
+                        connection.close()
+                    except Exception:
+                        pass
+
+        # No valid connection found, create new one
+        return self._create_connection(config)
+
+    def return_connection(self, config: ClickzettaConfig, connection: "Connection") -> None:
+        """Return a connection to the pool."""
+        config_key = self._get_config_key(config)
+
+        if config_key not in self._pool_locks:
+            # Pool was cleaned up, just close the connection
+            try:
+                connection.close()
+            except Exception:
+                pass
+            return
+
+        with self._pool_locks[config_key]:
+            pool = self._pools[config_key]
+
+            # Only return to pool if not at capacity and connection is valid
+            if len(pool) < self._max_pool_size and self._is_connection_valid(connection):
+                pool.append((connection, time.time()))
+                logger.debug("Returned ClickZetta connection to pool")
+            else:
+                # Pool full or connection invalid, close it
+                try:
+                    connection.close()
+                except Exception:
+                    pass
+
+    def _cleanup_expired_connections(self) -> None:
+        """Clean up expired connections from all pools."""
+        current_time = time.time()
+
+        with self._lock:
+            for config_key in list(self._pools.keys()):
+                if config_key not in self._pool_locks:
+                    continue
+
+                with self._pool_locks[config_key]:
+                    pool = self._pools[config_key]
+                    valid_connections = []
+
+                    for connection, last_used in pool:
+                        if current_time - last_used < self._connection_timeout:
+                            valid_connections.append((connection, last_used))
+                        else:
+                            try:
+                                connection.close()
+                            except Exception:
+                                pass
+
+                    self._pools[config_key] = valid_connections
+
+    def _start_cleanup_thread(self) -> None:
+        """Start background thread for connection cleanup."""
+
+        def cleanup_worker():
+            while not self._shutdown:
+                try:
+                    time.sleep(60)  # Cleanup every minute
+                    if not self._shutdown:
+                        self._cleanup_expired_connections()
+                except Exception:
+                    logger.exception("Error in connection pool cleanup")
+
+        self._cleanup_thread = threading.Thread(target=cleanup_worker, daemon=True)
+        self._cleanup_thread.start()
+
+    def shutdown(self) -> None:
+        """Shutdown connection pool and close all connections."""
+        self._shutdown = True
+
+        with self._lock:
+            for config_key in list(self._pools.keys()):
+                if config_key not in self._pool_locks:
+                    continue
+
+                with self._pool_locks[config_key]:
+                    pool = self._pools[config_key]
+                    for connection, _ in pool:
+                        try:
+                            connection.close()
+                        except Exception:
+                            pass
+                    pool.clear()
+
+
+class ClickzettaVector(BaseVector):
+    """
+    Clickzetta vector storage implementation.
+    """
+
+    # Class-level write queue and lock for serializing writes
+    _write_queue: Optional[queue.Queue] = None
+    _write_thread: Optional[threading.Thread] = None
+    _write_lock = threading.Lock()
+    _shutdown = False
+
+    def __init__(self, collection_name: str, config: ClickzettaConfig):
+        super().__init__(collection_name)
+        self._config = config
+        self._table_name = collection_name.replace("-", "_").lower()  # Ensure valid table name
+        self._connection_pool = ClickzettaConnectionPool.get_instance()
+        self._init_write_queue()
+
+    def _get_connection(self) -> "Connection":
+        """Get a connection from the pool."""
+        return self._connection_pool.get_connection(self._config)
+
+    def _return_connection(self, connection: "Connection") -> None:
+        """Return a connection to the pool."""
+        self._connection_pool.return_connection(self._config, connection)
+
+    class ConnectionContext:
+        """Context manager for borrowing and returning connections."""
+
+        def __init__(self, vector_instance: "ClickzettaVector"):
+            self.vector = vector_instance
+            self.connection: Optional[Connection] = None
+
+        def __enter__(self) -> "Connection":
+            self.connection = self.vector._get_connection()
+            return self.connection
+
+        def __exit__(self, exc_type, exc_val, exc_tb):
+            if self.connection:
+                self.vector._return_connection(self.connection)
+
+    def get_connection_context(self) -> "ClickzettaVector.ConnectionContext":
+        """Get a connection context manager."""
+        return self.ConnectionContext(self)
+
+    def _parse_metadata(self, raw_metadata: str, row_id: str) -> dict:
+        """
+        Parse metadata from JSON string with proper error handling and fallback.
+
+        Args:
+            raw_metadata: Raw JSON string from database
+            row_id: Row ID for fallback document_id
+
+        Returns:
+            Parsed metadata dict with guaranteed required fields
+        """
+        try:
+            if raw_metadata:
+                metadata = json.loads(raw_metadata)
+
+                # Handle double-encoded JSON
+                if isinstance(metadata, str):
+                    metadata = json.loads(metadata)
+
+                # Ensure we have a dict
+                if not isinstance(metadata, dict):
+                    metadata = {}
+            else:
+                metadata = {}
+        except (json.JSONDecodeError, TypeError):
+            logger.exception("JSON parsing failed for metadata")
+            # Fallback: extract document_id with regex
+            doc_id_match = re.search(r'"document_id":\s*"([^"]+)"', raw_metadata or "")
+            metadata = {"document_id": doc_id_match.group(1)} if doc_id_match else {}
+
+        # Ensure required fields are set
+        metadata["doc_id"] = row_id  # segment id
+
+        # Ensure document_id exists (critical for Dify's format_retrieval_documents)
+        if "document_id" not in metadata:
+            metadata["document_id"] = row_id  # fallback to segment id
+
+        return metadata
+
     @classmethod
     def _init_write_queue(cls):
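
The `ConnectionContext` added above guarantees a borrowed connection goes back to the pool even when the body raises. The same borrow/return discipline can be written with `contextlib.contextmanager`; a toy sketch with a hypothetical `TinyPool`, not the diff's API:

```python
from contextlib import contextmanager


class TinyPool:
    """Toy stand-in for the connection pool: a list of reusable handles."""

    def __init__(self):
        self._free = ["conn-1", "conn-2"]

    def get(self) -> str:
        return self._free.pop() if self._free else "conn-new"

    def put(self, conn: str) -> None:
        self._free.append(conn)


@contextmanager
def borrowed(pool: TinyPool):
    conn = pool.get()
    try:
        yield conn  # caller works with the connection here
    finally:
        pool.put(conn)  # returned even if the body raised


pool = TinyPool()
with borrowed(pool) as conn:
    print("using", conn)  # -> using conn-2
print(len(pool._free))  # -> 2, the handle came back
```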
@@ -204,24 +447,33 @@ class ClickzettaVector(BaseVector):
         return "clickzetta"
 
     def _ensure_connection(self) -> "Connection":
-        """Ensure connection is available and return it."""
-        if self._connection is None:
-            raise RuntimeError("Database connection not initialized")
-        return self._connection
+        """Get a connection from the pool."""
+        return self._get_connection()
 
     def _table_exists(self) -> bool:
         """Check if the table exists."""
         try:
-            connection = self._ensure_connection()
+            with self.get_connection_context() as connection:
                 with connection.cursor() as cursor:
                     cursor.execute(f"DESC {self._config.schema_name}.{self._table_name}")
                 return True
-        except (RuntimeError, ValueError) as e:
-            if "table or view not found" in str(e).lower():
+        except Exception as e:
+            error_message = str(e).lower()
+            # Handle ClickZetta specific "table or view not found" errors
+            if any(
+                phrase in error_message
+                for phrase in ["table or view not found", "czlh-42000", "semantic analysis exception"]
+            ):
+                logger.debug("Table %s.%s does not exist", self._config.schema_name, self._table_name)
                 return False
             else:
-                # Re-raise if it's a different error
-                raise
+                # For other connection/permission errors, log warning but return False to avoid blocking cleanup
+                logger.exception(
+                    "Table existence check failed for %s.%s, assuming it doesn't exist",
+                    self._config.schema_name,
+                    self._table_name,
+                )
+                return False
 
     def create(self, texts: list[Document], embeddings: list[list[float]], **kwargs):
         """Create the collection and add initial documents."""
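
`_table_exists()` now classifies failures by substring matching on the driver message (including the ClickZetta error code `CZLH-42000`) instead of by exception type, and never re-raises, so cleanup paths are not blocked. A standalone sketch of the classification, assuming only that the phrases below appear in real driver messages:

```python
MISSING_TABLE_PHRASES = ("table or view not found", "czlh-42000", "semantic analysis exception")


def is_missing_table_error(exc: Exception) -> bool:
    """True when the error message says the table simply is not there."""
    message = str(exc).lower()
    return any(phrase in message for phrase in MISSING_TABLE_PHRASES)


print(is_missing_table_error(RuntimeError("CZLH-42000: table or view not found: dify.col_1")))  # True
print(is_missing_table_error(RuntimeError("permission denied for schema dify")))  # False
```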
@@ -253,7 +505,7 @@ class ClickzettaVector(BaseVector):
         ) COMMENT 'Dify RAG knowledge base vector storage table for document embeddings and content'
         """
 
-        connection = self._ensure_connection()
+        with self.get_connection_context() as connection:
             with connection.cursor() as cursor:
                 cursor.execute(create_table_sql)
                 logger.info("Created table %s.%s", self._config.schema_name, self._table_name)
@@ -432,14 +684,23 @@ class ClickzettaVector(BaseVector):
             f"VALUES (?, ?, CAST(? AS JSON), CAST(? AS VECTOR({vector_dimension})))"
         )
 
-        connection = self._ensure_connection()
+        with self.get_connection_context() as connection:
             with connection.cursor() as cursor:
                 try:
                     # Set session-level hints for batch insert operations
                     # Note: executemany doesn't support hints parameter, so we set them as session variables
+                    # Temporarily suppress ClickZetta client logging to reduce noise
+                    clickzetta_logger = logging.getLogger("clickzetta")
+                    original_level = clickzetta_logger.level
+                    clickzetta_logger.setLevel(logging.WARNING)
+
+                    try:
                         cursor.execute("SET cz.sql.job.fast.mode = true")
                         cursor.execute("SET cz.sql.compaction.after.commit = true")
                         cursor.execute("SET cz.storage.always.prefetch.internal = true")
+                    finally:
+                        # Restore original logging level
+                        clickzetta_logger.setLevel(original_level)
+
                     cursor.executemany(insert_sql, data_rows)
                     logger.info(
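
The suppress-then-restore dance around the `SET` statements appears twice in this commit (here and in `_configure_connection`); it could be factored into one helper. A sketch of that refactoring as a context manager — a hypothetical helper, not something the diff adds:

```python
import logging
from contextlib import contextmanager


@contextmanager
def quiet_logger(name: str, level: int = logging.WARNING):
    """Temporarily raise a logger's threshold, restoring the old level afterwards."""
    target = logging.getLogger(name)
    original = target.level
    target.setLevel(level)
    try:
        yield target
    finally:
        target.setLevel(original)  # restored even if the body raises


logging.basicConfig(level=logging.INFO)
with quiet_logger("clickzetta"):
    logging.getLogger("clickzetta").info("suppressed while setting session hints")
logging.getLogger("clickzetta").info("visible again after restore")
```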
@@ -457,11 +718,16 @@ class ClickzettaVector(BaseVector):
 
     def text_exists(self, id: str) -> bool:
         """Check if a document exists by ID."""
+        # Check if table exists first
+        if not self._table_exists():
+            return False
+
         safe_id = self._safe_doc_id(id)
-        connection = self._ensure_connection()
+        with self.get_connection_context() as connection:
             with connection.cursor() as cursor:
                 cursor.execute(
-                    f"SELECT COUNT(*) FROM {self._config.schema_name}.{self._table_name} WHERE id = ?", [safe_id]
+                    f"SELECT COUNT(*) FROM {self._config.schema_name}.{self._table_name} WHERE id = ?",
+                    binding_params=[safe_id],
                 )
                 result = cursor.fetchone()
                 return result[0] > 0 if result else False
@@ -482,13 +748,14 @@ class ClickzettaVector(BaseVector):
     def _delete_by_ids_impl(self, ids: list[str]) -> None:
         """Implementation of delete by IDs (executed in write worker thread)."""
         safe_ids = [self._safe_doc_id(id) for id in ids]
-        # Create properly escaped string literals for SQL
-        id_list = ",".join(f"'{id}'" for id in safe_ids)
-        sql = f"DELETE FROM {self._config.schema_name}.{self._table_name} WHERE id IN ({id_list})"
 
-        connection = self._ensure_connection()
-        with connection.cursor() as cursor:
-            cursor.execute(sql)
+        # Use parameterized query to prevent SQL injection
+        placeholders = ",".join("?" for _ in safe_ids)
+        sql = f"DELETE FROM {self._config.schema_name}.{self._table_name} WHERE id IN ({placeholders})"
+
+        with self.get_connection_context() as connection:
+            with connection.cursor() as cursor:
+                cursor.execute(sql, binding_params=safe_ids)
 
     def delete_by_metadata_field(self, key: str, value: str) -> None:
         """Delete documents by metadata field."""
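
Building one `?` placeholder per id and binding the values out-of-band removes the SQL-injection surface that the old `",".join(f"'{id}'" ...)` interpolation had. A runnable sketch of the pattern using the stdlib `sqlite3` driver, which shares the `?` placeholder style (ClickZetta's cursor takes the values via `binding_params=` instead):

```python
import sqlite3

ids = ["doc-1", "doc-2", "it's-tricky"]  # embedded quotes need no manual escaping

conn = sqlite3.connect(":memory:")
conn.execute("CREATE TABLE embeddings (id TEXT)")
conn.executemany("INSERT INTO embeddings VALUES (?)", [(i,) for i in ids])

# One placeholder per id; values are bound, never spliced into the SQL text.
placeholders = ",".join("?" for _ in ids)
conn.execute(f"DELETE FROM embeddings WHERE id IN ({placeholders})", ids)

print(conn.execute("SELECT COUNT(*) FROM embeddings").fetchone()[0])  # -> 0
```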
@@ -502,7 +769,7 @@ class ClickzettaVector(BaseVector):
 
     def _delete_by_metadata_field_impl(self, key: str, value: str) -> None:
         """Implementation of delete by metadata field (executed in write worker thread)."""
-        connection = self._ensure_connection()
+        with self.get_connection_context() as connection:
             with connection.cursor() as cursor:
                 # Using JSON path to filter with parameterized query
                 # Note: JSON path requires literal key name, cannot be parameterized
@@ -511,10 +778,19 @@ class ClickzettaVector(BaseVector):
                     f"DELETE FROM {self._config.schema_name}.{self._table_name} "
                     f"WHERE json_extract_string({Field.METADATA_KEY.value}, '$.{key}') = ?"
                 )
-                cursor.execute(sql, [value])
+                cursor.execute(sql, binding_params=[value])
 
     def search_by_vector(self, query_vector: list[float], **kwargs: Any) -> list[Document]:
         """Search for documents by vector similarity."""
+        # Check if table exists first
+        if not self._table_exists():
+            logger.warning(
+                "Table %s.%s does not exist, returning empty results",
+                self._config.schema_name,
+                self._table_name,
+            )
+            return []
+
         top_k = kwargs.get("top_k", 10)
         score_threshold = kwargs.get("score_threshold", 0.0)
         document_ids_filter = kwargs.get("document_ids_filter")
@@ -565,7 +841,7 @@ class ClickzettaVector(BaseVector):
         """
 
         documents = []
-        connection = self._ensure_connection()
+        with self.get_connection_context() as connection:
             with connection.cursor() as cursor:
                 # Use hints parameter for vector search optimization
                 search_hints = {
@@ -575,37 +851,12 @@ class ClickzettaVector(BaseVector):
                         "cz.storage.parquet.vector.index.read.memory.cache": True,
                     }
                 }
-            cursor.execute(search_sql, parameters=search_hints)
+                cursor.execute(search_sql, search_hints)
                 results = cursor.fetchall()
 
                 for row in results:
-                # Parse metadata from JSON string (may be double-encoded)
-                try:
-                    if row[2]:
-                        metadata = json.loads(row[2])
-
-                        # If result is a string, it's double-encoded JSON - parse again
-                        if isinstance(metadata, str):
-                            metadata = json.loads(metadata)
-
-                        if not isinstance(metadata, dict):
-                            metadata = {}
-                    else:
-                        metadata = {}
-                except (json.JSONDecodeError, TypeError) as e:
-                    logger.exception("JSON parsing failed")
-                    # Fallback: extract document_id with regex
-                    import re
-
-                    doc_id_match = re.search(r'"document_id":\s*"([^"]+)"', str(row[2] or ""))
-                    metadata = {"document_id": doc_id_match.group(1)} if doc_id_match else {}
-
-                # Ensure required fields are set
-                metadata["doc_id"] = row[0]  # segment id
-
-                # Ensure document_id exists (critical for Dify's format_retrieval_documents)
-                if "document_id" not in metadata:
-                    metadata["document_id"] = row[0]  # fallback to segment id
+                    # Parse metadata using centralized method
+                    metadata = self._parse_metadata(row[2], row[0])
 
                     # Add score based on distance
                     if self._config.vector_distance_function == "cosine_distance":
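
The centralized `_parse_metadata` exists largely because stored metadata is sometimes double-encoded: the first `json.loads` returns a *string* that itself contains JSON. A minimal reproduction of that case and the two-pass decode:

```python
import json

# Double-encoded: the outer JSON value is a string holding more JSON.
raw = json.dumps(json.dumps({"document_id": "abc-123"}))

metadata = json.loads(raw)
if isinstance(metadata, str):  # first pass produced a str, not a dict
    metadata = json.loads(metadata)  # second pass recovers the dict

print(metadata)  # -> {'document_id': 'abc-123'}
```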
@@ -624,6 +875,15 @@ class ClickzettaVector(BaseVector):
             logger.warning("Full-text search is not enabled. Enable inverted index in config.")
             return []
 
+        # Check if table exists first
+        if not self._table_exists():
+            logger.warning(
+                "Table %s.%s does not exist, returning empty results",
+                self._config.schema_name,
+                self._table_name,
+            )
+            return []
+
         top_k = kwargs.get("top_k", 10)
         document_ids_filter = kwargs.get("document_ids_filter")
 
@@ -659,7 +919,7 @@ class ClickzettaVector(BaseVector):
         """
 
         documents = []
-        connection = self._ensure_connection()
+        with self.get_connection_context() as connection:
             with connection.cursor() as cursor:
                 try:
                     # Use hints parameter for full-text search optimization
@@ -670,7 +930,7 @@ class ClickzettaVector(BaseVector):
                             "cz.sql.index.prewhere.enabled": True,
                         }
                     }
-                cursor.execute(search_sql, parameters=fulltext_hints)
+                    cursor.execute(search_sql, fulltext_hints)
                     results = cursor.fetchall()
 
                     for row in results:
@@ -690,7 +950,6 @@ class ClickzettaVector(BaseVector):
                         except (json.JSONDecodeError, TypeError) as e:
                             logger.exception("JSON parsing failed")
                             # Fallback: extract document_id with regex
-                            import re
 
                             doc_id_match = re.search(r'"document_id":\s*"([^"]+)"', str(row[2] or ""))
                             metadata = {"document_id": doc_id_match.group(1)} if doc_id_match else {}
@@ -715,6 +974,15 @@ class ClickzettaVector(BaseVector):
 
     def _search_by_like(self, query: str, **kwargs: Any) -> list[Document]:
         """Fallback search using LIKE operator."""
+        # Check if table exists first
+        if not self._table_exists():
+            logger.warning(
+                "Table %s.%s does not exist, returning empty results",
+                self._config.schema_name,
+                self._table_name,
+            )
+            return []
+
         top_k = kwargs.get("top_k", 10)
         document_ids_filter = kwargs.get("document_ids_filter")
 
@@ -746,7 +1014,7 @@ class ClickzettaVector(BaseVector):
         """
 
         documents = []
-        connection = self._ensure_connection()
+        with self.get_connection_context() as connection:
             with connection.cursor() as cursor:
                 # Use hints parameter for LIKE search optimization
                 like_hints = {
@@ -755,37 +1023,12 @@ class ClickzettaVector(BaseVector):
                         "cz.sql.job.fast.mode": True,
                     }
                 }
-            cursor.execute(search_sql, parameters=like_hints)
+                cursor.execute(search_sql, like_hints)
                 results = cursor.fetchall()
 
                 for row in results:
-                # Parse metadata from JSON string (may be double-encoded)
-                try:
-                    if row[2]:
-                        metadata = json.loads(row[2])
-
-                        # If result is a string, it's double-encoded JSON - parse again
-                        if isinstance(metadata, str):
-                            metadata = json.loads(metadata)
-
-                        if not isinstance(metadata, dict):
-                            metadata = {}
-                    else:
-                        metadata = {}
-                except (json.JSONDecodeError, TypeError) as e:
-                    logger.exception("JSON parsing failed")
-                    # Fallback: extract document_id with regex
-                    import re
-
-                    doc_id_match = re.search(r'"document_id":\s*"([^"]+)"', str(row[2] or ""))
-                    metadata = {"document_id": doc_id_match.group(1)} if doc_id_match else {}
-
-                # Ensure required fields are set
-                metadata["doc_id"] = row[0]  # segment id
-
-                # Ensure document_id exists (critical for Dify's format_retrieval_documents)
-                if "document_id" not in metadata:
-                    metadata["document_id"] = row[0]  # fallback to segment id
+                    # Parse metadata using centralized method
+                    metadata = self._parse_metadata(row[2], row[0])
 
                     metadata["score"] = 0.5  # Lower score for LIKE search
                     doc = Document(page_content=row[1], metadata=metadata)
@@ -795,7 +1038,7 @@ class ClickzettaVector(BaseVector):
 
     def delete(self) -> None:
         """Delete the entire collection."""
-        connection = self._ensure_connection()
+        with self.get_connection_context() as connection:
             with connection.cursor() as cursor:
                 cursor.execute(f"DROP TABLE IF EXISTS {self._config.schema_name}.{self._table_name}")
 
@@ -59,7 +59,14 @@ def clean_dataset_task(
         # Fix: Always clean vector database resources regardless of document existence
         # This ensures all 33 vector databases properly drop tables/collections/indices
         if doc_form is None:
-            raise ValueError("Index type must be specified.")
+            # Use default paragraph index type for empty datasets to enable vector database cleanup
+            from core.rag.index_processor.constant.index_type import IndexType
+
+            doc_form = IndexType.PARAGRAPH_INDEX
+            logging.info(
+                click.style(f"No documents found, using default index type for cleanup: {doc_form}", fg="yellow")
+            )
+
         index_processor = IndexProcessorFactory(doc_form).init_index_processor()
         index_processor.clean(dataset, None, with_keywords=True, delete_child_chunks=True)