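"""Integration tests for the Clickzetta vector store.

These tests run against a live Clickzetta instance and skip themselves unless
CLICKZETTA_USERNAME, CLICKZETTA_PASSWORD, and CLICKZETTA_INSTANCE are set in
the environment; CLICKZETTA_SERVICE, CLICKZETTA_WORKSPACE, CLICKZETTA_VCLUSTER,
and CLICKZETTA_SCHEMA are optional and fall back to the defaults used below.
An illustrative invocation (the exact path depends on the repository layout):

    pytest -k clickzetta tests/integration_tests/vdb/
"""
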
import contextlib
import os

import pytest

from core.rag.datasource.vdb.clickzetta.clickzetta_vector import ClickzettaConfig, ClickzettaVector
from core.rag.models.document import Document
from tests.integration_tests.vdb.test_vector_store import AbstractVectorTest, get_example_text, setup_mock_redis


class TestClickzettaVector(AbstractVectorTest):
    """Test cases for Clickzetta vector database integration."""

    @pytest.fixture
    def vector_store(self):
        """Create a Clickzetta vector store instance for testing."""
        # Skip test if Clickzetta credentials are not configured
        if not os.getenv("CLICKZETTA_USERNAME"):
            pytest.skip("CLICKZETTA_USERNAME is not configured")
        if not os.getenv("CLICKZETTA_PASSWORD"):
            pytest.skip("CLICKZETTA_PASSWORD is not configured")
        if not os.getenv("CLICKZETTA_INSTANCE"):
            pytest.skip("CLICKZETTA_INSTANCE is not configured")

        config = ClickzettaConfig(
            username=os.getenv("CLICKZETTA_USERNAME", ""),
            password=os.getenv("CLICKZETTA_PASSWORD", ""),
            instance=os.getenv("CLICKZETTA_INSTANCE", ""),
            service=os.getenv("CLICKZETTA_SERVICE", "api.clickzetta.com"),
            workspace=os.getenv("CLICKZETTA_WORKSPACE", "quick_start"),
            vcluster=os.getenv("CLICKZETTA_VCLUSTER", "default_ap"),
            schema=os.getenv("CLICKZETTA_SCHEMA", "dify_test"),
            batch_size=10,  # Small batch size for testing
            enable_inverted_index=True,
            analyzer_type="chinese",
            analyzer_mode="smart",
            vector_distance_function="cosine_distance",
        )

        with setup_mock_redis():
            vector = ClickzettaVector(collection_name="test_collection_" + str(os.getpid()), config=config)

        yield vector

        # Cleanup: delete the test collection
        with contextlib.suppress(Exception):
            vector.delete()
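
    # The tests below cover the vector-store API surface used by Dify's RAG
    # layer: create(), add_texts(), text_exists(), search_by_vector(),
    # search_by_full_text(), delete_by_ids(), and delete_by_metadata_field().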

    def test_clickzetta_vector_basic_operations(self, vector_store):
        """Test basic CRUD operations on the Clickzetta vector store."""
        # Prepare test data; the Chinese strings exercise the "chinese" analyzer
        texts = [
            "这是第一个测试文档,包含一些中文内容。",  # "This is the first test document, with some Chinese content."
            "This is the second test document with English content.",
            "第三个文档混合了English和中文内容。",  # "The third document mixes English and Chinese content."
        ]
        embeddings = [
            [0.1, 0.2, 0.3, 0.4],
            [0.5, 0.6, 0.7, 0.8],
            [0.9, 1.0, 1.1, 1.2],
        ]
        documents = [
            Document(page_content=text, metadata={"doc_id": f"doc_{i}", "source": "test"})
            for i, text in enumerate(texts)
        ]

        # Test create (initial insert)
        vector_store.create(texts=documents, embeddings=embeddings)

        # Test text_exists
        assert vector_store.text_exists("doc_0")
        assert not vector_store.text_exists("doc_999")

        # Test search_by_vector: the query equals doc_0's embedding, so under
        # cosine distance the first document should rank first
        query_vector = [0.1, 0.2, 0.3, 0.4]
        results = vector_store.search_by_vector(query_vector, top_k=2)
        assert len(results) > 0
        assert results[0].page_content == texts[0]  # Should match the first document

        # Test search_by_full_text (Chinese): "中文" ("Chinese") appears in two documents
        results = vector_store.search_by_full_text("中文", top_k=3)
        assert len(results) >= 2

        # Test search_by_full_text (English): "English" appears in two documents
        results = vector_store.search_by_full_text("English", top_k=3)
        assert len(results) >= 2

        # Test delete_by_ids
        vector_store.delete_by_ids(["doc_0"])
        assert not vector_store.text_exists("doc_0")
        assert vector_store.text_exists("doc_1")

        # Test delete_by_metadata_field: this removes all remaining test documents
        vector_store.delete_by_metadata_field("source", "test")
        assert not vector_store.text_exists("doc_1")
        assert not vector_store.text_exists("doc_2")

    def test_clickzetta_vector_advanced_search(self, vector_store):
        """Test advanced search features of the Clickzetta vector store."""
        # Prepare test data with more complex metadata
        documents = []
        embeddings = []
        for i in range(10):
            doc = Document(
                page_content=f"Document {i}: " + get_example_text(),
                metadata={
                    "doc_id": f"adv_doc_{i}",
                    "category": "technical" if i % 2 == 0 else "general",
                    "document_id": f"doc_{i // 3}",  # Group documents three per group
                    "importance": i,
                },
            )
            documents.append(doc)
            # Create varied embeddings
            embeddings.append([0.1 * i, 0.2 * i, 0.3 * i, 0.4 * i])

        vector_store.create(texts=documents, embeddings=embeddings)

        # Test vector search with a document filter
        query_vector = [0.5, 1.0, 1.5, 2.0]
        results = vector_store.search_by_vector(query_vector, top_k=5, document_ids_filter=["doc_0", "doc_1"])
        assert len(results) > 0
        # All results should belong to the doc_0 or doc_1 groups
        for result in results:
            assert result.metadata["document_id"] in ["doc_0", "doc_1"]

        # Test score threshold
        results = vector_store.search_by_vector(query_vector, top_k=10, score_threshold=0.5)
        # Check that all results have a score above the threshold
        for result in results:
            assert result.metadata.get("score", 0) >= 0.5

    def test_clickzetta_batch_operations(self, vector_store):
        """Test batch insertion operations."""
        # Prepare a batch larger than the configured batch_size of 10 so the
        # insert is split across multiple batches
        batch_size = 25
        documents = []
        embeddings = []

        for i in range(batch_size):
            doc = Document(
                page_content=f"Batch document {i}: This is a test document for batch processing.",
                metadata={"doc_id": f"batch_doc_{i}", "batch": "test_batch"},
            )
            documents.append(doc)
            embeddings.append([0.1 * (i % 10), 0.2 * (i % 10), 0.3 * (i % 10), 0.4 * (i % 10)])

        # Test batch insert
        vector_store.add_texts(documents=documents, embeddings=embeddings)

        # Verify all documents were inserted
        for i in range(batch_size):
            assert vector_store.text_exists(f"batch_doc_{i}")

        # Clean up
        vector_store.delete_by_metadata_field("batch", "test_batch")

    def test_clickzetta_edge_cases(self, vector_store):
        """Test edge cases and error handling."""
        # Empty operations should be no-ops rather than raise errors
        vector_store.create(texts=[], embeddings=[])
        vector_store.add_texts(documents=[], embeddings=[])
        vector_store.delete_by_ids([])

        # Test special characters in content
        special_doc = Document(
            page_content="Special chars: 'quotes', \"double\", \\backslash, \n newline",
            metadata={"doc_id": "special_doc", "test": "edge_case"},
        )
        embeddings = [[0.1, 0.2, 0.3, 0.4]]

        vector_store.add_texts(documents=[special_doc], embeddings=embeddings)
        assert vector_store.text_exists("special_doc")

        # Test search with special characters
        results = vector_store.search_by_full_text("quotes", top_k=1)
        if results:  # Full-text search might not be available
            assert results[0].metadata["doc_id"] == "special_doc"

        # Clean up
        vector_store.delete_by_ids(["special_doc"])

    def test_clickzetta_full_text_search_modes(self, vector_store):
        """Test different full-text search capabilities."""
        # Prepare documents with mixed-language content
        documents = [
            Document(
                # "Yunqi Tech provides powerful Lakehouse solutions"
                page_content="云器科技提供强大的Lakehouse解决方案",
                metadata={"doc_id": "cn_doc_1", "lang": "chinese"},
            ),
            Document(
                page_content="Clickzetta provides powerful Lakehouse solutions",
                metadata={"doc_id": "en_doc_1", "lang": "english"},
            ),
            Document(
                # "Lakehouse is a key component of modern data architecture"
                page_content="Lakehouse是现代数据架构的重要组成部分",
                metadata={"doc_id": "cn_doc_2", "lang": "chinese"},
            ),
            Document(
                page_content="Modern data architecture includes Lakehouse technology",
                metadata={"doc_id": "en_doc_2", "lang": "english"},
            ),
        ]

        embeddings = [[0.1, 0.2, 0.3, 0.4] for _ in documents]

        vector_store.create(texts=documents, embeddings=embeddings)

        # Test a keyword that appears in both the Chinese and English documents
        results = vector_store.search_by_full_text("Lakehouse", top_k=4)
        assert len(results) >= 2  # Should find documents containing "Lakehouse"

        # Test English full-text search
        results = vector_store.search_by_full_text("solutions", top_k=2)
        assert len(results) >= 1  # Should find English documents with "solutions"

        # Test Chinese full-text search ("数据架构" means "data architecture")
        results = vector_store.search_by_full_text("数据架构", top_k=2)
        assert len(results) >= 1  # Should find Chinese documents with this phrase

        # Clean up
        vector_store.delete_by_metadata_field("lang", "chinese")
        vector_store.delete_by_metadata_field("lang", "english")