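"""Integration tests for the Clickzetta vector store.

These tests run against a live Clickzetta instance and skip themselves unless
CLICKZETTA_USERNAME, CLICKZETTA_PASSWORD, and CLICKZETTA_INSTANCE are set in
the environment; CLICKZETTA_SERVICE, CLICKZETTA_WORKSPACE, CLICKZETTA_VCLUSTER,
and CLICKZETTA_SCHEMA are optional and fall back to the defaults used below.
An illustrative invocation (the exact path depends on the repository layout):

    pytest -k clickzetta tests/integration_tests/vdb/
"""
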
import contextlib
import os

import pytest

from core.rag.datasource.vdb.clickzetta.clickzetta_vector import ClickzettaConfig, ClickzettaVector
from core.rag.models.document import Document
from tests.integration_tests.vdb.test_vector_store import AbstractVectorTest, get_example_text, setup_mock_redis


class TestClickzettaVector(AbstractVectorTest):
    """Test cases for Clickzetta vector database integration."""

    @pytest.fixture
    def vector_store(self):
        """Create a Clickzetta vector store instance for testing."""
        # Skip test if Clickzetta credentials are not configured
        if not os.getenv("CLICKZETTA_USERNAME"):
            pytest.skip("CLICKZETTA_USERNAME is not configured")
        if not os.getenv("CLICKZETTA_PASSWORD"):
            pytest.skip("CLICKZETTA_PASSWORD is not configured")
        if not os.getenv("CLICKZETTA_INSTANCE"):
            pytest.skip("CLICKZETTA_INSTANCE is not configured")

        config = ClickzettaConfig(
            username=os.getenv("CLICKZETTA_USERNAME", ""),
            password=os.getenv("CLICKZETTA_PASSWORD", ""),
            instance=os.getenv("CLICKZETTA_INSTANCE", ""),
            service=os.getenv("CLICKZETTA_SERVICE", "api.clickzetta.com"),
            workspace=os.getenv("CLICKZETTA_WORKSPACE", "quick_start"),
            vcluster=os.getenv("CLICKZETTA_VCLUSTER", "default_ap"),
            schema=os.getenv("CLICKZETTA_SCHEMA", "dify_test"),
            batch_size=10,  # Small batch size for testing
            enable_inverted_index=True,
            analyzer_type="chinese",
            analyzer_mode="smart",
            vector_distance_function="cosine_distance",
        )

        with setup_mock_redis():
            vector = ClickzettaVector(collection_name="test_collection_" + str(os.getpid()), config=config)

        yield vector

        # Cleanup: delete the test collection
        with contextlib.suppress(Exception):
            vector.delete()
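
    # The tests below cover the vector-store API surface used by Dify's RAG
    # layer: create(), add_texts(), text_exists(), search_by_vector(),
    # search_by_full_text(), delete_by_ids(), and delete_by_metadata_field().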

    def test_clickzetta_vector_basic_operations(self, vector_store):
        """Test basic CRUD operations on the Clickzetta vector store."""
        # Prepare test data; the Chinese strings exercise the "chinese" analyzer
        texts = [
            "这是第一个测试文档,包含一些中文内容。",  # "This is the first test document, with some Chinese content."
            "This is the second test document with English content.",
            "第三个文档混合了English和中文内容。",  # "The third document mixes English and Chinese content."
        ]
        embeddings = [
            [0.1, 0.2, 0.3, 0.4],
            [0.5, 0.6, 0.7, 0.8],
            [0.9, 1.0, 1.1, 1.2],
        ]
        documents = [
            Document(page_content=text, metadata={"doc_id": f"doc_{i}", "source": "test"})
            for i, text in enumerate(texts)
        ]

        # Test create (initial insert)
        vector_store.create(texts=documents, embeddings=embeddings)

        # Test text_exists
        assert vector_store.text_exists("doc_0")
        assert not vector_store.text_exists("doc_999")

        # Test search_by_vector: the query equals doc_0's embedding, so under
        # cosine distance the first document should rank first
        query_vector = [0.1, 0.2, 0.3, 0.4]
        results = vector_store.search_by_vector(query_vector, top_k=2)
        assert len(results) > 0
        assert results[0].page_content == texts[0]  # Should match the first document

        # Test search_by_full_text (Chinese): "中文" ("Chinese") appears in two documents
        results = vector_store.search_by_full_text("中文", top_k=3)
        assert len(results) >= 2

        # Test search_by_full_text (English): "English" appears in two documents
        results = vector_store.search_by_full_text("English", top_k=3)
        assert len(results) >= 2

        # Test delete_by_ids
        vector_store.delete_by_ids(["doc_0"])
        assert not vector_store.text_exists("doc_0")
        assert vector_store.text_exists("doc_1")

        # Test delete_by_metadata_field: this removes all remaining test documents
        vector_store.delete_by_metadata_field("source", "test")
        assert not vector_store.text_exists("doc_1")
        assert not vector_store.text_exists("doc_2")

    def test_clickzetta_vector_advanced_search(self, vector_store):
        """Test advanced search features of the Clickzetta vector store."""
        # Prepare test data with more complex metadata
        documents = []
        embeddings = []
        for i in range(10):
            doc = Document(
                page_content=f"Document {i}: " + get_example_text(),
                metadata={
                    "doc_id": f"adv_doc_{i}",
                    "category": "technical" if i % 2 == 0 else "general",
                    "document_id": f"doc_{i // 3}",  # Group documents three per group
                    "importance": i,
                },
            )
            documents.append(doc)
            # Create varied embeddings
            embeddings.append([0.1 * i, 0.2 * i, 0.3 * i, 0.4 * i])

        vector_store.create(texts=documents, embeddings=embeddings)

        # Test vector search with a document filter
        query_vector = [0.5, 1.0, 1.5, 2.0]
        results = vector_store.search_by_vector(query_vector, top_k=5, document_ids_filter=["doc_0", "doc_1"])
        assert len(results) > 0
        # All results should belong to the doc_0 or doc_1 groups
        for result in results:
            assert result.metadata["document_id"] in ["doc_0", "doc_1"]

        # Test score threshold
        results = vector_store.search_by_vector(query_vector, top_k=10, score_threshold=0.5)
        # Check that all results have a score above the threshold
        for result in results:
            assert result.metadata.get("score", 0) >= 0.5

    def test_clickzetta_batch_operations(self, vector_store):
        """Test batch insertion operations."""
        # Prepare a batch larger than the configured batch_size of 10 so the
        # insert is split across multiple batches
        batch_size = 25
        documents = []
        embeddings = []

        for i in range(batch_size):
            doc = Document(
                page_content=f"Batch document {i}: This is a test document for batch processing.",
                metadata={"doc_id": f"batch_doc_{i}", "batch": "test_batch"},
            )
            documents.append(doc)
            embeddings.append([0.1 * (i % 10), 0.2 * (i % 10), 0.3 * (i % 10), 0.4 * (i % 10)])

        # Test batch insert
        vector_store.add_texts(documents=documents, embeddings=embeddings)

        # Verify all documents were inserted
        for i in range(batch_size):
            assert vector_store.text_exists(f"batch_doc_{i}")

        # Clean up
        vector_store.delete_by_metadata_field("batch", "test_batch")

    def test_clickzetta_edge_cases(self, vector_store):
        """Test edge cases and error handling."""
        # Empty operations should be no-ops rather than raise errors
        vector_store.create(texts=[], embeddings=[])
        vector_store.add_texts(documents=[], embeddings=[])
        vector_store.delete_by_ids([])

        # Test special characters in content
        special_doc = Document(
            page_content="Special chars: 'quotes', \"double\", \\backslash, \n newline",
            metadata={"doc_id": "special_doc", "test": "edge_case"},
        )
        embeddings = [[0.1, 0.2, 0.3, 0.4]]

        vector_store.add_texts(documents=[special_doc], embeddings=embeddings)
        assert vector_store.text_exists("special_doc")

        # Test search with special characters
        results = vector_store.search_by_full_text("quotes", top_k=1)
        if results:  # Full-text search might not be available
            assert results[0].metadata["doc_id"] == "special_doc"

        # Clean up
        vector_store.delete_by_ids(["special_doc"])

    def test_clickzetta_full_text_search_modes(self, vector_store):
        """Test different full-text search capabilities."""
        # Prepare documents with mixed-language content
        documents = [
            Document(
                # "Yunqi Tech provides powerful Lakehouse solutions"
                page_content="云器科技提供强大的Lakehouse解决方案",
                metadata={"doc_id": "cn_doc_1", "lang": "chinese"},
            ),
            Document(
                page_content="Clickzetta provides powerful Lakehouse solutions",
                metadata={"doc_id": "en_doc_1", "lang": "english"},
            ),
            Document(
                # "Lakehouse is a key component of modern data architecture"
                page_content="Lakehouse是现代数据架构的重要组成部分",
                metadata={"doc_id": "cn_doc_2", "lang": "chinese"},
            ),
            Document(
                page_content="Modern data architecture includes Lakehouse technology",
                metadata={"doc_id": "en_doc_2", "lang": "english"},
            ),
        ]

        embeddings = [[0.1, 0.2, 0.3, 0.4] for _ in documents]

        vector_store.create(texts=documents, embeddings=embeddings)

        # Test a keyword that appears in both the Chinese and English documents
        results = vector_store.search_by_full_text("Lakehouse", top_k=4)
        assert len(results) >= 2  # Should find documents containing "Lakehouse"

        # Test English full-text search
        results = vector_store.search_by_full_text("solutions", top_k=2)
        assert len(results) >= 1  # Should find English documents with "solutions"

        # Test Chinese full-text search ("数据架构" means "data architecture")
        results = vector_store.search_by_full_text("数据架构", top_k=2)
        assert len(results) >= 1  # Should find Chinese documents with this phrase

        # Clean up
        vector_store.delete_by_metadata_field("lang", "chinese")
        vector_store.delete_by_metadata_field("lang", "english")