feat: add upstash as a new vector database provider (#9644)

2024-10-23 09:16:35 +08:00
parent 999d3f1539
commit 8e7a752b2a
16 changed files with 365 additions and 4 deletions
--- a/api/tests/integration_tests/vdb/__mock/upstashvectordb.py
+++ b/api/tests/integration_tests/vdb/__mock/upstashvectordb.py
@@ -0,0 +1,105 @@
+import os
+from typing import Optional
+
+import pytest
+from _pytest.monkeypatch import MonkeyPatch
+from upstash_vector import Index
+
+
+def _normalize_ids(ids):
+    """Return ``ids`` as a set, treating a bare string as one id.
+
+    ``fetch`` and ``delete`` test membership with ``in``; against a bare string
+    that is a substring check, so the id ``"a"`` would wrongly match ``"abc"``.
+    """
+    if isinstance(ids, str):
+        return {ids}
+    return set(ids)
+
+
+# Mocking the Index class from upstash_vector
+class MockIndex:
+    """In-memory stand-in for ``upstash_vector.Index``.
+
+    The fixture in this module copies these functions onto ``Index`` one by
+    one, so at call time ``self`` is an ``Index`` instance: the methods may use
+    only attributes set in ``__init__`` and module-level helpers, never helper
+    methods that are not patched in.
+    """
+
+    def __init__(self, url="", token=""):
+        self.url = url
+        self.token = token
+        # Stored vectors, in insertion order.
+        self.vectors = []
+
+    def upsert(self, vectors):
+        """Append ``vectors`` to the store and report how many were received.
+
+        Each vector is stamped with a fixed ``score`` of 0.5, presumably
+        because ``query`` hands the stored objects back as search results.
+        Existing entries with the same id are not replaced.
+        """
+        for vector in vectors:
+            vector.score = 0.5
+            self.vectors.append(vector)
+        return {"code": 0, "msg": "operation success", "affectedCount": len(vectors)}
+
+    def fetch(self, ids):
+        """Return the stored vectors whose ``id`` is in ``ids`` (one id or an iterable of ids)."""
+        wanted = _normalize_ids(ids)
+        return [vector for vector in self.vectors if vector.id in wanted]
+
+    def delete(self, ids):
+        """Remove the stored vectors whose ``id`` is in ``ids`` (one id or an iterable of ids)."""
+        unwanted = _normalize_ids(ids)
+        self.vectors = [vector for vector in self.vectors if vector.id not in unwanted]
+        return {"code": 0, "msg": "Success"}
+
+    def query(
+        self,
+        vector: Optional[list[float]] = None,
+        top_k: int = 10,
+        include_vectors: bool = False,
+        include_metadata: bool = False,
+        filter: str = "",
+        data: Optional[str] = None,
+        namespace: str = "",
+        include_data: bool = False,
+    ):
+        """Return up to ``top_k`` stored vectors in insertion order.
+
+        Simple mock query: no similarity is calculated, and every argument
+        other than ``top_k`` (``filter`` included) is accepted but ignored.
+        """
+        return self.vectors[:top_k]
+
+    def reset(self):
+        """Forget every stored vector."""
+        self.vectors = []
+
+    def info(self):
+        """Return index metadata; only a fixed ``dimension`` of 1024 is exposed."""
+        return AttrDict({"dimension": 1024})
+
+
+class AttrDict(dict):
+    """Dict whose keys can also be read as attributes; a missing key reads as ``None``."""
+
+    def __getattr__(self, item):
+        return self.get(item)
+
+
+# Patching is opt-in: it only happens when MOCK_SWITCH is "true" (case-insensitive).
+MOCK = os.getenv("MOCK_SWITCH", "false").lower() == "true"
+
+
+@pytest.fixture
+def setup_upstashvector_mock(request, monkeypatch: MonkeyPatch):
+    """Swap the ``Index`` methods for their ``MockIndex`` versions when ``MOCK`` is on."""
+    if MOCK:
+        for name in ("__init__", "upsert", "fetch", "delete", "query", "reset", "info"):
+            monkeypatch.setattr(Index, name, getattr(MockIndex, name))
+
+    # No explicit undo: pytest's monkeypatch fixture restores everything itself at teardown.
+    yield
--- a/api/tests/integration_tests/vdb/upstash/__init__.py
+++ b/api/tests/integration_tests/vdb/upstash/__init__.py
--- a/api/tests/integration_tests/vdb/upstash/test_upstash_vector.py
+++ b/api/tests/integration_tests/vdb/upstash/test_upstash_vector.py
@@ -0,0 +1,72 @@
+import time
+import uuid
+
+from core.rag.datasource.vdb.upstash.upstash_vector import UpstashVector, UpstashVectorConfig
+from core.rag.models.document import Document
+from tests.integration_tests.vdb.__mock.upstashvectordb import setup_upstashvector_mock
+from tests.integration_tests.vdb.test_vector_store import AbstractVectorTest
+
+
+def get_example_text() -> str:
+    """Return the page content shared by every example document."""
+    return "test_text"
+
+
+def get_example_document(doc_id: str) -> Document:
+    """Build an example document whose id-like metadata fields all equal ``doc_id``."""
+    return Document(
+        page_content=get_example_text(),
+        metadata={
+            "doc_id": doc_id,
+            "doc_hash": doc_id,
+            "document_id": doc_id,
+            "dataset_id": doc_id,
+        },
+    )
+
+
+class UpstashVectorTest(AbstractVectorTest):
+    """Drives the checks inherited from ``AbstractVectorTest`` against ``UpstashVector``."""
+
+    def __init__(self):
+        super().__init__()
+        # The url/token below are placeholders, not working credentials.
+        self.vector = UpstashVector(
+            collection_name="test_collection",
+            config=UpstashVectorConfig(
+                url="your-server-url",
+                token="your-access-token",
+            ),
+        )
+        # One component per index dimension so the embedding matches the index size.
+        self.example_embedding = [1.001 * i for i in range(self.vector._get_index_dimension())]
+
+    def add_texts(self) -> list[str]:
+        """Add freshly generated documents and return their ``doc_id`` values."""
+        batch_size = 1
+        documents = [get_example_document(doc_id=str(uuid.uuid4())) for _ in range(batch_size)]
+        embeddings = [self.example_embedding] * batch_size
+        self.vector.add_texts(documents=documents, embeddings=embeddings)
+        return [doc.metadata["doc_id"] for doc in documents]
+
+    def get_ids_by_metadata_field(self):
+        """Assert that looking up ``document_id == self.example_doc_id`` yields at least one id."""
+        ids = self.vector.get_ids_by_metadata_field(key="document_id", value=self.example_doc_id)
+        assert ids, f"no ids found for document_id={self.example_doc_id!r}"
+
+    def run_all_tests(self):
+        """Run the checks in order: create, search, existence, metadata lookup, add, delete."""
+        self.create_vector()
+        # Pause before reading back; presumably gives the index time to apply the writes.
+        time.sleep(1)
+        self.search_by_vector()
+        self.text_exists()
+        self.get_ids_by_metadata_field()
+        added_doc_ids = self.add_texts()
+        self.delete_by_ids(added_doc_ids + [self.example_doc_id])
+        self.delete_vector()
+
+
+def test_upstash_vector(setup_upstashvector_mock):
+    """Run the whole Upstash suite; the fixture patches ``Index`` when mocking is enabled."""
+    UpstashVectorTest().run_all_tests()