diff --git a/api/.env.example b/api/.env.example index a609fae80..04b3a9224 100644 --- a/api/.env.example +++ b/api/.env.example @@ -297,6 +297,7 @@ OCEANBASE_VECTOR_USER=root@test OCEANBASE_VECTOR_PASSWORD=difyai123456 OCEANBASE_VECTOR_DATABASE=test OCEANBASE_MEMORY_LIMIT=6G +OCEANBASE_ENABLE_HYBRID_SEARCH=false # openGauss configuration OPENGAUSS_HOST=127.0.0.1 diff --git a/api/configs/middleware/vdb/oceanbase_config.py b/api/configs/middleware/vdb/oceanbase_config.py index 87427af96..9b11a2273 100644 --- a/api/configs/middleware/vdb/oceanbase_config.py +++ b/api/configs/middleware/vdb/oceanbase_config.py @@ -33,3 +33,9 @@ class OceanBaseVectorConfig(BaseSettings): description="Name of the OceanBase Vector database to connect to", default=None, ) + + OCEANBASE_ENABLE_HYBRID_SEARCH: bool = Field( + description="Enable hybrid search features (requires OceanBase >= 4.3.5.1). Set to false for compatibility " + "with older versions", + default=False, + ) diff --git a/api/controllers/console/datasets/datasets.py b/api/controllers/console/datasets/datasets.py index fed304f72..52dd0242e 100644 --- a/api/controllers/console/datasets/datasets.py +++ b/api/controllers/console/datasets/datasets.py @@ -646,7 +646,6 @@ class DatasetRetrievalSettingApi(Resource): | VectorType.BAIDU | VectorType.VIKINGDB | VectorType.UPSTASH - | VectorType.OCEANBASE ): return {"retrieval_method": [RetrievalMethod.SEMANTIC_SEARCH.value]} case ( @@ -664,6 +663,7 @@ class DatasetRetrievalSettingApi(Resource): | VectorType.COUCHBASE | VectorType.MILVUS | VectorType.OPENGAUSS + | VectorType.OCEANBASE ): return { "retrieval_method": [ @@ -692,7 +692,6 @@ class DatasetRetrievalSettingMockApi(Resource): | VectorType.BAIDU | VectorType.VIKINGDB | VectorType.UPSTASH - | VectorType.OCEANBASE ): return {"retrieval_method": [RetrievalMethod.SEMANTIC_SEARCH.value]} case ( @@ -708,6 +707,7 @@ class DatasetRetrievalSettingMockApi(Resource): | VectorType.PGVECTOR | VectorType.LINDORM | VectorType.OPENGAUSS + | VectorType.OCEANBASE ): return { "retrieval_method": [ diff --git a/api/core/rag/datasource/vdb/oceanbase/oceanbase_vector.py b/api/core/rag/datasource/vdb/oceanbase/oceanbase_vector.py index 8ff97f2f2..f4145f539 100644 --- a/api/core/rag/datasource/vdb/oceanbase/oceanbase_vector.py +++ b/api/core/rag/datasource/vdb/oceanbase/oceanbase_vector.py @@ -31,6 +31,7 @@ class OceanBaseVectorConfig(BaseModel): user: str password: str database: str + enable_hybrid_search: bool = False @model_validator(mode="before") @classmethod @@ -57,6 +58,7 @@ class OceanBaseVector(BaseVector): password=self._config.password, db_name=self._config.database, ) + self._hybrid_search_enabled = self._check_hybrid_search_support() # Check if hybrid search is supported def get_type(self) -> str: return VectorType.OCEANBASE @@ -98,6 +100,16 @@ class OceanBaseVector(BaseVector): columns=cols, vidxs=vidx_params, ) + try: + if self._hybrid_search_enabled: + self._client.perform_raw_text_sql(f"""ALTER TABLE {self._collection_name} + ADD FULLTEXT INDEX fulltext_index_for_col_text (text) WITH PARSER ik""") + except Exception as e: + raise Exception( + "Failed to add fulltext index to the target table, your OceanBase version must be 4.3.5.1 or above " + + "to support fulltext index and vector index in the same table", + e, + ) vals = [] params = self._client.perform_raw_text_sql("SHOW PARAMETERS LIKE '%ob_vector_memory_limit_percentage%'") for row in params: @@ -116,6 +128,27 @@ class OceanBaseVector(BaseVector): ) redis_client.set(collection_exist_cache_key, 1, ex=3600) + def _check_hybrid_search_support(self) -> bool: + """ + Check if the current OceanBase version supports hybrid search. + Returns True if the version is >= 4.3.5.1, otherwise False. + """ + if not self._config.enable_hybrid_search: + return False + + try: + from packaging import version + + # return OceanBase_CE 4.3.5.1 (r101000042025031818-bxxxx) (Built Mar 18 2025 18:13:36) + result = self._client.perform_raw_text_sql("SELECT @@version_comment AS version") + ob_full_version = result.fetchone()[0] + ob_version = ob_full_version.split()[1] + logger.debug("Current OceanBase version is %s", ob_version) + return version.parse(ob_version).base_version >= version.parse("4.3.5.1").base_version + except Exception as e: + logger.warning(f"Failed to check OceanBase version: {str(e)}. Disabling hybrid search.") + return False + def add_texts(self, documents: list[Document], embeddings: list[list[float]], **kwargs): ids = self._get_uuids(documents) for id, doc, emb in zip(ids, documents, embeddings): @@ -130,7 +163,7 @@ class OceanBaseVector(BaseVector): ) def text_exists(self, id: str) -> bool: - cur = self._client.get(table_name=self._collection_name, id=id) + cur = self._client.get(table_name=self._collection_name, ids=id) return bool(cur.rowcount != 0) def delete_by_ids(self, ids: list[str]) -> None: @@ -139,9 +172,12 @@ class OceanBaseVector(BaseVector): self._client.delete(table_name=self._collection_name, ids=ids) def get_ids_by_metadata_field(self, key: str, value: str) -> list[str]: + from sqlalchemy import text + cur = self._client.get( table_name=self._collection_name, - where_clause=f"metadata->>'$.{key}' = '{value}'", + ids=None, + where_clause=[text(f"metadata->>'$.{key}' = '{value}'")], output_column_name=["id"], ) return [row[0] for row in cur] @@ -151,36 +187,84 @@ class OceanBaseVector(BaseVector): self.delete_by_ids(ids) def search_by_full_text(self, query: str, **kwargs: Any) -> list[Document]: - return [] + if not self._hybrid_search_enabled: + return [] + + try: + top_k = kwargs.get("top_k", 5) + if not isinstance(top_k, int) or top_k <= 0: + raise ValueError("top_k must be a positive integer") + + document_ids_filter = kwargs.get("document_ids_filter") + where_clause = "" + if document_ids_filter: + document_ids = ", ".join(f"'{id}'" for id in document_ids_filter) + where_clause = f" AND metadata->>'$.document_id' IN ({document_ids})" + + full_sql = f"""SELECT metadata, text, MATCH (text) AGAINST (:query) AS score + FROM {self._collection_name} + WHERE MATCH (text) AGAINST (:query) > 0 + {where_clause} + ORDER BY score DESC + LIMIT {top_k}""" + + with self._client.engine.connect() as conn: + with conn.begin(): + from sqlalchemy import text + + result = conn.execute(text(full_sql), {"query": query}) + rows = result.fetchall() + + docs = [] + for row in rows: + metadata_str, _text, score = row + try: + metadata = json.loads(metadata_str) + except json.JSONDecodeError: + print(f"Invalid JSON metadata: {metadata_str}") + metadata = {} + metadata["score"] = score + docs.append(Document(page_content=_text, metadata=metadata)) + + return docs + except Exception as e: + logger.warning(f"Failed to fulltext search: {str(e)}.") + return [] def search_by_vector(self, query_vector: list[float], **kwargs: Any) -> list[Document]: document_ids_filter = kwargs.get("document_ids_filter") - where_clause = None + _where_clause = None if document_ids_filter: document_ids = ", ".join(f"'{id}'" for id in document_ids_filter) where_clause = f"metadata->>'$.document_id' in ({document_ids})" + from sqlalchemy import text + + _where_clause = [text(where_clause)] ef_search = kwargs.get("ef_search", self._hnsw_ef_search) if ef_search != self._hnsw_ef_search: self._client.set_ob_hnsw_ef_search(ef_search) self._hnsw_ef_search = ef_search topk = kwargs.get("top_k", 10) - cur = self._client.ann_search( - table_name=self._collection_name, - vec_column_name="vector", - vec_data=query_vector, - topk=topk, - distance_func=func.l2_distance, - output_column_names=["text", "metadata"], - with_dist=True, - where_clause=where_clause, - ) + try: + cur = self._client.ann_search( + table_name=self._collection_name, + vec_column_name="vector", + vec_data=query_vector, + topk=topk, + distance_func=func.l2_distance, + output_column_names=["text", "metadata"], + with_dist=True, + where_clause=_where_clause, + ) + except Exception as e: + raise Exception("Failed to search by vector. ", e) docs = [] - for text, metadata, distance in cur: + for _text, metadata, distance in cur: metadata = json.loads(metadata) metadata["score"] = 1 - distance / math.sqrt(2) docs.append( Document( - page_content=text, + page_content=_text, metadata=metadata, ) ) diff --git a/docker/.env.example b/docker/.env.example index 2e069db9e..4d7486ed8 100644 --- a/docker/.env.example +++ b/docker/.env.example @@ -554,6 +554,7 @@ OCEANBASE_VECTOR_PASSWORD=difyai123456 OCEANBASE_VECTOR_DATABASE=test OCEANBASE_CLUSTER_NAME=difyai OCEANBASE_MEMORY_LIMIT=6G +OCEANBASE_ENABLE_HYBRID_SEARCH=false # opengauss configurations, only available when VECTOR_STORE is `opengauss` OPENGAUSS_HOST=opengauss diff --git a/docker/docker-compose-template.yaml b/docker/docker-compose-template.yaml index 520cb948c..05db17102 100644 --- a/docker/docker-compose-template.yaml +++ b/docker/docker-compose-template.yaml @@ -373,7 +373,8 @@ services: # OceanBase vector database oceanbase: - image: quay.io/oceanbase/oceanbase-ce:4.3.3.0-100000142024101215 + image: oceanbase/oceanbase-ce:4.3.5.1-101000042025031818 + container_name: oceanbase profiles: - oceanbase restart: always @@ -386,7 +387,9 @@ services: OB_SYS_PASSWORD: ${OCEANBASE_VECTOR_PASSWORD:-difyai123456} OB_TENANT_PASSWORD: ${OCEANBASE_VECTOR_PASSWORD:-difyai123456} OB_CLUSTER_NAME: ${OCEANBASE_CLUSTER_NAME:-difyai} - OB_SERVER_IP: '127.0.0.1' + MODE: MINI + ports: + - "${OCEANBASE_VECTOR_PORT:-2881}:2881" # Oracle vector database oracle: diff --git a/docker/docker-compose.yaml b/docker/docker-compose.yaml index 4a40cb1ae..c6a90eb52 100644 --- a/docker/docker-compose.yaml +++ b/docker/docker-compose.yaml @@ -252,6 +252,7 @@ x-shared-env: &shared-api-worker-env OCEANBASE_VECTOR_DATABASE: ${OCEANBASE_VECTOR_DATABASE:-test} OCEANBASE_CLUSTER_NAME: ${OCEANBASE_CLUSTER_NAME:-difyai} OCEANBASE_MEMORY_LIMIT: ${OCEANBASE_MEMORY_LIMIT:-6G} + OCEANBASE_ENABLE_HYBRID_SEARCH: ${OCEANBASE_ENABLE_HYBRID_SEARCH:-false} OPENGAUSS_HOST: ${OPENGAUSS_HOST:-opengauss} OPENGAUSS_PORT: ${OPENGAUSS_PORT:-6600} OPENGAUSS_USER: ${OPENGAUSS_USER:-postgres} @@ -804,7 +805,8 @@ services: # OceanBase vector database oceanbase: - image: quay.io/oceanbase/oceanbase-ce:4.3.3.0-100000142024101215 + image: oceanbase/oceanbase-ce:4.3.5.1-101000042025031818 + container_name: oceanbase profiles: - oceanbase restart: always @@ -817,7 +819,9 @@ services: OB_SYS_PASSWORD: ${OCEANBASE_VECTOR_PASSWORD:-difyai123456} OB_TENANT_PASSWORD: ${OCEANBASE_VECTOR_PASSWORD:-difyai123456} OB_CLUSTER_NAME: ${OCEANBASE_CLUSTER_NAME:-difyai} - OB_SERVER_IP: '127.0.0.1' + MODE: MINI + ports: + - "${OCEANBASE_VECTOR_PORT:-2881}:2881" # Oracle vector database oracle: