fix: drop dead code phase2 unused class (#22042)

Signed-off-by: yihong0618 <zouzou0208@gmail.com>
2025-07-17 09:33:07 +08:00
parent 3587bd4040
commit d2933c2bfe
23 changed files with 1 additions and 576 deletions
--- a/api/core/rag/extractor/blob/blob.py
+++ b/api/core/rag/extractor/blob/blob.py
@@ -9,8 +9,7 @@ from __future__ import annotations

 import contextlib
 import mimetypes
-from abc import ABC, abstractmethod
-from collections.abc import Generator, Iterable, Mapping
+from collections.abc import Generator, Mapping
 from io import BufferedReader, BytesIO
 from pathlib import Path, PurePath
 from typing import Any, Optional, Union
@@ -143,21 +142,3 @@ class Blob(BaseModel):
        if self.source:
            str_repr += f" {self.source}"
        return str_repr
-
-
-class BlobLoader(ABC):
-    """Abstract interface for blob loaders implementation.
-
-    Implementer should be able to load raw content from a datasource system according
-    to some criteria and return the raw content lazily as a stream of blobs.
-    """
-
-    @abstractmethod
-    def yield_blobs(
-        self,
-    ) -> Iterable[Blob]:
-        """A lazy loader for raw data represented by Blob object.
-
-        Returns:
-            A generator over blobs
-        """
--- a/api/core/rag/extractor/unstructured/unstructured_pdf_extractor.py
+++ b/api/core/rag/extractor/unstructured/unstructured_pdf_extractor.py
@@ -1,47 +0,0 @@
-import logging
-
-from core.rag.extractor.extractor_base import BaseExtractor
-from core.rag.models.document import Document
-
-logger = logging.getLogger(__name__)
-
-
-class UnstructuredPDFExtractor(BaseExtractor):
-    """Load pdf files.
-
-
-    Args:
-        file_path: Path to the file to load.
-
-        api_url: Unstructured API URL
-
-        api_key: Unstructured API Key
-    """
-
-    def __init__(self, file_path: str, api_url: str, api_key: str):
-        """Initialize with file path."""
-        self._file_path = file_path
-        self._api_url = api_url
-        self._api_key = api_key
-
-    def extract(self) -> list[Document]:
-        if self._api_url:
-            from unstructured.partition.api import partition_via_api
-
-            elements = partition_via_api(
-                filename=self._file_path, api_url=self._api_url, api_key=self._api_key, strategy="auto"
-            )
-        else:
-            from unstructured.partition.pdf import partition_pdf
-
-            elements = partition_pdf(filename=self._file_path, strategy="auto")
-
-        from unstructured.chunking.title import chunk_by_title
-
-        chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=2000)
-        documents = []
-        for chunk in chunks:
-            text = chunk.text.strip()
-            documents.append(Document(page_content=text))
-
-        return documents
--- a/api/core/rag/extractor/unstructured/unstructured_text_extractor.py
+++ b/api/core/rag/extractor/unstructured/unstructured_text_extractor.py
@@ -1,34 +0,0 @@
-import logging
-
-from core.rag.extractor.extractor_base import BaseExtractor
-from core.rag.models.document import Document
-
-logger = logging.getLogger(__name__)
-
-
-class UnstructuredTextExtractor(BaseExtractor):
-    """Load msg files.
-
-
-    Args:
-        file_path: Path to the file to load.
-    """
-
-    def __init__(self, file_path: str, api_url: str):
-        """Initialize with file path."""
-        self._file_path = file_path
-        self._api_url = api_url
-
-    def extract(self) -> list[Document]:
-        from unstructured.partition.text import partition_text
-
-        elements = partition_text(filename=self._file_path)
-        from unstructured.chunking.title import chunk_by_title
-
-        chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=2000)
-        documents = []
-        for chunk in chunks:
-            text = chunk.text.strip()
-            documents.append(Document(page_content=text))
-
-        return documents