fix: drop dead code phase2 unused class (#22042)

Signed-off-by: yihong0618 <zouzou0208@gmail.com>
Author: yihong
Date: 2025-07-17 09:33:07 +08:00
Committed by: GitHub
Parent: 3587bd4040
Commit: d2933c2bfe

23 changed files with 1 addition and 576 deletions

View File

@@ -1,12 +0,0 @@
"""Abstract interface for document clean implementations."""
from core.rag.cleaner.cleaner_base import BaseCleaner
class UnstructuredNonAsciiCharsCleaner(BaseCleaner):
def clean(self, content) -> str:
"""clean document content."""
from unstructured.cleaners.core import clean_extra_whitespace
# Returns "ITEM 1A: RISK FACTORS"
return clean_extra_whitespace(content)
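For reference, the cleaners removed in this commit are one-line wrappers over unstructured's cleaning helpers. A minimal standalone sketch of the first one's call, assuming the unstructured package is installed (the sample string is illustrative only):

from unstructured.cleaners.core import clean_extra_whitespace

# Collapse runs of spaces and newlines into single spaces.
sample = "ITEM 1A:     RISK FACTORS\n\n"
print(clean_extra_whitespace(sample))  # "ITEM 1A: RISK FACTORS"

The other removed cleaners follow the same pattern around group_broken_paragraphs, clean_non_ascii_chars, replace_unicode_quotes, and translate_text.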

View File

@@ -1,15 +0,0 @@
"""Abstract interface for document clean implementations."""
from core.rag.cleaner.cleaner_base import BaseCleaner
class UnstructuredGroupBrokenParagraphsCleaner(BaseCleaner):
def clean(self, content) -> str:
"""clean document content."""
import re
from unstructured.cleaners.core import group_broken_paragraphs
para_split_re = re.compile(r"(\s*\n\s*){3}")
return group_broken_paragraphs(content, paragraph_split=para_split_re)

View File

@@ -1,12 +0,0 @@
"""Abstract interface for document clean implementations."""
from core.rag.cleaner.cleaner_base import BaseCleaner
class UnstructuredNonAsciiCharsCleaner(BaseCleaner):
def clean(self, content) -> str:
"""clean document content."""
from unstructured.cleaners.core import clean_non_ascii_chars
# Returns "This text contains non-ascii characters!"
return clean_non_ascii_chars(content)

View File

@@ -1,12 +0,0 @@
"""Abstract interface for document clean implementations."""
from core.rag.cleaner.cleaner_base import BaseCleaner
class UnstructuredNonAsciiCharsCleaner(BaseCleaner):
def clean(self, content) -> str:
"""Replaces unicode quote characters, such as the \x91 character in a string."""
from unstructured.cleaners.core import replace_unicode_quotes
return replace_unicode_quotes(content)

View File

@@ -1,11 +0,0 @@
"""Abstract interface for document clean implementations."""
from core.rag.cleaner.cleaner_base import BaseCleaner
class UnstructuredTranslateTextCleaner(BaseCleaner):
def clean(self, content) -> str:
"""clean document content."""
from unstructured.cleaners.translate import translate_text
return translate_text(content)

View File

@@ -1,17 +0,0 @@
from typing import Optional

from pydantic import BaseModel


class ClusterEntity(BaseModel):
    """
    Model Config Entity.
    """

    name: str
    cluster_id: str
    displayName: str
    region: str
    spendingLimit: Optional[int] = 1000
    version: str
    createdBy: str
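The removed ClusterEntity was a plain Pydantic model with no behavior. A minimal instantiation sketch (all field values below are hypothetical):

entity = ClusterEntity(
    name="demo",
    cluster_id="c-123",
    displayName="Demo Cluster",
    region="us-east-1",
    version="1.0.0",
    createdBy="admin",
)
# spendingLimit is optional and falls back to its default of 1000.
print(entity)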

View File

@@ -9,8 +9,7 @@ from __future__ import annotations
import contextlib
import mimetypes
from abc import ABC, abstractmethod
-from collections.abc import Generator, Iterable, Mapping
+from collections.abc import Generator, Mapping
from io import BufferedReader, BytesIO
from pathlib import Path, PurePath
from typing import Any, Optional, Union
@@ -143,21 +142,3 @@ class Blob(BaseModel):
        if self.source:
            str_repr += f" {self.source}"
        return str_repr


class BlobLoader(ABC):
    """Abstract interface for blob loaders implementation.

    Implementer should be able to load raw content from a datasource system according
    to some criteria and return the raw content lazily as a stream of blobs.
    """

    @abstractmethod
    def yield_blobs(
        self,
    ) -> Iterable[Blob]:
        """A lazy loader for raw data represented by Blob object.

        Returns:
            A generator over blobs
        """

View File

@@ -1,47 +0,0 @@
import logging

from core.rag.extractor.extractor_base import BaseExtractor
from core.rag.models.document import Document

logger = logging.getLogger(__name__)


class UnstructuredPDFExtractor(BaseExtractor):
    """Load pdf files.

    Args:
        file_path: Path to the file to load.
        api_url: Unstructured API URL
        api_key: Unstructured API Key
    """

    def __init__(self, file_path: str, api_url: str, api_key: str):
        """Initialize with file path."""
        self._file_path = file_path
        self._api_url = api_url
        self._api_key = api_key

    def extract(self) -> list[Document]:
        if self._api_url:
            from unstructured.partition.api import partition_via_api

            elements = partition_via_api(
                filename=self._file_path, api_url=self._api_url, api_key=self._api_key, strategy="auto"
            )
        else:
            from unstructured.partition.pdf import partition_pdf

            elements = partition_pdf(filename=self._file_path, strategy="auto")

        from unstructured.chunking.title import chunk_by_title

        chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=2000)
        documents = []
        for chunk in chunks:
            text = chunk.text.strip()
            documents.append(Document(page_content=text))
        return documents
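A minimal sketch of how the removed extractor was driven; the file path is a placeholder and the unstructured package with PDF extras is assumed (an empty api_url takes the local partition_pdf branch):

extractor = UnstructuredPDFExtractor(
    file_path="/tmp/report.pdf",  # placeholder path
    api_url="",                   # empty -> local partition_pdf branch
    api_key="",
)
for doc in extractor.extract():
    print(doc.page_content[:80])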

View File

@@ -1,34 +0,0 @@
import logging

from core.rag.extractor.extractor_base import BaseExtractor
from core.rag.models.document import Document

logger = logging.getLogger(__name__)


class UnstructuredTextExtractor(BaseExtractor):
    """Load msg files.

    Args:
        file_path: Path to the file to load.
    """

    def __init__(self, file_path: str, api_url: str):
        """Initialize with file path."""
        self._file_path = file_path
        self._api_url = api_url

    def extract(self) -> list[Document]:
        from unstructured.partition.text import partition_text

        elements = partition_text(filename=self._file_path)
        from unstructured.chunking.title import chunk_by_title

        chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=2000)
        documents = []
        for chunk in chunks:
            text = chunk.text.strip()
            documents.append(Document(page_content=text))
        return documents
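The extractor above is a thin wrapper over unstructured's partition_text plus chunk_by_title, so the same pipeline can be reproduced directly, assuming the unstructured package is installed (the file path is hypothetical):

from unstructured.chunking.title import chunk_by_title
from unstructured.partition.text import partition_text

elements = partition_text(filename="/tmp/notes.txt")  # hypothetical file
chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=2000)
texts = [chunk.text.strip() for chunk in chunks]
print(texts[:1])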

View File

@@ -10,7 +10,6 @@ from typing import (
    Any,
    Literal,
    Optional,
    TypedDict,
    TypeVar,
    Union,
)
@@ -168,167 +167,6 @@ class TextSplitter(BaseDocumentTransformer, ABC):
        raise NotImplementedError


class CharacterTextSplitter(TextSplitter):
    """Splitting text that looks at characters."""

    def __init__(self, separator: str = "\n\n", **kwargs: Any) -> None:
        """Create a new TextSplitter."""
        super().__init__(**kwargs)
        self._separator = separator

    def split_text(self, text: str) -> list[str]:
        """Split incoming text and return chunks."""
        # First we naively split the large input into a bunch of smaller ones.
        splits = _split_text_with_regex(text, self._separator, self._keep_separator)
        _separator = "" if self._keep_separator else self._separator
        _good_splits_lengths = []  # cache the lengths of the splits
        if splits:
            _good_splits_lengths.extend(self._length_function(splits))
        return self._merge_splits(splits, _separator, _good_splits_lengths)


class LineType(TypedDict):
    """Line type as typed dict."""

    metadata: dict[str, str]
    content: str


class HeaderType(TypedDict):
    """Header type as typed dict."""

    level: int
    name: str
    data: str


class MarkdownHeaderTextSplitter:
    """Splitting markdown files based on specified headers."""

    def __init__(self, headers_to_split_on: list[tuple[str, str]], return_each_line: bool = False):
        """Create a new MarkdownHeaderTextSplitter.

        Args:
            headers_to_split_on: Headers we want to track
            return_each_line: Return each line w/ associated headers
        """
        # Output line-by-line or aggregated into chunks w/ common headers
        self.return_each_line = return_each_line
        # Given the headers we want to split on,
        # (e.g., "#, ##, etc") order by length
        self.headers_to_split_on = sorted(headers_to_split_on, key=lambda split: len(split[0]), reverse=True)

    def aggregate_lines_to_chunks(self, lines: list[LineType]) -> list[Document]:
        """Combine lines with common metadata into chunks

        Args:
            lines: Line of text / associated header metadata
        """
        aggregated_chunks: list[LineType] = []
        for line in lines:
            if aggregated_chunks and aggregated_chunks[-1]["metadata"] == line["metadata"]:
                # If the last line in the aggregated list
                # has the same metadata as the current line,
                # append the current content to the last lines's content
                aggregated_chunks[-1]["content"] += " \n" + line["content"]
            else:
                # Otherwise, append the current line to the aggregated list
                aggregated_chunks.append(line)
        return [Document(page_content=chunk["content"], metadata=chunk["metadata"]) for chunk in aggregated_chunks]

    def split_text(self, text: str) -> list[Document]:
        """Split markdown file

        Args:
            text: Markdown file"""
        # Split the input text by newline character ("\n").
        lines = text.split("\n")
        # Final output
        lines_with_metadata: list[LineType] = []
        # Content and metadata of the chunk currently being processed
        current_content: list[str] = []
        current_metadata: dict[str, str] = {}
        # Keep track of the nested header structure
        # header_stack: List[Dict[str, Union[int, str]]] = []
        header_stack: list[HeaderType] = []
        initial_metadata: dict[str, str] = {}

        for line in lines:
            stripped_line = line.strip()
            # Check each line against each of the header types (e.g., #, ##)
            for sep, name in self.headers_to_split_on:
                # Check if line starts with a header that we intend to split on
                if stripped_line.startswith(sep) and (
                    # Header with no text OR header is followed by space
                    # Both are valid conditions that sep is being used a header
                    len(stripped_line) == len(sep) or stripped_line[len(sep)] == " "
                ):
                    # Ensure we are tracking the header as metadata
                    if name is not None:
                        # Get the current header level
                        current_header_level = sep.count("#")
                        # Pop out headers of lower or same level from the stack
                        while header_stack and header_stack[-1]["level"] >= current_header_level:
                            # We have encountered a new header
                            # at the same or higher level
                            popped_header = header_stack.pop()
                            # Clear the metadata for the
                            # popped header in initial_metadata
                            if popped_header["name"] in initial_metadata:
                                initial_metadata.pop(popped_header["name"])
                        # Push the current header to the stack
                        header: HeaderType = {
                            "level": current_header_level,
                            "name": name,
                            "data": stripped_line[len(sep) :].strip(),
                        }
                        header_stack.append(header)
                        # Update initial_metadata with the current header
                        initial_metadata[name] = header["data"]
                    # Add the previous line to the lines_with_metadata
                    # only if current_content is not empty
                    if current_content:
                        lines_with_metadata.append(
                            {
                                "content": "\n".join(current_content),
                                "metadata": current_metadata.copy(),
                            }
                        )
                        current_content.clear()
                    break
            else:
                if stripped_line:
                    current_content.append(stripped_line)
                elif current_content:
                    lines_with_metadata.append(
                        {
                            "content": "\n".join(current_content),
                            "metadata": current_metadata.copy(),
                        }
                    )
                    current_content.clear()

            current_metadata = initial_metadata.copy()

        if current_content:
            lines_with_metadata.append({"content": "\n".join(current_content), "metadata": current_metadata})

        # lines_with_metadata has each line with associated header metadata
        # aggregate these into chunks based on common metadata
        if not self.return_each_line:
            return self.aggregate_lines_to_chunks(lines_with_metadata)
        else:
            return [
                Document(page_content=chunk["content"], metadata=chunk["metadata"]) for chunk in lines_with_metadata
            ]


# should be in newer Python versions (3.10+)
# @dataclass(frozen=True, kw_only=True, slots=True)
@dataclass(frozen=True)
class Tokenizer:
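For reference, a minimal sketch of how the removed MarkdownHeaderTextSplitter was used before this cleanup; the header mapping and markdown sample below are illustrative only:

headers_to_split_on = [("#", "Header 1"), ("##", "Header 2")]
splitter = MarkdownHeaderTextSplitter(headers_to_split_on)

markdown = "# Intro\nSome text.\n## Details\nMore text."
for doc in splitter.split_text(markdown):
    print(doc.metadata, doc.page_content)
# {'Header 1': 'Intro'} Some text.
# {'Header 1': 'Intro', 'Header 2': 'Details'} More text.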