fix: drop dead code phase2 unused class (#22042)

Signed-off-by: yihong0618 <zouzou0208@gmail.com>
2025-07-17 09:33:07 +08:00
parent 3587bd4040
commit d2933c2bfe
23 changed files with 1 additions and 576 deletions
--- a/api/core/rag/splitter/text_splitter.py
+++ b/api/core/rag/splitter/text_splitter.py
@@ -10,7 +10,6 @@ from typing import (
    Any,
    Literal,
    Optional,
-    TypedDict,
    TypeVar,
    Union,
 )
@@ -168,167 +167,6 @@ class TextSplitter(BaseDocumentTransformer, ABC):
        raise NotImplementedError


-class CharacterTextSplitter(TextSplitter):
-    """Splitting text that looks at characters."""
-
-    def __init__(self, separator: str = "\n\n", **kwargs: Any) -> None:
-        """Create a new TextSplitter."""
-        super().__init__(**kwargs)
-        self._separator = separator
-
-    def split_text(self, text: str) -> list[str]:
-        """Split incoming text and return chunks."""
-        # First we naively split the large input into a bunch of smaller ones.
-        splits = _split_text_with_regex(text, self._separator, self._keep_separator)
-        _separator = "" if self._keep_separator else self._separator
-        _good_splits_lengths = []  # cache the lengths of the splits
-        if splits:
-            _good_splits_lengths.extend(self._length_function(splits))
-        return self._merge_splits(splits, _separator, _good_splits_lengths)
-
-
-class LineType(TypedDict):
-    """Line type as typed dict."""
-
-    metadata: dict[str, str]
-    content: str
-
-
-class HeaderType(TypedDict):
-    """Header type as typed dict."""
-
-    level: int
-    name: str
-    data: str
-
-
-class MarkdownHeaderTextSplitter:
-    """Splitting markdown files based on specified headers."""
-
-    def __init__(self, headers_to_split_on: list[tuple[str, str]], return_each_line: bool = False):
-        """Create a new MarkdownHeaderTextSplitter.
-
-        Args:
-            headers_to_split_on: Headers we want to track
-            return_each_line: Return each line w/ associated headers
-        """
-        # Output line-by-line or aggregated into chunks w/ common headers
-        self.return_each_line = return_each_line
-        # Given the headers we want to split on,
-        # (e.g., "#, ##, etc") order by length
-        self.headers_to_split_on = sorted(headers_to_split_on, key=lambda split: len(split[0]), reverse=True)
-
-    def aggregate_lines_to_chunks(self, lines: list[LineType]) -> list[Document]:
-        """Combine lines with common metadata into chunks
-        Args:
-            lines: Line of text / associated header metadata
-        """
-        aggregated_chunks: list[LineType] = []
-
-        for line in lines:
-            if aggregated_chunks and aggregated_chunks[-1]["metadata"] == line["metadata"]:
-                # If the last line in the aggregated list
-                # has the same metadata as the current line,
-                # append the current content to the last lines's content
-                aggregated_chunks[-1]["content"] += "  \n" + line["content"]
-            else:
-                # Otherwise, append the current line to the aggregated list
-                aggregated_chunks.append(line)
-
-        return [Document(page_content=chunk["content"], metadata=chunk["metadata"]) for chunk in aggregated_chunks]
-
-    def split_text(self, text: str) -> list[Document]:
-        """Split markdown file
-        Args:
-            text: Markdown file"""
-
-        # Split the input text by newline character ("\n").
-        lines = text.split("\n")
-        # Final output
-        lines_with_metadata: list[LineType] = []
-        # Content and metadata of the chunk currently being processed
-        current_content: list[str] = []
-        current_metadata: dict[str, str] = {}
-        # Keep track of the nested header structure
-        # header_stack: List[Dict[str, Union[int, str]]] = []
-        header_stack: list[HeaderType] = []
-        initial_metadata: dict[str, str] = {}
-
-        for line in lines:
-            stripped_line = line.strip()
-            # Check each line against each of the header types (e.g., #, ##)
-            for sep, name in self.headers_to_split_on:
-                # Check if line starts with a header that we intend to split on
-                if stripped_line.startswith(sep) and (
-                    # Header with no text OR header is followed by space
-                    # Both are valid conditions that sep is being used a header
-                    len(stripped_line) == len(sep) or stripped_line[len(sep)] == " "
-                ):
-                    # Ensure we are tracking the header as metadata
-                    if name is not None:
-                        # Get the current header level
-                        current_header_level = sep.count("#")
-
-                        # Pop out headers of lower or same level from the stack
-                        while header_stack and header_stack[-1]["level"] >= current_header_level:
-                            # We have encountered a new header
-                            # at the same or higher level
-                            popped_header = header_stack.pop()
-                            # Clear the metadata for the
-                            # popped header in initial_metadata
-                            if popped_header["name"] in initial_metadata:
-                                initial_metadata.pop(popped_header["name"])
-
-                        # Push the current header to the stack
-                        header: HeaderType = {
-                            "level": current_header_level,
-                            "name": name,
-                            "data": stripped_line[len(sep) :].strip(),
-                        }
-                        header_stack.append(header)
-                        # Update initial_metadata with the current header
-                        initial_metadata[name] = header["data"]
-
-                    # Add the previous line to the lines_with_metadata
-                    # only if current_content is not empty
-                    if current_content:
-                        lines_with_metadata.append(
-                            {
-                                "content": "\n".join(current_content),
-                                "metadata": current_metadata.copy(),
-                            }
-                        )
-                        current_content.clear()
-
-                    break
-            else:
-                if stripped_line:
-                    current_content.append(stripped_line)
-                elif current_content:
-                    lines_with_metadata.append(
-                        {
-                            "content": "\n".join(current_content),
-                            "metadata": current_metadata.copy(),
-                        }
-                    )
-                    current_content.clear()
-
-            current_metadata = initial_metadata.copy()
-
-        if current_content:
-            lines_with_metadata.append({"content": "\n".join(current_content), "metadata": current_metadata})
-
-        # lines_with_metadata has each line with associated header metadata
-        # aggregate these into chunks based on common metadata
-        if not self.return_each_line:
-            return self.aggregate_lines_to_chunks(lines_with_metadata)
-        else:
-            return [
-                Document(page_content=chunk["content"], metadata=chunk["metadata"]) for chunk in lines_with_metadata
-            ]
-
-
-# should be in newer Python versions (3.10+)
 # @dataclass(frozen=True, kw_only=True, slots=True)
@dataclass(frozen=True)
 class Tokenizer: