fix: drop dead code phase2 unused class (#22042)

Signed-off-by: yihong0618 <zouzou0208@gmail.com>
This commit is contained in:
yihong
2025-07-17 09:33:07 +08:00
committed by GitHub
parent 3587bd4040
commit d2933c2bfe
23 changed files with 1 additions and 576 deletions

View File

@@ -9,8 +9,7 @@ from __future__ import annotations
import contextlib
import mimetypes
from abc import ABC, abstractmethod
from collections.abc import Generator, Iterable, Mapping
from collections.abc import Generator, Mapping
from io import BufferedReader, BytesIO
from pathlib import Path, PurePath
from typing import Any, Optional, Union
@@ -143,21 +142,3 @@ class Blob(BaseModel):
if self.source:
str_repr += f" {self.source}"
return str_repr
class BlobLoader(ABC):
"""Abstract interface for blob loaders implementation.
Implementer should be able to load raw content from a datasource system according
to some criteria and return the raw content lazily as a stream of blobs.
"""
@abstractmethod
def yield_blobs(
self,
) -> Iterable[Blob]:
"""A lazy loader for raw data represented by Blob object.
Returns:
A generator over blobs
"""

View File

@@ -1,47 +0,0 @@
import logging
from core.rag.extractor.extractor_base import BaseExtractor
from core.rag.models.document import Document
logger = logging.getLogger(__name__)
class UnstructuredPDFExtractor(BaseExtractor):
"""Load pdf files.
Args:
file_path: Path to the file to load.
api_url: Unstructured API URL
api_key: Unstructured API Key
"""
def __init__(self, file_path: str, api_url: str, api_key: str):
"""Initialize with file path."""
self._file_path = file_path
self._api_url = api_url
self._api_key = api_key
def extract(self) -> list[Document]:
if self._api_url:
from unstructured.partition.api import partition_via_api
elements = partition_via_api(
filename=self._file_path, api_url=self._api_url, api_key=self._api_key, strategy="auto"
)
else:
from unstructured.partition.pdf import partition_pdf
elements = partition_pdf(filename=self._file_path, strategy="auto")
from unstructured.chunking.title import chunk_by_title
chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=2000)
documents = []
for chunk in chunks:
text = chunk.text.strip()
documents.append(Document(page_content=text))
return documents

View File

@@ -1,34 +0,0 @@
import logging
from core.rag.extractor.extractor_base import BaseExtractor
from core.rag.models.document import Document
logger = logging.getLogger(__name__)
class UnstructuredTextExtractor(BaseExtractor):
"""Load msg files.
Args:
file_path: Path to the file to load.
"""
def __init__(self, file_path: str, api_url: str):
"""Initialize with file path."""
self._file_path = file_path
self._api_url = api_url
def extract(self) -> list[Document]:
from unstructured.partition.text import partition_text
elements = partition_text(filename=self._file_path)
from unstructured.chunking.title import chunk_by_title
chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=2000)
documents = []
for chunk in chunks:
text = chunk.text.strip()
documents.append(Document(page_content=text))
return documents