feat:api Add support for extracting EPUB files in ExtractProcessor (#3254)

Co-authored-by: crazywoola <427733928@qq.com>
This commit is contained in:
LiuVaayne
2024-04-12 11:25:02 +08:00
committed by GitHub
parent 44448ba68d
commit b00466f025
4 changed files with 44 additions and 2 deletions

View File

@@ -0,0 +1,37 @@
import logging
from core.rag.extractor.extractor_base import BaseExtractor
from core.rag.models.document import Document
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
class UnstructuredEpubExtractor(BaseExtractor):
    """Extract the text of an EPUB file as a list of ``Document`` objects.

    The file is partitioned with ``unstructured``, the resulting elements are
    grouped into title-based chunks, and each chunk becomes one document.

    Args:
        file_path: Path to the file to load.
        api_url: Optional API URL. It is stored on the instance but is not
            read by ``extract`` in this class.
    """

    def __init__(self, file_path: str, api_url: str = None):
        """Remember the file path and API URL for later extraction."""
        self._api_url = api_url
        self._file_path = file_path

    def extract(self) -> list[Document]:
        """Partition the EPUB, chunk it by title, and wrap each chunk.

        Returns:
            One ``Document`` per chunk, whose ``page_content`` is the chunk
            text with leading and trailing whitespace stripped.
        """
        # Function-scope import: `unstructured` is loaded only when
        # extraction actually runs.
        from unstructured.partition.epub import partition_epub

        parsed_elements = partition_epub(
            filename=self._file_path,
            xml_keep_tags=True,
        )

        from unstructured.chunking.title import chunk_by_title

        return [
            Document(page_content=section.text.strip())
            for section in chunk_by_title(
                parsed_elements,
                max_characters=2000,
                combine_text_under_n_chars=2000,
            )
        ]