From 5f11fe521dfc2e8d94990d5067cefd08fa1f51bc Mon Sep 17 00:00:00 2001 From: Jyong <76649700+JohnJyong@users.noreply.github.com> Date: Thu, 24 Oct 2024 18:13:05 +0800 Subject: [PATCH] remove unstructured pdf extract (#9794) --- api/core/rag/extractor/extract_processor.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/api/core/rag/extractor/extract_processor.py b/api/core/rag/extractor/extract_processor.py index 603f7555d..6a23f3cfe 100644 --- a/api/core/rag/extractor/extract_processor.py +++ b/api/core/rag/extractor/extract_processor.py @@ -21,7 +21,6 @@ from core.rag.extractor.unstructured.unstructured_eml_extractor import Unstructu from core.rag.extractor.unstructured.unstructured_epub_extractor import UnstructuredEpubExtractor from core.rag.extractor.unstructured.unstructured_markdown_extractor import UnstructuredMarkdownExtractor from core.rag.extractor.unstructured.unstructured_msg_extractor import UnstructuredMsgExtractor -from core.rag.extractor.unstructured.unstructured_pdf_extractor import UnstructuredPDFExtractor from core.rag.extractor.unstructured.unstructured_ppt_extractor import UnstructuredPPTExtractor from core.rag.extractor.unstructured.unstructured_pptx_extractor import UnstructuredPPTXExtractor from core.rag.extractor.unstructured.unstructured_text_extractor import UnstructuredTextExtractor @@ -103,7 +102,7 @@ class ExtractProcessor: if file_extension in {".xlsx", ".xls"}: extractor = ExcelExtractor(file_path) elif file_extension == ".pdf": - extractor = UnstructuredPDFExtractor(file_path, unstructured_api_url, unstructured_api_key) + extractor = PdfExtractor(file_path) elif file_extension in {".md", ".markdown"}: extractor = ( UnstructuredMarkdownExtractor(file_path, unstructured_api_url, unstructured_api_key)