feat: support xlsx file parsing (#304)

Co-authored-by: crazywoola <100913391+crazywoola@users.noreply.github.com>
This commit is contained in:
lisaifei@cvte.com
2023-06-09 15:57:19 +08:00
committed by GitHub
parent bbe58327c8
commit 0abd67288b
4 changed files with 41 additions and 2 deletions

View File

@@ -12,6 +12,8 @@ from llama_index.data_structs import Node
from llama_index.data_structs.node_v2 import DocumentRelationship
from llama_index.node_parser import SimpleNodeParser, NodeParser
from llama_index.readers.file.base import DEFAULT_FILE_EXTRACTOR
from llama_index.readers.file.markdown_parser import MarkdownParser
from core.index.readers.xlsx_parser import XLSXParser
from core.docstore.dataset_docstore import DatesetDocumentStore
from core.index.keyword_table_index import KeywordTableIndex
from core.index.readers.html_parser import HTMLParser
@@ -250,6 +252,7 @@ class IndexingRunner:
file_extractor[".html"] = HTMLParser()
file_extractor[".htm"] = HTMLParser()
file_extractor[".pdf"] = PDFParser({'upload_file': upload_file})
file_extractor[".xlsx"] = XLSXParser()
loader = SimpleDirectoryReader(input_files=[filepath], file_extractor=file_extractor)
text_docs = loader.load_data()