Support extracting images and tables from docx files (#4619)

This commit is contained in:
Jyong
2024-05-23 18:05:23 +08:00
committed by GitHub
parent 5893ebec55
commit 233c4150d1
10 changed files with 163 additions and 23 deletions

View File

@@ -428,7 +428,7 @@ class IndexingRunner:
chunk_size=segmentation["max_tokens"],
chunk_overlap=chunk_overlap,
fixed_separator=separator,
separators=["\n\n", "", ".", " ", ""],
separators=["\n\n", "", ". ", " ", ""],
embedding_model_instance=embedding_model_instance
)
else:
@@ -436,7 +436,7 @@ class IndexingRunner:
character_splitter = EnhanceRecursiveCharacterTextSplitter.from_encoder(
chunk_size=DatasetProcessRule.AUTOMATIC_RULES['segmentation']['max_tokens'],
chunk_overlap=DatasetProcessRule.AUTOMATIC_RULES['segmentation']['chunk_overlap'],
separators=["\n\n", "", ".", " ", ""],
separators=["\n\n", "", ". ", " ", ""],
embedding_model_instance=embedding_model_instance
)