Support extracting images and tables from docx files (#4619)

This commit is contained in:
Jyong
2024-05-23 18:05:23 +08:00
committed by GitHub
parent 5893ebec55
commit 233c4150d1
10 changed files with 163 additions and 23 deletions

View File

@@ -57,7 +57,7 @@ class BaseIndexProcessor(ABC):
chunk_size=segmentation["max_tokens"],
chunk_overlap=segmentation.get('chunk_overlap', 0),
fixed_separator=separator,
separators=["\n\n", "", ".", " ", ""],
separators=["\n\n", "", ". ", " ", ""],
embedding_model_instance=embedding_model_instance
)
else:
@@ -65,7 +65,7 @@ class BaseIndexProcessor(ABC):
character_splitter = EnhanceRecursiveCharacterTextSplitter.from_encoder(
chunk_size=DatasetProcessRule.AUTOMATIC_RULES['segmentation']['max_tokens'],
chunk_overlap=DatasetProcessRule.AUTOMATIC_RULES['segmentation']['chunk_overlap'],
separators=["\n\n", "", ".", " ", ""],
separators=["\n\n", "", ". ", " ", ""],
embedding_model_instance=embedding_model_instance
)