Feat: chunk overlap supported (#2209)

Co-authored-by: jyong <jyong@dify.ai>
This commit is contained in:
KVOJJJin
2024-01-26 13:24:40 +08:00
committed by GitHub
parent 3322710dac
commit 89fcf4ea7c
9 changed files with 53 additions and 8 deletions

View File

@@ -562,7 +562,7 @@ class IndexingRunner:
character_splitter = FixedRecursiveCharacterTextSplitter.from_encoder(
chunk_size=segmentation["max_tokens"],
-chunk_overlap=0,
+chunk_overlap=segmentation.get('chunk_overlap', 0),
fixed_separator=separator,
separators=["\n\n", "", ".", " ", ""],
embedding_model_instance=embedding_model_instance
@@ -571,7 +571,7 @@ class IndexingRunner:
# Automatic segmentation
character_splitter = EnhanceRecursiveCharacterTextSplitter.from_encoder(
chunk_size=DatasetProcessRule.AUTOMATIC_RULES['segmentation']['max_tokens'],
-chunk_overlap=0,
+chunk_overlap=DatasetProcessRule.AUTOMATIC_RULES['segmentation']['chunk_overlap'],
separators=["\n\n", "", ".", " ", ""],
embedding_model_instance=embedding_model_instance
)