feat: optimize split rule when use custom split segment identifier (#35)

This commit is contained in:
John Wang
2023-05-16 12:57:25 +08:00
committed by GitHub
parent 3117619ef3
commit 815f794eef
2 changed files with 73 additions and 6 deletions

View File

@@ -18,6 +18,7 @@ from core.docstore.dataset_docstore import DatesetDocumentStore
from core.index.keyword_table_index import KeywordTableIndex
from core.index.readers.html_parser import HTMLParser
from core.index.readers.pdf_parser import PDFParser
from core.index.spiltter.fixed_text_splitter import FixedRecursiveCharacterTextSplitter
from core.index.vector_index import VectorIndex
from core.llm.token_calculator import TokenCalculator
from extensions.ext_database import db
@@ -267,16 +268,14 @@ class IndexingRunner:
raise ValueError("Custom segment length should be between 50 and 1000.")
separator = segmentation["separator"]
if not separator:
separators = ["\n\n", "", ".", " ", ""]
else:
if separator:
separator = separator.replace('\\n', '\n')
separators = [separator, ""]
character_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
character_splitter = FixedRecursiveCharacterTextSplitter.from_tiktoken_encoder(
chunk_size=segmentation["max_tokens"],
chunk_overlap=0,
separators=separators
fixed_separator=separator,
separators=["\n\n", "", ".", " ", ""]
)
else:
# Automatic segmentation