fix: markdown_extractor lost chunks if it starts without a header(#21308) (#21309)

This commit is contained in:
Jin
2025-06-21 23:10:00 +08:00
committed by GitHub
parent d333aac84a
commit 3e7f8bad56
2 changed files with 27 additions and 10 deletions

View File

@@ -68,22 +68,17 @@ class MarkdownExtractor(BaseExtractor):
continue
header_match = re.match(r"^#+\s", line)
if header_match:
if current_header is not None:
markdown_tups.append((current_header, current_text))
current_header = line
current_text = ""
else:
current_text += line + "\n"
markdown_tups.append((current_header, current_text))
if current_header is not None:
# pass linting, assert keys are defined
markdown_tups = [
(re.sub(r"#", "", cast(str, key)).strip(), re.sub(r"<.*?>", "", value)) for key, value in markdown_tups
(re.sub(r"#", "", cast(str, key)).strip() if key else None, re.sub(r"<.*?>", "", value))
for key, value in markdown_tups
]
else:
markdown_tups = [(key, re.sub("\n", "", value)) for key, value in markdown_tups]
return markdown_tups

View File

@@ -0,0 +1,22 @@
from core.rag.extractor.markdown_extractor import MarkdownExtractor
def test_markdown_to_tups():
markdown = """
this is some text without header
# title 1
this is balabala text
## title 2
this is more specific text.
"""
extractor = MarkdownExtractor(file_path="dummy_path")
updated_output = extractor.markdown_to_tups(markdown)
assert len(updated_output) == 3
key, header_value = updated_output[0]
assert key == None
assert header_value.strip() == "this is some text without header"
title_1, value = updated_output[1]
assert title_1.strip() == "title 1"
assert value.strip() == "this is balabala text"