fix: markdown_extractor loses chunks when the document starts without a header (#21308) (#21309)

2025-06-21 23:10:00 +08:00
parent d333aac84a
commit 3e7f8bad56
2 changed files with 27 additions and 10 deletions
--- a/api/core/rag/extractor/markdown_extractor.py
+++ b/api/core/rag/extractor/markdown_extractor.py
@@ -68,22 +68,17 @@ class MarkdownExtractor(BaseExtractor):
                continue
            header_match = re.match(r"^#+\s", line)
            if header_match:
-                if current_header is not None:
-                    markdown_tups.append((current_header, current_text))
-
+                markdown_tups.append((current_header, current_text))
                current_header = line
                current_text = ""
            else:
                current_text += line + "\n"
        markdown_tups.append((current_header, current_text))

-        if current_header is not None:
-            # pass linting, assert keys are defined
-            markdown_tups = [
-                (re.sub(r"#", "", cast(str, key)).strip(), re.sub(r"<.*?>", "", value)) for key, value in markdown_tups
-            ]
-        else:
-            markdown_tups = [(key, re.sub("\n", "", value)) for key, value in markdown_tups]
+        markdown_tups = [
+            (re.sub(r"#", "", cast(str, key)).strip() if key else None, re.sub(r"<.*?>", "", value))
+            for key, value in markdown_tups
+        ]

        return markdown_tups