This commit is contained in:
@@ -68,22 +68,17 @@ class MarkdownExtractor(BaseExtractor):
|
||||
continue
|
||||
header_match = re.match(r"^#+\s", line)
|
||||
if header_match:
|
||||
if current_header is not None:
|
||||
markdown_tups.append((current_header, current_text))
|
||||
|
||||
markdown_tups.append((current_header, current_text))
|
||||
current_header = line
|
||||
current_text = ""
|
||||
else:
|
||||
current_text += line + "\n"
|
||||
markdown_tups.append((current_header, current_text))
|
||||
|
||||
if current_header is not None:
|
||||
# pass linting, assert keys are defined
|
||||
markdown_tups = [
|
||||
(re.sub(r"#", "", cast(str, key)).strip(), re.sub(r"<.*?>", "", value)) for key, value in markdown_tups
|
||||
]
|
||||
else:
|
||||
markdown_tups = [(key, re.sub("\n", "", value)) for key, value in markdown_tups]
|
||||
markdown_tups = [
|
||||
(re.sub(r"#", "", cast(str, key)).strip() if key else None, re.sub(r"<.*?>", "", value))
|
||||
for key, value in markdown_tups
|
||||
]
|
||||
|
||||
return markdown_tups
|
||||
|
||||
|
@@ -0,0 +1,22 @@
|
||||
from core.rag.extractor.markdown_extractor import MarkdownExtractor
|
||||
|
||||
|
||||
def test_markdown_to_tups():
|
||||
markdown = """
|
||||
this is some text without header
|
||||
|
||||
# title 1
|
||||
this is balabala text
|
||||
|
||||
## title 2
|
||||
this is more specific text.
|
||||
"""
|
||||
extractor = MarkdownExtractor(file_path="dummy_path")
|
||||
updated_output = extractor.markdown_to_tups(markdown)
|
||||
assert len(updated_output) == 3
|
||||
key, header_value = updated_output[0]
|
||||
assert key == None
|
||||
assert header_value.strip() == "this is some text without header"
|
||||
title_1, value = updated_output[1]
|
||||
assert title_1.strip() == "title 1"
|
||||
assert value.strip() == "this is balabala text"
|
Reference in New Issue
Block a user