From 6b5c2bea4d36d814d56ae9dd3f1aae16baea9a89 Mon Sep 17 00:00:00 2001 From: Yongtao Huang Date: Tue, 19 Aug 2025 22:44:22 +0800 Subject: [PATCH] Fix: correctly match http/https URLs in image upload file (#24180) --- api/core/indexing_runner.py | 2 +- api/core/tools/utils/rag_web_reader.py | 17 ------------- api/core/tools/utils/web_reader_tool.py | 12 ++++----- api/tasks/clean_dataset_task.py | 2 +- api/tasks/clean_document_task.py | 2 +- .../core/tools/utils/test_web_reader_tool.py | 25 +++++++++++++++++++ 6 files changed, 34 insertions(+), 26 deletions(-) delete mode 100644 api/core/tools/utils/rag_web_reader.py create mode 100644 api/tests/unit_tests/core/tools/utils/test_web_reader_tool.py diff --git a/api/core/indexing_runner.py b/api/core/indexing_runner.py index 2387658bb..b40278c76 100644 --- a/api/core/indexing_runner.py +++ b/api/core/indexing_runner.py @@ -30,7 +30,7 @@ from core.rag.splitter.fixed_text_splitter import ( FixedRecursiveCharacterTextSplitter, ) from core.rag.splitter.text_splitter import TextSplitter -from core.tools.utils.rag_web_reader import get_image_upload_file_ids +from core.tools.utils.web_reader_tool import get_image_upload_file_ids from extensions.ext_database import db from extensions.ext_redis import redis_client from extensions.ext_storage import storage diff --git a/api/core/tools/utils/rag_web_reader.py b/api/core/tools/utils/rag_web_reader.py deleted file mode 100644 index 22c47fa81..000000000 --- a/api/core/tools/utils/rag_web_reader.py +++ /dev/null @@ -1,17 +0,0 @@ -import re - - -def get_image_upload_file_ids(content): - pattern = r"!\[image\]\((http?://.*?(file-preview|image-preview))\)" - matches = re.findall(pattern, content) - image_upload_file_ids = [] - for match in matches: - if match[1] == "file-preview": - content_pattern = r"files/([^/]+)/file-preview" - else: - content_pattern = r"files/([^/]+)/image-preview" - content_match = re.search(content_pattern, match[0]) - if content_match: - image_upload_file_id = content_match.group(1) - image_upload_file_ids.append(image_upload_file_id) - return image_upload_file_ids diff --git a/api/core/tools/utils/web_reader_tool.py b/api/core/tools/utils/web_reader_tool.py index df052c16d..770c0ef7b 100644 --- a/api/core/tools/utils/web_reader_tool.py +++ b/api/core/tools/utils/web_reader_tool.py @@ -80,14 +80,14 @@ def get_url(url: str, user_agent: Optional[str] = None) -> str: else: content = response.text - article = extract_using_readabilipy(content) + article = extract_using_readability(content) if not article.text: return "" res = FULL_TEMPLATE.format( title=article.title, - author=article.auther, + author=article.author, text=article.text, ) @@ -97,15 +97,15 @@ def get_url(url: str, user_agent: Optional[str] = None) -> str: @dataclass class Article: title: str - auther: str + author: str text: Sequence[dict] -def extract_using_readabilipy(html: str): +def extract_using_readability(html: str): json_article: dict[str, Any] = simple_json_from_html_string(html, use_readability=True) article = Article( title=json_article.get("title") or "", - auther=json_article.get("byline") or "", + author=json_article.get("byline") or "", text=json_article.get("plain_text") or [], ) @@ -113,7 +113,7 @@ def extract_using_readabilipy(html: str): def get_image_upload_file_ids(content): - pattern = r"!\[image\]\((http?://.*?(file-preview|image-preview))\)" + pattern = r"!\[image\]\((https?://.*?(file-preview|image-preview))\)" matches = re.findall(pattern, content) image_upload_file_ids = [] for match in matches: diff --git a/api/tasks/clean_dataset_task.py b/api/tasks/clean_dataset_task.py index 9a45115b0..7b940847c 100644 --- a/api/tasks/clean_dataset_task.py +++ b/api/tasks/clean_dataset_task.py @@ -5,7 +5,7 @@ import click from celery import shared_task # type: ignore from core.rag.index_processor.index_processor_factory import IndexProcessorFactory -from core.tools.utils.rag_web_reader import get_image_upload_file_ids +from core.tools.utils.web_reader_tool import get_image_upload_file_ids from extensions.ext_database import db from extensions.ext_storage import storage from models.dataset import ( diff --git a/api/tasks/clean_document_task.py b/api/tasks/clean_document_task.py index d690106d1..5479ba8e8 100644 --- a/api/tasks/clean_document_task.py +++ b/api/tasks/clean_document_task.py @@ -6,7 +6,7 @@ import click from celery import shared_task # type: ignore from core.rag.index_processor.index_processor_factory import IndexProcessorFactory -from core.tools.utils.rag_web_reader import get_image_upload_file_ids +from core.tools.utils.web_reader_tool import get_image_upload_file_ids from extensions.ext_database import db from extensions.ext_storage import storage from models.dataset import Dataset, DatasetMetadataBinding, DocumentSegment diff --git a/api/tests/unit_tests/core/tools/utils/test_web_reader_tool.py b/api/tests/unit_tests/core/tools/utils/test_web_reader_tool.py new file mode 100644 index 000000000..c17308baa --- /dev/null +++ b/api/tests/unit_tests/core/tools/utils/test_web_reader_tool.py @@ -0,0 +1,25 @@ +from core.tools.utils.web_reader_tool import get_image_upload_file_ids + + +def test_get_image_upload_file_ids(): + # should extract id from https + file-preview + content = "![image](https://example.com/a/b/files/abc123/file-preview)" + assert get_image_upload_file_ids(content) == ["abc123"] + + # should extract id from http + image-preview + content = "![image](http://host/files/xyz789/image-preview)" + assert get_image_upload_file_ids(content) == ["xyz789"] + + # should not match invalid scheme 'htt://' + content = "![image](htt://host/files/bad/file-preview)" + assert get_image_upload_file_ids(content) == [] + + # should extract multiple ids in order + content = """ + some text + ![image](https://h/files/id1/file-preview) + middle + ![image](http://h/files/id2/image-preview) + end + """ + assert get_image_upload_file_ids(content) == ["id1", "id2"]