From 1c7404099ddce2baf3391393ce01e655b33e31b3 Mon Sep 17 00:00:00 2001 From: baonudesifeizhai <85092850+baonudesifeizhai@users.noreply.github.com> Date: Thu, 3 Jul 2025 05:06:49 -0400 Subject: [PATCH] fix: prevent timeout in file encoding detection for large files (#21453) Co-authored-by: crazywoola <427733928@qq.com> --- api/core/rag/extractor/helpers.py | 10 +++++++--- api/core/rag/extractor/text_extractor.py | 6 +++++- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/api/core/rag/extractor/helpers.py b/api/core/rag/extractor/helpers.py index 69ca9d5d6..3d2fb55d9 100644 --- a/api/core/rag/extractor/helpers.py +++ b/api/core/rag/extractor/helpers.py @@ -1,7 +1,6 @@ """Document loader helpers.""" import concurrent.futures -from pathlib import Path from typing import NamedTuple, Optional, cast @@ -16,7 +15,7 @@ class FileEncoding(NamedTuple): """The language of the file.""" -def detect_file_encodings(file_path: str, timeout: int = 5) -> list[FileEncoding]: +def detect_file_encodings(file_path: str, timeout: int = 5, sample_size: int = 1024 * 1024) -> list[FileEncoding]: """Try to detect the file encoding. Returns a list of `FileEncoding` tuples with the detected encodings ordered @@ -25,11 +24,16 @@ def detect_file_encodings(file_path: str, timeout: int = 5) -> list[FileEncoding Args: file_path: The path to the file to detect the encoding for. timeout: The timeout in seconds for the encoding detection. + sample_size: The number of bytes to read for encoding detection. Default is 1MB. + For large files, reading only a sample is sufficient and prevents timeout. """ import chardet def read_and_detect(file_path: str) -> list[dict]: - rawdata = Path(file_path).read_bytes() + with open(file_path, "rb") as f: + # Read only a sample of the file for encoding detection + # This prevents timeout on large files while still providing accurate encoding detection + rawdata = f.read(sample_size) return cast(list[dict], chardet.detect_all(rawdata)) with concurrent.futures.ThreadPoolExecutor() as executor: diff --git a/api/core/rag/extractor/text_extractor.py b/api/core/rag/extractor/text_extractor.py index b2b51d71d..a00d328cb 100644 --- a/api/core/rag/extractor/text_extractor.py +++ b/api/core/rag/extractor/text_extractor.py @@ -36,8 +36,12 @@ class TextExtractor(BaseExtractor): break except UnicodeDecodeError: continue + else: + raise RuntimeError( + f"Decode failed: {self._file_path}, all detected encodings failed. Original error: {e}" + ) else: - raise RuntimeError(f"Error loading {self._file_path}") from e + raise RuntimeError(f"Decode failed: {self._file_path}, specified encoding failed. Original error: {e}") except Exception as e: raise RuntimeError(f"Error loading {self._file_path}") from e