diff --git a/api/factories/file_factory.py b/api/factories/file_factory.py
index 512a9cb60..b2bcee5dc 100644
--- a/api/factories/file_factory.py
+++ b/api/factories/file_factory.py
@@ -1,4 +1,6 @@
 import mimetypes
+import os
+import urllib.parse
 import uuid
 from collections.abc import Callable, Mapping, Sequence
 from typing import Any, cast
@@ -240,16 +242,32 @@ def _build_from_remote_url(
 
 def _get_remote_file_info(url: str):
+    """Probe a remote file with a HEAD request and derive its metadata.
+
+    The filename and MIME type are first guessed from the URL path. A 200
+    response may then override the filename (Content-Disposition), supply the
+    size (Content-Length) and fill in a still-unknown MIME type (Content-Type).
+
+    Returns:
+        A ``(mime_type, filename, file_size)`` tuple; ``mime_type`` is ``""``
+        when it cannot be determined and ``file_size`` is ``-1`` when unknown.
+    """
     file_size = -1
-    filename = url.split("/")[-1].split("?")[0] or "unknown_file"
+    parsed_url = urllib.parse.urlparse(url)
+    # Only the path component is used, so query strings and fragments never end up in the name;
+    # keep the placeholder for URLs whose path has no final segment (e.g. a trailing "/").
+    filename = os.path.basename(parsed_url.path) or "unknown_file"
+    # guess_type() yields None for unknown types; normalise to "" so the result is always a str.
     mime_type = mimetypes.guess_type(filename)[0] or ""
 
     resp = ssrf_proxy.head(url, follow_redirects=True)
     resp = cast(httpx.Response, resp)
     if resp.status_code == httpx.codes.OK:
         if content_disposition := resp.headers.get("Content-Disposition"):
             filename = str(content_disposition.split("filename=")[-1].strip('"'))
+            # The server-supplied name replaces the URL-derived one, so re-guess the type from it.
+            mime_type = mimetypes.guess_type(filename)[0] or ""
         file_size = int(resp.headers.get("Content-Length", file_size))
         mime_type = mime_type or str(resp.headers.get("Content-Type", ""))
 
     return mime_type, filename, file_size
 