diff --git a/api/core/rag/extractor/firecrawl/firecrawl_app.py b/api/core/rag/extractor/firecrawl/firecrawl_app.py index 836a1398b..83a4ac651 100644 --- a/api/core/rag/extractor/firecrawl/firecrawl_app.py +++ b/api/core/rag/extractor/firecrawl/firecrawl_app.py @@ -22,6 +22,7 @@ class FirecrawlApp: "formats": ["markdown"], "onlyMainContent": True, "timeout": 30000, + "integration": "dify", } if params: json_data.update(params) @@ -39,7 +40,7 @@ class FirecrawlApp: def crawl_url(self, url, params=None) -> str: # Documentation: https://docs.firecrawl.dev/api-reference/endpoint/crawl-post headers = self._prepare_headers() - json_data = {"url": url} + json_data = {"url": url, "integration": "dify"} if params: json_data.update(params) response = self._post_request(f"{self.base_url}/v1/crawl", json_data, headers) @@ -49,7 +50,6 @@ class FirecrawlApp: return cast(str, job_id) else: self._handle_error(response, "start crawl job") - # FIXME: unreachable code for mypy return "" # unreachable def check_crawl_status(self, job_id) -> dict[str, Any]: @@ -82,7 +82,6 @@ class FirecrawlApp: ) else: self._handle_error(response, "check crawl status") - # FIXME: unreachable code for mypy return {} # unreachable def _format_crawl_status_response( @@ -126,4 +125,31 @@ class FirecrawlApp: def _handle_error(self, response, action) -> None: error_message = response.json().get("error", "Unknown error occurred") - raise Exception(f"Failed to {action}. Status code: {response.status_code}. Error: {error_message}") + raise Exception(f"Failed to {action}. Status code: {response.status_code}. Error: {error_message}") # type: ignore[return] + + def search(self, query: str, params: dict[str, Any] | None = None) -> dict[str, Any]: + # Documentation: https://docs.firecrawl.dev/api-reference/endpoint/search + headers = self._prepare_headers() + json_data = { + "query": query, + "limit": 5, + "lang": "en", + "country": "us", + "timeout": 60000, + "ignoreInvalidURLs": False, + "scrapeOptions": {}, + "integration": "dify", + } + if params: + json_data.update(params) + response = self._post_request(f"{self.base_url}/v1/search", json_data, headers) + if response.status_code == 200: + response_data = response.json() + if not response_data.get("success"): + raise Exception(f"Search failed. Error: {response_data.get('warning', 'Unknown error')}") + return cast(dict[str, Any], response_data) + elif response.status_code in {402, 409, 500, 429, 408}: + self._handle_error(response, "perform search") + return {} # Avoid additional exception after handling error + else: + raise Exception(f"Failed to perform search. Status code: {response.status_code}")