refactor: Update Firecrawl to use v1 API (#12574)

Co-authored-by: Ademílson Tonato <ademilson.tonato@refurbed.com>
Author: Ademílson Tonato
Committed by: GitHub
Date: 2025-01-23 03:14:48 +00:00
Parent: f565f08aa0
Commit: 6024d8a42d

4 changed files with 81 additions and 81 deletions


@@ -38,30 +38,24 @@ class WebsiteService:
         only_main_content = options.get("only_main_content", False)
         if not crawl_sub_pages:
             params = {
-                "crawlerOptions": {
-                    "includes": [],
-                    "excludes": [],
-                    "generateImgAltText": True,
-                    "limit": 1,
-                    "returnOnlyUrls": False,
-                    "pageOptions": {"onlyMainContent": only_main_content, "includeHtml": False},
-                }
+                "includes": [],
+                "excludes": [],
+                "generateImgAltText": True,
+                "limit": 1,
+                "scrapeOptions": {"onlyMainContent": only_main_content},
             }
         else:
             includes = options.get("includes").split(",") if options.get("includes") else []
             excludes = options.get("excludes").split(",") if options.get("excludes") else []
             params = {
-                "crawlerOptions": {
-                    "includes": includes,
-                    "excludes": excludes,
-                    "generateImgAltText": True,
-                    "limit": options.get("limit", 1),
-                    "returnOnlyUrls": False,
-                    "pageOptions": {"onlyMainContent": only_main_content, "includeHtml": False},
-                }
+                "includes": includes,
+                "excludes": excludes,
+                "generateImgAltText": True,
+                "limit": options.get("limit", 1),
+                "scrapeOptions": {"onlyMainContent": only_main_content},
             }
         if options.get("max_depth"):
-            params["crawlerOptions"]["maxDepth"] = options.get("max_depth")
+            params["maxDepth"] = options.get("max_depth")
         job_id = firecrawl_app.crawl_url(url, params)
         website_crawl_time_cache_key = f"website_crawl_{job_id}"
         time = str(datetime.datetime.now().timestamp())
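
Below is a hedged sketch, not part of the diff, of the v1-style crawl request the service builds after this change. The options values are hypothetical; the parameter names and the crawl_url call come from the hunk above. v1 drops the nested "crawlerOptions"/"pageOptions" wrappers in favor of a flat dict plus "scrapeOptions" for page-level settings.

# Hypothetical input; real values come from the crawl request options.
options = {"includes": "blog/*,docs/*", "excludes": "", "limit": 5,
           "max_depth": 2, "only_main_content": True}

includes = options.get("includes").split(",") if options.get("includes") else []
excludes = options.get("excludes").split(",") if options.get("excludes") else []
params = {
    "includes": includes,
    "excludes": excludes,
    "generateImgAltText": True,
    "limit": options.get("limit", 1),
    # Page-level settings now live under "scrapeOptions".
    "scrapeOptions": {"onlyMainContent": options.get("only_main_content", False)},
}
if options.get("max_depth"):
    params["maxDepth"] = options.get("max_depth")
# job_id = firecrawl_app.crawl_url(url, params)  # firecrawl_app and url as in the hunk above
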
@@ -228,7 +222,7 @@ class WebsiteService:
             # decrypt api_key
             api_key = encrypter.decrypt_token(tenant_id=tenant_id, token=credentials.get("config").get("api_key"))
             firecrawl_app = FirecrawlApp(api_key=api_key, base_url=credentials.get("config").get("base_url", None))
-            params = {"pageOptions": {"onlyMainContent": only_main_content, "includeHtml": False}}
+            params = {"onlyMainContent": only_main_content}
             result = firecrawl_app.scrape_url(url, params)
             return result
         else:
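
For the single-page path, a minimal sketch of the flattened v1 scrape call, assuming api_key, base_url, and url are resolved from the tenant credentials as in the hunk above:

# v1 scrape params are flat: no "pageOptions" wrapper, and "includeHtml"
# is no longer passed.
firecrawl_app = FirecrawlApp(api_key=api_key, base_url=base_url)
params = {"onlyMainContent": only_main_content}
result = firecrawl_app.scrape_url(url, params)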