feat: mypy for all type check (#10921)

2024-12-24 18:38:51 +08:00
parent c91e8b1737
commit 56e15d09a9
584 changed files with 3975 additions and 2826 deletions
--- a/api/services/website_service.py
+++ b/api/services/website_service.py
@@ -1,8 +1,9 @@
 import datetime
 import json
+from typing import Any

 import requests
-from flask_login import current_user
+from flask_login import current_user  # type: ignore

 from core.helper import encrypter
 from core.rag.extractor.firecrawl.firecrawl_app import FirecrawlApp
@@ -23,9 +24,9 @@ class WebsiteService:

    @classmethod
    def crawl_url(cls, args: dict) -> dict:
-        provider = args.get("provider")
+        provider = args.get("provider", "")
        url = args.get("url")
-        options = args.get("options")
+        options = args.get("options", "")
        credentials = ApiKeyAuthService.get_auth_credentials(current_user.current_tenant_id, "website", provider)
        if provider == "firecrawl":
            # decrypt api_key
@@ -164,16 +165,18 @@ class WebsiteService:
        return crawl_status_data

    @classmethod
-    def get_crawl_url_data(cls, job_id: str, provider: str, url: str, tenant_id: str) -> dict | None:
+    def get_crawl_url_data(cls, job_id: str, provider: str, url: str, tenant_id: str) -> dict[Any, Any] | None:
        credentials = ApiKeyAuthService.get_auth_credentials(tenant_id, "website", provider)
        # decrypt api_key
        api_key = encrypter.decrypt_token(tenant_id=tenant_id, token=credentials.get("config").get("api_key"))
+        # FIXME data is redefine too many times here, use Any to ease the type checking, fix it later
+        data: Any
        if provider == "firecrawl":
            file_key = "website_files/" + job_id + ".txt"
            if storage.exists(file_key):
-                data = storage.load_once(file_key)
-                if data:
-                    data = json.loads(data.decode("utf-8"))
+                d = storage.load_once(file_key)
+                if d:
+                    data = json.loads(d.decode("utf-8"))
            else:
                firecrawl_app = FirecrawlApp(api_key=api_key, base_url=credentials.get("config").get("base_url", None))
                result = firecrawl_app.check_crawl_status(job_id)
@@ -183,22 +186,17 @@ class WebsiteService:
            if data:
                for item in data:
                    if item.get("source_url") == url:
-                        return item
+                        return dict(item)
            return None
        elif provider == "jinareader":
-            file_key = "website_files/" + job_id + ".txt"
-            if storage.exists(file_key):
-                data = storage.load_once(file_key)
-                if data:
-                    data = json.loads(data.decode("utf-8"))
-            elif not job_id:
+            if not job_id:
                response = requests.get(
                    f"https://r.jina.ai/{url}",
                    headers={"Accept": "application/json", "Authorization": f"Bearer {api_key}"},
                )
                if response.json().get("code") != 200:
                    raise ValueError("Failed to crawl")
-                return response.json().get("data")
+                return dict(response.json().get("data", {}))
            else:
                api_key = encrypter.decrypt_token(tenant_id=tenant_id, token=credentials.get("config").get("api_key"))
                response = requests.post(
@@ -218,12 +216,13 @@ class WebsiteService:
                data = response.json().get("data", {})
                for item in data.get("processed", {}).values():
                    if item.get("data", {}).get("url") == url:
-                        return item.get("data", {})
+                        return dict(item.get("data", {}))
+            return None
        else:
            raise ValueError("Invalid provider")

    @classmethod
-    def get_scrape_url_data(cls, provider: str, url: str, tenant_id: str, only_main_content: bool) -> dict | None:
+    def get_scrape_url_data(cls, provider: str, url: str, tenant_id: str, only_main_content: bool) -> dict:
        credentials = ApiKeyAuthService.get_auth_credentials(tenant_id, "website", provider)
        if provider == "firecrawl":
            # decrypt api_key