WebscraperTool bypass cloudflare site by cloudscraper (#6337)

2024-07-17 14:13:57 +08:00
parent 7943f7f697
commit 4e2fba404d
3 changed files with 58 additions and 21 deletions
--- a/api/core/tools/utils/web_reader_tool.py
+++ b/api/core/tools/utils/web_reader_tool.py
@@ -10,6 +10,7 @@ import unicodedata
 from contextlib import contextmanager
 from urllib.parse import unquote

+import cloudscraper
 import requests
 from bs4 import BeautifulSoup, CData, Comment, NavigableString
 from newspaper import Article
@@ -46,29 +47,34 @@ def get_url(url: str, user_agent: str = None) -> str:
    supported_content_types = extract_processor.SUPPORT_URL_CONTENT_TYPES + ["text/html"]
    response = requests.head(url, headers=headers, allow_redirects=True, timeout=(5, 10))

+    if response.status_code == 200:
+        # check content-type
+        content_type = response.headers.get('Content-Type')
+        if content_type:
+            main_content_type = response.headers.get('Content-Type').split(';')[0].strip()
+        else:
+            content_disposition = response.headers.get('Content-Disposition', '')
+            filename_match = re.search(r'filename="([^"]+)"', content_disposition)
+            if filename_match:
+                filename = unquote(filename_match.group(1))
+                extension = re.search(r'\.(\w+)$', filename)
+                if extension:
+                    main_content_type = mimetypes.guess_type(filename)[0]
+
+        if main_content_type not in supported_content_types:
+            return "Unsupported content-type [{}] of URL.".format(main_content_type)
+
+        if main_content_type in extract_processor.SUPPORT_URL_CONTENT_TYPES:
+            return ExtractProcessor.load_from_url(url, return_text=True)
+
+        response = requests.get(url, headers=headers, allow_redirects=True, timeout=(120, 300))
+    elif response.status_code == 403:
+        scraper = cloudscraper.create_scraper()
+        response = scraper.get(url, headers=headers, allow_redirects=True, timeout=(120, 300))
+
    if response.status_code != 200:
        return "URL returned status code {}.".format(response.status_code)

-    # check content-type
-    content_type = response.headers.get('Content-Type')
-    if content_type:
-        main_content_type = response.headers.get('Content-Type').split(';')[0].strip()
-    else:
-        content_disposition = response.headers.get('Content-Disposition', '')
-        filename_match = re.search(r'filename="([^"]+)"', content_disposition)
-        if filename_match:
-            filename = unquote(filename_match.group(1))
-            extension = re.search(r'\.(\w+)$', filename)
-            if extension:
-                main_content_type = mimetypes.guess_type(filename)[0]
-
-    if main_content_type not in supported_content_types:
-        return "Unsupported content-type [{}] of URL.".format(main_content_type)
-
-    if main_content_type in extract_processor.SUPPORT_URL_CONTENT_TYPES:
-        return ExtractProcessor.load_from_url(url, return_text=True)
-
-    response = requests.get(url, headers=headers, allow_redirects=True, timeout=(120, 300))
    a = extract_using_readabilipy(response.text)

    if not a['plain_text'] or not a['plain_text'].strip():