fix wrong charset when decoding Chinese content (#6774)

Co-authored-by: zhangwb <zhangwb@zts.com.cn>
This commit is contained in:
eric-0x72
2024-07-30 21:32:45 +08:00
committed by GitHub
parent 53a89bbbc7
commit 98d9837fbc

View File

@@ -10,6 +10,7 @@ import unicodedata
from contextlib import contextmanager from contextlib import contextmanager
from urllib.parse import unquote from urllib.parse import unquote
import chardet
import cloudscraper import cloudscraper
from bs4 import BeautifulSoup, CData, Comment, NavigableString from bs4 import BeautifulSoup, CData, Comment, NavigableString
from regex import regex from regex import regex
@@ -75,7 +76,18 @@ def get_url(url: str, user_agent: str = None) -> str:
if response.status_code != 200: if response.status_code != 200:
return "URL returned status code {}.".format(response.status_code) return "URL returned status code {}.".format(response.status_code)
a = extract_using_readabilipy(response.text) # Detect encoding using chardet
detected_encoding = chardet.detect(response.content)
encoding = detected_encoding['encoding']
if encoding:
try:
content = response.content.decode(encoding)
except (UnicodeDecodeError, TypeError):
content = response.text
else:
content = response.text
a = extract_using_readabilipy(content)
if not a['plain_text'] or not a['plain_text'].strip(): if not a['plain_text'] or not a['plain_text'].strip():
return '' return ''