fix: 同步web站点内容编码错误,导致乱码

This commit is contained in:
shaohuzhang1 2024-03-25 18:46:25 +08:00
parent a01d5beb59
commit cf003aa2d2

View File

@@ -6,6 +6,7 @@ from functools import reduce
from typing import List, Set from typing import List, Set
from urllib.parse import urljoin, urlparse, ParseResult, urlsplit from urllib.parse import urljoin, urlparse, ParseResult, urlsplit
import chardet
import html2text as ht import html2text as ht
import requests import requests
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
@@ -121,7 +122,7 @@ class Fork:
@staticmethod
def get_beautiful_soup(response):
    """Decode an HTTP response body and parse it with BeautifulSoup.

    The text encoding is chosen in this order:

    1. ``response.encoding``, when it is set and is not ``'ISO-8859-1'``.
    2. ``response.apparent_encoding`` (encoding guessed from the body).
    3. ``'utf-8'`` when neither of the above yields a value.

    Args:
        response: Object exposing ``encoding``, ``apparent_encoding`` and
            ``content`` (bytes) -- presumably a ``requests.Response``;
            confirm against the caller.

    Returns:
        A ``BeautifulSoup`` tree built with the ``"html.parser"`` backend.

    Raises:
        UnicodeDecodeError: If the body is not valid in the chosen encoding.
        LookupError: If the chosen encoding name is unknown to Python.
    """
    declared = response.encoding
    # NOTE(review): 'ISO-8859-1' is treated as "no real charset declared" --
    # presumably because the HTTP client reports it as a default for text
    # responses without an explicit charset; verify against the requests docs.
    if declared and declared != 'ISO-8859-1':
        encoding = declared
    else:
        # Detection can return None (e.g. for an empty body); fall back to
        # UTF-8 so that bytes.decode() is never called with None.
        encoding = response.apparent_encoding or 'utf-8'
    html_content = response.content.decode(encoding)
    return BeautifulSoup(html_content, "html.parser")