feat: implement charset detection utility in fork.py
--bug=1059829 --user=刘瑞斌 【知识库】web导入知识库中文字符乱码 https://www.tapd.cn/62980211/s/1746162
This commit is contained in:
parent
1742db681d
commit
6184058e4d
@ -1,5 +1,4 @@
|
|||||||
import copy
|
import copy
|
||||||
import logging
|
|
||||||
import re
|
import re
|
||||||
import traceback
|
import traceback
|
||||||
from functools import reduce
|
from functools import reduce
|
||||||
@ -139,18 +138,30 @@ class Fork:
|
|||||||
html_content = response.content.decode(encoding)
|
html_content = response.content.decode(encoding)
|
||||||
beautiful_soup = BeautifulSoup(html_content, "html.parser")
|
beautiful_soup = BeautifulSoup(html_content, "html.parser")
|
||||||
meta_list = beautiful_soup.find_all('meta')
|
meta_list = beautiful_soup.find_all('meta')
|
||||||
charset_list = [meta.attrs.get('charset') for meta in meta_list if
|
charset_list = Fork.get_charset_list(meta_list)
|
||||||
meta.attrs is not None and 'charset' in meta.attrs]
|
|
||||||
if len(charset_list) > 0:
|
if len(charset_list) > 0:
|
||||||
charset = charset_list[0]
|
charset = charset_list[0]
|
||||||
if charset != encoding:
|
if charset != encoding:
|
||||||
try:
|
try:
|
||||||
html_content = response.content.decode(charset)
|
html_content = response.content.decode(charset, errors='replace')
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
maxkb_logger.error(f'{e}')
|
maxkb_logger.error(f'{e}: {traceback.format_exc()}')
|
||||||
return BeautifulSoup(html_content, "html.parser")
|
return BeautifulSoup(html_content, "html.parser")
|
||||||
return beautiful_soup
|
return beautiful_soup
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def get_charset_list(meta_list):
|
||||||
|
charset_list = []
|
||||||
|
for meta in meta_list:
|
||||||
|
if meta.attrs is not None:
|
||||||
|
if 'charset' in meta.attrs:
|
||||||
|
charset_list.append(meta.attrs.get('charset'))
|
||||||
|
elif meta.attrs.get('http-equiv', '').lower() == 'content-type' and 'content' in meta.attrs:
|
||||||
|
match = re.search(r'charset=([^\s;]+)', meta.attrs['content'], re.I)
|
||||||
|
if match:
|
||||||
|
charset_list.append(match.group(1))
|
||||||
|
return charset_list
|
||||||
|
|
||||||
def fork(self):
|
def fork(self):
|
||||||
try:
|
try:
|
||||||
|
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user