refactor: enhance HTML tag removal in text processing to also strip audio, video, and image tags
This commit is contained in:
parent
d0722dc048
commit
0316afa299
@ -116,6 +116,9 @@ def markdown_to_plain_text(md: str) -> str:
|
|||||||
text = re.sub(r'\n{2,}', '\n', text)
|
text = re.sub(r'\n{2,}', '\n', text)
|
||||||
# 使用正则表达式去除所有 HTML 标签
|
# 使用正则表达式去除所有 HTML 标签
|
||||||
text = re.sub(r'<[^>]+>', '', text)
|
text = re.sub(r'<[^>]+>', '', text)
|
||||||
|
# 先移除特定媒体标签(优先级高于通用HTML标签移除)
|
||||||
|
text = re.sub(r'<(audio|video)[^>]*>.*?</\1>', '', text, flags=re.DOTALL) # 匹配音频/视频标签
|
||||||
|
text = re.sub(r'<img[^>]*>', '', text) # 匹配图片标签
|
||||||
# 去除多余的空白字符(包括换行符、制表符等)
|
# 去除多余的空白字符(包括换行符、制表符等)
|
||||||
text = re.sub(r'\s+', ' ', text)
|
text = re.sub(r'\s+', ' ', text)
|
||||||
# 去除表单渲染
|
# 去除表单渲染
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user