parent
d9e171c430
commit
86f500208f
66
apps/common/handle/impl/html_split_handle.py
Normal file
66
apps/common/handle/impl/html_split_handle.py
Normal file
@ -0,0 +1,66 @@
|
|||||||
|
# coding=utf-8
|
||||||
|
"""
|
||||||
|
@project: maxkb
|
||||||
|
@Author:虎
|
||||||
|
@file: html_split_handle.py
|
||||||
|
@date:2024/5/23 10:58
|
||||||
|
@desc:
|
||||||
|
"""
|
||||||
|
import re
|
||||||
|
from typing import List
|
||||||
|
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
from charset_normalizer import detect
|
||||||
|
from html2text import html2text
|
||||||
|
|
||||||
|
from common.handle.base_split_handle import BaseSplitHandle
|
||||||
|
from common.util.split_model import SplitModel
|
||||||
|
|
||||||
|
# Default split patterns: one compiled regex per Markdown heading level (1-6).
# Each pattern matches a heading line that begins at the start of the text or
# right after a newline. Levels 2-6 share one template that additionally
# rejects a '#' immediately before or after the marker, so a pattern only
# matches headings of exactly its own level.
default_pattern_list = [
    re.compile('(?<=^)# .*|(?<=\\n)# .*'),
    *(
        re.compile(
            '(?<=\\n)(?<!#){0} (?!#).*|(?<=^)(?<!#){0} (?!#).*'.format('#' * level)
        )
        for level in range(2, 7)
    ),
]
|
||||||
|
|
||||||
|
|
||||||
|
def get_encoding(buffer):
    """Return the name of the encoding to use for decoding an HTML byte buffer.

    Resolution order:

    1. An encoding declared by the document in a ``<meta charset="...">`` tag
       or in the legacy ``<meta http-equiv="Content-Type"
       content="...; charset=...">`` form, provided Python knows a codec of
       that name.
    2. The encoding guessed by ``charset_normalizer.detect``.
    3. ``'utf-8'`` when detection yields no encoding.

    :param buffer: raw bytes of the HTML document
    :return: an encoding name suitable for ``bytes.decode``
    """
    # Local import: only this function needs it, and the module-level import
    # block is left untouched.
    import codecs

    beautiful_soup = BeautifulSoup(buffer, "html.parser")
    declared_charsets = []
    for meta in beautiful_soup.find_all('meta'):
        attrs = meta.attrs or {}
        charset = attrs.get('charset')
        if isinstance(charset, str) and charset.strip():
            declared_charsets.append(charset.strip())
            continue
        # Legacy declaration: <meta http-equiv="Content-Type" content="text/html; charset=gbk">
        if str(attrs.get('http-equiv', '')).lower() == 'content-type':
            match = re.search(r'charset\s*=\s*["\']?([\w.:-]+)', str(attrs.get('content', '')), re.IGNORECASE)
            if match:
                declared_charsets.append(match.group(1))

    for charset in declared_charsets:
        try:
            # Skip names Python has no codec for; decoding with them would fail.
            codecs.lookup(charset)
        except (LookupError, ValueError):
            continue
        return charset

    detected = detect(buffer)['encoding']
    # detect() reports None when it cannot guess; bytes.decode(None) would raise TypeError.
    return detected if detected is not None else 'utf-8'
|
||||||
|
|
||||||
|
|
||||||
|
class HTMLSplitHandle(BaseSplitHandle):
    """Split handler for uploaded HTML files.

    The file is decoded, converted to text with ``html2text`` and then split
    with a ``SplitModel``.
    """

    def support(self, file, get_buffer):
        """Return True when ``file`` should be processed by this handler.

        Only the file name is inspected. A charset-confidence heuristic must
        not be used here: it accepts any non-ASCII content, so this handler
        would claim files intended for the other split handlers whenever it is
        consulted before them.

        :param file: uploaded file object exposing a ``name`` attribute
        :param get_buffer: callable returning the file's bytes; unused, kept
            for interface compatibility with the other handlers
        :return: True if the file name ends with ``.html`` (case-insensitive)
        """
        file_name: str = file.name.lower()
        return file_name.endswith(".html")

    def handle(self, file, pattern_list: List, with_filter: bool, limit: int, get_buffer, save_image):
        """Convert an HTML file to text and split it into segments.

        :param file: uploaded file object exposing a ``name`` attribute
        :param pattern_list: split patterns; when None or empty the
            module-level ``default_pattern_list`` is used
        :param with_filter: forwarded to ``SplitModel``
        :param limit: forwarded to ``SplitModel``
        :param get_buffer: callable returning the file's bytes
        :param save_image: unused by this handler
        :return: dict with ``name`` (the file name) and ``content`` (the result
            of ``SplitModel.parse``, or an empty list when the file could not
            be decoded or converted)
        """
        # Local import keeps this block self-contained.
        import logging

        buffer = get_buffer(file)

        patterns = pattern_list if pattern_list else default_pattern_list
        split_model = SplitModel(patterns, with_filter=with_filter, limit=limit)
        try:
            encoding = get_encoding(buffer)
            content = html2text(buffer.decode(encoding))
        except Exception:
            # Best effort: an undecodable or unparsable file yields no segments
            # instead of failing the whole upload. Exception (not BaseException)
            # lets KeyboardInterrupt / SystemExit propagate.
            logging.getLogger(__name__).exception("Failed to convert HTML file %s", file.name)
            return {'name': file.name,
                    'content': []}
        return {'name': file.name,
                'content': split_model.parse(content)
                }
|
||||||
@ -25,6 +25,7 @@ from common.event.common import work_thread_pool
|
|||||||
from common.event.listener_manage import ListenerManagement, SyncWebDocumentArgs, UpdateEmbeddingDatasetIdArgs
|
from common.event.listener_manage import ListenerManagement, SyncWebDocumentArgs, UpdateEmbeddingDatasetIdArgs
|
||||||
from common.exception.app_exception import AppApiException
|
from common.exception.app_exception import AppApiException
|
||||||
from common.handle.impl.doc_split_handle import DocSplitHandle
|
from common.handle.impl.doc_split_handle import DocSplitHandle
|
||||||
|
from common.handle.impl.html_split_handle import HTMLSplitHandle
|
||||||
from common.handle.impl.pdf_split_handle import PdfSplitHandle
|
from common.handle.impl.pdf_split_handle import PdfSplitHandle
|
||||||
from common.handle.impl.text_split_handle import TextSplitHandle
|
from common.handle.impl.text_split_handle import TextSplitHandle
|
||||||
from common.mixins.api_mixin import ApiMixin
|
from common.mixins.api_mixin import ApiMixin
|
||||||
@ -772,7 +773,7 @@ class FileBufferHandle:
|
|||||||
|
|
||||||
|
|
||||||
default_split_handle = TextSplitHandle()
|
default_split_handle = TextSplitHandle()
|
||||||
split_handles = [DocSplitHandle(), PdfSplitHandle(), default_split_handle]
|
split_handles = [HTMLSplitHandle(), DocSplitHandle(), PdfSplitHandle(), default_split_handle]
|
||||||
|
|
||||||
|
|
||||||
def save_image(image_list):
|
def save_image(image_list):
|
||||||
|
|||||||
@ -43,7 +43,7 @@ export function getImgUrl(name: string) {
|
|||||||
}
|
}
|
||||||
// 是否是白名单后缀
|
// 是否是白名单后缀
|
||||||
export function isRightType(name: string) {
|
export function isRightType(name: string) {
|
||||||
const typeList = ['txt', 'pdf', 'docx', 'csv', 'md']
|
const typeList = ['txt', 'pdf', 'docx', 'csv', 'md', 'html']
|
||||||
return typeList.includes(fileType(name))
|
return typeList.includes(fileType(name))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -17,7 +17,7 @@
|
|||||||
action="#"
|
action="#"
|
||||||
:auto-upload="false"
|
:auto-upload="false"
|
||||||
:show-file-list="false"
|
:show-file-list="false"
|
||||||
accept=".txt, .md, .csv, .log, .docx, .pdf"
|
accept=".txt, .md, .csv, .log, .docx, .pdf, .html"
|
||||||
:limit="50"
|
:limit="50"
|
||||||
:on-exceed="onExceed"
|
:on-exceed="onExceed"
|
||||||
:on-change="fileHandleChange"
|
:on-change="fileHandleChange"
|
||||||
@ -31,7 +31,9 @@
|
|||||||
<em class="hover" @click.prevent="handlePreview(true)"> 选择文件夹 </em>
|
<em class="hover" @click.prevent="handlePreview(true)"> 选择文件夹 </em>
|
||||||
</p>
|
</p>
|
||||||
<div class="upload__decoration">
|
<div class="upload__decoration">
|
||||||
<p>支持格式:TXT、Markdown、PDF、DOCX,每次最多上传50个文件,每个文件不超过 100MB</p>
|
<p>
|
||||||
|
支持格式:TXT、Markdown、PDF、DOCX、HTML 每次最多上传50个文件,每个文件不超过 100MB
|
||||||
|
</p>
|
||||||
<p>若使用【高级分段】建议上传前规范文件的分段标识</p>
|
<p>若使用【高级分段】建议上传前规范文件的分段标识</p>
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user