feat: add initial implementations of various file handling classes for CSV, XLS, and XLSX formats

parent c8ce7e28d8
commit 48297d81e5
@@ -218,6 +218,14 @@ class PermissionConstants(Enum):
                                                                                           RoleConstants.USER])
    KNOWLEDGE_DELETE = Permission(group=Group.KNOWLEDGE, operate=Operate.DELETE, role_list=[RoleConstants.ADMIN,
                                                                                            RoleConstants.USER])
    DOCUMENT_READ = Permission(group=Group.KNOWLEDGE, operate=Operate.READ, role_list=[RoleConstants.ADMIN,
                                                                                       RoleConstants.USER])
    DOCUMENT_CREATE = Permission(group=Group.KNOWLEDGE, operate=Operate.CREATE, role_list=[RoleConstants.ADMIN,
                                                                                           RoleConstants.USER])
    DOCUMENT_EDIT = Permission(group=Group.KNOWLEDGE, operate=Operate.EDIT, role_list=[RoleConstants.ADMIN,
                                                                                       RoleConstants.USER])
    DOCUMENT_DELETE = Permission(group=Group.KNOWLEDGE, operate=Operate.DELETE, role_list=[RoleConstants.ADMIN,
                                                                                           RoleConstants.USER])

    def get_workspace_application_permission(self):
        return lambda r, kwargs: Permission(group=self.value.group, operate=self.value.operate,

8  apps/common/handle/__init__.py  Normal file
@@ -0,0 +1,8 @@
# coding=utf-8
"""
@project: qabot
@Author:虎
@file: __init__.py.py
@date:2023/9/6 10:09
@desc:
"""

52  apps/common/handle/base_parse_qa_handle.py  Normal file
@@ -0,0 +1,52 @@
# coding=utf-8
"""
@project: maxkb
@Author:虎
@file: base_parse_qa_handle.py
@date:2024/5/21 14:56
@desc:
"""
from abc import ABC, abstractmethod


def get_row_value(row, title_row_index_dict, field):
    index = title_row_index_dict.get(field)
    if index is None:
        return None
    if (len(row) - 1) >= index:
        return row[index]
    return None


def get_title_row_index_dict(title_row_list):
    title_row_index_dict = {}
    if len(title_row_list) == 1:
        title_row_index_dict['content'] = 0
    elif len(title_row_list) == 2:
        title_row_index_dict['title'] = 0
        title_row_index_dict['content'] = 1
    else:
        title_row_index_dict['title'] = 0
        title_row_index_dict['content'] = 1
        title_row_index_dict['problem_list'] = 2
    for index in range(len(title_row_list)):
        title_row = title_row_list[index]
        if title_row is None:
            title_row = ''
        if title_row.startswith('分段标题'):
            title_row_index_dict['title'] = index
        if title_row.startswith('分段内容'):
            title_row_index_dict['content'] = index
        if title_row.startswith('问题'):
            title_row_index_dict['problem_list'] = index
    return title_row_index_dict


class BaseParseQAHandle(ABC):
    @abstractmethod
    def support(self, file, get_buffer):
        pass

    @abstractmethod
    def handle(self, file, get_buffer, save_image):
        pass
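
A quick illustration of the header-mapping helpers above (values are hypothetical; the column names are exactly the ones the parser looks for):

    header = ['分段标题', '分段内容', '问题']
    index_dict = get_title_row_index_dict(header)
    # -> {'title': 0, 'content': 1, 'problem_list': 2}

    row = ['Install guide', 'Run the installer first']
    get_row_value(row, index_dict, 'content')       # -> 'Run the installer first'
    get_row_value(row, index_dict, 'problem_list')  # -> None (row is shorter than the index)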

23  apps/common/handle/base_parse_table_handle.py  Normal file
@@ -0,0 +1,23 @@
# coding=utf-8
"""
@project: maxkb
@Author:虎
@file: base_parse_qa_handle.py
@date:2024/5/21 14:56
@desc:
"""
from abc import ABC, abstractmethod


class BaseParseTableHandle(ABC):
    @abstractmethod
    def support(self, file, get_buffer):
        pass

    @abstractmethod
    def handle(self, file, get_buffer, save_image):
        pass

    @abstractmethod
    def get_content(self, file, save_image):
        pass

24  apps/common/handle/base_split_handle.py  Normal file
@@ -0,0 +1,24 @@
# coding=utf-8
"""
@project: maxkb
@Author:虎
@file: base_split_handle.py
@date:2024/3/27 18:13
@desc:
"""
from abc import ABC, abstractmethod
from typing import List


class BaseSplitHandle(ABC):
    @abstractmethod
    def support(self, file, get_buffer):
        pass

    @abstractmethod
    def handle(self, file, pattern_list: List, with_filter: bool, limit: int, get_buffer, save_image):
        pass

    @abstractmethod
    def get_content(self, file, save_image):
        pass

30  apps/common/handle/base_to_response.py  Normal file
@@ -0,0 +1,30 @@
# coding=utf-8
"""
@project: MaxKB
@Author:虎
@file: base_to_response.py
@date:2024/9/6 16:04
@desc:
"""
from abc import ABC, abstractmethod

from rest_framework import status


class BaseToResponse(ABC):

    @abstractmethod
    def to_block_response(self, chat_id, chat_record_id, content, is_end, completion_tokens,
                          prompt_tokens, other_params: dict = None,
                          _status=status.HTTP_200_OK):
        pass

    @abstractmethod
    def to_stream_chunk_response(self, chat_id, chat_record_id, node_id, up_node_id_list, content, is_end,
                                 completion_tokens,
                                 prompt_tokens, other_params: dict = None):
        pass

    @staticmethod
    def format_stream_chunk(response_str):
        return 'data: ' + response_str + '\n\n'
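
format_stream_chunk wraps each serialized chunk in Server-Sent Events framing: a 'data: ' prefix terminated by a blank line. A minimal sketch (the JSON payload is hypothetical):

    chunk = BaseToResponse.format_stream_chunk('{"content": "Hel"}')
    # chunk == 'data: {"content": "Hel"}\n\n'
    # An SSE client treats everything from 'data:' up to the blank line as one event.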

94  apps/common/handle/handle_exception.py  Normal file
@@ -0,0 +1,94 @@
# coding=utf-8
"""
@project: qabot
@Author:虎
@file: handle_exception.py
@date:2023/9/5 19:29
@desc:
"""
import logging
import traceback

from rest_framework.exceptions import ValidationError, ErrorDetail, APIException
from rest_framework.views import exception_handler

from common.exception.app_exception import AppApiException

from django.utils.translation import gettext_lazy as _

from common.result import result


def to_result(key, args, parent_key=None):
    """
    Convert validation-exception args into a uniform payload.
    :param key: the field key that failed validation
    :param args: the validation-exception arguments
    :param parent_key: the parent field key
    :return: an API response object
    """
    error_detail = list(filter(
        lambda d: True if isinstance(d, ErrorDetail) else True if isinstance(d, dict) and len(
            d.keys()) > 0 else False,
        (args[0] if len(args) > 0 else {key: [ErrorDetail(_('Unknown exception'), code='unknown')]}).get(key)))[0]

    if isinstance(error_detail, dict):
        return list(map(lambda k: to_result(k, args=[error_detail],
                                            parent_key=key if parent_key is None else parent_key + '.' + key),
                        error_detail.keys() if len(error_detail) > 0 else []))[0]

    return result.Result(500 if isinstance(error_detail.code, str) else error_detail.code,
                         message=f"【{key if parent_key is None else parent_key + '.' + key}】为必填参数" if str(
                             error_detail) == "This field is required." else error_detail)


def validation_error_to_result(exc: ValidationError):
    """
    Convert a validation exception into a response object.
    :param exc: the validation exception
    :return: an API response object
    """
    try:
        v = find_err_detail(exc.detail)
        if v is None:
            return result.error(str(exc.detail))
        return result.error(str(v))
    except Exception as e:
        return result.error(str(exc.detail))


def find_err_detail(exc_detail):
    if isinstance(exc_detail, ErrorDetail):
        return exc_detail
    if isinstance(exc_detail, dict):
        keys = exc_detail.keys()
        for key in keys:
            _value = exc_detail[key]
            if isinstance(_value, list):
                return find_err_detail(_value)
            if isinstance(_value, ErrorDetail):
                return _value
            if isinstance(_value, dict) and len(_value.keys()) > 0:
                return find_err_detail(_value)
    if isinstance(exc_detail, list):
        for v in exc_detail:
            r = find_err_detail(v)
            if r is not None:
                return r


def handle_exception(exc, context):
    exception_class = exc.__class__
    # First call REST framework's default handler to get the standard error response.
    response = exception_handler(exc, context)
    # Then layer the custom exception handling on top.
    if issubclass(exception_class, ValidationError):
        return validation_error_to_result(exc)
    if issubclass(exception_class, AppApiException):
        return result.Result(exc.code, exc.message, response_status=exc.status_code)
    if issubclass(exception_class, APIException):
        return result.error(exc.detail)
    if response is None:
        logging.getLogger("max_kb_error").error(f'{str(exc)}:{traceback.format_exc()}')
        return result.error(str(exc))
    return response

8  apps/common/handle/impl/__init__.py  Normal file
@@ -0,0 +1,8 @@
# coding=utf-8
"""
@project: qabot
@Author:虎
@file: __init__.py.py
@date:2023/9/6 10:09
@desc:
"""

116  apps/common/handle/impl/common_handle.py  Normal file
@@ -0,0 +1,116 @@
# coding=utf-8
"""
@project: MaxKB
@Author:虎
@file: tools.py
@date:2024/9/11 16:41
@desc:
"""
import io
import uuid_utils.compat as uuid
from functools import reduce
from io import BytesIO
from xml.etree.ElementTree import fromstring
from zipfile import ZipFile

from PIL import Image as PILImage
from openpyxl.drawing.image import Image as openpyxl_Image
from openpyxl.packaging.relationship import get_rels_path, get_dependents
from openpyxl.xml.constants import SHEET_DRAWING_NS, REL_NS, SHEET_MAIN_NS

from common.handle.base_parse_qa_handle import get_title_row_index_dict, get_row_value
from knowledge.models import File


def parse_element(element) -> dict:
    data = {}
    xdr_namespace = "{%s}" % SHEET_DRAWING_NS
    targets = level_order_traversal(element, xdr_namespace + "nvPicPr")
    for target in targets:
        cNvPr = embed = ""
        for child in target:
            if child.tag == xdr_namespace + "nvPicPr":
                cNvPr = child[0].attrib["name"]
            elif child.tag == xdr_namespace + "blipFill":
                _rel_embed = "{%s}embed" % REL_NS
                embed = child[0].attrib[_rel_embed]
        if cNvPr:
            data[cNvPr] = embed
    return data


def parse_element_sheet_xml(element) -> list:
    data = []
    xdr_namespace = "{%s}" % SHEET_MAIN_NS
    targets = level_order_traversal(element, xdr_namespace + "f")
    for target in targets:
        for child in target:
            if child.tag == xdr_namespace + "f":
                data.append(child.text)
    return data


def level_order_traversal(root, flag: str) -> list:
    queue = [root]
    targets = []
    while queue:
        node = queue.pop(0)
        children = [child.tag for child in node]
        if flag in children:
            targets.append(node)
            continue
        for child in node:
            queue.append(child)
    return targets


def handle_images(deps, archive: ZipFile) -> list:
    images = []
    if not PILImage:  # Pillow not installed, drop images
        return images
    for dep in deps:
        try:
            image_io = archive.read(dep.target)
            image = openpyxl_Image(BytesIO(image_io))
        except Exception as e:
            print(e)
            continue
        image.embed = dep.id  # relationship id (rId) of the image
        image.target = dep.target  # path of the image inside the archive
        images.append(image)
    return images


def xlsx_embed_cells_images(buffer) -> dict:
    archive = ZipFile(buffer)
    # Parse the cellimages.xml part
    deps = get_dependents(archive, get_rels_path("xl/cellimages.xml"))
    image_rel = handle_images(deps=deps, archive=archive)
    # Worksheets and the image IDs they contain
    sheet_list = {}
    for item in archive.namelist():
        if not item.startswith('xl/worksheets/sheet'):
            continue
        key = item.split('/')[-1].split('.')[0].split('sheet')[-1]
        sheet_list[key] = parse_element_sheet_xml(fromstring(archive.read(item)))
    cell_images_xml = parse_element(fromstring(archive.read("xl/cellimages.xml")))
    cell_images_rel = {}
    for image in image_rel:
        cell_images_rel[image.embed] = image
    for cnv, embed in cell_images_xml.items():
        cell_images_xml[cnv] = cell_images_rel.get(embed)
    result = {}
    for key, img in cell_images_xml.items():
        image_excel_id_list = [_xl for _xl in
                               reduce(lambda x, y: [*x, *y], [sheet for sheet_id, sheet in sheet_list.items()], []) if
                               key in _xl]
        if len(image_excel_id_list) > 0:
            image_excel_id = image_excel_id_list[-1]
            f = archive.open(img.target)
            img_byte = io.BytesIO()
            im = PILImage.open(f).convert('RGB')
            im.save(img_byte, format='JPEG')
            image = File(id=uuid.uuid7(), file_name=img.path, meta={'debug': False, 'content': img_byte.getvalue()})
            result['=' + image_excel_id] = image
    archive.close()
    return result
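
A hedged usage sketch of xlsx_embed_cells_images (the path is hypothetical): the function expects a workbook that carries embedded cell images in an xl/cellimages.xml part (the WPS Office convention), and it raises if that part is missing — which is why the caller in XlsxParseQAHandle wraps it in try/except.

    import io

    with open('qa.xlsx', 'rb') as fh:  # hypothetical workbook
        image_dict = xlsx_embed_cells_images(io.BytesIO(fh.read()))
    # Keys are the '=...' cell formulas found in the sheets; values are
    # knowledge.models.File rows whose meta holds the re-encoded JPEG bytes.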

72  apps/common/handle/impl/csv_split_handle.py  Normal file
@@ -0,0 +1,72 @@
# coding=utf-8
"""
@project: maxkb
@Author:虎
@file: csv_parse_qa_handle.py
@date:2024/5/21 14:59
@desc:
"""
import csv
import io
import os
from typing import List

from charset_normalizer import detect

from common.handle.base_split_handle import BaseSplitHandle


def post_cell(cell_value):
    # Escape newlines and pipe characters so a cell cannot break the Markdown table.
    return cell_value.replace('\n', '<br>').replace('|', '&#124;')


def row_to_md(row):
    return '| ' + ' | '.join(
        [post_cell(cell) if cell is not None else '' for cell in row]) + ' |\n'


class CsvSplitHandle(BaseSplitHandle):
    def handle(self, file, pattern_list: List, with_filter: bool, limit: int, get_buffer, save_image):
        buffer = get_buffer(file)
        paragraphs = []
        file_name = os.path.basename(file.name)
        result = {'name': file_name, 'content': paragraphs}
        try:
            reader = csv.reader(io.TextIOWrapper(io.BytesIO(buffer), encoding=detect(buffer)['encoding']))
            try:
                title_row_list = reader.__next__()
                title_md_content = row_to_md(title_row_list)
                title_md_content += '| ' + ' | '.join(
                    ['---' if cell is not None else '' for cell in title_row_list]) + ' |\n'
            except Exception as e:
                return result
            if len(title_row_list) == 0:
                return result
            result_item_content = ''
            for row in reader:
                next_md_content = row_to_md(row)
                next_md_content_len = len(next_md_content)
                result_item_content_len = len(result_item_content)
                if len(result_item_content) == 0:
                    result_item_content += title_md_content
                    result_item_content += next_md_content
                else:
                    if result_item_content_len + next_md_content_len < limit:
                        result_item_content += next_md_content
                    else:
                        paragraphs.append({'content': result_item_content, 'title': ''})
                        result_item_content = title_md_content + next_md_content
            if len(result_item_content) > 0:
                paragraphs.append({'content': result_item_content, 'title': ''})
            return result
        except Exception as e:
            return result

    def get_content(self, file, save_image):
        pass

    def support(self, file, get_buffer):
        file_name: str = file.name.lower()
        if file_name.endswith(".csv"):
            return True
        return False
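
A small illustration of the Markdown conversion above (row values are hypothetical):

    row_to_md(['id', 'question', 'answer'])
    # -> '| id | question | answer |\n'
    # The handler prepends the header row plus a '| --- | --- | --- |' rule to
    # every chunk, so each emitted paragraph is a self-contained Markdown table
    # that stays under `limit` characters.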

235  apps/common/handle/impl/doc_split_handle.py  Normal file
@@ -0,0 +1,235 @@
# coding=utf-8
"""
@project: maxkb
@Author:虎
@file: text_split_handle.py
@date:2024/3/27 18:19
@desc:
"""
import io
import os
import re
import traceback
import uuid_utils.compat as uuid
from functools import reduce
from typing import List

from docx import Document, ImagePart
from docx.oxml import ns
from docx.table import Table
from docx.text.paragraph import Paragraph

from common.handle.base_split_handle import BaseSplitHandle
from common.utils.split_model import SplitModel
from knowledge.models import File
from django.utils.translation import gettext_lazy as _

default_pattern_list = [re.compile('(?<=^)# .*|(?<=\\n)# .*'),
                        re.compile('(?<=\\n)(?<!#)## (?!#).*|(?<=^)(?<!#)## (?!#).*'),
                        re.compile("(?<=\\n)(?<!#)### (?!#).*|(?<=^)(?<!#)### (?!#).*"),
                        re.compile("(?<=\\n)(?<!#)#### (?!#).*|(?<=^)(?<!#)#### (?!#).*"),
                        re.compile("(?<=\\n)(?<!#)##### (?!#).*|(?<=^)(?<!#)##### (?!#).*"),
                        re.compile("(?<=\\n)(?<!#)###### (?!#).*|(?<=^)(?<!#)###### (?!#).*")]

old_docx_nsmap = {'v': 'urn:schemas-microsoft-com:vml'}
combine_nsmap = {**ns.nsmap, **old_docx_nsmap}


def image_to_mode(image, doc: Document, images_list, get_image_id):
    image_ids = image['get_image_id_handle'](image.get('image'))
    for img_id in image_ids:  # iterate over the image ids
        part = doc.part.related_parts[img_id]  # resolve the image part by id
        if isinstance(part, ImagePart):
            image_uuid = get_image_id(img_id)
            if len([i for i in images_list if i.id == image_uuid]) == 0:
                image = File(id=image_uuid, file_name=part.filename, meta={'debug': False, 'content': part.blob})
                images_list.append(image)
            return f'![](/api/image/{image_uuid})'
        return None
    return None


def get_paragraph_element_images(paragraph_element, doc: Document, images_list, get_image_id):
    images_xpath_list = [(".//pic:pic", lambda img: img.xpath('.//a:blip/@r:embed')),
                         (".//w:pict", lambda img: img.xpath('.//v:imagedata/@r:id', namespaces=combine_nsmap))]
    images = []
    for images_xpath, get_image_id_handle in images_xpath_list:
        try:
            _images = paragraph_element.xpath(images_xpath)
            if _images is not None and len(_images) > 0:
                for image in _images:
                    images.append({'image': image, 'get_image_id_handle': get_image_id_handle})
        except Exception as e:
            pass
    return images


def images_to_string(images, doc: Document, images_list, get_image_id):
    return "".join(
        [item for item in [image_to_mode(image, doc, images_list, get_image_id) for image in images] if
         item is not None])


def get_paragraph_element_txt(paragraph_element, doc: Document, images_list, get_image_id):
    try:
        images = get_paragraph_element_images(paragraph_element, doc, images_list, get_image_id)
        if len(images) > 0:
            return images_to_string(images, doc, images_list, get_image_id)
        elif paragraph_element.text is not None:
            return paragraph_element.text
        return ""
    except Exception as e:
        print(e)
        return ""


def get_paragraph_txt(paragraph: Paragraph, doc: Document, images_list, get_image_id):
    try:
        return "".join([get_paragraph_element_txt(e, doc, images_list, get_image_id) for e in paragraph._element])
    except Exception as e:
        return ""


def get_cell_text(cell, doc: Document, images_list, get_image_id):
    try:
        return "".join(
            [get_paragraph_txt(paragraph, doc, images_list, get_image_id) for paragraph in cell.paragraphs]).replace(
            "\n", '</br>')
    except Exception as e:
        return ""


def get_image_id_func():
    image_map = {}

    def get_image_id(image_id):
        _v = image_map.get(image_id)
        if _v is None:
            image_map[image_id] = uuid.uuid7()
            return image_map.get(image_id)
        return _v

    return get_image_id


title_font_list = [
    [36, 100],
    [30, 36]
]


def get_title_level(paragraph: Paragraph):
    try:
        if paragraph.style is not None:
            psn = paragraph.style.name
            if psn.startswith('Heading') or psn.startswith('TOC 标题') or psn.startswith('标题'):
                return int(psn.replace("Heading ", '').replace('TOC 标题', '').replace('标题', ''))
        if len(paragraph.runs) == 1:
            font_size = paragraph.runs[0].font.size
            pt = font_size.pt
            if pt >= 30:
                for _value, index in zip(title_font_list, range(len(title_font_list))):
                    if pt >= _value[0] and pt < _value[1]:
                        return index + 1
    except Exception as e:
        pass
    return None


class DocSplitHandle(BaseSplitHandle):
    @staticmethod
    def paragraph_to_md(paragraph: Paragraph, doc: Document, images_list, get_image_id):
        try:
            title_level = get_title_level(paragraph)
            if title_level is not None:
                title = "".join(["#" for i in range(title_level)]) + " " + paragraph.text
                images = reduce(lambda x, y: [*x, *y],
                                [get_paragraph_element_images(e, doc, images_list, get_image_id) for e in
                                 paragraph._element],
                                [])
                if len(images) > 0:
                    return title + '\n' + images_to_string(images, doc, images_list, get_image_id) if len(
                        paragraph.text) > 0 else images_to_string(images, doc, images_list, get_image_id)
                return title
        except Exception as e:
            traceback.print_exc()
            return paragraph.text
        return get_paragraph_txt(paragraph, doc, images_list, get_image_id)

    @staticmethod
    def table_to_md(table, doc: Document, images_list, get_image_id):
        rows = table.rows

        # Build a Markdown-formatted table
        md_table = '| ' + ' | '.join(
            [get_cell_text(cell, doc, images_list, get_image_id) for cell in rows[0].cells]) + ' |\n'
        md_table += '| ' + ' | '.join(['---' for i in range(len(rows[0].cells))]) + ' |\n'
        for row in rows[1:]:
            md_table += '| ' + ' | '.join(
                [get_cell_text(cell, doc, images_list, get_image_id) for cell in row.cells]) + ' |\n'
        return md_table

    def to_md(self, doc, images_list, get_image_id):
        elements = []
        for element in doc.element.body:
            tag = str(element.tag)
            if tag.endswith('tbl'):
                # Handle a table
                table = Table(element, doc)
                elements.append(table)
            elif tag.endswith('p'):
                # Handle a paragraph
                paragraph = Paragraph(element, doc)
                elements.append(paragraph)
        return "\n".join(
            [self.paragraph_to_md(element, doc, images_list, get_image_id) if isinstance(element,
                                                                                         Paragraph) else self.table_to_md(
                element,
                doc,
                images_list, get_image_id)
             for element
             in elements])

    def handle(self, file, pattern_list: List, with_filter: bool, limit: int, get_buffer, save_image):
        file_name = os.path.basename(file.name)
        try:
            image_list = []
            buffer = get_buffer(file)
            doc = Document(io.BytesIO(buffer))
            content = self.to_md(doc, image_list, get_image_id_func())
            if len(image_list) > 0:
                save_image(image_list)
            if pattern_list is not None and len(pattern_list) > 0:
                split_model = SplitModel(pattern_list, with_filter, limit)
            else:
                split_model = SplitModel(default_pattern_list, with_filter=with_filter, limit=limit)
        except BaseException as e:
            traceback.print_exception(e)
            return {'name': file_name,
                    'content': []}
        return {'name': file_name,
                'content': split_model.parse(content)
                }

    def support(self, file, get_buffer):
        file_name: str = file.name.lower()
        if file_name.endswith(".docx") or file_name.endswith(".doc") or file_name.endswith(
                ".DOC") or file_name.endswith(".DOCX"):
            return True
        return False

    def get_content(self, file, save_image):
        try:
            image_list = []
            buffer = file.read()
            doc = Document(io.BytesIO(buffer))
            content = self.to_md(doc, image_list, get_image_id_func())
            if len(image_list) > 0:
                content = content.replace('/api/image/', '/api/file/')
                save_image(image_list)
            return content
        except BaseException as e:
            traceback.print_exception(e)
            return f'{e}'
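
default_pattern_list splits the generated Markdown on ATX headings, one pattern per level. A sketch of what the level-2 pattern matches (sample text is hypothetical):

    md = "# Intro\nbody\n## Setup\nsteps\n### Detail\n"
    default_pattern_list[1].findall(md)
    # -> ['## Setup']   (matches '## ' headings, but neither '# ' nor '### ')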

73  apps/common/handle/impl/html_split_handle.py  Normal file
@@ -0,0 +1,73 @@
# coding=utf-8
"""
@project: maxkb
@Author:虎
@file: html_split_handle.py
@date:2024/5/23 10:58
@desc:
"""
import re
import traceback
from typing import List

from bs4 import BeautifulSoup
from charset_normalizer import detect
from html2text import html2text

from common.handle.base_split_handle import BaseSplitHandle
from common.utils.split_model import SplitModel

default_pattern_list = [re.compile('(?<=^)# .*|(?<=\\n)# .*'),
                        re.compile('(?<=\\n)(?<!#)## (?!#).*|(?<=^)(?<!#)## (?!#).*'),
                        re.compile("(?<=\\n)(?<!#)### (?!#).*|(?<=^)(?<!#)### (?!#).*"),
                        re.compile("(?<=\\n)(?<!#)#### (?!#).*|(?<=^)(?<!#)#### (?!#).*"),
                        re.compile("(?<=\\n)(?<!#)##### (?!#).*|(?<=^)(?<!#)##### (?!#).*"),
                        re.compile("(?<=\\n)(?<!#)###### (?!#).*|(?<=^)(?<!#)###### (?!#).*")]


def get_encoding(buffer):
    beautiful_soup = BeautifulSoup(buffer, "html.parser")
    meta_list = beautiful_soup.find_all('meta')
    charset_list = [meta.attrs.get('charset') for meta in meta_list if
                    meta.attrs is not None and 'charset' in meta.attrs]
    if len(charset_list) > 0:
        charset = charset_list[0]
        return charset
    return detect(buffer)['encoding']


class HTMLSplitHandle(BaseSplitHandle):
    def support(self, file, get_buffer):
        file_name: str = file.name.lower()
        if file_name.endswith(".html") or file_name.endswith(".HTML"):
            return True
        return False

    def handle(self, file, pattern_list: List, with_filter: bool, limit: int, get_buffer, save_image):
        buffer = get_buffer(file)

        if pattern_list is not None and len(pattern_list) > 0:
            split_model = SplitModel(pattern_list, with_filter, limit)
        else:
            split_model = SplitModel(default_pattern_list, with_filter=with_filter, limit=limit)
        try:
            encoding = get_encoding(buffer)
            content = buffer.decode(encoding)
            content = html2text(content)
        except BaseException as e:
            return {'name': file.name,
                    'content': []}
        return {'name': file.name,
                'content': split_model.parse(content)
                }

    def get_content(self, file, save_image):
        buffer = file.read()

        try:
            encoding = get_encoding(buffer)
            content = buffer.decode(encoding)
            return html2text(content)
        except BaseException as e:
            traceback.print_exception(e)
            return f'{e}'

339  apps/common/handle/impl/pdf_split_handle.py  Normal file
@@ -0,0 +1,339 @@
# coding=utf-8
"""
@project: maxkb
@Author:虎
@file: text_split_handle.py
@date:2024/3/27 18:19
@desc:
"""
import logging
import os
import re
import tempfile
import time
import traceback
from typing import List

import fitz
from langchain_community.document_loaders import PyPDFLoader

from common.handle.base_split_handle import BaseSplitHandle
from common.utils.split_model import SplitModel
from django.utils.translation import gettext_lazy as _

default_pattern_list = [re.compile('(?<=^)# .*|(?<=\\n)# .*'),
                        re.compile('(?<=\\n)(?<!#)## (?!#).*|(?<=^)(?<!#)## (?!#).*'),
                        re.compile("(?<=\\n)(?<!#)### (?!#).*|(?<=^)(?<!#)### (?!#).*"),
                        re.compile("(?<=\\n)(?<!#)#### (?!#).*|(?<=^)(?<!#)#### (?!#).*"),
                        re.compile("(?<=\\n)(?<!#)##### (?!#).*|(?<=^)(?<!#)##### (?!#).*"),
                        re.compile("(?<=\\n)(?<!#)###### (?!#).*|(?<=^)(?<!#)###### (?!#).*"),
                        re.compile("(?<!\n)\n\n+")]

max_kb = logging.getLogger("max_kb")


def check_links_in_pdf(doc):
    for page_number in range(len(doc)):
        page = doc[page_number]
        links = page.get_links()
        if links:
            for link in links:
                if link['kind'] == 1:
                    return True
    return False


class PdfSplitHandle(BaseSplitHandle):
    def handle(self, file, pattern_list: List, with_filter: bool, limit: int, get_buffer, save_image):
        with tempfile.NamedTemporaryFile(delete=False) as temp_file:
            # Save the uploaded file into a temporary file
            for chunk in file.chunks():
                temp_file.write(chunk)
            # Path of the temporary file
            temp_file_path = temp_file.name

        pdf_document = fitz.open(temp_file_path)
        try:
            # PDFs that have a table of contents
            result = self.handle_toc(pdf_document, limit)
            if result is not None:
                return {'name': file.name, 'content': result}

            # PDFs without a TOC but with internal links
            result = self.handle_links(pdf_document, pattern_list, with_filter, limit)
            if result is not None and len(result) > 0:
                return {'name': file.name, 'content': result}

            # PDFs without a TOC
            content = self.handle_pdf_content(file, pdf_document)

            if pattern_list is not None and len(pattern_list) > 0:
                split_model = SplitModel(pattern_list, with_filter, limit)
            else:
                split_model = SplitModel(default_pattern_list, with_filter=with_filter, limit=limit)
        except BaseException as e:
            max_kb.error(f"File: {file.name}, error: {e}")
            return {'name': file.name,
                    'content': []}
        finally:
            pdf_document.close()
            # The temporary file can be removed once processing is done
            os.remove(temp_file_path)

        return {'name': file.name,
                'content': split_model.parse(content)
                }

    @staticmethod
    def handle_pdf_content(file, pdf_document):
        content = ""
        for page_num in range(len(pdf_document)):
            start_time = time.time()
            page = pdf_document.load_page(page_num)
            text = page.get_text()

            if text and text.strip():  # the page has text content
                page_content = text
            else:
                try:
                    new_doc = fitz.open()
                    new_doc.insert_pdf(pdf_document, from_page=page_num, to_page=page_num)
                    page_num_pdf = tempfile.gettempdir() + f"/{file.name}_{page_num}.pdf"
                    new_doc.save(page_num_pdf)
                    new_doc.close()

                    loader = PyPDFLoader(page_num_pdf, extract_images=True)
                    page_content = "\n" + loader.load()[0].page_content
                except NotImplementedError as e:
                    # unsupported file format, bail out immediately
                    raise e
                except BaseException as e:
                    # on a per-page error, continue with the next page so one bad
                    # page does not fail the whole file
                    max_kb.error(f"File: {file.name}, Page: {page_num + 1}, error: {e}")
                    continue
                finally:
                    os.remove(page_num_pdf)

            content += page_content

            # Null characters are not allowed.
            content = content.replace('\0', '')

            elapsed_time = time.time() - start_time
            max_kb.debug(
                f"File: {file.name}, Page: {page_num + 1}, Time : {elapsed_time: .3f}s, content-length: {len(page_content)}")

        return content

    @staticmethod
    def handle_toc(doc, limit):
        # Locate the table of contents
        toc = doc.get_toc()
        if toc is None or len(toc) == 0:
            return None

        # Array that collects the chapter contents
        chapters = []

        # Walk the TOC and extract the text chapter by chapter
        for i, entry in enumerate(toc):
            level, title, start_page = entry
            start_page -= 1  # PyMuPDF pages start at 0, bookmark pages at 1
            chapter_title = title
            # Determine the end page; the last chapter runs to the end of the document
            if i + 1 < len(toc):
                end_page = toc[i + 1][2] - 1
            else:
                end_page = doc.page_count - 1

            # Strip numbering symbols from the title
            title = PdfSplitHandle.handle_chapter_title(title)

            # Extract the chapter's text content
            chapter_text = ""
            for page_num in range(start_page, end_page + 1):
                page = doc.load_page(page_num)  # load the page
                text = page.get_text("text")
                text = re.sub(r'(?<!。)\n+', '', text)
                text = re.sub(r'(?<!.)\n+', '', text)
                # print(f'title: {title}')

                idx = text.find(title)
                if idx > -1:
                    text = text[idx + len(title):]

                if i + 1 < len(toc):
                    l, next_title, next_start_page = toc[i + 1]
                    next_title = PdfSplitHandle.handle_chapter_title(next_title)
                    # print(f'next_title: {next_title}')
                    idx = text.find(next_title)
                    if idx > -1:
                        text = text[:idx]

                chapter_text += text  # accumulate the text

            # Null characters are not allowed.
            chapter_text = chapter_text.replace('\0', '')
            # Cap the title length
            real_chapter_title = chapter_title[:256]
            # Cap the chapter content length
            if 0 < limit < len(chapter_text):
                split_text = PdfSplitHandle.split_text(chapter_text, limit)
                for text in split_text:
                    chapters.append({"title": real_chapter_title, "content": text})
            else:
                chapters.append({"title": real_chapter_title, "content": chapter_text if chapter_text else real_chapter_title})
        # Return the collected chapter titles and contents
        return chapters

    @staticmethod
    def handle_links(doc, pattern_list, with_filter, limit):
        # Check whether the document contains internal links
        if not check_links_in_pdf(doc):
            return
        # Array that collects the chapter contents
        chapters = []
        toc_start_page = -1
        page_content = ""
        handle_pre_toc = True
        # Walk every page of the PDF, looking for pages that carry TOC links
        for page_num in range(doc.page_count):
            page = doc.load_page(page_num)
            links = page.get_links()
            # If the TOC start page is not set yet, set it to the current page
            if len(links) > 0:
                toc_start_page = page_num
            if toc_start_page < 0:
                page_content += page.get_text('text')
            # Check whether the page contains internal links (pointing inside the document)
            for num in range(len(links)):
                link = links[num]
                if link['kind'] == 1:  # 'kind' == 1 means an internal link
                    # Page the link points to
                    dest_page = link['page']
                    rect = link['from']  # rectangle area of the link
                    # If the TOC start page already covers the front matter, do not handle it separately
                    if dest_page < toc_start_page:
                        handle_pre_toc = False

                    # Text of the link area, used as the chapter title
                    link_title = page.get_text("text", clip=rect).strip().split("\n")[0].replace('.', '').strip()
                    # print(f'link_title: {link_title}')
                    # Content of the target page marks the chapter start
                    start_page = dest_page
                    end_page = dest_page
                    # The next link
                    next_link = links[num + 1] if num + 1 < len(links) else None
                    next_link_title = None
                    if next_link is not None and next_link['kind'] == 1:
                        rect = next_link['from']
                        next_link_title = page.get_text("text", clip=rect).strip() \
                            .split("\n")[0].replace('.', '').strip()
                        # print(f'next_link_title: {next_link_title}')
                        end_page = next_link['page']

                    # Extract the chapter content
                    chapter_text = ""
                    for p_num in range(start_page, end_page + 1):
                        p = doc.load_page(p_num)
                        text = p.get_text("text")
                        text = re.sub(r'(?<!。)\n+', '', text)
                        text = re.sub(r'(?<!.)\n+', '', text)
                        # print(f'\n{text}\n')

                        idx = text.find(link_title)
                        if idx > -1:
                            text = text[idx + len(link_title):]

                        if next_link_title is not None:
                            idx = text.find(next_link_title)
                            if idx > -1:
                                text = text[:idx]
                        chapter_text += text

                    # Null characters are not allowed.
                    chapter_text = chapter_text.replace('\0', '')

                    # Cap the chapter content length
                    if 0 < limit < len(chapter_text):
                        split_text = PdfSplitHandle.split_text(chapter_text, limit)
                        for text in split_text:
                            chapters.append({"title": link_title, "content": text})
                    else:
                        # Save the chapter entry
                        chapters.append({"title": link_title, "content": chapter_text})

        # The TOC lacks the front matter; handle it manually
        if handle_pre_toc:
            pre_toc = []
            lines = page_content.strip().split('\n')
            try:
                for line in lines:
                    if re.match(r'^前\s*言', line):
                        pre_toc.append({'title': line, 'content': ''})
                    else:
                        pre_toc[-1]['content'] += line
                for i in range(len(pre_toc)):
                    pre_toc[i]['content'] = re.sub(r'(?<!。)\n+', '', pre_toc[i]['content'])
                    pre_toc[i]['content'] = re.sub(r'(?<!.)\n+', '', pre_toc[i]['content'])
            except BaseException as e:
                max_kb.error(_('This document has no preface and is treated as ordinary text: {e}').format(e=e))
                if pattern_list is not None and len(pattern_list) > 0:
                    split_model = SplitModel(pattern_list, with_filter, limit)
                else:
                    split_model = SplitModel(default_pattern_list, with_filter=with_filter, limit=limit)
                # Insert the part that precedes the TOC
                page_content = re.sub(r'(?<!。)\n+', '', page_content)
                page_content = re.sub(r'(?<!.)\n+', '', page_content)
                page_content = page_content.strip()
                pre_toc = split_model.parse(page_content)
            chapters = pre_toc + chapters
        return chapters

    @staticmethod
    def split_text(text, length):
        segments = []
        current_segment = ""

        for char in text:
            current_segment += char
            if len(current_segment) >= length:
                # Look for the nearest period
                last_period_index = current_segment.rfind('.')
                if last_period_index != -1:
                    segments.append(current_segment[:last_period_index + 1])
                    current_segment = current_segment[last_period_index + 1:]  # carry the remainder over
                else:
                    segments.append(current_segment)
                    current_segment = ""

        # Handle whatever is left
        if current_segment:
            segments.append(current_segment)

        return segments

    @staticmethod
    def handle_chapter_title(title):
        title = re.sub(r'[一二三四五六七八九十\s*]、\s*', '', title)
        title = re.sub(r'第[一二三四五六七八九十]章\s*', '', title)
        return title

    def support(self, file, get_buffer):
        file_name: str = file.name.lower()
        if file_name.endswith(".pdf") or file_name.endswith(".PDF"):
            return True
        return False

    def get_content(self, file, save_image):
        with tempfile.NamedTemporaryFile(delete=False) as temp_file:
            # Save the uploaded file into a temporary file
            temp_file.write(file.read())
            # Path of the temporary file
            temp_file_path = temp_file.name

        pdf_document = fitz.open(temp_file_path)
        try:
            return self.handle_pdf_content(file, pdf_document)
        except BaseException as e:
            traceback.print_exception(e)
            return f'{e}'
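
split_text above chunks a chapter into pieces of roughly `length` characters, preferring to break just after the most recent '.' so sentences stay whole. A tiny illustration (input is hypothetical):

    PdfSplitHandle.split_text('One. Two. Three.', 10)
    # The buffer reaches 10 characters at 'One. Two. ' and breaks after the
    # last period: -> ['One. Two.', ' Three.']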

8  apps/common/handle/impl/qa/__init__.py  Normal file
@@ -0,0 +1,8 @@
# coding=utf-8
"""
@project: qabot
@Author:虎
@file: __init__.py.py
@date:2023/9/6 10:09
@desc:
"""

59  apps/common/handle/impl/qa/csv_parse_qa_handle.py  Normal file
@@ -0,0 +1,59 @@
# coding=utf-8
"""
@project: maxkb
@Author:虎
@file: csv_parse_qa_handle.py
@date:2024/5/21 14:59
@desc:
"""
import csv
import io

from charset_normalizer import detect

from common.handle.base_parse_qa_handle import BaseParseQAHandle, get_title_row_index_dict, get_row_value


def read_csv_standard(file_path):
    data = []
    with open(file_path, 'r') as file:
        reader = csv.reader(file)
        for row in reader:
            data.append(row)
    return data


class CsvParseQAHandle(BaseParseQAHandle):
    def support(self, file, get_buffer):
        file_name: str = file.name.lower()
        if file_name.endswith(".csv"):
            return True
        return False

    def handle(self, file, get_buffer, save_image):
        buffer = get_buffer(file)
        try:
            reader = csv.reader(io.TextIOWrapper(io.BytesIO(buffer), encoding=detect(buffer)['encoding']))
            try:
                title_row_list = reader.__next__()
            except Exception as e:
                return [{'name': file.name, 'paragraphs': []}]
            if len(title_row_list) == 0:
                return [{'name': file.name, 'paragraphs': []}]
            title_row_index_dict = get_title_row_index_dict(title_row_list)
            paragraph_list = []
            for row in reader:
                content = get_row_value(row, title_row_index_dict, 'content')
                if content is None:
                    continue
                problem = get_row_value(row, title_row_index_dict, 'problem_list')
                problem = str(problem) if problem is not None else ''
                problem_list = [{'content': p[0:255]} for p in problem.split('\n') if len(p.strip()) > 0]
                title = get_row_value(row, title_row_index_dict, 'title')
                title = str(title) if title is not None else ''
                paragraph_list.append({'title': title[0:255],
                                       'content': content[0:102400],
                                       'problem_list': problem_list})
            return [{'name': file.name, 'paragraphs': paragraph_list}]
        except Exception as e:
            return [{'name': file.name, 'paragraphs': []}]

61  apps/common/handle/impl/qa/xls_parse_qa_handle.py  Normal file
@@ -0,0 +1,61 @@
# coding=utf-8
"""
@project: maxkb
@Author:虎
@file: xls_parse_qa_handle.py
@date:2024/5/21 14:59
@desc:
"""

import xlrd

from common.handle.base_parse_qa_handle import BaseParseQAHandle, get_title_row_index_dict, get_row_value


def handle_sheet(file_name, sheet):
    rows = iter([sheet.row_values(i) for i in range(sheet.nrows)])
    try:
        title_row_list = next(rows)
    except Exception as e:
        return {'name': file_name, 'paragraphs': []}
    if len(title_row_list) == 0:
        return {'name': file_name, 'paragraphs': []}
    title_row_index_dict = get_title_row_index_dict(title_row_list)
    paragraph_list = []
    for row in rows:
        content = get_row_value(row, title_row_index_dict, 'content')
        if content is None:
            continue
        problem = get_row_value(row, title_row_index_dict, 'problem_list')
        problem = str(problem) if problem is not None else ''
        problem_list = [{'content': p[0:255]} for p in problem.split('\n') if len(p.strip()) > 0]
        title = get_row_value(row, title_row_index_dict, 'title')
        title = str(title) if title is not None else ''
        content = str(content)
        paragraph_list.append({'title': title[0:255],
                               'content': content[0:102400],
                               'problem_list': problem_list})
    return {'name': file_name, 'paragraphs': paragraph_list}


class XlsParseQAHandle(BaseParseQAHandle):
    def support(self, file, get_buffer):
        file_name: str = file.name.lower()
        buffer = get_buffer(file)
        if file_name.endswith(".xls") and xlrd.inspect_format(content=buffer):
            return True
        return False

    def handle(self, file, get_buffer, save_image):
        buffer = get_buffer(file)
        try:
            workbook = xlrd.open_workbook(file_contents=buffer)
            worksheets = workbook.sheets()
            worksheets_size = len(worksheets)
            return [row for row in
                    [handle_sheet(file.name,
                                  sheet) if worksheets_size == 1 and sheet.name == 'Sheet1' else handle_sheet(
                        sheet.name, sheet) for sheet
                     in worksheets] if row is not None]
        except Exception as e:
            return [{'name': file.name, 'paragraphs': []}]

72  apps/common/handle/impl/qa/xlsx_parse_qa_handle.py  Normal file
@@ -0,0 +1,72 @@
# coding=utf-8
"""
@project: maxkb
@Author:虎
@file: xlsx_parse_qa_handle.py
@date:2024/5/21 14:59
@desc:
"""
import io

import openpyxl

from common.handle.base_parse_qa_handle import BaseParseQAHandle, get_title_row_index_dict, get_row_value
from common.handle.impl.common_handle import xlsx_embed_cells_images


def handle_sheet(file_name, sheet, image_dict):
    rows = sheet.rows
    try:
        title_row_list = next(rows)
        title_row_list = [row.value for row in title_row_list]
    except Exception as e:
        return {'name': file_name, 'paragraphs': []}
    if len(title_row_list) == 0:
        return {'name': file_name, 'paragraphs': []}
    title_row_index_dict = get_title_row_index_dict(title_row_list)
    paragraph_list = []
    for row in rows:
        content = get_row_value(row, title_row_index_dict, 'content')
        if content is None or content.value is None:
            continue
        problem = get_row_value(row, title_row_index_dict, 'problem_list')
        problem = str(problem.value) if problem is not None and problem.value is not None else ''
        problem_list = [{'content': p[0:255]} for p in problem.split('\n') if len(p.strip()) > 0]
        title = get_row_value(row, title_row_index_dict, 'title')
        title = str(title.value) if title is not None and title.value is not None else ''
        content = str(content.value)
        image = image_dict.get(content, None)
        if image is not None:
            content = f'![](/api/image/{image.id})'
        paragraph_list.append({'title': title[0:255],
                               'content': content[0:102400],
                               'problem_list': problem_list})
    return {'name': file_name, 'paragraphs': paragraph_list}


class XlsxParseQAHandle(BaseParseQAHandle):
    def support(self, file, get_buffer):
        file_name: str = file.name.lower()
        if file_name.endswith(".xlsx"):
            return True
        return False

    def handle(self, file, get_buffer, save_image):
        buffer = get_buffer(file)
        try:
            workbook = openpyxl.load_workbook(io.BytesIO(buffer))
            try:
                image_dict: dict = xlsx_embed_cells_images(io.BytesIO(buffer))
                save_image([item for item in image_dict.values()])
            except Exception as e:
                image_dict = {}
            worksheets = workbook.worksheets
            worksheets_size = len(worksheets)
            return [row for row in
                    [handle_sheet(file.name,
                                  sheet,
                                  image_dict) if worksheets_size == 1 and sheet.title == 'Sheet1' else handle_sheet(
                        sheet.title, sheet, image_dict) for sheet
                     in worksheets] if row is not None]
        except Exception as e:
            return [{'name': file.name, 'paragraphs': []}]
161
apps/common/handle/impl/qa/zip_parse_qa_handle.py
Normal file
161
apps/common/handle/impl/qa/zip_parse_qa_handle.py
Normal file
@ -0,0 +1,161 @@
|
|||||||
|
# coding=utf-8
|
||||||
|
"""
|
||||||
|
@project: maxkb
|
||||||
|
@Author:虎
|
||||||
|
@file: text_split_handle.py
|
||||||
|
@date:2024/3/27 18:19
|
||||||
|
@desc:
|
||||||
|
"""
|
||||||
|
import io
|
||||||
|
import os
|
||||||
|
import re
|
||||||
|
import uuid_utils.compat as uuid
|
||||||
|
import zipfile
|
||||||
|
from typing import List
|
||||||
|
from urllib.parse import urljoin
|
||||||
|
|
||||||
|
from django.db.models import QuerySet
|
||||||
|
|
||||||
|
from common.handle.base_parse_qa_handle import BaseParseQAHandle
|
||||||
|
from common.handle.impl.qa.csv_parse_qa_handle import CsvParseQAHandle
from common.handle.impl.qa.xls_parse_qa_handle import XlsParseQAHandle
from common.handle.impl.qa.xlsx_parse_qa_handle import XlsxParseQAHandle
from common.utils.common import parse_md_image
from knowledge.models import File
from django.utils.translation import gettext_lazy as _


class FileBufferHandle:
    buffer = None

    def get_buffer(self, file):
        if self.buffer is None:
            self.buffer = file.read()
        return self.buffer


split_handles = [
    XlsParseQAHandle(),
    XlsxParseQAHandle(),
    CsvParseQAHandle()
]


def file_to_paragraph(file, save_inner_image):
    """
    Convert a file into a list of paragraphs.
    @param file: the file to parse
    @return: {
        name: file name
        paragraphs: list of paragraphs
    }
    """
    get_buffer = FileBufferHandle().get_buffer
    for split_handle in split_handles:
        if split_handle.support(file, get_buffer):
            return split_handle.handle(file, get_buffer, save_inner_image)
    raise Exception(_("Unsupported file format"))


def is_valid_uuid(uuid_str: str):
    """
    Check whether a string is a valid UUID.
    @param uuid_str: the string to check
    @return: bool
    """
    try:
        uuid.UUID(uuid_str)
    except ValueError:
        return False
    return True


def get_image_list(result_list: list, zip_files: List[str]):
    """
    Collect the image files referenced by the parsed paragraphs.
    @param result_list: parse results
    @param zip_files: file names inside the zip archive
    @return: list of image descriptors
    """
    image_file_list = []
    for result in result_list:
        for p in result.get('paragraphs', []):
            content: str = p.get('content', '')
            image_list = parse_md_image(content)
            for image in image_list:
                search = re.search(r"\(.*\)", image)
                if search:
                    new_image_id = str(uuid.uuid7())
                    source_image_path = search.group().replace('(', '').replace(')', '')
                    image_path = urljoin(result.get('name'), '.' + source_image_path if source_image_path.startswith(
                        '/') else source_image_path)
                    if image_path not in zip_files:
                        continue
                    if image_path.startswith('api/file/') or image_path.startswith('api/image/'):
                        image_id = image_path.replace('api/file/', '').replace('api/image/', '')
                        if is_valid_uuid(image_id):
                            image_file_list.append({'source_file': image_path,
                                                    'image_id': image_id})
                        else:
                            image_file_list.append({'source_file': image_path,
                                                    'image_id': new_image_id})
                            content = content.replace(source_image_path, f'/api/image/{new_image_id}')
                            p['content'] = content
                    else:
                        image_file_list.append({'source_file': image_path,
                                                'image_id': new_image_id})
                        content = content.replace(source_image_path, f'/api/image/{new_image_id}')
                        p['content'] = content

    return image_file_list


def filter_image_file(result_list: list, image_list):
    image_source_file_list = [image.get('source_file') for image in image_list]
    return [r for r in result_list if r.get('name', '') not in image_source_file_list]


class ZipParseQAHandle(BaseParseQAHandle):

    def handle(self, file, get_buffer, save_image):
        buffer = get_buffer(file)
        bytes_io = io.BytesIO(buffer)
        result = []
        # Open the zip archive
        with zipfile.ZipFile(bytes_io, 'r') as zip_ref:
            # List the file names inside the archive
            files = zip_ref.namelist()
            # Read each file in the archive
            for file in files:
                # Skip directories and macOS-specific metadata entries
                if file.endswith('/') or file.startswith('__MACOSX'):
                    continue
                with zip_ref.open(file) as f:
                    # Parse the file content; files no handler supports are skipped
                    try:
                        value = file_to_paragraph(f, save_image)
                        if isinstance(value, list):
                            result = [*result, *value]
                        else:
                            result.append(value)
                    except Exception:
                        pass
            image_list = get_image_list(result, files)
            result = filter_image_file(result, image_list)
            image_mode_list = []
            for image in image_list:
                with zip_ref.open(image.get('source_file')) as f:
                    i = File(
                        id=image.get('image_id'),
                        file_name=os.path.basename(image.get('source_file')),
                        meta={'debug': False, 'content': f.read()}
                    )
                    image_mode_list.append(i)
            save_image(image_mode_list)
        return result

    def support(self, file, get_buffer):
        file_name: str = file.name.lower()
        # file_name is already lower-cased, so a single suffix check suffices
        return file_name.endswith(".zip")
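Aside: FileBufferHandle memoizes a single file.read() so that every handler's support() probe shares one buffer instead of re-reading the upload. A minimal sketch of that behavior, assuming a hypothetical InMemoryFile stand-in for Django's uploaded-file object (the class and sample bytes below are illustrative, not part of the commit):

import io

class InMemoryFile:
    """Hypothetical stand-in for an uploaded file object."""
    def __init__(self, name, data):
        self.name = name
        self._io = io.BytesIO(data)

    def read(self):
        return self._io.read()

f = InMemoryFile('qa.zip', b'PK\x03\x04')
cache = FileBufferHandle()
first = cache.get_buffer(f)   # performs the one and only file.read()
second = cache.get_buffer(f)  # served from the cached buffer
assert first is second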
8
apps/common/handle/impl/response/__init__.py
Normal file
@ -0,0 +1,8 @@
# coding=utf-8
"""
@project: qabot
@Author:虎
@file: __init__.py.py
@date:2023/9/6 10:09
@desc:
"""
52
apps/common/handle/impl/response/openai_to_response.py
Normal file
@ -0,0 +1,52 @@
# coding=utf-8
"""
@project: MaxKB
@Author:虎
@file: openai_to_response.py
@date:2024/9/6 16:08
@desc:
"""
import datetime

from django.http import JsonResponse
from openai.types import CompletionUsage
from openai.types.chat import ChatCompletionChunk, ChatCompletionMessage, ChatCompletion
from openai.types.chat.chat_completion import Choice as BlockChoice
from openai.types.chat.chat_completion_chunk import Choice, ChoiceDelta
from rest_framework import status

from common.handle.base_to_response import BaseToResponse


class OpenaiToResponse(BaseToResponse):
    def to_block_response(self, chat_id, chat_record_id, content, is_end, completion_tokens, prompt_tokens,
                          other_params: dict = None,
                          _status=status.HTTP_200_OK):
        if other_params is None:
            other_params = {}
        data = ChatCompletion(id=chat_record_id, choices=[
            BlockChoice(finish_reason='stop', index=0, chat_id=chat_id,
                        answer_list=other_params.get('answer_list', ""),
                        message=ChatCompletionMessage(role='assistant', content=content))],
                              # `created` is a Unix timestamp per the OpenAI schema,
                              # not the seconds field of the current time
                              created=int(datetime.datetime.now().timestamp()),
                              model='', object='chat.completion',
                              usage=CompletionUsage(completion_tokens=completion_tokens,
                                                    prompt_tokens=prompt_tokens,
                                                    total_tokens=completion_tokens + prompt_tokens)
                              ).dict()
        return JsonResponse(data=data, status=_status)

    def to_stream_chunk_response(self, chat_id, chat_record_id, node_id, up_node_id_list, content, is_end,
                                 completion_tokens,
                                 prompt_tokens, other_params: dict = None):
        if other_params is None:
            other_params = {}
        chunk = ChatCompletionChunk(id=chat_record_id, model='', object='chat.completion.chunk',
                                    created=int(datetime.datetime.now().timestamp()), choices=[
                Choice(delta=ChoiceDelta(content=content, reasoning_content=other_params.get('reasoning_content', ""),
                                         chat_id=chat_id),
                       finish_reason='stop' if is_end else None,
                       index=0)],
                                    usage=CompletionUsage(completion_tokens=completion_tokens,
                                                          prompt_tokens=prompt_tokens,
                                                          total_tokens=completion_tokens + prompt_tokens)).json()
        return super().format_stream_chunk(chunk)
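A hedged usage sketch of the streaming path, assuming the surrounding MaxKB plumbing is in place. format_stream_chunk comes from BaseToResponse, whose exact framing (for example an SSE "data: " prefix) is not shown in this diff:

to_response = OpenaiToResponse()
# One incremental token of an answer, expressed as an OpenAI-style chunk.
frame = to_response.to_stream_chunk_response(
    chat_id='chat-1', chat_record_id='rec-1', node_id=None, up_node_id_list=[],
    content='Hello', is_end=False, completion_tokens=1, prompt_tokens=5)
# `frame` wraps ChatCompletionChunk(...).json(); the base class is expected to
# add whatever stream framing the transport requires.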
41
apps/common/handle/impl/response/system_to_response.py
Normal file
@ -0,0 +1,41 @@
# coding=utf-8
"""
@project: MaxKB
@Author:虎
@file: system_to_response.py
@date:2024/9/6 18:03
@desc:
"""
import json

from rest_framework import status

from common.handle.base_to_response import BaseToResponse
from common.result import result


class SystemToResponse(BaseToResponse):
    def to_block_response(self, chat_id, chat_record_id, content, is_end, completion_tokens,
                          prompt_tokens, other_params: dict = None,
                          _status=status.HTTP_200_OK):
        if other_params is None:
            other_params = {}
        return result.success({'chat_id': str(chat_id), 'id': str(chat_record_id), 'operate': True,
                               'content': content, 'is_end': is_end, **other_params,
                               'completion_tokens': completion_tokens, 'prompt_tokens': prompt_tokens},
                              response_status=_status,
                              code=_status)

    def to_stream_chunk_response(self, chat_id, chat_record_id, node_id, up_node_id_list, content, is_end,
                                 completion_tokens,
                                 prompt_tokens, other_params: dict = None):
        if other_params is None:
            other_params = {}
        chunk = json.dumps({'chat_id': str(chat_id), 'chat_record_id': str(chat_record_id), 'operate': True,
                            'content': content, 'node_id': node_id, 'up_node_id_list': up_node_id_list,
                            'is_end': is_end,
                            'usage': {'completion_tokens': completion_tokens,
                                      'prompt_tokens': prompt_tokens,
                                      'total_tokens': completion_tokens + prompt_tokens},
                            **other_params})
        return super().format_stream_chunk(chunk)
8
apps/common/handle/impl/table/__init__.py
Normal file
@ -0,0 +1,8 @@
# coding=utf-8
"""
@project: qabot
@Author:虎
@file: __init__.py.py
@date:2023/9/6 10:09
@desc:
"""
44
apps/common/handle/impl/table/csv_parse_table_handle.py
Normal file
@ -0,0 +1,44 @@
# coding=utf-8
import logging

from charset_normalizer import detect

from common.handle.base_parse_table_handle import BaseParseTableHandle

max_kb = logging.getLogger("max_kb")


class CsvSplitHandle(BaseParseTableHandle):
    def support(self, file, get_buffer):
        file_name: str = file.name.lower()
        return file_name.endswith(".csv")

    def handle(self, file, get_buffer, save_image):
        buffer = get_buffer(file)
        try:
            content = buffer.decode(detect(buffer)['encoding'])
        except BaseException as e:
            max_kb.error(f'csv split handle error: {e}')
            return [{'name': file.name, 'paragraphs': []}]

        csv_model = content.split('\n')
        paragraphs = []
        # The first row is the header
        title = csv_model[0].split(',')
        for row in csv_model[1:]:
            if not row:
                continue
            line = '; '.join([f'{key}:{value}' for key, value in zip(title, row.split(','))])
            paragraphs.append({'title': '', 'content': line})

        return [{'name': file.name, 'paragraphs': paragraphs}]

    def get_content(self, file, save_image):
        buffer = file.read()
        try:
            return buffer.decode(detect(buffer)['encoding'])
        except BaseException as e:
            max_kb.error(f'csv split handle error: {e}')
            return f'error: {e}'
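To make the row flattening above concrete, a small illustrative run (file name and data are made up):

# Given a CSV body of:
#   name,age
#   Alice,30
# handle() yields one paragraph per data row, keyed by the header cells:
#   [{'name': 'people.csv',
#     'paragraphs': [{'title': '', 'content': 'name:Alice; age:30'}]}]

Note that the naive split(',') does not honor quoted fields; Python's csv module would, but the committed handler keeps it simple.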
94
apps/common/handle/impl/table/xls_parse_table_handle.py
Normal file
@ -0,0 +1,94 @@
# coding=utf-8
import logging

import xlrd

from common.handle.base_parse_table_handle import BaseParseTableHandle

max_kb = logging.getLogger("max_kb")


class XlsSplitHandle(BaseParseTableHandle):
    def support(self, file, get_buffer):
        file_name: str = file.name.lower()
        buffer = get_buffer(file)
        if file_name.endswith(".xls") and xlrd.inspect_format(content=buffer):
            return True
        return False

    def handle(self, file, get_buffer, save_image):
        buffer = get_buffer(file)
        try:
            wb = xlrd.open_workbook(file_contents=buffer, formatting_info=True)
            result = []
            sheets = wb.sheets()
            for sheet in sheets:
                # Ranges of the merged cells in this sheet
                merged_cells = sheet.merged_cells
                data = []
                paragraphs = []
                # Use the first row as the header row
                headers = [sheet.cell_value(0, col_idx) for col_idx in range(sheet.ncols)]
                # Walk every row from the second one on (skipping the header)
                for row_idx in range(1, sheet.nrows):
                    row_data = {}
                    for col_idx in range(sheet.ncols):
                        cell_value = sheet.cell_value(row_idx, col_idx)

                        # If the cell is empty, check whether it sits inside a merged range
                        if cell_value == "":
                            for (rlo, rhi, clo, chi) in merged_cells:
                                if rlo <= row_idx < rhi and clo <= col_idx < chi:
                                    # Use the value of the merged range's top-left cell
                                    cell_value = sheet.cell_value(rlo, clo)
                                    break

                        # Store the value under its header as the key
                        row_data[headers[col_idx]] = cell_value
                    data.append(row_data)

                for row in data:
                    row_output = "; ".join([f"{key}: {value}" for key, value in row.items()])
                    paragraphs.append({'title': '', 'content': row_output})

                result.append({'name': sheet.name, 'paragraphs': paragraphs})

        except BaseException as e:
            max_kb.error(f'excel split handle error: {e}')
            return [{'name': file.name, 'paragraphs': []}]
        return result

    def get_content(self, file, save_image):
        # Open the .xls file
        try:
            workbook = xlrd.open_workbook(file_contents=file.read(), formatting_info=True)
            sheets = workbook.sheets()
            md_tables = ''
            for sheet in sheets:
                # Skip empty sheets
                if sheet.nrows == 0 or sheet.ncols == 0:
                    continue

                # Collect the header and body rows
                headers = sheet.row_values(0)
                data = [sheet.row_values(row_idx) for row_idx in range(1, sheet.nrows)]

                # Build the Markdown table
                md_table = '| ' + ' | '.join(headers) + ' |\n'
                md_table += '| ' + ' | '.join(['---'] * len(headers)) + ' |\n'
                for row in data:
                    # Replace newlines inside cells with <br> to preserve the layout
                    md_table += '| ' + ' | '.join(
                        [str(cell)
                         .replace('\r\n', '<br>')
                         .replace('\n', '<br>')
                         if cell else '' for cell in row]) + ' |\n'
                md_tables += md_table + '\n\n'

            return md_tables
        except Exception as e:
            max_kb.error(f'excel split handle error: {e}')
            return f'error: {e}'
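The merged-cell handling above is worth isolating: xlrd reports merged ranges as half-open (rlo, rhi, clo, chi) tuples, and an empty cell inside such a range inherits the top-left value. A self-contained sketch with fake sheet data (illustrative only, not part of the commit):

# One merged block covering rows 1-2, columns 0-1 (half-open bounds).
merged_cells = [(1, 3, 0, 2)]
cells = {(1, 0): 'Fruit', (1, 2): 'red', (2, 2): 'green'}  # sparse fake sheet

def resolve(row_idx, col_idx):
    value = cells.get((row_idx, col_idx), "")
    if value == "":
        for (rlo, rhi, clo, chi) in merged_cells:
            if rlo <= row_idx < rhi and clo <= col_idx < chi:
                return cells.get((rlo, clo), "")  # top-left of the merged range
    return value

assert resolve(2, 1) == 'Fruit'  # empty cell inside the merge inherits 'Fruit'
assert resolve(2, 2) == 'green'  # ordinary cell keeps its own value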
118
apps/common/handle/impl/table/xlsx_parse_table_handle.py
Normal file
@ -0,0 +1,118 @@
# coding=utf-8
import io
import logging

from openpyxl import load_workbook

from common.handle.base_parse_table_handle import BaseParseTableHandle
from common.handle.impl.common_handle import xlsx_embed_cells_images

max_kb = logging.getLogger("max_kb")


class XlsxSplitHandle(BaseParseTableHandle):
    def support(self, file, get_buffer):
        file_name: str = file.name.lower()
        return file_name.endswith('.xlsx')

    def fill_merged_cells(self, sheet, image_dict):
        data = []

        # Use the first row as the header row
        headers = []
        for idx, cell in enumerate(sheet[1]):
            if cell.value is None:
                # Keep empty headers distinct by padding with a unique number of spaces
                headers.append(' ' * (idx + 1))
            else:
                headers.append(cell.value)

        # Walk every row from the second one on
        for row in sheet.iter_rows(min_row=2, values_only=False):
            row_data = {}
            for col_idx, cell in enumerate(row):
                cell_value = cell.value

                # If the cell is empty and belongs to a merged range, take that range's value
                if cell_value is None:
                    for merged_range in sheet.merged_cells.ranges:
                        if cell.coordinate in merged_range:
                            cell_value = sheet[merged_range.min_row][merged_range.min_col - 1].value
                            break

                image = image_dict.get(cell_value, None)
                if image is not None:
                    # Reconstructed: the extracted diff lost this f-string body; a
                    # Markdown image pointing at /api/image/<id> matches the rest
                    # of the commit, but the exact original form is an assumption.
                    cell_value = f'![](/api/image/{image.id})'

                # Store the value under its header as the key
                row_data[headers[col_idx]] = cell_value
            data.append(row_data)

        return data

    def handle(self, file, get_buffer, save_image):
        buffer = get_buffer(file)
        try:
            wb = load_workbook(io.BytesIO(buffer))
            try:
                image_dict: dict = xlsx_embed_cells_images(io.BytesIO(buffer))
                save_image([item for item in image_dict.values()])
            except Exception as e:
                image_dict = {}
            result = []
            for sheetname in wb.sheetnames:
                paragraphs = []
                ws = wb[sheetname]
                data = self.fill_merged_cells(ws, image_dict)

                for row in data:
                    row_output = "; ".join([f"{key}: {value}" for key, value in row.items()])
                    paragraphs.append({'title': '', 'content': row_output})

                result.append({'name': sheetname, 'paragraphs': paragraphs})

        except BaseException as e:
            max_kb.error(f'excel split handle error: {e}')
            return [{'name': file.name, 'paragraphs': []}]
        return result

    def get_content(self, file, save_image):
        try:
            # Load the Excel workbook
            workbook = load_workbook(file)
            try:
                image_dict: dict = xlsx_embed_cells_images(file)
                if len(image_dict) > 0:
                    save_image(image_dict.values())
            except Exception as e:
                max_kb.error(f'{e}')
                image_dict = {}
            md_tables = ''
            # If a sheet name is empty, fall back to the active worksheet
            for sheetname in workbook.sheetnames:
                sheet = workbook[sheetname] if sheetname else workbook.active
                rows = self.fill_merged_cells(sheet, image_dict)
                if len(rows) == 0:
                    continue
                # Extract the header from the first row's keys
                headers = [f"{key}" for key, value in rows[0].items()]

                # Build the Markdown table
                md_table = '| ' + ' | '.join(headers) + ' |\n'
                md_table += '| ' + ' | '.join(['---'] * len(headers)) + ' |\n'
                for row in rows:
                    r = [f'{value}' for key, value in row.items()]
                    md_table += '| ' + ' | '.join(
                        [str(cell).replace('\n', '<br>') if cell is not None else '' for cell in r]) + ' |\n'

                md_tables += md_table + '\n\n'

            md_tables = md_tables.replace('/api/image/', '/api/file/')
            return md_tables
        except Exception as e:
            max_kb.error(f'excel split handle error: {e}')
            return f'error: {e}'
60
apps/common/handle/impl/text_split_handle.py
Normal file
@ -0,0 +1,60 @@
# coding=utf-8
"""
@project: maxkb
@Author:虎
@file: text_split_handle.py
@date:2024/3/27 18:19
@desc:
"""
import re
import traceback
from typing import List

from charset_normalizer import detect

from common.handle.base_split_handle import BaseSplitHandle
from common.utils.split_model import SplitModel

default_pattern_list = [re.compile('(?<=^)# .*|(?<=\\n)# .*'),
                        re.compile('(?<=\\n)(?<!#)## (?!#).*|(?<=^)(?<!#)## (?!#).*'),
                        re.compile("(?<=\\n)(?<!#)### (?!#).*|(?<=^)(?<!#)### (?!#).*"),
                        re.compile("(?<=\\n)(?<!#)#### (?!#).*|(?<=^)(?<!#)#### (?!#).*"),
                        re.compile("(?<=\\n)(?<!#)##### (?!#).*|(?<=^)(?<!#)##### (?!#).*"),
                        re.compile("(?<=\\n)(?<!#)###### (?!#).*|(?<=^)(?<!#)###### (?!#).*")]


class TextSplitHandle(BaseSplitHandle):
    def support(self, file, get_buffer):
        buffer = get_buffer(file)
        file_name: str = file.name.lower()
        # file_name is already lower-cased, so these checks cover .MD/.TXT too
        if file_name.endswith(".md") or file_name.endswith('.txt'):
            return True
        result = detect(buffer)
        if result['encoding'] is not None and result['confidence'] is not None and result['encoding'] != 'ascii' and \
                result['confidence'] > 0.5:
            return True
        return False

    def handle(self, file, pattern_list: List, with_filter: bool, limit: int, get_buffer, save_image):
        buffer = get_buffer(file)
        if pattern_list is not None and len(pattern_list) > 0:
            split_model = SplitModel(pattern_list, with_filter, limit)
        else:
            split_model = SplitModel(default_pattern_list, with_filter=with_filter, limit=limit)
        try:
            content = buffer.decode(detect(buffer)['encoding'])
        except BaseException as e:
            return {'name': file.name,
                    'content': []}
        return {'name': file.name,
                'content': split_model.parse(content)
                }

    def get_content(self, file, save_image):
        buffer = file.read()
        try:
            return buffer.decode(detect(buffer)['encoding'])
        except BaseException as e:
            traceback.print_exception(e)
            return f'{e}'
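The default_pattern_list above cuts text at Markdown heading boundaries, one compiled pattern per heading level. In miniature (illustrative input, showing just the level-1 pattern):

import re

h1 = re.compile('(?<=^)# .*|(?<=\\n)# .*')
text = '# Intro\nbody\n# Usage\nmore'
print(h1.findall(text))  # ['# Intro', '# Usage']
# The (?<!#)/(?!#) guards on the deeper levels keep '## x' from matching as level 1,
# and '# ' requires a space, so deeper headings never satisfy the level-1 pattern.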
80
apps/common/handle/impl/xls_split_handle.py
Normal file
@ -0,0 +1,80 @@
# coding=utf-8
"""
@project: maxkb
@Author:虎
@file: xls_parse_qa_handle.py
@date:2024/5/21 14:59
@desc:
"""
from typing import List

import xlrd

from common.handle.base_split_handle import BaseSplitHandle


def post_cell(cell_value):
    # Normalize newlines to <br> and swap pipes for a full-width bar so cell text
    # cannot break the Markdown table. (The replacement character was mangled in
    # the extracted diff; a full-width '|' is the assumed original.)
    return cell_value.replace('\r\n', '<br>').replace('\n', '<br>').replace('|', '|')


def row_to_md(row):
    return '| ' + ' | '.join(
        [post_cell(str(cell)) if cell is not None else '' for cell in row]) + ' |\n'


def handle_sheet(file_name, sheet, limit: int):
    rows = iter([sheet.row_values(i) for i in range(sheet.nrows)])
    paragraphs = []
    result = {'name': file_name, 'content': paragraphs}
    try:
        title_row_list = next(rows)
        title_md_content = row_to_md(title_row_list)
        title_md_content += '| ' + ' | '.join(
            ['---' if cell is not None else '' for cell in title_row_list]) + ' |\n'
    except Exception as e:
        return result
    if len(title_row_list) == 0:
        return result
    result_item_content = ''
    for row in rows:
        next_md_content = row_to_md(row)
        next_md_content_len = len(next_md_content)
        result_item_content_len = len(result_item_content)
        if len(result_item_content) == 0:
            result_item_content += title_md_content
            result_item_content += next_md_content
        else:
            if result_item_content_len + next_md_content_len < limit:
                result_item_content += next_md_content
            else:
                paragraphs.append({'content': result_item_content, 'title': ''})
                result_item_content = title_md_content + next_md_content
    if len(result_item_content) > 0:
        paragraphs.append({'content': result_item_content, 'title': ''})
    return result


class XlsSplitHandle(BaseSplitHandle):
    def handle(self, file, pattern_list: List, with_filter: bool, limit: int, get_buffer, save_image):
        buffer = get_buffer(file)
        try:
            workbook = xlrd.open_workbook(file_contents=buffer)
            worksheets = workbook.sheets()
            worksheets_size = len(worksheets)
            return [row for row in
                    [handle_sheet(file.name,
                                  sheet, limit) if worksheets_size == 1 and sheet.name == 'Sheet1' else handle_sheet(
                        sheet.name, sheet, limit) for sheet
                     in worksheets] if row is not None]
        except Exception as e:
            return [{'name': file.name, 'content': []}]

    def get_content(self, file, save_image):
        pass

    def support(self, file, get_buffer):
        file_name: str = file.name.lower()
        buffer = get_buffer(file)
        if file_name.endswith(".xls") and xlrd.inspect_format(content=buffer):
            return True
        return False
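A quick worked example of row_to_md and the chunking rule in handle_sheet (values are made up):

# row_to_md flattens one sheet row into a Markdown table row:
#   row_to_md(['a', 'multi\nline', None])  ->  '| a | multi<br>line |  |\n'
# handle_sheet then accumulates such rows until adding the next one would reach
# `limit` characters; each new paragraph restarts with the header rows, so every
# chunk is a complete, independently renderable Markdown table.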
92
apps/common/handle/impl/xlsx_split_handle.py
Normal file
@ -0,0 +1,92 @@
# coding=utf-8
"""
@project: maxkb
@Author:虎
@file: xlsx_parse_qa_handle.py
@date:2024/5/21 14:59
@desc:
"""
import io
from typing import List

import openpyxl

from common.handle.base_split_handle import BaseSplitHandle
from common.handle.impl.common_handle import xlsx_embed_cells_images


def post_cell(image_dict, cell_value):
    image = image_dict.get(cell_value, None)
    if image is not None:
        # Reconstructed: the extracted diff lost this f-string body; a Markdown
        # image pointing at /api/image/<id> matches the rest of the commit, but
        # the exact original form is an assumption.
        return f'![](/api/image/{image.id})'
    # Swap pipes for a full-width bar so cell text cannot break the Markdown
    # table (the replacement character was mangled in the extracted diff).
    return cell_value.replace('\n', '<br>').replace('|', '|')


def row_to_md(row, image_dict):
    return '| ' + ' | '.join(
        [post_cell(image_dict, str(cell.value if cell.value is not None else '')) if cell is not None else '' for cell
         in row]) + ' |\n'


def handle_sheet(file_name, sheet, image_dict, limit: int):
    rows = sheet.rows
    paragraphs = []
    result = {'name': file_name, 'content': paragraphs}
    try:
        title_row_list = next(rows)
        title_md_content = row_to_md(title_row_list, image_dict)
        title_md_content += '| ' + ' | '.join(
            ['---' if cell is not None else '' for cell in title_row_list]) + ' |\n'
    except Exception as e:
        return result
    if len(title_row_list) == 0:
        return result
    result_item_content = ''
    for row in rows:
        next_md_content = row_to_md(row, image_dict)
        next_md_content_len = len(next_md_content)
        result_item_content_len = len(result_item_content)
        if len(result_item_content) == 0:
            result_item_content += title_md_content
            result_item_content += next_md_content
        else:
            if result_item_content_len + next_md_content_len < limit:
                result_item_content += next_md_content
            else:
                paragraphs.append({'content': result_item_content, 'title': ''})
                result_item_content = title_md_content + next_md_content
    if len(result_item_content) > 0:
        paragraphs.append({'content': result_item_content, 'title': ''})
    return result


class XlsxSplitHandle(BaseSplitHandle):
    def handle(self, file, pattern_list: List, with_filter: bool, limit: int, get_buffer, save_image):
        buffer = get_buffer(file)
        try:
            workbook = openpyxl.load_workbook(io.BytesIO(buffer))
            try:
                image_dict: dict = xlsx_embed_cells_images(io.BytesIO(buffer))
                save_image([item for item in image_dict.values()])
            except Exception as e:
                image_dict = {}
            worksheets = workbook.worksheets
            worksheets_size = len(worksheets)
            return [row for row in
                    [handle_sheet(file.name,
                                  sheet,
                                  image_dict,
                                  limit) if worksheets_size == 1 and sheet.title == 'Sheet1' else handle_sheet(
                        sheet.title, sheet, image_dict, limit) for sheet
                     in worksheets] if row is not None]
        except Exception as e:
            return [{'name': file.name, 'content': []}]

    def get_content(self, file, save_image):
        pass

    def support(self, file, get_buffer):
        file_name: str = file.name.lower()
        if file_name.endswith(".xlsx"):
            return True
        return False
164
apps/common/handle/impl/zip_split_handle.py
Normal file
@ -0,0 +1,164 @@
# coding=utf-8
"""
@project: maxkb
@Author:虎
@file: text_split_handle.py
@date:2024/3/27 18:19
@desc:
"""
import io
import os
import re
import zipfile
from typing import List
from urllib.parse import urljoin

import uuid_utils.compat as uuid
from charset_normalizer import detect
from django.utils.translation import gettext_lazy as _

from common.handle.base_split_handle import BaseSplitHandle
from common.handle.impl.csv_split_handle import CsvSplitHandle
from common.handle.impl.doc_split_handle import DocSplitHandle
from common.handle.impl.html_split_handle import HTMLSplitHandle
from common.handle.impl.pdf_split_handle import PdfSplitHandle
from common.handle.impl.text_split_handle import TextSplitHandle
from common.handle.impl.xls_split_handle import XlsSplitHandle
from common.handle.impl.xlsx_split_handle import XlsxSplitHandle
from common.utils.common import parse_md_image
from knowledge.models import File


class FileBufferHandle:
    buffer = None

    def get_buffer(self, file):
        if self.buffer is None:
            self.buffer = file.read()
        return self.buffer


default_split_handle = TextSplitHandle()
split_handles = [
    HTMLSplitHandle(),
    DocSplitHandle(),
    PdfSplitHandle(),
    XlsxSplitHandle(),
    XlsSplitHandle(),
    CsvSplitHandle(),
    default_split_handle
]


def file_to_paragraph(file, pattern_list: List, with_filter: bool, limit: int, save_inner_image):
    get_buffer = FileBufferHandle().get_buffer
    for split_handle in split_handles:
        if split_handle.support(file, get_buffer):
            return split_handle.handle(file, pattern_list, with_filter, limit, get_buffer, save_inner_image)
    raise Exception(_('Unsupported file format'))


def is_valid_uuid(uuid_str: str):
    try:
        uuid.UUID(uuid_str)
    except ValueError:
        return False
    return True


def get_image_list(result_list: list, zip_files: List[str]):
    image_file_list = []
    for result in result_list:
        for p in result.get('content', []):
            content: str = p.get('content', '')
            image_list = parse_md_image(content)
            for image in image_list:
                search = re.search(r"\(.*\)", image)
                if search:
                    new_image_id = str(uuid.uuid7())
                    source_image_path = search.group().replace('(', '').replace(')', '')
                    source_image_path = source_image_path.strip().split(" ")[0]
                    image_path = urljoin(result.get('name'), '.' + source_image_path if source_image_path.startswith(
                        '/') else source_image_path)
                    if image_path not in zip_files:
                        continue
                    if image_path.startswith('api/file/') or image_path.startswith('api/image/'):
                        image_id = image_path.replace('api/file/', '').replace('api/image/', '')
                        if is_valid_uuid(image_id):
                            image_file_list.append({'source_file': image_path,
                                                    'image_id': image_id})
                        else:
                            image_file_list.append({'source_file': image_path,
                                                    'image_id': new_image_id})
                            content = content.replace(source_image_path, f'/api/image/{new_image_id}')
                            p['content'] = content
                    else:
                        image_file_list.append({'source_file': image_path,
                                                'image_id': new_image_id})
                        content = content.replace(source_image_path, f'/api/image/{new_image_id}')
                        p['content'] = content

    return image_file_list


def get_file_name(file_name):
    try:
        file_name_code = file_name.encode('cp437')
        charset = detect(file_name_code)['encoding']
        return file_name_code.decode(charset)
    except Exception as e:
        return file_name


def filter_image_file(result_list: list, image_list):
    image_source_file_list = [image.get('source_file') for image in image_list]
    return [r for r in result_list if r.get('name', '') not in image_source_file_list]


class ZipSplitHandle(BaseSplitHandle):
    def handle(self, file, pattern_list: List, with_filter: bool, limit: int, get_buffer, save_image):
        buffer = get_buffer(file)
        bytes_io = io.BytesIO(buffer)
        result = []
        # Open the zip archive
        with zipfile.ZipFile(bytes_io, 'r') as zip_ref:
            # List the file names inside the archive
            files = zip_ref.namelist()
            # Read each file in the archive
            for file in files:
                if file.endswith('/') or file.startswith('__MACOSX'):
                    continue
                with zip_ref.open(file) as f:
                    # Parse the file content; unsupported files are skipped
                    try:
                        # Repair the file name's encoding first
                        f.name = get_file_name(f.name)
                        value = file_to_paragraph(f, pattern_list, with_filter, limit, save_image)
                        if isinstance(value, list):
                            result = [*result, *value]
                        else:
                            result.append(value)
                    except Exception:
                        pass
            image_list = get_image_list(result, files)
            result = filter_image_file(result, image_list)
            image_mode_list = []
            for image in image_list:
                with zip_ref.open(image.get('source_file')) as f:
                    i = File(
                        id=image.get('image_id'),
                        image_name=os.path.basename(image.get('source_file')),
                        meta={'debug': False, 'content': f.read()}  # content holds the raw image bytes
                    )
                    image_mode_list.append(i)
            save_image(image_mode_list)
        return result

    def support(self, file, get_buffer):
        file_name: str = file.name.lower()
        # file_name is already lower-cased, so a single suffix check suffices
        return file_name.endswith(".zip")

    def get_content(self, file, save_image):
        return ""
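Why get_file_name round-trips through cp437: legacy zip tools store non-UTF-8 names as raw bytes, which Python's zipfile decodes as cp437 when the UTF-8 flag is unset. Encoding back to cp437 recovers the original bytes so charset detection can take a guess. A hedged sketch (detection is heuristic and may miss on very short names):

raw_bytes = '使用手册.md'.encode('gbk')   # how a legacy tool stored the name
mangled = raw_bytes.decode('cp437')       # what zipfile hands the handler
restored = get_file_name(mangled)         # cp437 round-trip + charset detection
# With a confident detection this yields '使用手册.md' again; on failure the
# function deliberately falls back to the mangled name instead of raising.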
@ -257,3 +257,9 @@ def post(post_function):
    return inner


def parse_md_image(content: str):
    matches = re.finditer(r"!\[.*?\]\(.*?\)", content)
    image_list = [match.group() for match in matches]
    return image_list
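For reference, what the new helper matches (illustrative input):

# parse_md_image('see ![alt](images/a.png) and ![](b.jpg)')
#   -> ['![alt](images/a.png)', '![](b.jpg)']
# The lazy quantifiers keep two adjacent images from being swallowed as one match.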
@ -32,3 +32,46 @@ class DocumentCreateAPI(APIMixin):
    @staticmethod
    def get_response():
        return DocumentCreateResponse


class DocumentSplitAPI(APIMixin):
    @staticmethod
    def get_parameters():
        return [
            OpenApiParameter(
                name="workspace_id",
                description="workspace id",
                type=OpenApiTypes.STR,
                location='path',
                required=True,
            ),
            OpenApiParameter(
                name="file",
                description="file",
                type=OpenApiTypes.BINARY,
                location='query',
                required=False,
            ),
            OpenApiParameter(
                name="limit",
                description="segment length",
                type=OpenApiTypes.INT,
                location='query',
                required=False,
            ),
            OpenApiParameter(
                name="patterns",
                description="list of segmentation regexes",
                type=OpenApiTypes.STR,
                location='query',
                required=False,
            ),
            OpenApiParameter(
                name="with_filter",
                description="whether to strip special characters",
                type=OpenApiTypes.BOOL,
                location='query',
                required=False,
            ),
        ]
@ -13,14 +13,34 @@ from rest_framework import serializers
from common.db.search import native_search
from common.event import ListenerManagement
from common.exception.app_exception import AppApiException
from common.handle.impl.csv_split_handle import CsvSplitHandle
from common.handle.impl.doc_split_handle import DocSplitHandle
from common.handle.impl.html_split_handle import HTMLSplitHandle
from common.handle.impl.pdf_split_handle import PdfSplitHandle
from common.handle.impl.text_split_handle import TextSplitHandle
from common.handle.impl.xls_split_handle import XlsSplitHandle
from common.handle.impl.xlsx_split_handle import XlsxSplitHandle
from common.handle.impl.zip_split_handle import ZipSplitHandle
from common.utils.common import post, get_file_content
from knowledge.models import Knowledge, Paragraph, Problem, Document, KnowledgeType, ProblemParagraphMapping, State, \
    TaskType
    TaskType, File
from knowledge.serializers.common import ProblemParagraphManage
from knowledge.serializers.paragraph import ParagraphSerializers, ParagraphInstanceSerializer
from knowledge.task import embedding_by_document
from maxkb.const import PROJECT_DIR

default_split_handle = TextSplitHandle()
split_handles = [
    HTMLSplitHandle(),
    DocSplitHandle(),
    PdfSplitHandle(),
    XlsxSplitHandle(),
    XlsSplitHandle(),
    CsvSplitHandle(),
    ZipSplitHandle(),
    default_split_handle
]


class DocumentInstanceSerializer(serializers.Serializer):
    name = serializers.CharField(required=True, label=_('document name'), max_length=128, min_length=1)

@ -34,6 +54,17 @@ class DocumentCreateRequest(serializers.Serializer):
    documents = DocumentInstanceSerializer(required=False, many=True)


class DocumentSplitRequest(serializers.Serializer):
    file = serializers.ListField(required=True, label=_('file list'))
    limit = serializers.IntegerField(required=False, label=_('limit'))
    patterns = serializers.ListField(
        required=False,
        child=serializers.CharField(required=True, label=_('patterns')),
        label=_('patterns')
    )
    with_filter = serializers.BooleanField(required=False, label=_('Auto Clean'))


class DocumentSerializers(serializers.Serializer):
    class Operate(serializers.Serializer):
        document_id = serializers.UUIDField(required=True, label=_('document id'))

@ -177,3 +208,67 @@ class DocumentSerializers(serializers.Serializer):
                document_model,
                instance.get('paragraphs') if 'paragraphs' in instance else []
            )

    class Split(serializers.Serializer):
        workspace_id = serializers.CharField(required=True, label=_('workspace id'))
        knowledge_id = serializers.UUIDField(required=True, label=_('knowledge id'))

        def is_valid(self, *, raise_exception=True):
            super().is_valid(raise_exception=True)
            files = self.data.get('file')
            for f in files:
                if f.size > 1024 * 1024 * 100:
                    raise AppApiException(500, _(
                        'The maximum size of the uploaded file cannot exceed {}MB'
                    ).format(100))

        def parse(self, instance):
            self.is_valid(raise_exception=True)
            DocumentSplitRequest(instance).is_valid(raise_exception=True)

            file_list = instance.get("file")
            return reduce(
                lambda x, y: [*x, *y],
                [self.file_to_paragraph(
                    f,
                    instance.get("patterns", None),
                    instance.get("with_filter", None),
                    instance.get("limit", 4096)
                ) for f in file_list],
                []
            )

        def save_image(self, image_list):
            if image_list is not None and len(image_list) > 0:
                exist_image_list = [str(i.get('id')) for i in
                                    QuerySet(File).filter(id__in=[i.id for i in image_list]).values('id')]
                save_image_list = [image for image in image_list if str(image.id) not in exist_image_list]
                save_image_list = list({img.id: img for img in save_image_list}.values())
                # save image
                for file in save_image_list:
                    file_bytes = file.meta.pop('content')
                    file.workspace_id = self.data.get('workspace_id')
                    file.meta['knowledge_id'] = self.data.get('knowledge_id')
                    file.save(file_bytes)

        def file_to_paragraph(self, file, pattern_list: List, with_filter: bool, limit: int):
            get_buffer = FileBufferHandle().get_buffer
            for split_handle in split_handles:
                if split_handle.support(file, get_buffer):
                    result = split_handle.handle(file, pattern_list, with_filter, limit, get_buffer, self.save_image)
                    if isinstance(result, list):
                        return result
                    return [result]
            result = default_split_handle.handle(file, pattern_list, with_filter, limit, get_buffer, self.save_image)
            if isinstance(result, list):
                return result
            return [result]


class FileBufferHandle:
    buffer = None

    def get_buffer(self, file):
        if self.buffer is None:
            self.buffer = file.read()
        return self.buffer
@ -8,5 +8,6 @@ urlpatterns = [
    path('workspace/<str:workspace_id>/knowledge/base', views.KnowledgeBaseView.as_view()),
    path('workspace/<str:workspace_id>/knowledge/web', views.KnowledgeWebView.as_view()),
    path('workspace/<str:workspace_id>/knowledge/<str:knowledge_id>', views.KnowledgeView.Operate.as_view()),
    path('workspace/<str:workspace_id>/knowledge/<str:knowledge_id>/document/split', views.DocumentView.Split.as_view()),
    path('workspace/<str:workspace_id>/knowledge/<int:current_page>/<int:page_size>', views.KnowledgeView.Page.as_view()),
]
@ -1 +1,2 @@
from .knowledge import *
from .document import *
70
apps/knowledge/views/document.py
Normal file
@ -0,0 +1,70 @@
from django.utils.translation import gettext_lazy as _
from drf_spectacular.utils import extend_schema
from rest_framework.parsers import MultiPartParser
from rest_framework.request import Request
from rest_framework.views import APIView

from common.auth import TokenAuth
from common.auth.authentication import has_permissions
from common.constants.permission_constants import PermissionConstants, CompareConstants
from common.result import result
from knowledge.api.document import DocumentSplitAPI
from knowledge.api.knowledge import KnowledgeTreeReadAPI
from knowledge.serializers.document import DocumentSerializers
from knowledge.serializers.knowledge import KnowledgeSerializer


class DocumentView(APIView):
    authentication_classes = [TokenAuth]

    @extend_schema(
        methods=['GET'],
        description=_('Get document'),
        operation_id=_('Get document'),
        parameters=KnowledgeTreeReadAPI.get_parameters(),
        responses=KnowledgeTreeReadAPI.get_response(),
        tags=[_('Knowledge Base')]
    )
    @has_permissions(PermissionConstants.DOCUMENT_READ.get_workspace_permission())
    def get(self, request: Request, workspace_id: str):
        return result.success(KnowledgeSerializer.Query(
            data={
                'workspace_id': workspace_id,
                'folder_id': request.query_params.get('folder_id'),
                'name': request.query_params.get('name'),
                'desc': request.query_params.get("desc"),
                'user_id': request.query_params.get('user_id')
            }
        ).list())

    class Split(APIView):
        authentication_classes = [TokenAuth]
        parser_classes = [MultiPartParser]

        @extend_schema(
            methods=['POST'],
            description=_('Segmented document'),
            operation_id=_('Segmented document'),
            parameters=DocumentSplitAPI.get_parameters(),
            request=DocumentSplitAPI.get_request(),
            responses=DocumentSplitAPI.get_response(),
            tags=[_('Knowledge Base/Documentation')]
        )
        @has_permissions([
            PermissionConstants.DOCUMENT_CREATE.get_workspace_permission(),
            PermissionConstants.DOCUMENT_EDIT.get_workspace_permission(),
        ])
        def post(self, request: Request, workspace_id: str, knowledge_id: str):
            split_data = {'file': request.FILES.getlist('file')}
            request_data = request.data
            if 'patterns' in request.data and request.data.get('patterns') is not None and len(
                    request.data.get('patterns')) > 0:
                split_data['patterns'] = request_data.getlist('patterns')
            if 'limit' in request.data:
                split_data['limit'] = request_data.get('limit')
            if 'with_filter' in request.data:
                split_data['with_filter'] = request_data.get('with_filter')
            return result.success(DocumentSerializers.Split(data={
                'workspace_id': workspace_id,
                'knowledge_id': knowledge_id,
            }).parse(split_data))
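A hedged client-side sketch of exercising the new split endpoint. The route and form fields mirror the urls.py entry and the view above; the host, any URL prefix in front of 'workspace/...', and the token are placeholders, not part of the commit:

import requests

resp = requests.post(
    'http://localhost:8080/workspace/<workspace_id>/knowledge/<knowledge_id>/document/split',
    headers={'Authorization': 'Bearer <token>'},
    files={'file': open('manual.txt', 'rb')},
    data={'limit': 4096, 'with_filter': 'true'},
)
# On success, result.success(...) wraps the paragraph groups returned by
# DocumentSerializers.Split.parse for each uploaded file.
paragraph_groups = resp.json()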
@ -46,6 +46,12 @@ celery-once = "3.0.1"
beautifulsoup4 = "4.13.4"
html2text = "2025.4.15"
jieba = "0.42.1"
openpyxl = "3.1.5"
python-docx = "1.1.2"
xlrd = "2.0.1"
xlwt = "1.3.0"
pymupdf = "1.24.9"
pypdf = "4.3.1"

[build-system]
requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"