feat: 高级编排支持文件上传(WIP)

2024-11-14 11:11:53 +08:00 · 2024-11-14 11:11:53 +08:00 · b57a619bdb
commit b57a619bdb
parent a0cfcb73a9
11 changed files with 149 additions and 6 deletions
--- a/apps/application/flow/step_node/document_extract_node/impl/base_document_extract_node.py
+++ b/apps/application/flow/step_node/document_extract_node/impl/base_document_extract_node.py
@ -1,23 +1,36 @@
 # coding=utf-8
 import io
 from django.db.models import QuerySet
 from application.flow.i_step_node import NodeResult
 from application.flow.step_node.document_extract_node.i_document_extract_node import IDocumentExtractNode
 from dataset.models import File
 from dataset.serializers.document_serializers import split_handles, parse_table_handle_list, FileBufferHandle
 class BaseDocumentExtractNode(IDocumentExtractNode):
    """Workflow node that concatenates the extracted text of uploaded documents."""

    def execute(self, document, **kwargs):
        """Extract the content of every document and join it into one string.

        :param document: iterable of dicts carrying at least ``file_id`` and
            ``name``, or ``None`` when nothing was uploaded.
        :param kwargs: extra node parameters; not used here.
        :return: ``NodeResult`` whose first dict holds the combined text under
            the ``content`` key.
        """
        get_buffer = FileBufferHandle().get_buffer
        self.context['document_list'] = document
        content = ''
        spliter = '\n-----------------------------------\n'
        if document is None:
            return NodeResult({'content': content}, {})

        # The handler list does not change between documents, so build it once.
        handlers = parse_table_handle_list + split_handles
        sections = []
        for doc in document:
            file = QuerySet(File).filter(id=doc['file_id']).first()
            buffer = io.BytesIO(file.get_byte().tobytes())
            # The in-memory buffer gets the upload's name so handlers can
            # inspect it in support().
            buffer.name = doc['name']
            for split_handle in handlers:
                if split_handle.support(buffer, get_buffer):
                    # Rewind to the start of the file before reading it.
                    buffer.seek(0)
                    file_content = split_handle.get_content(buffer)
                    sections.append(spliter + '## ' + doc['name'] + '\n' + file_content)
                    # Only the first matching handler is used; continue with
                    # the next document instead of returning early.
                    break
        content = ''.join(sections)
        return NodeResult({'content': content}, {})
--- a/apps/common/handle/base_parse_table_handle.py
+++ b/apps/common/handle/base_parse_table_handle.py
@ -17,3 +17,7 @@ class BaseParseTableHandle(ABC):
    @abstractmethod
    def handle(self, file, get_buffer, save_image):
        """Abstract hook for parsing ``file``; concrete table handlers override it.

        The base implementation has no behavior and returns ``None``.
        """
    @abstractmethod
    def get_content(self, file):
        """Abstract hook for extracting the content of ``file``.

        Concrete table handlers override it; the base implementation has no
        behavior and returns ``None``.
        """
--- a/apps/common/handle/base_split_handle.py
+++ b/apps/common/handle/base_split_handle.py
@ -18,3 +18,7 @@ class BaseSplitHandle(ABC):
    @abstractmethod
    def handle(self, file, pattern_list: List, with_filter: bool, limit: int, get_buffer, save_image):
        """Abstract hook for splitting ``file``; concrete split handlers override it.

        The base implementation has no behavior and returns ``None``.
        """
    @abstractmethod
    def get_content(self, file):
        """Abstract hook for extracting the content of ``file``.

        Concrete split handlers override it; the base implementation has no
        behavior and returns ``None``.
        """
--- a/apps/common/handle/impl/doc_split_handle.py
+++ b/apps/common/handle/impl/doc_split_handle.py
@ -189,3 +189,13 @@ class DocSplitHandle(BaseSplitHandle):
                ".DOC") or file_name.endswith(".DOCX"):
            return True
        return False
    def get_content(self, file):
        """Return the document read from ``file`` rendered through ``to_md``.

        :param file: binary file-like object providing ``read()``.
        :return: the value produced by ``self.to_md``; ``''`` when reading or
            conversion fails (the traceback is printed).
        """
        try:
            image_list = []
            buffer = file.read()
            doc = Document(io.BytesIO(buffer))
            return self.to_md(doc, image_list, get_image_id_func())
        except Exception:
            # Best effort: report the failure and fall back to empty content.
            # Only Exception is caught so KeyboardInterrupt/SystemExit still
            # propagate to the caller.
            traceback.print_exc()
            return ''
--- a/apps/common/handle/impl/html_split_handle.py
+++ b/apps/common/handle/impl/html_split_handle.py
@ -7,6 +7,7 @@
    @desc:
 """
 import re
 import traceback
 from typing import List
 from bs4 import BeautifulSoup
@ -59,3 +60,14 @@ class HTMLSplitHandle(BaseSplitHandle):
        return {'name': file.name,
                'content': split_model.parse(content)
                }
    def get_content(self, file):
        """Return the HTML read from ``file`` converted by ``html2text``.

        :param file: binary file-like object providing ``read()``.
        :return: the converted content; ``''`` when decoding or conversion
            fails (the traceback is printed).
        """
        buffer = file.read()
        try:
            encoding = get_encoding(buffer)
            content = buffer.decode(encoding)
            return html2text(content)
        except Exception:
            # Best effort: report the failure and fall back to empty content.
            # Only Exception is caught so KeyboardInterrupt/SystemExit still
            # propagate to the caller.
            traceback.print_exc()
            return ''
--- a/apps/common/handle/impl/pdf_split_handle.py
+++ b/apps/common/handle/impl/pdf_split_handle.py
@ -11,6 +11,7 @@ import os
 import re
 import tempfile
 import time
 import traceback
 from typing import List
 import fitz
@ -297,3 +298,17 @@ class PdfSplitHandle(BaseSplitHandle):
        if file_name.endswith(".pdf") or file_name.endswith(".PDF"):
            return True
        return False
    def get_content(self, file):
        """Return the content extracted from a PDF by ``handle_pdf_content``.

        The upload is copied to a temporary file, which is then opened with
        ``fitz``. The document handle is closed and the temporary file removed
        afterwards, whether or not extraction succeeded.

        :param file: binary file-like object providing ``read()``.
        :return: the extracted content; ``''`` when opening or extraction
            fails (the traceback is printed).
        """
        with tempfile.NamedTemporaryFile(delete=False) as temp_file:
            # Save the uploaded file to a temporary file.
            temp_file.write(file.read())
            # Keep the temporary file's path for opening and cleanup.
            temp_file_path = temp_file.name
        pdf_document = None
        try:
            pdf_document = fitz.open(temp_file_path)
            return self.handle_pdf_content(file, pdf_document)
        except Exception:
            # Best effort: report the failure and fall back to empty content.
            traceback.print_exc()
            return ''
        finally:
            # Release the document handle first, then delete the temporary
            # copy (it was created with delete=False).
            if pdf_document is not None:
                pdf_document.close()
            os.remove(temp_file_path)
--- a/apps/common/handle/impl/table/csv_parse_table_handle.py
+++ b/apps/common/handle/impl/table/csv_parse_table_handle.py
@ -34,3 +34,11 @@ class CsvSplitHandle(BaseParseTableHandle):
            paragraphs.append({'title': '', 'content': line})
        return [{'name': file.name, 'paragraphs': paragraphs}]
    def get_content(self, file):
        """Return the CSV file decoded to text with its detected encoding.

        :param file: binary file-like object providing ``read()``.
        :return: the decoded text; ``''`` when detection or decoding fails
            (the error is logged). The fallback is a string rather than a
            paragraph list because this method's result is text.
        """
        buffer = file.read()
        try:
            return buffer.decode(detect(buffer)['encoding'])
        except Exception as e:
            max_kb.error(f'csv split handle error: {e}')
            return ''
--- a/apps/common/handle/impl/table/xls_parse_table_handle.py
+++ b/apps/common/handle/impl/table/xls_parse_table_handle.py
@ -60,3 +60,24 @@ class XlsSplitHandle(BaseParseTableHandle):
            max_kb.error(f'excel split handle error: {e}')
            return [{'name': file.name, 'paragraphs': []}]
        return result
    def get_content(self, file):
        """Render every sheet of an ``.xls`` workbook as a Markdown table.

        The first row of each sheet is used as the table header. Sheets
        without any rows are skipped.

        :param file: binary file-like object providing ``read()``.
        :return: the Markdown tables concatenated into one string, each table
            followed by blank lines.
        """

        def to_text(cell):
            # Keep values such as 0; only None becomes an empty cell.
            # Newlines are replaced with <br> so a cell stays on one table row.
            return '' if cell is None else str(cell).replace('\n', '<br>')

        def to_row(cells):
            return '| ' + ' | '.join(cells) + ' |\n'

        # Open the .xls workbook from the uploaded bytes.
        workbook = xlrd.open_workbook(file_contents=file.read(), formatting_info=True)
        md_tables = ''
        for sheet in workbook.sheets():
            if sheet.nrows == 0:
                # Nothing to render; row_values(0) would raise on an empty sheet.
                continue
            # Header cells may be numbers, so convert them before joining.
            headers = [to_text(cell) for cell in sheet.row_values(0)]
            md_table = to_row(headers)
            md_table += to_row(['---'] * len(headers))
            for row_idx in range(1, sheet.nrows):
                md_table += to_row([to_text(cell) for cell in sheet.row_values(row_idx)])
            # Append each finished table exactly once, after all its rows.
            md_tables += md_table + '\n\n'
        return md_tables
--- a/apps/common/handle/impl/table/xlsx_parse_table_handle.py
+++ b/apps/common/handle/impl/table/xlsx_parse_table_handle.py
@ -72,3 +72,31 @@ class XlsxSplitHandle(BaseParseTableHandle):
            max_kb.error(f'excel split handle error: {e}')
            return [{'name': file.name, 'paragraphs': []}]
        return result
    def get_content(self, file):
        """Render every worksheet of an ``.xlsx`` workbook as a Markdown table.

        The first row of each worksheet is used as the table header.
        Worksheets without any rows are skipped.

        :param file: file-like object accepted by ``load_workbook``.
        :return: the Markdown tables concatenated into one string, each table
            followed by blank lines.
        """

        def to_text(cell):
            # Empty cells come back as None; newlines are replaced with <br>
            # so a cell stays on one table row.
            return '' if cell is None else str(cell).replace('\n', '<br>')

        def to_row(cells):
            return '| ' + ' | '.join(cells) + ' |\n'

        # Load the Excel workbook.
        workbook = load_workbook(file)
        md_tables = ''
        for sheetname in workbook.sheetnames:
            # Fall back to the active worksheet when the name is empty.
            sheet = workbook[sheetname] if sheetname else workbook.active
            # Fetch all rows of the worksheet as plain values.
            rows = list(sheet.iter_rows(values_only=True))
            if not rows:
                continue
            # Header cells may be None or non-string, so convert them before joining.
            headers = [to_text(cell) for cell in rows[0]]
            md_table = to_row(headers)
            md_table += to_row(['---'] * len(headers))
            for row in rows[1:]:
                md_table += to_row([to_text(cell) for cell in row])
            md_tables += md_table + '\n\n'
        return md_tables
--- a/apps/common/handle/impl/text_split_handle.py
+++ b/apps/common/handle/impl/text_split_handle.py
@ -7,6 +7,7 @@
    @desc:
 """
 import re
 import traceback
 from typing import List
 from charset_normalizer import detect
@ -49,3 +50,11 @@ class TextSplitHandle(BaseSplitHandle):
        return {'name': file.name,
                'content': split_model.parse(content)
                }
    def get_content(self, file):
        """Return the text file decoded with its detected encoding.

        :param file: binary file-like object providing ``read()``.
        :return: the decoded text; ``''`` when detection or decoding fails
            (the traceback is printed).
        """
        buffer = file.read()
        try:
            return buffer.decode(detect(buffer)['encoding'])
        except Exception:
            # Best effort: report the failure and fall back to empty content.
            # Only Exception is caught so KeyboardInterrupt/SystemExit still
            # propagate to the caller.
            traceback.print_exc()
            return ''
--- a/ui/src/components/ai-chat/ExecutionDetailDialog.vue
+++ b/ui/src/components/ai-chat/ExecutionDetailDialog.vue
@ -182,6 +182,25 @@
                    </div>
                  </template>
                  <!-- Document content extraction: shows item.content as Markdown, or "-" when it is empty -->
                  <template v-if="item.type === WorkflowType.DocumentExtractNode">
                    <div class="card-never border-r-4">
                      <h5 class="p-8-12">参数输出</h5>
                      <div class="p-8-12 border-t-dashed lighter">
                        <el-scrollbar height="150">
                          <MdPreview
                            v-if="item.content"
                            ref="editorRef"
                            editorId="preview-only"
                            :modelValue="item.content"
                            style="background: none"
                          />
                          <template v-else> - </template>
                        </el-scrollbar>
                      </div>
                    </div>
                  </template>
                  <!-- 函数库 -->
                  <template
                    v-if="