feat: 高级编排支持文件上传(WIP)
This commit is contained in:
parent
a0cfcb73a9
commit
b57a619bdb
@ -1,23 +1,36 @@
|
|||||||
# coding=utf-8
|
# coding=utf-8
|
||||||
|
import io
|
||||||
|
|
||||||
from django.db.models import QuerySet
|
from django.db.models import QuerySet
|
||||||
|
|
||||||
from application.flow.i_step_node import NodeResult
|
from application.flow.i_step_node import NodeResult
|
||||||
from application.flow.step_node.document_extract_node.i_document_extract_node import IDocumentExtractNode
|
from application.flow.step_node.document_extract_node.i_document_extract_node import IDocumentExtractNode
|
||||||
from dataset.models import File
|
from dataset.models import File
|
||||||
|
from dataset.serializers.document_serializers import split_handles, parse_table_handle_list, FileBufferHandle
|
||||||
|
|
||||||
|
|
||||||
class BaseDocumentExtractNode(IDocumentExtractNode):
    """Workflow node that concatenates the extracted text of uploaded documents."""

    def execute(self, document, **kwargs):
        """Extract the content of every entry in ``document``.

        :param document: ``None`` or an iterable of mappings; each mapping is
            read for its ``'file_id'`` and ``'name'`` keys.
        :param kwargs: accepted for interface compatibility; not used here.
        :return: ``NodeResult`` whose first argument is ``{'content': <str>}``.
            The string holds one section per document for which a handler
            reported support; it is empty when ``document`` is ``None`` or no
            document could be handled.
        """
        get_buffer = FileBufferHandle().get_buffer

        self.context['document_list'] = document
        spliter = '\n-----------------------------------\n'
        if document is None:
            return NodeResult({'content': ''}, {})

        # The handler list does not depend on the document, so build it once.
        handles = parse_table_handle_list + split_handles
        sections = []
        for doc in document:
            file = QuerySet(File).filter(id=doc['file_id']).first()
            if file is None:
                # The referenced file no longer exists; skip it rather than
                # failing the whole node on ``None.get_byte()``.
                continue

            buffer = io.BytesIO(file.get_byte().tobytes())
            # The buffer carries the original file name; presumably the
            # handlers' ``support`` checks rely on it (verify in the handlers).
            buffer.name = doc['name']

            for split_handle in handles:
                if split_handle.support(buffer, get_buffer):
                    # Rewind: ``support`` may have consumed the stream.
                    buffer.seek(0)
                    file_content = split_handle.get_content(buffer)
                    sections.append(spliter + '## ' + doc['name'] + '\n' + file_content)
                    # First matching handler wins; continue with the next
                    # document instead of returning after the first one.
                    break

        return NodeResult({'content': ''.join(sections)}, {})
|
||||||
|
|
||||||
|
|||||||
@ -17,3 +17,7 @@ class BaseParseTableHandle(ABC):
|
|||||||
@abstractmethod
def handle(self, file, get_buffer, save_image):
    """Parse ``file``; concrete table handlers must override this.

    The implementations visible next to this interface return a list of
    ``{'name': ..., 'paragraphs': [...]}`` dictionaries.
    """
|
||||||
|
|
||||||
|
@abstractmethod
def get_content(self, file):
    """Return the content of ``file``; concrete table handlers must override this.

    The caller visible in this change concatenates the result onto a string,
    so implementations are expected to return ``str``.
    """
|
||||||
@ -18,3 +18,7 @@ class BaseSplitHandle(ABC):
|
|||||||
@abstractmethod
def handle(self, file, pattern_list: List, with_filter: bool, limit: int, get_buffer, save_image):
    """Split ``file``; concrete split handlers must override this.

    The implementations visible next to this interface return a
    ``{'name': ..., 'content': ...}`` dictionary.
    """
|
||||||
|
|
||||||
|
@abstractmethod
def get_content(self, file):
    """Return the content of ``file``; concrete split handlers must override this.

    The caller visible in this change concatenates the result onto a string,
    so implementations are expected to return ``str``.
    """
|
||||||
|
|||||||
@ -189,3 +189,13 @@ class DocSplitHandle(BaseSplitHandle):
|
|||||||
".DOC") or file_name.endswith(".DOCX"):
|
".DOC") or file_name.endswith(".DOCX"):
|
||||||
return True
|
return True
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
def get_content(self, file):
    """Convert the Word document read from ``file`` via ``self.to_md``.

    :param file: binary file-like object; its whole content is read.
    :return: the value produced by ``self.to_md``, or ``''`` when reading,
        parsing or conversion fails (best effort; the error is printed).
    """
    try:
        # Handed to ``to_md``; this method never reads the list afterwards.
        image_list = []
        doc = Document(io.BytesIO(file.read()))
        return self.to_md(doc, image_list, get_image_id_func())
    except Exception as e:
        # ``Exception`` rather than ``BaseException`` so that
        # KeyboardInterrupt / SystemExit are not swallowed.
        traceback.print_exception(e)
        return ''
|
||||||
@ -7,6 +7,7 @@
|
|||||||
@desc:
|
@desc:
|
||||||
"""
|
"""
|
||||||
import re
|
import re
|
||||||
|
import traceback
|
||||||
from typing import List
|
from typing import List
|
||||||
|
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
@ -59,3 +60,14 @@ class HTMLSplitHandle(BaseSplitHandle):
|
|||||||
return {'name': file.name,
|
return {'name': file.name,
|
||||||
'content': split_model.parse(content)
|
'content': split_model.parse(content)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
def get_content(self, file):
    """Decode the HTML read from ``file`` and convert it with ``html2text``.

    :param file: binary file-like object; its whole content is read.
    :return: the value produced by ``html2text``, or ``''`` when encoding
        detection, decoding or conversion fails (the error is printed).
    """
    buffer = file.read()

    try:
        encoding = get_encoding(buffer)
        return html2text(buffer.decode(encoding))
    except Exception as e:
        # ``Exception`` rather than ``BaseException`` so that
        # KeyboardInterrupt / SystemExit are not swallowed.
        traceback.print_exception(e)
        return ''
|
||||||
@ -11,6 +11,7 @@ import os
|
|||||||
import re
|
import re
|
||||||
import tempfile
|
import tempfile
|
||||||
import time
|
import time
|
||||||
|
import traceback
|
||||||
from typing import List
|
from typing import List
|
||||||
|
|
||||||
import fitz
|
import fitz
|
||||||
@ -297,3 +298,17 @@ class PdfSplitHandle(BaseSplitHandle):
|
|||||||
if file_name.endswith(".pdf") or file_name.endswith(".PDF"):
|
if file_name.endswith(".pdf") or file_name.endswith(".PDF"):
|
||||||
return True
|
return True
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
def get_content(self, file):
    """Extract the content of the PDF read from ``file``.

    The upload is spooled to a temporary file, opened with ``fitz`` and
    passed to ``self.handle_pdf_content``.

    :param file: binary file-like object; its whole content is read.
    :return: the value produced by ``self.handle_pdf_content``, or ``''``
        when the PDF cannot be opened or processed (the error is printed).
    """
    # ``delete=False`` keeps the file on disk after the ``with`` block so it
    # can be reopened by path; it is removed explicitly in ``finally``.
    with tempfile.NamedTemporaryFile(delete=False) as temp_file:
        # Save the uploaded bytes to the temporary file.
        temp_file.write(file.read())
        # Remember the path of the temporary file.
        temp_file_path = temp_file.name

    pdf_document = None
    try:
        # Opened inside ``try`` so a corrupt PDF yields '' like other errors.
        pdf_document = fitz.open(temp_file_path)
        return self.handle_pdf_content(file, pdf_document)
    except Exception as e:
        # ``Exception`` rather than ``BaseException`` so that
        # KeyboardInterrupt / SystemExit are not swallowed.
        traceback.print_exception(e)
        return ''
    finally:
        # Release the document before deleting the file it was opened from.
        if pdf_document is not None:
            pdf_document.close()
        try:
            os.remove(temp_file_path)
        except OSError as e:
            # Failing to clean up must not replace the extraction result.
            traceback.print_exception(e)
|
||||||
@ -34,3 +34,11 @@ class CsvSplitHandle(BaseParseTableHandle):
|
|||||||
paragraphs.append({'title': '', 'content': line})
|
paragraphs.append({'title': '', 'content': line})
|
||||||
|
|
||||||
return [{'name': file.name, 'paragraphs': paragraphs}]
|
return [{'name': file.name, 'paragraphs': paragraphs}]
|
||||||
|
|
||||||
|
def get_content(self, file):
    """Return the CSV read from ``file`` decoded to text.

    :param file: binary file-like object; its whole content is read.
    :return: the decoded text, or ``''`` when detection or decoding fails
        (the error is logged through ``max_kb``).
    """
    buffer = file.read()
    try:
        # Fall back to UTF-8 in case no encoding is reported, so ``decode``
        # is never called with ``None``.
        encoding = detect(buffer)['encoding'] or 'utf-8'
        return buffer.decode(encoding)
    except Exception as e:
        max_kb.error(f'csv split handle error: {e}')
        # Return a string, not the paragraph list used by ``handle``: the
        # caller concatenates this value onto a ``str``.
        return ''
|
||||||
@ -60,3 +60,24 @@ class XlsSplitHandle(BaseParseTableHandle):
|
|||||||
max_kb.error(f'excel split handle error: {e}')
|
max_kb.error(f'excel split handle error: {e}')
|
||||||
return [{'name': file.name, 'paragraphs': []}]
|
return [{'name': file.name, 'paragraphs': []}]
|
||||||
return result
|
return result
|
||||||
|
|
||||||
|
def get_content(self, file):
    """Render every sheet of the ``.xls`` workbook in ``file`` as a Markdown table.

    The first row of each sheet is used as the table header. Sheets without
    rows are skipped.

    :param file: binary file-like object; its whole content is read.
    :return: the tables concatenated into one string, or ``''`` when the
        workbook cannot be read (the error is logged through ``max_kb``).
    """

    def to_cell(value):
        # Stringify every value (headers and numbers included) and keep
        # in-cell line breaks as <br> so a table row stays on one line.
        return '' if value is None else str(value).replace('\n', '<br>')

    def to_row(values):
        return '| ' + ' | '.join(to_cell(value) for value in values) + ' |\n'

    try:
        # Open the .xls workbook.
        workbook = xlrd.open_workbook(file_contents=file.read(), formatting_info=True)
        md_tables = []
        for sheet in workbook.sheets():
            if sheet.nrows == 0:
                # No header row to read; ``row_values(0)`` would raise.
                continue

            # Header row followed by the Markdown separator row.
            headers = sheet.row_values(0)
            md_table = to_row(headers)
            md_table += '| ' + ' | '.join(['---'] * len(headers)) + ' |\n'
            for row_idx in range(1, sheet.nrows):
                md_table += to_row(sheet.row_values(row_idx))
            md_tables.append(md_table + '\n\n')

        return ''.join(md_tables)
    except Exception as e:
        max_kb.error(f'excel split handle error: {e}')
        return ''
|
||||||
|
|||||||
@ -72,3 +72,31 @@ class XlsxSplitHandle(BaseParseTableHandle):
|
|||||||
max_kb.error(f'excel split handle error: {e}')
|
max_kb.error(f'excel split handle error: {e}')
|
||||||
return [{'name': file.name, 'paragraphs': []}]
|
return [{'name': file.name, 'paragraphs': []}]
|
||||||
return result
|
return result
|
||||||
|
|
||||||
|
|
||||||
|
def get_content(self, file):
    """Render every sheet of the ``.xlsx`` workbook in ``file`` as a Markdown table.

    The first row of each sheet is used as the table header. Sheets without
    rows are skipped.

    :param file: file-like object handed to ``load_workbook``.
    :return: the tables concatenated into one string, or ``''`` when the
        workbook cannot be read (the error is logged through ``max_kb``).
    """

    def to_cell(value):
        # Stringify every value (headers and numbers included) and keep
        # in-cell line breaks as <br> so a table row stays on one line.
        return '' if value is None else str(value).replace('\n', '<br>')

    def to_row(values):
        return '| ' + ' | '.join(to_cell(value) for value in values) + ' |\n'

    try:
        # Load the Excel workbook.
        workbook = load_workbook(file)
        md_tables = []
        # Every sheet is rendered; the active sheet is only a fallback for
        # an empty sheet name.
        for sheetname in workbook.sheetnames:
            sheet = workbook[sheetname] if sheetname else workbook.active

            # Materialise all rows of the sheet as tuples of cell values.
            rows = list(sheet.iter_rows(values_only=True))
            if not rows:
                continue

            # Header row followed by the Markdown separator row.
            headers = rows[0]
            md_table = to_row(headers)
            md_table += '| ' + ' | '.join(['---'] * len(headers)) + ' |\n'
            for row in rows[1:]:
                md_table += to_row(row)

            md_tables.append(md_table + '\n\n')
        return ''.join(md_tables)
    except Exception as e:
        max_kb.error(f'excel split handle error: {e}')
        return ''
|
||||||
@ -7,6 +7,7 @@
|
|||||||
@desc:
|
@desc:
|
||||||
"""
|
"""
|
||||||
import re
|
import re
|
||||||
|
import traceback
|
||||||
from typing import List
|
from typing import List
|
||||||
|
|
||||||
from charset_normalizer import detect
|
from charset_normalizer import detect
|
||||||
@ -49,3 +50,11 @@ class TextSplitHandle(BaseSplitHandle):
|
|||||||
return {'name': file.name,
|
return {'name': file.name,
|
||||||
'content': split_model.parse(content)
|
'content': split_model.parse(content)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
def get_content(self, file):
    """Return the text file read from ``file`` decoded to ``str``.

    :param file: binary file-like object; its whole content is read.
    :return: the decoded text, or ``''`` when detection or decoding fails
        (the error is printed).
    """
    buffer = file.read()
    try:
        # Fall back to UTF-8 in case no encoding is reported, so ``decode``
        # is never called with ``None``.
        encoding = detect(buffer)['encoding'] or 'utf-8'
        return buffer.decode(encoding)
    except Exception as e:
        # ``Exception`` rather than ``BaseException`` so that
        # KeyboardInterrupt / SystemExit are not swallowed.
        traceback.print_exception(e)
        return ''
|
||||||
@ -182,6 +182,25 @@
|
|||||||
</div>
|
</div>
|
||||||
</template>
|
</template>
|
||||||
|
|
||||||
|
<!-- 文档内容提取 -->
|
||||||
|
<template v-if="item.type === WorkflowType.DocumentExtractNode">
|
||||||
|
<div class="card-never border-r-4">
|
||||||
|
<h5 class="p-8-12">参数输出</h5>
|
||||||
|
<div class="p-8-12 border-t-dashed lighter">
|
||||||
|
<el-scrollbar height="150">
|
||||||
|
<MdPreview
|
||||||
|
v-if="item.content"
|
||||||
|
ref="editorRef"
|
||||||
|
editorId="preview-only"
|
||||||
|
:modelValue="item.content"
|
||||||
|
style="background: none"
|
||||||
|
/>
|
||||||
|
<template v-else> - </template>
|
||||||
|
</el-scrollbar>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</template>
|
||||||
|
|
||||||
<!-- 函数库 -->
|
<!-- 函数库 -->
|
||||||
<template
|
<template
|
||||||
v-if="
|
v-if="
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user