fix: 表格数据区分xls和xlsx
This commit is contained in:
parent
c58635e7cc
commit
746f587698
47
apps/common/handle/impl/table/xls_parse_table_handle.py
Normal file
47
apps/common/handle/impl/table/xls_parse_table_handle.py
Normal file
@ -0,0 +1,47 @@
|
|||||||
|
# coding=utf-8
|
||||||
|
import logging
|
||||||
|
|
||||||
|
import xlrd
|
||||||
|
|
||||||
|
from common.handle.base_parse_table_handle import BaseParseTableHandle
|
||||||
|
|
||||||
|
# Module-level logger, looked up under the name "max_kb".
max_kb = logging.getLogger("max_kb")
|
||||||
|
|
||||||
|
|
||||||
|
class XlsSplitHandle(BaseParseTableHandle):
    """Table handler that turns a legacy ``.xls`` workbook into paragraphs.

    Each sheet's first row is used as the header; every following row becomes
    one paragraph whose content is ``"header: value"`` pairs joined by ``"; "``.
    """

    def support(self, file, get_buffer):
        """Tell whether this handler should process ``file``.

        :param file: uploaded file object; only its ``name`` attribute is read here.
        :param get_buffer: callable that returns the raw content for ``file``.
        :return: ``True`` when the name ends with ``.xls`` (case-insensitive) and
            ``xlrd.inspect_format`` recognises the content, otherwise ``False``.
        """
        file_name: str = file.name.lower()
        # Test the extension first so the buffer is only fetched for candidates.
        if not file_name.endswith(".xls"):
            return False
        buffer = get_buffer(file)
        return bool(xlrd.inspect_format(content=buffer))

    def handle(self, file, get_buffer, save_image):
        """Parse every sheet of the workbook into a list of paragraphs.

        :param file: uploaded file object; ``file.name`` labels the fallback result.
        :param get_buffer: callable that returns the raw content for ``file``.
        :param save_image: not used by this handler.
        :return: a list with one ``{'name': <sheet name>, 'paragraphs': [...]}``
            entry per non-empty sheet, where each paragraph is
            ``{'title': '', 'content': <text>}``. If parsing fails, the error is
            logged and ``[{'name': file.name, 'paragraphs': []}]`` is returned.
        """
        buffer = get_buffer(file)
        try:
            wb = xlrd.open_workbook(file_contents=buffer)
            result = []
            for sheet in wb.sheets():
                # A sheet without rows has no header. Skip it explicitly: an
                # iterator is always truthy, so testing it would never skip and
                # next() would raise StopIteration, discarding the whole workbook.
                if sheet.nrows == 0:
                    continue
                ti = sheet.row_values(0)
                # Sheet names that do not contain "sheet" are appended to each
                # row's text; the check is the same for every row of the sheet.
                append_sheet_name = sheet.name.lower().find("sheet") < 0
                paragraphs = []
                for row_index in range(1, sheet.nrows):
                    parts = []
                    for i, c in enumerate(sheet.row_values(row_index)):
                        # NOTE(review): any falsy cell is skipped, which includes
                        # numeric 0 as well as empty strings - confirm intended.
                        if not c:
                            continue
                        t = str(ti[i]) if i < len(ti) else ""
                        t += (": " if t else "") + str(c)
                        parts.append(t)
                    content = "; ".join(parts)
                    if append_sheet_name:
                        content += " ——" + sheet.name
                    paragraphs.append({'title': '', 'content': content})
                result.append({'name': sheet.name, 'paragraphs': paragraphs})
        except Exception as e:
            # Best effort: report the failure and return an empty result for the
            # file. KeyboardInterrupt/SystemExit are deliberately not trapped.
            max_kb.error(f'excel split handle error: {e}')
            return [{'name': file.name, 'paragraphs': []}]
        return result
|
||||||
@ -10,10 +10,10 @@ from common.handle.impl.tools import xlsx_embed_cells_images
|
|||||||
max_kb = logging.getLogger("max_kb")
|
max_kb = logging.getLogger("max_kb")
|
||||||
|
|
||||||
|
|
||||||
class ExcelSplitHandle(BaseParseTableHandle):
|
class XlsxSplitHandle(BaseParseTableHandle):
|
||||||
def support(self, file, get_buffer):
|
def support(self, file, get_buffer):
|
||||||
file_name: str = file.name.lower()
|
file_name: str = file.name.lower()
|
||||||
if file_name.endswith('.xls') or file_name.endswith('.xlsx'):
|
if file_name.endswith('.xlsx'):
|
||||||
return True
|
return True
|
||||||
return False
|
return False
|
||||||
|
|
||||||
@ -34,13 +34,11 @@ class ExcelSplitHandle(BaseParseTableHandle):
|
|||||||
if not rows: continue
|
if not rows: continue
|
||||||
ti = list(rows[0])
|
ti = list(rows[0])
|
||||||
for r in list(rows[1:]):
|
for r in list(rows[1:]):
|
||||||
title = []
|
|
||||||
l = []
|
l = []
|
||||||
for i, c in enumerate(r):
|
for i, c in enumerate(r):
|
||||||
if not c.value:
|
if not c.value:
|
||||||
continue
|
continue
|
||||||
t = str(ti[i].value) if i < len(ti) else ""
|
t = str(ti[i].value) if i < len(ti) else ""
|
||||||
title.append(t)
|
|
||||||
content = str(c.value)
|
content = str(c.value)
|
||||||
image = image_dict.get(content, None)
|
image = image_dict.get(content, None)
|
||||||
if image is not None:
|
if image is not None:
|
||||||
@ -34,7 +34,8 @@ from common.handle.impl.qa.csv_parse_qa_handle import CsvParseQAHandle
|
|||||||
from common.handle.impl.qa.xls_parse_qa_handle import XlsParseQAHandle
|
from common.handle.impl.qa.xls_parse_qa_handle import XlsParseQAHandle
|
||||||
from common.handle.impl.qa.xlsx_parse_qa_handle import XlsxParseQAHandle
|
from common.handle.impl.qa.xlsx_parse_qa_handle import XlsxParseQAHandle
|
||||||
from common.handle.impl.table.csv_parse_table_handle import CsvSplitHandle
|
from common.handle.impl.table.csv_parse_table_handle import CsvSplitHandle
|
||||||
from common.handle.impl.table.excel_parse_table_handle import ExcelSplitHandle
|
from common.handle.impl.table.xlsx_parse_table_handle import XlsxSplitHandle
|
||||||
|
from common.handle.impl.table.xls_parse_table_handle import XlsSplitHandle
|
||||||
from common.handle.impl.text_split_handle import TextSplitHandle
|
from common.handle.impl.text_split_handle import TextSplitHandle
|
||||||
from common.mixins.api_mixin import ApiMixin
|
from common.mixins.api_mixin import ApiMixin
|
||||||
from common.util.common import post, flat_map
|
from common.util.common import post, flat_map
|
||||||
@ -53,7 +54,7 @@ from embedding.task.embedding import embedding_by_document, delete_embedding_by_
|
|||||||
from smartdoc.conf import PROJECT_DIR
|
from smartdoc.conf import PROJECT_DIR
|
||||||
|
|
||||||
parse_qa_handle_list = [XlsParseQAHandle(), CsvParseQAHandle(), XlsxParseQAHandle()]
|
parse_qa_handle_list = [XlsParseQAHandle(), CsvParseQAHandle(), XlsxParseQAHandle()]
|
||||||
parse_table_handle_list = [CsvSplitHandle(), ExcelSplitHandle()]
|
parse_table_handle_list = [CsvSplitHandle(), XlsSplitHandle(), XlsxSplitHandle()]
|
||||||
|
|
||||||
|
|
||||||
class FileBufferHandle:
|
class FileBufferHandle:
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user