fix: handle string type for limit and improve error logging in pdf_split_handle

--bug=1057493 --user=刘瑞斌 【知识库】上传文档,使用高级分段报错 https://www.tapd.cn/62980211/s/1720110
This commit is contained in:
CaptainB 2025-06-30 12:40:11 +08:00
parent 049c0e0bb0
commit 82a2203be6
14 changed files with 63 additions and 24 deletions

View File

@ -8,10 +8,12 @@
""" """
import csv import csv
import io import io
import traceback
from charset_normalizer import detect from charset_normalizer import detect
from common.handle.base_parse_qa_handle import BaseParseQAHandle, get_title_row_index_dict, get_row_value from common.handle.base_parse_qa_handle import BaseParseQAHandle, get_title_row_index_dict, get_row_value
from common.utils.logger import maxkb_logger
def read_csv_standard(file_path): def read_csv_standard(file_path):
@ -56,4 +58,5 @@ class CsvParseQAHandle(BaseParseQAHandle):
'problem_list': problem_list}) 'problem_list': problem_list})
return [{'name': file.name, 'paragraphs': paragraph_list}] return [{'name': file.name, 'paragraphs': paragraph_list}]
except Exception as e: except Exception as e:
maxkb_logger.error(f"Error processing CSV file {file.name}: {e}, {traceback.format_exc()}")
return [{'name': file.name, 'paragraphs': []}] return [{'name': file.name, 'paragraphs': []}]

View File

@ -6,10 +6,12 @@
@date2024/5/21 14:59 @date2024/5/21 14:59
@desc: @desc:
""" """
import traceback
import xlrd import xlrd
from common.handle.base_parse_qa_handle import BaseParseQAHandle, get_title_row_index_dict, get_row_value from common.handle.base_parse_qa_handle import BaseParseQAHandle, get_title_row_index_dict, get_row_value
from common.utils.logger import maxkb_logger
def handle_sheet(file_name, sheet): def handle_sheet(file_name, sheet):
@ -58,4 +60,5 @@ class XlsParseQAHandle(BaseParseQAHandle):
sheet.name, sheet) for sheet sheet.name, sheet) for sheet
in worksheets] if row is not None] in worksheets] if row is not None]
except Exception as e: except Exception as e:
maxkb_logger.error(f"Error processing XLS file {file.name}: {e}, {traceback.format_exc()}")
return [{'name': file.name, 'paragraphs': []}] return [{'name': file.name, 'paragraphs': []}]

View File

@ -7,11 +7,13 @@
@desc: @desc:
""" """
import io import io
import traceback
import openpyxl import openpyxl
from common.handle.base_parse_qa_handle import BaseParseQAHandle, get_title_row_index_dict, get_row_value from common.handle.base_parse_qa_handle import BaseParseQAHandle, get_title_row_index_dict, get_row_value
from common.handle.impl.common_handle import xlsx_embed_cells_images from common.handle.impl.common_handle import xlsx_embed_cells_images
from common.utils.logger import maxkb_logger
def handle_sheet(file_name, sheet, image_dict): def handle_sheet(file_name, sheet, image_dict):
@ -69,4 +71,5 @@ class XlsxParseQAHandle(BaseParseQAHandle):
sheet.title, sheet, image_dict) for sheet sheet.title, sheet, image_dict) for sheet
in worksheets] if row is not None] in worksheets] if row is not None]
except Exception as e: except Exception as e:
maxkb_logger.error(f"Error processing XLSX file {file.name}: {e}, {traceback.format_exc()}")
return [{'name': file.name, 'paragraphs': []}] return [{'name': file.name, 'paragraphs': []}]

View File

@ -1,5 +1,6 @@
# coding=utf-8 # coding=utf-8
import logging import logging
import traceback
from charset_normalizer import detect from charset_normalizer import detect
@ -19,7 +20,7 @@ class CsvParseTableHandle(BaseParseTableHandle):
try: try:
content = buffer.decode(detect(buffer)['encoding']) content = buffer.decode(detect(buffer)['encoding'])
except BaseException as e: except BaseException as e:
maxkb_logger.error(f'csv split handle error: {e}') maxkb_logger.error(f"Error processing CSV file {file.name}: {e}, {traceback.format_exc()}")
return [{'name': file.name, 'paragraphs': []}] return [{'name': file.name, 'paragraphs': []}]
csv_model = content.split('\n') csv_model = content.split('\n')

View File

@ -1,5 +1,6 @@
# coding=utf-8 # coding=utf-8
import logging import logging
import traceback
import xlrd import xlrd
@ -55,7 +56,7 @@ class XlsParseTableHandle(BaseParseTableHandle):
result.append({'name': sheet.name, 'paragraphs': paragraphs}) result.append({'name': sheet.name, 'paragraphs': paragraphs})
except BaseException as e: except BaseException as e:
maxkb_logger.error(f'excel split handle error: {e}') maxkb_logger.error(f"Error processing XLS file {file.name}: {e}, {traceback.format_exc()}")
return [{'name': file.name, 'paragraphs': []}] return [{'name': file.name, 'paragraphs': []}]
return result return result

View File

@ -1,6 +1,7 @@
# coding=utf-8 # coding=utf-8
import io import io
import logging import logging
import traceback
from openpyxl import load_workbook from openpyxl import load_workbook
@ -73,7 +74,7 @@ class XlsxParseTableHandle(BaseParseTableHandle):
result.append({'name': sheetname, 'paragraphs': paragraphs}) result.append({'name': sheetname, 'paragraphs': paragraphs})
except BaseException as e: except BaseException as e:
maxkb_logger.error(f'excel split handle error: {e}') maxkb_logger.error(f"Error processing XLSX file {file.name}: {e}, {traceback.format_exc()}")
return [{'name': file.name, 'paragraphs': []}] return [{'name': file.name, 'paragraphs': []}]
return result return result

View File

@ -9,11 +9,13 @@
import csv import csv
import io import io
import os import os
import traceback
from typing import List from typing import List
from charset_normalizer import detect from charset_normalizer import detect
from common.handle.base_split_handle import BaseSplitHandle from common.handle.base_split_handle import BaseSplitHandle
from common.utils.logger import maxkb_logger
def post_cell(cell_value): def post_cell(cell_value):
@ -60,6 +62,7 @@ class CsvSplitHandle(BaseSplitHandle):
paragraphs.append({'content': result_item_content, 'title': ''}) paragraphs.append({'content': result_item_content, 'title': ''})
return result return result
except Exception as e: except Exception as e:
maxkb_logger.error(f"Error processing CSV file {file.name}: {e}, {traceback.format_exc()}")
return result return result
def get_content(self, file, save_image): def get_content(self, file, save_image):

View File

@ -7,7 +7,6 @@
@desc: @desc:
""" """
import io import io
import logging
import os import os
import re import re
import traceback import traceback
@ -155,7 +154,7 @@ class DocSplitHandle(BaseSplitHandle):
return title return title
except Exception as e: except Exception as e:
traceback.print_exc() maxkb_logger.error(f"Error processing DOC file: {e}, {traceback.format_exc()}")
return paragraph.text return paragraph.text
return get_paragraph_txt(paragraph, doc, images_list, get_image_id) return get_paragraph_txt(paragraph, doc, images_list, get_image_id)
@ -207,12 +206,15 @@ class DocSplitHandle(BaseSplitHandle):
else: else:
split_model = SplitModel(default_pattern_list, with_filter=with_filter, limit=limit) split_model = SplitModel(default_pattern_list, with_filter=with_filter, limit=limit)
except BaseException as e: except BaseException as e:
traceback.print_exception(e) maxkb_logger.error(f"Error processing XLSX file {file.name}: {e}, {traceback.format_exc()}")
return {'name': file_name, return {
'content': []} 'name': file_name,
return {'name': file_name, 'content': []
'content': split_model.parse(content) }
} return {
'name': file_name,
'content': split_model.parse(content)
}
def support(self, file, get_buffer): def support(self, file, get_buffer):
file_name: str = file.name.lower() file_name: str = file.name.lower()

View File

@ -15,6 +15,7 @@ from charset_normalizer import detect
from html2text import html2text from html2text import html2text
from common.handle.base_split_handle import BaseSplitHandle from common.handle.base_split_handle import BaseSplitHandle
from common.utils.logger import maxkb_logger
from common.utils.split_model import SplitModel from common.utils.split_model import SplitModel
default_pattern_list = [re.compile('(?<=^)# .*|(?<=\\n)# .*'), default_pattern_list = [re.compile('(?<=^)# .*|(?<=\\n)# .*'),
@ -55,11 +56,15 @@ class HTMLSplitHandle(BaseSplitHandle):
content = buffer.decode(encoding) content = buffer.decode(encoding)
content = html2text(content) content = html2text(content)
except BaseException as e: except BaseException as e:
return {'name': file.name, maxkb_logger.error(f"Error processing HTML file {file.name}: {e}, {traceback.format_exc()}")
'content': []}
return {'name': file.name, return {
'content': split_model.parse(content) 'name': file.name, 'content': []
} }
return {
'name': file.name,
'content': split_model.parse(content)
}
def get_content(self, file, save_image): def get_content(self, file, save_image):
buffer = file.read() buffer = file.read()

View File

@ -6,7 +6,6 @@
@date2024/3/27 18:19 @date2024/3/27 18:19
@desc: @desc:
""" """
import logging
import os import os
import re import re
import tempfile import tempfile
@ -31,7 +30,6 @@ default_pattern_list = [re.compile('(?<=^)# .*|(?<=\\n)# .*'),
re.compile("(?<!\n)\n\n+")] re.compile("(?<!\n)\n\n+")]
def check_links_in_pdf(doc): def check_links_in_pdf(doc):
for page_number in range(len(doc)): for page_number in range(len(doc)):
page = doc[page_number] page = doc[page_number]
@ -54,6 +52,8 @@ class PdfSplitHandle(BaseSplitHandle):
pdf_document = fitz.open(temp_file_path) pdf_document = fitz.open(temp_file_path)
try: try:
if type(limit) is str:
limit = int(limit)
# 处理有目录的pdf # 处理有目录的pdf
result = self.handle_toc(pdf_document, limit) result = self.handle_toc(pdf_document, limit)
if result is not None: if result is not None:
@ -72,17 +72,20 @@ class PdfSplitHandle(BaseSplitHandle):
else: else:
split_model = SplitModel(default_pattern_list, with_filter=with_filter, limit=limit) split_model = SplitModel(default_pattern_list, with_filter=with_filter, limit=limit)
except BaseException as e: except BaseException as e:
maxkb_logger.error(f"File: {file.name}, error: {e}") maxkb_logger.error(f"File: {file.name}, error: {e}, {traceback.format_exc()}")
return {'name': file.name, return {
'content': []} 'name': file.name,
'content': []
}
finally: finally:
pdf_document.close() pdf_document.close()
# 处理完后可以删除临时文件 # 处理完后可以删除临时文件
os.remove(temp_file_path) os.remove(temp_file_path)
return {'name': file.name, return {
'content': split_model.parse(content) 'name': file.name,
} 'content': split_model.parse(content)
}
@staticmethod @staticmethod
def handle_pdf_content(file, pdf_document): def handle_pdf_content(file, pdf_document):

View File

@ -13,6 +13,7 @@ from typing import List
from charset_normalizer import detect from charset_normalizer import detect
from common.handle.base_split_handle import BaseSplitHandle from common.handle.base_split_handle import BaseSplitHandle
from common.utils.logger import maxkb_logger
from common.utils.split_model import SplitModel from common.utils.split_model import SplitModel
default_pattern_list = [ default_pattern_list = [
@ -47,6 +48,7 @@ class TextSplitHandle(BaseSplitHandle):
try: try:
content = buffer.decode(detect(buffer)['encoding']) content = buffer.decode(detect(buffer)['encoding'])
except BaseException as e: except BaseException as e:
maxkb_logger.error(f"Error processing TEXT file {file.name}: {e}, {traceback.format_exc()}")
return {'name': file.name, 'content': []} return {'name': file.name, 'content': []}
return {'name': file.name, 'content': split_model.parse(content)} return {'name': file.name, 'content': split_model.parse(content)}

View File

@ -6,11 +6,13 @@
@date2024/5/21 14:59 @date2024/5/21 14:59
@desc: @desc:
""" """
import traceback
from typing import List from typing import List
import xlrd import xlrd
from common.handle.base_split_handle import BaseSplitHandle from common.handle.base_split_handle import BaseSplitHandle
from common.utils.logger import maxkb_logger
def post_cell(cell_value): def post_cell(cell_value):
@ -58,6 +60,8 @@ class XlsSplitHandle(BaseSplitHandle):
def handle(self, file, pattern_list: List, with_filter: bool, limit: int, get_buffer, save_image): def handle(self, file, pattern_list: List, with_filter: bool, limit: int, get_buffer, save_image):
buffer = get_buffer(file) buffer = get_buffer(file)
try: try:
if type(limit) is str:
limit = int(limit)
workbook = xlrd.open_workbook(file_contents=buffer) workbook = xlrd.open_workbook(file_contents=buffer)
worksheets = workbook.sheets() worksheets = workbook.sheets()
worksheets_size = len(worksheets) worksheets_size = len(worksheets)
@ -67,6 +71,7 @@ class XlsSplitHandle(BaseSplitHandle):
sheet.name, sheet, limit) for sheet sheet.name, sheet, limit) for sheet
in worksheets] if row is not None] in worksheets] if row is not None]
except Exception as e: except Exception as e:
maxkb_logger.error(f"Error processing XLS file {file.name}: {e}, {traceback.format_exc()}")
return [{'name': file.name, 'content': []}] return [{'name': file.name, 'content': []}]
def get_content(self, file, save_image): def get_content(self, file, save_image):

View File

@ -7,12 +7,14 @@
@desc: @desc:
""" """
import io import io
import traceback
from typing import List from typing import List
import openpyxl import openpyxl
from common.handle.base_split_handle import BaseSplitHandle from common.handle.base_split_handle import BaseSplitHandle
from common.handle.impl.common_handle import xlsx_embed_cells_images from common.handle.impl.common_handle import xlsx_embed_cells_images
from common.utils.logger import maxkb_logger
def post_cell(image_dict, cell_value): def post_cell(image_dict, cell_value):
@ -64,6 +66,8 @@ class XlsxSplitHandle(BaseSplitHandle):
def handle(self, file, pattern_list: List, with_filter: bool, limit: int, get_buffer, save_image): def handle(self, file, pattern_list: List, with_filter: bool, limit: int, get_buffer, save_image):
buffer = get_buffer(file) buffer = get_buffer(file)
try: try:
if type(limit) is str:
limit = int(limit)
workbook = openpyxl.load_workbook(io.BytesIO(buffer)) workbook = openpyxl.load_workbook(io.BytesIO(buffer))
try: try:
image_dict: dict = xlsx_embed_cells_images(io.BytesIO(buffer)) image_dict: dict = xlsx_embed_cells_images(io.BytesIO(buffer))
@ -80,6 +84,7 @@ class XlsxSplitHandle(BaseSplitHandle):
sheet.title, sheet, image_dict, limit) for sheet sheet.title, sheet, image_dict, limit) for sheet
in worksheets] if row is not None] in worksheets] if row is not None]
except Exception as e: except Exception as e:
maxkb_logger.error(f"Error processing XLSX file {file.name}: {e}, {traceback.format_exc()}")
return [{'name': file.name, 'content': []}] return [{'name': file.name, 'content': []}]
def get_content(self, file, save_image): def get_content(self, file, save_image):

View File

@ -119,6 +119,8 @@ def filter_image_file(result_list: list, image_list):
class ZipSplitHandle(BaseSplitHandle): class ZipSplitHandle(BaseSplitHandle):
def handle(self, file, pattern_list: List, with_filter: bool, limit: int, get_buffer, save_image): def handle(self, file, pattern_list: List, with_filter: bool, limit: int, get_buffer, save_image):
if type(limit) is str:
limit = int(limit)
buffer = get_buffer(file) buffer = get_buffer(file)
bytes_io = io.BytesIO(buffer) bytes_io = io.BytesIO(buffer)
result = [] result = []