fix: handle string type for limit and improve error logging in pdf_split_handle

--bug=1057493 --user=刘瑞斌【知识库】上传文档，使用高级分段报错 https://www.tapd.cn/62980211/s/1720110
2025-06-30 12:40:11 +08:00 · 2025-06-30 12:40:11 +08:00 · 82a2203be6
commit 82a2203be6
parent 049c0e0bb0
14 changed files with 63 additions and 24 deletions
--- a/apps/common/handle/impl/qa/csv_parse_qa_handle.py
+++ b/apps/common/handle/impl/qa/csv_parse_qa_handle.py
@ -8,10 +8,12 @@
 """
 import csv
 import io
 import traceback
 from charset_normalizer import detect
 from common.handle.base_parse_qa_handle import BaseParseQAHandle, get_title_row_index_dict, get_row_value
 from common.utils.logger import maxkb_logger
 def read_csv_standard(file_path):
@ -56,4 +58,5 @@ class CsvParseQAHandle(BaseParseQAHandle):
                                       'problem_list': problem_list})
            return [{'name': file.name, 'paragraphs': paragraph_list}]
        except Exception as e:
            maxkb_logger.error(f"Error processing CSV file {file.name}: {e}, {traceback.format_exc()}")
            return [{'name': file.name, 'paragraphs': []}]
--- a/apps/common/handle/impl/qa/xls_parse_qa_handle.py
+++ b/apps/common/handle/impl/qa/xls_parse_qa_handle.py
@ -6,10 +6,12 @@
    @date：2024/5/21 14:59
    @desc:
 """
 import traceback
 import xlrd
 from common.handle.base_parse_qa_handle import BaseParseQAHandle, get_title_row_index_dict, get_row_value
 from common.utils.logger import maxkb_logger
 def handle_sheet(file_name, sheet):
@ -58,4 +60,5 @@ class XlsParseQAHandle(BaseParseQAHandle):
                        sheet.name, sheet) for sheet
                     in worksheets] if row is not None]
        except Exception as e:
            maxkb_logger.error(f"Error processing XLS file {file.name}: {e}, {traceback.format_exc()}")
            return [{'name': file.name, 'paragraphs': []}]
--- a/apps/common/handle/impl/qa/xlsx_parse_qa_handle.py
+++ b/apps/common/handle/impl/qa/xlsx_parse_qa_handle.py
@ -7,11 +7,13 @@
    @desc:
 """
 import io
 import traceback
 import openpyxl
 from common.handle.base_parse_qa_handle import BaseParseQAHandle, get_title_row_index_dict, get_row_value
 from common.handle.impl.common_handle import xlsx_embed_cells_images
 from common.utils.logger import maxkb_logger
 def handle_sheet(file_name, sheet, image_dict):
@ -69,4 +71,5 @@ class XlsxParseQAHandle(BaseParseQAHandle):
                        sheet.title, sheet, image_dict) for sheet
                     in worksheets] if row is not None]
        except Exception as e:
            maxkb_logger.error(f"Error processing XLSX file {file.name}: {e}, {traceback.format_exc()}")
            return [{'name': file.name, 'paragraphs': []}]
--- a/apps/common/handle/impl/table/csv_parse_table_handle.py
+++ b/apps/common/handle/impl/table/csv_parse_table_handle.py
@ -1,5 +1,6 @@
 # coding=utf-8
 import logging
 import traceback
 from charset_normalizer import detect
@ -19,7 +20,7 @@ class CsvParseTableHandle(BaseParseTableHandle):
        try:
            content = buffer.decode(detect(buffer)['encoding'])
        except BaseException as e:
-            maxkb_logger.error(f'csv split handle error: {e}')
+            maxkb_logger.error(f"Error processing CSV file {file.name}: {e}, {traceback.format_exc()}")
            return [{'name': file.name, 'paragraphs': []}]
        csv_model = content.split('\n')
--- a/apps/common/handle/impl/table/xls_parse_table_handle.py
+++ b/apps/common/handle/impl/table/xls_parse_table_handle.py
@ -1,5 +1,6 @@
 # coding=utf-8
 import logging
 import traceback
 import xlrd
@ -55,7 +56,7 @@ class XlsParseTableHandle(BaseParseTableHandle):
                result.append({'name': sheet.name, 'paragraphs': paragraphs})
        except BaseException as e:
-            maxkb_logger.error(f'excel split handle error: {e}')
+            maxkb_logger.error(f"Error processing XLS file {file.name}: {e}, {traceback.format_exc()}")
            return [{'name': file.name, 'paragraphs': []}]
        return result
--- a/apps/common/handle/impl/table/xlsx_parse_table_handle.py
+++ b/apps/common/handle/impl/table/xlsx_parse_table_handle.py
@ -1,6 +1,7 @@
 # coding=utf-8
 import io
 import logging
 import traceback
 from openpyxl import load_workbook
@ -73,7 +74,7 @@ class XlsxParseTableHandle(BaseParseTableHandle):
                result.append({'name': sheetname, 'paragraphs': paragraphs})
        except BaseException as e:
-            maxkb_logger.error(f'excel split handle error: {e}')
+            maxkb_logger.error(f"Error processing XLSX file {file.name}: {e}, {traceback.format_exc()}")
            return [{'name': file.name, 'paragraphs': []}]
        return result
--- a/apps/common/handle/impl/text/csv_split_handle.py
+++ b/apps/common/handle/impl/text/csv_split_handle.py
@ -9,11 +9,13 @@
 import csv
 import io
 import os
 import traceback
 from typing import List
 from charset_normalizer import detect
 from common.handle.base_split_handle import BaseSplitHandle
 from common.utils.logger import maxkb_logger
 def post_cell(cell_value):
@ -60,6 +62,7 @@ class CsvSplitHandle(BaseSplitHandle):
                paragraphs.append({'content': result_item_content, 'title': ''})
            return result
        except Exception as e:
            maxkb_logger.error(f"Error processing CSV file {file.name}: {e}, {traceback.format_exc()}")
            return result
    def get_content(self, file, save_image):
--- a/apps/common/handle/impl/text/doc_split_handle.py
+++ b/apps/common/handle/impl/text/doc_split_handle.py
@ -7,7 +7,6 @@
    @desc:
 """
 import io
 import logging
 import os
 import re
 import traceback
@ -155,7 +154,7 @@ class DocSplitHandle(BaseSplitHandle):
                return title
        except Exception as e:
-            traceback.print_exc()
+            maxkb_logger.error(f"Error processing DOC file: {e}, {traceback.format_exc()}")
            return paragraph.text
        return get_paragraph_txt(paragraph, doc, images_list, get_image_id)
@ -207,12 +206,15 @@ class DocSplitHandle(BaseSplitHandle):
            else:
                split_model = SplitModel(default_pattern_list, with_filter=with_filter, limit=limit)
        except BaseException as e:
-            traceback.print_exception(e)
+            maxkb_logger.error(f"Error processing DOC file {file.name}: {e}, {traceback.format_exc()}")
-            return {'name': file_name,
+            return {
-                    'content': []}
+                'name': file_name,
-        return {'name': file_name,
+                'content': []
-                'content': split_model.parse(content)
+            }
-                }
+        return {
            'name': file_name,
            'content': split_model.parse(content)
        }
    def support(self, file, get_buffer):
        file_name: str = file.name.lower()
--- a/apps/common/handle/impl/text/html_split_handle.py
+++ b/apps/common/handle/impl/text/html_split_handle.py
@ -15,6 +15,7 @@ from charset_normalizer import detect
 from html2text import html2text
 from common.handle.base_split_handle import BaseSplitHandle
 from common.utils.logger import maxkb_logger
 from common.utils.split_model import SplitModel
 default_pattern_list = [re.compile('(?<=^)# .*|(?<=\\n)# .*'),
@ -55,11 +56,15 @@ class HTMLSplitHandle(BaseSplitHandle):
            content = buffer.decode(encoding)
            content = html2text(content)
        except BaseException as e:
-            return {'name': file.name,
+            maxkb_logger.error(f"Error processing HTML file {file.name}: {e}, {traceback.format_exc()}")
-                    'content': []}
+
-        return {'name': file.name,
+            return {
-                'content': split_model.parse(content)
+                'name': file.name, 'content': []
-                }
+            }
        return {
            'name': file.name,
            'content': split_model.parse(content)
        }
    def get_content(self, file, save_image):
        buffer = file.read()
--- a/apps/common/handle/impl/text/pdf_split_handle.py
+++ b/apps/common/handle/impl/text/pdf_split_handle.py
@ -6,7 +6,6 @@
    @date：2024/3/27 18:19
    @desc:
 """
 import logging
 import os
 import re
 import tempfile
@ -31,7 +30,6 @@ default_pattern_list = [re.compile('(?<=^)# .*|(?<=\\n)# .*'),
                        re.compile("(?<!\n)\n\n+")]
 def check_links_in_pdf(doc):
    for page_number in range(len(doc)):
        page = doc[page_number]
@ -54,6 +52,8 @@ class PdfSplitHandle(BaseSplitHandle):
        pdf_document = fitz.open(temp_file_path)
        try:
            if type(limit) is str:
                limit = int(limit)
            # 处理有目录的pdf
            result = self.handle_toc(pdf_document, limit)
            if result is not None:
@ -72,17 +72,20 @@ class PdfSplitHandle(BaseSplitHandle):
            else:
                split_model = SplitModel(default_pattern_list, with_filter=with_filter, limit=limit)
        except BaseException as e:
-            maxkb_logger.error(f"File: {file.name}, error: {e}")
+            maxkb_logger.error(f"File: {file.name}, error: {e}, {traceback.format_exc()}")
-            return {'name': file.name,
+            return {
-                    'content': []}
+                'name': file.name,
                'content': []
            }
        finally:
            pdf_document.close()
            # 处理完后可以删除临时文件
            os.remove(temp_file_path)
-        return {'name': file.name,
+        return {
-                'content': split_model.parse(content)
+            'name': file.name,
-                }
+            'content': split_model.parse(content)
        }
    @staticmethod
    def handle_pdf_content(file, pdf_document):
--- a/apps/common/handle/impl/text/text_split_handle.py
+++ b/apps/common/handle/impl/text/text_split_handle.py
@ -13,6 +13,7 @@ from typing import List
 from charset_normalizer import detect
 from common.handle.base_split_handle import BaseSplitHandle
 from common.utils.logger import maxkb_logger
 from common.utils.split_model import SplitModel
 default_pattern_list = [
@ -47,6 +48,7 @@ class TextSplitHandle(BaseSplitHandle):
        try:
            content = buffer.decode(detect(buffer)['encoding'])
        except BaseException as e:
            maxkb_logger.error(f"Error processing TEXT file {file.name}: {e}, {traceback.format_exc()}")
            return {'name': file.name, 'content': []}
        return {'name': file.name, 'content': split_model.parse(content)}
--- a/apps/common/handle/impl/text/xls_split_handle.py
+++ b/apps/common/handle/impl/text/xls_split_handle.py
@ -6,11 +6,13 @@
    @date：2024/5/21 14:59
    @desc:
 """
 import traceback
 from typing import List
 import xlrd
 from common.handle.base_split_handle import BaseSplitHandle
 from common.utils.logger import maxkb_logger
 def post_cell(cell_value):
@ -58,6 +60,8 @@ class XlsSplitHandle(BaseSplitHandle):
    def handle(self, file, pattern_list: List, with_filter: bool, limit: int, get_buffer, save_image):
        buffer = get_buffer(file)
        try:
            if type(limit) is str:
                limit = int(limit)
            workbook = xlrd.open_workbook(file_contents=buffer)
            worksheets = workbook.sheets()
            worksheets_size = len(worksheets)
@ -67,6 +71,7 @@ class XlsSplitHandle(BaseSplitHandle):
                        sheet.name, sheet, limit) for sheet
                     in worksheets] if row is not None]
        except Exception as e:
            maxkb_logger.error(f"Error processing XLS file {file.name}: {e}, {traceback.format_exc()}")
            return [{'name': file.name, 'content': []}]
    def get_content(self, file, save_image):
--- a/apps/common/handle/impl/text/xlsx_split_handle.py
+++ b/apps/common/handle/impl/text/xlsx_split_handle.py
@ -7,12 +7,14 @@
    @desc:
 """
 import io
 import traceback
 from typing import List
 import openpyxl
 from common.handle.base_split_handle import BaseSplitHandle
 from common.handle.impl.common_handle import xlsx_embed_cells_images
 from common.utils.logger import maxkb_logger
 def post_cell(image_dict, cell_value):
@ -64,6 +66,8 @@ class XlsxSplitHandle(BaseSplitHandle):
    def handle(self, file, pattern_list: List, with_filter: bool, limit: int, get_buffer, save_image):
        buffer = get_buffer(file)
        try:
            if type(limit) is str:
                limit = int(limit)
            workbook = openpyxl.load_workbook(io.BytesIO(buffer))
            try:
                image_dict: dict = xlsx_embed_cells_images(io.BytesIO(buffer))
@ -80,6 +84,7 @@ class XlsxSplitHandle(BaseSplitHandle):
                        sheet.title, sheet, image_dict, limit) for sheet
                     in worksheets] if row is not None]
        except Exception as e:
            maxkb_logger.error(f"Error processing XLSX file {file.name}: {e}, {traceback.format_exc()}")
            return [{'name': file.name, 'content': []}]
    def get_content(self, file, save_image):
--- a/apps/common/handle/impl/text/zip_split_handle.py
+++ b/apps/common/handle/impl/text/zip_split_handle.py
@ -119,6 +119,8 @@ def filter_image_file(result_list: list, image_list):
 class ZipSplitHandle(BaseSplitHandle):
    def handle(self, file, pattern_list: List, with_filter: bool, limit: int, get_buffer, save_image):
        if type(limit) is str:
            limit = int(limit)
        buffer = get_buffer(file)
        bytes_io = io.BytesIO(buffer)
        result = []