fix: handle string type for limit and improve error logging in pdf_split_handle
--bug=1057493 --user=刘瑞斌 【知识库】上传文档,使用高级分段报错 https://www.tapd.cn/62980211/s/1720110
This commit is contained in:
parent
049c0e0bb0
commit
82a2203be6
@ -8,10 +8,12 @@
|
|||||||
"""
|
"""
|
||||||
import csv
|
import csv
|
||||||
import io
|
import io
|
||||||
|
import traceback
|
||||||
|
|
||||||
from charset_normalizer import detect
|
from charset_normalizer import detect
|
||||||
|
|
||||||
from common.handle.base_parse_qa_handle import BaseParseQAHandle, get_title_row_index_dict, get_row_value
|
from common.handle.base_parse_qa_handle import BaseParseQAHandle, get_title_row_index_dict, get_row_value
|
||||||
|
from common.utils.logger import maxkb_logger
|
||||||
|
|
||||||
|
|
||||||
def read_csv_standard(file_path):
|
def read_csv_standard(file_path):
|
||||||
@ -56,4 +58,5 @@ class CsvParseQAHandle(BaseParseQAHandle):
|
|||||||
'problem_list': problem_list})
|
'problem_list': problem_list})
|
||||||
return [{'name': file.name, 'paragraphs': paragraph_list}]
|
return [{'name': file.name, 'paragraphs': paragraph_list}]
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
maxkb_logger.error(f"Error processing CSV file {file.name}: {e}, {traceback.format_exc()}")
|
||||||
return [{'name': file.name, 'paragraphs': []}]
|
return [{'name': file.name, 'paragraphs': []}]
|
||||||
|
|||||||
@ -6,10 +6,12 @@
|
|||||||
@date:2024/5/21 14:59
|
@date:2024/5/21 14:59
|
||||||
@desc:
|
@desc:
|
||||||
"""
|
"""
|
||||||
|
import traceback
|
||||||
|
|
||||||
import xlrd
|
import xlrd
|
||||||
|
|
||||||
from common.handle.base_parse_qa_handle import BaseParseQAHandle, get_title_row_index_dict, get_row_value
|
from common.handle.base_parse_qa_handle import BaseParseQAHandle, get_title_row_index_dict, get_row_value
|
||||||
|
from common.utils.logger import maxkb_logger
|
||||||
|
|
||||||
|
|
||||||
def handle_sheet(file_name, sheet):
|
def handle_sheet(file_name, sheet):
|
||||||
@ -58,4 +60,5 @@ class XlsParseQAHandle(BaseParseQAHandle):
|
|||||||
sheet.name, sheet) for sheet
|
sheet.name, sheet) for sheet
|
||||||
in worksheets] if row is not None]
|
in worksheets] if row is not None]
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
maxkb_logger.error(f"Error processing XLS file {file.name}: {e}, {traceback.format_exc()}")
|
||||||
return [{'name': file.name, 'paragraphs': []}]
|
return [{'name': file.name, 'paragraphs': []}]
|
||||||
|
|||||||
@ -7,11 +7,13 @@
|
|||||||
@desc:
|
@desc:
|
||||||
"""
|
"""
|
||||||
import io
|
import io
|
||||||
|
import traceback
|
||||||
|
|
||||||
import openpyxl
|
import openpyxl
|
||||||
|
|
||||||
from common.handle.base_parse_qa_handle import BaseParseQAHandle, get_title_row_index_dict, get_row_value
|
from common.handle.base_parse_qa_handle import BaseParseQAHandle, get_title_row_index_dict, get_row_value
|
||||||
from common.handle.impl.common_handle import xlsx_embed_cells_images
|
from common.handle.impl.common_handle import xlsx_embed_cells_images
|
||||||
|
from common.utils.logger import maxkb_logger
|
||||||
|
|
||||||
|
|
||||||
def handle_sheet(file_name, sheet, image_dict):
|
def handle_sheet(file_name, sheet, image_dict):
|
||||||
@ -69,4 +71,5 @@ class XlsxParseQAHandle(BaseParseQAHandle):
|
|||||||
sheet.title, sheet, image_dict) for sheet
|
sheet.title, sheet, image_dict) for sheet
|
||||||
in worksheets] if row is not None]
|
in worksheets] if row is not None]
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
maxkb_logger.error(f"Error processing XLSX file {file.name}: {e}, {traceback.format_exc()}")
|
||||||
return [{'name': file.name, 'paragraphs': []}]
|
return [{'name': file.name, 'paragraphs': []}]
|
||||||
|
|||||||
@ -1,5 +1,6 @@
|
|||||||
# coding=utf-8
|
# coding=utf-8
|
||||||
import logging
|
import logging
|
||||||
|
import traceback
|
||||||
|
|
||||||
from charset_normalizer import detect
|
from charset_normalizer import detect
|
||||||
|
|
||||||
@ -19,7 +20,7 @@ class CsvParseTableHandle(BaseParseTableHandle):
|
|||||||
try:
|
try:
|
||||||
content = buffer.decode(detect(buffer)['encoding'])
|
content = buffer.decode(detect(buffer)['encoding'])
|
||||||
except BaseException as e:
|
except BaseException as e:
|
||||||
maxkb_logger.error(f'csv split handle error: {e}')
|
maxkb_logger.error(f"Error processing CSV file {file.name}: {e}, {traceback.format_exc()}")
|
||||||
return [{'name': file.name, 'paragraphs': []}]
|
return [{'name': file.name, 'paragraphs': []}]
|
||||||
|
|
||||||
csv_model = content.split('\n')
|
csv_model = content.split('\n')
|
||||||
|
|||||||
@ -1,5 +1,6 @@
|
|||||||
# coding=utf-8
|
# coding=utf-8
|
||||||
import logging
|
import logging
|
||||||
|
import traceback
|
||||||
|
|
||||||
import xlrd
|
import xlrd
|
||||||
|
|
||||||
@ -55,7 +56,7 @@ class XlsParseTableHandle(BaseParseTableHandle):
|
|||||||
result.append({'name': sheet.name, 'paragraphs': paragraphs})
|
result.append({'name': sheet.name, 'paragraphs': paragraphs})
|
||||||
|
|
||||||
except BaseException as e:
|
except BaseException as e:
|
||||||
maxkb_logger.error(f'excel split handle error: {e}')
|
maxkb_logger.error(f"Error processing XLS file {file.name}: {e}, {traceback.format_exc()}")
|
||||||
return [{'name': file.name, 'paragraphs': []}]
|
return [{'name': file.name, 'paragraphs': []}]
|
||||||
return result
|
return result
|
||||||
|
|
||||||
|
|||||||
@ -1,6 +1,7 @@
|
|||||||
# coding=utf-8
|
# coding=utf-8
|
||||||
import io
|
import io
|
||||||
import logging
|
import logging
|
||||||
|
import traceback
|
||||||
|
|
||||||
from openpyxl import load_workbook
|
from openpyxl import load_workbook
|
||||||
|
|
||||||
@ -73,7 +74,7 @@ class XlsxParseTableHandle(BaseParseTableHandle):
|
|||||||
result.append({'name': sheetname, 'paragraphs': paragraphs})
|
result.append({'name': sheetname, 'paragraphs': paragraphs})
|
||||||
|
|
||||||
except BaseException as e:
|
except BaseException as e:
|
||||||
maxkb_logger.error(f'excel split handle error: {e}')
|
maxkb_logger.error(f"Error processing XLSX file {file.name}: {e}, {traceback.format_exc()}")
|
||||||
return [{'name': file.name, 'paragraphs': []}]
|
return [{'name': file.name, 'paragraphs': []}]
|
||||||
return result
|
return result
|
||||||
|
|
||||||
|
|||||||
@ -9,11 +9,13 @@
|
|||||||
import csv
|
import csv
|
||||||
import io
|
import io
|
||||||
import os
|
import os
|
||||||
|
import traceback
|
||||||
from typing import List
|
from typing import List
|
||||||
|
|
||||||
from charset_normalizer import detect
|
from charset_normalizer import detect
|
||||||
|
|
||||||
from common.handle.base_split_handle import BaseSplitHandle
|
from common.handle.base_split_handle import BaseSplitHandle
|
||||||
|
from common.utils.logger import maxkb_logger
|
||||||
|
|
||||||
|
|
||||||
def post_cell(cell_value):
|
def post_cell(cell_value):
|
||||||
@ -60,6 +62,7 @@ class CsvSplitHandle(BaseSplitHandle):
|
|||||||
paragraphs.append({'content': result_item_content, 'title': ''})
|
paragraphs.append({'content': result_item_content, 'title': ''})
|
||||||
return result
|
return result
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
maxkb_logger.error(f"Error processing CSV file {file.name}: {e}, {traceback.format_exc()}")
|
||||||
return result
|
return result
|
||||||
|
|
||||||
def get_content(self, file, save_image):
|
def get_content(self, file, save_image):
|
||||||
|
|||||||
@ -7,7 +7,6 @@
|
|||||||
@desc:
|
@desc:
|
||||||
"""
|
"""
|
||||||
import io
|
import io
|
||||||
import logging
|
|
||||||
import os
|
import os
|
||||||
import re
|
import re
|
||||||
import traceback
|
import traceback
|
||||||
@ -155,7 +154,7 @@ class DocSplitHandle(BaseSplitHandle):
|
|||||||
return title
|
return title
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
traceback.print_exc()
|
maxkb_logger.error(f"Error processing DOC file: {e}, {traceback.format_exc()}")
|
||||||
return paragraph.text
|
return paragraph.text
|
||||||
return get_paragraph_txt(paragraph, doc, images_list, get_image_id)
|
return get_paragraph_txt(paragraph, doc, images_list, get_image_id)
|
||||||
|
|
||||||
@ -207,12 +206,15 @@ class DocSplitHandle(BaseSplitHandle):
|
|||||||
else:
|
else:
|
||||||
split_model = SplitModel(default_pattern_list, with_filter=with_filter, limit=limit)
|
split_model = SplitModel(default_pattern_list, with_filter=with_filter, limit=limit)
|
||||||
except BaseException as e:
|
except BaseException as e:
|
||||||
traceback.print_exception(e)
|
maxkb_logger.error(f"Error processing XLSX file {file.name}: {e}, {traceback.format_exc()}")
|
||||||
return {'name': file_name,
|
return {
|
||||||
'content': []}
|
'name': file_name,
|
||||||
return {'name': file_name,
|
'content': []
|
||||||
'content': split_model.parse(content)
|
}
|
||||||
}
|
return {
|
||||||
|
'name': file_name,
|
||||||
|
'content': split_model.parse(content)
|
||||||
|
}
|
||||||
|
|
||||||
def support(self, file, get_buffer):
|
def support(self, file, get_buffer):
|
||||||
file_name: str = file.name.lower()
|
file_name: str = file.name.lower()
|
||||||
|
|||||||
@ -15,6 +15,7 @@ from charset_normalizer import detect
|
|||||||
from html2text import html2text
|
from html2text import html2text
|
||||||
|
|
||||||
from common.handle.base_split_handle import BaseSplitHandle
|
from common.handle.base_split_handle import BaseSplitHandle
|
||||||
|
from common.utils.logger import maxkb_logger
|
||||||
from common.utils.split_model import SplitModel
|
from common.utils.split_model import SplitModel
|
||||||
|
|
||||||
default_pattern_list = [re.compile('(?<=^)# .*|(?<=\\n)# .*'),
|
default_pattern_list = [re.compile('(?<=^)# .*|(?<=\\n)# .*'),
|
||||||
@ -55,11 +56,15 @@ class HTMLSplitHandle(BaseSplitHandle):
|
|||||||
content = buffer.decode(encoding)
|
content = buffer.decode(encoding)
|
||||||
content = html2text(content)
|
content = html2text(content)
|
||||||
except BaseException as e:
|
except BaseException as e:
|
||||||
return {'name': file.name,
|
maxkb_logger.error(f"Error processing HTML file {file.name}: {e}, {traceback.format_exc()}")
|
||||||
'content': []}
|
|
||||||
return {'name': file.name,
|
return {
|
||||||
'content': split_model.parse(content)
|
'name': file.name, 'content': []
|
||||||
}
|
}
|
||||||
|
return {
|
||||||
|
'name': file.name,
|
||||||
|
'content': split_model.parse(content)
|
||||||
|
}
|
||||||
|
|
||||||
def get_content(self, file, save_image):
|
def get_content(self, file, save_image):
|
||||||
buffer = file.read()
|
buffer = file.read()
|
||||||
|
|||||||
@ -6,7 +6,6 @@
|
|||||||
@date:2024/3/27 18:19
|
@date:2024/3/27 18:19
|
||||||
@desc:
|
@desc:
|
||||||
"""
|
"""
|
||||||
import logging
|
|
||||||
import os
|
import os
|
||||||
import re
|
import re
|
||||||
import tempfile
|
import tempfile
|
||||||
@ -31,7 +30,6 @@ default_pattern_list = [re.compile('(?<=^)# .*|(?<=\\n)# .*'),
|
|||||||
re.compile("(?<!\n)\n\n+")]
|
re.compile("(?<!\n)\n\n+")]
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def check_links_in_pdf(doc):
|
def check_links_in_pdf(doc):
|
||||||
for page_number in range(len(doc)):
|
for page_number in range(len(doc)):
|
||||||
page = doc[page_number]
|
page = doc[page_number]
|
||||||
@ -54,6 +52,8 @@ class PdfSplitHandle(BaseSplitHandle):
|
|||||||
|
|
||||||
pdf_document = fitz.open(temp_file_path)
|
pdf_document = fitz.open(temp_file_path)
|
||||||
try:
|
try:
|
||||||
|
if type(limit) is str:
|
||||||
|
limit = int(limit)
|
||||||
# 处理有目录的pdf
|
# 处理有目录的pdf
|
||||||
result = self.handle_toc(pdf_document, limit)
|
result = self.handle_toc(pdf_document, limit)
|
||||||
if result is not None:
|
if result is not None:
|
||||||
@ -72,17 +72,20 @@ class PdfSplitHandle(BaseSplitHandle):
|
|||||||
else:
|
else:
|
||||||
split_model = SplitModel(default_pattern_list, with_filter=with_filter, limit=limit)
|
split_model = SplitModel(default_pattern_list, with_filter=with_filter, limit=limit)
|
||||||
except BaseException as e:
|
except BaseException as e:
|
||||||
maxkb_logger.error(f"File: {file.name}, error: {e}")
|
maxkb_logger.error(f"File: {file.name}, error: {e}, {traceback.format_exc()}")
|
||||||
return {'name': file.name,
|
return {
|
||||||
'content': []}
|
'name': file.name,
|
||||||
|
'content': []
|
||||||
|
}
|
||||||
finally:
|
finally:
|
||||||
pdf_document.close()
|
pdf_document.close()
|
||||||
# 处理完后可以删除临时文件
|
# 处理完后可以删除临时文件
|
||||||
os.remove(temp_file_path)
|
os.remove(temp_file_path)
|
||||||
|
|
||||||
return {'name': file.name,
|
return {
|
||||||
'content': split_model.parse(content)
|
'name': file.name,
|
||||||
}
|
'content': split_model.parse(content)
|
||||||
|
}
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def handle_pdf_content(file, pdf_document):
|
def handle_pdf_content(file, pdf_document):
|
||||||
|
|||||||
@ -13,6 +13,7 @@ from typing import List
|
|||||||
from charset_normalizer import detect
|
from charset_normalizer import detect
|
||||||
|
|
||||||
from common.handle.base_split_handle import BaseSplitHandle
|
from common.handle.base_split_handle import BaseSplitHandle
|
||||||
|
from common.utils.logger import maxkb_logger
|
||||||
from common.utils.split_model import SplitModel
|
from common.utils.split_model import SplitModel
|
||||||
|
|
||||||
default_pattern_list = [
|
default_pattern_list = [
|
||||||
@ -47,6 +48,7 @@ class TextSplitHandle(BaseSplitHandle):
|
|||||||
try:
|
try:
|
||||||
content = buffer.decode(detect(buffer)['encoding'])
|
content = buffer.decode(detect(buffer)['encoding'])
|
||||||
except BaseException as e:
|
except BaseException as e:
|
||||||
|
maxkb_logger.error(f"Error processing TEXT file {file.name}: {e}, {traceback.format_exc()}")
|
||||||
return {'name': file.name, 'content': []}
|
return {'name': file.name, 'content': []}
|
||||||
return {'name': file.name, 'content': split_model.parse(content)}
|
return {'name': file.name, 'content': split_model.parse(content)}
|
||||||
|
|
||||||
|
|||||||
@ -6,11 +6,13 @@
|
|||||||
@date:2024/5/21 14:59
|
@date:2024/5/21 14:59
|
||||||
@desc:
|
@desc:
|
||||||
"""
|
"""
|
||||||
|
import traceback
|
||||||
from typing import List
|
from typing import List
|
||||||
|
|
||||||
import xlrd
|
import xlrd
|
||||||
|
|
||||||
from common.handle.base_split_handle import BaseSplitHandle
|
from common.handle.base_split_handle import BaseSplitHandle
|
||||||
|
from common.utils.logger import maxkb_logger
|
||||||
|
|
||||||
|
|
||||||
def post_cell(cell_value):
|
def post_cell(cell_value):
|
||||||
@ -58,6 +60,8 @@ class XlsSplitHandle(BaseSplitHandle):
|
|||||||
def handle(self, file, pattern_list: List, with_filter: bool, limit: int, get_buffer, save_image):
|
def handle(self, file, pattern_list: List, with_filter: bool, limit: int, get_buffer, save_image):
|
||||||
buffer = get_buffer(file)
|
buffer = get_buffer(file)
|
||||||
try:
|
try:
|
||||||
|
if type(limit) is str:
|
||||||
|
limit = int(limit)
|
||||||
workbook = xlrd.open_workbook(file_contents=buffer)
|
workbook = xlrd.open_workbook(file_contents=buffer)
|
||||||
worksheets = workbook.sheets()
|
worksheets = workbook.sheets()
|
||||||
worksheets_size = len(worksheets)
|
worksheets_size = len(worksheets)
|
||||||
@ -67,6 +71,7 @@ class XlsSplitHandle(BaseSplitHandle):
|
|||||||
sheet.name, sheet, limit) for sheet
|
sheet.name, sheet, limit) for sheet
|
||||||
in worksheets] if row is not None]
|
in worksheets] if row is not None]
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
maxkb_logger.error(f"Error processing XLS file {file.name}: {e}, {traceback.format_exc()}")
|
||||||
return [{'name': file.name, 'content': []}]
|
return [{'name': file.name, 'content': []}]
|
||||||
|
|
||||||
def get_content(self, file, save_image):
|
def get_content(self, file, save_image):
|
||||||
|
|||||||
@ -7,12 +7,14 @@
|
|||||||
@desc:
|
@desc:
|
||||||
"""
|
"""
|
||||||
import io
|
import io
|
||||||
|
import traceback
|
||||||
from typing import List
|
from typing import List
|
||||||
|
|
||||||
import openpyxl
|
import openpyxl
|
||||||
|
|
||||||
from common.handle.base_split_handle import BaseSplitHandle
|
from common.handle.base_split_handle import BaseSplitHandle
|
||||||
from common.handle.impl.common_handle import xlsx_embed_cells_images
|
from common.handle.impl.common_handle import xlsx_embed_cells_images
|
||||||
|
from common.utils.logger import maxkb_logger
|
||||||
|
|
||||||
|
|
||||||
def post_cell(image_dict, cell_value):
|
def post_cell(image_dict, cell_value):
|
||||||
@ -64,6 +66,8 @@ class XlsxSplitHandle(BaseSplitHandle):
|
|||||||
def handle(self, file, pattern_list: List, with_filter: bool, limit: int, get_buffer, save_image):
|
def handle(self, file, pattern_list: List, with_filter: bool, limit: int, get_buffer, save_image):
|
||||||
buffer = get_buffer(file)
|
buffer = get_buffer(file)
|
||||||
try:
|
try:
|
||||||
|
if type(limit) is str:
|
||||||
|
limit = int(limit)
|
||||||
workbook = openpyxl.load_workbook(io.BytesIO(buffer))
|
workbook = openpyxl.load_workbook(io.BytesIO(buffer))
|
||||||
try:
|
try:
|
||||||
image_dict: dict = xlsx_embed_cells_images(io.BytesIO(buffer))
|
image_dict: dict = xlsx_embed_cells_images(io.BytesIO(buffer))
|
||||||
@ -80,6 +84,7 @@ class XlsxSplitHandle(BaseSplitHandle):
|
|||||||
sheet.title, sheet, image_dict, limit) for sheet
|
sheet.title, sheet, image_dict, limit) for sheet
|
||||||
in worksheets] if row is not None]
|
in worksheets] if row is not None]
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
maxkb_logger.error(f"Error processing XLSX file {file.name}: {e}, {traceback.format_exc()}")
|
||||||
return [{'name': file.name, 'content': []}]
|
return [{'name': file.name, 'content': []}]
|
||||||
|
|
||||||
def get_content(self, file, save_image):
|
def get_content(self, file, save_image):
|
||||||
|
|||||||
@ -119,6 +119,8 @@ def filter_image_file(result_list: list, image_list):
|
|||||||
|
|
||||||
class ZipSplitHandle(BaseSplitHandle):
|
class ZipSplitHandle(BaseSplitHandle):
|
||||||
def handle(self, file, pattern_list: List, with_filter: bool, limit: int, get_buffer, save_image):
|
def handle(self, file, pattern_list: List, with_filter: bool, limit: int, get_buffer, save_image):
|
||||||
|
if type(limit) is str:
|
||||||
|
limit = int(limit)
|
||||||
buffer = get_buffer(file)
|
buffer = get_buffer(file)
|
||||||
bytes_io = io.BytesIO(buffer)
|
bytes_io = io.BytesIO(buffer)
|
||||||
result = []
|
result = []
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user