chore: 解析错误时输出错误原因
This commit is contained in:
parent
ec4fe833b1
commit
2a87af6172
@ -28,9 +28,9 @@ default_pattern_list = [re.compile('(?<=^)# .*|(?<=\\n)# .*'),
|
|||||||
re.compile("(?<=\\n)(?<!#)###### (?!#).*|(?<=^)(?<!#)###### (?!#).*"),
|
re.compile("(?<=\\n)(?<!#)###### (?!#).*|(?<=^)(?<!#)###### (?!#).*"),
|
||||||
re.compile("(?<!\n)\n\n+")]
|
re.compile("(?<!\n)\n\n+")]
|
||||||
|
|
||||||
|
|
||||||
max_kb = logging.getLogger("max_kb")
|
max_kb = logging.getLogger("max_kb")
|
||||||
|
|
||||||
|
|
||||||
class PdfSplitHandle(BaseSplitHandle):
|
class PdfSplitHandle(BaseSplitHandle):
|
||||||
def handle(self, file, pattern_list: List, with_filter: bool, limit: int, get_buffer, save_image):
|
def handle(self, file, pattern_list: List, with_filter: bool, limit: int, get_buffer, save_image):
|
||||||
with tempfile.NamedTemporaryFile(delete=False) as temp_file:
|
with tempfile.NamedTemporaryFile(delete=False) as temp_file:
|
||||||
@ -60,6 +60,13 @@ class PdfSplitHandle(BaseSplitHandle):
|
|||||||
|
|
||||||
loader = PyPDFLoader(page_num_pdf, extract_images=True)
|
loader = PyPDFLoader(page_num_pdf, extract_images=True)
|
||||||
page_content = "\n" + loader.load()[0].page_content
|
page_content = "\n" + loader.load()[0].page_content
|
||||||
|
except NotImplementedError as e:
|
||||||
|
# 文件格式不支持,直接退出
|
||||||
|
raise e
|
||||||
|
except BaseException as e:
|
||||||
|
# 当页出错继续进行下一页,防止一个页面出错导致整个文件解析失败
|
||||||
|
max_kb.error(f"File: {file.name}, Page: {page_num + 1}, error: {e}")
|
||||||
|
continue
|
||||||
finally:
|
finally:
|
||||||
os.remove(page_num_pdf)
|
os.remove(page_num_pdf)
|
||||||
|
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user