fix: 处理不规范的pdf中前言部分没在目录中标识出来,导致不能正常识别的问题
This commit is contained in:
parent
d959bcd26c
commit
6cacb5be71
@ -6,20 +6,19 @@
|
|||||||
@date:2024/3/27 18:19
|
@date:2024/3/27 18:19
|
||||||
@desc:
|
@desc:
|
||||||
"""
|
"""
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
import re
|
import re
|
||||||
|
import tempfile
|
||||||
|
import time
|
||||||
from typing import List
|
from typing import List
|
||||||
|
|
||||||
import fitz
|
import fitz
|
||||||
import os
|
|
||||||
import tempfile
|
|
||||||
import logging
|
|
||||||
from langchain_community.document_loaders import PyPDFLoader
|
from langchain_community.document_loaders import PyPDFLoader
|
||||||
|
|
||||||
from common.handle.base_split_handle import BaseSplitHandle
|
from common.handle.base_split_handle import BaseSplitHandle
|
||||||
from common.util.split_model import SplitModel
|
from common.util.split_model import SplitModel
|
||||||
|
|
||||||
import time
|
|
||||||
|
|
||||||
default_pattern_list = [re.compile('(?<=^)# .*|(?<=\\n)# .*'),
|
default_pattern_list = [re.compile('(?<=^)# .*|(?<=\\n)# .*'),
|
||||||
re.compile('(?<=\\n)(?<!#)## (?!#).*|(?<=^)(?<!#)## (?!#).*'),
|
re.compile('(?<=\\n)(?<!#)## (?!#).*|(?<=^)(?<!#)## (?!#).*'),
|
||||||
re.compile("(?<=\\n)(?<!#)### (?!#).*|(?<=^)(?<!#)### (?!#).*"),
|
re.compile("(?<=\\n)(?<!#)### (?!#).*|(?<=^)(?<!#)### (?!#).*"),
|
||||||
@ -48,7 +47,7 @@ class PdfSplitHandle(BaseSplitHandle):
|
|||||||
return {'name': file.name, 'content': result}
|
return {'name': file.name, 'content': result}
|
||||||
|
|
||||||
# 没目录但是有链接的pdf
|
# 没目录但是有链接的pdf
|
||||||
result = self.handle_links(pdf_document)
|
result = self.handle_links(pdf_document, pattern_list, with_filter, limit)
|
||||||
if result is not None and len(result) > 0:
|
if result is not None and len(result) > 0:
|
||||||
return {'name': file.name, 'content': result}
|
return {'name': file.name, 'content': result}
|
||||||
|
|
||||||
@ -168,15 +167,21 @@ class PdfSplitHandle(BaseSplitHandle):
|
|||||||
return title
|
return title
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def handle_links(doc):
|
def handle_links(doc, pattern_list, with_filter, limit):
|
||||||
# 创建存储章节内容的数组
|
# 创建存储章节内容的数组
|
||||||
chapters = []
|
chapters = []
|
||||||
|
toc_start_page = -1
|
||||||
|
page_content = ""
|
||||||
|
handle_pre_toc = True
|
||||||
# 遍历 PDF 的每一页,查找带有目录链接的页
|
# 遍历 PDF 的每一页,查找带有目录链接的页
|
||||||
for page_num in range(doc.page_count):
|
for page_num in range(doc.page_count):
|
||||||
page = doc.load_page(page_num)
|
page = doc.load_page(page_num)
|
||||||
links = page.get_links()
|
links = page.get_links()
|
||||||
|
# 如果目录开始页码未设置,则设置为当前页码
|
||||||
|
if len(links) > 0:
|
||||||
|
toc_start_page = page_num
|
||||||
|
if toc_start_page < 0:
|
||||||
|
page_content += page.get_text('text')
|
||||||
# 检查该页是否包含内部链接(即指向文档内部的页面)
|
# 检查该页是否包含内部链接(即指向文档内部的页面)
|
||||||
for num in range(len(links)):
|
for num in range(len(links)):
|
||||||
link = links[num]
|
link = links[num]
|
||||||
@ -184,6 +189,9 @@ class PdfSplitHandle(BaseSplitHandle):
|
|||||||
# 获取链接目标的页面
|
# 获取链接目标的页面
|
||||||
dest_page = link['page']
|
dest_page = link['page']
|
||||||
rect = link['from'] # 获取链接的矩形区域
|
rect = link['from'] # 获取链接的矩形区域
|
||||||
|
# 如果目录开始页码包括前言部分,则不处理前言部分
|
||||||
|
if dest_page < toc_start_page:
|
||||||
|
handle_pre_toc = False
|
||||||
|
|
||||||
# 提取链接区域的文本作为标题
|
# 提取链接区域的文本作为标题
|
||||||
link_title = page.get_text("text", clip=rect).strip().split("\n")[0].replace('.', '').strip()
|
link_title = page.get_text("text", clip=rect).strip().split("\n")[0].replace('.', '').strip()
|
||||||
@ -226,6 +234,17 @@ class PdfSplitHandle(BaseSplitHandle):
|
|||||||
"content": chapter_text
|
"content": chapter_text
|
||||||
})
|
})
|
||||||
|
|
||||||
|
# 目录中没有前言部分,手动处理
|
||||||
|
if handle_pre_toc:
|
||||||
|
if pattern_list is not None and len(pattern_list) > 0:
|
||||||
|
split_model = SplitModel(pattern_list, with_filter, limit)
|
||||||
|
else:
|
||||||
|
split_model = SplitModel(default_pattern_list, with_filter=with_filter, limit=limit)
|
||||||
|
# 插入目录前的部分
|
||||||
|
page_content = re.sub(r'(?<!。)\n+', '', page_content)
|
||||||
|
page_content = re.sub(r'(?<!.)\n+', '', page_content)
|
||||||
|
pre_toc = split_model.parse(page_content)
|
||||||
|
chapters = pre_toc + chapters
|
||||||
return chapters
|
return chapters
|
||||||
|
|
||||||
def support(self, file, get_buffer):
|
def support(self, file, get_buffer):
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user