fix: 处理某些pdf中不包括目录和内部链接不能完整导入的问题
This commit is contained in:
parent
7c529c281c
commit
fb8b96779c
@ -31,6 +31,16 @@ default_pattern_list = [re.compile('(?<=^)# .*|(?<=\\n)# .*'),
|
|||||||
max_kb = logging.getLogger("max_kb")
|
max_kb = logging.getLogger("max_kb")
|
||||||
|
|
||||||
|
|
||||||
|
# Link "kind" value that marks an internal jump inside the same document
# (documented by PyMuPDF as LINK_GOTO).
_INTERNAL_LINK_KIND = 1


def check_links_in_pdf(doc):
    """Return whether any page of *doc* contains an internal link.

    Args:
        doc: An iterable of page objects. Each page must provide a
            ``get_links()`` method returning a list of link dicts
            (an empty list or ``None`` means the page has no links).

    Returns:
        ``True`` as soon as a link whose ``"kind"`` equals 1 is found,
        ``False`` when no page has such a link (including an empty document).
    """
    return any(
        # .get() so that a link dict without a "kind" entry is simply
        # treated as "not internal" instead of aborting the whole scan.
        link.get("kind") == _INTERNAL_LINK_KIND
        for page in doc
        # "or ()" keeps the original tolerance for pages reporting no links.
        for link in (page.get_links() or ())
    )
|
||||||
|
|
||||||
class PdfSplitHandle(BaseSplitHandle):
|
class PdfSplitHandle(BaseSplitHandle):
|
||||||
def handle(self, file, pattern_list: List, with_filter: bool, limit: int, get_buffer, save_image):
|
def handle(self, file, pattern_list: List, with_filter: bool, limit: int, get_buffer, save_image):
|
||||||
with tempfile.NamedTemporaryFile(delete=False) as temp_file:
|
with tempfile.NamedTemporaryFile(delete=False) as temp_file:
|
||||||
@ -175,6 +185,9 @@ class PdfSplitHandle(BaseSplitHandle):
|
|||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def handle_links(doc, pattern_list, with_filter, limit):
|
def handle_links(doc, pattern_list, with_filter, limit):
|
||||||
|
# 检查文档是否包含内部链接
|
||||||
|
if not check_links_in_pdf(doc):
|
||||||
|
return
|
||||||
# 创建存储章节内容的数组
|
# 创建存储章节内容的数组
|
||||||
chapters = []
|
chapters = []
|
||||||
toc_start_page = -1
|
toc_start_page = -1
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user