refactor: PDF分段强制按字数限制
--bug=1047568 --user=刘瑞斌 【github#1363】pdf 文件高级分段默认分段长度为500,但生成的段落长度超过29000字符 https://www.tapd.cn/57709429/s/1600183
This commit is contained in:
parent
2cb8d26609
commit
834ccaa35b
@ -42,7 +42,7 @@ class PdfSplitHandle(BaseSplitHandle):
|
|||||||
pdf_document = fitz.open(temp_file_path)
|
pdf_document = fitz.open(temp_file_path)
|
||||||
try:
|
try:
|
||||||
# 处理有目录的pdf
|
# 处理有目录的pdf
|
||||||
result = self.handle_toc(pdf_document)
|
result = self.handle_toc(pdf_document, limit)
|
||||||
if result is not None:
|
if result is not None:
|
||||||
return {'name': file.name, 'content': result}
|
return {'name': file.name, 'content': result}
|
||||||
|
|
||||||
@ -110,7 +110,7 @@ class PdfSplitHandle(BaseSplitHandle):
|
|||||||
return content
|
return content
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def handle_toc(doc):
|
def handle_toc(doc, limit):
|
||||||
# 找到目录
|
# 找到目录
|
||||||
toc = doc.get_toc()
|
toc = doc.get_toc()
|
||||||
if toc is None or len(toc) == 0:
|
if toc is None or len(toc) == 0:
|
||||||
@ -155,17 +155,16 @@ class PdfSplitHandle(BaseSplitHandle):
|
|||||||
text = text[:idx]
|
text = text[:idx]
|
||||||
|
|
||||||
chapter_text += text # 提取文本
|
chapter_text += text # 提取文本
|
||||||
|
# 限制章节内容长度
|
||||||
|
if 0 < limit < len(chapter_text):
|
||||||
|
split_text = PdfSplitHandle.split_text(chapter_text, limit)
|
||||||
|
for text in split_text:
|
||||||
|
chapters.append({"title": chapter_title, "content": text})
|
||||||
|
else:
|
||||||
|
chapters.append({"title": chapter_title, "content": chapter_text if chapter_text else chapter_title})
|
||||||
# 保存章节内容和章节标题
|
# 保存章节内容和章节标题
|
||||||
chapters.append({"title": chapter_title, "content": chapter_text if chapter_text else chapter_title})
|
|
||||||
return chapters
|
return chapters
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def handle_chapter_title(title):
|
|
||||||
title = re.sub(r'[一二三四五六七八九十\s*]、\s*', '', title)
|
|
||||||
title = re.sub(r'第[一二三四五六七八九十]章\s*', '', title)
|
|
||||||
return title
|
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def handle_links(doc, pattern_list, with_filter, limit):
|
def handle_links(doc, pattern_list, with_filter, limit):
|
||||||
# 创建存储章节内容的数组
|
# 创建存储章节内容的数组
|
||||||
@ -228,11 +227,14 @@ class PdfSplitHandle(BaseSplitHandle):
|
|||||||
text = text[:idx]
|
text = text[:idx]
|
||||||
chapter_text += text
|
chapter_text += text
|
||||||
|
|
||||||
# 保存章节信息
|
# 限制章节内容长度
|
||||||
chapters.append({
|
if 0 < limit < len(chapter_text):
|
||||||
"title": link_title,
|
split_text = PdfSplitHandle.split_text(chapter_text, limit)
|
||||||
"content": chapter_text
|
for text in split_text:
|
||||||
})
|
chapters.append({"title": link_title, "content": text})
|
||||||
|
else:
|
||||||
|
# 保存章节信息
|
||||||
|
chapters.append({"title": link_title, "content": chapter_text})
|
||||||
|
|
||||||
# 目录中没有前言部分,手动处理
|
# 目录中没有前言部分,手动处理
|
||||||
if handle_pre_toc:
|
if handle_pre_toc:
|
||||||
@ -261,6 +263,35 @@ class PdfSplitHandle(BaseSplitHandle):
|
|||||||
chapters = pre_toc + chapters
|
chapters = pre_toc + chapters
|
||||||
return chapters
|
return chapters
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def split_text(text, length):
|
||||||
|
segments = []
|
||||||
|
current_segment = ""
|
||||||
|
|
||||||
|
for char in text:
|
||||||
|
current_segment += char
|
||||||
|
if len(current_segment) >= length:
|
||||||
|
# 查找最近的句号
|
||||||
|
last_period_index = current_segment.rfind('.')
|
||||||
|
if last_period_index != -1:
|
||||||
|
segments.append(current_segment[:last_period_index + 1])
|
||||||
|
current_segment = current_segment[last_period_index + 1:] # 更新当前段落
|
||||||
|
else:
|
||||||
|
segments.append(current_segment)
|
||||||
|
current_segment = ""
|
||||||
|
|
||||||
|
# 处理剩余的部分
|
||||||
|
if current_segment:
|
||||||
|
segments.append(current_segment)
|
||||||
|
|
||||||
|
return segments
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def handle_chapter_title(title):
|
||||||
|
title = re.sub(r'[一二三四五六七八九十\s*]、\s*', '', title)
|
||||||
|
title = re.sub(r'第[一二三四五六七八九十]章\s*', '', title)
|
||||||
|
return title
|
||||||
|
|
||||||
def support(self, file, get_buffer):
|
def support(self, file, get_buffer):
|
||||||
file_name: str = file.name.lower()
|
file_name: str = file.name.lower()
|
||||||
if file_name.endswith(".pdf") or file_name.endswith(".PDF"):
|
if file_name.endswith(".pdf") or file_name.endswith(".PDF"):
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user