refactor: PDF分段强制按字数限制

--bug=1047568 --user=刘瑞斌【github#1363】pdf 文件高级分段默认分段长度为500，但生成的段落长度超过29000字符 https://www.tapd.cn/57709429/s/1600183
2024-10-29 11:39:35 +08:00 · 2024-10-29 11:39:35 +08:00 · 834ccaa35b
commit 834ccaa35b
parent 2cb8d26609
1 changed files with 46 additions and 15 deletions
--- a/apps/common/handle/impl/pdf_split_handle.py
+++ b/apps/common/handle/impl/pdf_split_handle.py
@ -42,7 +42,7 @@ class PdfSplitHandle(BaseSplitHandle):
        pdf_document = fitz.open(temp_file_path)
        try:
            # 处理有目录的pdf
-            result = self.handle_toc(pdf_document)
+            result = self.handle_toc(pdf_document, limit)
            if result is not None:
                return {'name': file.name, 'content': result}
@ -110,7 +110,7 @@ class PdfSplitHandle(BaseSplitHandle):
        return content
    @staticmethod
-    def handle_toc(doc):
+    def handle_toc(doc, limit):
        # 找到目录
        toc = doc.get_toc()
        if toc is None or len(toc) == 0:
@ -155,17 +155,16 @@ class PdfSplitHandle(BaseSplitHandle):
                        text = text[:idx]
                chapter_text += text  # 提取文本
-
+            # 限制章节内容长度
            if 0 < limit < len(chapter_text):
                split_text = PdfSplitHandle.split_text(chapter_text, limit)
                for text in split_text:
                    chapters.append({"title": chapter_title, "content": text})
            else:
                chapters.append({"title": chapter_title, "content": chapter_text if chapter_text else chapter_title})
            # 保存章节内容和章节标题
            chapters.append({"title": chapter_title, "content": chapter_text if chapter_text else chapter_title})
        return chapters
    @staticmethod
    def handle_chapter_title(title):
        """Remove Chinese enumeration markers from a chapter title.

        Two substitutions are applied in order, each to every match found
        anywhere in the string:

        1. a single character that is a Chinese numeral (一 to 十),
           whitespace, or a literal ``*``, followed by ``、`` and any
           trailing whitespace;
        2. ``第`` + one Chinese numeral (一 to 十) + ``章`` and any trailing
           whitespace.

        :param title: raw chapter title text
        :return: the title with every match of both patterns removed
        """
        marker_patterns = (
            r'[一二三四五六七八九十\s*]、\s*',
            r'第[一二三四五六七八九十]章\s*',
        )
        cleaned = title
        # Order matters: the output of the first substitution feeds the second.
        for pattern in marker_patterns:
            cleaned = re.sub(pattern, '', cleaned)
        return cleaned
    @staticmethod
    def handle_links(doc, pattern_list, with_filter, limit):
        # 创建存储章节内容的数组
@ -228,11 +227,14 @@ class PdfSplitHandle(BaseSplitHandle):
                                text = text[:idx]
                        chapter_text += text
-                    # 保存章节信息
+                    # 限制章节内容长度
-                    chapters.append({
+                    if 0 < limit < len(chapter_text):
-                        "title": link_title,
+                        split_text = PdfSplitHandle.split_text(chapter_text, limit)
-                        "content": chapter_text
+                        for text in split_text:
-                    })
+                            chapters.append({"title": link_title, "content": text})
                    else:
                        # 保存章节信息
                        chapters.append({"title": link_title, "content": chapter_text})
        # 目录中没有前言部分，手动处理
        if handle_pre_toc:
@ -261,6 +263,35 @@ class PdfSplitHandle(BaseSplitHandle):
            chapters = pre_toc + chapters
        return chapters
    @staticmethod
    def split_text(text, length):
        """Split ``text`` into consecutive segments of at most ``length`` chars.

        The text is consumed in windows of ``length`` characters. Inside each
        full window the cut is made just after the last sentence terminator
        (ASCII ``.`` or full-width ``。``) so that segments end on a sentence
        boundary where possible; a window without any terminator is emitted
        whole. Whatever is left after the last full window becomes the final
        segment. Concatenating the returned segments reproduces ``text``.

        :param text: the text to split
        :param length: maximum number of characters per segment; a value
            less than or equal to 0 means "no limit" and the text is returned
            as a single segment
        :return: list of non-empty segments (empty list for empty ``text``)
        """
        # A non-positive limit cannot bound a segment; treat it as "no limit"
        # instead of degenerating into one segment per character.
        if length <= 0:
            return [text] if text else []

        # Both the ASCII period and the CJK full stop end a sentence.
        sentence_ends = ('.', '。')
        segments = []
        start = 0
        total = len(text)
        # Slice fixed-size windows instead of growing a buffer one character
        # at a time and rescanning it.
        while total - start >= length:
            window = text[start:start + length]
            # Position of the last sentence terminator in the window, or -1.
            cut = max(window.rfind(mark) for mark in sentence_ends)
            if cut == -1:
                # No sentence boundary available: hard cut at the limit.
                cut = length - 1
            segments.append(window[:cut + 1])
            start += cut + 1
        # Remaining tail that is shorter than a full window.
        if start < total:
            segments.append(text[start:])
        return segments
    @staticmethod
    def handle_chapter_title(title):
        """Remove Chinese enumeration markers from a chapter title.

        Two substitutions are applied in order, each to every match found
        anywhere in the string:

        1. a single character that is a Chinese numeral (一 to 十),
           whitespace, or a literal ``*``, followed by ``、`` and any
           trailing whitespace;
        2. ``第`` + one Chinese numeral (一 to 十) + ``章`` and any trailing
           whitespace.

        :param title: raw chapter title text
        :return: the title with every match of both patterns removed
        """
        marker_patterns = (
            r'[一二三四五六七八九十\s*]、\s*',
            r'第[一二三四五六七八九十]章\s*',
        )
        cleaned = title
        # Order matters: the output of the first substitution feeds the second.
        for pattern in marker_patterns:
            cleaned = re.sub(pattern, '', cleaned)
        return cleaned
    def support(self, file, get_buffer):
        file_name: str = file.name.lower()
        if file_name.endswith(".pdf") or file_name.endswith(".PDF"):