fix: Part of the docx document is parsed incorrectly (#1981)

2025-01-06 14:37:51 +08:00 · 2025-01-06 14:37:51 +08:00 · d9df013e33
commit d9df013e33
parent 00591a5b25
1 changed file with 5 additions and 3 deletions
--- a/apps/common/handle/impl/doc_split_handle.py
+++ b/apps/common/handle/impl/doc_split_handle.py
@@ -113,8 +113,10 @@ class DocSplitHandle(BaseSplitHandle):
    def paragraph_to_md(paragraph: Paragraph, doc: Document, images_list, get_image_id):
        try:
            psn = paragraph.style.name
-            if psn.startswith('Heading'):
+            if psn.startswith('Heading') or psn.startswith('TOC 标题') or psn.startswith('标题'):
-                title = "".join(["#" for i in range(int(psn.replace("Heading ", '')))]) + " " + paragraph.text
+                title = "".join(["#" for i in range(
+                    int(psn.replace("Heading ", '').replace('TOC 标题', '').replace('标题',
+                                                                                    '')))]) + " " + paragraph.text
                images = reduce(lambda x, y: [*x, *y],
                                [get_paragraph_element_images(e, doc, images_list, get_image_id) for e in
                                 paragraph._element],
@@ -202,4 +204,4 @@ class DocSplitHandle(BaseSplitHandle):
            return content
        except BaseException as e:
            traceback.print_exception(e)
-            return f'{e}'
+            return f'{e}'