fix: Part of the docx document is parsed incorrectly (#1981)
This commit is contained in:
parent
00591a5b25
commit
d9df013e33
@ -113,8 +113,10 @@ class DocSplitHandle(BaseSplitHandle):
|
|||||||
def paragraph_to_md(paragraph: Paragraph, doc: Document, images_list, get_image_id):
|
def paragraph_to_md(paragraph: Paragraph, doc: Document, images_list, get_image_id):
|
||||||
try:
|
try:
|
||||||
psn = paragraph.style.name
|
psn = paragraph.style.name
|
||||||
if psn.startswith('Heading'):
|
if psn.startswith('Heading') or psn.startswith('TOC 标题') or psn.startswith('标题'):
|
||||||
title = "".join(["#" for i in range(int(psn.replace("Heading ", '')))]) + " " + paragraph.text
|
title = "".join(["#" for i in range(
|
||||||
|
int(psn.replace("Heading ", '').replace('TOC 标题', '').replace('标题',
|
||||||
|
'')))]) + " " + paragraph.text
|
||||||
images = reduce(lambda x, y: [*x, *y],
|
images = reduce(lambda x, y: [*x, *y],
|
||||||
[get_paragraph_element_images(e, doc, images_list, get_image_id) for e in
|
[get_paragraph_element_images(e, doc, images_list, get_image_id) for e in
|
||||||
paragraph._element],
|
paragraph._element],
|
||||||
@ -202,4 +204,4 @@ class DocSplitHandle(BaseSplitHandle):
|
|||||||
return content
|
return content
|
||||||
except BaseException as e:
|
except BaseException as e:
|
||||||
traceback.print_exception(e)
|
traceback.print_exception(e)
|
||||||
return f'{e}'
|
return f'{e}'
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user