fix: 修复导入word文档,有的图片导入不进去
This commit is contained in:
parent
aab9cc63b1
commit
83d97439e4
@ -10,6 +10,7 @@ import io
|
|||||||
import re
|
import re
|
||||||
import traceback
|
import traceback
|
||||||
import uuid
|
import uuid
|
||||||
|
from functools import reduce
|
||||||
from typing import List
|
from typing import List
|
||||||
|
|
||||||
from docx import Document, ImagePart
|
from docx import Document, ImagePart
|
||||||
@ -31,6 +32,7 @@ default_pattern_list = [re.compile('(?<=^)# .*|(?<=\\n)# .*'),
|
|||||||
old_docx_nsmap = {'v': 'urn:schemas-microsoft-com:vml'}
|
old_docx_nsmap = {'v': 'urn:schemas-microsoft-com:vml'}
|
||||||
combine_nsmap = {**ns.nsmap, **old_docx_nsmap}
|
combine_nsmap = {**ns.nsmap, **old_docx_nsmap}
|
||||||
|
|
||||||
|
|
||||||
def image_to_mode(image, doc: Document, images_list, get_image_id, is_new_docx=True):
|
def image_to_mode(image, doc: Document, images_list, get_image_id, is_new_docx=True):
|
||||||
if is_new_docx:
|
if is_new_docx:
|
||||||
image_ids = image.xpath('.//a:blip/@r:embed')
|
image_ids = image.xpath('.//a:blip/@r:embed')
|
||||||
@ -46,18 +48,31 @@ def image_to_mode(image, doc: Document, images_list, get_image_id, is_new_docx=T
|
|||||||
return f''
|
return f''
|
||||||
|
|
||||||
|
|
||||||
|
def get_paragraph_element_images(paragraph_element, doc: Document, images_list, get_image_id):
    """Collect the image elements found beneath ``paragraph_element``.

    Two XPath queries are evaluated in order: ``.//pic:pic`` and then
    ``.//w:pict``. Every match is appended to the result, so matches of the
    first query always precede matches of the second.

    :param paragraph_element: object exposing an ``xpath(expression)`` method.
    :param doc: unused here; kept so the signature matches the sibling helpers.
    :param images_list: unused here.
    :param get_image_id: unused here.
    :return: list of matched elements, empty when nothing matched.
    """
    found = []
    for query in (".//pic:pic", ".//w:pict"):
        try:
            matches = paragraph_element.xpath(query)
            if matches is None or len(matches) == 0:
                continue
            found.extend(matches)
        except Exception:
            # Best effort: a query that fails on this element is skipped so
            # the remaining query can still contribute its matches.
            # NOTE(review): presumably this covers elements that cannot
            # resolve the namespace prefixes -- confirm before narrowing.
            pass
    return found
|
||||||
|
|
||||||
|
|
||||||
|
def images_to_string(images, doc: Document, images_list, get_image_id):
    """Render image elements into one concatenated string.

    Each element is handed to ``image_to_mode``; results that are ``None``
    are dropped and the remaining strings are joined without a separator.

    Elements whose tag has the local name ``pict`` (those matched by a
    ``.//w:pict`` query) are rendered with ``is_new_docx=False``. Every other
    element uses ``is_new_docx=True``, which makes ``image_to_mode`` look for
    ``a:blip`` embed ids.
    NOTE(review): a ``w:pict`` element presumably carries no ``a:blip``
    child, so the default mode would find no image id for it -- confirm
    against a legacy document.

    :param images: iterable of image elements (``pic:pic`` and/or ``w:pict``).
    :param doc: document forwarded unchanged to ``image_to_mode``.
    :param images_list: forwarded unchanged to ``image_to_mode``.
    :param get_image_id: forwarded unchanged to ``image_to_mode``.
    :return: the joined string, ``""`` when nothing could be rendered.
    """
    rendered = []
    for image in images:
        tag = getattr(image, 'tag', None)
        # Namespaced tags look like "{namespace-uri}local"; compare only the
        # local part so the check does not depend on a namespace constant.
        is_legacy_pict = isinstance(tag, str) and tag.rpartition('}')[2] == 'pict'
        item = image_to_mode(image, doc, images_list, get_image_id,
                             is_new_docx=not is_legacy_pict)
        if item is not None:
            rendered.append(item)
    return "".join(rendered)
|
||||||
|
|
||||||
|
|
||||||
def get_paragraph_element_txt(paragraph_element, doc: Document, images_list, get_image_id):
|
def get_paragraph_element_txt(paragraph_element, doc: Document, images_list, get_image_id):
|
||||||
try:
|
try:
|
||||||
images = paragraph_element.xpath(".//pic:pic")
|
images = get_paragraph_element_images(paragraph_element, doc, images_list, get_image_id)
|
||||||
old_docx_images = paragraph_element.xpath(".//w:pict")
|
|
||||||
if len(images) > 0:
|
if len(images) > 0:
|
||||||
return "".join(
|
return images_to_string(images, doc, images_list, get_image_id)
|
||||||
[item for item in [image_to_mode(image, doc, images_list, get_image_id) for image in images] if
|
|
||||||
item is not None])
|
|
||||||
elif len(old_docx_images) > 0:
|
|
||||||
return "".join(
|
|
||||||
[item for item in [image_to_mode(image, doc, images_list, get_image_id, is_new_docx=False) for image in old_docx_images] if
|
|
||||||
item is not None])
|
|
||||||
elif paragraph_element.text is not None:
|
elif paragraph_element.text is not None:
|
||||||
return paragraph_element.text
|
return paragraph_element.text
|
||||||
return ""
|
return ""
|
||||||
@ -101,8 +116,18 @@ class DocSplitHandle(BaseSplitHandle):
|
|||||||
try:
|
try:
|
||||||
psn = paragraph.style.name
|
psn = paragraph.style.name
|
||||||
if psn.startswith('Heading'):
|
if psn.startswith('Heading'):
|
||||||
return "".join(["#" for i in range(int(psn.replace("Heading ", '')))]) + " " + paragraph.text
|
title = "".join(["#" for i in range(int(psn.replace("Heading ", '')))]) + " " + paragraph.text
|
||||||
|
images = reduce(lambda x, y: [*x, *y],
|
||||||
|
[get_paragraph_element_images(e, doc, images_list, get_image_id) for e in
|
||||||
|
paragraph._element],
|
||||||
|
[])
|
||||||
|
|
||||||
|
if len(images) > 0:
|
||||||
|
return title + '\n' + images_to_string(images, doc, images_list, get_image_id) if len(
|
||||||
|
paragraph.text) > 0 else images_to_string(images, doc, images_list, get_image_id)
|
||||||
|
return title
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
traceback.print_exc()
|
||||||
return paragraph.text
|
return paragraph.text
|
||||||
return get_paragraph_txt(paragraph, doc, images_list, get_image_id)
|
return get_paragraph_txt(paragraph, doc, images_list, get_image_id)
|
||||||
|
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user