fix: 修复旧word文档图片无法正常识别 #1533

This commit is contained in:
shaohuzhang1 2024-11-06 14:18:10 +08:00 committed by shaohuzhang1
parent af509a9f58
commit 22d9fdc42f

View File

@ -14,9 +14,9 @@ from functools import reduce
from typing import List from typing import List
from docx import Document, ImagePart from docx import Document, ImagePart
from docx.oxml import ns
from docx.table import Table from docx.table import Table
from docx.text.paragraph import Paragraph from docx.text.paragraph import Paragraph
from docx.oxml import ns
from common.handle.base_split_handle import BaseSplitHandle from common.handle.base_split_handle import BaseSplitHandle
from common.util.split_model import SplitModel from common.util.split_model import SplitModel
@ -33,11 +33,8 @@ old_docx_nsmap = {'v': 'urn:schemas-microsoft-com:vml'}
combine_nsmap = {**ns.nsmap, **old_docx_nsmap} combine_nsmap = {**ns.nsmap, **old_docx_nsmap}
def image_to_mode(image, doc: Document, images_list, get_image_id, is_new_docx=True): def image_to_mode(image, doc: Document, images_list, get_image_id):
if is_new_docx: image_ids = image['get_image_id_handle'](image.get('image'))
image_ids = image.xpath('.//a:blip/@r:embed')
else:
image_ids = image.xpath('.//v:imagedata/@r:id', namespaces=combine_nsmap)
for img_id in image_ids: # 获取图片id for img_id in image_ids: # 获取图片id
part = doc.part.related_parts[img_id] # 根据图片id获取对应的图片 part = doc.part.related_parts[img_id] # 根据图片id获取对应的图片
if isinstance(part, ImagePart): if isinstance(part, ImagePart):
@ -49,14 +46,15 @@ def image_to_mode(image, doc: Document, images_list, get_image_id, is_new_docx=T
def get_paragraph_element_images(paragraph_element, doc: Document, images_list, get_image_id):
    """Collect the image nodes found beneath a paragraph element.

    Two kinds of image markup are searched, and every match is paired with
    the callable that knows how to read its relationship ids:

    * ``.//pic:pic``  -> ids read from ``.//a:blip/@r:embed``
    * ``.//w:pict``   -> ids read from ``.//v:imagedata/@r:id`` (VML markup;
      presumably what older Word documents emit -- confirm against callers)

    The lookup is best-effort: if one XPath expression fails, that kind of
    image is skipped and the other is still tried.

    :param paragraph_element: element exposing ``xpath(expression)``
    :param doc: not used by this function; kept for signature compatibility
    :param images_list: not used by this function; kept for signature compatibility
    :param get_image_id: not used by this function; kept for signature compatibility
    :return: list of ``{'image': node, 'get_image_id_handle': callable}``
        dicts, in search order (``pic:pic`` matches first, then ``w:pict``)
    """
    import logging

    images_xpath_list = [
        (".//pic:pic", lambda img: img.xpath('.//a:blip/@r:embed')),
        (".//w:pict", lambda img: img.xpath('.//v:imagedata/@r:id', namespaces=combine_nsmap)),
    ]
    images = []
    for images_xpath, get_image_id_handle in images_xpath_list:
        try:
            found = paragraph_element.xpath(images_xpath)
            if found:
                images.extend(
                    {'image': image, 'get_image_id_handle': get_image_id_handle}
                    for image in found
                )
        except Exception:
            # Deliberately non-fatal: a failed lookup for one markup kind must
            # not stop the document from being processed. Record it instead of
            # dropping it silently so the failure can be diagnosed.
            logging.getLogger(__name__).debug(
                "image lookup failed for xpath %s", images_xpath, exc_info=True)
    return images