fix: 修复旧word文档图片无法正常识别 #1533
This commit is contained in:
parent
af509a9f58
commit
22d9fdc42f
@ -14,9 +14,9 @@ from functools import reduce
|
|||||||
from typing import List
|
from typing import List
|
||||||
|
|
||||||
from docx import Document, ImagePart
|
from docx import Document, ImagePart
|
||||||
|
from docx.oxml import ns
|
||||||
from docx.table import Table
|
from docx.table import Table
|
||||||
from docx.text.paragraph import Paragraph
|
from docx.text.paragraph import Paragraph
|
||||||
from docx.oxml import ns
|
|
||||||
|
|
||||||
from common.handle.base_split_handle import BaseSplitHandle
|
from common.handle.base_split_handle import BaseSplitHandle
|
||||||
from common.util.split_model import SplitModel
|
from common.util.split_model import SplitModel
|
||||||
@ -33,11 +33,8 @@ old_docx_nsmap = {'v': 'urn:schemas-microsoft-com:vml'}
|
|||||||
combine_nsmap = {**ns.nsmap, **old_docx_nsmap}
|
combine_nsmap = {**ns.nsmap, **old_docx_nsmap}
|
||||||
|
|
||||||
|
|
||||||
def image_to_mode(image, doc: Document, images_list, get_image_id, is_new_docx=True):
|
def image_to_mode(image, doc: Document, images_list, get_image_id):
|
||||||
if is_new_docx:
|
image_ids = image['get_image_id_handle'](image.get('image'))
|
||||||
image_ids = image.xpath('.//a:blip/@r:embed')
|
|
||||||
else:
|
|
||||||
image_ids = image.xpath('.//v:imagedata/@r:id', namespaces=combine_nsmap)
|
|
||||||
for img_id in image_ids: # 获取图片id
|
for img_id in image_ids: # 获取图片id
|
||||||
part = doc.part.related_parts[img_id] # 根据图片id获取对应的图片
|
part = doc.part.related_parts[img_id] # 根据图片id获取对应的图片
|
||||||
if isinstance(part, ImagePart):
|
if isinstance(part, ImagePart):
|
||||||
@ -49,14 +46,15 @@ def image_to_mode(image, doc: Document, images_list, get_image_id, is_new_docx=T
|
|||||||
|
|
||||||
|
|
||||||
def get_paragraph_element_images(paragraph_element, doc: Document, images_list, get_image_id):
|
def get_paragraph_element_images(paragraph_element, doc: Document, images_list, get_image_id):
|
||||||
images_xpath_list = [".//pic:pic", ".//w:pict"]
|
images_xpath_list = [(".//pic:pic", lambda img: img.xpath('.//a:blip/@r:embed')),
|
||||||
|
(".//w:pict", lambda img: img.xpath('.//v:imagedata/@r:id', namespaces=combine_nsmap))]
|
||||||
images = []
|
images = []
|
||||||
for images_xpath in images_xpath_list:
|
for images_xpath, get_image_id_handle in images_xpath_list:
|
||||||
try:
|
try:
|
||||||
_images = paragraph_element.xpath(images_xpath)
|
_images = paragraph_element.xpath(images_xpath)
|
||||||
if _images is not None and len(_images) > 0:
|
if _images is not None and len(_images) > 0:
|
||||||
for image in _images:
|
for image in _images:
|
||||||
images.append(image)
|
images.append({'image': image, 'get_image_id_handle': get_image_id_handle})
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
pass
|
pass
|
||||||
return images
|
return images
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user