fix: 修复知识库上传旧版本docx文件后,图片未正常识别导入的问题 (#1382)

This commit is contained in:
Henry-Shaw 2024-10-16 14:39:52 +08:00 committed by GitHub
parent 45e9b9120b
commit 33d63c8efe
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -15,6 +15,7 @@ from typing import List
from docx import Document, ImagePart from docx import Document, ImagePart
from docx.table import Table from docx.table import Table
from docx.text.paragraph import Paragraph from docx.text.paragraph import Paragraph
from docx.oxml import ns
from common.handle.base_split_handle import BaseSplitHandle from common.handle.base_split_handle import BaseSplitHandle
from common.util.split_model import SplitModel from common.util.split_model import SplitModel
@ -27,9 +28,15 @@ default_pattern_list = [re.compile('(?<=^)# .*|(?<=\\n)# .*'),
re.compile("(?<=\\n)(?<!#)##### (?!#).*|(?<=^)(?<!#)##### (?!#).*"), re.compile("(?<=\\n)(?<!#)##### (?!#).*|(?<=^)(?<!#)##### (?!#).*"),
re.compile("(?<=\\n)(?<!#)###### (?!#).*|(?<=^)(?<!#)###### (?!#).*")] re.compile("(?<=\\n)(?<!#)###### (?!#).*|(?<=^)(?<!#)###### (?!#).*")]
old_docx_nsmap = {'v': 'urn:schemas-microsoft-com:vml'}
combine_nsmap = {**ns.nsmap, **old_docx_nsmap}
def image_to_mode(image, doc: Document, images_list, get_image_id): def image_to_mode(image, doc: Document, images_list, get_image_id, is_new_docx=True):
for img_id in image.xpath('.//a:blip/@r:embed'): # 获取图片id if is_new_docx:
image_ids = image.xpath('.//a:blip/@r:embed')
else:
image_ids = image.xpath('.//v:imagedata/@r:id', namespaces=combine_nsmap)
for img_id in image_ids: # 获取图片id
part = doc.part.related_parts[img_id] # 根据图片id获取对应的图片 part = doc.part.related_parts[img_id] # 根据图片id获取对应的图片
if isinstance(part, ImagePart): if isinstance(part, ImagePart):
image_uuid = get_image_id(img_id) image_uuid = get_image_id(img_id)
@ -42,10 +49,15 @@ def image_to_mode(image, doc: Document, images_list, get_image_id):
def get_paragraph_element_txt(paragraph_element, doc: Document, images_list, get_image_id): def get_paragraph_element_txt(paragraph_element, doc: Document, images_list, get_image_id):
try: try:
images = paragraph_element.xpath(".//pic:pic") images = paragraph_element.xpath(".//pic:pic")
old_docx_images = paragraph_element.xpath(".//w:pict")
if len(images) > 0: if len(images) > 0:
return "".join( return "".join(
[item for item in [image_to_mode(image, doc, images_list, get_image_id) for image in images] if [item for item in [image_to_mode(image, doc, images_list, get_image_id) for image in images] if
item is not None]) item is not None])
elif len(old_docx_images) > 0:
return "".join(
[item for item in [image_to_mode(image, doc, images_list, get_image_id, is_new_docx=False) for image in old_docx_images] if
item is not None])
elif paragraph_element.text is not None: elif paragraph_element.text is not None:
return paragraph_element.text return paragraph_element.text
return "" return ""