fix: 修复知识库上传旧版本docx文件后,图片未正常识别导入的问题 (#1382)
This commit is contained in:
parent
45e9b9120b
commit
33d63c8efe
@ -15,6 +15,7 @@ from typing import List
|
|||||||
from docx import Document, ImagePart
|
from docx import Document, ImagePart
|
||||||
from docx.table import Table
|
from docx.table import Table
|
||||||
from docx.text.paragraph import Paragraph
|
from docx.text.paragraph import Paragraph
|
||||||
|
from docx.oxml import ns
|
||||||
|
|
||||||
from common.handle.base_split_handle import BaseSplitHandle
|
from common.handle.base_split_handle import BaseSplitHandle
|
||||||
from common.util.split_model import SplitModel
|
from common.util.split_model import SplitModel
|
||||||
@ -27,9 +28,15 @@ default_pattern_list = [re.compile('(?<=^)# .*|(?<=\\n)# .*'),
|
|||||||
re.compile("(?<=\\n)(?<!#)##### (?!#).*|(?<=^)(?<!#)##### (?!#).*"),
|
re.compile("(?<=\\n)(?<!#)##### (?!#).*|(?<=^)(?<!#)##### (?!#).*"),
|
||||||
re.compile("(?<=\\n)(?<!#)###### (?!#).*|(?<=^)(?<!#)###### (?!#).*")]
|
re.compile("(?<=\\n)(?<!#)###### (?!#).*|(?<=^)(?<!#)###### (?!#).*")]
|
||||||
|
|
||||||
|
old_docx_nsmap = {'v': 'urn:schemas-microsoft-com:vml'}
|
||||||
|
combine_nsmap = {**ns.nsmap, **old_docx_nsmap}
|
||||||
|
|
||||||
def image_to_mode(image, doc: Document, images_list, get_image_id):
|
def image_to_mode(image, doc: Document, images_list, get_image_id, is_new_docx=True):
|
||||||
for img_id in image.xpath('.//a:blip/@r:embed'): # 获取图片id
|
if is_new_docx:
|
||||||
|
image_ids = image.xpath('.//a:blip/@r:embed')
|
||||||
|
else:
|
||||||
|
image_ids = image.xpath('.//v:imagedata/@r:id', namespaces=combine_nsmap)
|
||||||
|
for img_id in image_ids: # 获取图片id
|
||||||
part = doc.part.related_parts[img_id] # 根据图片id获取对应的图片
|
part = doc.part.related_parts[img_id] # 根据图片id获取对应的图片
|
||||||
if isinstance(part, ImagePart):
|
if isinstance(part, ImagePart):
|
||||||
image_uuid = get_image_id(img_id)
|
image_uuid = get_image_id(img_id)
|
||||||
@ -42,10 +49,15 @@ def image_to_mode(image, doc: Document, images_list, get_image_id):
|
|||||||
def get_paragraph_element_txt(paragraph_element, doc: Document, images_list, get_image_id):
|
def get_paragraph_element_txt(paragraph_element, doc: Document, images_list, get_image_id):
|
||||||
try:
|
try:
|
||||||
images = paragraph_element.xpath(".//pic:pic")
|
images = paragraph_element.xpath(".//pic:pic")
|
||||||
|
old_docx_images = paragraph_element.xpath(".//w:pict")
|
||||||
if len(images) > 0:
|
if len(images) > 0:
|
||||||
return "".join(
|
return "".join(
|
||||||
[item for item in [image_to_mode(image, doc, images_list, get_image_id) for image in images] if
|
[item for item in [image_to_mode(image, doc, images_list, get_image_id) for image in images] if
|
||||||
item is not None])
|
item is not None])
|
||||||
|
elif len(old_docx_images) > 0:
|
||||||
|
return "".join(
|
||||||
|
[item for item in [image_to_mode(image, doc, images_list, get_image_id, is_new_docx=False) for image in old_docx_images] if
|
||||||
|
item is not None])
|
||||||
elif paragraph_element.text is not None:
|
elif paragraph_element.text is not None:
|
||||||
return paragraph_element.text
|
return paragraph_element.text
|
||||||
return ""
|
return ""
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user