perf: Enhance Word parsing (#2612)
This commit is contained in:
parent
263c18ebca
commit
5ec94860b2
@ -110,24 +110,51 @@ def get_image_id_func():
|
|||||||
return get_image_id
|
return get_image_id
|
||||||
|
|
||||||
|
|
||||||
|
# Font-size bands, in points, consulted by get_title_level() when a paragraph
# carries no heading style. Each entry is [lower bound (inclusive),
# upper bound (exclusive)]; an entry's position in the list plus one is the
# heading level it maps to, so the largest fonts become level 1.
title_font_list = [
    [36, 100],
    [26, 36],
    [24, 26],
    [22, 24],
    [18, 22],
    [16, 18],
]
|
||||||
|
|
||||||
|
|
||||||
|
def get_title_level(paragraph: Paragraph):
    """Infer the Markdown heading level of a Word paragraph.

    Two signals are tried, in this order:

    1. The style name. When it starts with ``Heading``, ``TOC 标题`` or
       ``标题``, that prefix is removed and the remainder is parsed as the
       level (``Heading 2`` -> 2). A heading-like name without a parsable
       number (for example a bare ``TOC 标题``) yields ``None``.
    2. The font size, only for paragraphs made of exactly one run. A size of
       at least 16pt is looked up in ``title_font_list``; the 1-based
       position of the matching band is the level. Sizes outside every band
       (below 16pt, or 100pt and above) yield ``None``.

    :param paragraph: the paragraph to inspect
    :return: the heading level as an ``int``, or ``None`` when no level can
             be determined
    """
    try:
        style = paragraph.style
        if style is not None:
            psn = style.name
            if psn.startswith(('Heading', 'TOC 标题', '标题')):
                # int() tolerates surrounding whitespace, so '标题 1' parses too.
                return int(psn.replace("Heading ", '').replace('TOC 标题', '').replace('标题', ''))

        runs = paragraph.runs
        if len(runs) == 1:
            font_size = runs[0].font.size
            # No explicit size on the run: nothing to measure. Handle this
            # common case directly instead of relying on an AttributeError.
            if font_size is None:
                return None
            pt = font_size.pt
            if pt >= 16:
                for index, (lower, upper) in enumerate(title_font_list):
                    if lower <= pt < upper:
                        return index + 1
    except (AttributeError, TypeError, ValueError):
        # Malformed style name or unexpected attribute shape: treat the
        # paragraph as "not a heading". Other exception types are left to
        # propagate so genuine bugs are not silently hidden.
        return None
    return None
|
||||||
|
|
||||||
|
|
||||||
class DocSplitHandle(BaseSplitHandle):
|
class DocSplitHandle(BaseSplitHandle):
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def paragraph_to_md(paragraph: Paragraph, doc: Document, images_list, get_image_id):
|
def paragraph_to_md(paragraph: Paragraph, doc: Document, images_list, get_image_id):
|
||||||
try:
|
try:
|
||||||
psn = paragraph.style.name
|
title_level = get_title_level(paragraph)
|
||||||
if psn.startswith('Heading') or psn.startswith('TOC 标题') or psn.startswith('标题'):
|
if title_level is not None:
|
||||||
title = "".join(["#" for i in range(
|
title = "".join(["#" for i in range(title_level)]) + " " + paragraph.text
|
||||||
int(psn.replace("Heading ", '').replace('TOC 标题', '').replace('标题',
|
|
||||||
'')))]) + " " + paragraph.text
|
|
||||||
images = reduce(lambda x, y: [*x, *y],
|
images = reduce(lambda x, y: [*x, *y],
|
||||||
[get_paragraph_element_images(e, doc, images_list, get_image_id) for e in
|
[get_paragraph_element_images(e, doc, images_list, get_image_id) for e in
|
||||||
paragraph._element],
|
paragraph._element],
|
||||||
[])
|
[])
|
||||||
|
|
||||||
if len(images) > 0:
|
if len(images) > 0:
|
||||||
return title + '\n' + images_to_string(images, doc, images_list, get_image_id) if len(
|
return title + '\n' + images_to_string(images, doc, images_list, get_image_id) if len(
|
||||||
paragraph.text) > 0 else images_to_string(images, doc, images_list, get_image_id)
|
paragraph.text) > 0 else images_to_string(images, doc, images_list, get_image_id)
|
||||||
return title
|
return title
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
traceback.print_exc()
|
traceback.print_exc()
|
||||||
return paragraph.text
|
return paragraph.text
|
||||||
|
|||||||
@ -339,13 +339,14 @@ class SplitModel:
|
|||||||
for e in result:
|
for e in result:
|
||||||
if len(e['content']) > 4096:
|
if len(e['content']) > 4096:
|
||||||
pass
|
pass
|
||||||
return [item for item in [self.post_reset_paragraph(row) for row in result] if
|
title_list = list(set([row.get('title') for row in result]))
|
||||||
|
return [item for item in [self.post_reset_paragraph(row, title_list) for row in result] if
|
||||||
'content' in item and len(item.get('content').strip()) > 0]
|
'content' in item and len(item.get('content').strip()) > 0]
|
||||||
|
|
||||||
def post_reset_paragraph(self, paragraph: Dict, title_list: List[str] = None):
    """Run the post-split clean-up steps on a single paragraph.

    The paragraph is passed, in order, through
    ``filter_title_special_characters``, ``sub_title`` and
    ``content_is_null``; each step receives the previous step's result.

    :param paragraph: paragraph dict produced by the splitter
    :param title_list: titles of all paragraphs in the current result set,
                       forwarded to ``content_is_null``. Optional so that
                       callers using the earlier one-argument form keep
                       working; a missing value is treated as an empty list.
    :return: the result of the final ``content_is_null`` step
    """
    cleaned = self.filter_title_special_characters(paragraph)
    cleaned = self.sub_title(cleaned)
    # content_is_null iterates over the titles, so never hand it None.
    known_titles = title_list if title_list is not None else []
    return self.content_is_null(cleaned, known_titles)
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
@ -357,11 +358,14 @@ class SplitModel:
|
|||||||
return paragraph
|
return paragraph
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def content_is_null(paragraph: Dict, title_list: List[str] = None):
    """Normalise a paragraph that has a title but no body text.

    Paragraphs without a ``'title'`` key, with non-blank content, or with an
    empty title are returned unchanged. Otherwise the paragraph is
    title-only and is rewritten:

    * if the title is a proper substring of some other entry in
      ``title_list``, both fields are emptied;
    * otherwise the title text is moved into ``'content'`` and the title is
      cleared.

    :param paragraph: paragraph dict, read via its ``'title'`` and
                      ``'content'`` keys
    :param title_list: titles to compare against. Entries that are not
                       strings (e.g. ``None`` for rows without a title) are
                       ignored; ``None`` is treated as an empty list.
    :return: the original dict, or a new ``{'title': ..., 'content': ...}``
             dict for title-only paragraphs
    """
    if 'title' not in paragraph:
        return paragraph

    title = paragraph.get('title')
    content = paragraph.get('content')

    has_content = content is not None and len(content.strip()) > 0
    if has_content or title is None or len(title) == 0:
        return paragraph

    # Skip non-string entries: calling a substring test on None would raise.
    contained_elsewhere = any(
        isinstance(other, str) and other != title and title in other
        for other in (title_list or ())
    )
    if contained_elsewhere:
        return {'title': '', 'content': ''}
    return {'title': '', 'content': title}
|
||||||
|
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user