""" Flowchart recognition plugin for MinerU-based parsing. This module provides specialized flowchart recognition capabilities, extracted from gzero.py's gzero_gwc_flowchart function as requested. """ import os import base64 import asyncio from typing import Dict, Any from .logger import get_module_logger logger = get_module_logger('flowchart_plugin') from .config_base import MinerUConfig # Platform-specific imports (if available) try: from loader.gzero import gzero_vllm_proc, gzero_vllm_page_filter except ImportError: gzero_vllm_proc = None gzero_vllm_page_filter = None class FlowchartPlugin: """Plugin for specialized flowchart recognition and processing""" def __init__(self, learn_type: int = 9): """ Initialize flowchart plugin. Args: learn_type: Model type for AI processing """ self.learn_type = learn_type self.logger = logger self.config = MinerUConfig() async def process_flowchart_page(self, page_desc: Dict, page_count: int, image_name: str, temp_dir: str, src_name: str) -> Dict[str, Any]: """ Process a page containing GWC flowchart content. Based on gzero_gwc_flowchart function from gzero.py. Args: page_desc: Page description with text content page_count: Total number of pages image_name: Name of page image file temp_dir: Temporary directory src_name: Source file name Returns: Dictionary with processed flowchart content and metadata """ try: self.logger.info(f"flowchart-plugin: [{src_name}] processing flowchart page: {image_name}") # Load vision model vision_model = self.config.get_model(self.learn_type) # Load image and convert to base64 image_file = os.path.join(temp_dir, image_name) with open(image_file, 'rb') as file: image_data = file.read() image_base64 = base64.b64encode(image_data).decode("utf-8") image_url = f"data:image/png;base64,{image_base64}" # Define processing steps step0_prompt = self._get_step0_prompt() step1_prompt = self._get_step1_prompt() step2_prompt = self._get_step2_prompt() step3_prompt = self._get_step3_prompt() hint = f"{page_desc['index']}/{page_count}" # Step 0: Identify flowchart nodes step0_res = await self._execute_step( vision_model, step0_prompt, image_url, temp_dir, src_name, f"{os.path.splitext(image_name)[0]}.step0", hint ) # Step 1: Organize nodes by department step1_res = await self._execute_step_with_context( vision_model, step0_prompt, step1_prompt, image_url, step0_res['content'], temp_dir, src_name, f"{os.path.splitext(image_name)[0]}.step1", hint ) # Step 2: Create mermaid flowchart step2_res = await self._execute_step_with_context( vision_model, step0_prompt, step2_prompt, image_url, step0_res['content'], temp_dir, src_name, f"{os.path.splitext(image_name)[0]}.step2", hint ) # Step 3: Extract remaining content step3_res = await self._execute_complex_step( vision_model, step0_prompt, step2_prompt, step3_prompt, image_url, step0_res['content'], step2_res['content'], temp_dir, src_name, f"{os.path.splitext(image_name)[0]}.step3", hint ) # Combine results content = self._combine_flowchart_results( step0_res, step1_res, step2_res, step3_res ) # Calculate totals input_tokens = sum([step0_res['input_tokens'], step1_res['input_tokens'], step2_res['input_tokens'], step3_res['input_tokens']]) output_tokens = sum([step0_res['output_tokens'], step1_res['output_tokens'], step2_res['output_tokens'], step3_res['output_tokens']]) dura = sum([step0_res['dura'], step1_res['dura'], step2_res['dura'], step3_res['dura']]) self.logger.info(f"flowchart-plugin: [{src_name}] flowchart processing completed") return { 'content': content, 'input_tokens': input_tokens, 'output_tokens': output_tokens, 'dura': dura, 'flowchart_metadata': { 'steps_processed': 4, 'nodes_extracted': step0_res['content'].count(''), 'departments_identified': step1_res['content'].count('#'), 'mermaid_generated': 'mermaid' in step2_res['content'].lower() } } except Exception as e: self.logger.error(f"flowchart-plugin: [{src_name}] flowchart processing failed: {str(e)}") raise def should_use_flowchart_plugin(self, page_text: str) -> bool: """ Determine if a page should use the flowchart plugin. Based on the conditions from gzero.py. """ indicators = [ '\\n流程名稱\\n', '\\n流程編號\\n', '\\n流程說明\\n' ] # Check if all indicators are present has_all_indicators = all(indicator in page_text for indicator in indicators) # Additional checks for flowchart-like content has_flowchart_keywords = any(keyword in page_text.lower() for keyword in [ 'flowchart', 'flow chart', '流程图', '流程圖', 'process', 'workflow', 'tcode', 'sap' ]) return has_all_indicators or has_flowchart_keywords async def _execute_step(self, vision_model, prompt: str, image_url: str, temp_dir: str, src_name: str, req_name: str, hint: str) -> Dict: """Execute a single step with image input""" step_req = [ {'role': 'system', 'content': prompt}, {'role': 'user', 'content': [{'type': 'image_url', 'image_url': {'url': image_url}}]}, ] if gzero_vllm_proc and gzero_vllm_page_filter: res = await gzero_vllm_proc(vision_model, step_req, temp_dir, src_name, req_name, hint) res['content'] = gzero_vllm_page_filter(res['content']) else: # Fallback if platform functions not available res = {'content': 'Flowchart processing not available on this platform'} self.logger.info(f"flowchart-plugin: [{src_name}] step completed: {req_name}") return res async def _execute_step_with_context(self, vision_model, system_prompt: str, user_prompt: str, image_url: str, context: str, temp_dir: str, src_name: str, req_name: str, hint: str) -> Dict: """Execute a step with previous context""" step_req = [ {'role': 'system', 'content': system_prompt}, {'role': 'user', 'content': [{'type': 'image_url', 'image_url': {'url': image_url}}]}, {'role': 'assistant', 'content': context}, {'role': 'user', 'content': user_prompt}, ] if gzero_vllm_proc and gzero_vllm_page_filter: res = await gzero_vllm_proc(vision_model, step_req, temp_dir, src_name, req_name, hint) res['content'] = gzero_vllm_page_filter(res['content']) else: # Fallback if platform functions not available res = {'content': 'Flowchart processing not available on this platform'} self.logger.info(f"flowchart-plugin: [{src_name}] context step completed: {req_name}") return res async def _execute_complex_step(self, vision_model, system_prompt: str, step2_prompt: str, step3_prompt: str, image_url: str, step0_content: str, step2_content: str, temp_dir: str, src_name: str, req_name: str, hint: str) -> Dict: """Execute complex step with multiple context elements""" step_req = [ {'role': 'system', 'content': system_prompt}, {'role': 'user', 'content': [{'type': 'image_url', 'image_url': {'url': image_url}}]}, {'role': 'assistant', 'content': step0_content}, {'role': 'user', 'content': step2_prompt}, {'role': 'assistant', 'content': step2_content}, {'role': 'user', 'content': step3_prompt}, ] if gzero_vllm_proc and gzero_vllm_page_filter: res = await gzero_vllm_proc(vision_model, step_req, temp_dir, src_name, req_name, hint) res['content'] = gzero_vllm_page_filter(res['content']) else: # Fallback if platform functions not available res = {'content': 'Flowchart processing not available on this platform'} self.logger.info(f"flowchart-plugin: [{src_name}] complex step completed: {req_name}") return res def _combine_flowchart_results(self, step0_res: Dict, step1_res: Dict, step2_res: Dict, step3_res: Dict) -> str: """Combine all flowchart processing results into final content""" content = '' content += step3_res['content'] + '\\n\\n' content += '流程节点:\\n' + step0_res['content'] + '\\n\\n' content += '流程负责部门:\\n' + step1_res['content'] + '\\n\\n' content += '流程图:\\n' + step2_res['content'] + '\\n\\n' return content def _get_step0_prompt(self) -> str: """Get step 0 prompt for node identification""" return """ 图片左下方一个很大的格子内是一张大的流程图。请用你的视觉多模态能力识别这张流程图的每一个处理节点的内容文字并输出。 某些方框右上角会有一个SAP的图标,图标的上方会有一串大写英文字母、数字、横杠、斜杠组成的TCode代码,请注意识别并以[TCode:代码]的格式添加到该方框的识别结果中。 每一个节点识别的内容如有换行请替换为空格,最终用标签包含并输出。 请保持识别文字原来的写法,比如是繁体中文的请输出繁体中文,不要输出为简体中文,更不要翻译为其他语种。 """ def _get_step1_prompt(self) -> str: """Get step 1 prompt for department organization""" return """ 留意流程图上方还有一行标题行文字,可能中间还会有竖向分隔虚线,将流程图分隔为多个区域并标识为标题文字的负责部门。 请根据之前识别出来的流程节点信息,以及可能有中间分隔虚线划分出来的区域,将之前识别出来的node节点信息归属到标题文字标识的负责部门中,输出如下的markdown格式: # 负责部门1(标题文字) ## 归属负责部门1的node节点内容 ## 归属负责部门1的node节点内容 ... # 负责部门2(标题文字) ## 归属负责部门2的node节点内容 ## 归属负责部门2的node节点内容 ... """ def _get_step2_prompt(self) -> str: """Get step 2 prompt for mermaid generation""" return """ 再观察图片左下方大的格子内的流程图, 根据之前识别出来的流程图node节点信息, 参考流程图中节点间的线条和箭头,以及线条旁边的条件标签, 将整张流程图识别并输出为mermaid格式。 注意要以node节点内容信息作为mermaid节点的内容。 """ def _get_step3_prompt(self) -> str: """Get step 3 prompt for remaining content extraction""" return """ 根据上面识别出来图片左下很大区域的流程图内的节点node信息和流程图mermaid信息,将图片内流程图其余部分的内容识别并输出为markdown格式。 1. 优先识别页面最左上角格子内类似"XXXX專案"的标题文字(用`#`表示)。 2. 注意不需要再识别左下角大格子内的流程图内容了,你只需要识别流程图之外的内容就可以了。识别的内容输出为markdown格式。 """