# maxkb/apps/common/handle/impl/text/mineru_split_handle.py

# -*- coding: utf-8 -*-
"""
MinerU Split Handle - 使用MinerU服务处理文档和图片

支持的文档格式：PDF、PPT、PPTX、DOC、DOCX
支持的图片格式：PNG、JPG、JPEG、GIF、BMP、TIFF、WebP、SVG
"""
import os
from typing import List, Dict, Any
from common.handle.base_split_handle import BaseSplitHandle
from common.handle.impl.mineru.maxkb_adapter.adapter import MinerUAdapter
from common.utils.logger import maxkb_logger as logger


class MinerUSplitHandle(BaseSplitHandle):
    """
    Split handler that sends documents and images to the MinerU service.

    The ``MinerUAdapter`` is created lazily on first use and cached on the
    instance. When MinerU processing raises, both ``handle`` and
    ``get_content`` fall back to ``PdfSplitHandle``.
    """

    # Lower-case file suffixes accepted by ``support``.
    SUPPORTED_EXTENSIONS = (
        '.pdf', '.ppt', '.pptx', '.doc', '.docx',  # document formats
        '.png', '.jpg', '.jpeg', '.gif', '.bmp',    # image formats
        '.tiff', '.tif', '.webp', '.svg'            # further image formats
    )

    def __init__(self):
        super().__init__()
        # Created on demand by ``handle`` / ``get_content``.
        self.mineru_adapter = None

    def support(self, file, get_buffer, **kwargs):
        """
        Report whether this handler should process ``file``.

        Args:
            file: Object exposing a ``name`` attribute (the file name).
            get_buffer: Unused here; part of the handler interface.
            **kwargs: ``is_preview`` (bool) - when truthy the handler declines,
                because MinerU processing is slow.

        Returns:
            True only when this is not a preview request, the file name ends
            with one of ``SUPPORTED_EXTENSIONS`` (case-insensitive), and the
            ``MINERU_API_TYPE`` environment variable is set to a non-empty
            value.
        """
        # Preview mode never uses MinerU.
        if kwargs.get('is_preview', False):
            return False

        # ``str.endswith`` accepts a tuple, so one call covers every suffix.
        if not file.name.lower().endswith(self.SUPPORTED_EXTENSIONS):
            return False

        # MinerU must be configured through the environment.
        return bool(os.environ.get('MINERU_API_TYPE', ''))

    def handle(self, file, pattern_list: List, with_filter: bool, limit: int,
               get_buffer, save_image, **kwargs):
        """
        Process ``file`` with MinerU and return its paragraphs.

        Args:
            file: Uploaded file object; ``file.name`` is used when present.
            pattern_list: Optional split patterns applied to every paragraph.
            with_filter: Not used on the MinerU path; forwarded to the
                fallback handler only.
            limit: Maximum number of paragraphs to return. ``None`` or a
                non-positive value means no limit.
            get_buffer: Callable invoked as ``get_buffer(file)``; its result is
                passed to the adapter as ``file_content``.
            save_image: Passed to the adapter as ``save_image_func``.
            **kwargs: Optional ``llm_model_id`` and ``vision_model_id``, which
                are forwarded to the adapter when truthy.

        Returns:
            A list of dicts with the keys ``content``, ``title`` and
            ``images``. If anything in the MinerU path raises, the result of
            ``PdfSplitHandle.handle`` is returned instead.
        """
        try:
            logger.info(f"MinerUSplitHandle.handle called for file: {getattr(file, 'name', 'unknown')}")

            # Look the optional model IDs up once; they are used for logging
            # and for the adapter call below.
            llm_model_id = kwargs.get('llm_model_id')
            vision_model_id = kwargs.get('vision_model_id')

            # Lazily create the adapter on first use.
            if not self.mineru_adapter:
                logger.info("Initializing MinerU adapter")
                if llm_model_id and vision_model_id:
                    logger.info(f"Using models: LLM={llm_model_id}, Vision={vision_model_id}")
                self.mineru_adapter = MinerUAdapter()

            # Read the file content.
            buffer = get_buffer(file)
            logger.info(f"File buffer size: {len(buffer) if buffer else 0} bytes")

            logger.info("Calling MinerU adapter to process document")
            process_kwargs = {
                'file_content': buffer,
                'file_name': getattr(file, 'name', 'document.pdf'),
                'save_image_func': save_image
            }
            # Only pass model IDs that were actually supplied.
            if llm_model_id:
                process_kwargs['llm_model_id'] = llm_model_id
            if vision_model_id:
                process_kwargs['vision_model_id'] = vision_model_id

            result = self.mineru_adapter.process_document(**process_kwargs)
            sections = result.get('sections', [])
            logger.info(f"MinerU adapter returned result with {len(sections)} sections")

            # Convert sections to paragraphs, skipping those without content.
            paragraphs = []
            for section in sections:
                content = section.get('content', '')
                if content:
                    paragraphs.append({
                        'content': content,
                        'title': section.get('title', ''),
                        'images': section.get('images', [])
                    })

            logger.info(f"Converted to {len(paragraphs)} paragraphs before pattern processing")

            # Apply the split patterns: every pattern is run against every
            # paragraph and each resulting piece inherits the paragraph's
            # title and images.
            # NOTE(review): this assumes each pattern exposes ``parse(str)``;
            # a compiled ``re.Pattern`` does not - confirm against callers.
            if pattern_list:
                split_paragraphs = []
                for paragraph in paragraphs:
                    content = paragraph['content']
                    for pattern in pattern_list:
                        for split_content in pattern.parse(content):
                            split_paragraphs.append({
                                'content': split_content,
                                'title': paragraph.get('title', ''),
                                'images': paragraph.get('images', [])
                            })
                paragraphs = split_paragraphs

            # Cap the number of paragraphs. A ``None`` limit is treated as
            # "no limit" instead of raising TypeError and triggering the
            # fallback below.
            if limit is not None and limit > 0:
                paragraphs = paragraphs[:limit]

            logger.info(f"MinerUSplitHandle returning {len(paragraphs)} paragraphs")
            return paragraphs

        except Exception as e:
            logger.error(f"MinerU处理文档失败: {str(e)}", exc_info=True)
            # MinerU failed: fall back to the PDF handler. Imported locally,
            # as in the original code.
            from common.handle.impl.text.pdf_split_handle import PdfSplitHandle
            pdf_handler = PdfSplitHandle()
            return pdf_handler.handle(file, pattern_list, with_filter, limit,
                                      get_buffer, save_image)

    def get_content(self, file, get_buffer):
        """
        Return the text content of ``file``.

        When an adapter is available (already created, or creatable because
        ``MINERU_API_TYPE`` is set) the content of all non-empty sections is
        joined with newlines. Otherwise, or if MinerU raises, the result of
        ``PdfSplitHandle.get_content`` is returned.

        Args:
            file: Uploaded file object; ``file.name`` is used when present.
            get_buffer: Callable invoked as ``get_buffer(file)``.
        """
        try:
            # Create the adapter only when MinerU is configured.
            if self.mineru_adapter is None:
                mineru_api_type = os.environ.get('MINERU_API_TYPE', '')
                if mineru_api_type:
                    self.mineru_adapter = MinerUAdapter()

            if self.mineru_adapter:
                buffer = get_buffer(file)
                result = self.mineru_adapter.process_document(
                    file_content=buffer,
                    # Same default name as ``handle`` uses when the file
                    # object has no ``name`` attribute.
                    file_name=getattr(file, 'name', 'document.pdf'),
                    save_image_func=None
                )

                # Merge the content of every non-empty section; joining an
                # empty list already yields ''.
                content_parts = [
                    section['content']
                    for section in result.get('sections', [])
                    if section.get('content')
                ]
                return '\n'.join(content_parts)
        except Exception as e:
            logger.warning(f"MinerU获取内容失败，回退到PDF处理器: {str(e)}")

        # Fall back to the PDF handler.
        from common.handle.impl.text.pdf_split_handle import PdfSplitHandle
        pdf_handler = PdfSplitHandle()
        return pdf_handler.get_content(file, get_buffer)