maxkb/apps/common/handle/impl/mineru/image_processor.py

"""
Image processing module for MinerU-based parsing.

This module handles image recognition, classification, and processing
using multimodal AI models, following patterns from gzero.py.
"""

import os
import json
import base64
import asyncio
import time
import io
from typing import Dict, List, Optional, Tuple, Any
from dataclasses import dataclass
from .logger import get_module_logger
logger = get_module_logger('image_processor')
import tiktoken

from .config_base import MinerUConfig
from .context_types import ImageContext, ContentElement, PageContext
from .prompts import format_image_classification_prompt
from .language_detector import LanguageDetector
from .image_optimizer import ImageOptimizer


@dataclass
class ImageProcessingResult:
    """Outcome of one ``MinerUImageProcessor.process_images`` call.

    ``process_images`` returns this object on success and on failure alike;
    on failure both mappings are empty and ``error`` holds the exception text.
    """
    # True when the pipeline ran to completion without raising.
    success: bool
    # Image filename -> value returned by the upload step for that image.
    processed_images: Dict[str, str]  # filename -> uploaded_url
    # Image filename -> classification result dict for that image.
    image_descriptions: Dict[str, Dict]  # filename -> classification_result
    # str(exception) when processing failed, otherwise None.
    error: Optional[str] = None


class MinerUImageProcessor:
    """Image processing handler for MinerU pipeline"""

    def __init__(self, config: MinerUConfig):
        """Store the configuration and reset the lazily-created collaborators.

        Args:
            config: Pipeline configuration read by the other methods.
        """
        self.config = config
        self.logger = logger
        # Created on demand by initialize().
        self.image_optimizer = None
        # Will be set by parser if available
        self.platform_adapter = None

        # Report the model ids only when the config exposes both of them.
        exposes_model_ids = all(
            hasattr(config, attr) for attr in ('llm_model_id', 'vision_model_id')
        )
        if exposes_model_ids:
            llm_id = getattr(config, 'llm_model_id', 'N/A')
            vision_id = getattr(config, 'vision_model_id', 'N/A')
            self.logger.info(f"ImageProcessor initialized with config: LLM={llm_id}, Vision={vision_id}")

    async def initialize(self):
        """Build the ImageOptimizer from the limits stored on the config."""
        # Each name is both the config attribute read and the keyword passed on.
        option_names = (
            'max_concurrent_uploads',
            'max_concurrent_api_calls',
            'max_image_size_mb',
            'compression_quality',
            'upload_max_retries',
            'upload_retry_delay',
        )
        settings = {name: getattr(self.config, name) for name in option_names}
        self.image_optimizer = ImageOptimizer(**settings)

    async def cleanup(self):
        """Cleanup image optimizer resources"""
        if self.image_optimizer:
            await self.image_optimizer.cleanup()

    def _should_skip_recognition(self, image_path: str) -> Tuple[bool, str]:
        """
        Check if an image should skip AI recognition based on size and dimensions.

        Args:
            image_path: Path to the image file

        Returns:
            Tuple of (should_skip, reason)
        """
        try:
            # Check file size
            file_size = os.path.getsize(image_path)
            file_size_kb = file_size / 1024

            if file_size_kb < self.config.min_image_size_kb:
                return True, f"File size too small: {file_size_kb:.1f}KB < {self.config.min_image_size_kb}KB"

            # Check image dimensions using PIL
            from PIL import Image
            with Image.open(image_path) as img:
                width, height = img.size

                # Check minimum dimensions
                if width < self.config.min_image_width or height < self.config.min_image_height:
                    return True, f"Image too small: {width}x{height} < {self.config.min_image_width}x{self.config.min_image_height}"

                # Check maximum dimensions
                if width > self.config.max_image_width or height > self.config.max_image_height:
                    return True, f"Image too large: {width}x{height} > {self.config.max_image_width}x{self.config.max_image_height}"

            return False, "Image meets all requirements"

        except Exception as e:
            self.logger.error(f"mineru-image: error checking image {image_path}: {str(e)}")
            return False, f"Error checking image: {str(e)}"

    def filter_images_by_limits(self, images: List[str], temp_dir: str,
                                page_idx: Optional[int] = None,
                                total_pages: int = 1) -> Tuple[List[str], List[str]]:
        """
        Filter images based on configured limits (similar to gzero.py).

        Args:
            images: List of image filenames
            temp_dir: Directory containing images
            page_idx: Current page index (for per-page filtering)
            total_pages: Total number of pages in document

        Returns:
            Tuple of (selected_images, filtered_out_images)
        """
        # If no images, return empty lists
        if not images:
            return [], []

        # Sort images by size (larger first, like gzero.py)
        image_info = []
        for img_filename in images:
            img_path = os.path.join(temp_dir, img_filename)
            try:
                # Get image size
                from PIL import Image
                with Image.open(img_path) as img:
                    width, height = img.size
                    area = width * height
                    file_size = os.path.getsize(img_path)

                    image_info.append({
                        'filename': img_filename,
                        'area': area,
                        'width': width,
                        'height': height,
                        'file_size': file_size
                    })
            except Exception as e:
                self.logger.warning(f"mineru-image: failed to get info for {img_filename}: {e}")
                # Include failed images with minimal info
                image_info.append({
                    'filename': img_filename,
                    'area': 0,
                    'width': 0,
                    'height': 0,
                    'file_size': 0
                })

        # Sort by area (largest first) then by file size
        image_info.sort(key=lambda x: (x['area'], x['file_size']), reverse=True)

        # Apply size filters
        filtered_images = []
        for info in image_info:
            # Check minimum size
            if info['area'] < self.config.min_image_size:
                self.logger.debug(f"mineru-image: filtering out {info['filename']} - too small ({info['area']} pixels)")
                continue

            # Check maximum size
            if info['area'] > self.config.max_image_size:
                self.logger.debug(f"mineru-image: filtering out {info['filename']} - too large ({info['area']} pixels)")
                continue

            # Check dimensions
            if info['width'] < self.config.min_image_width or info['height'] < self.config.min_image_height:
                self.logger.debug(f"mineru-image: filtering out {info['filename']} - dimensions too small")
                continue

            if info['width'] > self.config.max_image_width or info['height'] > self.config.max_image_height:
                self.logger.debug(f"mineru-image: filtering out {info['filename']} - dimensions too large")
                continue

            filtered_images.append(info)

        # Apply per-page limit (similar to gzero.py's probe_page_thres)
        if page_idx is not None:
            # For individual pages, apply per-page limit
            page_limit = self.config.max_images_per_page
            if len(filtered_images) > page_limit:
                self.logger.info(f"mineru-image: page {page_idx + 1} has {len(filtered_images)} images, limiting to {page_limit}")
                selected = filtered_images[:page_limit]
                filtered_out = filtered_images[page_limit:]
            else:
                selected = filtered_images
                filtered_out = []
        else:
            # For document-level processing, apply document limit
            doc_limit = min(self.config.max_images_per_document,
                          total_pages * self.config.max_images_per_page)
            if len(filtered_images) > doc_limit:
                self.logger.info(f"mineru-image: document has {len(filtered_images)} images, limiting to {doc_limit}")
                selected = filtered_images[:doc_limit]
                filtered_out = filtered_images[doc_limit:]
            else:
                selected = filtered_images
                filtered_out = []

        # Extract filenames
        selected_files = [info['filename'] for info in selected]
        filtered_out_files = [info['filename'] for info in filtered_out] + \
                           [img for img in images if img not in [i['filename'] for i in image_info]]

        self.logger.info(f"mineru-image: selected {len(selected_files)} images, filtered out {len(filtered_out_files)}")

        return selected_files, filtered_out_files

    async def process_images(self, images: List[str], temp_dir: str, src_fileid: str,
                           learn_type: int, upload_callback, upload_options,
                           page_contexts: Optional[List[PageContext]] = None,
                           content_list: Optional[List[Dict]] = None,
                           page_idx: Optional[int] = None,
                           source_text: Optional[str] = None,
                           language_code: Optional[str] = None) -> ImageProcessingResult:
        """
        Process images: filter, classify, and upload.

        Stages, in order:
          1. ``filter_images_by_limits`` trims the list to the configured
             size and count limits.
          2. The description language is ``language_code`` if given, otherwise
             it is detected from ``source_text`` (accepted only when the
             detector's confidence exceeds 0.7).
          3. Every selected file that exists is loaded through the image
             optimizer. When ``config.skip_recognition_for_small_images`` is
             set, files flagged by ``_should_skip_recognition`` are set aside
             to be uploaded without classification.
          4. Surrounding-text context is extracted per image, but only when
             both ``page_contexts`` and ``content_list`` are provided.
          5. Images are classified via the optimizer's ``batch_classify_images``.
          6. Kept images plus the set-aside ones are uploaded via the
             optimizer's ``batch_upload_images``.

        Args:
            images: List of image filenames
            temp_dir: Temporary directory containing images
            src_fileid: Source file ID; forwarded to the batch classifier
            learn_type: Model type for AI processing
            upload_callback: Function to upload images
            upload_options: Upload configuration
            page_contexts: Optional page context information
            content_list: Optional content list from MinerU
            page_idx: Optional page index for page-specific processing
            source_text: Optional source text for language detection
            language_code: Optional language code (will override detection)

        Returns:
            ImageProcessingResult with processed images and descriptions.
            This method does not raise: any exception is logged and returned
            as ``success=False`` with ``error`` set to the exception text.
        """
        try:
            # Lazily create the optimizer on first use.
            if not self.image_optimizer:
                await self.initialize()

            # Apply image filtering first (similar to gzero.py)
            total_pages = 1  # Default, should be provided in metadata if available
            if page_contexts:
                total_pages = len(page_contexts)

            selected_images, filtered_out_images = self.filter_images_by_limits(
                images, temp_dir, page_idx, total_pages
            )

            # The filtered-out list is only used for this log line.
            if filtered_out_images:
                self.logger.info(f"mineru-image: filtered out {len(filtered_out_images)} images due to limits")

            page_info = f" for page {page_idx + 1}" if page_idx is not None else ""
            self.logger.info(f"mineru-image:  processing {len(selected_images)} images{page_info} (after filtering)")

            # Use provided language code or detect from source text
            if not language_code and source_text:
                detected_code, confidence = LanguageDetector.detect_language(source_text)
                if confidence > 0.7:  # Only use detected language if confidence is high
                    language_code = detected_code
                    self.logger.info(f"mineru-image: detected language: {language_code} (confidence: {confidence:.2f})")

            if language_code:
                self.logger.info(f"mineru-image: will generate descriptions in {LanguageDetector.get_language_name(language_code)}")
            else:
                self.logger.info(f"mineru-image: no language specified, will use default")

            # Step 1: Load image information and filter based on size/dimensions
            images_to_process = []
            images_skipped = []  # Images that don't need AI recognition

            for img_filename in selected_images:
                img_filepath = os.path.join(temp_dir, img_filename)
                if os.path.exists(img_filepath):
                    # Check if image should skip recognition
                    if self.config.skip_recognition_for_small_images:
                        should_skip, reason = self._should_skip_recognition(img_filepath)
                        if should_skip:
                            self.logger.info(f"mineru-image: skipping recognition for {img_filename}: {reason}")
                            # Still add to skipped list for upload without AI processing
                            # xref = filename with '.png' and 'mineru_image_' removed; it is
                            # the key used below for classification and upload results.
                            xref = img_filename.replace('.png', '').replace('mineru_image_', '')
                            image_info = await self.image_optimizer.load_image_info(
                                img_filepath, img_filename, xref
                            )
                            images_skipped.append(image_info)
                            continue

                    # Use filename as xref for consistency
                    xref = img_filename.replace('.png', '').replace('mineru_image_', '')
                    image_info = await self.image_optimizer.load_image_info(
                        img_filepath, img_filename, xref
                    )
                    images_to_process.append(image_info)
                else:
                    self.logger.warning(f"mineru-image:  image file not found: {img_filepath}")

            # Nothing to classify or upload is still reported as success.
            if not images_to_process and not images_skipped:
                self.logger.warning(f"mineru-image:  no valid images to process")
                return ImageProcessingResult(success=True, processed_images={}, image_descriptions={})

            # Step 2: Extract context for images if available
            image_contexts = {}  # xref -> ImageContext
            if page_contexts and content_list:
                self.logger.info("mineru-image: extracting context for images")
                for img_info in images_to_process:
                    context = self._extract_image_context(
                        img_info.filename, content_list, page_contexts
                    )
                    if context:
                        image_contexts[img_info.xref] = context

            # Step 3: Classify images using AI with context
            self.logger.info(f"mineru-image:  classifying {len(images_to_process)} images sequentially")

            # Create a wrapper to pass context and language to classification
            async def classify_with_context(learn_type, image_filepath: str, temp_dir: str,
                                          src_name: str, hint: str = "") -> Dict:
                # Extract xref from image filepath
                # Must use the same derivation as Step 1 so the context lookup matches.
                filename = os.path.basename(image_filepath)
                xref = filename.replace('.png', '').replace('mineru_image_', '')
                context = image_contexts.get(xref)
                return await self._classify_single_image_with_context(
                    learn_type, image_filepath, temp_dir, src_name, hint, context, language_code
                )

            # Note: batch_classify_images now processes images sequentially to avoid pressure on multimodal service
            # NOTE(review): the result is used below as a mapping keyed by xref —
            # confirm against ImageOptimizer.batch_classify_images.
            classification_results = await self.image_optimizer.batch_classify_images(
                images_to_process,
                classify_with_context,
                learn_type,  # Pass the learn_type instead of model_config
                temp_dir,
                src_fileid
            )

            # Step 4: Filter meaningful images
            meaningful_images = []
            meaningful_classifications = {}  # filename -> classification result

            for image_info in images_to_process:
                filename = image_info.filename
                xref = image_info.xref

                # Images with no classification result are neither described nor uploaded.
                if xref in classification_results:
                    result = classification_results[xref]

                    # Apply meaningless filter if configured
                    if self.config.filter_meaningless_images and result.get('type') == 'meaningless':
                        self.logger.info(f"mineru-image:  image {filename} classified as meaningless, filtering out")
                        # Store classification but don't add to meaningful_images
                        meaningful_classifications[filename] = result
                    else:
                        # Either filter is disabled or image is meaningful
                        meaningful_images.append(image_info)
                        meaningful_classifications[filename] = result
                        self.logger.info(f"mineru-image:  image {filename} classified as {result.get('type')}, keeping")

            if self.config.filter_meaningless_images:
                self.logger.info(f"mineru-image:  filtered to {len(meaningful_images)} meaningful images (meaningless filter enabled)")
            else:
                self.logger.info(f"mineru-image:  keeping all {len(meaningful_images)} classified images (meaningless filter disabled)")

            # Step 5: Upload meaningful images and skipped images
            uploaded_images = {}  # filename -> upload result
            all_images_to_upload = meaningful_images + images_skipped

            if all_images_to_upload:
                self.logger.info(f"mineru-image:  uploading {len(all_images_to_upload)} images ({len(meaningful_images)} with AI, {len(images_skipped)} without AI)")
                self.logger.info(f"mineru-image:  upload_callback={upload_callback}, upload_options={upload_options}")

                # NOTE(review): used below as a mapping xref -> upload result, with a
                # falsy value treated as failure — confirm against batch_upload_images.
                upload_results = await self.image_optimizer.batch_upload_images(
                    all_images_to_upload,
                    upload_callback,
                    upload_options
                )

                self.logger.info(f"mineru-image:  upload_results: {upload_results}")

                # Map results back to filenames
                for image_info in all_images_to_upload:
                    xref = image_info.xref
                    self.logger.info(f"mineru-image:  checking upload result for {image_info.filename} (xref={xref})")
                    if xref in upload_results and upload_results[xref]:
                        uploaded_images[image_info.filename] = upload_results[xref]
                        self.logger.info(f"mineru-image:  uploaded {image_info.filename} -> {upload_results[xref]}")
                    else:
                        self.logger.warning(f"mineru-image:  upload failed for {image_info.filename}")

            # For skipped images, add a simple description
            # (only for those whose upload succeeded)
            for image_info in images_skipped:
                if image_info.filename in uploaded_images:
                    meaningful_classifications[image_info.filename] = {
                        'type': 'skipped',
                        'content': 'Image skipped due to size/dimension filters',
                        'input_tokens': 0,
                        'output_tokens': 0,
                        'dura': 0.0
                    }

            return ImageProcessingResult(
                success=True,
                processed_images=uploaded_images,
                image_descriptions=meaningful_classifications
            )

        except Exception as e:
            # Top-level boundary: report the failure in the result instead of raising.
            self.logger.error(f"mineru-image:  image processing failed: {str(e)}")
            return ImageProcessingResult(
                success=False,
                processed_images={},
                image_descriptions={},
                error=str(e)
            )

    def _extract_image_context(self, image_filename: str, content_list: List[Dict],
                             page_contexts: List[PageContext]) -> Optional[ImageContext]:
        """
        Extract context information for an image from the content list.

        Args:
            image_filename: The image filename
            content_list: MinerU content list with page_idx and type
            page_contexts: List of page context information

        Returns:
            ImageContext object or None if not found
        """
        try:
            # Find the image in content list
            image_page_idx = None
            image_position = None

            for idx, item in enumerate(content_list):
                if item.get('type') == 'image' and image_filename in str(item.get('img_path', '')):
                    image_page_idx = item.get('page_idx', 0)
                    image_position = idx
                    break

            if image_page_idx is None:
                return None

            # Get page context
            page_context = None
            for pc in page_contexts:
                if pc.page_idx == image_page_idx:
                    page_context = pc
                    break

            if not page_context:
                return None

            # Extract surrounding text with configurable window
            window_size = self.config.context_window_size if hasattr(self.config, 'context_window_size') else 2
            surrounding_text = self._extract_surrounding_text(
                content_list, image_position, window_size, image_page_idx
            )

            # Get before and after text from page
            before_text, after_text = "", ""
            if hasattr(page_context, 'get_text_around_position'):
                before_text, after_text = page_context.get_text_around_position(image_position)

            # Count tokens
            token_count = self._count_tokens(surrounding_text)

            return ImageContext(
                page_idx=image_page_idx,
                surrounding_text=surrounding_text,
                page_type=page_context.page_type if page_context else 'content',
                chunk_idx=image_position,
                token_count=token_count,
                before_text=before_text,
                after_text=after_text,
                page_title=page_context.title if page_context else None
            )

        except Exception as e:
            self.logger.error(f"mineru-image: failed to extract context for {image_filename}: {str(e)}")
            return None

    def _extract_surrounding_text(self, content_list: List[Dict], position: int,
                                window_size: int, target_page_idx: int) -> str:
        """
        Extract text content around a specific position in the content list.
        """
        texts = []

        # Look backward
        for i in range(max(0, position - window_size), position):
            item = content_list[i]
            if item.get('page_idx') == target_page_idx and item.get('type') == 'text':
                text = item.get('text', '').strip()
                if text:
                    texts.append(f"[Before] {text}")

        # Look forward
        for i in range(position + 1, min(len(content_list), position + window_size + 1)):
            item = content_list[i]
            if item.get('page_idx') == target_page_idx and item.get('type') == 'text':
                text = item.get('text', '').strip()
                if text:
                    texts.append(f"[After] {text}")

        return '\n'.join(texts)

    def _count_tokens(self, text: str) -> int:
        """
        Count tokens in text using tiktoken.
        """
        try:
            encoding = tiktoken.get_encoding("cl100k_base")
            return len(encoding.encode(text))
        except:
            # Fallback to character-based estimation
            return len(text) // 4

    async def _classify_single_image_with_context(self, learn_type, image_filepath: str, temp_dir: str,
                                                src_name: str, hint: str = "",
                                                context: Optional[ImageContext] = None,
                                                language_code: Optional[str] = None) -> Dict:
        """
        Classify a single image using multimodal AI with optional context.

        Without a context this delegates to ``_classify_single_image``. With a
        context, the image is sent as a base64 data URL together with a
        context-aware system prompt through ``config.call_litellm``.

        Args:
            learn_type: Passed to ``config.call_litellm`` as ``model_type``.
            image_filepath: Path of the image file to classify.
            temp_dir: Forwarded unchanged to ``_classify_single_image``.
            src_name: Forwarded unchanged to ``_classify_single_image``.
            hint: Forwarded unchanged to ``_classify_single_image``.
            context: Surrounding-document context for the image, if any.
            language_code: Language for the generated description, if any.

        Returns:
            Classification result dict. If the model call or response handling
            raises, a 'meaningless' result carrying the error text is returned.
            If preparing the request fails (e.g. missing file), the result of
            ``_classify_single_image`` is returned instead.
        """
        # Function-scope import: only this method needs it.
        import mimetypes

        image_name = os.path.basename(image_filepath)
        self.logger.info(f"mineru-image: _classify_single_image_with_context called for {image_name}")

        # If no context, fall back to original method
        if not context:
            self.logger.info(f"mineru-image: no context, falling back to original method")
            return await self._classify_single_image(learn_type, image_filepath, temp_dir, src_name, hint)

        try:
            self.logger.info(f"mineru-image: processing with context for {image_name}")

            if not os.path.exists(image_filepath):
                raise FileNotFoundError(f"Image file not found: {image_filepath}")

            # Synchronous read; the bytes are encoded directly, without an
            # intermediate in-memory buffer copy.
            with open(image_filepath, 'rb') as file:
                image_base64 = base64.b64encode(file.read()).decode("utf-8")

            # Label the data URL with the file's real image type (e.g. image/jpeg
            # for .jpg); fall back to PNG when the extension is unknown or is
            # not an image type.
            guessed_type, _ = mimetypes.guess_type(image_filepath)
            if guessed_type and guessed_type.startswith('image/'):
                mime_type = guessed_type
            else:
                mime_type = 'image/png'

            # Build context-aware prompt with language
            prompt = self._build_context_aware_prompt(context, language_code)

            # Log the final prompt for debugging
            self.logger.info(f"mineru-image: Final prompt for {image_name}:\n{prompt[:1000]}...")

            messages = [
                {'role': 'system', 'content': prompt},
                {'role': 'user', 'content': [
                    {'type': 'text', 'text': '请分析这张图片并按照要求输出JSON格式结果。'},
                    {'type': 'image_url', 'image_url': {
                        'url': f"data:{mime_type};base64,{image_base64}"
                    }}
                ]}
            ]

            # Call litellm using unified helper
            start_time = time.time()

            try:
                self.logger.info(f"mineru-image: calling vision model for {image_name}")
                response = await self.config.call_litellm(
                    model_type=learn_type,
                    messages=messages,
                    temperature=0.0,
                    timeout=120.0  # Increased timeout to 120 seconds for vision models
                )
                self.logger.info(f"mineru-image: received response from vision model")

                duration = time.time() - start_time

                # Log raw response for debugging
                raw_response = response.choices[0].message.content if response.choices else ""
                self.logger.info(f"mineru-image: raw AI response (first 500 chars): {raw_response[:500] if raw_response else 'Empty response'}")
                # Log complete response for debugging
                self.logger.info(f"mineru-image: FULL AI response for {image_name}:\n{raw_response}")

                # Log usage info
                if hasattr(response, 'usage'):
                    self.logger.info(f"mineru-image: usage - prompt_tokens={getattr(response.usage, 'prompt_tokens', 0)}, "
                                   f"completion_tokens={getattr(response.usage, 'completion_tokens', 0)}")
                else:
                    self.logger.warning(f"mineru-image: no usage info in response")

                # Parse enhanced response
                result = self._parse_context_aware_response(
                    raw_response,
                    getattr(response, 'usage', None),
                    duration
                )

                # Add context information to result
                result['has_context'] = True
                result['page_idx'] = context.page_idx

                # Log successful classification
                self.logger.info(f"mineru-image: classified {image_name} as {result.get('type', 'unknown')} "
                                f"(tokens: in={result.get('input_tokens', 0)}, out={result.get('output_tokens', 0)})")

            except Exception as e:
                # A failed model call is reported as a 'meaningless' result
                # rather than retried without context.
                self.logger.error(f"mineru-image: classification error: {str(e)}")
                self.logger.info(f"mineru-image: classification failed for {image_name}, returning meaningless")
                result = {
                    'type': 'meaningless',
                    'content': f'Classification error: {str(e)}',
                    'input_tokens': 0,
                    'output_tokens': 0,
                    'dura': time.time() - start_time,
                    'has_context': True,
                    'error': str(e)
                }

            return result

        except Exception as e:
            self.logger.error(f"mineru-image: context classification failed: {str(e)}")
            # Fall back to non-context classification
            return await self._classify_single_image(learn_type, image_filepath, temp_dir, src_name, hint)

    def _build_context_aware_prompt(self, context: ImageContext, language_code: Optional[str] = None) -> str:
        """
        Render the classification prompt for an image with document context.

        The surrounding text is truncated to ``config.max_context_tokens``
        (1000 when the config does not define it) before being handed to
        ``format_image_classification_prompt`` along with the language code.
        """
        # Page title line, left empty when the page has no title.
        title_line = f"页面标题：{context.page_title}" if context.page_title else ""

        # Keep the surrounding text within the token budget.
        token_budget = getattr(self.config, 'max_context_tokens', 1000)
        trimmed_text = self._truncate_text_by_tokens(context.surrounding_text, token_budget)

        prompt_context = {
            'page_idx': context.page_idx + 1,  # Human-readable page number
            'page_type': context.page_type,
            'page_title_info': title_line,
            'surrounding_text': trimmed_text
        }

        # Tell the prompt builder whether any real text accompanies the image.
        raw_text = context.surrounding_text
        has_text_content = bool(raw_text and raw_text.strip())

        return format_image_classification_prompt(context=prompt_context, language_code=language_code, has_text_content=has_text_content)

    def _truncate_text_by_tokens(self, text: str, max_tokens: int) -> str:
        """
        Truncate text to fit within token limit.
        """
        if not text:
            return ""

        try:
            encoding = tiktoken.get_encoding("cl100k_base")
            tokens = encoding.encode(text)

            if len(tokens) <= max_tokens:
                return text

            # Truncate and decode
            truncated_tokens = tokens[:max_tokens]
            return encoding.decode(truncated_tokens) + "...[已截断]"
        except:
            # Fallback to character-based truncation
            char_limit = max_tokens * 4
            if len(text) > char_limit:
                return text[:char_limit] + "...[已截断]"
            return text

    def _parse_context_aware_response(self, response_content: str, usage: Any, duration: float) -> Dict:
        """
        Parse the enhanced response from context-aware classification.
        """
        try:
            # Extract JSON from markdown if present
            if '```json' in response_content and '```' in response_content:
                json_start = response_content.find('```json') + 7
                json_end = response_content.find('```', json_start)
                response_content = response_content[json_start:json_end].strip()

            # Parse JSON
            result_json = json.loads(response_content)

            # Log the raw classification response for debugging
            self.logger.info(f"mineru-image: parsed JSON response: {result_json}")

            # Build result dictionary
            result = {
                'type': result_json.get('type', 'meaningless'),
                'title': result_json.get('title', ''),
                'content': result_json.get('description', ''),
                'input_tokens': usage.prompt_tokens if usage and hasattr(usage, 'prompt_tokens') else 0,
                'output_tokens': usage.completion_tokens if usage and hasattr(usage, 'completion_tokens') else 0,
                'dura': duration
            }

            # Add OCR content if available
            if result_json.get('ocr_content'):
                result['ocr_content'] = result_json['ocr_content']

            return result

        except Exception as e:
            self.logger.error(f"mineru-image: failed to parse context response: {str(e)}")
            self.logger.debug(f"mineru-image: response that failed to parse: {response_content[:500] if response_content else 'Empty'}")
            # Return a basic result
            return {
                'type': 'brief_description',
                'title': '',
                'content': response_content[:200] if response_content else '',
                'input_tokens': usage.prompt_tokens if usage and hasattr(usage, 'prompt_tokens') else 0,
                'output_tokens': usage.completion_tokens if usage and hasattr(usage, 'completion_tokens') else 0,
                'dura': duration
            }

    async def _classify_single_image(self, learn_type, image_filepath: str, temp_dir: str,
                                   src_name: str, hint: str = "") -> Dict:
        """
        Classify a single image using multimodal AI.

        This follows the gzero.py pattern for image classification.

        Args:
            learn_type: The learn type for model selection
            image_filepath: Path to the image file to classify
            temp_dir: Temporary directory (currently unused but kept for API compatibility)
            src_name: Source name (currently unused but kept for API compatibility)
            hint: Additional hint for classification (currently unused but kept for API compatibility)
        """
        try:
            if not os.path.exists(image_filepath):
                raise FileNotFoundError(f"Image file not found: {image_filepath}")

            with open(image_filepath, 'rb') as file:
                image_data = file.read()

            # Use BytesIO to avoid blocking the event loop
            image_buffer = io.BytesIO(image_data)
            image_base64 = base64.b64encode(image_buffer.getvalue()).decode("utf-8")

            # Use the optimized prompt without context
            # For simple classification, we don't have text content
            prompt = format_image_classification_prompt(context=None, language_code=None, has_text_content=False)

            messages = [
                {'role': 'system', 'content': prompt},
                {'role': 'user', 'content': [
                    {'type': 'text', 'text': '请分析这张图片并按照要求输出JSON格式结果。'},
                    {'type': 'image_url', 'image_url': {
                        'url': f"data:image/png;base64,{image_base64}"
                    }}
                ]}
            ]

            # Call litellm using unified helper
            start_time = time.time()

            try:
                # Set timeout to avoid long waits
                response = await self.config.call_litellm(
                    model_type=learn_type,
                    messages=messages,
                    temperature=0.0,
                    timeout=120.0  # Increased timeout to 120 seconds for vision models
                )

                duration = time.time() - start_time

                # Parse response
                response_content = response.choices[0].message.content

                # Log complete response for debugging
                self.logger.info(f"mineru-image: FULL AI response for {os.path.basename(image_filepath)} (no context):\n{response_content}")

                # Extract JSON from markdown code block if present
                if '```json' in response_content and '```' in response_content:
                    try:
                        json_start = response_content.find('```json') + 7
                        json_end = response_content.find('```', json_start)
                        response_content = response_content[json_start:json_end].strip()
                    except:
                        pass

                # Try to parse JSON response
                try:
                    result_json = json.loads(response_content)
                    img_type = result_json.get('type', 'meaningless')
                    title = result_json.get('title', '')
                    description = result_json.get('description', '')
                    ocr_content = result_json.get('ocr_content', '')
                except json.JSONDecodeError:
                    # Fallback parsing if not valid JSON
                    if 'structured_content' in response_content:
                        img_type = 'structured_content'
                    elif 'brief_description' in response_content:
                        img_type = 'brief_description'
                    else:
                        img_type = 'meaningless'
                    title = ''
                    description = response_content
                    ocr_content = ''  # Default value for fallback case

                result = {
                    'type': img_type,
                    'content': description,
                    'input_tokens': response.usage.prompt_tokens if hasattr(response, 'usage') else 0,
                    'output_tokens': response.usage.completion_tokens if hasattr(response, 'usage') else 0,
                    'dura': duration,
                }
                # Add title if it exists
                if title:
                    result['title'] = title
                # Only add ocr_content if it exists
                if ocr_content:
                    result['ocr_content'] = ocr_content

            except asyncio.TimeoutError:
                self.logger.warning(f"mineru-image: classification timeout for {image_filepath}")
                result = {
                    'type': 'meaningless',
                    'content': 'Classification timeout',
                    'input_tokens': 0,
                    'output_tokens': 0,
                    'dura': time.time() - start_time,
                }
            except Exception as e:
                self.logger.error(f"mineru-image: classification error for {image_filepath}: {str(e)}")
                result = {
                    'type': 'meaningless',
                    'content': f'Classification error: {str(e)}',
                    'input_tokens': 0,
                    'output_tokens': 0,
                    'dura': time.time() - start_time,
                }

            # Enhanced logging to debug meaningless classification
            self.logger.info(f"mineru-image: classified {image_filepath} as {result.get('type', 'unknown')} - tokens: in={result.get('input_tokens', 0)}, out={result.get('output_tokens', 0)}, error={result.get('error', 'None')}")

            return result

        except Exception as e:
            self.logger.error(f"mineru-image: image classification failed for {image_filepath}: {str(e)}")
            return {
                'type': 'meaningless',
                'content': '',
                'input_tokens': 0,
                'output_tokens': 0,
                'dura': 0.0,
                'error': str(e)
            }

    def integrate_image_descriptions(self, content: str, image_descriptions: Dict[str, Dict],
                                   uploaded_images: Dict[str, str], src_fileid: str) -> str:
        """
        Integrate image descriptions into content by replacing original image references.

        Args:
            content: Original content with image references like ![](images/xxxxx.jpg)
            image_descriptions: Image classification results
            uploaded_images: Mapping of filename to uploaded URL
            src_fileid: Source file ID for logging

        Returns:
            Content with replaced image descriptions
        """
        import re

        try:
            enhanced_content = content

            # Log the image descriptions we're working with
            self.logger.info(f"mineru-image:  integrate_image_descriptions called:")
            self.logger.info(f"  - {len(image_descriptions)} image descriptions: {list(image_descriptions.keys())}")
            self.logger.info(f"  - {len(uploaded_images)} uploaded images: {list(uploaded_images.keys())}")
            self.logger.info(f"  - src_fileid: {src_fileid}")
            self.logger.debug(f"  - content length: {len(content)} chars")

            # Create a mapping of processed images with their enhanced markdown
            image_replacements = {}

            for filename, description_data in image_descriptions.items():
                img_url = uploaded_images.get(filename, f"placeholder_{filename}")

                # Extract title
                title = description_data.get('title', '')

                # Limit title length
                if title and len(title) > 15:
                    title = title[:15] + "..."

                if description_data['type'] == 'skipped':
                    # For skipped images, just keep the original image reference with URL
                    img_markdown = f"![Image]({img_url})"

                elif description_data['type'] == 'structured_content':
                    # Parse structured content
                    description = description_data.get('content', '')
                    ocr_content = description_data.get('ocr_content', '')

                    # Escape quotes in description
                    if description:
                        description = description.replace('"', '\\"')

                    # Use title or default
                    if not title:
                        title = "Structured content image"

                    if description:
                        img_markdown = f"![{title}]({img_url})\n<!--{description}-->\n"
                        if ocr_content:
                            img_markdown += f"\n\n{ocr_content}"
                    elif ocr_content:
                        img_markdown = f"![{title}]({img_url})\n\n{ocr_content}"
                    else:
                        img_markdown = f"![{title}]({img_url})"

                elif description_data['type'] == 'brief_description':
                    description = description_data.get('content', '')

                    # Escape quotes in description
                    if description:
                        description = description.replace('"', '\\"')

                    # Use title or default
                    if not title:
                        title = "Image"

                    if description:
                        img_markdown = f"![{title}]({img_url})\n<!--{description}-->\n"
                    else:
                        img_markdown = f"![{title}]({img_url})"

                else:
                    # Default format for meaningless type
                    img_markdown = f"![Image]({img_url})"

                image_replacements[filename] = img_markdown
                self.logger.info(f"mineru-image:  prepared replacement for {filename}: {img_markdown[:100]}...")

            # Replace original image references with enhanced versions
            # Pattern to match ![any_text](images/filename) or ![](images/filename)
            def replace_image_reference(match):
                full_match = match.group(0)
                image_path = match.group(2)

                # Extract filename from path (e.g., "images/xxxxx.jpg" -> "xxxxx.jpg")
                filename = image_path.split('/')[-1]

                # Direct match first
                if filename in image_replacements:
                    self.logger.info(f"mineru-image:  FOUND direct match for {filename}")
                    self.logger.info(f"mineru-image:  replacing '{full_match}' with '{image_replacements[filename][:100]}...'")
                    return image_replacements[filename]

                # Try to find a match by checking if any key ends with the filename
                # This handles cases where the stored key has a prefix
                for stored_filename, replacement in image_replacements.items():
                    if stored_filename.endswith(filename) or filename.endswith(stored_filename):
                        self.logger.info(f"mineru-image:  replacing reference for {filename} (matched with {stored_filename})")
                        return replacement

                # Also try matching by partial filename patterns
                # Handle case where filename might be like "mineru_image_1.png"
                # and we have "17888edb327f3b95ee826f5d02a9c264_page_1_afc32c3bbdbe2eafb44ebb66c01028fedb5523292bb954eb58154392aa447ebf.jpg"
                filename_base = filename.replace('.png', '').replace('.jpg', '').replace('.jpeg', '')
                for stored_filename, replacement in image_replacements.items():
                    if filename_base in stored_filename:
                        self.logger.info(f"mineru-image:  replacing reference for {filename} (partial match with {stored_filename})")
                        return replacement

                # Keep original if no replacement available
                self.logger.warning(f"mineru-image:  no replacement found for {filename} in image_replacements")
                self.logger.info(f"mineru-image:  available replacements: {list(image_replacements.keys())}")
                return full_match

            # Regex pattern to match markdown image syntax: ![alt_text](path)
            # This handles both ![](images/xxx.jpg) and ![alt text](images/xxx.jpg)
            image_pattern = r'!\[([^\]]*)\]\(([^)]+)\)'

            # Log all image references found in content for debugging
            import re
            found_images = re.findall(image_pattern, enhanced_content)
            if found_images:
                self.logger.info(f"mineru-image:  found {len(found_images)} image references in content")
                for alt_text, img_path in found_images[:10]:  # Log first 10
                    self.logger.info(f"mineru-image:  image reference: ![{alt_text}]({img_path})")
            else:
                self.logger.warning(f"mineru-image:  NO image references found in content!")

            enhanced_content = re.sub(image_pattern, replace_image_reference, enhanced_content)

            # Log summary of replacements
            self.logger.info(f"mineru-image:  completed image integration, processed {len(image_replacements)} images")

            return enhanced_content

        except Exception as e:
            self.logger.error(f"mineru-image:  image description integration failed: {str(e)}")
            return content

    def create_image_references(self, image_descriptions: Dict[str, Dict],
                              uploaded_images: Dict[str, str]) -> Dict[str, str]:
        """
        Create image reference placeholders for content replacement.

        Returns:
            Dictionary mapping placeholder to final image markdown
        """
        image_refs = {}

        for filename, description_data in image_descriptions.items():
            img_url = uploaded_images.get(filename, f"placeholder_{filename}")
            placeholder = f"[===[{filename}]===]"

            # Extract title
            title = description_data.get('title', '')

            # Limit title length
            if title and len(title) > 15:
                title = title[:15] + "..."

            if description_data['type'] == 'skipped':
                # For skipped images, just keep the original image reference
                img_markdown = f"![Image]({img_url})"

            elif description_data['type'] == 'structured_content':
                try:
                    # Try to parse as JSON if content is JSON string
                    content_data = description_data
                    if isinstance(description_data.get('content'), str) and description_data['content'].startswith('{'):
                        try:
                            content_data = json.loads(description_data['content'])
                        except:
                            pass

                    description = content_data.get('description', content_data.get('content', ''))
                    ocr_content = content_data.get('ocr_content', description_data.get('ocr_content', ''))

                    # Escape quotes in description
                    if description:
                        description = description.replace('"', '\\"')

                    # Use title or default
                    if not title:
                        title = "Structured content image"

                    if description:
                        img_markdown = f"![{title}]({img_url})\n<!--{description}-->\n"
                        if ocr_content:
                            img_markdown += f"\n\n{ocr_content}"
                    elif ocr_content:
                        img_markdown = f"![{title}]({img_url})\n\n{ocr_content}"
                    else:
                        img_markdown = f"![{title}]({img_url})"

                except Exception as e:
                    img_markdown = f"![Structured content image]({img_url})"

            elif description_data['type'] == 'brief_description':
                description = description_data.get('content', '')

                # Escape quotes in description
                if description:
                    description = description.replace('"', '\\"')

                # Use title or default
                if not title:
                    title = "Image"

                if description:
                    img_markdown = f"![{title}]({img_url})\n<!--{description}-->\n"
                else:
                    img_markdown = f"![{title}]({img_url})"

            else:
                img_markdown = f"![Image]({img_url})"

            image_refs[placeholder] = img_markdown

        return image_refs