"""
|
|
Parallel processing module for MinerU with task-level granularity.
|
|
|
|
This module implements a producer-consumer pattern with four independent
|
|
processing threads for maximum parallelism.
|
|
"""
|
|
|
|
import asyncio
|
|
import os
|
|
import queue
|
|
import threading
|
|
import time
|
|
from typing import Dict, List, Optional, Tuple, Any
|
|
from dataclasses import dataclass, field
|
|
from enum import Enum
|
|
from concurrent.futures import ThreadPoolExecutor
|
|
from .logger import get_module_logger
|
|
logger = get_module_logger('parallel_processor')
|
|
|
|
from .config_base import MinerUConfig
|
|
from .converter import DocumentConverter
|
|
from .api_client import MinerUAPIClient, MinerUResult
|
|
from .image_processor import MinerUImageProcessor
|
|
from .content_processor import MinerUContentProcessor
|
|
|
|
|
|
class TaskStatus(Enum):
    """Task processing status"""
    PENDING = "pending"
    PROCESSING = "processing"
    COMPLETED = "completed"
    FAILED = "failed"


@dataclass
class PageTask:
    """Represents a page-level processing task"""
    page_idx: int
    pdf_path: str
    temp_dir: str
    src_fileid: str
    status: TaskStatus = TaskStatus.PENDING
    content: Optional[str] = None
    images: List[str] = field(default_factory=list)
    refined_content: Optional[str] = None
    processed_images: Dict[str, str] = field(default_factory=dict)
    image_descriptions: Dict[str, Dict] = field(default_factory=dict)
    error: Optional[str] = None
    metadata: Dict[str, Any] = field(default_factory=dict)
    # Non-serializable fields
    upload_callback: Optional[Any] = field(default=None, repr=False)
    upload_options: Optional[Any] = field(default=None, repr=False)
    meaningful_images: List[Any] = field(default_factory=list, repr=False)
    save_callback: Optional[Any] = field(default=None, repr=False)  # Cache save callback
    # Retry tracking
    retry_count: int = 0
    max_retries: int = 3

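
# Illustrative note: a freshly queued PageTask carries only location info
# (page_idx, pdf_path, temp_dir, src_fileid) and starts as PENDING; the workers
# progressively fill in content, refined_content, processed_images and
# image_descriptions as the task moves through the pipeline.
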

@dataclass
class ProcessingQueues:
    """Container for all processing queues"""
    parsing_queue: queue.Queue
    content_refinement_queue: queue.Queue
    image_recognition_queue: queue.Queue
    image_upload_queue: queue.Queue


class ParallelMinerUProcessor:
    """
    Parallel processor implementing task-level granularity.

    Architecture:
    - 4 independent threads: parsing, content refinement, image recognition, image upload
    - Producer-consumer pattern with queues connecting stages
    - Page-level task granularity for maximum parallelism
    """

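    # Typical lifecycle (illustrative sketch, assuming a populated MinerUConfig):
    #
    #     processor = ParallelMinerUProcessor(config)
    #     tasks = await processor.process_document(pdf_path, temp_dir, src_fileid,
    #                                              is_ppt_format=False, total_pages=n)
    #     await processor.shutdown()
    #
    # process_document() calls initialize() and initialize_threads() on first use,
    # so callers normally only need the calls shown above.
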
    def __init__(self, config: MinerUConfig, learn_type: int = 9, platform_adapter=None):
        self.config = config
        self.learn_type = learn_type
        self.logger = logger
        self.platform_adapter = platform_adapter

        # Initialize components
        self.converter = DocumentConverter(config)
        self.content_processor = MinerUContentProcessor(config)
        self.image_processor = MinerUImageProcessor(config)

        # Initialize queues
        self.queues = ProcessingQueues(
            parsing_queue=queue.Queue(maxsize=config.queue_size),
            content_refinement_queue=queue.Queue(maxsize=config.queue_size),
            image_recognition_queue=queue.Queue(maxsize=config.queue_size),
            image_upload_queue=queue.Queue(maxsize=config.queue_size)
        )

        # Task tracking - use document-specific dictionaries
        self.document_tasks: Dict[str, Dict[int, PageTask]] = {}  # src_fileid -> page_idx -> task
        self.tasks_lock = threading.Lock()

        # Thread control
        self.shutdown_event = threading.Event()
        self.threads: List[threading.Thread] = []

        # Results collection - per document
        self.document_completed_tasks: Dict[str, List[PageTask]] = {}  # src_fileid -> completed tasks
        self.completed_lock = threading.Lock()

        # Async initialization flag
        self._initialized = False

        # Queue monitoring
        self._monitor_thread = None
        self._monitor_stop_event = threading.Event()

        # Document-level language detection cache
        self.document_languages: Dict[str, str] = {}  # src_fileid -> detected language

    async def initialize(self):
        """Initialize async components like image processor"""
        if not self._initialized:
            await self.image_processor.initialize()
            self._initialized = True

    def initialize_threads(self):
        """
        Initialize and start all processing threads.

        This is the main initialization function that starts:
        1. Parsing thread - processes MinerU API calls
        2. Content refinement thread - refines content with AI
        3. Image recognition thread - classifies and recognizes images
        4. Image upload thread - uploads images to storage
        """
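        # Pipeline topology (for reference):
        #   parsing_queue -> Parser -> content_refinement_queue -> ContentRefiner
        #                           -> image_recognition_queue  -> ImageRecognizer
        #                                                        -> image_upload_queue -> ImageUploader
        # A page is complete once its content is refined and its images (if any)
        # have been classified and, when applicable, uploaded.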
        self.logger.info("Initializing parallel processing threads...")

        # Create threads
        threads = [
            threading.Thread(target=self._run_async_thread, args=(self._parsing_worker,),
                             name="MinerU-Parser"),
            threading.Thread(target=self._run_async_thread, args=(self._content_refinement_worker,),
                             name="MinerU-ContentRefiner"),
            threading.Thread(target=self._run_async_thread, args=(self._image_recognition_worker,),
                             name="MinerU-ImageRecognizer"),
            threading.Thread(target=self._run_async_thread, args=(self._image_upload_worker,),
                             name="MinerU-ImageUploader")
        ]

        # Start all threads
        for thread in threads:
            thread.daemon = True
            thread.start()
            self.threads.append(thread)
            self.logger.info(f"Started thread: {thread.name}")

    def _run_async_thread(self, async_worker):
        """Run async worker in thread with its own event loop"""
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
        try:
            loop.run_until_complete(async_worker())
        finally:
            loop.close()

    async def _parsing_worker(self):
        """
        Parsing worker thread with on-demand batch processing.

        Consumes: PageTask from parsing_queue
        Produces: PageTask to content_refinement_queue and image_recognition_queue
        """
        self.logger.info("Parsing worker started")

        # Cache for batch results: {src_fileid: {batch_idx: MinerUResult}}
        document_batch_results = {}
        # Track which pages belong to which batch
        document_batch_info = {}  # {src_fileid: {'batch_size': int, 'total_pages': int}}

        # Initialize API client
        async with MinerUAPIClient(self.config, self.platform_adapter) as api_client:
            while not self.shutdown_event.is_set():
                try:
                    # Get task from queue (timeout to check shutdown)
                    try:
                        task = self.queues.parsing_queue.get(timeout=1.0)
                    except queue.Empty:
                        continue

                    # Set trace context for this task with page number
                    if self.platform_adapter:
                        self.platform_adapter.set_trace_id(f"{task.src_fileid}_p{task.page_idx + 1}")

                    self.logger.info(f"Parser processing page {task.page_idx + 1}")

                    # Update task status
                    with self.tasks_lock:
                        task.status = TaskStatus.PROCESSING
                        # Track task by document
                        if task.src_fileid not in self.document_tasks:
                            self.document_tasks[task.src_fileid] = {}
                        self.document_tasks[task.src_fileid][task.page_idx] = task

                    try:
                        # Determine batch size and batch index for this page
                        batch_size = self.config.batch_size if self.config.batch_processing_enabled else 0

                        # Initialize document batch info if needed
                        if task.src_fileid not in document_batch_info:
                            # Get total pages from metadata or detect from PDF
                            total_pages = task.metadata.get('total_pages', 0)
                            if total_pages == 0:
                                # Count pages if not provided
                                import fitz
                                with fitz.open(task.pdf_path) as doc:
                                    total_pages = len(doc)

                            document_batch_info[task.src_fileid] = {
                                'batch_size': batch_size if batch_size > 0 and total_pages > batch_size else 0,
                                'total_pages': total_pages
                            }
                            document_batch_results[task.src_fileid] = {}

                        batch_info = document_batch_info[task.src_fileid]

                        # Calculate batch index for this page
                        if batch_info['batch_size'] > 0:
                            batch_idx = task.page_idx // batch_info['batch_size']
                            batch_start = batch_idx * batch_info['batch_size']
                            batch_end = min(batch_start + batch_info['batch_size'], batch_info['total_pages'])

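                            # Worked example: with batch_size=10 and a 25-page document,
                            # page_idx=17 maps to batch_idx=1 (17 // 10), covering
                            # batch_start=10 to batch_end=20; the last batch (batch_idx=2)
                            # is truncated to pages 20-25 by the min() above.
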
                            # Check if we have results for this batch
                            if batch_idx not in document_batch_results[task.src_fileid]:
                                # Need to process this batch
                                self.logger.info(f"Parser: processing batch {batch_idx + 1} (pages {batch_start + 1}-{batch_end}) for {task.src_fileid[:8]}...")

                                # Split PDF for this batch
                                batch_pdf_path = await self._split_pdf_batch(
                                    task.pdf_path, task.temp_dir, batch_start, batch_end, batch_idx
                                )

                                # Process this batch
                                batch_result = await api_client._process_batch_self_hosted(
                                    batch_pdf_path, task.temp_dir, task.src_fileid,
                                    batch_idx, batch_start, batch_end,
                                    task.metadata.get('is_ppt_format', False)
                                ) if self.config.mineru_api_type == "self_hosted" else await api_client._process_batch_cloud(
                                    batch_pdf_path, task.temp_dir, task.src_fileid,
                                    batch_idx, batch_start, batch_end,
                                    task.metadata.get('is_ppt_format', False)
                                )

                                if not batch_result.success:
                                    # Batch failed, try single-page processing for this batch
                                    self.logger.warning(f"Batch {batch_idx + 1} failed, falling back to single-page processing for pages {batch_start + 1}-{batch_end}")

                                    # Mark this batch as needing single-page processing
                                    if task.src_fileid not in document_batch_results:
                                        document_batch_results[task.src_fileid] = {}

                                    # Create a special marker for failed batch
                                    document_batch_results[task.src_fileid][batch_idx] = MinerUResult(
                                        success=False,
                                        content="",
                                        images=[],
                                        tables=[],
                                        metadata={"batch_failed": True, "needs_single_page": True,
                                                  "batch_start": batch_start, "batch_end": batch_end},
                                        error=batch_result.error
                                    )

                                    # Process pages individually for this failed batch
                                    batch_result = await self._process_batch_as_single_pages(
                                        task, api_client, batch_start, batch_end, batch_idx
                                    )

                                    if batch_result.success:
                                        # Update cache with successful single-page results
                                        document_batch_results[task.src_fileid][batch_idx] = batch_result
                                        self.logger.info(f"Parser: single-page processing succeeded for batch {batch_idx + 1}")
                                    else:
                                        raise Exception(batch_result.error or f"Batch {batch_idx} processing failed even with single-page fallback")
                                else:
                                    # Original batch succeeded, cache it
                                    document_batch_results[task.src_fileid][batch_idx] = batch_result
                                    self.logger.info(f"Parser: cached batch {batch_idx + 1} result for {task.src_fileid[:8]}...")

                            # Use cached batch result
                            result = document_batch_results[task.src_fileid][batch_idx]
                            self.logger.debug(f"Parser: using cached batch {batch_idx + 1} for page {task.page_idx + 1}")

                        else:
                            # No batching, process entire document
                            if 'full' not in document_batch_results.get(task.src_fileid, {}):
                                self.logger.info(f"Parser: calling MinerU API for full document {task.src_fileid[:8]}...")
                                result = await api_client.process_document(
                                    task.pdf_path, task.temp_dir, task.src_fileid,
                                    is_ppt_converted=task.metadata.get('is_ppt_format', False),
                                    batch_size=0  # Disable batching
                                )

                                if not result.success:
                                    raise Exception(result.error or "MinerU parsing failed")

                                # Cache the result
                                if task.src_fileid not in document_batch_results:
                                    document_batch_results[task.src_fileid] = {}
                                document_batch_results[task.src_fileid]['full'] = result
                            else:
                                result = document_batch_results[task.src_fileid]['full']
                                self.logger.debug(f"Parser: using cached full result for page {task.page_idx + 1}")

                        # Extract page-specific content from the full document result
                        self.logger.debug(f"Extracting page data for page {task.page_idx + 1}")
                        self.logger.debug(f"Result type: {type(result)}, has content: {hasattr(result, 'content')}")
                        if hasattr(result, 'metadata'):
                            self.logger.debug(f"Result metadata keys: {list(result.metadata.keys())}")

                        page_data = self._extract_page_data(result, task.page_idx)
                        task.content = page_data.get('content', '')
                        task.images = page_data.get('images', [])
                        task.metadata.update(page_data.get('metadata', {}))

                        self.logger.debug(f"Extracted content length: {len(task.content)}, images: {len(task.images)}")

                        # Store content_list in metadata for context extraction
                        if result.metadata.get('content_list'):
                            task.metadata['content_list'] = result.metadata['content_list']

                        # Use language detected from the full document
                        if result.metadata.get('detected_language'):
                            # Store document-level language from API result
                            if task.src_fileid not in self.document_languages:
                                self.document_languages[task.src_fileid] = result.metadata['detected_language']
                                self.logger.info(f"Parser: using document language from API: {result.metadata['detected_language']} for {task.src_fileid[:8]}...")
                        else:
                            # Fallback: detect language from content if available
                            if task.content and task.content.strip():
                                from .language_detector import LanguageDetector
                                language_code, confidence = LanguageDetector.detect_language(task.content)
                                if confidence > 0.7:
                                    # Store document-level language
                                    if task.src_fileid not in self.document_languages:
                                        self.document_languages[task.src_fileid] = language_code
                                        self.logger.info(f"Parser: detected document language (fallback): {language_code} for {task.src_fileid[:8]}...")

                        # Send to next stages (task already contains upload_callback and upload_options)
                        if task.content:
                            self.queues.content_refinement_queue.put(task)
                        else:
                            # No content, set refined_content to empty
                            task.refined_content = ''

                        if task.images:
                            self.queues.image_recognition_queue.put(task)
                        else:
                            # No images to process
                            task.metadata['images_processed'] = True

                        # If page has neither content nor images, mark as complete
                        if not task.content and not task.images:
                            task.refined_content = ''
                            self._check_task_completion(task)

                        self.logger.info(f"Parser completed page {task.page_idx + 1}")

                    except Exception as e:
                        # Page-level task retry (for handling queue/processing failures, not API failures)
                        # Note: batch API calls don't retry here; they fall back to single-page processing
                        # Single-page API calls have their own retry logic in api_client._retry_with_backoff
                        task.retry_count += 1

                        if task.retry_count <= task.max_retries:
                            self.logger.warning(f"Parser task failed for page {task.page_idx + 1} (attempt {task.retry_count}/{task.max_retries}): {e}")
                            # Put the task back in the queue for retry
                            with self.tasks_lock:
                                task.status = TaskStatus.PENDING
                                task.error = f"Task retry {task.retry_count}: {str(e)}"

                            # Add exponential backoff delay
                            retry_delay = min(2.0 * (2 ** (task.retry_count - 1)), 30.0)
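                            # Backoff example: retry 1 waits 2s, retry 2 waits 4s, retry 3
                            # waits 8s; the min() caps the delay at 30s from retry 5 onward
                            # (relevant only when max_retries is configured above the default).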
                            await asyncio.sleep(retry_delay)

                            # Re-queue the task for retry
                            self.queues.parsing_queue.put(task)
                            self.logger.info(f"Re-queued page {task.page_idx + 1} task for retry after {retry_delay:.1f}s delay")
                        else:
                            # Max retries exceeded, mark as failed
                            self.logger.error(f"Parser task failed for page {task.page_idx + 1} after {task.max_retries} retries: {e}")
                            with self.tasks_lock:
                                task.status = TaskStatus.FAILED
                                task.error = f"Task failed after {task.max_retries} retries: {str(e)}"
                            self._mark_task_completed(task)

                    finally:
                        self.queues.parsing_queue.task_done()

                        # Clean up cached results for completed documents
                        # Check if all pages for this document are processed
                        with self.tasks_lock:
                            if task.src_fileid in self.document_tasks:
                                doc_tasks = self.document_tasks[task.src_fileid]
                                all_processed = all(
                                    t.status in [TaskStatus.COMPLETED, TaskStatus.FAILED]
                                    for t in doc_tasks.values()
                                )
                                if all_processed:
                                    # All pages processed, clean up cache
                                    if task.src_fileid in document_batch_results:
                                        del document_batch_results[task.src_fileid]
                                        self.logger.info(f"Parser: cleaned up cached batch results for document {task.src_fileid[:8]}...")
                                    if task.src_fileid in document_batch_info:
                                        del document_batch_info[task.src_fileid]

                except Exception as e:
                    self.logger.error(f"Parsing worker error: {e}")

        self.logger.info("Parsing worker stopped")

    async def _content_refinement_worker(self):
        """
        Content refinement worker thread.

        Consumes: PageTask from content_refinement_queue
        Updates: PageTask with refined content
        """
        self.logger.info("Content refinement worker started")

        while not self.shutdown_event.is_set():
            try:
                # Get task from queue
                try:
                    task = self.queues.content_refinement_queue.get(timeout=1.0)
                except queue.Empty:
                    continue

                # Set trace context for this task with page number
                if self.platform_adapter:
                    self.platform_adapter.set_trace_id(f"{task.src_fileid}_p{task.page_idx + 1}")

                self.logger.info(f"Refiner processing page {task.page_idx + 1}")

                try:
                    # Get document language
                    language_code = self.document_languages.get(task.src_fileid)

                    # Process content refinement for single page
                    page_result = await self.content_processor.process_page_content(
                        task.content, task.images, task.pdf_path, task.page_idx,
                        task.temp_dir, task.src_fileid, self.learn_type, language_code
                    )

                    # Extract results
                    task.refined_content = page_result['content']
                    task.metadata['has_tables'] = page_result['has_tables']
                    task.metadata['content_processing'] = page_result.get('processing_metadata', {})

                    self.logger.info(f"Refiner completed page {task.page_idx + 1}")

                    # Check if this task is complete
                    self._check_task_completion(task)

                except Exception as e:
                    self.logger.error(f"Refiner failed for page {task.page_idx + 1}: {e}")
                    task.refined_content = task.content  # Fallback
                    self._check_task_completion(task)

                finally:
                    self.queues.content_refinement_queue.task_done()
            except Exception as e:
                self.logger.error(f"Content refinement worker error: {e}")

        self.logger.info("Content refinement worker stopped")

    async def _image_recognition_worker(self):
        """
        Image recognition worker thread.

        Consumes: PageTask from image_recognition_queue
        Produces: PageTask to image_upload_queue
        """
        self.logger.info("Image recognition worker started")

        try:
            while not self.shutdown_event.is_set():
                try:
                    # Get task from queue
                    try:
                        task = self.queues.image_recognition_queue.get(timeout=1.0)
                    except queue.Empty:
                        continue

                    # Set trace context for this task with page number
                    if self.platform_adapter:
                        self.platform_adapter.set_trace_id(f"{task.src_fileid}_p{task.page_idx + 1}")

                    self.logger.info(f"Recognizer processing {len(task.images)} images for page {task.page_idx + 1}")

                    try:
                        self.logger.info(f"Recognizer: loading images from {task.temp_dir}")

                        # Create wrapper for classification without upload
                        async def classify_only(learn_type, image_filepath, temp_dir, src_name, hint=""):
                            # Use document-level language directly
                            language_code = self.document_languages.get(task.src_fileid)
                            if language_code:
                                self.logger.info(f"Recognizer: using document language: {language_code} for page {task.page_idx + 1}")
                            else:
                                self.logger.info(f"Recognizer: no document language detected for {task.src_fileid[:8]}")

                            # Extract context for the image if content is available
                            context = None
                            if task.content:
                                try:
                                    # Extract the filename from filepath
                                    import os
                                    import re
                                    filename = os.path.basename(image_filepath)

                                    # Create ImageContext from page content
                                    from .context_types import ImageContext

                                    # Try to find image reference in content
                                    # Look for markdown references like ![...](.../filename.png) or just the bare filename
                                    surrounding_text = ""
                                    before_text = ""
                                    after_text = ""

                                    # Extract base name without extension for more flexible matching
                                    base_name = filename.replace('.png', '').replace('.jpg', '').replace('.jpeg', '')

                                    # Try different patterns to find the image reference
                                    patterns = [
                                        rf'!\[.*?\]\([^)]*{re.escape(filename)}[^)]*\)',   # Full filename in markdown
                                        rf'!\[.*?\]\([^)]*{re.escape(base_name)}[^)]*\)',  # Base name in markdown
                                        rf'{re.escape(filename)}',                         # Just the filename
                                        rf'{re.escape(base_name)}'                         # Just the base name
                                    ]

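                                    # Illustrative example: for a filename like "mineru_image_12.png",
                                    # the first pattern matches a reference such as
                                    # ![](images/mineru_image_12.png) in the page markdown, while the
                                    # later, looser patterns act as fallbacks when the reference drops
                                    # the extension or the markdown syntax altogether.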
                                    image_position = -1
                                    for pattern in patterns:
                                        match = re.search(pattern, task.content)
                                        if match:
                                            image_position = match.start()
                                            self.logger.debug(f"Found image reference at position {image_position} using pattern: {pattern}")
                                            break

                                    if image_position >= 0:
                                        # Extract 250 characters before and after the image reference
                                        start_pos = max(0, image_position - 250)
                                        end_pos = min(len(task.content), image_position + 250)

                                        before_text = task.content[start_pos:image_position].strip()
                                        after_text = task.content[image_position:end_pos].strip()
                                        surrounding_text = task.content[start_pos:end_pos].strip()

                                        self.logger.info(f"Recognizer: found image {filename} at position {image_position}, "
                                                         f"extracted {len(before_text)} chars before and {len(after_text)} chars after")
                                    else:
                                        # Fallback: if image reference not found, use first 500 chars of content
                                        surrounding_text = task.content[:500] if len(task.content) > 500 else task.content
                                        self.logger.warning(f"Recognizer: could not find image reference for {filename}, using fallback context")

                                    context = ImageContext(
                                        page_idx=task.page_idx,
                                        surrounding_text=surrounding_text,
                                        page_type='content',  # Could be enhanced to detect actual page type
                                        token_count=len(surrounding_text) // 4,  # Rough token estimate
                                        before_text=before_text,
                                        after_text=after_text
                                    )

                                    self.logger.info(f"Recognizer: created context for image {filename} with {len(surrounding_text)} chars of surrounding text")
                                except Exception as e:
                                    self.logger.warning(f"Recognizer: failed to create context for image: {e}")
                                    context = None

                            # Use the image processor's classification method with context
                            return await self.image_processor._classify_single_image_with_context(
                                learn_type, image_filepath, temp_dir, src_name, hint, context, language_code
                            )

                        # Load and classify images
                        images_to_process = []
                        for img_filename in task.images:
                            img_filepath = os.path.join(task.temp_dir, img_filename)
                            self.logger.info(f"Recognizer: checking image {img_filename} at {img_filepath}")
                            if os.path.exists(img_filepath):
                                xref = img_filename.replace('.png', '').replace('mineru_image_', '')
                                self.logger.info(f"Recognizer: loading image info for {img_filename}, xref={xref}")
                                image_info = await self.image_processor.image_optimizer.load_image_info(
                                    img_filepath, img_filename, xref
                                )
                                images_to_process.append(image_info)
                                self.logger.info(f"Recognizer: loaded image info: {image_info}")
                            else:
                                self.logger.warning(f"Recognizer: image file not found: {img_filepath}")

                        if images_to_process:
                            # Classify images sequentially (not concurrently) to avoid pressure on multimodal service
                            self.logger.info(f"Recognizer: classifying {len(images_to_process)} images sequentially for page {task.page_idx + 1}")
                            classification_results = await self.image_processor.image_optimizer.batch_classify_images(
                                images_to_process, classify_only, self.learn_type,
                                task.temp_dir, task.src_fileid
                            )

                            # Filter meaningful images based on configuration
                            meaningful_images = []
                            for image_info in images_to_process:
                                xref = image_info.xref
                                if xref in classification_results:
                                    result = classification_results[xref]
                                else:
                                    # No classification result - likely an error occurred
                                    self.logger.warning(f"Recognizer: no classification result for {image_info.filename}, creating default result")
                                    result = {
                                        'type': 'meaningless',
                                        'content': 'Classification failed - no result returned',
                                        'input_tokens': 0,
                                        'output_tokens': 0,
                                        'error': 'No classification result'
                                    }

                                # Apply meaningless filter if configured
                                if self.config.filter_meaningless_images and result.get('type') == 'meaningless':
                                    self.logger.info(f"Recognizer: filtering out meaningless image {image_info.filename}")
                                    # Still store the classification for reference
                                    task.image_descriptions[image_info.filename] = result
                                else:
                                    # Either filter is disabled or image is meaningful
                                    meaningful_images.append(image_info)
                                    task.image_descriptions[image_info.filename] = result

                            # Send to upload queue if there are images to upload
                            # Note: if filter is disabled, we upload all classified images including meaningless ones
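                            # Upload decision, summarising the branch below:
                            #   filter enabled,  some meaningful images -> upload only meaningful_images
                            #   filter enabled,  none meaningful        -> skip upload, mark images processed
                            #   filter disabled, any classified images  -> upload all of images_to_process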
                            if meaningful_images or (not self.config.filter_meaningless_images and images_to_process):
                                if not self.config.filter_meaningless_images:
                                    # If filter is disabled, upload all processed images
                                    task.meaningful_images = images_to_process
                                else:
                                    # If filter is enabled, only upload meaningful images
                                    task.meaningful_images = meaningful_images
                                self.queues.image_upload_queue.put(task)
                            else:
                                # No images to upload
                                task.metadata['images_processed'] = True
                                self._check_task_completion(task)

                            self.logger.info(f"Recognizer completed page {task.page_idx + 1}: "
                                             f"{len(meaningful_images)}/{len(task.images)} meaningful")
                        else:
                            task.metadata['images_processed'] = True
                            self._check_task_completion(task)

                    except Exception as e:
                        self.logger.error(f"Recognizer failed for page {task.page_idx + 1}: {e}")
                        import traceback
                        self.logger.error(f"Recognizer traceback: {traceback.format_exc()}")
                        task.metadata['images_processed'] = True  # Mark as processed to avoid hanging
                        self._check_task_completion(task)

                    finally:
                        self.queues.image_recognition_queue.task_done()

                except Exception as e:
                    self.logger.error(f"Image recognition worker inner error: {e}")
                    import traceback
                    self.logger.error(f"Image recognition worker traceback: {traceback.format_exc()}")

        except Exception as e:
            self.logger.error(f"Image recognition worker error: {e}")
        finally:
            self.logger.info("Image recognition worker stopped")

    async def _image_upload_worker(self):
        """
        Image upload worker thread.

        Consumes: PageTask from image_upload_queue
        Updates: PageTask with uploaded image URLs
        """
        self.logger.info("Image upload worker started")

        while not self.shutdown_event.is_set():
            try:
                # Get task from queue
                try:
                    task = self.queues.image_upload_queue.get(timeout=1.0)
                except queue.Empty:
                    continue

                # Set trace context for this task with page number
                if self.platform_adapter:
                    self.platform_adapter.set_trace_id(f"{task.src_fileid}_p{task.page_idx + 1}")

                meaningful_images = task.meaningful_images
                self.logger.info(f"Uploader processing {len(meaningful_images)} images for page {task.page_idx + 1}")

                try:
                    if meaningful_images:
                        # Get upload callback and options from task
                        upload_callback = task.upload_callback
                        upload_options = task.upload_options

                        self.logger.info(f"Uploader: starting upload for {len(meaningful_images)} images on page {task.page_idx + 1}")

                        if upload_callback:
                            # Batch upload images
                            upload_results = await self.image_processor.image_optimizer.batch_upload_images(
                                meaningful_images, upload_callback, upload_options
                            )

                            self.logger.info(f"Uploader: upload_results keys: {list(upload_results.keys())}")

                            # Map results back to filenames
                            for image_info in meaningful_images:
                                xref = image_info.xref
                                if xref in upload_results and upload_results[xref]:
                                    task.processed_images[image_info.filename] = upload_results[xref]
                                    self.logger.info(f"Uploader: mapped {image_info.filename} -> {upload_results[xref]}")
                                else:
                                    self.logger.warning(f"Uploader: no upload result for {image_info.filename} (xref={xref})")

                            self.logger.info(f"Uploader completed page {task.page_idx + 1}: "
                                             f"{len(task.processed_images)} uploaded")
                        else:
                            self.logger.warning(f"Uploader: no upload_callback provided for page {task.page_idx + 1}")

                    # Mark images as processed
                    task.metadata['images_processed'] = True

                    # Mark task as complete
                    self._check_task_completion(task)

                except Exception as e:
                    self.logger.error(f"Uploader failed for page {task.page_idx + 1}: {e}")
                    self._check_task_completion(task)

                finally:
                    self.queues.image_upload_queue.task_done()

            except Exception as e:
                self.logger.error(f"Image upload worker error: {e}")

        self.logger.info("Image upload worker stopped")

    async def _split_pdf_batch(self, pdf_path: str, temp_dir: str, start_page: int,
                               end_page: int, batch_idx: int) -> str:
        """
        Split PDF to extract specific pages for batch processing.

        Args:
            pdf_path: Original PDF path
            temp_dir: Temporary directory
            start_page: Start page index (0-based)
            end_page: End page index (exclusive)
            batch_idx: Batch index for naming

        Returns:
            Path to the split PDF file
        """
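        # Example: start_page=10, end_page=20, batch_idx=1 writes
        # "<temp_dir>/batch_1.pdf" containing the document's pages 11-20
        # (1-based), i.e. ten pages copied in order.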
        import fitz
        batch_pdf_path = os.path.join(temp_dir, f"batch_{batch_idx}.pdf")

        with fitz.open(pdf_path) as src_doc:
            batch_doc = fitz.open()  # Create new PDF

            # Copy pages to new document
            for page_idx in range(start_page, end_page):
                batch_doc.insert_pdf(src_doc, from_page=page_idx, to_page=page_idx)

            batch_doc.save(batch_pdf_path)
            batch_doc.close()

        self.logger.info(f"Created batch PDF with {end_page - start_page} pages: {batch_pdf_path}")
        return batch_pdf_path

    async def _process_batch_as_single_pages(self, task: PageTask, api_client: MinerUAPIClient,
                                             batch_start: int, batch_end: int, batch_idx: int) -> MinerUResult:
        """
        Process a failed batch by trying each page individually.
        This helps identify which specific page is causing the batch to fail.

        Args:
            task: The current page task
            api_client: MinerU API client
            batch_start: Start page index of the batch
            batch_end: End page index of the batch
            batch_idx: Batch index

        Returns:
            MinerUResult with combined results from successful pages
        """
        self.logger.info(f"Starting single-page processing for failed batch {batch_idx + 1} (pages {batch_start + 1}-{batch_end})")

        successful_pages = []
        failed_pages = []
        all_content_parts = []
        all_images = []
        all_tables = []
        page_data = {}

        # Try each page individually
        for page_idx in range(batch_start, batch_end):
            try:
                self.logger.info(f"Processing single page {page_idx + 1}/{batch_end}")

                # Create single-page PDF
                single_page_pdf = os.path.join(task.temp_dir, f"single_page_{page_idx}.pdf")

                import fitz
                with fitz.open(task.pdf_path) as src_doc:
                    single_doc = fitz.open()
                    single_doc.insert_pdf(src_doc, from_page=page_idx, to_page=page_idx)
                    single_doc.save(single_page_pdf)
                    single_doc.close()

                # Process single page WITH retry logic
                if self.config.mineru_api_type == "self_hosted":
                    # Use retry wrapper for single-page processing
                    page_result = await api_client._retry_with_backoff(
                        api_client._process_batch_self_hosted_impl,
                        single_page_pdf, task.temp_dir, task.src_fileid,
                        page_idx, page_idx, page_idx + 1,
                        task.metadata.get('is_ppt_format', False)
                    )
                else:
                    # Use retry wrapper for single-page processing
                    page_result = await api_client._retry_with_backoff(
                        api_client._process_batch_cloud_impl,
                        single_page_pdf, task.temp_dir, task.src_fileid,
                        page_idx, page_idx, page_idx + 1,
                        task.metadata.get('is_ppt_format', False)
                    )

                # Clean up single-page PDF
                if os.path.exists(single_page_pdf):
                    os.remove(single_page_pdf)

                if page_result.success:
                    successful_pages.append(page_idx + 1)
                    all_content_parts.append(page_result.content)
                    all_images.extend(page_result.images)
                    all_tables.extend(page_result.tables)
                    page_data[page_idx] = {
                        'content': page_result.content,
                        'images': page_result.images,
                        'tables': page_result.tables,
                        'metadata': page_result.metadata
                    }
                    self.logger.success(f"✓ Page {page_idx + 1} processed successfully")
                else:
                    failed_pages.append(page_idx + 1)
                    self.logger.error(f"✗ Page {page_idx + 1} failed: {page_result.error}")
                    # Add empty data for failed page
                    page_data[page_idx] = {
                        'content': '',
                        'images': [],
                        'tables': [],
                        'metadata': {'error': page_result.error, 'failed': True}
                    }

            except Exception as e:
                failed_pages.append(page_idx + 1)
                self.logger.error(f"✗ Page {page_idx + 1} processing error: {str(e)}")
                # Add empty data for failed page
                page_data[page_idx] = {
                    'content': '',
                    'images': [],
                    'tables': [],
                    'metadata': {'error': str(e), 'failed': True}
                }

        # Log summary
        self.logger.info(f"Single-page processing complete for batch {batch_idx + 1}:")
        self.logger.info(f" - Successful pages: {successful_pages} ({len(successful_pages)}/{batch_end - batch_start})")
        if failed_pages:
            self.logger.warning(f" - Failed pages: {failed_pages} ({len(failed_pages)}/{batch_end - batch_start})")

        # Return combined result
        success = len(successful_pages) > 0  # Partial success if at least one page worked

        metadata = {
            "batch_idx": batch_idx,
            "start_page": batch_start,
            "end_page": batch_end,
            "pages_in_batch": batch_end - batch_start,
            "successful_pages": successful_pages,
            "failed_pages": failed_pages,
            "single_page_fallback": True,
            "page_data": page_data
        }

        return MinerUResult(
            success=success,
            content="\n\n".join(all_content_parts),
            images=all_images,
            tables=all_tables,
            metadata=metadata,
            error=f"Failed pages: {failed_pages}" if failed_pages else None
        )

    def _extract_page_data(self, mineru_result, page_idx: int) -> Dict:
        """Extract data for a specific page from MinerU result"""
        # Check if we have pre-split page data
        if mineru_result.metadata.get('page_data'):
            page_data = mineru_result.metadata['page_data']
            self.logger.debug(f"Found page_data with keys: {list(page_data.keys())}")

            # Try both integer and string keys
            if page_idx in page_data:
                self.logger.debug(f"Found data for page_idx {page_idx} (int key)")
                return {
                    'content': page_data[page_idx].get('content', ''),
                    'images': page_data[page_idx].get('images', []),
                    'metadata': page_data[page_idx].get('metadata', {})
                }
            elif str(page_idx) in page_data:
                self.logger.debug(f"Found data for page_idx {page_idx} (str key)")
                return {
                    'content': page_data[str(page_idx)].get('content', ''),
                    'images': page_data[str(page_idx)].get('images', []),
                    'metadata': page_data[str(page_idx)].get('metadata', {})
                }
            else:
                self.logger.warning(f"No data found for page_idx {page_idx} in page_data")

        # Fallback: split content by page markers
        content_parts = mineru_result.content.split(f'__PAGE_OF_PORTION_{page_idx + 1}__')
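        # Example: for page_idx=1 this keeps the text between the
        # "__PAGE_OF_PORTION_2__" and "__PAGE_OF_PORTION_3__" markers; when the
        # first marker is absent the page is treated as empty.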
        if len(content_parts) > 1:
            # Get content after the page marker until next marker
            page_content = content_parts[1].split(f'__PAGE_OF_PORTION_{page_idx + 2}__')[0]
        else:
            # If no page markers, return empty
            page_content = ''

        # Filter images by page (assuming images have page info in metadata)
        page_images = []

        # For single-page documents, assign all images to page 0
        total_pages = len(mineru_result.metadata.get('page_data', {}))
        if total_pages == 1 and page_idx == 0:
            # Single page document - assign all images to this page
            page_images = list(mineru_result.images)
            self.logger.debug(f"Single-page document: assigning {len(page_images)} images to page 0")
        else:
            # Multi-page document - filter by page markers
            for img in mineru_result.images:
                # This is a simplified version - real implementation would check image metadata
                if f'page_{page_idx}' in img or f'p{page_idx}' in img:
                    page_images.append(img)

        # Handle page content - ensure empty/whitespace pages are properly handled
        stripped_content = page_content.strip()

        # Log if we have a page with only whitespace
        if page_content and not stripped_content:
            self.logger.debug(f"Page {page_idx} contains only whitespace characters")

        return {
            'content': stripped_content,
            'images': page_images,
            'metadata': mineru_result.metadata
        }

    def _check_task_completion(self, task: PageTask):
        """Check if a task is complete and mark it accordingly"""
        should_mark_complete = False

        with self.tasks_lock:
            # Skip if already completed
            if task.status == TaskStatus.COMPLETED:
                self.logger.debug(f"Page {task.page_idx + 1} already completed, skipping")
                return

            # A task is complete when:
            # 1. Content is refined (or original content is used)
            # 2. Images are processed (or there are no images)
            has_content = task.refined_content is not None
            has_no_images = not task.images
            has_processed_images = len(task.images) == 0 or task.metadata.get('images_processed', False)

            # Log the current state for debugging
            self.logger.debug(f"Checking completion for page {task.page_idx + 1}: "
                              f"has_content={has_content}, has_no_images={has_no_images}, "
                              f"has_processed_images={has_processed_images}, "
                              f"images_count={len(task.images)}")

            if has_content and (has_no_images or has_processed_images):
                # Integrate images into content if we have any image descriptions
                # This ensures meaningless images are properly removed from content
                if task.image_descriptions:
                    self.logger.info(f"Page {task.page_idx + 1} processing image integration:")
                    self.logger.info(f" - processed_images: {list(task.processed_images.keys()) if task.processed_images else 'None (filtered out)'}")
                    self.logger.info(f" - image_descriptions: {list(task.image_descriptions.keys())}")
                    self.logger.info(f" - content length before: {len(task.refined_content)} chars")

                    task.refined_content = self._integrate_images_into_content(
                        task.refined_content,
                        task.image_descriptions,
                        task.processed_images or {},  # Pass empty dict if None
                        f"{task.src_fileid}_page_{task.page_idx}"
                    )

                    self.logger.info(f" - content length after: {len(task.refined_content)} chars")
                else:
                    self.logger.info(f"Page {task.page_idx + 1} has no images to process")

                task.status = TaskStatus.COMPLETED
                should_mark_complete = True

        # Call _mark_task_completed outside of the lock to avoid deadlock
        if should_mark_complete:
            self._mark_task_completed(task)

    def _integrate_images_into_content(self, content: str, image_descriptions: Dict[str, Dict],
                                       uploaded_images: Dict[str, str], _page_identifier: str) -> str:
        """
        Integrate image descriptions into content by replacing original image references.

        This method is adapted from image_processor.integrate_image_descriptions
        """
        import re

        try:
            enhanced_content = content

            # Log what we're processing
            self.logger.info(f"Image integration starting for {_page_identifier}:")
            self.logger.info(f" - Image descriptions: {list(image_descriptions.keys())}")
            self.logger.info(f" - Uploaded images: {list(uploaded_images.keys())}")
            self.logger.debug(f" - Content length: {len(content)} chars")

            # First, find all image references in the content
            image_pattern = r'!\[.*?\]\((.*?)\)'
            content_images = re.findall(image_pattern, content)
            self.logger.info(f"Found {len(content_images)} image references in content:")
            for img_ref in content_images[:5]:  # Log first 5
                self.logger.info(f" - {img_ref}")

            # Process each image description
            for filename, desc_info in image_descriptions.items():
                self.logger.info(f"\nChecking image {filename} for replacement")

                # Get image type to determine if it's meaningless
                img_type = desc_info.get('type', 'brief_description')

                # Process ALL images that have descriptions
                # - Meaningless images: remove references (replace with empty string)
                # - Images not uploaded but classified: also remove (likely filtered)
                # - Uploaded images: replace with proper markdown
                uploaded_url = uploaded_images.get(filename, '')

                if img_type == 'meaningless':
                    self.logger.info(f" - Image is meaningless, will remove references")
                elif filename not in uploaded_images:
                    self.logger.info(f" - Image was classified as {img_type} but not uploaded (filtered), will remove references")
                    # Treat as meaningless for removal purposes
                    img_type = 'meaningless'
                else:
                    self.logger.info(f" - Found in uploaded_images: {uploaded_url}")

                # Extract the hash part from the filename
                # This handles filenames like: 390561cb34fd3f951b1d25a252bead1c_page_1_44450601...jpg
                # or mineru_image_xxx.png
                base_filename = filename.replace('.png', '').replace('.jpg', '').replace('.jpeg', '')

                # Try to extract the hash part (usually the long hex string)
                # For mineru images: mineru_image_XXX -> XXX
                # For hash-based: XXX_page_N_YYY -> YYY (the last hash)
                if 'mineru_image_' in filename:
                    ref = base_filename.replace('mineru_image_', '')
                else:
                    # Look for the last hash-like pattern
                    parts = base_filename.split('_')
                    # Find the longest hex-like string
                    hash_parts = [p for p in parts if len(p) > 20 and all(c in '0123456789abcdef' for c in p)]
                    if hash_parts:
                        ref = hash_parts[-1]  # Use the last hash
                    else:
                        ref = base_filename

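                # Ref extraction summary: "mineru_image_42.png" yields ref "42";
                # hash-style names like the one in the comment above yield their
                # trailing long-hex segment; anything else falls back to the base
                # filename.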
                # Build replacement content based on image type
                # img_type already extracted above
                title = desc_info.get('title', '')
                description = desc_info.get('content', '')
                ocr_content = desc_info.get('ocr_content', '')

                # Create the replacement markdown
                if img_type == 'meaningless':
                    # For meaningless images, we want to remove them entirely
                    replacement = ""  # Empty string to remove the image reference
                elif img_type == 'structured_content':
                    # For structured content, include full description
                    replacement = f"\n\n![{title}]({uploaded_url})\n<!--{description}-->\n\n{ocr_content}\n\n"
                else:
                    # For other types, use a simpler format
                    if description:
                        replacement = f"\n\n![{title}]({uploaded_url})\n<!--{description}-->\n"
                    else:
                        replacement = f"\n\n![{title}]({uploaded_url})\n\n"

                # Replace various possible image reference patterns
                # We need to be flexible because the reference in content might be different from our filename
                patterns = []

                # If we found a hash reference, try to match it in various formats
                if ref and len(ref) > 20:  # Likely a hash
                    patterns.extend([
                        f"!\\[.*?\\]\\(.*?{ref}.*?\\)",           # Match hash anywhere in path
                        f"!\\[\\]\\(.*?{ref}.*?\\)",              # Empty alt text with hash
                        f"!\\[.*?\\]\\(images/{ref}\\.[^)]+\\)",  # images/hash.ext
                        f"!\\[\\]\\(images/{ref}\\.[^)]+\\)",     # images/hash.ext with empty alt
                    ])

                # Always try the full filename patterns
                patterns.extend([
                    f"!\\[.*?\\]\\(.*?{re.escape(filename)}\\)",       # Match exact filename
                    f"!\\[.*?\\]\\(.*?{re.escape(base_filename)}\\)",  # Match base filename
                ])

                # Add generic mineru image pattern if applicable
                if 'mineru_image_' in filename:
                    patterns.append(f"!\\[.*?\\]\\(.*?{filename}\\)")

                self.logger.info(f" - extracted ref: '{ref}'")
                self.logger.info(f" - trying {len(patterns)} patterns")

                replaced = False
                for pattern in patterns:
                    new_content = re.sub(pattern, replacement, enhanced_content)
                    if new_content != enhanced_content:
                        enhanced_content = new_content
                        replaced = True
                        self.logger.info(f"Successfully replaced image {filename} using pattern: {pattern}")
                        break

                if not replaced:
                    self.logger.warning(f"Failed to replace image reference for: {filename}, ref={ref}")
                    # Log the first few characters of content to help debugging
                    sample = enhanced_content[:500] if len(enhanced_content) > 500 else enhanced_content
                    self.logger.info(f"Content sample: {sample}...")
                    # Also log the exact patterns we tried
                    self.logger.info(f"Tried patterns:")
                    for p in patterns:
                        self.logger.info(f" - {p}")

            return enhanced_content

        except Exception as e:
            self.logger.error(f"Error integrating images: {str(e)}")
            return content  # Return original content on error

    def _mark_task_completed(self, task: PageTask):
        """Mark a task as completed and add to results"""
        self.logger.debug(f"Marking task as completed: page {task.page_idx + 1}, status={task.status}, "
                          f"src_fileid={task.src_fileid[:8]}...")

        # Get total pages first (outside of completed_lock to avoid potential deadlock)
        with self.tasks_lock:
            doc_tasks = self.document_tasks.get(task.src_fileid, {})
            total_pages = len(doc_tasks)

        try:
            # Save to cache if callback provided and task succeeded
            if task.status == TaskStatus.COMPLETED:
                save_callback = task.save_callback  # Get from non-serializable field
                if save_callback:
                    try:
                        # Prepare page data for caching
                        page_data = {
                            'content': task.refined_content or task.content or '',
                            'images': task.images,
                            'processed_images': task.processed_images,
                            'image_descriptions': task.image_descriptions,
                            'has_tables': task.metadata.get('has_tables', False),
                            'processing_metadata': {k: v for k, v in task.metadata.items()
                                                    if k not in ['save_callback', 'upload_callback']}
                        }
                        save_callback(task.page_idx, page_data)
                        self.logger.debug(f"Saved page {task.page_idx + 1} to cache")
                    except Exception as e:
                        self.logger.error(f"Failed to save page {task.page_idx + 1} to cache: {e}")

            with self.completed_lock:
                self.logger.debug(f"Acquired completed_lock for marking task")
                # Initialize list for this document if needed
                if task.src_fileid not in self.document_completed_tasks:
                    self.document_completed_tasks[task.src_fileid] = []
                    self.logger.debug(f"Initialized completed tasks list for doc {task.src_fileid[:8]}")

                self.document_completed_tasks[task.src_fileid].append(task)
                completed_count = len(self.document_completed_tasks[task.src_fileid])
                self.logger.debug(f"Added task to completed list, count now: {completed_count}")

            self.logger.info(f"Task completed: page {task.page_idx + 1} "
                             f"[{completed_count}/{total_pages}] for doc {task.src_fileid[:8]}...")
        except Exception as e:
            self.logger.error(f"Error in _mark_task_completed: {e}", exc_info=True)

    async def process_document_with_cache(self, pdf_path: str, temp_dir: str, src_fileid: str,
                                          is_ppt_format: bool, total_pages: int,
                                          upload_callback=None, upload_options=None,
                                          cached_pages=None, save_callback=None) -> List[PageTask]:
        """
        Process a document using parallel pipeline with cache support.

        Args:
            pdf_path: Path to PDF file
            temp_dir: Temporary directory
            src_fileid: Source file ID
            is_ppt_format: Whether document is in PPT format
            total_pages: Total number of pages
            upload_callback: Callback for uploading images
            upload_options: Upload options
            cached_pages: Dictionary of already cached pages {page_idx: page_data}
            save_callback: Callback to save page to cache when completed

        Returns:
            List of completed PageTask objects
        """
        # Process with cache support
        return await self._process_document_internal(
            pdf_path, temp_dir, src_fileid, is_ppt_format, total_pages,
            upload_callback, upload_options, cached_pages, save_callback
        )

    async def process_document(self, pdf_path: str, temp_dir: str, src_fileid: str,
                               is_ppt_format: bool, total_pages: int,
                               upload_callback=None, upload_options=None) -> List[PageTask]:
        """
        Process a document using parallel pipeline.

        Args:
            pdf_path: Path to PDF file
            temp_dir: Temporary directory
            src_fileid: Source file ID
            is_ppt_format: Whether document is in PPT format
            total_pages: Total number of pages
            upload_callback: Image upload callback
            upload_options: Upload options

        Returns:
            List of completed PageTask objects
        """
        # Process without cache support
        return await self._process_document_internal(
            pdf_path, temp_dir, src_fileid, is_ppt_format, total_pages,
            upload_callback, upload_options, None, None
        )

    async def _process_document_internal(self, pdf_path: str, temp_dir: str, src_fileid: str,
                                         is_ppt_format: bool, total_pages: int,
                                         upload_callback=None, upload_options=None,
                                         cached_pages=None, save_callback=None) -> List[PageTask]:
        """
        Internal method to process document with optional cache support.
        """
        cached_pages = cached_pages or {}
        self.logger.info(f"Starting parallel processing for {total_pages} pages, {len(cached_pages)} already cached")

        # Initialize async components
        await self.initialize()

        # Initialize threads if not already running
        if not self.threads:
            self.initialize_threads()
            # Start queue monitor
            self._start_queue_monitor()

        # Initialize document-specific tracking
        with self.tasks_lock:
            self.document_tasks[src_fileid] = {}
        with self.completed_lock:
            self.document_completed_tasks[src_fileid] = []

        # Create tasks for each page
        for page_idx in range(total_pages):
            # Check if page is already cached
            if page_idx in cached_pages:
                # Create a completed task from cached data
                cached_data = cached_pages[page_idx]
                task = PageTask(
                    page_idx=page_idx,
                    pdf_path=pdf_path,
                    temp_dir=temp_dir,
                    src_fileid=src_fileid,
                    status=TaskStatus.COMPLETED,
                    content=cached_data.get('content', ''),
                    images=cached_data.get('metadata', {}).get('images', []),
                    refined_content=cached_data.get('content', ''),  # Use cached content as refined
                    processed_images=cached_data.get('metadata', {}).get('processed_images', {}),
                    image_descriptions=cached_data.get('metadata', {}).get('image_descriptions', {}),
                    metadata=cached_data.get('metadata', {}),
                    max_retries=self.config.api_max_retries  # Use retry config from settings
                )
                with self.tasks_lock:
                    self.document_tasks[src_fileid][page_idx] = task
                with self.completed_lock:
                    self.document_completed_tasks[src_fileid].append(task)
                self.logger.info(f"Loaded page {page_idx + 1} from cache")
            else:
                # Create new task for processing
                task = PageTask(
                    page_idx=page_idx,
                    pdf_path=pdf_path,
                    temp_dir=temp_dir,
                    src_fileid=src_fileid,
                    metadata={
                        'is_ppt_format': is_ppt_format
                    },
                    upload_callback=upload_callback,
                    upload_options=upload_options,
                    save_callback=save_callback,  # Store as non-serializable field
                    max_retries=self.config.api_max_retries  # Use retry config from settings
                )
                with self.tasks_lock:
                    self.document_tasks[src_fileid][page_idx] = task
                self.queues.parsing_queue.put(task)

        # Wait for all tasks to complete for this document
        start_time = time.time()
        self.logger.info(f"Waiting for {total_pages} pages to complete for document {src_fileid[:8]}...")

        while True:
            with self.completed_lock:
                # Debug: log the src_fileid we're looking for
                self.logger.debug(f"Looking for src_fileid: {src_fileid}")
                completed_tasks_for_doc = self.document_completed_tasks.get(src_fileid, [])
                completed_count = len(completed_tasks_for_doc)

                # Debug: log the actual keys in document_completed_tasks
                available_docs = list(self.document_completed_tasks.keys())
                if len(available_docs) > 0:
                    # Show full keys for debugging
                    self.logger.debug(f"Looking for: {src_fileid}")
                    self.logger.debug(f"Available docs: {available_docs}")
                    self.logger.debug(f"Tasks for this doc: {completed_count}")

            # Log progress every few iterations
            if int(time.time() - start_time) % 5 == 0:
                self.logger.info(f"Progress: {completed_count}/{total_pages} pages completed")

            if completed_count >= total_pages:
                self.logger.info(f"All {total_pages} pages completed!")
                break

            # Check for timeout
            if time.time() - start_time > self.config.processing_timeout:
                self.logger.error(f"Processing timeout reached after {self.config.processing_timeout}s")
                self.logger.error(f"Only {completed_count}/{total_pages} pages completed")
                break

            await asyncio.sleep(0.5)

        self.logger.info(f"Parallel processing completed in {time.time() - start_time:.2f}s")

        # Get and sort completed tasks for this document (with proper locking)
        with self.completed_lock:
            completed_tasks = list(self.document_completed_tasks.get(src_fileid, []))
            self.logger.info(f"Retrieved {len(completed_tasks)} completed tasks for document")

        completed_tasks.sort(key=lambda t: t.page_idx)

        # Log task details before cleanup
        try:
            for task in completed_tasks:
                self.logger.debug(f"Completed task: page {task.page_idx + 1}, "
                                  f"has_content={bool(task.refined_content)}, "
                                  f"images={len(task.processed_images)}")
        except Exception as e:
            self.logger.error(f"Error logging task details: {e}")

        # Clean up document-specific data
        with self.tasks_lock:
            if src_fileid in self.document_tasks:
                del self.document_tasks[src_fileid]
        with self.completed_lock:
            if src_fileid in self.document_completed_tasks:
                del self.document_completed_tasks[src_fileid]
            if src_fileid in self.document_languages:
                del self.document_languages[src_fileid]

        self.logger.info(f"Returning {len(completed_tasks)} completed tasks")
        return completed_tasks

    def _start_queue_monitor(self):
        """Start queue monitoring thread"""
        if self._monitor_thread is None or not self._monitor_thread.is_alive():
            self._monitor_stop_event.clear()
            self._monitor_thread = threading.Thread(
                target=self._queue_monitor_worker,
                name="MinerU-QueueMonitor",
                daemon=True
            )
            self._monitor_thread.start()
            self.logger.info("Queue monitor started")

    def _queue_monitor_worker(self):
        """Monitor queue sizes and log statistics"""
        while not self._monitor_stop_event.is_set():
            try:
                # Get queue sizes
                parsing_size = self.queues.parsing_queue.qsize()
                content_size = self.queues.content_refinement_queue.qsize()
                recognition_size = self.queues.image_recognition_queue.qsize()
                upload_size = self.queues.image_upload_queue.qsize()

                # Get active documents count
                with self.tasks_lock:
                    active_docs = len(self.document_tasks)
                    total_pending_tasks = sum(len(tasks) for tasks in self.document_tasks.values())

                with self.completed_lock:
                    total_completed = sum(len(tasks) for tasks in self.document_completed_tasks.values())

                # Log queue status only when there's activity
                if parsing_size > 0 or content_size > 0 or recognition_size > 0 or upload_size > 0 or active_docs > 0:
                    self.logger.info(
                        f"Queue Status | Parse: {parsing_size} | "
                        f"Refine: {content_size} | "
                        f"Recognize: {recognition_size} | "
                        f"Upload: {upload_size} | "
                        f"Docs: {active_docs} | "
                        f"Tasks: {total_pending_tasks}/{total_completed} (pending/completed)"
                    )

                # Sleep for monitoring interval
                time.sleep(5)  # Log every 5 seconds

            except Exception as e:
                self.logger.error(f"Queue monitor error: {e}")
                time.sleep(5)

        self.logger.info("Queue monitor stopped")

    def get_queue_status(self) -> Dict[str, Any]:
        """Get detailed queue status"""
        status = {
            'queues': {
                'parsing': {
                    'size': self.queues.parsing_queue.qsize(),
                    'max_size': self.queues.parsing_queue.maxsize
                },
                'content_refinement': {
                    'size': self.queues.content_refinement_queue.qsize(),
                    'max_size': self.queues.content_refinement_queue.maxsize
                },
                'image_recognition': {
                    'size': self.queues.image_recognition_queue.qsize(),
                    'max_size': self.queues.image_recognition_queue.maxsize
                },
                'image_upload': {
                    'size': self.queues.image_upload_queue.qsize(),
                    'max_size': self.queues.image_upload_queue.maxsize
                }
            },
            'documents': {},
            'threads': {
                'active': len([t for t in self.threads if t.is_alive()]),
                'total': len(self.threads)
            }
        }

        # Get per-document status
        with self.tasks_lock:
            for doc_id, tasks in self.document_tasks.items():
                doc_status = {
                    'total_pages': len(tasks),
                    'processing': 0,
                    'pending': 0
                }
                for task in tasks.values():
                    if task.status == TaskStatus.PROCESSING:
                        doc_status['processing'] += 1
                    elif task.status == TaskStatus.PENDING:
                        doc_status['pending'] += 1

                with self.completed_lock:
                    completed = len(self.document_completed_tasks.get(doc_id, []))
                    doc_status['completed'] = completed

                status['documents'][doc_id[:8] + '...'] = doc_status

        return status

    async def shutdown(self):
        """Shutdown all processing threads"""
        self.logger.info("Shutting down parallel processor...")

        # Signal shutdown
        self.shutdown_event.set()
        self._monitor_stop_event.set()

        # Wait for threads to complete
        for thread in self.threads:
            thread.join(timeout=5.0)
            if thread.is_alive():
                self.logger.warning(f"Thread {thread.name} did not stop gracefully")

        # Stop monitor thread
        if self._monitor_thread and self._monitor_thread.is_alive():
            self._monitor_thread.join(timeout=2.0)

        # Clear threads list
        self.threads.clear()

        # Cleanup async components
        if self._initialized:
            await self.image_processor.cleanup()
            self._initialized = False

        # Clear all document data
        with self.tasks_lock:
            self.document_tasks.clear()
        with self.completed_lock:
            self.document_completed_tasks.clear()
            self.document_languages.clear()

        self.logger.info("Parallel processor shutdown complete")

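
# ---------------------------------------------------------------------------
# Illustrative usage sketch (not invoked anywhere in this module). It assumes
# a populated MinerUConfig and an existing PDF and temp directory; the upload
# and cache callbacks are optional and omitted here.
# ---------------------------------------------------------------------------
async def _example_run(config: MinerUConfig, pdf_path: str, temp_dir: str,
                       src_fileid: str, total_pages: int) -> None:
    """Example only: process one document end-to-end and shut down."""
    processor = ParallelMinerUProcessor(config, learn_type=9)
    try:
        completed_tasks = await processor.process_document(
            pdf_path, temp_dir, src_fileid,
            is_ppt_format=False, total_pages=total_pages
        )
        for page_task in completed_tasks:
            # refined_content holds the final page text with image descriptions
            # integrated; processed_images maps filenames to uploaded URLs
            # (empty when no upload_callback was provided).
            print(page_task.page_idx + 1, len(page_task.refined_content or ""))
    finally:
        await processor.shutdown()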