# maxkb/apps/common/handle/impl/mineru/parallel_processor.py
"""
Parallel processing module for MinerU with task-level granularity.
This module implements a producer-consumer pattern with four independent
processing threads for maximum parallelism.
"""
import asyncio
import os
import queue
import threading
import time
from typing import Dict, List, Optional, Tuple, Any
from dataclasses import dataclass, field
from enum import Enum
from concurrent.futures import ThreadPoolExecutor
from .logger import get_module_logger
logger = get_module_logger('parallel_processor')
from .config_base import MinerUConfig
from .converter import DocumentConverter
from .api_client import MinerUAPIClient, MinerUResult
from .image_processor import MinerUImageProcessor
from .content_processor import MinerUContentProcessor
class TaskStatus(Enum):
"""Task processing status"""
PENDING = "pending"
PROCESSING = "processing"
COMPLETED = "completed"
FAILED = "failed"
@dataclass
class PageTask:
"""Represents a page-level processing task"""
page_idx: int
pdf_path: str
temp_dir: str
src_fileid: str
status: TaskStatus = TaskStatus.PENDING
content: Optional[str] = None
images: List[str] = field(default_factory=list)
refined_content: Optional[str] = None
processed_images: Dict[str, str] = field(default_factory=dict)
image_descriptions: Dict[str, Dict] = field(default_factory=dict)
error: Optional[str] = None
metadata: Dict[str, Any] = field(default_factory=dict)
# Non-serializable fields
upload_callback: Optional[Any] = field(default=None, repr=False)
upload_options: Optional[Any] = field(default=None, repr=False)
meaningful_images: List[Any] = field(default_factory=list, repr=False)
save_callback: Optional[Any] = field(default=None, repr=False) # Cache save callback
# Retry tracking
retry_count: int = 0
max_retries: int = 3
@dataclass
class ProcessingQueues:
"""Container for all processing queues"""
parsing_queue: queue.Queue
content_refinement_queue: queue.Queue
image_recognition_queue: queue.Queue
image_upload_queue: queue.Queue
class ParallelMinerUProcessor:
"""
Parallel processor implementing task-level granularity.
Architecture:
- 4 independent threads: parsing, content refinement, image recognition, image upload
- Producer-consumer pattern with queues connecting stages
- Page-level task granularity for maximum parallelism
"""
def __init__(self, config: MinerUConfig, learn_type: int = 9, platform_adapter=None):
self.config = config
self.learn_type = learn_type
self.logger = logger
self.platform_adapter = platform_adapter
# Initialize components
self.converter = DocumentConverter(config)
self.content_processor = MinerUContentProcessor(config)
self.image_processor = MinerUImageProcessor(config)
# Initialize queues
self.queues = ProcessingQueues(
parsing_queue=queue.Queue(maxsize=config.queue_size),
content_refinement_queue=queue.Queue(maxsize=config.queue_size),
image_recognition_queue=queue.Queue(maxsize=config.queue_size),
image_upload_queue=queue.Queue(maxsize=config.queue_size)
)
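        # Pipeline flow: parsing_queue -> content_refinement_queue and image_recognition_queue
        # -> image_upload_queue; each queue is drained by its own worker thread below.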
# Task tracking - use document-specific dictionaries
self.document_tasks: Dict[str, Dict[int, PageTask]] = {} # src_fileid -> page_idx -> task
self.tasks_lock = threading.Lock()
# Thread control
self.shutdown_event = threading.Event()
self.threads: List[threading.Thread] = []
# Results collection - per document
self.document_completed_tasks: Dict[str, List[PageTask]] = {} # src_fileid -> completed tasks
self.completed_lock = threading.Lock()
# Async initialization flag
self._initialized = False
# Queue monitoring
self._monitor_thread = None
self._monitor_stop_event = threading.Event()
# Document-level language detection cache
self.document_languages: Dict[str, str] = {} # src_fileid -> detected language
async def initialize(self):
"""Initialize async components like image processor"""
if not self._initialized:
await self.image_processor.initialize()
self._initialized = True
def initialize_threads(self):
"""
Initialize and start all processing threads.
This is the main initialization function that starts:
1. Parsing thread - processes MinerU API calls
2. Content refinement thread - refines content with AI
3. Image recognition thread - classifies and recognizes images
4. Image upload thread - uploads images to storage
"""
self.logger.info("Initializing parallel processing threads...")
# Create threads
threads = [
threading.Thread(target=self._run_async_thread, args=(self._parsing_worker,),
name="MinerU-Parser"),
threading.Thread(target=self._run_async_thread, args=(self._content_refinement_worker,),
name="MinerU-ContentRefiner"),
threading.Thread(target=self._run_async_thread, args=(self._image_recognition_worker,),
name="MinerU-ImageRecognizer"),
threading.Thread(target=self._run_async_thread, args=(self._image_upload_worker,),
name="MinerU-ImageUploader")
]
# Start all threads
for thread in threads:
thread.daemon = True
thread.start()
self.threads.append(thread)
self.logger.info(f"Started thread: {thread.name}")
def _run_async_thread(self, async_worker):
"""Run async worker in thread with its own event loop"""
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
try:
loop.run_until_complete(async_worker())
finally:
loop.close()
async def _parsing_worker(self):
"""
Parsing worker thread with on-demand batch processing.
Consumes: PageTask from parsing_queue
Produces: PageTask to content_refinement_queue and image_recognition_queue
"""
self.logger.info("Parsing worker started")
# Cache for batch results: {src_fileid: {batch_idx: MinerUResult}}
document_batch_results = {}
# Track which pages belong to which batch
document_batch_info = {} # {src_fileid: {'batch_size': int, 'total_pages': int}}
# Initialize API client
async with MinerUAPIClient(self.config, self.platform_adapter) as api_client:
while not self.shutdown_event.is_set():
try:
# Get task from queue (timeout to check shutdown)
try:
task = self.queues.parsing_queue.get(timeout=1.0)
except queue.Empty:
continue
# Set trace context for this task with page number
if self.platform_adapter:
self.platform_adapter.set_trace_id(f"{task.src_fileid}_p{task.page_idx + 1}")
self.logger.info(f"Parser processing page {task.page_idx + 1}")
# Update task status
with self.tasks_lock:
task.status = TaskStatus.PROCESSING
# Track task by document
if task.src_fileid not in self.document_tasks:
self.document_tasks[task.src_fileid] = {}
self.document_tasks[task.src_fileid][task.page_idx] = task
try:
# Determine batch size and batch index for this page
batch_size = self.config.batch_size if self.config.batch_processing_enabled else 0
# Initialize document batch info if needed
if task.src_fileid not in document_batch_info:
# Get total pages from metadata or detect from PDF
total_pages = task.metadata.get('total_pages', 0)
if total_pages == 0:
# Count pages if not provided
import fitz
with fitz.open(task.pdf_path) as doc:
total_pages = len(doc)
document_batch_info[task.src_fileid] = {
'batch_size': batch_size if batch_size > 0 and total_pages > batch_size else 0,
'total_pages': total_pages
}
document_batch_results[task.src_fileid] = {}
batch_info = document_batch_info[task.src_fileid]
# Calculate batch index for this page
if batch_info['batch_size'] > 0:
batch_idx = task.page_idx // batch_info['batch_size']
batch_start = batch_idx * batch_info['batch_size']
batch_end = min(batch_start + batch_info['batch_size'], batch_info['total_pages'])
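                            # Example: with batch_size=10, page_idx=23 maps to batch_idx=2,
                            # i.e. 0-based pages 20..29 (batch_end is capped by total_pages).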
# Check if we have results for this batch
if batch_idx not in document_batch_results[task.src_fileid]:
# Need to process this batch
self.logger.info(f"Parser: processing batch {batch_idx + 1} (pages {batch_start + 1}-{batch_end}) for {task.src_fileid[:8]}...")
# Split PDF for this batch
batch_pdf_path = await self._split_pdf_batch(
task.pdf_path, task.temp_dir, batch_start, batch_end, batch_idx
)
# Process this batch
                                # Dispatch to the configured MinerU backend for this batch
                                if self.config.mineru_api_type == "self_hosted":
                                    batch_result = await api_client._process_batch_self_hosted(
                                        batch_pdf_path, task.temp_dir, task.src_fileid,
                                        batch_idx, batch_start, batch_end,
                                        task.metadata.get('is_ppt_format', False)
                                    )
                                else:
                                    batch_result = await api_client._process_batch_cloud(
                                        batch_pdf_path, task.temp_dir, task.src_fileid,
                                        batch_idx, batch_start, batch_end,
                                        task.metadata.get('is_ppt_format', False)
                                    )
if not batch_result.success:
# Batch failed, try single-page processing for this batch
self.logger.warning(f"Batch {batch_idx + 1} failed, falling back to single-page processing for pages {batch_start + 1}-{batch_end}")
# Mark this batch as needing single-page processing
if task.src_fileid not in document_batch_results:
document_batch_results[task.src_fileid] = {}
# Create a special marker for failed batch
document_batch_results[task.src_fileid][batch_idx] = MinerUResult(
success=False,
content="",
images=[],
tables=[],
metadata={"batch_failed": True, "needs_single_page": True,
"batch_start": batch_start, "batch_end": batch_end},
error=batch_result.error
)
# Process pages individually for this failed batch
batch_result = await self._process_batch_as_single_pages(
task, api_client, batch_start, batch_end, batch_idx
)
if batch_result.success:
# Update cache with successful single-page results
document_batch_results[task.src_fileid][batch_idx] = batch_result
self.logger.info(f"Parser: single-page processing succeeded for batch {batch_idx + 1}")
else:
raise Exception(batch_result.error or f"Batch {batch_idx} processing failed even with single-page fallback")
else:
# Original batch succeeded, cache it
document_batch_results[task.src_fileid][batch_idx] = batch_result
self.logger.info(f"Parser: cached batch {batch_idx + 1} result for {task.src_fileid[:8]}...")
# Use cached batch result
result = document_batch_results[task.src_fileid][batch_idx]
self.logger.debug(f"Parser: using cached batch {batch_idx + 1} for page {task.page_idx + 1}")
else:
# No batching, process entire document
if 'full' not in document_batch_results.get(task.src_fileid, {}):
self.logger.info(f"Parser: calling MinerU API for full document {task.src_fileid[:8]}...")
result = await api_client.process_document(
task.pdf_path, task.temp_dir, task.src_fileid,
is_ppt_converted=task.metadata.get('is_ppt_format', False),
batch_size=0 # Disable batching
)
if not result.success:
raise Exception(result.error or "MinerU parsing failed")
# Cache the result
if task.src_fileid not in document_batch_results:
document_batch_results[task.src_fileid] = {}
document_batch_results[task.src_fileid]['full'] = result
else:
result = document_batch_results[task.src_fileid]['full']
self.logger.debug(f"Parser: using cached full result for page {task.page_idx + 1}")
# Extract page-specific content from the full document result
self.logger.debug(f"Extracting page data for page {task.page_idx + 1}")
self.logger.debug(f"Result type: {type(result)}, has content: {hasattr(result, 'content')}")
if hasattr(result, 'metadata'):
self.logger.debug(f"Result metadata keys: {list(result.metadata.keys())}")
page_data = self._extract_page_data(result, task.page_idx)
task.content = page_data.get('content', '')
task.images = page_data.get('images', [])
task.metadata.update(page_data.get('metadata', {}))
self.logger.debug(f"Extracted content length: {len(task.content)}, images: {len(task.images)}")
# Store content_list in metadata for context extraction
if result.metadata.get('content_list'):
task.metadata['content_list'] = result.metadata['content_list']
# Use language detected from the full document
if result.metadata.get('detected_language'):
# Store document-level language from API result
if task.src_fileid not in self.document_languages:
self.document_languages[task.src_fileid] = result.metadata['detected_language']
self.logger.info(f"Parser: using document language from API: {result.metadata['detected_language']} for {task.src_fileid[:8]}...")
else:
# Fallback: detect language from content if available
if task.content and task.content.strip():
from .language_detector import LanguageDetector
language_code, confidence = LanguageDetector.detect_language(task.content)
if confidence > 0.7:
# Store document-level language
if task.src_fileid not in self.document_languages:
self.document_languages[task.src_fileid] = language_code
self.logger.info(f"Parser: detected document language (fallback): {language_code} for {task.src_fileid[:8]}...")
# Send to next stages (task already contains upload_callback and upload_options)
if task.content:
self.queues.content_refinement_queue.put(task)
else:
# No content, set refined_content to empty
task.refined_content = ''
if task.images:
self.queues.image_recognition_queue.put(task)
else:
# No images to process
task.metadata['images_processed'] = True
# If page has neither content nor images, mark as complete
if not task.content and not task.images:
task.refined_content = ''
self._check_task_completion(task)
self.logger.info(f"Parser completed page {task.page_idx + 1}")
except Exception as e:
                        # Page-level retry: handles queue/processing failures, not API failures.
                        # Batch API calls are not retried here; they fall back to single-page processing.
                        # Single-page API calls have their own retry logic in api_client._retry_with_backoff.
task.retry_count += 1
if task.retry_count <= task.max_retries:
self.logger.warning(f"Parser task failed for page {task.page_idx + 1} (attempt {task.retry_count}/{task.max_retries}): {e}")
# Put the task back in the queue for retry
with self.tasks_lock:
task.status = TaskStatus.PENDING
task.error = f"Task retry {task.retry_count}: {str(e)}"
# Add exponential backoff delay
retry_delay = min(2.0 * (2 ** (task.retry_count - 1)), 30.0)
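                            # Backoff schedule: 2s, 4s, 8s, ... capped at 30s per attempt.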
await asyncio.sleep(retry_delay)
# Re-queue the task for retry
self.queues.parsing_queue.put(task)
self.logger.info(f"Re-queued page {task.page_idx + 1} task for retry after {retry_delay:.1f}s delay")
else:
# Max retries exceeded, mark as failed
self.logger.error(f"Parser task failed for page {task.page_idx + 1} after {task.max_retries} retries: {e}")
with self.tasks_lock:
task.status = TaskStatus.FAILED
task.error = f"Task failed after {task.max_retries} retries: {str(e)}"
self._mark_task_completed(task)
finally:
self.queues.parsing_queue.task_done()
# Clean up cached results for completed documents
# Check if all pages for this document are processed
with self.tasks_lock:
if task.src_fileid in self.document_tasks:
doc_tasks = self.document_tasks[task.src_fileid]
all_processed = all(
t.status in [TaskStatus.COMPLETED, TaskStatus.FAILED]
for t in doc_tasks.values()
)
if all_processed:
# All pages processed, clean up cache
if task.src_fileid in document_batch_results:
del document_batch_results[task.src_fileid]
self.logger.info(f"Parser: cleaned up cached batch results for document {task.src_fileid[:8]}...")
if task.src_fileid in document_batch_info:
del document_batch_info[task.src_fileid]
except Exception as e:
self.logger.error(f"Parsing worker error: {e}")
self.logger.info("Parsing worker stopped")
async def _content_refinement_worker(self):
"""
Content refinement worker thread.
Consumes: PageTask from content_refinement_queue
Updates: PageTask with refined content
"""
self.logger.info("Content refinement worker started")
while not self.shutdown_event.is_set():
try:
# Get task from queue
try:
task = self.queues.content_refinement_queue.get(timeout=1.0)
except queue.Empty:
continue
# Set trace context for this task with page number
if self.platform_adapter:
self.platform_adapter.set_trace_id(f"{task.src_fileid}_p{task.page_idx + 1}")
self.logger.info(f"Refiner processing page {task.page_idx + 1}")
try:
# Get document language
language_code = self.document_languages.get(task.src_fileid)
# Process content refinement for single page
page_result = await self.content_processor.process_page_content(
task.content, task.images, task.pdf_path, task.page_idx,
task.temp_dir, task.src_fileid, self.learn_type, language_code
)
# Extract results
task.refined_content = page_result['content']
task.metadata['has_tables'] = page_result['has_tables']
task.metadata['content_processing'] = page_result.get('processing_metadata', {})
self.logger.info(f"Refiner completed page {task.page_idx + 1}")
# Check if this task is complete
self._check_task_completion(task)
except Exception as e:
self.logger.error(f"Refiner failed for page {task.page_idx + 1}: {e}")
task.refined_content = task.content # Fallback
self._check_task_completion(task)
finally:
self.queues.content_refinement_queue.task_done()
except Exception as e:
self.logger.error(f"Content refinement worker error: {e}")
self.logger.info("Content refinement worker stopped")
async def _image_recognition_worker(self):
"""
Image recognition worker thread.
Consumes: PageTask from image_recognition_queue
Produces: PageTask to image_upload_queue
"""
self.logger.info("Image recognition worker started")
try:
while not self.shutdown_event.is_set():
try:
# Get task from queue
try:
task = self.queues.image_recognition_queue.get(timeout=1.0)
except queue.Empty:
continue
# Set trace context for this task with page number
if self.platform_adapter:
self.platform_adapter.set_trace_id(f"{task.src_fileid}_p{task.page_idx + 1}")
self.logger.info(f"Recognizer processing {len(task.images)} images for page {task.page_idx + 1}")
try:
self.logger.info(f"Recognizer: loading images from {task.temp_dir}")
# Create wrapper for classification without upload
async def classify_only(learn_type, image_filepath, temp_dir, src_name, hint=""):
# Use document-level language directly
language_code = self.document_languages.get(task.src_fileid)
if language_code:
self.logger.info(f"Recognizer: using document language: {language_code} for page {task.page_idx + 1}")
else:
self.logger.info(f"Recognizer: no document language detected for {task.src_fileid[:8]}")
# Extract context for the image if content is available
context = None
if task.content:
try:
# Extract the filename from filepath
import os
import re
filename = os.path.basename(image_filepath)
# Create ImageContext from page content
from .context_types import ImageContext
# Try to find image reference in content
# Look for patterns like ![...](filename) or just the filename
surrounding_text = ""
before_text = ""
after_text = ""
# Extract base name without extension for more flexible matching
base_name = filename.replace('.png', '').replace('.jpg', '').replace('.jpeg', '')
# Try different patterns to find the image reference
patterns = [
rf'!\[.*?\]\([^)]*{re.escape(filename)}[^)]*\)', # Full filename in markdown
rf'!\[.*?\]\([^)]*{re.escape(base_name)}[^)]*\)', # Base name in markdown
rf'{re.escape(filename)}', # Just the filename
rf'{re.escape(base_name)}' # Just the base name
]
image_position = -1
for pattern in patterns:
match = re.search(pattern, task.content)
if match:
image_position = match.start()
self.logger.debug(f"Found image reference at position {image_position} using pattern: {pattern}")
break
if image_position >= 0:
# Extract 250 characters before and after the image reference
start_pos = max(0, image_position - 250)
end_pos = min(len(task.content), image_position + 250)
before_text = task.content[start_pos:image_position].strip()
after_text = task.content[image_position:end_pos].strip()
surrounding_text = task.content[start_pos:end_pos].strip()
self.logger.info(f"Recognizer: found image {filename} at position {image_position}, "
f"extracted {len(before_text)} chars before and {len(after_text)} chars after")
else:
# Fallback: if image reference not found, use first 500 chars of content
surrounding_text = task.content[:500] if len(task.content) > 500 else task.content
self.logger.warning(f"Recognizer: could not find image reference for {filename}, using fallback context")
context = ImageContext(
page_idx=task.page_idx,
surrounding_text=surrounding_text,
page_type='content', # Could be enhanced to detect actual page type
token_count=len(surrounding_text) // 4, # Rough token estimate
before_text=before_text,
after_text=after_text
)
self.logger.info(f"Recognizer: created context for image {filename} with {len(surrounding_text)} chars of surrounding text")
except Exception as e:
self.logger.warning(f"Recognizer: failed to create context for image: {e}")
context = None
# Use the image processor's classification method with context
return await self.image_processor._classify_single_image_with_context(
learn_type, image_filepath, temp_dir, src_name, hint, context, language_code
)
# Load and classify images
images_to_process = []
for img_filename in task.images:
img_filepath = os.path.join(task.temp_dir, img_filename)
self.logger.info(f"Recognizer: checking image {img_filename} at {img_filepath}")
if os.path.exists(img_filepath):
xref = img_filename.replace('.png', '').replace('mineru_image_', '')
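                                # e.g. 'mineru_image_12.png' -> xref '12'; other filenames are used as-is.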
self.logger.info(f"Recognizer: loading image info for {img_filename}, xref={xref}")
image_info = await self.image_processor.image_optimizer.load_image_info(
img_filepath, img_filename, xref
)
images_to_process.append(image_info)
self.logger.info(f"Recognizer: loaded image info: {image_info}")
else:
self.logger.warning(f"Recognizer: image file not found: {img_filepath}")
if images_to_process:
# Classify images sequentially (not concurrently) to avoid pressure on multimodal service
self.logger.info(f"Recognizer: classifying {len(images_to_process)} images sequentially for page {task.page_idx + 1}")
classification_results = await self.image_processor.image_optimizer.batch_classify_images(
images_to_process, classify_only, self.learn_type,
task.temp_dir, task.src_fileid
)
# Filter meaningful images based on configuration
meaningful_images = []
for image_info in images_to_process:
xref = image_info.xref
if xref in classification_results:
result = classification_results[xref]
else:
# No classification result - likely an error occurred
self.logger.warning(f"Recognizer: no classification result for {image_info.filename}, creating default result")
result = {
'type': 'meaningless',
'content': 'Classification failed - no result returned',
'input_tokens': 0,
'output_tokens': 0,
'error': 'No classification result'
}
# Apply meaningless filter if configured
if self.config.filter_meaningless_images and result.get('type') == 'meaningless':
self.logger.info(f"Recognizer: filtering out meaningless image {image_info.filename}")
# Still store the classification for reference
task.image_descriptions[image_info.filename] = result
else:
# Either filter is disabled or image is meaningful
meaningful_images.append(image_info)
task.image_descriptions[image_info.filename] = result
# Send to upload queue if there are images to upload
# Note: if filter is disabled, we upload all classified images including meaningless ones
if meaningful_images or (not self.config.filter_meaningless_images and images_to_process):
if not self.config.filter_meaningless_images:
# If filter is disabled, upload all processed images
task.meaningful_images = images_to_process
else:
# If filter is enabled, only upload meaningful images
task.meaningful_images = meaningful_images
self.queues.image_upload_queue.put(task)
else:
# No images to upload
task.metadata['images_processed'] = True
self._check_task_completion(task)
self.logger.info(f"Recognizer completed page {task.page_idx + 1}: "
f"{len(meaningful_images)}/{len(task.images)} meaningful")
else:
task.metadata['images_processed'] = True
self._check_task_completion(task)
except Exception as e:
self.logger.error(f"Recognizer failed for page {task.page_idx + 1}: {e}")
import traceback
self.logger.error(f"Recognizer traceback: {traceback.format_exc()}")
task.metadata['images_processed'] = True # Mark as processed to avoid hanging
self._check_task_completion(task)
finally:
self.queues.image_recognition_queue.task_done()
except Exception as e:
self.logger.error(f"Image recognition worker inner error: {e}")
import traceback
self.logger.error(f"Image recognition worker traceback: {traceback.format_exc()}")
except Exception as e:
self.logger.error(f"Image recognition worker error: {e}")
finally:
self.logger.info("Image recognition worker stopped")
async def _image_upload_worker(self):
"""
Image upload worker thread.
Consumes: PageTask from image_upload_queue
Updates: PageTask with uploaded image URLs
"""
self.logger.info("Image upload worker started")
while not self.shutdown_event.is_set():
try:
# Get task from queue
try:
task = self.queues.image_upload_queue.get(timeout=1.0)
except queue.Empty:
continue
# Set trace context for this task with page number
if self.platform_adapter:
self.platform_adapter.set_trace_id(f"{task.src_fileid}_p{task.page_idx + 1}")
meaningful_images = task.meaningful_images
self.logger.info(f"Uploader processing {len(meaningful_images)} images for page {task.page_idx + 1}")
try:
if meaningful_images:
# Get upload callback and options from task
upload_callback = task.upload_callback
upload_options = task.upload_options
self.logger.info(f"Uploader: starting upload for {len(meaningful_images)} images on page {task.page_idx + 1}")
if upload_callback:
# Batch upload images
upload_results = await self.image_processor.image_optimizer.batch_upload_images(
meaningful_images, upload_callback, upload_options
)
self.logger.info(f"Uploader: upload_results keys: {list(upload_results.keys())}")
# Map results back to filenames
for image_info in meaningful_images:
xref = image_info.xref
if xref in upload_results and upload_results[xref]:
task.processed_images[image_info.filename] = upload_results[xref]
self.logger.info(f"Uploader: mapped {image_info.filename} -> {upload_results[xref]}")
else:
self.logger.warning(f"Uploader: no upload result for {image_info.filename} (xref={xref})")
self.logger.info(f"Uploader completed page {task.page_idx + 1}: "
f"{len(task.processed_images)} uploaded")
else:
self.logger.warning(f"Uploader: no upload_callback provided for page {task.page_idx + 1}")
# Mark images as processed
task.metadata['images_processed'] = True
# Mark task as complete
self._check_task_completion(task)
except Exception as e:
self.logger.error(f"Uploader failed for page {task.page_idx + 1}: {e}")
self._check_task_completion(task)
finally:
self.queues.image_upload_queue.task_done()
except Exception as e:
self.logger.error(f"Image upload worker error: {e}")
self.logger.info("Image upload worker stopped")
async def _split_pdf_batch(self, pdf_path: str, temp_dir: str, start_page: int,
end_page: int, batch_idx: int) -> str:
"""
Split PDF to extract specific pages for batch processing.
Args:
pdf_path: Original PDF path
temp_dir: Temporary directory
start_page: Start page index (0-based)
end_page: End page index (exclusive)
batch_idx: Batch index for naming
Returns:
Path to the split PDF file
"""
import fitz
batch_pdf_path = os.path.join(temp_dir, f"batch_{batch_idx}.pdf")
with fitz.open(pdf_path) as src_doc:
batch_doc = fitz.open() # Create new PDF
# Copy pages to new document
for page_idx in range(start_page, end_page):
batch_doc.insert_pdf(src_doc, from_page=page_idx, to_page=page_idx)
batch_doc.save(batch_pdf_path)
batch_doc.close()
self.logger.info(f"Created batch PDF with {end_page - start_page} pages: {batch_pdf_path}")
return batch_pdf_path
async def _process_batch_as_single_pages(self, task: PageTask, api_client: MinerUAPIClient,
batch_start: int, batch_end: int, batch_idx: int) -> MinerUResult:
"""
Process a failed batch by trying each page individually.
This helps identify which specific page is causing the batch to fail.
Args:
task: The current page task
api_client: MinerU API client
batch_start: Start page index of the batch
batch_end: End page index of the batch
batch_idx: Batch index
Returns:
MinerUResult with combined results from successful pages
"""
self.logger.info(f"Starting single-page processing for failed batch {batch_idx + 1} (pages {batch_start + 1}-{batch_end})")
successful_pages = []
failed_pages = []
all_content_parts = []
all_images = []
all_tables = []
page_data = {}
# Try each page individually
for page_idx in range(batch_start, batch_end):
try:
self.logger.info(f"Processing single page {page_idx + 1}/{batch_end}")
# Create single-page PDF
single_page_pdf = os.path.join(task.temp_dir, f"single_page_{page_idx}.pdf")
import fitz
with fitz.open(task.pdf_path) as src_doc:
single_doc = fitz.open()
single_doc.insert_pdf(src_doc, from_page=page_idx, to_page=page_idx)
single_doc.save(single_page_pdf)
single_doc.close()
# Process single page WITH retry logic
if self.config.mineru_api_type == "self_hosted":
# Use retry wrapper for single-page processing
page_result = await api_client._retry_with_backoff(
api_client._process_batch_self_hosted_impl,
single_page_pdf, task.temp_dir, task.src_fileid,
page_idx, page_idx, page_idx + 1,
task.metadata.get('is_ppt_format', False)
)
else:
# Use retry wrapper for single-page processing
page_result = await api_client._retry_with_backoff(
api_client._process_batch_cloud_impl,
single_page_pdf, task.temp_dir, task.src_fileid,
page_idx, page_idx, page_idx + 1,
task.metadata.get('is_ppt_format', False)
)
# Clean up single-page PDF
if os.path.exists(single_page_pdf):
os.remove(single_page_pdf)
if page_result.success:
successful_pages.append(page_idx + 1)
all_content_parts.append(page_result.content)
all_images.extend(page_result.images)
all_tables.extend(page_result.tables)
page_data[page_idx] = {
'content': page_result.content,
'images': page_result.images,
'tables': page_result.tables,
'metadata': page_result.metadata
}
self.logger.success(f"✓ Page {page_idx + 1} processed successfully")
else:
failed_pages.append(page_idx + 1)
self.logger.error(f"✗ Page {page_idx + 1} failed: {page_result.error}")
# Add empty data for failed page
page_data[page_idx] = {
'content': '',
'images': [],
'tables': [],
'metadata': {'error': page_result.error, 'failed': True}
}
except Exception as e:
failed_pages.append(page_idx + 1)
self.logger.error(f"✗ Page {page_idx + 1} processing error: {str(e)}")
# Add empty data for failed page
page_data[page_idx] = {
'content': '',
'images': [],
'tables': [],
'metadata': {'error': str(e), 'failed': True}
}
# Log summary
self.logger.info(f"Single-page processing complete for batch {batch_idx + 1}:")
self.logger.info(f" - Successful pages: {successful_pages} ({len(successful_pages)}/{batch_end - batch_start})")
if failed_pages:
self.logger.warning(f" - Failed pages: {failed_pages} ({len(failed_pages)}/{batch_end - batch_start})")
# Return combined result
success = len(successful_pages) > 0 # Partial success if at least one page worked
metadata = {
"batch_idx": batch_idx,
"start_page": batch_start,
"end_page": batch_end,
"pages_in_batch": batch_end - batch_start,
"successful_pages": successful_pages,
"failed_pages": failed_pages,
"single_page_fallback": True,
"page_data": page_data
}
return MinerUResult(
success=success,
content="\n\n".join(all_content_parts),
images=all_images,
tables=all_tables,
metadata=metadata,
error=f"Failed pages: {failed_pages}" if failed_pages else None
)
def _extract_page_data(self, mineru_result, page_idx: int) -> Dict:
"""Extract data for a specific page from MinerU result"""
# Check if we have pre-split page data
if mineru_result.metadata.get('page_data'):
page_data = mineru_result.metadata['page_data']
self.logger.debug(f"Found page_data with keys: {list(page_data.keys())}")
# Try both integer and string keys
if page_idx in page_data:
self.logger.debug(f"Found data for page_idx {page_idx} (int key)")
return {
'content': page_data[page_idx].get('content', ''),
'images': page_data[page_idx].get('images', []),
'metadata': page_data[page_idx].get('metadata', {})
}
elif str(page_idx) in page_data:
self.logger.debug(f"Found data for page_idx {page_idx} (str key)")
return {
'content': page_data[str(page_idx)].get('content', ''),
'images': page_data[str(page_idx)].get('images', []),
'metadata': page_data[str(page_idx)].get('metadata', {})
}
else:
self.logger.warning(f"No data found for page_idx {page_idx} in page_data")
# Fallback: split content by page markers
content_parts = mineru_result.content.split(f'__PAGE_OF_PORTION_{page_idx + 1}__')
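        # Example: for page_idx=0 this isolates the text between __PAGE_OF_PORTION_1__ and
        # __PAGE_OF_PORTION_2__; if no markers are present the page content stays empty.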
if len(content_parts) > 1:
# Get content after the page marker until next marker
page_content = content_parts[1].split(f'__PAGE_OF_PORTION_{page_idx + 2}__')[0]
else:
# If no page markers, return empty
page_content = ''
# Filter images by page (assuming images have page info in metadata)
page_images = []
# For single-page documents, assign all images to page 0
total_pages = len(mineru_result.metadata.get('page_data', {}))
if total_pages == 1 and page_idx == 0:
# Single page document - assign all images to this page
page_images = list(mineru_result.images)
self.logger.debug(f"Single-page document: assigning {len(page_images)} images to page 0")
else:
# Multi-page document - filter by page markers
for img in mineru_result.images:
# This is a simplified version - real implementation would check image metadata
if f'page_{page_idx}' in img or f'p{page_idx}' in img:
page_images.append(img)
# Handle page content - ensure empty/whitespace pages are properly handled
stripped_content = page_content.strip()
# Log if we have a page with only whitespace
if page_content and not stripped_content:
self.logger.debug(f"Page {page_idx} contains only whitespace characters")
return {
'content': stripped_content,
'images': page_images,
'metadata': mineru_result.metadata
}
def _check_task_completion(self, task: PageTask):
"""Check if a task is complete and mark it accordingly"""
should_mark_complete = False
with self.tasks_lock:
# Skip if already completed
if task.status == TaskStatus.COMPLETED:
self.logger.debug(f"Page {task.page_idx + 1} already completed, skipping")
return
# A task is complete when:
# 1. Content is refined (or original content is used)
# 2. Images are processed (or there are no images)
has_content = task.refined_content is not None
has_no_images = not task.images
has_processed_images = len(task.images) == 0 or task.metadata.get('images_processed', False)
# Log the current state for debugging
self.logger.debug(f"Checking completion for page {task.page_idx + 1}: "
f"has_content={has_content}, has_no_images={has_no_images}, "
f"has_processed_images={has_processed_images}, "
f"images_count={len(task.images)}")
if has_content and (has_no_images or has_processed_images):
# Integrate images into content if we have any image descriptions
# This ensures meaningless images are properly removed from content
if task.image_descriptions:
self.logger.info(f"Page {task.page_idx + 1} processing image integration:")
self.logger.info(f" - processed_images: {list(task.processed_images.keys()) if task.processed_images else 'None (filtered out)'}")
self.logger.info(f" - image_descriptions: {list(task.image_descriptions.keys())}")
self.logger.info(f" - content length before: {len(task.refined_content)} chars")
task.refined_content = self._integrate_images_into_content(
task.refined_content,
task.image_descriptions,
task.processed_images or {}, # Pass empty dict if None
f"{task.src_fileid}_page_{task.page_idx}"
)
self.logger.info(f" - content length after: {len(task.refined_content)} chars")
else:
self.logger.info(f"Page {task.page_idx + 1} has no images to process")
task.status = TaskStatus.COMPLETED
should_mark_complete = True
# Call _mark_task_completed outside of the lock to avoid deadlock
if should_mark_complete:
self._mark_task_completed(task)
def _integrate_images_into_content(self, content: str, image_descriptions: Dict[str, Dict],
uploaded_images: Dict[str, str], _page_identifier: str) -> str:
"""
Integrate image descriptions into content by replacing original image references.
This method is adapted from image_processor.integrate_image_descriptions
"""
import re
try:
enhanced_content = content
# Log what we're processing
self.logger.info(f"Image integration starting for {_page_identifier}:")
self.logger.info(f" - Image descriptions: {list(image_descriptions.keys())}")
self.logger.info(f" - Uploaded images: {list(uploaded_images.keys())}")
self.logger.debug(f" - Content length: {len(content)} chars")
# First, find all image references in the content
image_pattern = r'!\[.*?\]\((.*?)\)'
content_images = re.findall(image_pattern, content)
self.logger.info(f"Found {len(content_images)} image references in content:")
for img_ref in content_images[:5]: # Log first 5
self.logger.info(f" - {img_ref}")
# Process each image description
for filename, desc_info in image_descriptions.items():
self.logger.info(f"\nChecking image {filename} for replacement")
# Get image type to determine if it's meaningless
img_type = desc_info.get('type', 'brief_description')
# Process ALL images that have descriptions
# - Meaningless images: remove references (replace with empty string)
# - Images not uploaded but classified: also remove (likely filtered)
# - Uploaded images: replace with proper markdown
uploaded_url = uploaded_images.get(filename, '')
if img_type == 'meaningless':
self.logger.info(f" - Image is meaningless, will remove references")
elif filename not in uploaded_images:
self.logger.info(f" - Image was classified as {img_type} but not uploaded (filtered), will remove references")
# Treat as meaningless for removal purposes
img_type = 'meaningless'
else:
self.logger.info(f" - Found in uploaded_images: {uploaded_url}")
# Extract the hash part from the filename
# This handles filenames like: 390561cb34fd3f951b1d25a252bead1c_page_1_44450601...jpg
# or mineru_image_xxx.png
base_filename = filename.replace('.png', '').replace('.jpg', '').replace('.jpeg', '')
# Try to extract the hash part (usually the long hex string)
# For mineru images: mineru_image_XXX -> XXX
# For hash-based: XXX_page_N_YYY -> YYY (the last hash)
if 'mineru_image_' in filename:
ref = base_filename.replace('mineru_image_', '')
else:
# Look for the last hash-like pattern
parts = base_filename.split('_')
# Find the longest hex-like string
hash_parts = [p for p in parts if len(p) > 20 and all(c in '0123456789abcdef' for c in p)]
if hash_parts:
ref = hash_parts[-1] # Use the last hash
else:
ref = base_filename
# Build replacement content based on image type
# img_type already extracted above
title = desc_info.get('title', '')
description = desc_info.get('content', '')
ocr_content = desc_info.get('ocr_content', '')
# Create the replacement markdown
if img_type == 'meaningless':
# For meaningless images, we want to remove them entirely
replacement = "" # Empty string to remove the image reference
elif img_type == 'structured_content':
# For structured content, include full description
replacement = f"\n\n![{title}]({uploaded_url})\n<!--{description}-->\n\n{ocr_content}\n\n"
else:
# For other types, use a simpler format
if description:
replacement = f"\n\n![{title}]({uploaded_url})\n<!--{description}-->\n"
else:
replacement = f"\n\n![{title}]({uploaded_url})\n\n"
# Replace various possible image reference patterns
# We need to be flexible because the reference in content might be different from our filename
patterns = []
# If we found a hash reference, try to match it in various formats
if ref and len(ref) > 20: # Likely a hash
patterns.extend([
f"!\\[.*?\\]\\(.*?{ref}.*?\\)", # Match hash anywhere in path
f"!\\[\\]\\(.*?{ref}.*?\\)", # Empty alt text with hash
f"!\\[.*?\\]\\(images/{ref}\\.[^)]+\\)", # images/hash.ext
f"!\\[\\]\\(images/{ref}\\.[^)]+\\)", # images/hash.ext with empty alt
])
# Always try the full filename patterns
patterns.extend([
f"!\\[.*?\\]\\(.*?{re.escape(filename)}\\)", # Match exact filename
f"!\\[.*?\\]\\(.*?{re.escape(base_filename)}\\)", # Match base filename
])
# Add generic mineru image pattern if applicable
if 'mineru_image_' in filename:
patterns.append(f"!\\[.*?\\]\\(.*?{filename}\\)")
self.logger.info(f" - extracted ref: '{ref}'")
self.logger.info(f" - trying {len(patterns)} patterns")
replaced = False
for pattern in patterns:
new_content = re.sub(pattern, replacement, enhanced_content)
if new_content != enhanced_content:
enhanced_content = new_content
replaced = True
self.logger.info(f"Successfully replaced image {filename} using pattern: {pattern}")
break
if not replaced:
self.logger.warning(f"Failed to replace image reference for: {filename}, ref={ref}")
# Log the first few characters of content to help debugging
sample = enhanced_content[:500] if len(enhanced_content) > 500 else enhanced_content
self.logger.info(f"Content sample: {sample}...")
# Also log the exact patterns we tried
self.logger.info(f"Tried patterns:")
for p in patterns:
self.logger.info(f" - {p}")
return enhanced_content
except Exception as e:
self.logger.error(f"Error integrating images: {str(e)}")
return content # Return original content on error
def _mark_task_completed(self, task: PageTask):
"""Mark a task as completed and add to results"""
self.logger.debug(f"Marking task as completed: page {task.page_idx + 1}, status={task.status}, "
f"src_fileid={task.src_fileid[:8]}...")
# Get total pages first (outside of completed_lock to avoid potential deadlock)
with self.tasks_lock:
doc_tasks = self.document_tasks.get(task.src_fileid, {})
total_pages = len(doc_tasks)
try:
# Save to cache if callback provided and task succeeded
if task.status == TaskStatus.COMPLETED:
save_callback = task.save_callback # Get from non-serializable field
if save_callback:
try:
# Prepare page data for caching
page_data = {
'content': task.refined_content or task.content or '',
'images': task.images,
'processed_images': task.processed_images,
'image_descriptions': task.image_descriptions,
'has_tables': task.metadata.get('has_tables', False),
'processing_metadata': {k: v for k, v in task.metadata.items()
if k not in ['save_callback', 'upload_callback']}
}
save_callback(task.page_idx, page_data)
self.logger.debug(f"Saved page {task.page_idx + 1} to cache")
except Exception as e:
self.logger.error(f"Failed to save page {task.page_idx + 1} to cache: {e}")
with self.completed_lock:
self.logger.debug(f"Acquired completed_lock for marking task")
# Initialize list for this document if needed
if task.src_fileid not in self.document_completed_tasks:
self.document_completed_tasks[task.src_fileid] = []
self.logger.debug(f"Initialized completed tasks list for doc {task.src_fileid[:8]}")
self.document_completed_tasks[task.src_fileid].append(task)
completed_count = len(self.document_completed_tasks[task.src_fileid])
self.logger.debug(f"Added task to completed list, count now: {completed_count}")
self.logger.info(f"Task completed: page {task.page_idx + 1} "
f"[{completed_count}/{total_pages}] for doc {task.src_fileid[:8]}...")
except Exception as e:
self.logger.error(f"Error in _mark_task_completed: {e}", exc_info=True)
async def process_document_with_cache(self, pdf_path: str, temp_dir: str, src_fileid: str,
is_ppt_format: bool, total_pages: int,
upload_callback=None, upload_options=None,
cached_pages=None, save_callback=None) -> List[PageTask]:
"""
Process a document using parallel pipeline with cache support.
Args:
pdf_path: Path to PDF file
temp_dir: Temporary directory
src_fileid: Source file ID
is_ppt_format: Whether document is in PPT format
total_pages: Total number of pages
upload_callback: Callback for uploading images
upload_options: Upload options
cached_pages: Dictionary of already cached pages {page_idx: page_data}
save_callback: Callback to save page to cache when completed
Returns:
List of completed PageTask objects
"""
# Process with cache support
return await self._process_document_internal(
pdf_path, temp_dir, src_fileid, is_ppt_format, total_pages,
upload_callback, upload_options, cached_pages, save_callback
)
async def process_document(self, pdf_path: str, temp_dir: str, src_fileid: str,
is_ppt_format: bool, total_pages: int,
upload_callback=None, upload_options=None) -> List[PageTask]:
"""
Process a document using parallel pipeline.
Args:
pdf_path: Path to PDF file
temp_dir: Temporary directory
src_fileid: Source file ID
is_ppt_format: Whether document is in PPT format
total_pages: Total number of pages
upload_callback: Image upload callback
upload_options: Upload options
Returns:
List of completed PageTask objects
"""
# Process without cache support
return await self._process_document_internal(
pdf_path, temp_dir, src_fileid, is_ppt_format, total_pages,
upload_callback, upload_options, None, None
)
async def _process_document_internal(self, pdf_path: str, temp_dir: str, src_fileid: str,
is_ppt_format: bool, total_pages: int,
upload_callback=None, upload_options=None,
cached_pages=None, save_callback=None) -> List[PageTask]:
"""
Internal method to process document with optional cache support.
"""
cached_pages = cached_pages or {}
self.logger.info(f"Starting parallel processing for {total_pages} pages, {len(cached_pages)} already cached")
# Initialize async components
await self.initialize()
# Initialize threads if not already running
if not self.threads:
self.initialize_threads()
# Start queue monitor
self._start_queue_monitor()
# Initialize document-specific tracking
with self.tasks_lock:
self.document_tasks[src_fileid] = {}
with self.completed_lock:
self.document_completed_tasks[src_fileid] = []
# Create tasks for each page
for page_idx in range(total_pages):
# Check if page is already cached
if page_idx in cached_pages:
# Create a completed task from cached data
cached_data = cached_pages[page_idx]
task = PageTask(
page_idx=page_idx,
pdf_path=pdf_path,
temp_dir=temp_dir,
src_fileid=src_fileid,
status=TaskStatus.COMPLETED,
content=cached_data.get('content', ''),
images=cached_data.get('metadata', {}).get('images', []),
refined_content=cached_data.get('content', ''), # Use cached content as refined
processed_images=cached_data.get('metadata', {}).get('processed_images', {}),
image_descriptions=cached_data.get('metadata', {}).get('image_descriptions', {}),
metadata=cached_data.get('metadata', {}),
max_retries=self.config.api_max_retries # Use retry config from settings
)
with self.tasks_lock:
self.document_tasks[src_fileid][page_idx] = task
with self.completed_lock:
self.document_completed_tasks[src_fileid].append(task)
self.logger.info(f"Loaded page {page_idx + 1} from cache")
else:
# Create new task for processing
task = PageTask(
page_idx=page_idx,
pdf_path=pdf_path,
temp_dir=temp_dir,
src_fileid=src_fileid,
metadata={
'is_ppt_format': is_ppt_format
},
upload_callback=upload_callback,
upload_options=upload_options,
save_callback=save_callback, # Store as non-serializable field
max_retries=self.config.api_max_retries # Use retry config from settings
)
with self.tasks_lock:
self.document_tasks[src_fileid][page_idx] = task
self.queues.parsing_queue.put(task)
# Wait for all tasks to complete for this document
start_time = time.time()
self.logger.info(f"Waiting for {total_pages} pages to complete for document {src_fileid[:8]}...")
while True:
with self.completed_lock:
# Debug: log the src_fileid we're looking for
self.logger.debug(f"Looking for src_fileid: {src_fileid}")
completed_tasks_for_doc = self.document_completed_tasks.get(src_fileid, [])
completed_count = len(completed_tasks_for_doc)
# Debug: log the actual keys in document_completed_tasks
available_docs = list(self.document_completed_tasks.keys())
if len(available_docs) > 0:
# Show full keys for debugging
self.logger.debug(f"Looking for: {src_fileid}")
self.logger.debug(f"Available docs: {available_docs}")
self.logger.debug(f"Tasks for this doc: {completed_count}")
            # Log progress roughly every 5 seconds of elapsed time
if int(time.time() - start_time) % 5 == 0:
self.logger.info(f"Progress: {completed_count}/{total_pages} pages completed")
if completed_count >= total_pages:
self.logger.info(f"All {total_pages} pages completed!")
break
# Check for timeout
if time.time() - start_time > self.config.processing_timeout:
self.logger.error(f"Processing timeout reached after {self.config.processing_timeout}s")
self.logger.error(f"Only {completed_count}/{total_pages} pages completed")
break
await asyncio.sleep(0.5)
self.logger.info(f"Parallel processing completed in {time.time() - start_time:.2f}s")
# Get and sort completed tasks for this document (with proper locking)
with self.completed_lock:
completed_tasks = list(self.document_completed_tasks.get(src_fileid, []))
self.logger.info(f"Retrieved {len(completed_tasks)} completed tasks for document")
completed_tasks.sort(key=lambda t: t.page_idx)
# Log task details before cleanup
try:
for task in completed_tasks:
self.logger.debug(f"Completed task: page {task.page_idx + 1}, "
f"has_content={bool(task.refined_content)}, "
f"images={len(task.processed_images)}")
except Exception as e:
self.logger.error(f"Error logging task details: {e}")
# Clean up document-specific data
with self.tasks_lock:
if src_fileid in self.document_tasks:
del self.document_tasks[src_fileid]
with self.completed_lock:
if src_fileid in self.document_completed_tasks:
del self.document_completed_tasks[src_fileid]
if src_fileid in self.document_languages:
del self.document_languages[src_fileid]
self.logger.info(f"Returning {len(completed_tasks)} completed tasks")
return completed_tasks
def _start_queue_monitor(self):
"""Start queue monitoring thread"""
if self._monitor_thread is None or not self._monitor_thread.is_alive():
self._monitor_stop_event.clear()
self._monitor_thread = threading.Thread(
target=self._queue_monitor_worker,
name="MinerU-QueueMonitor",
daemon=True
)
self._monitor_thread.start()
self.logger.info("Queue monitor started")
def _queue_monitor_worker(self):
"""Monitor queue sizes and log statistics"""
while not self._monitor_stop_event.is_set():
try:
# Get queue sizes
parsing_size = self.queues.parsing_queue.qsize()
content_size = self.queues.content_refinement_queue.qsize()
recognition_size = self.queues.image_recognition_queue.qsize()
upload_size = self.queues.image_upload_queue.qsize()
# Get active documents count
with self.tasks_lock:
active_docs = len(self.document_tasks)
total_pending_tasks = sum(len(tasks) for tasks in self.document_tasks.values())
with self.completed_lock:
total_completed = sum(len(tasks) for tasks in self.document_completed_tasks.values())
# Log queue status only when there's activity
if parsing_size > 0 or content_size > 0 or recognition_size > 0 or upload_size > 0 or active_docs > 0:
self.logger.info(
f"Queue Status | Parse: {parsing_size} | "
f"Refine: {content_size} | "
f"Recognize: {recognition_size} | "
f"Upload: {upload_size} | "
f"Docs: {active_docs} | "
f"Tasks: {total_pending_tasks}/{total_completed} (pending/completed)"
)
# Sleep for monitoring interval
time.sleep(5) # Log every 5 seconds
except Exception as e:
self.logger.error(f"Queue monitor error: {e}")
time.sleep(5)
self.logger.info("Queue monitor stopped")
def get_queue_status(self) -> Dict[str, Any]:
"""Get detailed queue status"""
status = {
'queues': {
'parsing': {
'size': self.queues.parsing_queue.qsize(),
'max_size': self.queues.parsing_queue.maxsize
},
'content_refinement': {
'size': self.queues.content_refinement_queue.qsize(),
'max_size': self.queues.content_refinement_queue.maxsize
},
'image_recognition': {
'size': self.queues.image_recognition_queue.qsize(),
'max_size': self.queues.image_recognition_queue.maxsize
},
'image_upload': {
'size': self.queues.image_upload_queue.qsize(),
'max_size': self.queues.image_upload_queue.maxsize
}
},
'documents': {},
'threads': {
'active': len([t for t in self.threads if t.is_alive()]),
'total': len(self.threads)
}
}
# Get per-document status
with self.tasks_lock:
for doc_id, tasks in self.document_tasks.items():
doc_status = {
'total_pages': len(tasks),
'processing': 0,
'pending': 0
}
for task in tasks.values():
if task.status == TaskStatus.PROCESSING:
doc_status['processing'] += 1
elif task.status == TaskStatus.PENDING:
doc_status['pending'] += 1
with self.completed_lock:
completed = len(self.document_completed_tasks.get(doc_id, []))
doc_status['completed'] = completed
status['documents'][doc_id[:8] + '...'] = doc_status
return status
async def shutdown(self):
"""Shutdown all processing threads"""
self.logger.info("Shutting down parallel processor...")
# Signal shutdown
self.shutdown_event.set()
self._monitor_stop_event.set()
# Wait for threads to complete
for thread in self.threads:
thread.join(timeout=5.0)
if thread.is_alive():
self.logger.warning(f"Thread {thread.name} did not stop gracefully")
# Stop monitor thread
if self._monitor_thread and self._monitor_thread.is_alive():
self._monitor_thread.join(timeout=2.0)
# Clear threads list
self.threads.clear()
# Cleanup async components
if self._initialized:
await self.image_processor.cleanup()
self._initialized = False
# Clear all document data
with self.tasks_lock:
self.document_tasks.clear()
with self.completed_lock:
self.document_completed_tasks.clear()
self.document_languages.clear()
self.logger.info("Parallel processor shutdown complete")