"""
|
|
Parallel processing module for MinerU with task-level granularity.
|
|
|
|
This module implements a producer-consumer pattern with four independent
|
|
processing threads for maximum parallelism.
|
|
"""
|
|
|
|
import asyncio
|
|
import os
|
|
import queue
|
|
import threading
|
|
import time
|
|
from typing import Dict, List, Optional, Tuple, Any
|
|
from dataclasses import dataclass, field
|
|
from enum import Enum
|
|
from concurrent.futures import ThreadPoolExecutor
|
|
from .logger import get_module_logger
|
|
logger = get_module_logger('parallel_processor')
|
|
|
|
from .config_base import MinerUConfig
|
|
from .converter import DocumentConverter
|
|
from .api_client import MinerUAPIClient, MinerUResult
|
|
from .image_processor import MinerUImageProcessor
|
|
from .content_processor import MinerUContentProcessor
|
|
|
|
|
|
class TaskStatus(Enum):
    """Task processing status"""
    PENDING = "pending"
    PROCESSING = "processing"
    COMPLETED = "completed"
    FAILED = "failed"


@dataclass
class PageTask:
    """Represents a page-level processing task"""
    page_idx: int
    pdf_path: str
    temp_dir: str
    src_fileid: str
    status: TaskStatus = TaskStatus.PENDING
    content: Optional[str] = None
    images: List[str] = field(default_factory=list)
    refined_content: Optional[str] = None
    processed_images: Dict[str, str] = field(default_factory=dict)
    image_descriptions: Dict[str, Dict] = field(default_factory=dict)
    error: Optional[str] = None
    metadata: Dict[str, Any] = field(default_factory=dict)
    # Non-serializable fields
    upload_callback: Optional[Any] = field(default=None, repr=False)
    upload_options: Optional[Any] = field(default=None, repr=False)
    meaningful_images: List[Any] = field(default_factory=list, repr=False)
    save_callback: Optional[Any] = field(default=None, repr=False)  # Cache save callback
    # Retry tracking
    retry_count: int = 0
    max_retries: int = 3

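
# Illustrative note: a freshly queued PageTask carries only location info
# (page_idx, pdf_path, temp_dir, src_fileid) and starts as PENDING; the workers
# progressively fill in content, refined_content, processed_images and
# image_descriptions as the task moves through the pipeline.
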

@dataclass
class ProcessingQueues:
    """Container for all processing queues"""
    parsing_queue: queue.Queue
    content_refinement_queue: queue.Queue
    image_recognition_queue: queue.Queue
    image_upload_queue: queue.Queue


class ParallelMinerUProcessor:
    """
    Parallel processor implementing task-level granularity.

    Architecture:
    - 4 independent threads: parsing, content refinement, image recognition, image upload
    - Producer-consumer pattern with queues connecting stages
    - Page-level task granularity for maximum parallelism
    """

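    # Typical lifecycle (illustrative sketch, assuming a populated MinerUConfig):
    #
    #     processor = ParallelMinerUProcessor(config)
    #     tasks = await processor.process_document(pdf_path, temp_dir, src_fileid,
    #                                              is_ppt_format=False, total_pages=n)
    #     await processor.shutdown()
    #
    # process_document() calls initialize() and initialize_threads() on first use,
    # so callers normally only need the calls shown above.
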
    def __init__(self, config: MinerUConfig, learn_type: int = 9, platform_adapter=None):
        self.config = config
        self.learn_type = learn_type
        self.logger = logger
        self.platform_adapter = platform_adapter

        # Initialize components
        self.converter = DocumentConverter(config)
        self.content_processor = MinerUContentProcessor(config)
        self.image_processor = MinerUImageProcessor(config)

        # Initialize queues
        self.queues = ProcessingQueues(
            parsing_queue=queue.Queue(maxsize=config.queue_size),
            content_refinement_queue=queue.Queue(maxsize=config.queue_size),
            image_recognition_queue=queue.Queue(maxsize=config.queue_size),
            image_upload_queue=queue.Queue(maxsize=config.queue_size)
        )

        # Task tracking - use document-specific dictionaries
        self.document_tasks: Dict[str, Dict[int, PageTask]] = {}  # src_fileid -> page_idx -> task
        self.tasks_lock = threading.Lock()

        # Thread control
        self.shutdown_event = threading.Event()
        self.threads: List[threading.Thread] = []

        # Results collection - per document
        self.document_completed_tasks: Dict[str, List[PageTask]] = {}  # src_fileid -> completed tasks
        self.completed_lock = threading.Lock()

        # Async initialization flag
        self._initialized = False

        # Queue monitoring
        self._monitor_thread = None
        self._monitor_stop_event = threading.Event()

        # Document-level language detection cache
        self.document_languages: Dict[str, str] = {}  # src_fileid -> detected language

    async def initialize(self):
        """Initialize async components like image processor"""
        if not self._initialized:
            await self.image_processor.initialize()
            self._initialized = True

    def initialize_threads(self):
        """
        Initialize and start all processing threads.

        This is the main initialization function that starts:
        1. Parsing thread - processes MinerU API calls
        2. Content refinement thread - refines content with AI
        3. Image recognition thread - classifies and recognizes images
        4. Image upload thread - uploads images to storage
        """
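        # Pipeline topology (for reference):
        #   parsing_queue -> Parser -> content_refinement_queue -> ContentRefiner
        #                           -> image_recognition_queue  -> ImageRecognizer
        #                                                        -> image_upload_queue -> ImageUploader
        # A page is complete once its content is refined and its images (if any)
        # have been classified and, when applicable, uploaded.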
        self.logger.info("Initializing parallel processing threads...")

        # Create threads
        threads = [
            threading.Thread(target=self._run_async_thread, args=(self._parsing_worker,),
                             name="MinerU-Parser"),
            threading.Thread(target=self._run_async_thread, args=(self._content_refinement_worker,),
                             name="MinerU-ContentRefiner"),
            threading.Thread(target=self._run_async_thread, args=(self._image_recognition_worker,),
                             name="MinerU-ImageRecognizer"),
            threading.Thread(target=self._run_async_thread, args=(self._image_upload_worker,),
                             name="MinerU-ImageUploader")
        ]

        # Start all threads
        for thread in threads:
            thread.daemon = True
            thread.start()
            self.threads.append(thread)
            self.logger.info(f"Started thread: {thread.name}")

    def _run_async_thread(self, async_worker):
        """Run async worker in thread with its own event loop"""
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
        try:
            loop.run_until_complete(async_worker())
        finally:
            loop.close()

    async def _parsing_worker(self):
        """
        Parsing worker thread with on-demand batch processing.

        Consumes: PageTask from parsing_queue
        Produces: PageTask to content_refinement_queue and image_recognition_queue
        """
        self.logger.info("Parsing worker started")

        # Cache for batch results: {src_fileid: {batch_idx: MinerUResult}}
        document_batch_results = {}
        # Track which pages belong to which batch
        document_batch_info = {}  # {src_fileid: {'batch_size': int, 'total_pages': int}}

        # Initialize API client
        async with MinerUAPIClient(self.config, self.platform_adapter) as api_client:
            while not self.shutdown_event.is_set():
                try:
                    # Get task from queue (timeout to check shutdown)
                    try:
                        task = self.queues.parsing_queue.get(timeout=1.0)
                    except queue.Empty:
                        continue

                    # Set trace context for this task with page number
                    if self.platform_adapter:
                        self.platform_adapter.set_trace_id(f"{task.src_fileid}_p{task.page_idx + 1}")

                    self.logger.info(f"Parser processing page {task.page_idx + 1}")

                    # Update task status
                    with self.tasks_lock:
                        task.status = TaskStatus.PROCESSING
                        # Track task by document
                        if task.src_fileid not in self.document_tasks:
                            self.document_tasks[task.src_fileid] = {}
                        self.document_tasks[task.src_fileid][task.page_idx] = task

                    try:
                        # Determine batch size and batch index for this page
                        batch_size = self.config.batch_size if self.config.batch_processing_enabled else 0

                        # Initialize document batch info if needed
                        if task.src_fileid not in document_batch_info:
                            # Get total pages from metadata or detect from PDF
                            total_pages = task.metadata.get('total_pages', 0)
                            if total_pages == 0:
                                # Count pages if not provided
                                import fitz
                                with fitz.open(task.pdf_path) as doc:
                                    total_pages = len(doc)

                            document_batch_info[task.src_fileid] = {
                                'batch_size': batch_size if batch_size > 0 and total_pages > batch_size else 0,
                                'total_pages': total_pages
                            }
                            document_batch_results[task.src_fileid] = {}

                        batch_info = document_batch_info[task.src_fileid]

                        # Calculate batch index for this page
                        if batch_info['batch_size'] > 0:
                            batch_idx = task.page_idx // batch_info['batch_size']
                            batch_start = batch_idx * batch_info['batch_size']
                            batch_end = min(batch_start + batch_info['batch_size'], batch_info['total_pages'])

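                            # Worked example: with batch_size=10 and a 25-page document,
                            # page_idx=17 maps to batch_idx=1 (17 // 10), covering
                            # batch_start=10 to batch_end=20; the last batch (batch_idx=2)
                            # is truncated to pages 20-25 by the min() above.
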
                            # Check if we have results for this batch
                            if batch_idx not in document_batch_results[task.src_fileid]:
                                # Need to process this batch
                                self.logger.info(f"Parser: processing batch {batch_idx + 1} (pages {batch_start + 1}-{batch_end}) for {task.src_fileid[:8]}...")

                                # Split PDF for this batch
                                batch_pdf_path = await self._split_pdf_batch(
                                    task.pdf_path, task.temp_dir, batch_start, batch_end, batch_idx
                                )

                                # Process this batch
                                batch_result = await api_client._process_batch_self_hosted(
                                    batch_pdf_path, task.temp_dir, task.src_fileid,
                                    batch_idx, batch_start, batch_end,
                                    task.metadata.get('is_ppt_format', False)
                                ) if self.config.mineru_api_type == "self_hosted" else await api_client._process_batch_cloud(
                                    batch_pdf_path, task.temp_dir, task.src_fileid,
                                    batch_idx, batch_start, batch_end,
                                    task.metadata.get('is_ppt_format', False)
                                )

                                if not batch_result.success:
                                    # Batch failed, try single-page processing for this batch
                                    self.logger.warning(f"Batch {batch_idx + 1} failed, falling back to single-page processing for pages {batch_start + 1}-{batch_end}")

                                    # Mark this batch as needing single-page processing
                                    if task.src_fileid not in document_batch_results:
                                        document_batch_results[task.src_fileid] = {}

                                    # Create a special marker for failed batch
                                    document_batch_results[task.src_fileid][batch_idx] = MinerUResult(
                                        success=False,
                                        content="",
                                        images=[],
                                        tables=[],
                                        metadata={"batch_failed": True, "needs_single_page": True,
                                                  "batch_start": batch_start, "batch_end": batch_end},
                                        error=batch_result.error
                                    )

                                    # Process pages individually for this failed batch
                                    batch_result = await self._process_batch_as_single_pages(
                                        task, api_client, batch_start, batch_end, batch_idx
                                    )

                                    if batch_result.success:
                                        # Update cache with successful single-page results
                                        document_batch_results[task.src_fileid][batch_idx] = batch_result
                                        self.logger.info(f"Parser: single-page processing succeeded for batch {batch_idx + 1}")
                                    else:
                                        raise Exception(batch_result.error or f"Batch {batch_idx} processing failed even with single-page fallback")
                                else:
                                    # Original batch succeeded, cache it
                                    document_batch_results[task.src_fileid][batch_idx] = batch_result
                                    self.logger.info(f"Parser: cached batch {batch_idx + 1} result for {task.src_fileid[:8]}...")

                            # Use cached batch result
                            result = document_batch_results[task.src_fileid][batch_idx]
                            self.logger.debug(f"Parser: using cached batch {batch_idx + 1} for page {task.page_idx + 1}")

                        else:
                            # No batching, process entire document
                            if 'full' not in document_batch_results.get(task.src_fileid, {}):
                                self.logger.info(f"Parser: calling MinerU API for full document {task.src_fileid[:8]}...")
                                result = await api_client.process_document(
                                    task.pdf_path, task.temp_dir, task.src_fileid,
                                    is_ppt_converted=task.metadata.get('is_ppt_format', False),
                                    batch_size=0  # Disable batching
                                )

                                if not result.success:
                                    raise Exception(result.error or "MinerU parsing failed")

                                # Cache the result
                                if task.src_fileid not in document_batch_results:
                                    document_batch_results[task.src_fileid] = {}
                                document_batch_results[task.src_fileid]['full'] = result
                            else:
                                result = document_batch_results[task.src_fileid]['full']
                                self.logger.debug(f"Parser: using cached full result for page {task.page_idx + 1}")

                        # Extract page-specific content from the full document result
                        self.logger.debug(f"Extracting page data for page {task.page_idx + 1}")
                        self.logger.debug(f"Result type: {type(result)}, has content: {hasattr(result, 'content')}")
                        if hasattr(result, 'metadata'):
                            self.logger.debug(f"Result metadata keys: {list(result.metadata.keys())}")

                        page_data = self._extract_page_data(result, task.page_idx)
                        task.content = page_data.get('content', '')
                        task.images = page_data.get('images', [])
                        task.metadata.update(page_data.get('metadata', {}))

                        self.logger.debug(f"Extracted content length: {len(task.content)}, images: {len(task.images)}")

                        # Store content_list in metadata for context extraction
                        if result.metadata.get('content_list'):
                            task.metadata['content_list'] = result.metadata['content_list']

                        # Use language detected from the full document
                        if result.metadata.get('detected_language'):
                            # Store document-level language from API result
                            if task.src_fileid not in self.document_languages:
                                self.document_languages[task.src_fileid] = result.metadata['detected_language']
                                self.logger.info(f"Parser: using document language from API: {result.metadata['detected_language']} for {task.src_fileid[:8]}...")
                        else:
                            # Fallback: detect language from content if available
                            if task.content and task.content.strip():
                                from .language_detector import LanguageDetector
                                language_code, confidence = LanguageDetector.detect_language(task.content)
                                if confidence > 0.7:
                                    # Store document-level language
                                    if task.src_fileid not in self.document_languages:
                                        self.document_languages[task.src_fileid] = language_code
                                        self.logger.info(f"Parser: detected document language (fallback): {language_code} for {task.src_fileid[:8]}...")

                        # Send to next stages (task already contains upload_callback and upload_options)
                        if task.content:
                            self.queues.content_refinement_queue.put(task)
                        else:
                            # No content, set refined_content to empty
                            task.refined_content = ''

                        if task.images:
                            self.queues.image_recognition_queue.put(task)
                        else:
                            # No images to process
                            task.metadata['images_processed'] = True

                        # If page has neither content nor images, mark as complete
                        if not task.content and not task.images:
                            task.refined_content = ''
                            self._check_task_completion(task)

                        self.logger.info(f"Parser completed page {task.page_idx + 1}")

                    except Exception as e:
                        # Page-level task retry (for handling queue/processing failures, not API failures)
                        # Note: batch API calls don't retry here; they fall back to single-page processing
                        # Single-page API calls have their own retry logic in api_client._retry_with_backoff
                        task.retry_count += 1

                        if task.retry_count <= task.max_retries:
                            self.logger.warning(f"Parser task failed for page {task.page_idx + 1} (attempt {task.retry_count}/{task.max_retries}): {e}")
                            # Put the task back in the queue for retry
                            with self.tasks_lock:
                                task.status = TaskStatus.PENDING
                                task.error = f"Task retry {task.retry_count}: {str(e)}"

                            # Add exponential backoff delay
                            retry_delay = min(2.0 * (2 ** (task.retry_count - 1)), 30.0)
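                            # Backoff example: retry 1 waits 2s, retry 2 waits 4s, retry 3
                            # waits 8s; the min() caps the delay at 30s from retry 5 onward
                            # (relevant only when max_retries is configured above the default).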
                            await asyncio.sleep(retry_delay)

                            # Re-queue the task for retry
                            self.queues.parsing_queue.put(task)
                            self.logger.info(f"Re-queued page {task.page_idx + 1} task for retry after {retry_delay:.1f}s delay")
                        else:
                            # Max retries exceeded, mark as failed
                            self.logger.error(f"Parser task failed for page {task.page_idx + 1} after {task.max_retries} retries: {e}")
                            with self.tasks_lock:
                                task.status = TaskStatus.FAILED
                                task.error = f"Task failed after {task.max_retries} retries: {str(e)}"
                            self._mark_task_completed(task)

                    finally:
                        self.queues.parsing_queue.task_done()

                        # Clean up cached results for completed documents
                        # Check if all pages for this document are processed
                        with self.tasks_lock:
                            if task.src_fileid in self.document_tasks:
                                doc_tasks = self.document_tasks[task.src_fileid]
                                all_processed = all(
                                    t.status in [TaskStatus.COMPLETED, TaskStatus.FAILED]
                                    for t in doc_tasks.values()
                                )
                                if all_processed:
                                    # All pages processed, clean up cache
                                    if task.src_fileid in document_batch_results:
                                        del document_batch_results[task.src_fileid]
                                        self.logger.info(f"Parser: cleaned up cached batch results for document {task.src_fileid[:8]}...")
                                    if task.src_fileid in document_batch_info:
                                        del document_batch_info[task.src_fileid]

                except Exception as e:
                    self.logger.error(f"Parsing worker error: {e}")

        self.logger.info("Parsing worker stopped")

    async def _content_refinement_worker(self):
        """
        Content refinement worker thread.

        Consumes: PageTask from content_refinement_queue
        Updates: PageTask with refined content
        """
        self.logger.info("Content refinement worker started")

        while not self.shutdown_event.is_set():
            try:
                # Get task from queue
                try:
                    task = self.queues.content_refinement_queue.get(timeout=1.0)
                except queue.Empty:
                    continue

                # Set trace context for this task with page number
                if self.platform_adapter:
                    self.platform_adapter.set_trace_id(f"{task.src_fileid}_p{task.page_idx + 1}")

                self.logger.info(f"Refiner processing page {task.page_idx + 1}")

                try:
                    # Get document language
                    language_code = self.document_languages.get(task.src_fileid)

                    # Process content refinement for single page
                    page_result = await self.content_processor.process_page_content(
                        task.content, task.images, task.pdf_path, task.page_idx,
                        task.temp_dir, task.src_fileid, self.learn_type, language_code
                    )

                    # Extract results
                    task.refined_content = page_result['content']
                    task.metadata['has_tables'] = page_result['has_tables']
                    task.metadata['content_processing'] = page_result.get('processing_metadata', {})

                    self.logger.info(f"Refiner completed page {task.page_idx + 1}")

                    # Check if this task is complete
                    self._check_task_completion(task)

                except Exception as e:
                    self.logger.error(f"Refiner failed for page {task.page_idx + 1}: {e}")
                    task.refined_content = task.content  # Fallback
                    self._check_task_completion(task)

                finally:
                    self.queues.content_refinement_queue.task_done()
            except Exception as e:
                self.logger.error(f"Content refinement worker error: {e}")

        self.logger.info("Content refinement worker stopped")

    async def _image_recognition_worker(self):
        """
        Image recognition worker thread.

        Consumes: PageTask from image_recognition_queue
        Produces: PageTask to image_upload_queue
        """
        self.logger.info("Image recognition worker started")

        try:
            while not self.shutdown_event.is_set():
                try:
                    # Get task from queue
                    try:
                        task = self.queues.image_recognition_queue.get(timeout=1.0)
                    except queue.Empty:
                        continue

                    # Set trace context for this task with page number
                    if self.platform_adapter:
                        self.platform_adapter.set_trace_id(f"{task.src_fileid}_p{task.page_idx + 1}")

                    self.logger.info(f"Recognizer processing {len(task.images)} images for page {task.page_idx + 1}")

                    try:
                        self.logger.info(f"Recognizer: loading images from {task.temp_dir}")

                        # Create wrapper for classification without upload
                        async def classify_only(learn_type, image_filepath, temp_dir, src_name, hint=""):
                            # Use document-level language directly
                            language_code = self.document_languages.get(task.src_fileid)
                            if language_code:
                                self.logger.info(f"Recognizer: using document language: {language_code} for page {task.page_idx + 1}")
                            else:
                                self.logger.info(f"Recognizer: no document language detected for {task.src_fileid[:8]}")

                            # Extract context for the image if content is available
                            context = None
                            if task.content:
                                try:
                                    # Extract the filename from filepath
                                    import os
                                    import re
                                    filename = os.path.basename(image_filepath)

                                    # Create ImageContext from page content
                                    from .context_types import ImageContext

                                    # Try to find image reference in content
                                    # Look for markdown references like ![...](.../filename.png) or just the bare filename
                                    surrounding_text = ""
                                    before_text = ""
                                    after_text = ""

                                    # Extract base name without extension for more flexible matching
                                    base_name = filename.replace('.png', '').replace('.jpg', '').replace('.jpeg', '')

                                    # Try different patterns to find the image reference
                                    patterns = [
                                        rf'!\[.*?\]\([^)]*{re.escape(filename)}[^)]*\)',   # Full filename in markdown
                                        rf'!\[.*?\]\([^)]*{re.escape(base_name)}[^)]*\)',  # Base name in markdown
                                        rf'{re.escape(filename)}',                         # Just the filename
                                        rf'{re.escape(base_name)}'                         # Just the base name
                                    ]

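                                    # Illustrative example: for a filename like "mineru_image_12.png",
                                    # the first pattern matches a reference such as
                                    # ![](images/mineru_image_12.png) in the page markdown, while the
                                    # later, looser patterns act as fallbacks when the reference drops
                                    # the extension or the markdown syntax altogether.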
                                    image_position = -1
                                    for pattern in patterns:
                                        match = re.search(pattern, task.content)
                                        if match:
                                            image_position = match.start()
                                            self.logger.debug(f"Found image reference at position {image_position} using pattern: {pattern}")
                                            break

                                    if image_position >= 0:
                                        # Extract 250 characters before and after the image reference
                                        start_pos = max(0, image_position - 250)
                                        end_pos = min(len(task.content), image_position + 250)

                                        before_text = task.content[start_pos:image_position].strip()
                                        after_text = task.content[image_position:end_pos].strip()
                                        surrounding_text = task.content[start_pos:end_pos].strip()

                                        self.logger.info(f"Recognizer: found image {filename} at position {image_position}, "
                                                         f"extracted {len(before_text)} chars before and {len(after_text)} chars after")
                                    else:
                                        # Fallback: if image reference not found, use first 500 chars of content
                                        surrounding_text = task.content[:500] if len(task.content) > 500 else task.content
                                        self.logger.warning(f"Recognizer: could not find image reference for {filename}, using fallback context")

                                    context = ImageContext(
                                        page_idx=task.page_idx,
                                        surrounding_text=surrounding_text,
                                        page_type='content',  # Could be enhanced to detect actual page type
                                        token_count=len(surrounding_text) // 4,  # Rough token estimate
                                        before_text=before_text,
                                        after_text=after_text
                                    )

                                    self.logger.info(f"Recognizer: created context for image {filename} with {len(surrounding_text)} chars of surrounding text")
                                except Exception as e:
                                    self.logger.warning(f"Recognizer: failed to create context for image: {e}")
                                    context = None

                            # Use the image processor's classification method with context
                            return await self.image_processor._classify_single_image_with_context(
                                learn_type, image_filepath, temp_dir, src_name, hint, context, language_code
                            )

                        # Load and classify images
                        images_to_process = []
                        for img_filename in task.images:
                            img_filepath = os.path.join(task.temp_dir, img_filename)
                            self.logger.info(f"Recognizer: checking image {img_filename} at {img_filepath}")
                            if os.path.exists(img_filepath):
                                xref = img_filename.replace('.png', '').replace('mineru_image_', '')
                                self.logger.info(f"Recognizer: loading image info for {img_filename}, xref={xref}")
                                image_info = await self.image_processor.image_optimizer.load_image_info(
                                    img_filepath, img_filename, xref
                                )
                                images_to_process.append(image_info)
                                self.logger.info(f"Recognizer: loaded image info: {image_info}")
                            else:
                                self.logger.warning(f"Recognizer: image file not found: {img_filepath}")

                        if images_to_process:
                            # Classify images sequentially (not concurrently) to avoid pressure on multimodal service
                            self.logger.info(f"Recognizer: classifying {len(images_to_process)} images sequentially for page {task.page_idx + 1}")
                            classification_results = await self.image_processor.image_optimizer.batch_classify_images(
                                images_to_process, classify_only, self.learn_type,
                                task.temp_dir, task.src_fileid
                            )

                            # Filter meaningful images based on configuration
                            meaningful_images = []
                            for image_info in images_to_process:
                                xref = image_info.xref
                                if xref in classification_results:
                                    result = classification_results[xref]
                                else:
                                    # No classification result - likely an error occurred
                                    self.logger.warning(f"Recognizer: no classification result for {image_info.filename}, creating default result")
                                    result = {
                                        'type': 'meaningless',
                                        'content': 'Classification failed - no result returned',
                                        'input_tokens': 0,
                                        'output_tokens': 0,
                                        'error': 'No classification result'
                                    }

                                # Apply meaningless filter if configured
                                if self.config.filter_meaningless_images and result.get('type') == 'meaningless':
                                    self.logger.info(f"Recognizer: filtering out meaningless image {image_info.filename}")
                                    # Still store the classification for reference
                                    task.image_descriptions[image_info.filename] = result
                                else:
                                    # Either filter is disabled or image is meaningful
                                    meaningful_images.append(image_info)
                                    task.image_descriptions[image_info.filename] = result

                            # Send to upload queue if there are images to upload
                            # Note: if filter is disabled, we upload all classified images including meaningless ones
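                            # Upload decision, summarising the branch below:
                            #   filter enabled,  some meaningful images -> upload only meaningful_images
                            #   filter enabled,  none meaningful        -> skip upload, mark images processed
                            #   filter disabled, any classified images  -> upload all of images_to_process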
                            if meaningful_images or (not self.config.filter_meaningless_images and images_to_process):
                                if not self.config.filter_meaningless_images:
                                    # If filter is disabled, upload all processed images
                                    task.meaningful_images = images_to_process
                                else:
                                    # If filter is enabled, only upload meaningful images
                                    task.meaningful_images = meaningful_images
                                self.queues.image_upload_queue.put(task)
                            else:
                                # No images to upload
                                task.metadata['images_processed'] = True
                                self._check_task_completion(task)

                            self.logger.info(f"Recognizer completed page {task.page_idx + 1}: "
                                             f"{len(meaningful_images)}/{len(task.images)} meaningful")
                        else:
                            task.metadata['images_processed'] = True
                            self._check_task_completion(task)

                    except Exception as e:
                        self.logger.error(f"Recognizer failed for page {task.page_idx + 1}: {e}")
                        import traceback
                        self.logger.error(f"Recognizer traceback: {traceback.format_exc()}")
                        task.metadata['images_processed'] = True  # Mark as processed to avoid hanging
                        self._check_task_completion(task)

                    finally:
                        self.queues.image_recognition_queue.task_done()

                except Exception as e:
                    self.logger.error(f"Image recognition worker inner error: {e}")
                    import traceback
                    self.logger.error(f"Image recognition worker traceback: {traceback.format_exc()}")

        except Exception as e:
            self.logger.error(f"Image recognition worker error: {e}")
        finally:
            self.logger.info("Image recognition worker stopped")

    async def _image_upload_worker(self):
        """
        Image upload worker thread.

        Consumes: PageTask from image_upload_queue
        Updates: PageTask with uploaded image URLs
        """
        self.logger.info("Image upload worker started")

        while not self.shutdown_event.is_set():
            try:
                # Get task from queue
                try:
                    task = self.queues.image_upload_queue.get(timeout=1.0)
                except queue.Empty:
                    continue

                # Set trace context for this task with page number
                if self.platform_adapter:
                    self.platform_adapter.set_trace_id(f"{task.src_fileid}_p{task.page_idx + 1}")

                meaningful_images = task.meaningful_images
                self.logger.info(f"Uploader processing {len(meaningful_images)} images for page {task.page_idx + 1}")

                try:
                    if meaningful_images:
                        # Get upload callback and options from task
                        upload_callback = task.upload_callback
                        upload_options = task.upload_options

                        self.logger.info(f"Uploader: starting upload for {len(meaningful_images)} images on page {task.page_idx + 1}")

                        if upload_callback:
                            # Batch upload images
                            upload_results = await self.image_processor.image_optimizer.batch_upload_images(
                                meaningful_images, upload_callback, upload_options
                            )

                            self.logger.info(f"Uploader: upload_results keys: {list(upload_results.keys())}")

                            # Map results back to filenames
                            for image_info in meaningful_images:
                                xref = image_info.xref
                                if xref in upload_results and upload_results[xref]:
                                    task.processed_images[image_info.filename] = upload_results[xref]
                                    self.logger.info(f"Uploader: mapped {image_info.filename} -> {upload_results[xref]}")
                                else:
                                    self.logger.warning(f"Uploader: no upload result for {image_info.filename} (xref={xref})")

                            self.logger.info(f"Uploader completed page {task.page_idx + 1}: "
                                             f"{len(task.processed_images)} uploaded")
                        else:
                            self.logger.warning(f"Uploader: no upload_callback provided for page {task.page_idx + 1}")

                    # Mark images as processed
                    task.metadata['images_processed'] = True

                    # Mark task as complete
                    self._check_task_completion(task)

                except Exception as e:
                    self.logger.error(f"Uploader failed for page {task.page_idx + 1}: {e}")
                    self._check_task_completion(task)

                finally:
                    self.queues.image_upload_queue.task_done()

            except Exception as e:
                self.logger.error(f"Image upload worker error: {e}")

        self.logger.info("Image upload worker stopped")

    async def _split_pdf_batch(self, pdf_path: str, temp_dir: str, start_page: int,
                               end_page: int, batch_idx: int) -> str:
        """
        Split PDF to extract specific pages for batch processing.

        Args:
            pdf_path: Original PDF path
            temp_dir: Temporary directory
            start_page: Start page index (0-based)
            end_page: End page index (exclusive)
            batch_idx: Batch index for naming

        Returns:
            Path to the split PDF file
        """
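        # Example: start_page=10, end_page=20, batch_idx=1 writes
        # "<temp_dir>/batch_1.pdf" containing the document's pages 11-20
        # (1-based), i.e. ten pages copied in order.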
        import fitz
        batch_pdf_path = os.path.join(temp_dir, f"batch_{batch_idx}.pdf")

        with fitz.open(pdf_path) as src_doc:
            batch_doc = fitz.open()  # Create new PDF

            # Copy pages to new document
            for page_idx in range(start_page, end_page):
                batch_doc.insert_pdf(src_doc, from_page=page_idx, to_page=page_idx)

            batch_doc.save(batch_pdf_path)
            batch_doc.close()

        self.logger.info(f"Created batch PDF with {end_page - start_page} pages: {batch_pdf_path}")
        return batch_pdf_path

    async def _process_batch_as_single_pages(self, task: PageTask, api_client: MinerUAPIClient,
                                             batch_start: int, batch_end: int, batch_idx: int) -> MinerUResult:
        """
        Process a failed batch by trying each page individually.
        This helps identify which specific page is causing the batch to fail.

        Args:
            task: The current page task
            api_client: MinerU API client
            batch_start: Start page index of the batch
            batch_end: End page index of the batch
            batch_idx: Batch index

        Returns:
            MinerUResult with combined results from successful pages
        """
        self.logger.info(f"Starting single-page processing for failed batch {batch_idx + 1} (pages {batch_start + 1}-{batch_end})")

        successful_pages = []
        failed_pages = []
        all_content_parts = []
        all_images = []
        all_tables = []
        page_data = {}

        # Try each page individually
        for page_idx in range(batch_start, batch_end):
            try:
                self.logger.info(f"Processing single page {page_idx + 1}/{batch_end}")

                # Create single-page PDF
                single_page_pdf = os.path.join(task.temp_dir, f"single_page_{page_idx}.pdf")

                import fitz
                with fitz.open(task.pdf_path) as src_doc:
                    single_doc = fitz.open()
                    single_doc.insert_pdf(src_doc, from_page=page_idx, to_page=page_idx)
                    single_doc.save(single_page_pdf)
                    single_doc.close()

                # Process single page WITH retry logic
                if self.config.mineru_api_type == "self_hosted":
                    # Use retry wrapper for single-page processing
                    page_result = await api_client._retry_with_backoff(
                        api_client._process_batch_self_hosted_impl,
                        single_page_pdf, task.temp_dir, task.src_fileid,
                        page_idx, page_idx, page_idx + 1,
                        task.metadata.get('is_ppt_format', False)
                    )
                else:
                    # Use retry wrapper for single-page processing
                    page_result = await api_client._retry_with_backoff(
                        api_client._process_batch_cloud_impl,
                        single_page_pdf, task.temp_dir, task.src_fileid,
                        page_idx, page_idx, page_idx + 1,
                        task.metadata.get('is_ppt_format', False)
                    )

                # Clean up single-page PDF
                if os.path.exists(single_page_pdf):
                    os.remove(single_page_pdf)

                if page_result.success:
                    successful_pages.append(page_idx + 1)
                    all_content_parts.append(page_result.content)
                    all_images.extend(page_result.images)
                    all_tables.extend(page_result.tables)
                    page_data[page_idx] = {
                        'content': page_result.content,
                        'images': page_result.images,
                        'tables': page_result.tables,
                        'metadata': page_result.metadata
                    }
                    self.logger.success(f"✓ Page {page_idx + 1} processed successfully")
                else:
                    failed_pages.append(page_idx + 1)
                    self.logger.error(f"✗ Page {page_idx + 1} failed: {page_result.error}")
                    # Add empty data for failed page
                    page_data[page_idx] = {
                        'content': '',
                        'images': [],
                        'tables': [],
                        'metadata': {'error': page_result.error, 'failed': True}
                    }

            except Exception as e:
                failed_pages.append(page_idx + 1)
                self.logger.error(f"✗ Page {page_idx + 1} processing error: {str(e)}")
                # Add empty data for failed page
                page_data[page_idx] = {
                    'content': '',
                    'images': [],
                    'tables': [],
                    'metadata': {'error': str(e), 'failed': True}
                }

        # Log summary
        self.logger.info(f"Single-page processing complete for batch {batch_idx + 1}:")
        self.logger.info(f" - Successful pages: {successful_pages} ({len(successful_pages)}/{batch_end - batch_start})")
        if failed_pages:
            self.logger.warning(f" - Failed pages: {failed_pages} ({len(failed_pages)}/{batch_end - batch_start})")

        # Return combined result
        success = len(successful_pages) > 0  # Partial success if at least one page worked

        metadata = {
            "batch_idx": batch_idx,
            "start_page": batch_start,
            "end_page": batch_end,
            "pages_in_batch": batch_end - batch_start,
            "successful_pages": successful_pages,
            "failed_pages": failed_pages,
            "single_page_fallback": True,
            "page_data": page_data
        }

        return MinerUResult(
            success=success,
            content="\n\n".join(all_content_parts),
            images=all_images,
            tables=all_tables,
            metadata=metadata,
            error=f"Failed pages: {failed_pages}" if failed_pages else None
        )

    def _extract_page_data(self, mineru_result, page_idx: int) -> Dict:
        """Extract data for a specific page from MinerU result"""
        # Check if we have pre-split page data
        if mineru_result.metadata.get('page_data'):
            page_data = mineru_result.metadata['page_data']
            self.logger.debug(f"Found page_data with keys: {list(page_data.keys())}")

            # Try both integer and string keys
            if page_idx in page_data:
                self.logger.debug(f"Found data for page_idx {page_idx} (int key)")
                return {
                    'content': page_data[page_idx].get('content', ''),
                    'images': page_data[page_idx].get('images', []),
                    'metadata': page_data[page_idx].get('metadata', {})
                }
            elif str(page_idx) in page_data:
                self.logger.debug(f"Found data for page_idx {page_idx} (str key)")
                return {
                    'content': page_data[str(page_idx)].get('content', ''),
                    'images': page_data[str(page_idx)].get('images', []),
                    'metadata': page_data[str(page_idx)].get('metadata', {})
                }
            else:
                self.logger.warning(f"No data found for page_idx {page_idx} in page_data")

        # Fallback: split content by page markers
        content_parts = mineru_result.content.split(f'__PAGE_OF_PORTION_{page_idx + 1}__')
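        # Example: for page_idx=1 this keeps the text between the
        # "__PAGE_OF_PORTION_2__" and "__PAGE_OF_PORTION_3__" markers; when the
        # first marker is absent the page is treated as empty.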
        if len(content_parts) > 1:
            # Get content after the page marker until next marker
            page_content = content_parts[1].split(f'__PAGE_OF_PORTION_{page_idx + 2}__')[0]
        else:
            # If no page markers, return empty
            page_content = ''

        # Filter images by page (assuming images have page info in metadata)
        page_images = []

        # For single-page documents, assign all images to page 0
        total_pages = len(mineru_result.metadata.get('page_data', {}))
        if total_pages == 1 and page_idx == 0:
            # Single page document - assign all images to this page
            page_images = list(mineru_result.images)
            self.logger.debug(f"Single-page document: assigning {len(page_images)} images to page 0")
        else:
            # Multi-page document - filter by page markers
            for img in mineru_result.images:
                # This is a simplified version - real implementation would check image metadata
                if f'page_{page_idx}' in img or f'p{page_idx}' in img:
                    page_images.append(img)

        # Handle page content - ensure empty/whitespace pages are properly handled
        stripped_content = page_content.strip()

        # Log if we have a page with only whitespace
        if page_content and not stripped_content:
            self.logger.debug(f"Page {page_idx} contains only whitespace characters")

        return {
            'content': stripped_content,
            'images': page_images,
            'metadata': mineru_result.metadata
        }

    def _check_task_completion(self, task: PageTask):
        """Check if a task is complete and mark it accordingly"""
        should_mark_complete = False

        with self.tasks_lock:
            # Skip if already completed
            if task.status == TaskStatus.COMPLETED:
                self.logger.debug(f"Page {task.page_idx + 1} already completed, skipping")
                return

            # A task is complete when:
            # 1. Content is refined (or original content is used)
            # 2. Images are processed (or there are no images)
            has_content = task.refined_content is not None
            has_no_images = not task.images
            has_processed_images = len(task.images) == 0 or task.metadata.get('images_processed', False)

            # Log the current state for debugging
            self.logger.debug(f"Checking completion for page {task.page_idx + 1}: "
                              f"has_content={has_content}, has_no_images={has_no_images}, "
                              f"has_processed_images={has_processed_images}, "
                              f"images_count={len(task.images)}")

            if has_content and (has_no_images or has_processed_images):
                # Integrate images into content if we have any image descriptions
                # This ensures meaningless images are properly removed from content
                if task.image_descriptions:
                    self.logger.info(f"Page {task.page_idx + 1} processing image integration:")
                    self.logger.info(f" - processed_images: {list(task.processed_images.keys()) if task.processed_images else 'None (filtered out)'}")
                    self.logger.info(f" - image_descriptions: {list(task.image_descriptions.keys())}")
                    self.logger.info(f" - content length before: {len(task.refined_content)} chars")

                    task.refined_content = self._integrate_images_into_content(
                        task.refined_content,
                        task.image_descriptions,
                        task.processed_images or {},  # Pass empty dict if None
                        f"{task.src_fileid}_page_{task.page_idx}"
                    )

                    self.logger.info(f" - content length after: {len(task.refined_content)} chars")
                else:
                    self.logger.info(f"Page {task.page_idx + 1} has no images to process")

                task.status = TaskStatus.COMPLETED
                should_mark_complete = True

        # Call _mark_task_completed outside of the lock to avoid deadlock
        if should_mark_complete:
            self._mark_task_completed(task)

    def _integrate_images_into_content(self, content: str, image_descriptions: Dict[str, Dict],
                                       uploaded_images: Dict[str, str], _page_identifier: str) -> str:
        """
        Integrate image descriptions into content by replacing original image references.

        This method is adapted from image_processor.integrate_image_descriptions
        """
        import re

        try:
            enhanced_content = content

            # Log what we're processing
            self.logger.info(f"Image integration starting for {_page_identifier}:")
            self.logger.info(f" - Image descriptions: {list(image_descriptions.keys())}")
            self.logger.info(f" - Uploaded images: {list(uploaded_images.keys())}")
            self.logger.debug(f" - Content length: {len(content)} chars")

            # First, find all image references in the content
            image_pattern = r'!\[.*?\]\((.*?)\)'
            content_images = re.findall(image_pattern, content)
            self.logger.info(f"Found {len(content_images)} image references in content:")
            for img_ref in content_images[:5]:  # Log first 5
                self.logger.info(f" - {img_ref}")

            # Process each image description
            for filename, desc_info in image_descriptions.items():
                self.logger.info(f"\nChecking image {filename} for replacement")

                # Get image type to determine if it's meaningless
                img_type = desc_info.get('type', 'brief_description')

                # Process ALL images that have descriptions
                # - Meaningless images: remove references (replace with empty string)
                # - Images not uploaded but classified: also remove (likely filtered)
                # - Uploaded images: replace with proper markdown
                uploaded_url = uploaded_images.get(filename, '')

                if img_type == 'meaningless':
                    self.logger.info(f" - Image is meaningless, will remove references")
                elif filename not in uploaded_images:
                    self.logger.info(f" - Image was classified as {img_type} but not uploaded (filtered), will remove references")
                    # Treat as meaningless for removal purposes
                    img_type = 'meaningless'
                else:
                    self.logger.info(f" - Found in uploaded_images: {uploaded_url}")

                # Extract the hash part from the filename
                # This handles filenames like: 390561cb34fd3f951b1d25a252bead1c_page_1_44450601...jpg
                # or mineru_image_xxx.png
                base_filename = filename.replace('.png', '').replace('.jpg', '').replace('.jpeg', '')

                # Try to extract the hash part (usually the long hex string)
                # For mineru images: mineru_image_XXX -> XXX
                # For hash-based: XXX_page_N_YYY -> YYY (the last hash)
                if 'mineru_image_' in filename:
                    ref = base_filename.replace('mineru_image_', '')
                else:
                    # Look for the last hash-like pattern
                    parts = base_filename.split('_')
                    # Find the longest hex-like string
                    hash_parts = [p for p in parts if len(p) > 20 and all(c in '0123456789abcdef' for c in p)]
                    if hash_parts:
                        ref = hash_parts[-1]  # Use the last hash
                    else:
                        ref = base_filename

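                # Ref extraction summary: "mineru_image_42.png" yields ref "42";
                # hash-style names like the one in the comment above yield their
                # trailing long-hex segment; anything else falls back to the base
                # filename.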
                # Build replacement content based on image type
                # img_type already extracted above
                title = desc_info.get('title', '')
                description = desc_info.get('content', '')
                ocr_content = desc_info.get('ocr_content', '')

                # Create the replacement markdown
                if img_type == 'meaningless':
                    # For meaningless images, we want to remove them entirely
                    replacement = ""  # Empty string to remove the image reference
                elif img_type == 'structured_content':
                    # For structured content, include full description
                    replacement = f"\n\n![{title}]({uploaded_url})\n<!--{description}-->\n\n{ocr_content}\n\n"
                else:
                    # For other types, use a simpler format
                    if description:
                        replacement = f"\n\n![{title}]({uploaded_url})\n<!--{description}-->\n"
                    else:
                        replacement = f"\n\n![{title}]({uploaded_url})\n\n"

                # Replace various possible image reference patterns
                # We need to be flexible because the reference in content might be different from our filename
                patterns = []

                # If we found a hash reference, try to match it in various formats
                if ref and len(ref) > 20:  # Likely a hash
                    patterns.extend([
                        f"!\\[.*?\\]\\(.*?{ref}.*?\\)",           # Match hash anywhere in path
                        f"!\\[\\]\\(.*?{ref}.*?\\)",              # Empty alt text with hash
                        f"!\\[.*?\\]\\(images/{ref}\\.[^)]+\\)",  # images/hash.ext
                        f"!\\[\\]\\(images/{ref}\\.[^)]+\\)",     # images/hash.ext with empty alt
                    ])

                # Always try the full filename patterns
                patterns.extend([
                    f"!\\[.*?\\]\\(.*?{re.escape(filename)}\\)",       # Match exact filename
                    f"!\\[.*?\\]\\(.*?{re.escape(base_filename)}\\)",  # Match base filename
                ])

                # Add generic mineru image pattern if applicable
                if 'mineru_image_' in filename:
                    patterns.append(f"!\\[.*?\\]\\(.*?{filename}\\)")

                self.logger.info(f" - extracted ref: '{ref}'")
                self.logger.info(f" - trying {len(patterns)} patterns")

                replaced = False
                for pattern in patterns:
                    new_content = re.sub(pattern, replacement, enhanced_content)
                    if new_content != enhanced_content:
                        enhanced_content = new_content
                        replaced = True
                        self.logger.info(f"Successfully replaced image {filename} using pattern: {pattern}")
                        break

                if not replaced:
                    self.logger.warning(f"Failed to replace image reference for: {filename}, ref={ref}")
                    # Log the first few characters of content to help debugging
                    sample = enhanced_content[:500] if len(enhanced_content) > 500 else enhanced_content
                    self.logger.info(f"Content sample: {sample}...")
                    # Also log the exact patterns we tried
                    self.logger.info(f"Tried patterns:")
                    for p in patterns:
                        self.logger.info(f" - {p}")

            return enhanced_content

        except Exception as e:
            self.logger.error(f"Error integrating images: {str(e)}")
            return content  # Return original content on error

    def _mark_task_completed(self, task: PageTask):
        """Mark a task as completed and add to results"""
        self.logger.debug(f"Marking task as completed: page {task.page_idx + 1}, status={task.status}, "
                          f"src_fileid={task.src_fileid[:8]}...")

        # Get total pages first (outside of completed_lock to avoid potential deadlock)
        with self.tasks_lock:
            doc_tasks = self.document_tasks.get(task.src_fileid, {})
            total_pages = len(doc_tasks)

        try:
            # Save to cache if callback provided and task succeeded
            if task.status == TaskStatus.COMPLETED:
                save_callback = task.save_callback  # Get from non-serializable field
                if save_callback:
                    try:
                        # Prepare page data for caching
                        page_data = {
                            'content': task.refined_content or task.content or '',
                            'images': task.images,
                            'processed_images': task.processed_images,
                            'image_descriptions': task.image_descriptions,
                            'has_tables': task.metadata.get('has_tables', False),
                            'processing_metadata': {k: v for k, v in task.metadata.items()
                                                    if k not in ['save_callback', 'upload_callback']}
                        }
                        save_callback(task.page_idx, page_data)
                        self.logger.debug(f"Saved page {task.page_idx + 1} to cache")
                    except Exception as e:
                        self.logger.error(f"Failed to save page {task.page_idx + 1} to cache: {e}")

            with self.completed_lock:
                self.logger.debug(f"Acquired completed_lock for marking task")
                # Initialize list for this document if needed
                if task.src_fileid not in self.document_completed_tasks:
                    self.document_completed_tasks[task.src_fileid] = []
                    self.logger.debug(f"Initialized completed tasks list for doc {task.src_fileid[:8]}")

                self.document_completed_tasks[task.src_fileid].append(task)
                completed_count = len(self.document_completed_tasks[task.src_fileid])
                self.logger.debug(f"Added task to completed list, count now: {completed_count}")

            self.logger.info(f"Task completed: page {task.page_idx + 1} "
                             f"[{completed_count}/{total_pages}] for doc {task.src_fileid[:8]}...")
        except Exception as e:
            self.logger.error(f"Error in _mark_task_completed: {e}", exc_info=True)

    async def process_document_with_cache(self, pdf_path: str, temp_dir: str, src_fileid: str,
                                          is_ppt_format: bool, total_pages: int,
                                          upload_callback=None, upload_options=None,
                                          cached_pages=None, save_callback=None) -> List[PageTask]:
        """
        Process a document using parallel pipeline with cache support.

        Args:
            pdf_path: Path to PDF file
            temp_dir: Temporary directory
            src_fileid: Source file ID
            is_ppt_format: Whether document is in PPT format
            total_pages: Total number of pages
            upload_callback: Callback for uploading images
            upload_options: Upload options
            cached_pages: Dictionary of already cached pages {page_idx: page_data}
            save_callback: Callback to save page to cache when completed

        Returns:
            List of completed PageTask objects
        """
        # Process with cache support
        return await self._process_document_internal(
            pdf_path, temp_dir, src_fileid, is_ppt_format, total_pages,
            upload_callback, upload_options, cached_pages, save_callback
        )

    async def process_document(self, pdf_path: str, temp_dir: str, src_fileid: str,
                               is_ppt_format: bool, total_pages: int,
                               upload_callback=None, upload_options=None) -> List[PageTask]:
        """
        Process a document using parallel pipeline.

        Args:
            pdf_path: Path to PDF file
            temp_dir: Temporary directory
            src_fileid: Source file ID
            is_ppt_format: Whether document is in PPT format
            total_pages: Total number of pages
            upload_callback: Image upload callback
            upload_options: Upload options

        Returns:
            List of completed PageTask objects
        """
        # Process without cache support
        return await self._process_document_internal(
            pdf_path, temp_dir, src_fileid, is_ppt_format, total_pages,
            upload_callback, upload_options, None, None
        )

    async def _process_document_internal(self, pdf_path: str, temp_dir: str, src_fileid: str,
                                         is_ppt_format: bool, total_pages: int,
                                         upload_callback=None, upload_options=None,
                                         cached_pages=None, save_callback=None) -> List[PageTask]:
        """
        Internal method to process document with optional cache support.
        """
        cached_pages = cached_pages or {}
        self.logger.info(f"Starting parallel processing for {total_pages} pages, {len(cached_pages)} already cached")

        # Initialize async components
        await self.initialize()

        # Initialize threads if not already running
        if not self.threads:
            self.initialize_threads()
            # Start queue monitor
            self._start_queue_monitor()

        # Initialize document-specific tracking
        with self.tasks_lock:
            self.document_tasks[src_fileid] = {}
        with self.completed_lock:
            self.document_completed_tasks[src_fileid] = []

        # Create tasks for each page
        for page_idx in range(total_pages):
            # Check if page is already cached
            if page_idx in cached_pages:
                # Create a completed task from cached data
                cached_data = cached_pages[page_idx]
                task = PageTask(
                    page_idx=page_idx,
                    pdf_path=pdf_path,
                    temp_dir=temp_dir,
                    src_fileid=src_fileid,
                    status=TaskStatus.COMPLETED,
                    content=cached_data.get('content', ''),
                    images=cached_data.get('metadata', {}).get('images', []),
                    refined_content=cached_data.get('content', ''),  # Use cached content as refined
                    processed_images=cached_data.get('metadata', {}).get('processed_images', {}),
                    image_descriptions=cached_data.get('metadata', {}).get('image_descriptions', {}),
                    metadata=cached_data.get('metadata', {}),
                    max_retries=self.config.api_max_retries  # Use retry config from settings
                )
                with self.tasks_lock:
                    self.document_tasks[src_fileid][page_idx] = task
                with self.completed_lock:
                    self.document_completed_tasks[src_fileid].append(task)
                self.logger.info(f"Loaded page {page_idx + 1} from cache")
            else:
                # Create new task for processing
                task = PageTask(
                    page_idx=page_idx,
                    pdf_path=pdf_path,
                    temp_dir=temp_dir,
                    src_fileid=src_fileid,
                    metadata={
                        'is_ppt_format': is_ppt_format
                    },
                    upload_callback=upload_callback,
                    upload_options=upload_options,
                    save_callback=save_callback,  # Store as non-serializable field
                    max_retries=self.config.api_max_retries  # Use retry config from settings
                )
                with self.tasks_lock:
                    self.document_tasks[src_fileid][page_idx] = task
                self.queues.parsing_queue.put(task)

        # Wait for all tasks to complete for this document
        start_time = time.time()
        self.logger.info(f"Waiting for {total_pages} pages to complete for document {src_fileid[:8]}...")

        while True:
            with self.completed_lock:
                # Debug: log the src_fileid we're looking for
                self.logger.debug(f"Looking for src_fileid: {src_fileid}")
                completed_tasks_for_doc = self.document_completed_tasks.get(src_fileid, [])
                completed_count = len(completed_tasks_for_doc)

                # Debug: log the actual keys in document_completed_tasks
                available_docs = list(self.document_completed_tasks.keys())
                if len(available_docs) > 0:
                    # Show full keys for debugging
                    self.logger.debug(f"Looking for: {src_fileid}")
                    self.logger.debug(f"Available docs: {available_docs}")
                    self.logger.debug(f"Tasks for this doc: {completed_count}")

            # Log progress every few iterations
            if int(time.time() - start_time) % 5 == 0:
                self.logger.info(f"Progress: {completed_count}/{total_pages} pages completed")

            if completed_count >= total_pages:
                self.logger.info(f"All {total_pages} pages completed!")
                break

            # Check for timeout
            if time.time() - start_time > self.config.processing_timeout:
                self.logger.error(f"Processing timeout reached after {self.config.processing_timeout}s")
                self.logger.error(f"Only {completed_count}/{total_pages} pages completed")
                break

            await asyncio.sleep(0.5)

        self.logger.info(f"Parallel processing completed in {time.time() - start_time:.2f}s")

        # Get and sort completed tasks for this document (with proper locking)
        with self.completed_lock:
            completed_tasks = list(self.document_completed_tasks.get(src_fileid, []))
            self.logger.info(f"Retrieved {len(completed_tasks)} completed tasks for document")

        completed_tasks.sort(key=lambda t: t.page_idx)

        # Log task details before cleanup
        try:
            for task in completed_tasks:
                self.logger.debug(f"Completed task: page {task.page_idx + 1}, "
                                  f"has_content={bool(task.refined_content)}, "
                                  f"images={len(task.processed_images)}")
        except Exception as e:
            self.logger.error(f"Error logging task details: {e}")

        # Clean up document-specific data
        with self.tasks_lock:
            if src_fileid in self.document_tasks:
                del self.document_tasks[src_fileid]
        with self.completed_lock:
            if src_fileid in self.document_completed_tasks:
                del self.document_completed_tasks[src_fileid]
            if src_fileid in self.document_languages:
                del self.document_languages[src_fileid]

        self.logger.info(f"Returning {len(completed_tasks)} completed tasks")
        return completed_tasks

    def _start_queue_monitor(self):
        """Start queue monitoring thread"""
        if self._monitor_thread is None or not self._monitor_thread.is_alive():
            self._monitor_stop_event.clear()
            self._monitor_thread = threading.Thread(
                target=self._queue_monitor_worker,
                name="MinerU-QueueMonitor",
                daemon=True
            )
            self._monitor_thread.start()
            self.logger.info("Queue monitor started")

    def _queue_monitor_worker(self):
        """Monitor queue sizes and log statistics"""
        while not self._monitor_stop_event.is_set():
            try:
                # Get queue sizes
                parsing_size = self.queues.parsing_queue.qsize()
                content_size = self.queues.content_refinement_queue.qsize()
                recognition_size = self.queues.image_recognition_queue.qsize()
                upload_size = self.queues.image_upload_queue.qsize()

                # Get active documents count
                with self.tasks_lock:
                    active_docs = len(self.document_tasks)
                    total_pending_tasks = sum(len(tasks) for tasks in self.document_tasks.values())

                with self.completed_lock:
                    total_completed = sum(len(tasks) for tasks in self.document_completed_tasks.values())

                # Log queue status only when there's activity
                if parsing_size > 0 or content_size > 0 or recognition_size > 0 or upload_size > 0 or active_docs > 0:
                    self.logger.info(
                        f"Queue Status | Parse: {parsing_size} | "
                        f"Refine: {content_size} | "
                        f"Recognize: {recognition_size} | "
                        f"Upload: {upload_size} | "
                        f"Docs: {active_docs} | "
                        f"Tasks: {total_pending_tasks}/{total_completed} (pending/completed)"
                    )

                # Sleep for monitoring interval
                time.sleep(5)  # Log every 5 seconds

            except Exception as e:
                self.logger.error(f"Queue monitor error: {e}")
                time.sleep(5)

        self.logger.info("Queue monitor stopped")

    def get_queue_status(self) -> Dict[str, Any]:
        """Get detailed queue status"""
        status = {
            'queues': {
                'parsing': {
                    'size': self.queues.parsing_queue.qsize(),
                    'max_size': self.queues.parsing_queue.maxsize
                },
                'content_refinement': {
                    'size': self.queues.content_refinement_queue.qsize(),
                    'max_size': self.queues.content_refinement_queue.maxsize
                },
                'image_recognition': {
                    'size': self.queues.image_recognition_queue.qsize(),
                    'max_size': self.queues.image_recognition_queue.maxsize
                },
                'image_upload': {
                    'size': self.queues.image_upload_queue.qsize(),
                    'max_size': self.queues.image_upload_queue.maxsize
                }
            },
            'documents': {},
            'threads': {
                'active': len([t for t in self.threads if t.is_alive()]),
                'total': len(self.threads)
            }
        }

        # Get per-document status
        with self.tasks_lock:
            for doc_id, tasks in self.document_tasks.items():
                doc_status = {
                    'total_pages': len(tasks),
                    'processing': 0,
                    'pending': 0
                }
                for task in tasks.values():
                    if task.status == TaskStatus.PROCESSING:
                        doc_status['processing'] += 1
                    elif task.status == TaskStatus.PENDING:
                        doc_status['pending'] += 1

                with self.completed_lock:
                    completed = len(self.document_completed_tasks.get(doc_id, []))
                    doc_status['completed'] = completed

                status['documents'][doc_id[:8] + '...'] = doc_status

        return status

    async def shutdown(self):
        """Shutdown all processing threads"""
        self.logger.info("Shutting down parallel processor...")

        # Signal shutdown
        self.shutdown_event.set()
        self._monitor_stop_event.set()

        # Wait for threads to complete
        for thread in self.threads:
            thread.join(timeout=5.0)
            if thread.is_alive():
                self.logger.warning(f"Thread {thread.name} did not stop gracefully")

        # Stop monitor thread
        if self._monitor_thread and self._monitor_thread.is_alive():
            self._monitor_thread.join(timeout=2.0)

        # Clear threads list
        self.threads.clear()

        # Cleanup async components
        if self._initialized:
            await self.image_processor.cleanup()
            self._initialized = False

        # Clear all document data
        with self.tasks_lock:
            self.document_tasks.clear()
        with self.completed_lock:
            self.document_completed_tasks.clear()
            self.document_languages.clear()

        self.logger.info("Parallel processor shutdown complete")

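
# ---------------------------------------------------------------------------
# Illustrative usage sketch (not invoked anywhere in this module). It assumes
# a populated MinerUConfig and an existing PDF and temp directory; the upload
# and cache callbacks are optional and omitted here.
# ---------------------------------------------------------------------------
async def _example_run(config: MinerUConfig, pdf_path: str, temp_dir: str,
                       src_fileid: str, total_pages: int) -> None:
    """Example only: process one document end-to-end and shut down."""
    processor = ParallelMinerUProcessor(config, learn_type=9)
    try:
        completed_tasks = await processor.process_document(
            pdf_path, temp_dir, src_fileid,
            is_ppt_format=False, total_pages=total_pages
        )
        for page_task in completed_tasks:
            # refined_content holds the final page text with image descriptions
            # integrated; processed_images maps filenames to uploaded URLs
            # (empty when no upload_callback was provided).
            print(page_task.page_idx + 1, len(page_task.refined_content or ""))
    finally:
        await processor.shutdown()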