""" MinerU API client module. This module handles communication with the MinerU service for document parsing, following the architecture patterns from gzero.py and implementing the real API. """ import os import asyncio import aiohttp import zipfile import fitz import time from typing import Dict, List, Optional, Any, Tuple from dataclasses import dataclass from .logger import get_module_logger logger = get_module_logger('api_client') from .config_base import MinerUConfig @dataclass class MinerUPageResult: """Result from MinerU processing for a single page""" page_idx: int success: bool content: str images: List[str] tables: List[Dict] metadata: Dict[str, Any] error: Optional[str] = None @dataclass class MinerUResult: """Result from MinerU processing""" success: bool content: str # Backward compatibility - merged content images: List[str] # All images tables: List[Dict] # All tables metadata: Dict[str, Any] error: Optional[str] = None page_results: Optional[List[MinerUPageResult]] = None # Individual page results class MinerUAPIClient: """Client for interacting with MinerU API""" def __init__(self, config: MinerUConfig, platform_adapter=None): self.config = config self.session: Optional[aiohttp.ClientSession] = None self.logger = logger self.platform_adapter = platform_adapter async def __aenter__(self): """Async context manager entry""" self.session = aiohttp.ClientSession( timeout=aiohttp.ClientTimeout(total=600) # 10 minute timeout ) return self async def __aexit__(self, exc_type, exc_val, exc_tb): """Async context manager exit""" if self.session: await self.session.close() async def process_document(self, pdf_path: str, temp_dir: str, src_fileid: str, is_ppt_converted: bool = False, batch_size: int = 20) -> MinerUResult: """ Process document using MinerU API with batch processing support. Can process the entire document at once or in batches for large PDFs. Args: pdf_path: Path to PDF file temp_dir: Temporary directory for processing src_fileid: Source file ID for logging/tracing is_ppt_converted: Whether the PDF was converted from PPT batch_size: Maximum pages per batch (0 for no batching) Returns: MinerUResult with parsed content """ try: # Check PDF page count page_count = self._get_pdf_page_count(pdf_path) self.logger.info(f"mineru-api: PDF has {page_count} pages, batch_size={batch_size}") # Decide whether to use batch processing if batch_size > 0 and page_count > batch_size: self.logger.info(f"mineru-api: using batch processing (batch_size={batch_size})") return await self._process_document_in_batches( pdf_path, temp_dir, src_fileid, is_ppt_converted, batch_size ) else: self.logger.info(f"mineru-api: processing full document at once") return await self._process_document_full( pdf_path, temp_dir, src_fileid, is_ppt_converted ) except Exception as e: self.logger.error(f"mineru-api: document processing failed: {str(e)}") return MinerUResult( success=False, content="", images=[], tables=[], metadata={}, error=str(e) ) async def _process_document_full(self, pdf_path: str, temp_dir: str, src_fileid: str, is_ppt_converted: bool) -> MinerUResult: """Process full document at once (original implementation).""" self.logger.info(f"mineru-api: starting full document processing") # Choose processing method based on API type if self.config.mineru_api_type == "self_hosted": self.logger.info(f"mineru-api: using self-hosted MinerU API") result = await self._process_full_document_self_hosted(pdf_path, temp_dir, src_fileid, is_ppt_converted) elif self.config.mineru_api_key and self.config.mineru_api_key.strip(): self.logger.info(f"mineru-api: using cloud MinerU API") result = await self._process_full_document_cloud(pdf_path, temp_dir, src_fileid, is_ppt_converted) else: self.logger.warning(f"mineru-api: no API configuration, using mock processing") result = await self._mock_mineru_processing(pdf_path, temp_dir, src_fileid, is_ppt_converted) self.logger.info(f"mineru-api: document processing completed") return result async def _process_document_in_batches(self, pdf_path: str, temp_dir: str, src_fileid: str, is_ppt_converted: bool, batch_size: int) -> MinerUResult: """ Process document in batches of pages. Args: pdf_path: Path to PDF file temp_dir: Temporary directory for processing src_fileid: Source file ID for logging/tracing is_ppt_converted: Whether the PDF was converted from PPT batch_size: Maximum pages per batch Returns: MinerUResult with merged results from all batches """ try: page_count = self._get_pdf_page_count(pdf_path) num_batches = (page_count + batch_size - 1) // batch_size self.logger.info(f"mineru-api: splitting {page_count} pages into {num_batches} batches") # Process each batch batch_results = [] for batch_idx in range(num_batches): start_page = batch_idx * batch_size end_page = min(start_page + batch_size, page_count) self.logger.info(f"mineru-api: processing batch {batch_idx + 1}/{num_batches} " f"(pages {start_page + 1}-{end_page})") # Split PDF for this batch batch_pdf_path = await self._split_pdf( pdf_path, temp_dir, start_page, end_page, batch_idx ) # Process batch based on API type if self.config.mineru_api_type == "self_hosted": batch_result = await self._process_batch_self_hosted( batch_pdf_path, temp_dir, src_fileid, batch_idx, start_page, end_page, is_ppt_converted ) elif self.config.mineru_api_key and self.config.mineru_api_key.strip(): batch_result = await self._process_batch_cloud( batch_pdf_path, temp_dir, src_fileid, batch_idx, start_page, end_page, is_ppt_converted ) else: batch_result = await self._mock_mineru_processing( batch_pdf_path, temp_dir, src_fileid, is_ppt_converted ) if batch_result.success: batch_results.append((start_page, batch_result)) else: self.logger.error(f"mineru-api: batch {batch_idx + 1} failed: {batch_result.error}") # Continue with other batches even if one fails # Merge all batch results return self._merge_batch_results(batch_results, page_count, is_ppt_converted) except Exception as e: self.logger.error(f"mineru-api: batch processing failed: {str(e)}") raise def _get_pdf_page_count(self, pdf_path: str) -> int: """Get the number of pages in a PDF.""" with fitz.open(pdf_path) as doc: return len(doc) async def _split_pdf(self, pdf_path: str, temp_dir: str, start_page: int, end_page: int, batch_idx: int) -> str: """ Split PDF to extract specific pages for batch processing. Args: pdf_path: Original PDF path temp_dir: Temporary directory start_page: Start page index (0-based) end_page: End page index (exclusive) batch_idx: Batch index for naming Returns: Path to the split PDF file """ batch_pdf_path = os.path.join(temp_dir, f"batch_{batch_idx}.pdf") with fitz.open(pdf_path) as src_doc: batch_doc = fitz.open() # Create new PDF # Copy pages to new document for page_idx in range(start_page, end_page): batch_doc.insert_pdf(src_doc, from_page=page_idx, to_page=page_idx) batch_doc.save(batch_pdf_path) batch_doc.close() self.logger.info(f"mineru-api: created batch PDF with {end_page - start_page} pages: {batch_pdf_path}") return batch_pdf_path async def _process_full_document_self_hosted(self, pdf_path: str, temp_dir: str, src_fileid: str, is_ppt_converted: bool) -> MinerUResult: """ Process full PDF document at once using self-hosted MinerU API with content_list support. This method uploads the entire document and gets back content_list for all pages. """ try: self.logger.info(f"mineru-api: processing full document with self-hosted API") if not self.session: raise RuntimeError("API client not initialized") start_time = asyncio.get_event_loop().time() # Prepare multipart form data form_data = aiohttp.FormData() # API parameters - enable content_list for full document processing form_data.add_field('return_middle_json', 'false') form_data.add_field('return_model_output', 'false') form_data.add_field('return_md', 'true') form_data.add_field('return_images', 'true') form_data.add_field('return_content_list', 'true') # Enable content_list form_data.add_field('end_page_id', '99999') form_data.add_field('parse_method', 'auto') form_data.add_field('start_page_id', '0') form_data.add_field('output_dir', './output') form_data.add_field('server_url', 'string') form_data.add_field('backend', 'pipeline') form_data.add_field('table_enable', 'true') form_data.add_field('formula_enable', 'true') # Add the PDF file with open(pdf_path, 'rb') as f: form_data.add_field('files', f, filename=os.path.basename(pdf_path), content_type='application/pdf') # Make API request async with self.session.post( f"{self.config.mineru_api_url}/file_parse", data=form_data, headers={'accept': 'application/json'} ) as response: if response.status != 200: error_text = await response.text() raise Exception(f"Self-hosted MinerU API error: {response.status} - {error_text}") result = await response.json() # Log the top-level keys to understand response structure self.logger.info(f"mineru-api: response top-level keys: {list(result.keys())}") # Extract content_list from response results = result.get('results', {}) if not results: raise Exception("No results in API response") # Get the first result (should be our PDF file) file_result = next(iter(results.values())) self.logger.info(f"mineru-api: file_result keys: {list(file_result.keys())}") # Extract content_list content_list_str = file_result.get('content_list', '') self.logger.info(f"mineru-api: content_list type: {type(content_list_str)}, length: {len(str(content_list_str))}") if not content_list_str: self.logger.error(f"mineru-api: No content_list in API response. File result keys: {list(file_result.keys())}") # Log a sample of the file_result to understand what we're getting sample_result = str(file_result)[:500] if file_result else 'None' self.logger.error(f"mineru-api: File result sample: {sample_result}") raise Exception("No content_list in API response") # Parse content_list to markdown markdown_content, all_images, all_tables, page_data = self._parse_content_list_to_markdown( content_list_str, temp_dir, src_fileid ) # Also get markdown content if available for language detection md_content = file_result.get('md_content', '') # Extract base64 images if provided images_data = file_result.get('images', {}) if images_data and isinstance(images_data, dict): self.logger.info(f"mineru-api: saving {len(images_data)} base64 images") saved_images = self._save_base64_images(images_data, temp_dir, src_fileid) self.logger.info(f"mineru-api: saved images: {saved_images}") # Merge with images from content_list for img in saved_images: if img not in all_images: all_images.append(img) else: self.logger.info(f"mineru-api: no base64 images in 'images' field") # Check if there's an 'images' field at the top level of result if 'images' in result and isinstance(result['images'], dict): self.logger.info(f"mineru-api: found images at top level: {len(result['images'])} images") # Save these images for img_name, img_data in result['images'].items(): if img_name not in all_images: # Save the image try: if isinstance(img_data, str) and img_data.startswith('data:'): # Base64 encoded saved = self._save_base64_images({img_name: img_data}, temp_dir, src_fileid) all_images.extend(saved) self.logger.info(f"mineru-api: saved top-level image: {img_name}") except Exception as e: self.logger.error(f"mineru-api: failed to save top-level image {img_name}: {e}") processing_time = asyncio.get_event_loop().time() - start_time # Detect language from the combined md_content detected_language = None if md_content and md_content.strip(): from .language_detector import LanguageDetector language_code, confidence = LanguageDetector.detect_language(md_content) if confidence > 0.7: detected_language = language_code self.logger.info(f"mineru-api: detected document language: {detected_language} (confidence: {confidence:.2f})") # If no md_content, detect from markdown_content as fallback if not detected_language and markdown_content and markdown_content.strip(): from .language_detector import LanguageDetector language_code, confidence = LanguageDetector.detect_language(markdown_content) if confidence > 0.7: detected_language = language_code self.logger.info(f"mineru-api: detected document language from content_list: {detected_language} (confidence: {confidence:.2f})") # Create metadata import json metadata = { "processing_time": processing_time, "total_pages": len(page_data), "images_found": len(all_images), "tables_found": len(all_tables), "is_ppt_source": is_ppt_converted, "processing_mode": "full_document_self_hosted", "api_version": "self_hosted", "api_type": "self_hosted", "page_data": page_data, "content_list": json.loads(content_list_str) if isinstance(content_list_str, str) else content_list_str, "detected_language": detected_language # Add detected language to metadata } # Create page results for compatibility mineru_page_results = [] for page_idx, pdata in page_data.items(): mineru_page_results.append(MinerUPageResult( page_idx=page_idx, success=True, content=pdata['content'], images=pdata['images'], tables=pdata['tables'], metadata=pdata['metadata'] )) self.logger.info(f"mineru-api: full document self-hosted processing completed in {processing_time:.2f}s") return MinerUResult( success=True, content=markdown_content, images=all_images, tables=all_tables, metadata=metadata, page_results=mineru_page_results ) except Exception as e: self.logger.error(f"mineru-api: full document self-hosted processing failed: {str(e)}") raise def _parse_self_hosted_response(self, api_response: Dict, temp_dir: str, page_id: str, page_num: int) -> Tuple[str, List[str], List[Dict]]: """ Parse response from self-hosted MinerU API. Expected response format: { "backend": "pipeline", "version": "2.1.10", "results": { "page_xxx": { "md_content": "# Content...", "images": { "filename.jpg": "...", ... }, "middle_json": {...}, "model_output": {...} } } } """ try: content = "" images = [] tables = [] # Extract results results = api_response.get('results', {}) if results: # Get the first result (should be our PDF file) file_result = next(iter(results.values())) if results else {} # Extract markdown content md_content = file_result.get('md_content', '') if md_content: content = md_content else: content = f"# Page {page_num}\n\nNo content extracted from self-hosted API" # Extract images from the images field (base64 encoded) images_data = file_result.get('images', {}) if images_data and isinstance(images_data, dict): images = self._save_base64_images(images_data, temp_dir, page_id) else: # Fallback to extracting from markdown if no images field images = self._extract_images_from_markdown(md_content, temp_dir, page_id) # Extract table information if available in middle_json middle_json = file_result.get('middle_json', {}) if middle_json and isinstance(middle_json, dict): tables = self._extract_tables_from_middle_json(middle_json, page_num) self.logger.debug(f"mineru-api: [{page_id}] parsed self-hosted response - content: {len(content)} chars, images: {len(images)}, tables: {len(tables)}") return content, images, tables except Exception as e: self.logger.error(f"mineru-api: [{page_id}] failed to parse self-hosted response: {str(e)}") return f"# Page {page_num}\n\nError parsing API response: {str(e)}", [], [] def _save_base64_images(self, images_data: Dict[str, str], temp_dir: str, page_id: str) -> List[str]: """ Save base64 encoded images to files. Args: images_data: Dictionary with image filename as key and base64 data as value temp_dir: Directory to save images page_id: Page identifier for logging Returns: List of saved image filenames """ import base64 saved_images = [] for filename, base64_data in images_data.items(): try: # Extract base64 data (remove data URI prefix if present) if base64_data.startswith('data:'): # Format: ... base64_data = base64_data.split(',', 1)[1] # Decode base64 to binary image_data = base64.b64decode(base64_data) # Use the original filename without prefix to match content_list references image_filename = filename image_path = os.path.join(temp_dir, image_filename) # Save image file with open(image_path, 'wb') as f: f.write(image_data) saved_images.append(image_filename) self.logger.info(f"mineru-api: [{page_id}] saved base64 image: {image_filename} ({len(image_data)} bytes)") except Exception as e: self.logger.error(f"mineru-api: [{page_id}] failed to save base64 image {filename}: {str(e)}") return saved_images def _extract_images_from_markdown(self, md_content: str, temp_dir: str, page_id: str) -> List[str]: """ Extract image references from markdown content and handle them. Self-hosted MinerU typically includes images as markdown references like ![alt](path) """ import re images = [] # Find all markdown image references image_pattern = r'!\[([^\]]*)\]\(([^)]+)\)' matches = re.findall(image_pattern, md_content) for _, image_path in matches: try: # Handle different image path formats if image_path.startswith('./') or image_path.startswith('../'): # Relative path - assume it's in the same directory structure actual_path = os.path.normpath(os.path.join(temp_dir, image_path)) elif os.path.isabs(image_path): # Absolute path actual_path = image_path else: # Relative to temp directory actual_path = os.path.join(temp_dir, image_path) # Check if image file exists and copy to our temp directory if needed if os.path.exists(actual_path): # Generate new filename for our processing image_filename = f"self_hosted_{page_id}_{os.path.basename(image_path)}" dest_path = os.path.join(temp_dir, image_filename) if actual_path != dest_path: import shutil shutil.copy(actual_path, dest_path) images.append(image_filename) self.logger.debug(f"mineru-api: [{page_id}] extracted image: {image_filename}") else: self.logger.warning(f"mineru-api: [{page_id}] image file not found: {actual_path}") except Exception as e: self.logger.error(f"mineru-api: [{page_id}] error processing image {image_path}: {str(e)}") return images def _extract_tables_from_middle_json(self, middle_json: Dict, page_num: int) -> List[Dict]: """ Extract table information from middle_json if available. """ tables = [] try: # This structure depends on the actual format returned by self-hosted MinerU # Adjust based on the actual response structure if 'tables' in middle_json: table_data = middle_json['tables'] if isinstance(table_data, list): for i, table in enumerate(table_data): tables.append({ 'page': page_num, 'table_id': i, 'content': str(table), 'source': 'self_hosted_middle_json' }) elif isinstance(table_data, dict): tables.append({ 'page': page_num, 'content': str(table_data), 'source': 'self_hosted_middle_json' }) except Exception as e: self.logger.debug(f"mineru-api: page {page_num} table extraction from middle_json failed: {str(e)}") return tables async def _upload_file_to_accessible_url(self, pdf_path: str, src_fileid: str) -> str: """ Upload file to a publicly accessible URL for MinerU processing. Uses platform adapter's upload_file method """ try: # Use platform adapter for upload if available if hasattr(self, 'platform_adapter') and self.platform_adapter: # The adapter will handle the upload return await self.platform_adapter.upload_file(pdf_path, { 'src_fileid': src_fileid }) # Fallback: return local path if no adapter logger.warning("No platform adapter available for upload, returning local path") return pdf_path except Exception as e: self.logger.error(f"mineru-api: file upload failed: {str(e)}") raise async def _poll_task_completion(self, task_id: str, src_fileid: str, max_wait_time: int = 600) -> str: """ Poll MinerU task until completion. Args: task_id: Task ID to poll src_fileid: Source file ID for logging max_wait_time: Maximum wait time in seconds Returns: Result ZIP file URL """ if not self.session: raise RuntimeError("API client not initialized") headers = { 'Authorization': f'Bearer {self.config.mineru_api_key}', 'Accept': '*/*' } start_time = asyncio.get_event_loop().time() poll_interval = 5 # Start with 5 seconds max_poll_interval = 30 # Max 30 seconds between polls while True: current_time = asyncio.get_event_loop().time() if current_time - start_time > max_wait_time: raise Exception(f"Task polling timeout after {max_wait_time} seconds") try: async with self.session.get( f"{self.config.mineru_api_url}/api/v4/extract/task/{task_id}", headers=headers ) as response: if response.status != 200: error_text = await response.text() raise Exception(f"MinerU polling error: {response.status} - {error_text}") result = await response.json() if result.get('code') != 0: raise Exception(f"MinerU polling error: {result.get('msg', 'Unknown error')}") data = result['data'] state = data['state'] if state == 'done': full_zip_url = data['full_zip_url'] self.logger.info(f"mineru-api: task completed: {task_id}") return full_zip_url elif state == 'failed': error_msg = data.get('err_msg', 'Unknown error') raise Exception(f"MinerU task failed: {error_msg}") elif state in ['pending', 'running', 'converting']: # Log progress if available if 'extract_progress' in data: progress = data['extract_progress'] extracted = progress.get('extracted_pages', 0) total = progress.get('total_pages', 0) start_time_str = progress.get('start_time', 'N/A') self.logger.info(f"mineru-api: task {state}: {extracted}/{total} pages (started: {start_time_str})") else: self.logger.info(f"mineru-api: task {state}") # Wait before next poll await asyncio.sleep(poll_interval) # Gradually increase poll interval poll_interval = min(poll_interval * 1.2, max_poll_interval) else: raise Exception(f"Unknown task state: {state}") except aiohttp.ClientError as e: self.logger.warning(f"mineru-api: polling connection error: {str(e)}") await asyncio.sleep(poll_interval) async def _process_full_document_cloud(self, pdf_path: str, temp_dir: str, src_fileid: str, is_ppt_converted: bool) -> MinerUResult: """ Process full PDF document at once using cloud MinerU API. This method uploads the entire document and extracts content_list from the result. """ try: self.logger.info(f"mineru-api: processing full document with cloud API") start_time = asyncio.get_event_loop().time() # Step 1: Upload file to accessible URL file_url = await self._upload_file_to_accessible_url(pdf_path, src_fileid) self.logger.info(f"mineru-api: uploaded file URL: {file_url}") if not file_url.startswith(('http://', 'https://')): self.logger.warning(f"mineru-api: URL may not be valid for Cloud API: {file_url}") # Step 2: Create task for full document task_id = await self._create_mineru_task_full_document(file_url, src_fileid) # Step 3: Poll for completion result_url = await self._poll_task_completion(task_id, src_fileid, max_wait_time=900) # 15 min for full doc # Step 4: Download and extract results with content_list markdown_content, all_images, all_tables, page_data, full_md_content = await self._download_and_extract_results_with_content_list( result_url, temp_dir, src_fileid ) processing_time = asyncio.get_event_loop().time() - start_time # Detect language from full.md first, then fallback to markdown_content detected_language = None if full_md_content and full_md_content.strip(): from .language_detector import LanguageDetector language_code, confidence = LanguageDetector.detect_language(full_md_content) if confidence > 0.7: detected_language = language_code self.logger.info(f"mineru-api: detected document language from full.md: {detected_language} (confidence: {confidence:.2f})") # Fallback to content_list markdown if no full.md or low confidence if not detected_language and markdown_content and markdown_content.strip(): from .language_detector import LanguageDetector language_code, confidence = LanguageDetector.detect_language(markdown_content) if confidence > 0.7: detected_language = language_code self.logger.info(f"mineru-api: detected document language from content_list: {detected_language} (confidence: {confidence:.2f})") # Create metadata metadata = { "processing_time": processing_time, "total_pages": len(page_data), "images_found": len(all_images), "tables_found": len(all_tables), "is_ppt_source": is_ppt_converted, "processing_mode": "full_document_cloud", "api_version": "v4", "api_type": "cloud", "page_data": page_data, "detected_language": detected_language # Add detected language to metadata } # Create page results for compatibility mineru_page_results = [] for page_idx, pdata in page_data.items(): mineru_page_results.append(MinerUPageResult( page_idx=page_idx, success=True, content=pdata['content'], images=pdata['images'], tables=pdata['tables'], metadata=pdata['metadata'] )) self.logger.info(f"mineru-api: full document cloud processing completed in {processing_time:.2f}s") return MinerUResult( success=True, content=markdown_content, images=all_images, tables=all_tables, metadata=metadata, page_results=mineru_page_results ) except Exception as e: self.logger.error(f"mineru-api: full document cloud processing failed: {str(e)}") raise async def _create_mineru_task_full_document(self, file_url: str, src_fileid: str) -> str: """ Create MinerU task for full document processing. """ if not self.session: raise RuntimeError("API client not initialized") headers = { 'Authorization': f'Bearer {self.config.mineru_api_key}', 'Content-Type': 'application/json', 'Accept': '*/*' } # Configure processing options for full document payload = { 'url': file_url, 'is_ocr': True, 'enable_formula': True, 'enable_table': True, 'language': 'auto', 'data_id': src_fileid, 'model_version': 'v1', 'extra_formats': ['html'] # Request content_list format } try: async with self.session.post( f"{self.config.mineru_api_url}/api/v4/extract/task", headers=headers, json=payload ) as response: if response.status != 200: error_text = await response.text() raise Exception(f"MinerU API error: {response.status} - {error_text}") result = await response.json() if result.get('code') != 0: raise Exception(f"MinerU API error: {result.get('msg', 'Unknown error')}") task_id = result['data']['task_id'] self.logger.info(f"mineru-api: full document task created: {task_id}") return task_id except aiohttp.ClientError as e: raise Exception(f"MinerU API connection error: {str(e)}") async def _download_and_extract_results_with_content_list(self, result_url: str, temp_dir: str, src_fileid: str) -> Tuple[str, List[str], List[Dict], Dict[int, Dict], str]: """ Download and extract MinerU processing results including content_list. Returns: Tuple of (markdown_content, images, tables, page_data, full_md_content) """ if not self.session: raise RuntimeError("API client not initialized") # Download ZIP file zip_path = os.path.join(temp_dir, f"mineru_result_{src_fileid}.zip") try: async with self.session.get(result_url) as response: if response.status != 200: error_text = await response.text() raise Exception(f"Download error: {response.status} - {error_text}") with open(zip_path, 'wb') as f: async for chunk in response.content.iter_chunked(8192): f.write(chunk) self.logger.info(f"mineru-api: downloaded result ZIP: {zip_path}") except aiohttp.ClientError as e: raise Exception(f"Download connection error: {str(e)}") # Extract ZIP file extract_dir = os.path.join(temp_dir, f"mineru_extracted_{src_fileid}") os.makedirs(extract_dir, exist_ok=True) with zipfile.ZipFile(zip_path, 'r') as zip_ref: zip_ref.extractall(extract_dir) self.logger.info(f"mineru-api: extracted results to: {extract_dir}") # Look for content_list file and full.md content_list = None markdown_content = "" full_md_content = "" # For language detection images = [] tables = [] for root, _, files in os.walk(extract_dir): for file in files: file_path = os.path.join(root, file) # Look for content_list JSON file if file.endswith('_content_list.json'): with open(file_path, 'r', encoding='utf-8') as f: import json content_list = json.load(f) self.logger.info(f"mineru-api: found content_list with {len(content_list)} items") elif file == 'full.md': # Read full.md for language detection with open(file_path, 'r', encoding='utf-8') as f: full_md_content = f.read() self.logger.info(f"mineru-api: found full.md with {len(full_md_content)} characters") elif file.endswith('.md') and not markdown_content: # Backup: use other markdown files if no content_list with open(file_path, 'r', encoding='utf-8') as f: markdown_content = f.read() # Parse content_list if found if content_list: markdown_content, all_images, all_tables, page_data = self._parse_content_list_to_markdown( content_list, temp_dir, src_fileid ) self.logger.info(f"mineru-api: all_images from content_list: {all_images[:5]}...") # Show first 5 images # Copy images referenced in content_list from images/ directory images_dir = os.path.join(extract_dir, 'images') self.logger.info(f"mineru-api: checking for images directory: {images_dir}") # List all directories in extract_dir for debugging if os.path.exists(extract_dir): self.logger.info(f"mineru-api: contents of extract_dir: {os.listdir(extract_dir)}") if os.path.exists(images_dir): self.logger.info(f"mineru-api: found images directory: {images_dir}") # List files in images directory image_files = os.listdir(images_dir) self.logger.info(f"mineru-api: found {len(image_files)} files in images directory") self.logger.info(f"mineru-api: image files in directory: {image_files[:10]}") # Show first 10 files # Copy ALL image files from images directory to temp_dir import shutil for img_file in image_files: if img_file.endswith(('.png', '.jpg', '.jpeg', '.gif', '.webp')): src_img_path = os.path.join(images_dir, img_file) dest_img_path = os.path.join(temp_dir, img_file) shutil.copy(src_img_path, dest_img_path) self.logger.info(f"mineru-api: copied image {img_file} to temp_dir") # Also try to copy specific images referenced in content_list for img_filename in all_images: # Try different possible paths and names possible_names = [ img_filename, img_filename.replace('.png', '.jpg'), img_filename.replace('.jpg', '.png'), os.path.basename(img_filename) # Just the filename without path ] copied = False for name in possible_names: src_img_path = os.path.join(images_dir, name) if os.path.exists(src_img_path): dest_img_path = os.path.join(temp_dir, img_filename) if not os.path.exists(dest_img_path): shutil.copy(src_img_path, dest_img_path) self.logger.info(f"mineru-api: copied referenced image {name} as {img_filename}") copied = True break if not copied: # Try to find similar files base_name = os.path.splitext(img_filename)[0] matching_files = [f for f in image_files if base_name in f] if matching_files: self.logger.warning(f"mineru-api: image {img_filename} not found, but similar files exist: {matching_files}") else: self.logger.warning(f"mineru-api: image {img_filename} not found in images dir") else: self.logger.warning(f"mineru-api: images directory not found: {images_dir}") # For single-page documents, assign unassigned images to page 0 if len(page_data) == 1 and 0 in page_data: # Check if any images are not yet assigned to pages assigned_images = set() for pd in page_data.values(): assigned_images.update(pd.get('images', [])) unassigned_images = [img for img in all_images if img not in assigned_images] if unassigned_images: self.logger.info(f"mineru-api: assigning {len(unassigned_images)} unassigned images to page 0") page_data[0]['images'].extend(unassigned_images) else: # Fallback: parse markdown to create page data self.logger.warning("mineru-api: no content_list found, using markdown fallback") page_data = self._parse_markdown_to_page_data(markdown_content) # Copy all images from extract_dir to temp_dir (without mineru_ prefix) for root, _, files in os.walk(extract_dir): for file in files: if file.endswith(('.png', '.jpg', '.jpeg', '.gif', '.webp')): file_path = os.path.join(root, file) dest_path = os.path.join(temp_dir, file) import shutil shutil.copy(file_path, dest_path) images.append(file) self.logger.info(f"mineru-api: copied image {file} to temp_dir (fallback)") all_images = images all_tables = tables # Clean up ZIP file os.remove(zip_path) self.logger.info(f"mineru-api: parsed results - {len(page_data)} pages, " f"{len(all_images)} images, {len(all_tables)} tables") return markdown_content, all_images, all_tables, page_data, full_md_content def _parse_markdown_to_page_data(self, markdown_content: str) -> Dict[int, Dict]: """ Parse markdown content to create page data structure. This is a fallback when content_list is not available. """ page_data = {} # Split by page markers import re page_pattern = r'## Page (\d+)' parts = re.split(page_pattern, markdown_content) if len(parts) > 1: # Skip the first part (before first page marker) for i in range(1, len(parts), 2): if i < len(parts) - 1: page_num = int(parts[i]) page_content = parts[i + 1].strip() page_idx = page_num - 1 page_data[page_idx] = { 'content': page_content, 'images': [], 'tables': [], 'metadata': {'page_num': page_num} } else: # No page markers, treat as single page page_data[0] = { 'content': markdown_content, 'images': [], 'tables': [], 'metadata': {'page_num': 1} } return page_data async def _download_and_extract_results(self, result_url: str, temp_dir: str, src_fileid: str) -> Tuple[str, List[str], List[Dict]]: """ Download and extract MinerU processing results. Args: result_url: URL to result ZIP file temp_dir: Temporary directory for extraction src_fileid: Source file ID for logging Returns: Tuple of (content, images, tables) """ if not self.session: raise RuntimeError("API client not initialized") # Download ZIP file zip_path = os.path.join(temp_dir, f"mineru_result_{src_fileid}.zip") try: async with self.session.get(result_url) as response: if response.status != 200: error_text = await response.text() raise Exception(f"Download error: {response.status} - {error_text}") with open(zip_path, 'wb') as f: async for chunk in response.content.iter_chunked(8192): f.write(chunk) self.logger.info(f"mineru-api: downloaded result ZIP: {zip_path}") except aiohttp.ClientError as e: raise Exception(f"Download connection error: {str(e)}") # Extract ZIP file extract_dir = os.path.join(temp_dir, f"mineru_extracted_{src_fileid}") os.makedirs(extract_dir, exist_ok=True) with zipfile.ZipFile(zip_path, 'r') as zip_ref: zip_ref.extractall(extract_dir) self.logger.info(f"mineru-api: extracted results to: {extract_dir}") # Parse extracted content content = "" images = [] tables = [] # Look for markdown file and other assets for root, _, files in os.walk(extract_dir): for file in files: file_path = os.path.join(root, file) if file.endswith('.md'): # Read markdown content with open(file_path, 'r', encoding='utf-8') as f: content = f.read() self.logger.info(f"mineru-api: loaded markdown content: {len(content)} chars") elif file.endswith(('.png', '.jpg', '.jpeg', '.gif', '.webp')): # Copy image to temp directory and add to list image_name = f"mineru_{file}" dest_path = os.path.join(temp_dir, image_name) import shutil shutil.copy(file_path, dest_path) images.append(image_name) elif file.endswith('.html'): # Parse HTML for additional table information if needed with open(file_path, 'r', encoding='utf-8') as f: html_content = f.read() # Extract table information from HTML table_count = html_content.count(' 0: tables.append({ "source": "html", "table_count": table_count, "content": "Tables extracted from HTML format" }) # Clean up ZIP file os.remove(zip_path) self.logger.info(f"mineru-api: parsed results - content: {len(content)} chars, images: {len(images)}, tables: {len(tables)}") return content, images, tables async def _mock_mineru_processing(self, pdf_path: str, temp_dir: str, src_fileid: str, is_ppt_converted: bool) -> MinerUResult: """ Mock MinerU processing for development/testing. Provides realistic output structure for development without API calls. """ try: # Simulate processing delay await asyncio.sleep(0.5) # Extract basic information from PDF for mock response content_parts = [] images = [] tables = [] content_list = [] # For context extraction with fitz.open(pdf_path) as doc: for page_num, page in enumerate(doc): # Extract text page_text = page.get_text() if page_text.strip(): content_parts.append(f"## Page {page_num + 1}\\n\\n{page_text}\\n") # Add to content_list content_list.append({ 'page_idx': page_num, 'type': 'text', 'text': page_text, 'metadata': {} }) # Mock image extraction (would be done by MinerU) for img in page.get_images(full=True): xref = img[0] bbox = page.get_image_bbox(img) if bbox.width > 0 and bbox.height > 0: image_filename = f"mineru_image_{xref}.png" image_path = os.path.join(temp_dir, image_filename) # Extract and save image try: pix = fitz.Pixmap(doc, xref) if pix.n - pix.alpha < 4: # GRAY or RGB pix.save(image_path) images.append(image_filename) # Add to content_list content_list.append({ 'page_idx': page_num, 'type': 'image', 'img_path': image_filename, 'metadata': {} }) pix = None # Free memory except Exception: pass # Mock table detection if "table" in page_text.lower() or "|" in page_text: tables.append({ "page": page_num, "content": "Mock table content detected", "bbox": [0, 0, page.rect.width, page.rect.height] }) # Add to content_list content_list.append({ 'page_idx': page_num, 'type': 'table', 'content': 'Mock table content', 'metadata': {} }) # Combine content with mock markdown structure mock_content = self._create_mock_markdown_content(content_parts, images) # Detect language from the combined content detected_language = None if mock_content and mock_content.strip(): from .language_detector import LanguageDetector language_code, confidence = LanguageDetector.detect_language(mock_content) if confidence > 0.7: detected_language = language_code self.logger.info(f"mineru-api: detected document language (mock): {detected_language} (confidence: {confidence:.2f})") # Mock metadata metadata = { "processing_time": 0.5, "pages_processed": len(doc), "images_found": len(images), "tables_found": len(tables), "is_ppt_source": is_ppt_converted, "api_version": "mock", "content_list": content_list, # Include content list for context extraction "detected_language": detected_language # Add detected language to metadata } self.logger.info(f"mineru-api: mock processing complete: {metadata}") return MinerUResult( success=True, content=mock_content, images=images, tables=tables, metadata=metadata ) except Exception as e: self.logger.error(f"mineru-api: mock processing error: {str(e)}") raise def _create_mock_markdown_content(self, content_parts: List[str], images: List[str]) -> str: """ Create mock markdown content that simulates MinerU output structure. """ mock_parts = [] # Add document header mock_parts.append("# Document Content (MinerU Mock)") mock_parts.append("") # Add content parts for part in content_parts: mock_parts.append(part) # Add image references if images: mock_parts.append("## Images") mock_parts.append("") for img in images: mock_parts.append(f"![Image](./{img})") mock_parts.append("") return "\\n".join(mock_parts) async def _retry_with_backoff(self, func, *args, **kwargs): """ Execute a function with exponential backoff retry logic. Args: func: Async function to execute *args: Positional arguments for the function **kwargs: Keyword arguments for the function Returns: Result from the function Raises: Exception from the last retry attempt """ max_retries = self.config.api_max_retries retry_delay = self.config.api_retry_delay backoff = self.config.api_retry_backoff max_delay = self.config.api_retry_max_delay retry_on_errors = self.config.retry_on_errors last_exception = None for attempt in range(max_retries + 1): try: result = await func(*args, **kwargs) # Check if result indicates success if hasattr(result, 'success') and not result.success: # Check if this is a retryable error if result.error and retry_on_errors: should_retry = any(err_type in str(result.error) for err_type in retry_on_errors) if not should_retry and attempt < max_retries: self.logger.warning(f"Non-retryable error: {result.error}") return result if attempt < max_retries: self.logger.warning(f"API call failed (attempt {attempt + 1}/{max_retries + 1}): {result.error}") last_exception = Exception(result.error or "API call failed") else: return result else: # Success if attempt > 0: self.logger.info(f"API call succeeded after {attempt + 1} attempts") return result except (aiohttp.ClientError, asyncio.TimeoutError, ConnectionError) as e: # Network-related errors are always retryable last_exception = e if attempt < max_retries: self.logger.warning(f"Network error (attempt {attempt + 1}/{max_retries + 1}): {str(e)}") else: self.logger.error(f"Network error after {max_retries + 1} attempts: {str(e)}") raise except Exception as e: # Check if this is a retryable error type if retry_on_errors: should_retry = any(err_type in str(e) for err_type in retry_on_errors) if not should_retry: self.logger.error(f"Non-retryable error: {str(e)}") raise last_exception = e if attempt < max_retries: self.logger.warning(f"API error (attempt {attempt + 1}/{max_retries + 1}): {str(e)}") else: self.logger.error(f"API error after {max_retries + 1} attempts: {str(e)}") raise # If we need to retry, wait with exponential backoff if attempt < max_retries: delay = min(retry_delay * (backoff ** attempt), max_delay) self.logger.info(f"Retrying in {delay:.1f} seconds...") await asyncio.sleep(delay) # Should not reach here, but just in case if last_exception: raise last_exception else: raise Exception("Maximum retries exceeded") async def _process_batch_self_hosted_impl(self, batch_pdf_path: str, temp_dir: str, src_fileid: str, batch_idx: int, start_page: int, end_page: int, is_ppt_converted: bool) -> MinerUResult: """ Process a batch of pages using self-hosted MinerU API. Args: batch_pdf_path: Path to batch PDF file temp_dir: Temporary directory src_fileid: Source file ID batch_idx: Batch index start_page: Original start page index end_page: Original end page index is_ppt_converted: Whether PDF is from PPT Returns: MinerUResult for this batch """ try: if not self.session: raise RuntimeError("API client not initialized") # Prepare multipart form data form_data = aiohttp.FormData() # API parameters form_data.add_field('return_middle_json', 'false') form_data.add_field('return_model_output', 'false') form_data.add_field('return_md', 'true') form_data.add_field('return_images', 'true') form_data.add_field('return_content_list', 'true') form_data.add_field('end_page_id', str(end_page - start_page)) form_data.add_field('parse_method', 'auto') form_data.add_field('start_page_id', '0') form_data.add_field('output_dir', './output') form_data.add_field('server_url', 'string') form_data.add_field('backend', 'pipeline') form_data.add_field('table_enable', 'true') form_data.add_field('formula_enable', 'true') # Add the batch PDF file with open(batch_pdf_path, 'rb') as f: form_data.add_field('files', f, filename=f"batch_{batch_idx}.pdf", content_type='application/pdf') # Make API request async with self.session.post( f"{self.config.mineru_api_url}/file_parse", data=form_data, headers={'accept': 'application/json'} ) as response: if response.status != 200: error_text = await response.text() raise Exception(f"Self-hosted MinerU API error: {response.status} - {error_text}") result = await response.json() # Process the batch result results = result.get('results', {}) if not results: raise Exception("No results in API response") file_result = next(iter(results.values())) content_list_str = file_result.get('content_list', '') if not content_list_str: raise Exception("No content_list in API response") # Parse content_list with adjusted page indices markdown_content, images, tables, page_data = self._parse_content_list_to_markdown_batch( content_list_str, temp_dir, src_fileid, start_page ) # Save batch images if provided images_data = file_result.get('images', {}) if images_data and isinstance(images_data, dict): saved_images = self._save_base64_images(images_data, temp_dir, f"{src_fileid}_batch_{batch_idx}") images.extend([img for img in saved_images if img not in images]) metadata = { "batch_idx": batch_idx, "start_page": start_page, "end_page": end_page, "pages_in_batch": end_page - start_page, "is_ppt_source": is_ppt_converted, "page_data": page_data # Add page_data so it can be extracted per page } return MinerUResult( success=True, content=markdown_content, images=images, tables=tables, metadata=metadata, page_results=None ) except Exception as e: self.logger.error(f"mineru-api: batch {batch_idx} processing failed: {str(e)}") return MinerUResult( success=False, content="", images=[], tables=[], metadata={"batch_idx": batch_idx, "error": str(e)}, error=str(e) ) async def _process_batch_self_hosted(self, batch_pdf_path: str, temp_dir: str, src_fileid: str, batch_idx: int, start_page: int, end_page: int, is_ppt_converted: bool) -> MinerUResult: """ Process a batch WITHOUT retry logic. Batch failures will fallback to single-page processing where retry happens. """ # Direct call without retry wrapper return await self._process_batch_self_hosted_impl( batch_pdf_path, temp_dir, src_fileid, batch_idx, start_page, end_page, is_ppt_converted ) async def _process_batch_cloud_impl(self, batch_pdf_path: str, temp_dir: str, src_fileid: str, batch_idx: int, start_page: int, end_page: int, is_ppt_converted: bool) -> MinerUResult: """ Implementation of batch processing using cloud MinerU API. """ try: # Upload batch PDF file_url = await self._upload_file_to_accessible_url(batch_pdf_path, f"{src_fileid}_batch_{batch_idx}") # Create task for batch task_id = await self._create_mineru_task_full_document(file_url, f"{src_fileid}_batch_{batch_idx}") # Poll for completion result_url = await self._poll_task_completion(task_id, src_fileid, max_wait_time=300) # Download and extract results markdown_content, images, tables, page_data, _ = await self._download_and_extract_results_with_content_list( result_url, temp_dir, f"{src_fileid}_batch_{batch_idx}" ) # Adjust page indices to match original document adjusted_page_data = {} for page_idx, pdata in page_data.items(): adjusted_idx = page_idx + start_page adjusted_page_data[adjusted_idx] = pdata adjusted_page_data[adjusted_idx]['metadata']['original_page_num'] = adjusted_idx + 1 metadata = { "batch_idx": batch_idx, "start_page": start_page, "end_page": end_page, "pages_in_batch": end_page - start_page, "is_ppt_source": is_ppt_converted, "page_data": adjusted_page_data } return MinerUResult( success=True, content=markdown_content, images=images, tables=tables, metadata=metadata, page_results=None ) except Exception as e: self.logger.error(f"mineru-api: cloud batch {batch_idx} processing failed: {str(e)}") return MinerUResult( success=False, content="", images=[], tables=[], metadata={"batch_idx": batch_idx, "error": str(e)}, error=str(e) ) async def _process_batch_cloud(self, batch_pdf_path: str, temp_dir: str, src_fileid: str, batch_idx: int, start_page: int, end_page: int, is_ppt_converted: bool) -> MinerUResult: """ Process a batch using cloud API WITHOUT retry logic. Batch failures will fallback to single-page processing where retry happens. """ # Direct call without retry wrapper return await self._process_batch_cloud_impl( batch_pdf_path, temp_dir, src_fileid, batch_idx, start_page, end_page, is_ppt_converted ) def _merge_batch_results(self, batch_results: List[Tuple[int, MinerUResult]], total_pages: int, is_ppt_converted: bool) -> MinerUResult: """ Merge results from multiple batches into a single MinerUResult. Args: batch_results: List of (start_page, MinerUResult) tuples total_pages: Total number of pages in original document is_ppt_converted: Whether the PDF was converted from PPT Returns: Merged MinerUResult """ if not batch_results: return MinerUResult( success=False, content="", images=[], tables=[], metadata={}, error="No successful batches" ) # Sort batches by start page batch_results.sort(key=lambda x: x[0]) # Merge content merged_content_parts = [] all_images = [] all_tables = [] all_page_data = {} all_page_results = [] for start_page, batch_result in batch_results: # Add batch content merged_content_parts.append(batch_result.content) # Collect images (avoid duplicates) for img in batch_result.images: if img not in all_images: all_images.append(img) # Collect tables all_tables.extend(batch_result.tables) # Merge page data if available if batch_result.metadata and 'page_data' in batch_result.metadata: all_page_data.update(batch_result.metadata['page_data']) # Create page results if needed if batch_result.page_results: all_page_results.extend(batch_result.page_results) # Join content with page separators merged_content = "\n\n".join(merged_content_parts) # Create merged metadata merged_metadata = { "processing_mode": "batch_processing", "total_pages": total_pages, "batch_count": len(batch_results), "images_found": len(all_images), "tables_found": len(all_tables), "is_ppt_source": is_ppt_converted, "page_data": all_page_data if all_page_data else None } self.logger.info(f"mineru-api: merged {len(batch_results)} batches - " f"{len(all_images)} images, {len(all_tables)} tables") return MinerUResult( success=True, content=merged_content, images=all_images, tables=all_tables, metadata=merged_metadata, page_results=all_page_results if all_page_results else None ) def _parse_content_list_to_markdown_batch(self, content_list: Any, temp_dir: str, src_fileid: str, page_offset: int) -> Tuple[str, List[str], List[Dict], Dict[int, Dict]]: """ Parse content_list for a batch with page offset adjustment. Args: content_list: Content list from API temp_dir: Temporary directory src_fileid: Source file ID page_offset: Offset to add to page indices Returns: Tuple of (markdown, images, tables, page_data) """ # Parse normally first markdown, images, tables, page_data = self._parse_content_list_to_markdown( content_list, temp_dir, src_fileid ) # Adjust page indices in page_data adjusted_page_data = {} for page_idx, pdata in page_data.items(): adjusted_idx = page_idx + page_offset adjusted_page_data[adjusted_idx] = pdata # Update page number in metadata if 'metadata' in pdata: pdata['metadata']['page_num'] = adjusted_idx + 1 # Adjust page numbers in tables for table in tables: if 'page' in table: table['page'] += page_offset return markdown, images, tables, adjusted_page_data def _parse_content_list_to_markdown(self, content_list: List[Dict], temp_dir: str, src_fileid: str) -> Tuple[str, List[str], List[Dict], Dict[int, Dict]]: """ Parse content_list JSON format to markdown organized by pages. Args: content_list: List of content items with page_idx, type, and content temp_dir: Temporary directory for saving images src_fileid: Source file ID for logging Returns: Tuple of (markdown_content, image_list, table_list, page_data_dict) """ try: import json import base64 # If content_list is a string, parse it as JSON if isinstance(content_list, str): self.logger.info(f"mineru-api: Parsing content_list string of length {len(content_list)}") try: content_list = json.loads(content_list) self.logger.info(f"mineru-api: Parsed content_list to {type(content_list)} with {len(content_list) if isinstance(content_list, list) else 'N/A'} items") except json.JSONDecodeError as e: self.logger.error(f"mineru-api: Failed to parse content_list JSON: {str(e)}") self.logger.error(f"mineru-api: Content_list sample: {content_list[:500]}") raise # Log content_list structure if isinstance(content_list, list): self.logger.info(f"mineru-api: Content list has {len(content_list)} items") if content_list: self.logger.debug(f"mineru-api: First item sample: {content_list[0] if content_list else 'None'}") else: self.logger.warning(f"mineru-api: Content list is not a list, type: {type(content_list)}") # Group content by page page_groups = {} for item in content_list: page_idx = item.get('page_idx', 0) if page_idx not in page_groups: page_groups[page_idx] = [] page_groups[page_idx].append(item) # Sort pages sorted_pages = sorted(page_groups.keys()) # Build markdown and collect resources markdown_parts = [] all_images = [] all_tables = [] page_data = {} for page_idx in sorted_pages: page_num = page_idx + 1 # Convert to 1-based page_items = page_groups[page_idx] # Page header - only add if there's content # We'll add this after checking for content page_content_parts = [] page_images = [] page_tables = [] for item in page_items: item_type = item.get('type', 'text') if item_type == 'text': text = item.get('text', '').strip() text_level = item.get('text_level', 0) if text: # Apply heading levels if text_level > 0: # Convert to markdown heading heading_prefix = '#' * min(text_level, 6) page_content_parts.append(f"{heading_prefix} {text}") else: page_content_parts.append(text) elif item_type == 'image': img_path = item.get('img_path', '') img_caption = item.get('img_caption', []) img_footnote = item.get('img_footnote', []) self.logger.info(f"mineru-api: processing image item - img_path: {img_path[:100] if img_path else 'None'}") # Handle image path/data if img_path: if img_path.startswith('data:'): # Base64 encoded image try: # Extract format and data header, data = img_path.split(',', 1) fmt = header.split('/')[1].split(';')[0] # Decode and save img_data = base64.b64decode(data) img_filename = f"content_list_img_{src_fileid}_p{page_num}_{len(page_images)}.{fmt}" img_file_path = os.path.join(temp_dir, img_filename) with open(img_file_path, 'wb') as f: f.write(img_data) page_images.append(img_filename) all_images.append(img_filename) # Add to markdown page_content_parts.append(f"![Image]({img_filename})") except Exception as e: self.logger.error(f"Failed to decode base64 image: {str(e)}") else: # Regular image path - need to check if it's a file that needs to be created img_filename = os.path.basename(img_path) # For self-hosted API, images might be referenced but not yet saved # We'll add them to the list and expect them to be in the 'images' field page_images.append(img_filename) all_images.append(img_filename) # Use relative path for image reference img_ref = f"images/{img_filename}" if not img_path.startswith('images/') else img_path page_content_parts.append(f"![Image]({img_ref})") self.logger.info(f"mineru-api: added image reference: ![Image]({img_ref})") self.logger.info(f"mineru-api: expecting image file: {img_filename} to be provided by API") # Add captions if present if img_caption: caption_text = ' '.join(img_caption) page_content_parts.append(f"*{caption_text}*") if img_footnote: footnote_text = ' '.join(img_footnote) page_content_parts.append(f"**Note:** {footnote_text}") elif item_type == 'table': table_body = item.get('table_body', '') table_caption = item.get('table_caption', []) table_footnote = item.get('table_footnote', []) # Add table caption if table_caption: caption_text = ' '.join(table_caption) page_content_parts.append(f"**{caption_text}**") # Add table content if table_body: # If HTML table, add directly if table_body.strip().startswith('<'): page_content_parts.append(table_body) else: page_content_parts.append(f"```\n{table_body}\n```") # Add footnote if table_footnote: footnote_text = ' '.join(table_footnote) page_content_parts.append(f"*Note: {footnote_text}*") # Store table data table_data = { 'page': page_num, 'content': table_body, 'caption': table_caption, 'footnote': table_footnote } page_tables.append(table_data) all_tables.append(table_data) elif item_type == 'equation': eq_text = item.get('text', '') eq_format = item.get('text_format', 'latex') if eq_text: if eq_format == 'latex': # Use display math for equations page_content_parts.append(f"$$\n{eq_text}\n$$") else: page_content_parts.append(f"```{eq_format}\n{eq_text}\n```") # Combine page content # Filter out empty parts to avoid excessive newlines non_empty_parts = [part for part in page_content_parts if part.strip()] page_content = '\n'.join(non_empty_parts) if non_empty_parts else '' # Only add page header and content if there's actual content if page_content: markdown_parts.append(f"\n\n## Page {page_num}\n\n") markdown_parts.append(page_content) # Store page data page_data[page_idx] = { 'content': page_content, 'images': page_images, 'tables': page_tables, 'metadata': {'page_num': page_num} } # Combine all markdown final_markdown = ''.join(markdown_parts) self.logger.info(f"mineru-api: parsed content_list - {len(sorted_pages)} pages, " f"{len(all_images)} images, {len(all_tables)} tables") return final_markdown, all_images, all_tables, page_data except Exception as e: self.logger.error(f"mineru-api: content_list parsing failed: {str(e)}") raise def detect_tables(self, content: str) -> bool: """ Detect if content contains table structures. Based on gzero.py's table detection logic. """ table_indicators = [ '', '', '
', '|---|', '表格', 'Table', '| ', ' |', '┌', '└', '├', '┤' # Table border characters ] content_lower = content.lower() for indicator in table_indicators: if indicator.lower() in content_lower: return True # Check for pipe-separated table format lines = content.split('\\n') pipe_lines = [line for line in lines if line.count('|') >= 2] if len(pipe_lines) >= 2: # At least header and one data row return True return False async def extract_plain_text(self, pdf_path: str, src_fileid: str) -> str: """ Extract plain text from PDF using PyMuPDF. This provides text content for comparison with MinerU results. """ try: text_parts = [] with fitz.open(pdf_path) as doc: for page in doc: page_text = page.get_text() if page_text.strip(): text_parts.append(page_text) plain_text = '\\n\\n'.join(text_parts) self.logger.info(f"mineru-api: extracted {len(plain_text)} characters of plain text") return plain_text except Exception as e: self.logger.error(f"mineru-api: plain text extraction failed: {str(e)}") return "" def merge_content(self, plain_text: str, mineru_content: str, src_fileid: str) -> str: """ Merge plain text with MinerU structured content. This combines the reliability of plain text extraction with MinerU's structured parsing capabilities. """ try: # Simple merge strategy - could be enhanced with more sophisticated logic if not mineru_content.strip(): self.logger.warning(f"mineru-api: MinerU content empty, using plain text") return plain_text if not plain_text.strip(): self.logger.warning(f"mineru-api: plain text empty, using MinerU content") return mineru_content # For now, prefer MinerU content as it should be more structured # In practice, you might want more sophisticated merging logic self.logger.info(f"mineru-api: using MinerU structured content") return mineru_content except Exception as e: self.logger.error(f"mineru-api: content merge failed: {str(e)}") # Fallback to plain text return plain_text if plain_text else mineru_content