""" MinerU API client module. This module handles communication with the MinerU service for document parsing, following the architecture patterns from gzero.py and implementing the real API. """ import os import asyncio import aiohttp import zipfile import fitz import time from typing import Dict, List, Optional, Any, Tuple from dataclasses import dataclass from .logger import get_module_logger logger = get_module_logger('api_client') from .config_base import MinerUConfig @dataclass class MinerUPageResult: """Result from MinerU processing for a single page""" page_idx: int success: bool content: str images: List[str] tables: List[Dict] metadata: Dict[str, Any] error: Optional[str] = None @dataclass class MinerUResult: """Result from MinerU processing""" success: bool content: str # Backward compatibility - merged content images: List[str] # All images tables: List[Dict] # All tables metadata: Dict[str, Any] error: Optional[str] = None page_results: Optional[List[MinerUPageResult]] = None # Individual page results class MinerUAPIClient: """Client for interacting with MinerU API""" def __init__(self, config: MinerUConfig, platform_adapter=None): self.config = config self.session: Optional[aiohttp.ClientSession] = None self.logger = logger self.platform_adapter = platform_adapter async def __aenter__(self): """Async context manager entry""" self.session = aiohttp.ClientSession( timeout=aiohttp.ClientTimeout(total=600) # 10 minute timeout ) return self async def __aexit__(self, exc_type, exc_val, exc_tb): """Async context manager exit""" if self.session: await self.session.close() async def process_document(self, pdf_path: str, temp_dir: str, src_fileid: str, is_ppt_converted: bool = False, batch_size: int = 20) -> MinerUResult: """ Process document using MinerU API with batch processing support. Can process the entire document at once or in batches for large PDFs. 
Args: pdf_path: Path to PDF file temp_dir: Temporary directory for processing src_fileid: Source file ID for logging/tracing is_ppt_converted: Whether the PDF was converted from PPT batch_size: Maximum pages per batch (0 for no batching) Returns: MinerUResult with parsed content """ try: # Check PDF page count page_count = self._get_pdf_page_count(pdf_path) self.logger.info(f"mineru-api: PDF has {page_count} pages, batch_size={batch_size}") # Decide whether to use batch processing if batch_size > 0 and page_count > batch_size: self.logger.info(f"mineru-api: using batch processing (batch_size={batch_size})") return await self._process_document_in_batches( pdf_path, temp_dir, src_fileid, is_ppt_converted, batch_size ) else: self.logger.info(f"mineru-api: processing full document at once") return await self._process_document_full( pdf_path, temp_dir, src_fileid, is_ppt_converted ) except Exception as e: self.logger.error(f"mineru-api: document processing failed: {str(e)}") return MinerUResult( success=False, content="", images=[], tables=[], metadata={}, error=str(e) ) async def _process_document_full(self, pdf_path: str, temp_dir: str, src_fileid: str, is_ppt_converted: bool) -> MinerUResult: """Process full document at once (original implementation).""" self.logger.info(f"mineru-api: starting full document processing") # Choose processing method based on API type if self.config.mineru_api_type == "self_hosted": self.logger.info(f"mineru-api: using self-hosted MinerU API") result = await self._process_full_document_self_hosted(pdf_path, temp_dir, src_fileid, is_ppt_converted) elif self.config.mineru_api_key and self.config.mineru_api_key.strip(): self.logger.info(f"mineru-api: using cloud MinerU API") result = await self._process_full_document_cloud(pdf_path, temp_dir, src_fileid, is_ppt_converted) else: self.logger.warning(f"mineru-api: no API configuration, using mock processing") result = await self._mock_mineru_processing(pdf_path, temp_dir, src_fileid, 
is_ppt_converted) self.logger.info(f"mineru-api: document processing completed") return result async def _process_document_in_batches(self, pdf_path: str, temp_dir: str, src_fileid: str, is_ppt_converted: bool, batch_size: int) -> MinerUResult: """ Process document in batches of pages. Args: pdf_path: Path to PDF file temp_dir: Temporary directory for processing src_fileid: Source file ID for logging/tracing is_ppt_converted: Whether the PDF was converted from PPT batch_size: Maximum pages per batch Returns: MinerUResult with merged results from all batches """ try: page_count = self._get_pdf_page_count(pdf_path) num_batches = (page_count + batch_size - 1) // batch_size self.logger.info(f"mineru-api: splitting {page_count} pages into {num_batches} batches") # Process each batch batch_results = [] for batch_idx in range(num_batches): start_page = batch_idx * batch_size end_page = min(start_page + batch_size, page_count) self.logger.info(f"mineru-api: processing batch {batch_idx + 1}/{num_batches} " f"(pages {start_page + 1}-{end_page})") # Split PDF for this batch batch_pdf_path = await self._split_pdf( pdf_path, temp_dir, start_page, end_page, batch_idx ) # Process batch based on API type if self.config.mineru_api_type == "self_hosted": batch_result = await self._process_batch_self_hosted( batch_pdf_path, temp_dir, src_fileid, batch_idx, start_page, end_page, is_ppt_converted ) elif self.config.mineru_api_key and self.config.mineru_api_key.strip(): batch_result = await self._process_batch_cloud( batch_pdf_path, temp_dir, src_fileid, batch_idx, start_page, end_page, is_ppt_converted ) else: batch_result = await self._mock_mineru_processing( batch_pdf_path, temp_dir, src_fileid, is_ppt_converted ) if batch_result.success: batch_results.append((start_page, batch_result)) else: self.logger.error(f"mineru-api: batch {batch_idx + 1} failed: {batch_result.error}") # Continue with other batches even if one fails # Merge all batch results return 
self._merge_batch_results(batch_results, page_count, is_ppt_converted) except Exception as e: self.logger.error(f"mineru-api: batch processing failed: {str(e)}") raise def _get_pdf_page_count(self, pdf_path: str) -> int: """Get the number of pages in a PDF.""" with fitz.open(pdf_path) as doc: return len(doc) async def _split_pdf(self, pdf_path: str, temp_dir: str, start_page: int, end_page: int, batch_idx: int) -> str: """ Split PDF to extract specific pages for batch processing. Args: pdf_path: Original PDF path temp_dir: Temporary directory start_page: Start page index (0-based) end_page: End page index (exclusive) batch_idx: Batch index for naming Returns: Path to the split PDF file """ batch_pdf_path = os.path.join(temp_dir, f"batch_{batch_idx}.pdf") with fitz.open(pdf_path) as src_doc: batch_doc = fitz.open() # Create new PDF # Copy pages to new document for page_idx in range(start_page, end_page): batch_doc.insert_pdf(src_doc, from_page=page_idx, to_page=page_idx) batch_doc.save(batch_pdf_path) batch_doc.close() self.logger.info(f"mineru-api: created batch PDF with {end_page - start_page} pages: {batch_pdf_path}") return batch_pdf_path async def _process_full_document_self_hosted(self, pdf_path: str, temp_dir: str, src_fileid: str, is_ppt_converted: bool) -> MinerUResult: """ Process full PDF document at once using self-hosted MinerU API with content_list support. This method uploads the entire document and gets back content_list for all pages. 
""" try: self.logger.info(f"mineru-api: processing full document with self-hosted API") if not self.session: raise RuntimeError("API client not initialized") start_time = asyncio.get_event_loop().time() # Prepare multipart form data form_data = aiohttp.FormData() # API parameters - enable content_list for full document processing form_data.add_field('return_middle_json', 'false') form_data.add_field('return_model_output', 'false') form_data.add_field('return_md', 'true') form_data.add_field('return_images', 'true') form_data.add_field('return_content_list', 'true') # Enable content_list form_data.add_field('end_page_id', '99999') form_data.add_field('parse_method', 'auto') form_data.add_field('start_page_id', '0') form_data.add_field('output_dir', './output') form_data.add_field('server_url', 'string') form_data.add_field('backend', 'pipeline') form_data.add_field('table_enable', 'true') form_data.add_field('formula_enable', 'true') # Add the PDF file with open(pdf_path, 'rb') as f: form_data.add_field('files', f, filename=os.path.basename(pdf_path), content_type='application/pdf') # Make API request async with self.session.post( f"{self.config.mineru_api_url}/file_parse", data=form_data, headers={'accept': 'application/json'} ) as response: if response.status != 200: error_text = await response.text() raise Exception(f"Self-hosted MinerU API error: {response.status} - {error_text}") result = await response.json() # Log the top-level keys to understand response structure self.logger.info(f"mineru-api: response top-level keys: {list(result.keys())}") # Extract content_list from response results = result.get('results', {}) if not results: raise Exception("No results in API response") # Get the first result (should be our PDF file) file_result = next(iter(results.values())) self.logger.info(f"mineru-api: file_result keys: {list(file_result.keys())}") # Extract content_list content_list_str = file_result.get('content_list', '') self.logger.info(f"mineru-api: 
content_list type: {type(content_list_str)}, length: {len(str(content_list_str))}") if not content_list_str: self.logger.error(f"mineru-api: No content_list in API response. File result keys: {list(file_result.keys())}") # Log a sample of the file_result to understand what we're getting sample_result = str(file_result)[:500] if file_result else 'None' self.logger.error(f"mineru-api: File result sample: {sample_result}") raise Exception("No content_list in API response") # Parse content_list to markdown markdown_content, all_images, all_tables, page_data = self._parse_content_list_to_markdown( content_list_str, temp_dir, src_fileid ) # Also get markdown content if available for language detection md_content = file_result.get('md_content', '') # Extract base64 images if provided images_data = file_result.get('images', {}) if images_data and isinstance(images_data, dict): self.logger.info(f"mineru-api: saving {len(images_data)} base64 images") saved_images = self._save_base64_images(images_data, temp_dir, src_fileid) self.logger.info(f"mineru-api: saved images: {saved_images}") # Merge with images from content_list for img in saved_images: if img not in all_images: all_images.append(img) else: self.logger.info(f"mineru-api: no base64 images in 'images' field") # Check if there's an 'images' field at the top level of result if 'images' in result and isinstance(result['images'], dict): self.logger.info(f"mineru-api: found images at top level: {len(result['images'])} images") # Save these images for img_name, img_data in result['images'].items(): if img_name not in all_images: # Save the image try: if isinstance(img_data, str) and img_data.startswith('data:'): # Base64 encoded saved = self._save_base64_images({img_name: img_data}, temp_dir, src_fileid) all_images.extend(saved) self.logger.info(f"mineru-api: saved top-level image: {img_name}") except Exception as e: self.logger.error(f"mineru-api: failed to save top-level image {img_name}: {e}") processing_time = 
asyncio.get_event_loop().time() - start_time # Detect language from the combined md_content detected_language = None if md_content and md_content.strip(): from .language_detector import LanguageDetector language_code, confidence = LanguageDetector.detect_language(md_content) if confidence > 0.7: detected_language = language_code self.logger.info(f"mineru-api: detected document language: {detected_language} (confidence: {confidence:.2f})") # If no md_content, detect from markdown_content as fallback if not detected_language and markdown_content and markdown_content.strip(): from .language_detector import LanguageDetector language_code, confidence = LanguageDetector.detect_language(markdown_content) if confidence > 0.7: detected_language = language_code self.logger.info(f"mineru-api: detected document language from content_list: {detected_language} (confidence: {confidence:.2f})") # Create metadata import json metadata = { "processing_time": processing_time, "total_pages": len(page_data), "images_found": len(all_images), "tables_found": len(all_tables), "is_ppt_source": is_ppt_converted, "processing_mode": "full_document_self_hosted", "api_version": "self_hosted", "api_type": "self_hosted", "page_data": page_data, "content_list": json.loads(content_list_str) if isinstance(content_list_str, str) else content_list_str, "detected_language": detected_language # Add detected language to metadata } # Create page results for compatibility mineru_page_results = [] for page_idx, pdata in page_data.items(): mineru_page_results.append(MinerUPageResult( page_idx=page_idx, success=True, content=pdata['content'], images=pdata['images'], tables=pdata['tables'], metadata=pdata['metadata'] )) self.logger.info(f"mineru-api: full document self-hosted processing completed in {processing_time:.2f}s") return MinerUResult( success=True, content=markdown_content, images=all_images, tables=all_tables, metadata=metadata, page_results=mineru_page_results ) except Exception as e: 
self.logger.error(f"mineru-api: full document self-hosted processing failed: {str(e)}") raise def _parse_self_hosted_response(self, api_response: Dict, temp_dir: str, page_id: str, page_num: int) -> Tuple[str, List[str], List[Dict]]: """ Parse response from self-hosted MinerU API. Expected response format: { "backend": "pipeline", "version": "2.1.10", "results": { "page_xxx": { "md_content": "# Content...", "images": { "filename.jpg": "data:image/jpeg;base64,xxx...", ... }, "middle_json": {...}, "model_output": {...} } } } """ try: content = "" images = [] tables = [] # Extract results results = api_response.get('results', {}) if results: # Get the first result (should be our PDF file) file_result = next(iter(results.values())) if results else {} # Extract markdown content md_content = file_result.get('md_content', '') if md_content: content = md_content else: content = f"# Page {page_num}\n\nNo content extracted from self-hosted API" # Extract images from the images field (base64 encoded) images_data = file_result.get('images', {}) if images_data and isinstance(images_data, dict): images = self._save_base64_images(images_data, temp_dir, page_id) else: # Fallback to extracting from markdown if no images field images = self._extract_images_from_markdown(md_content, temp_dir, page_id) # Extract table information if available in middle_json middle_json = file_result.get('middle_json', {}) if middle_json and isinstance(middle_json, dict): tables = self._extract_tables_from_middle_json(middle_json, page_num) self.logger.debug(f"mineru-api: [{page_id}] parsed self-hosted response - content: {len(content)} chars, images: {len(images)}, tables: {len(tables)}") return content, images, tables except Exception as e: self.logger.error(f"mineru-api: [{page_id}] failed to parse self-hosted response: {str(e)}") return f"# Page {page_num}\n\nError parsing API response: {str(e)}", [], [] def _save_base64_images(self, images_data: Dict[str, str], temp_dir: str, page_id: str) -> 
List[str]: """ Save base64 encoded images to files. Args: images_data: Dictionary with image filename as key and base64 data as value temp_dir: Directory to save images page_id: Page identifier for logging Returns: List of saved image filenames """ import base64 saved_images = [] for filename, base64_data in images_data.items(): try: # Extract base64 data (remove data URI prefix if present) if base64_data.startswith('data:'): # Format: data:image/jpeg;base64,xxx... base64_data = base64_data.split(',', 1)[1] # Decode base64 to binary image_data = base64.b64decode(base64_data) # Use the original filename without prefix to match content_list references image_filename = filename image_path = os.path.join(temp_dir, image_filename) # Save image file with open(image_path, 'wb') as f: f.write(image_data) saved_images.append(image_filename) self.logger.info(f"mineru-api: [{page_id}] saved base64 image: {image_filename} ({len(image_data)} bytes)") except Exception as e: self.logger.error(f"mineru-api: [{page_id}] failed to save base64 image {filename}: {str(e)}") return saved_images def _extract_images_from_markdown(self, md_content: str, temp_dir: str, page_id: str) -> List[str]: """ Extract image references from markdown content and handle them. 
Self-hosted MinerU typically includes images as markdown references like  """ import re images = [] # Find all markdown image references image_pattern = r'!\[([^\]]*)\]\(([^)]+)\)' matches = re.findall(image_pattern, md_content) for _, image_path in matches: try: # Handle different image path formats if image_path.startswith('./') or image_path.startswith('../'): # Relative path - assume it's in the same directory structure actual_path = os.path.normpath(os.path.join(temp_dir, image_path)) elif os.path.isabs(image_path): # Absolute path actual_path = image_path else: # Relative to temp directory actual_path = os.path.join(temp_dir, image_path) # Check if image file exists and copy to our temp directory if needed if os.path.exists(actual_path): # Generate new filename for our processing image_filename = f"self_hosted_{page_id}_{os.path.basename(image_path)}" dest_path = os.path.join(temp_dir, image_filename) if actual_path != dest_path: import shutil shutil.copy(actual_path, dest_path) images.append(image_filename) self.logger.debug(f"mineru-api: [{page_id}] extracted image: {image_filename}") else: self.logger.warning(f"mineru-api: [{page_id}] image file not found: {actual_path}") except Exception as e: self.logger.error(f"mineru-api: [{page_id}] error processing image {image_path}: {str(e)}") return images def _extract_tables_from_middle_json(self, middle_json: Dict, page_num: int) -> List[Dict]: """ Extract table information from middle_json if available. 
""" tables = [] try: # This structure depends on the actual format returned by self-hosted MinerU # Adjust based on the actual response structure if 'tables' in middle_json: table_data = middle_json['tables'] if isinstance(table_data, list): for i, table in enumerate(table_data): tables.append({ 'page': page_num, 'table_id': i, 'content': str(table), 'source': 'self_hosted_middle_json' }) elif isinstance(table_data, dict): tables.append({ 'page': page_num, 'content': str(table_data), 'source': 'self_hosted_middle_json' }) except Exception as e: self.logger.debug(f"mineru-api: page {page_num} table extraction from middle_json failed: {str(e)}") return tables async def _upload_file_to_accessible_url(self, pdf_path: str, src_fileid: str) -> str: """ Upload file to a publicly accessible URL for MinerU processing. Uses platform adapter's upload_file method """ try: # Use platform adapter for upload if available if hasattr(self, 'platform_adapter') and self.platform_adapter: # The adapter will handle the upload return await self.platform_adapter.upload_file(pdf_path, { 'src_fileid': src_fileid }) # Fallback: return local path if no adapter logger.warning("No platform adapter available for upload, returning local path") return pdf_path except Exception as e: self.logger.error(f"mineru-api: file upload failed: {str(e)}") raise async def _poll_task_completion(self, task_id: str, src_fileid: str, max_wait_time: int = 600) -> str: """ Poll MinerU task until completion. 
Args: task_id: Task ID to poll src_fileid: Source file ID for logging max_wait_time: Maximum wait time in seconds Returns: Result ZIP file URL """ if not self.session: raise RuntimeError("API client not initialized") headers = { 'Authorization': f'Bearer {self.config.mineru_api_key}', 'Accept': '*/*' } start_time = asyncio.get_event_loop().time() poll_interval = 5 # Start with 5 seconds max_poll_interval = 30 # Max 30 seconds between polls while True: current_time = asyncio.get_event_loop().time() if current_time - start_time > max_wait_time: raise Exception(f"Task polling timeout after {max_wait_time} seconds") try: async with self.session.get( f"{self.config.mineru_api_url}/api/v4/extract/task/{task_id}", headers=headers ) as response: if response.status != 200: error_text = await response.text() raise Exception(f"MinerU polling error: {response.status} - {error_text}") result = await response.json() if result.get('code') != 0: raise Exception(f"MinerU polling error: {result.get('msg', 'Unknown error')}") data = result['data'] state = data['state'] if state == 'done': full_zip_url = data['full_zip_url'] self.logger.info(f"mineru-api: task completed: {task_id}") return full_zip_url elif state == 'failed': error_msg = data.get('err_msg', 'Unknown error') raise Exception(f"MinerU task failed: {error_msg}") elif state in ['pending', 'running', 'converting']: # Log progress if available if 'extract_progress' in data: progress = data['extract_progress'] extracted = progress.get('extracted_pages', 0) total = progress.get('total_pages', 0) start_time_str = progress.get('start_time', 'N/A') self.logger.info(f"mineru-api: task {state}: {extracted}/{total} pages (started: {start_time_str})") else: self.logger.info(f"mineru-api: task {state}") # Wait before next poll await asyncio.sleep(poll_interval) # Gradually increase poll interval poll_interval = min(poll_interval * 1.2, max_poll_interval) else: raise Exception(f"Unknown task state: {state}") except aiohttp.ClientError 
as e: self.logger.warning(f"mineru-api: polling connection error: {str(e)}") await asyncio.sleep(poll_interval) async def _process_full_document_cloud(self, pdf_path: str, temp_dir: str, src_fileid: str, is_ppt_converted: bool) -> MinerUResult: """ Process full PDF document at once using cloud MinerU API. This method uploads the entire document and extracts content_list from the result. """ try: self.logger.info(f"mineru-api: processing full document with cloud API") start_time = asyncio.get_event_loop().time() # Step 1: Upload file to accessible URL file_url = await self._upload_file_to_accessible_url(pdf_path, src_fileid) self.logger.info(f"mineru-api: uploaded file URL: {file_url}") if not file_url.startswith(('http://', 'https://')): self.logger.warning(f"mineru-api: URL may not be valid for Cloud API: {file_url}") # Step 2: Create task for full document task_id = await self._create_mineru_task_full_document(file_url, src_fileid) # Step 3: Poll for completion result_url = await self._poll_task_completion(task_id, src_fileid, max_wait_time=900) # 15 min for full doc # Step 4: Download and extract results with content_list markdown_content, all_images, all_tables, page_data, full_md_content = await self._download_and_extract_results_with_content_list( result_url, temp_dir, src_fileid ) processing_time = asyncio.get_event_loop().time() - start_time # Detect language from full.md first, then fallback to markdown_content detected_language = None if full_md_content and full_md_content.strip(): from .language_detector import LanguageDetector language_code, confidence = LanguageDetector.detect_language(full_md_content) if confidence > 0.7: detected_language = language_code self.logger.info(f"mineru-api: detected document language from full.md: {detected_language} (confidence: {confidence:.2f})") # Fallback to content_list markdown if no full.md or low confidence if not detected_language and markdown_content and markdown_content.strip(): from .language_detector import 
LanguageDetector language_code, confidence = LanguageDetector.detect_language(markdown_content) if confidence > 0.7: detected_language = language_code self.logger.info(f"mineru-api: detected document language from content_list: {detected_language} (confidence: {confidence:.2f})") # Create metadata metadata = { "processing_time": processing_time, "total_pages": len(page_data), "images_found": len(all_images), "tables_found": len(all_tables), "is_ppt_source": is_ppt_converted, "processing_mode": "full_document_cloud", "api_version": "v4", "api_type": "cloud", "page_data": page_data, "detected_language": detected_language # Add detected language to metadata } # Create page results for compatibility mineru_page_results = [] for page_idx, pdata in page_data.items(): mineru_page_results.append(MinerUPageResult( page_idx=page_idx, success=True, content=pdata['content'], images=pdata['images'], tables=pdata['tables'], metadata=pdata['metadata'] )) self.logger.info(f"mineru-api: full document cloud processing completed in {processing_time:.2f}s") return MinerUResult( success=True, content=markdown_content, images=all_images, tables=all_tables, metadata=metadata, page_results=mineru_page_results ) except Exception as e: self.logger.error(f"mineru-api: full document cloud processing failed: {str(e)}") raise async def _create_mineru_task_full_document(self, file_url: str, src_fileid: str) -> str: """ Create MinerU task for full document processing. 
""" if not self.session: raise RuntimeError("API client not initialized") headers = { 'Authorization': f'Bearer {self.config.mineru_api_key}', 'Content-Type': 'application/json', 'Accept': '*/*' } # Configure processing options for full document payload = { 'url': file_url, 'is_ocr': True, 'enable_formula': True, 'enable_table': True, 'language': 'auto', 'data_id': src_fileid, 'model_version': 'v1', 'extra_formats': ['html'] # Request content_list format } try: async with self.session.post( f"{self.config.mineru_api_url}/api/v4/extract/task", headers=headers, json=payload ) as response: if response.status != 200: error_text = await response.text() raise Exception(f"MinerU API error: {response.status} - {error_text}") result = await response.json() if result.get('code') != 0: raise Exception(f"MinerU API error: {result.get('msg', 'Unknown error')}") task_id = result['data']['task_id'] self.logger.info(f"mineru-api: full document task created: {task_id}") return task_id except aiohttp.ClientError as e: raise Exception(f"MinerU API connection error: {str(e)}") async def _download_and_extract_results_with_content_list(self, result_url: str, temp_dir: str, src_fileid: str) -> Tuple[str, List[str], List[Dict], Dict[int, Dict], str]: """ Download and extract MinerU processing results including content_list. 
Returns: Tuple of (markdown_content, images, tables, page_data, full_md_content) """ if not self.session: raise RuntimeError("API client not initialized") # Download ZIP file zip_path = os.path.join(temp_dir, f"mineru_result_{src_fileid}.zip") try: async with self.session.get(result_url) as response: if response.status != 200: error_text = await response.text() raise Exception(f"Download error: {response.status} - {error_text}") with open(zip_path, 'wb') as f: async for chunk in response.content.iter_chunked(8192): f.write(chunk) self.logger.info(f"mineru-api: downloaded result ZIP: {zip_path}") except aiohttp.ClientError as e: raise Exception(f"Download connection error: {str(e)}") # Extract ZIP file extract_dir = os.path.join(temp_dir, f"mineru_extracted_{src_fileid}") os.makedirs(extract_dir, exist_ok=True) with zipfile.ZipFile(zip_path, 'r') as zip_ref: zip_ref.extractall(extract_dir) self.logger.info(f"mineru-api: extracted results to: {extract_dir}") # Look for content_list file and full.md content_list = None markdown_content = "" full_md_content = "" # For language detection images = [] tables = [] for root, _, files in os.walk(extract_dir): for file in files: file_path = os.path.join(root, file) # Look for content_list JSON file if file.endswith('_content_list.json'): with open(file_path, 'r', encoding='utf-8') as f: import json content_list = json.load(f) self.logger.info(f"mineru-api: found content_list with {len(content_list)} items") elif file == 'full.md': # Read full.md for language detection with open(file_path, 'r', encoding='utf-8') as f: full_md_content = f.read() self.logger.info(f"mineru-api: found full.md with {len(full_md_content)} characters") elif file.endswith('.md') and not markdown_content: # Backup: use other markdown files if no content_list with open(file_path, 'r', encoding='utf-8') as f: markdown_content = f.read() # Parse content_list if found if content_list: markdown_content, all_images, all_tables, page_data = 
self._parse_content_list_to_markdown( content_list, temp_dir, src_fileid ) self.logger.info(f"mineru-api: all_images from content_list: {all_images[:5]}...") # Show first 5 images # Copy images referenced in content_list from images/ directory images_dir = os.path.join(extract_dir, 'images') self.logger.info(f"mineru-api: checking for images directory: {images_dir}") # List all directories in extract_dir for debugging if os.path.exists(extract_dir): self.logger.info(f"mineru-api: contents of extract_dir: {os.listdir(extract_dir)}") if os.path.exists(images_dir): self.logger.info(f"mineru-api: found images directory: {images_dir}") # List files in images directory image_files = os.listdir(images_dir) self.logger.info(f"mineru-api: found {len(image_files)} files in images directory") self.logger.info(f"mineru-api: image files in directory: {image_files[:10]}") # Show first 10 files # Copy ALL image files from images directory to temp_dir import shutil for img_file in image_files: if img_file.endswith(('.png', '.jpg', '.jpeg', '.gif', '.webp')): src_img_path = os.path.join(images_dir, img_file) dest_img_path = os.path.join(temp_dir, img_file) shutil.copy(src_img_path, dest_img_path) self.logger.info(f"mineru-api: copied image {img_file} to temp_dir") # Also try to copy specific images referenced in content_list for img_filename in all_images: # Try different possible paths and names possible_names = [ img_filename, img_filename.replace('.png', '.jpg'), img_filename.replace('.jpg', '.png'), os.path.basename(img_filename) # Just the filename without path ] copied = False for name in possible_names: src_img_path = os.path.join(images_dir, name) if os.path.exists(src_img_path): dest_img_path = os.path.join(temp_dir, img_filename) if not os.path.exists(dest_img_path): shutil.copy(src_img_path, dest_img_path) self.logger.info(f"mineru-api: copied referenced image {name} as {img_filename}") copied = True break if not copied: # Try to find similar files base_name = 
os.path.splitext(img_filename)[0] matching_files = [f for f in image_files if base_name in f] if matching_files: self.logger.warning(f"mineru-api: image {img_filename} not found, but similar files exist: {matching_files}") else: self.logger.warning(f"mineru-api: image {img_filename} not found in images dir") else: self.logger.warning(f"mineru-api: images directory not found: {images_dir}") # For single-page documents, assign unassigned images to page 0 if len(page_data) == 1 and 0 in page_data: # Check if any images are not yet assigned to pages assigned_images = set() for pd in page_data.values(): assigned_images.update(pd.get('images', [])) unassigned_images = [img for img in all_images if img not in assigned_images] if unassigned_images: self.logger.info(f"mineru-api: assigning {len(unassigned_images)} unassigned images to page 0") page_data[0]['images'].extend(unassigned_images) else: # Fallback: parse markdown to create page data self.logger.warning("mineru-api: no content_list found, using markdown fallback") page_data = self._parse_markdown_to_page_data(markdown_content) # Copy all images from extract_dir to temp_dir (without mineru_ prefix) for root, _, files in os.walk(extract_dir): for file in files: if file.endswith(('.png', '.jpg', '.jpeg', '.gif', '.webp')): file_path = os.path.join(root, file) dest_path = os.path.join(temp_dir, file) import shutil shutil.copy(file_path, dest_path) images.append(file) self.logger.info(f"mineru-api: copied image {file} to temp_dir (fallback)") all_images = images all_tables = tables # Clean up ZIP file os.remove(zip_path) self.logger.info(f"mineru-api: parsed results - {len(page_data)} pages, " f"{len(all_images)} images, {len(all_tables)} tables") return markdown_content, all_images, all_tables, page_data, full_md_content def _parse_markdown_to_page_data(self, markdown_content: str) -> Dict[int, Dict]: """ Parse markdown content to create page data structure. This is a fallback when content_list is not available. 
""" page_data = {} # Split by page markers import re page_pattern = r'## Page (\d+)' parts = re.split(page_pattern, markdown_content) if len(parts) > 1: # Skip the first part (before first page marker) for i in range(1, len(parts), 2): if i < len(parts) - 1: page_num = int(parts[i]) page_content = parts[i + 1].strip() page_idx = page_num - 1 page_data[page_idx] = { 'content': page_content, 'images': [], 'tables': [], 'metadata': {'page_num': page_num} } else: # No page markers, treat as single page page_data[0] = { 'content': markdown_content, 'images': [], 'tables': [], 'metadata': {'page_num': 1} } return page_data async def _download_and_extract_results(self, result_url: str, temp_dir: str, src_fileid: str) -> Tuple[str, List[str], List[Dict]]: """ Download and extract MinerU processing results. Args: result_url: URL to result ZIP file temp_dir: Temporary directory for extraction src_fileid: Source file ID for logging Returns: Tuple of (content, images, tables) """ if not self.session: raise RuntimeError("API client not initialized") # Download ZIP file zip_path = os.path.join(temp_dir, f"mineru_result_{src_fileid}.zip") try: async with self.session.get(result_url) as response: if response.status != 200: error_text = await response.text() raise Exception(f"Download error: {response.status} - {error_text}") with open(zip_path, 'wb') as f: async for chunk in response.content.iter_chunked(8192): f.write(chunk) self.logger.info(f"mineru-api: downloaded result ZIP: {zip_path}") except aiohttp.ClientError as e: raise Exception(f"Download connection error: {str(e)}") # Extract ZIP file extract_dir = os.path.join(temp_dir, f"mineru_extracted_{src_fileid}") os.makedirs(extract_dir, exist_ok=True) with zipfile.ZipFile(zip_path, 'r') as zip_ref: zip_ref.extractall(extract_dir) self.logger.info(f"mineru-api: extracted results to: {extract_dir}") # Parse extracted content content = "" images = [] tables = [] # Look for markdown file and other assets for root, _, files in 
os.walk(extract_dir): for file in files: file_path = os.path.join(root, file) if file.endswith('.md'): # Read markdown content with open(file_path, 'r', encoding='utf-8') as f: content = f.read() self.logger.info(f"mineru-api: loaded markdown content: {len(content)} chars") elif file.endswith(('.png', '.jpg', '.jpeg', '.gif', '.webp')): # Copy image to temp directory and add to list image_name = f"mineru_{file}" dest_path = os.path.join(temp_dir, image_name) import shutil shutil.copy(file_path, dest_path) images.append(image_name) elif file.endswith('.html'): # Parse HTML for additional table information if needed with open(file_path, 'r', encoding='utf-8') as f: html_content = f.read() # Extract table information from HTML table_count = html_content.count('
async def extract_plain_text(self, pdf_path: str, src_fileid: str) -> str:
    """
    Extract plain text from PDF using PyMuPDF.

    This provides text content for comparison with MinerU results. Pages
    whose text is empty or whitespace-only are skipped; the remaining page
    texts are joined with a blank line between them.

    NOTE: the PyMuPDF calls here are synchronous, so nothing is awaited
    while the document is being read.

    Args:
        pdf_path: Path to the PDF file to read.
        src_fileid: Source file ID (not used by this method).

    Returns:
        The joined page text, or an empty string if extraction fails for
        any reason (the error is logged, not raised).
    """
    try:
        text_parts = []
        with fitz.open(pdf_path) as doc:
            for page in doc:
                page_text = page.get_text()
                if page_text.strip():
                    text_parts.append(page_text)

        # Join with real newlines; an escaped backslash here would insert
        # the literal characters "\n\n" into the extracted text.
        plain_text = '\n\n'.join(text_parts)
        self.logger.info(f"mineru-api: extracted {len(plain_text)} characters of plain text")
        return plain_text

    except Exception as e:
        # Best-effort: callers get an empty string rather than an exception.
        self.logger.error(f"mineru-api: plain text extraction failed: {str(e)}")
        return ""


def merge_content(self, plain_text: str, mineru_content: str, src_fileid: str) -> str:
    """
    Merge plain text with MinerU structured content.

    The current strategy is a simple preference order rather than a true
    merge: MinerU content is returned whenever it is non-blank, and plain
    text is used only when the MinerU content is empty or whitespace-only.

    Args:
        plain_text: Text extracted directly from the PDF.
        mineru_content: Structured content produced by MinerU.
        src_fileid: Source file ID (not used by this method).

    Returns:
        The selected content string. If the selection itself raises (for
        example a non-string argument), whichever of ``plain_text`` or
        ``mineru_content`` is truthy is returned, preferring ``plain_text``.
    """
    try:
        # Simple merge strategy - could be enhanced with more sophisticated logic
        if not mineru_content.strip():
            self.logger.warning(f"mineru-api: MinerU content empty, using plain text")
            return plain_text

        if not plain_text.strip():
            self.logger.warning(f"mineru-api: plain text empty, using MinerU content")
            return mineru_content

        # Prefer MinerU content as it should be the more structured of the two.
        self.logger.info(f"mineru-api: using MinerU structured content")
        return mineru_content

    except Exception as e:
        self.logger.error(f"mineru-api: content merge failed: {str(e)}")
        # Fallback to plain text
        return plain_text if plain_text else mineru_content