# maxkb/apps/common/handle/impl/mineru/api_client.py
"""
MinerU API client module.
This module handles communication with the MinerU service for document parsing,
following the architecture patterns from gzero.py and implementing the real API.
"""
import os
import re
import json
import time
import base64
import shutil
import asyncio
import zipfile

import aiohttp
import fitz

from typing import Dict, List, Optional, Any, Tuple
from dataclasses import dataclass

from .config_base import MinerUConfig
from .logger import get_module_logger

logger = get_module_logger('api_client')
@dataclass
class MinerUPageResult:
"""Result from MinerU processing for a single page"""
page_idx: int
success: bool
content: str
images: List[str]
tables: List[Dict]
metadata: Dict[str, Any]
error: Optional[str] = None
@dataclass
class MinerUResult:
"""Result from MinerU processing"""
success: bool
content: str # Backward compatibility - merged content
images: List[str] # All images
tables: List[Dict] # All tables
metadata: Dict[str, Any]
error: Optional[str] = None
page_results: Optional[List[MinerUPageResult]] = None # Individual page results
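# Note (added for clarity): MinerUPageResult carries a single page, while
# MinerUResult carries the merged document plus an optional per-page list for
# callers that need page granularity. A consumption sketch (illustrative only;
# handle_page is hypothetical):
#
#     result = await client.process_document(pdf_path, temp_dir, "file-123")
#     if result.success and result.page_results:
#         for page in result.page_results:
#             handle_page(page.page_idx, page.content)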
class MinerUAPIClient:
"""Client for interacting with MinerU API"""
def __init__(self, config: MinerUConfig, platform_adapter=None):
self.config = config
self.session: Optional[aiohttp.ClientSession] = None
self.logger = logger
self.platform_adapter = platform_adapter
async def __aenter__(self):
"""Async context manager entry"""
self.session = aiohttp.ClientSession(
timeout=aiohttp.ClientTimeout(total=600) # 10 minute timeout
)
return self
async def __aexit__(self, exc_type, exc_val, exc_tb):
"""Async context manager exit"""
if self.session:
await self.session.close()
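# Usage sketch (illustrative, assuming a populated MinerUConfig): the client is
# an async context manager, so the aiohttp session is created on entry and
# closed on exit:
#
#     async def parse_pdf(config: MinerUConfig, pdf_path: str, temp_dir: str):
#         async with MinerUAPIClient(config) as client:
#             return await client.process_document(pdf_path, temp_dir, "doc-1")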
async def process_document(self, pdf_path: str, temp_dir: str, src_fileid: str,
is_ppt_converted: bool = False, batch_size: int = 20) -> MinerUResult:
"""
Process document using MinerU API with batch processing support.
Can process the entire document at once or in batches for large PDFs.
Args:
pdf_path: Path to PDF file
temp_dir: Temporary directory for processing
src_fileid: Source file ID for logging/tracing
is_ppt_converted: Whether the PDF was converted from PPT
batch_size: Maximum pages per batch (0 for no batching)
Returns:
MinerUResult with parsed content
"""
try:
# Check PDF page count
page_count = self._get_pdf_page_count(pdf_path)
self.logger.info(f"mineru-api: PDF has {page_count} pages, batch_size={batch_size}")
# Decide whether to use batch processing
if batch_size > 0 and page_count > batch_size:
self.logger.info(f"mineru-api: using batch processing (batch_size={batch_size})")
return await self._process_document_in_batches(
pdf_path, temp_dir, src_fileid, is_ppt_converted, batch_size
)
else:
self.logger.info(f"mineru-api: processing full document at once")
return await self._process_document_full(
pdf_path, temp_dir, src_fileid, is_ppt_converted
)
except Exception as e:
self.logger.error(f"mineru-api: document processing failed: {str(e)}")
return MinerUResult(
success=False,
content="",
images=[],
tables=[],
metadata={},
error=str(e)
)
async def _process_document_full(self, pdf_path: str, temp_dir: str, src_fileid: str,
is_ppt_converted: bool) -> MinerUResult:
"""Process full document at once (original implementation)."""
self.logger.info(f"mineru-api: starting full document processing")
# Choose processing method based on API type
if self.config.mineru_api_type == "self_hosted":
self.logger.info(f"mineru-api: using self-hosted MinerU API")
result = await self._process_full_document_self_hosted(pdf_path, temp_dir, src_fileid, is_ppt_converted)
elif self.config.mineru_api_key and self.config.mineru_api_key.strip():
self.logger.info(f"mineru-api: using cloud MinerU API")
result = await self._process_full_document_cloud(pdf_path, temp_dir, src_fileid, is_ppt_converted)
else:
self.logger.warning(f"mineru-api: no API configuration, using mock processing")
result = await self._mock_mineru_processing(pdf_path, temp_dir, src_fileid, is_ppt_converted)
self.logger.info(f"mineru-api: document processing completed")
return result
async def _process_document_in_batches(self, pdf_path: str, temp_dir: str, src_fileid: str,
is_ppt_converted: bool, batch_size: int) -> MinerUResult:
"""
Process document in batches of pages.
Args:
pdf_path: Path to PDF file
temp_dir: Temporary directory for processing
src_fileid: Source file ID for logging/tracing
is_ppt_converted: Whether the PDF was converted from PPT
batch_size: Maximum pages per batch
Returns:
MinerUResult with merged results from all batches
"""
try:
page_count = self._get_pdf_page_count(pdf_path)
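# Ceiling division: e.g. 45 pages with batch_size=20 -> 3 batches (20 + 20 + 5)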
num_batches = (page_count + batch_size - 1) // batch_size
self.logger.info(f"mineru-api: splitting {page_count} pages into {num_batches} batches")
# Process each batch
batch_results = []
for batch_idx in range(num_batches):
start_page = batch_idx * batch_size
end_page = min(start_page + batch_size, page_count)
self.logger.info(f"mineru-api: processing batch {batch_idx + 1}/{num_batches} "
f"(pages {start_page + 1}-{end_page})")
# Split PDF for this batch
batch_pdf_path = await self._split_pdf(
pdf_path, temp_dir, start_page, end_page, batch_idx
)
# Process batch based on API type
if self.config.mineru_api_type == "self_hosted":
batch_result = await self._process_batch_self_hosted(
batch_pdf_path, temp_dir, src_fileid, batch_idx,
start_page, end_page, is_ppt_converted
)
elif self.config.mineru_api_key and self.config.mineru_api_key.strip():
batch_result = await self._process_batch_cloud(
batch_pdf_path, temp_dir, src_fileid, batch_idx,
start_page, end_page, is_ppt_converted
)
else:
batch_result = await self._mock_mineru_processing(
batch_pdf_path, temp_dir, src_fileid, is_ppt_converted
)
if batch_result.success:
batch_results.append((start_page, batch_result))
else:
self.logger.error(f"mineru-api: batch {batch_idx + 1} failed: {batch_result.error}")
# Continue with other batches even if one fails
# Merge all batch results
return self._merge_batch_results(batch_results, page_count, is_ppt_converted)
except Exception as e:
self.logger.error(f"mineru-api: batch processing failed: {str(e)}")
raise
def _get_pdf_page_count(self, pdf_path: str) -> int:
"""Get the number of pages in a PDF."""
with fitz.open(pdf_path) as doc:
return len(doc)
async def _split_pdf(self, pdf_path: str, temp_dir: str, start_page: int,
end_page: int, batch_idx: int) -> str:
"""
Split PDF to extract specific pages for batch processing.
Args:
pdf_path: Original PDF path
temp_dir: Temporary directory
start_page: Start page index (0-based)
end_page: End page index (exclusive)
batch_idx: Batch index for naming
Returns:
Path to the split PDF file
"""
batch_pdf_path = os.path.join(temp_dir, f"batch_{batch_idx}.pdf")
with fitz.open(pdf_path) as src_doc:
batch_doc = fitz.open() # Create new PDF
# Copy pages to new document
for page_idx in range(start_page, end_page):
batch_doc.insert_pdf(src_doc, from_page=page_idx, to_page=page_idx)
batch_doc.save(batch_pdf_path)
batch_doc.close()
self.logger.info(f"mineru-api: created batch PDF with {end_page - start_page} pages: {batch_pdf_path}")
return batch_pdf_path
async def _process_full_document_self_hosted(self, pdf_path: str, temp_dir: str, src_fileid: str, is_ppt_converted: bool) -> MinerUResult:
"""
Process full PDF document at once using self-hosted MinerU API with content_list support.
This method uploads the entire document and gets back content_list for all pages.
"""
try:
self.logger.info(f"mineru-api: processing full document with self-hosted API")
if not self.session:
raise RuntimeError("API client not initialized")
start_time = asyncio.get_event_loop().time()
# Prepare multipart form data
form_data = aiohttp.FormData()
# API parameters - enable content_list for full document processing
form_data.add_field('return_middle_json', 'false')
form_data.add_field('return_model_output', 'false')
form_data.add_field('return_md', 'true')
form_data.add_field('return_images', 'true')
form_data.add_field('return_content_list', 'true') # Enable content_list
form_data.add_field('end_page_id', '99999')
form_data.add_field('parse_method', 'auto')
form_data.add_field('start_page_id', '0')
form_data.add_field('output_dir', './output')
form_data.add_field('server_url', 'string')
form_data.add_field('backend', 'pipeline')
form_data.add_field('table_enable', 'true')
form_data.add_field('formula_enable', 'true')
# Add the PDF file; read the bytes up front so the upload does not depend on
# a file handle that may already be closed when the request is sent
with open(pdf_path, 'rb') as f:
    pdf_bytes = f.read()
form_data.add_field('files', pdf_bytes, filename=os.path.basename(pdf_path), content_type='application/pdf')
# Make API request
async with self.session.post(
f"{self.config.mineru_api_url}/file_parse",
data=form_data,
headers={'accept': 'application/json'}
) as response:
if response.status != 200:
error_text = await response.text()
raise Exception(f"Self-hosted MinerU API error: {response.status} - {error_text}")
result = await response.json()
# Log the top-level keys to understand response structure
self.logger.info(f"mineru-api: response top-level keys: {list(result.keys())}")
# Extract content_list from response
results = result.get('results', {})
if not results:
raise Exception("No results in API response")
# Get the first result (should be our PDF file)
file_result = next(iter(results.values()))
self.logger.info(f"mineru-api: file_result keys: {list(file_result.keys())}")
# Extract content_list
content_list_str = file_result.get('content_list', '')
self.logger.info(f"mineru-api: content_list type: {type(content_list_str)}, length: {len(str(content_list_str))}")
if not content_list_str:
self.logger.error(f"mineru-api: No content_list in API response. File result keys: {list(file_result.keys())}")
# Log a sample of the file_result to understand what we're getting
sample_result = str(file_result)[:500] if file_result else 'None'
self.logger.error(f"mineru-api: File result sample: {sample_result}")
raise Exception("No content_list in API response")
# Parse content_list to markdown
markdown_content, all_images, all_tables, page_data = self._parse_content_list_to_markdown(
content_list_str, temp_dir, src_fileid
)
# Also get markdown content if available for language detection
md_content = file_result.get('md_content', '')
# Extract base64 images if provided
images_data = file_result.get('images', {})
if images_data and isinstance(images_data, dict):
self.logger.info(f"mineru-api: saving {len(images_data)} base64 images")
saved_images = self._save_base64_images(images_data, temp_dir, src_fileid)
self.logger.info(f"mineru-api: saved images: {saved_images}")
# Merge with images from content_list
for img in saved_images:
if img not in all_images:
all_images.append(img)
else:
self.logger.info(f"mineru-api: no base64 images in 'images' field")
# Check if there's an 'images' field at the top level of result
if 'images' in result and isinstance(result['images'], dict):
self.logger.info(f"mineru-api: found images at top level: {len(result['images'])} images")
# Save these images
for img_name, img_data in result['images'].items():
if img_name not in all_images:
# Save the image
try:
if isinstance(img_data, str) and img_data.startswith('data:'):
# Base64 encoded
saved = self._save_base64_images({img_name: img_data}, temp_dir, src_fileid)
all_images.extend(saved)
self.logger.info(f"mineru-api: saved top-level image: {img_name}")
except Exception as e:
self.logger.error(f"mineru-api: failed to save top-level image {img_name}: {e}")
processing_time = asyncio.get_event_loop().time() - start_time
# Detect language from the combined md_content
detected_language = None
if md_content and md_content.strip():
from .language_detector import LanguageDetector
language_code, confidence = LanguageDetector.detect_language(md_content)
if confidence > 0.7:
detected_language = language_code
self.logger.info(f"mineru-api: detected document language: {detected_language} (confidence: {confidence:.2f})")
# If no md_content, detect from markdown_content as fallback
if not detected_language and markdown_content and markdown_content.strip():
from .language_detector import LanguageDetector
language_code, confidence = LanguageDetector.detect_language(markdown_content)
if confidence > 0.7:
detected_language = language_code
self.logger.info(f"mineru-api: detected document language from content_list: {detected_language} (confidence: {confidence:.2f})")
# Create metadata
import json
metadata = {
"processing_time": processing_time,
"total_pages": len(page_data),
"images_found": len(all_images),
"tables_found": len(all_tables),
"is_ppt_source": is_ppt_converted,
"processing_mode": "full_document_self_hosted",
"api_version": "self_hosted",
"api_type": "self_hosted",
"page_data": page_data,
"content_list": json.loads(content_list_str) if isinstance(content_list_str, str) else content_list_str,
"detected_language": detected_language # Add detected language to metadata
}
# Create page results for compatibility
mineru_page_results = []
for page_idx, pdata in page_data.items():
mineru_page_results.append(MinerUPageResult(
page_idx=page_idx,
success=True,
content=pdata['content'],
images=pdata['images'],
tables=pdata['tables'],
metadata=pdata['metadata']
))
self.logger.info(f"mineru-api: full document self-hosted processing completed in {processing_time:.2f}s")
return MinerUResult(
success=True,
content=markdown_content,
images=all_images,
tables=all_tables,
metadata=metadata,
page_results=mineru_page_results
)
except Exception as e:
self.logger.error(f"mineru-api: full document self-hosted processing failed: {str(e)}")
raise
def _parse_self_hosted_response(self, api_response: Dict, temp_dir: str, page_id: str, page_num: int) -> Tuple[str, List[str], List[Dict]]:
"""
Parse response from self-hosted MinerU API.
Expected response format:
{
"backend": "pipeline",
"version": "2.1.10",
"results": {
"page_xxx": {
"md_content": "# Content...",
"images": {
"filename.jpg": "data:image/jpeg;base64,xxx...",
...
},
"middle_json": {...},
"model_output": {...}
}
}
}
"""
try:
content = ""
images = []
tables = []
# Extract results
results = api_response.get('results', {})
if results:
# Get the first result (should be our PDF file)
file_result = next(iter(results.values()))
# Extract markdown content
md_content = file_result.get('md_content', '')
if md_content:
content = md_content
else:
content = f"# Page {page_num}\n\nNo content extracted from self-hosted API"
# Extract images from the images field (base64 encoded)
images_data = file_result.get('images', {})
if images_data and isinstance(images_data, dict):
images = self._save_base64_images(images_data, temp_dir, page_id)
else:
# Fallback to extracting from markdown if no images field
images = self._extract_images_from_markdown(md_content, temp_dir, page_id)
# Extract table information if available in middle_json
middle_json = file_result.get('middle_json', {})
if middle_json and isinstance(middle_json, dict):
tables = self._extract_tables_from_middle_json(middle_json, page_num)
self.logger.debug(f"mineru-api: [{page_id}] parsed self-hosted response - content: {len(content)} chars, images: {len(images)}, tables: {len(tables)}")
return content, images, tables
except Exception as e:
self.logger.error(f"mineru-api: [{page_id}] failed to parse self-hosted response: {str(e)}")
return f"# Page {page_num}\n\nError parsing API response: {str(e)}", [], []
def _save_base64_images(self, images_data: Dict[str, str], temp_dir: str, page_id: str) -> List[str]:
"""
Save base64 encoded images to files.
Args:
images_data: Dictionary with image filename as key and base64 data as value
temp_dir: Directory to save images
page_id: Page identifier for logging
Returns:
List of saved image filenames
"""
import base64
saved_images = []
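# Expected input shape (per the response format documented above), e.g.:
#     {"fig1.jpg": "data:image/jpeg;base64,/9j/4AAQ..."}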
for filename, base64_data in images_data.items():
try:
# Extract base64 data (remove data URI prefix if present)
if base64_data.startswith('data:'):
# Format: data:image/jpeg;base64,xxx...
base64_data = base64_data.split(',', 1)[1]
# Decode base64 to binary
image_data = base64.b64decode(base64_data)
# Use the original filename without prefix to match content_list references
image_filename = filename
image_path = os.path.join(temp_dir, image_filename)
# Save image file
with open(image_path, 'wb') as f:
f.write(image_data)
saved_images.append(image_filename)
self.logger.info(f"mineru-api: [{page_id}] saved base64 image: {image_filename} ({len(image_data)} bytes)")
except Exception as e:
self.logger.error(f"mineru-api: [{page_id}] failed to save base64 image {filename}: {str(e)}")
return saved_images
def _extract_images_from_markdown(self, md_content: str, temp_dir: str, page_id: str) -> List[str]:
"""
Extract image references from markdown content and handle them.
Self-hosted MinerU typically includes images as markdown references like ![alt](path)
"""
import re
images = []
# Find all markdown image references
image_pattern = r'!\[([^\]]*)\]\(([^)]+)\)'
matches = re.findall(image_pattern, md_content)
for _, image_path in matches:
try:
# Handle different image path formats
if image_path.startswith('./') or image_path.startswith('../'):
# Relative path - assume it's in the same directory structure
actual_path = os.path.normpath(os.path.join(temp_dir, image_path))
elif os.path.isabs(image_path):
# Absolute path
actual_path = image_path
else:
# Relative to temp directory
actual_path = os.path.join(temp_dir, image_path)
# Check if image file exists and copy to our temp directory if needed
if os.path.exists(actual_path):
# Generate new filename for our processing
image_filename = f"self_hosted_{page_id}_{os.path.basename(image_path)}"
dest_path = os.path.join(temp_dir, image_filename)
if actual_path != dest_path:
import shutil
shutil.copy(actual_path, dest_path)
images.append(image_filename)
self.logger.debug(f"mineru-api: [{page_id}] extracted image: {image_filename}")
else:
self.logger.warning(f"mineru-api: [{page_id}] image file not found: {actual_path}")
except Exception as e:
self.logger.error(f"mineru-api: [{page_id}] error processing image {image_path}: {str(e)}")
return images
def _extract_tables_from_middle_json(self, middle_json: Dict, page_num: int) -> List[Dict]:
"""
Extract table information from middle_json if available.
"""
tables = []
try:
# This structure depends on the actual format returned by self-hosted MinerU
# Adjust based on the actual response structure
if 'tables' in middle_json:
table_data = middle_json['tables']
if isinstance(table_data, list):
for i, table in enumerate(table_data):
tables.append({
'page': page_num,
'table_id': i,
'content': str(table),
'source': 'self_hosted_middle_json'
})
elif isinstance(table_data, dict):
tables.append({
'page': page_num,
'content': str(table_data),
'source': 'self_hosted_middle_json'
})
except Exception as e:
self.logger.debug(f"mineru-api: page {page_num} table extraction from middle_json failed: {str(e)}")
return tables
async def _upload_file_to_accessible_url(self, pdf_path: str, src_fileid: str) -> str:
"""
Upload file to a publicly accessible URL for MinerU processing.
Uses platform adapter's upload_file method
"""
try:
# Use platform adapter for upload if available
if hasattr(self, 'platform_adapter') and self.platform_adapter:
# The adapter will handle the upload
return await self.platform_adapter.upload_file(pdf_path, {
'src_fileid': src_fileid
})
# Fallback: return local path if no adapter
self.logger.warning("mineru-api: no platform adapter available for upload, returning local path")
return pdf_path
except Exception as e:
self.logger.error(f"mineru-api: file upload failed: {str(e)}")
raise
async def _poll_task_completion(self, task_id: str, src_fileid: str,
max_wait_time: int = 600) -> str:
"""
Poll MinerU task until completion.
Args:
task_id: Task ID to poll
src_fileid: Source file ID for logging
max_wait_time: Maximum wait time in seconds
Returns:
Result ZIP file URL
"""
if not self.session:
raise RuntimeError("API client not initialized")
headers = {
'Authorization': f'Bearer {self.config.mineru_api_key}',
'Accept': '*/*'
}
start_time = asyncio.get_event_loop().time()
poll_interval = 5 # Start with 5 seconds
max_poll_interval = 30 # Max 30 seconds between polls
while True:
current_time = asyncio.get_event_loop().time()
if current_time - start_time > max_wait_time:
raise Exception(f"Task polling timeout after {max_wait_time} seconds")
try:
async with self.session.get(
f"{self.config.mineru_api_url}/api/v4/extract/task/{task_id}",
headers=headers
) as response:
if response.status != 200:
error_text = await response.text()
raise Exception(f"MinerU polling error: {response.status} - {error_text}")
result = await response.json()
if result.get('code') != 0:
raise Exception(f"MinerU polling error: {result.get('msg', 'Unknown error')}")
data = result['data']
state = data['state']
if state == 'done':
full_zip_url = data['full_zip_url']
self.logger.info(f"mineru-api: task completed: {task_id}")
return full_zip_url
elif state == 'failed':
error_msg = data.get('err_msg', 'Unknown error')
raise Exception(f"MinerU task failed: {error_msg}")
elif state in ['pending', 'running', 'converting']:
# Log progress if available
if 'extract_progress' in data:
progress = data['extract_progress']
extracted = progress.get('extracted_pages', 0)
total = progress.get('total_pages', 0)
start_time_str = progress.get('start_time', 'N/A')
self.logger.info(f"mineru-api: task {state}: {extracted}/{total} pages (started: {start_time_str})")
else:
self.logger.info(f"mineru-api: task {state}")
# Wait before next poll
await asyncio.sleep(poll_interval)
# Gradually increase poll interval
poll_interval = min(poll_interval * 1.2, max_poll_interval)
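# e.g. 5.0s -> 6.0s -> 7.2s -> 8.64s -> ... capped at 30s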
else:
raise Exception(f"Unknown task state: {state}")
except aiohttp.ClientError as e:
self.logger.warning(f"mineru-api: polling connection error: {str(e)}")
await asyncio.sleep(poll_interval)
async def _process_full_document_cloud(self, pdf_path: str, temp_dir: str, src_fileid: str, is_ppt_converted: bool) -> MinerUResult:
"""
Process full PDF document at once using cloud MinerU API.
This method uploads the entire document and extracts content_list from the result.
"""
try:
self.logger.info(f"mineru-api: processing full document with cloud API")
start_time = asyncio.get_event_loop().time()
# Step 1: Upload file to accessible URL
file_url = await self._upload_file_to_accessible_url(pdf_path, src_fileid)
self.logger.info(f"mineru-api: uploaded file URL: {file_url}")
if not file_url.startswith(('http://', 'https://')):
self.logger.warning(f"mineru-api: URL may not be valid for Cloud API: {file_url}")
# Step 2: Create task for full document
task_id = await self._create_mineru_task_full_document(file_url, src_fileid)
# Step 3: Poll for completion
result_url = await self._poll_task_completion(task_id, src_fileid, max_wait_time=900) # 15 min for full doc
# Step 4: Download and extract results with content_list
markdown_content, all_images, all_tables, page_data, full_md_content = await self._download_and_extract_results_with_content_list(
result_url, temp_dir, src_fileid
)
processing_time = asyncio.get_event_loop().time() - start_time
# Detect language from full.md first, then fallback to markdown_content
detected_language = None
if full_md_content and full_md_content.strip():
from .language_detector import LanguageDetector
language_code, confidence = LanguageDetector.detect_language(full_md_content)
if confidence > 0.7:
detected_language = language_code
self.logger.info(f"mineru-api: detected document language from full.md: {detected_language} (confidence: {confidence:.2f})")
# Fallback to content_list markdown if no full.md or low confidence
if not detected_language and markdown_content and markdown_content.strip():
from .language_detector import LanguageDetector
language_code, confidence = LanguageDetector.detect_language(markdown_content)
if confidence > 0.7:
detected_language = language_code
self.logger.info(f"mineru-api: detected document language from content_list: {detected_language} (confidence: {confidence:.2f})")
# Create metadata
metadata = {
"processing_time": processing_time,
"total_pages": len(page_data),
"images_found": len(all_images),
"tables_found": len(all_tables),
"is_ppt_source": is_ppt_converted,
"processing_mode": "full_document_cloud",
"api_version": "v4",
"api_type": "cloud",
"page_data": page_data,
"detected_language": detected_language # Add detected language to metadata
}
# Create page results for compatibility
mineru_page_results = []
for page_idx, pdata in page_data.items():
mineru_page_results.append(MinerUPageResult(
page_idx=page_idx,
success=True,
content=pdata['content'],
images=pdata['images'],
tables=pdata['tables'],
metadata=pdata['metadata']
))
self.logger.info(f"mineru-api: full document cloud processing completed in {processing_time:.2f}s")
return MinerUResult(
success=True,
content=markdown_content,
images=all_images,
tables=all_tables,
metadata=metadata,
page_results=mineru_page_results
)
except Exception as e:
self.logger.error(f"mineru-api: full document cloud processing failed: {str(e)}")
raise
async def _create_mineru_task_full_document(self, file_url: str, src_fileid: str) -> str:
"""
Create MinerU task for full document processing.
"""
if not self.session:
raise RuntimeError("API client not initialized")
headers = {
'Authorization': f'Bearer {self.config.mineru_api_key}',
'Content-Type': 'application/json',
'Accept': '*/*'
}
# Configure processing options for full document
payload = {
'url': file_url,
'is_ocr': True,
'enable_formula': True,
'enable_table': True,
'language': 'auto',
'data_id': src_fileid,
'model_version': 'v1',
'extra_formats': ['html']  # Request additional HTML output alongside the default formats
}
try:
async with self.session.post(
f"{self.config.mineru_api_url}/api/v4/extract/task",
headers=headers,
json=payload
) as response:
if response.status != 200:
error_text = await response.text()
raise Exception(f"MinerU API error: {response.status} - {error_text}")
result = await response.json()
if result.get('code') != 0:
raise Exception(f"MinerU API error: {result.get('msg', 'Unknown error')}")
task_id = result['data']['task_id']
self.logger.info(f"mineru-api: full document task created: {task_id}")
return task_id
except aiohttp.ClientError as e:
raise Exception(f"MinerU API connection error: {str(e)}")
async def _download_and_extract_results_with_content_list(self, result_url: str, temp_dir: str,
src_fileid: str) -> Tuple[str, List[str], List[Dict], Dict[int, Dict], str]:
"""
Download and extract MinerU processing results including content_list.
Returns:
Tuple of (markdown_content, images, tables, page_data, full_md_content)
"""
if not self.session:
raise RuntimeError("API client not initialized")
# Download ZIP file
zip_path = os.path.join(temp_dir, f"mineru_result_{src_fileid}.zip")
try:
async with self.session.get(result_url) as response:
if response.status != 200:
error_text = await response.text()
raise Exception(f"Download error: {response.status} - {error_text}")
with open(zip_path, 'wb') as f:
async for chunk in response.content.iter_chunked(8192):
f.write(chunk)
self.logger.info(f"mineru-api: downloaded result ZIP: {zip_path}")
except aiohttp.ClientError as e:
raise Exception(f"Download connection error: {str(e)}")
# Extract ZIP file
extract_dir = os.path.join(temp_dir, f"mineru_extracted_{src_fileid}")
os.makedirs(extract_dir, exist_ok=True)
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
zip_ref.extractall(extract_dir)
self.logger.info(f"mineru-api: extracted results to: {extract_dir}")
# Look for content_list file and full.md
content_list = None
markdown_content = ""
full_md_content = "" # For language detection
images = []
tables = []
for root, _, files in os.walk(extract_dir):
for file in files:
file_path = os.path.join(root, file)
# Look for content_list JSON file
if file.endswith('_content_list.json'):
with open(file_path, 'r', encoding='utf-8') as f:
import json
content_list = json.load(f)
self.logger.info(f"mineru-api: found content_list with {len(content_list)} items")
elif file == 'full.md':
# Read full.md for language detection
with open(file_path, 'r', encoding='utf-8') as f:
full_md_content = f.read()
self.logger.info(f"mineru-api: found full.md with {len(full_md_content)} characters")
elif file.endswith('.md') and not markdown_content:
# Backup: use other markdown files if no content_list
with open(file_path, 'r', encoding='utf-8') as f:
markdown_content = f.read()
# Parse content_list if found
if content_list:
markdown_content, all_images, all_tables, page_data = self._parse_content_list_to_markdown(
content_list, temp_dir, src_fileid
)
self.logger.info(f"mineru-api: all_images from content_list: {all_images[:5]}...") # Show first 5 images
# Copy images referenced in content_list from images/ directory
images_dir = os.path.join(extract_dir, 'images')
self.logger.info(f"mineru-api: checking for images directory: {images_dir}")
# List all directories in extract_dir for debugging
if os.path.exists(extract_dir):
self.logger.info(f"mineru-api: contents of extract_dir: {os.listdir(extract_dir)}")
if os.path.exists(images_dir):
self.logger.info(f"mineru-api: found images directory: {images_dir}")
# List files in images directory
image_files = os.listdir(images_dir)
self.logger.info(f"mineru-api: found {len(image_files)} files in images directory")
self.logger.info(f"mineru-api: image files in directory: {image_files[:10]}") # Show first 10 files
# Copy ALL image files from images directory to temp_dir
import shutil
for img_file in image_files:
if img_file.endswith(('.png', '.jpg', '.jpeg', '.gif', '.webp')):
src_img_path = os.path.join(images_dir, img_file)
dest_img_path = os.path.join(temp_dir, img_file)
shutil.copy(src_img_path, dest_img_path)
self.logger.info(f"mineru-api: copied image {img_file} to temp_dir")
# Also try to copy specific images referenced in content_list
for img_filename in all_images:
# Try different possible paths and names
possible_names = [
img_filename,
img_filename.replace('.png', '.jpg'),
img_filename.replace('.jpg', '.png'),
os.path.basename(img_filename) # Just the filename without path
]
copied = False
for name in possible_names:
src_img_path = os.path.join(images_dir, name)
if os.path.exists(src_img_path):
dest_img_path = os.path.join(temp_dir, img_filename)
if not os.path.exists(dest_img_path):
shutil.copy(src_img_path, dest_img_path)
self.logger.info(f"mineru-api: copied referenced image {name} as {img_filename}")
copied = True
break
if not copied:
# Try to find similar files
base_name = os.path.splitext(img_filename)[0]
matching_files = [f for f in image_files if base_name in f]
if matching_files:
self.logger.warning(f"mineru-api: image {img_filename} not found, but similar files exist: {matching_files}")
else:
self.logger.warning(f"mineru-api: image {img_filename} not found in images dir")
else:
self.logger.warning(f"mineru-api: images directory not found: {images_dir}")
# For single-page documents, assign unassigned images to page 0
if len(page_data) == 1 and 0 in page_data:
# Check if any images are not yet assigned to pages
assigned_images = set()
for pd in page_data.values():
assigned_images.update(pd.get('images', []))
unassigned_images = [img for img in all_images if img not in assigned_images]
if unassigned_images:
self.logger.info(f"mineru-api: assigning {len(unassigned_images)} unassigned images to page 0")
page_data[0]['images'].extend(unassigned_images)
else:
# Fallback: parse markdown to create page data
self.logger.warning("mineru-api: no content_list found, using markdown fallback")
page_data = self._parse_markdown_to_page_data(markdown_content)
# Copy all images from extract_dir to temp_dir (without mineru_ prefix)
for root, _, files in os.walk(extract_dir):
for file in files:
if file.endswith(('.png', '.jpg', '.jpeg', '.gif', '.webp')):
file_path = os.path.join(root, file)
dest_path = os.path.join(temp_dir, file)
import shutil
shutil.copy(file_path, dest_path)
images.append(file)
self.logger.info(f"mineru-api: copied image {file} to temp_dir (fallback)")
all_images = images
all_tables = tables
# Clean up ZIP file
os.remove(zip_path)
self.logger.info(f"mineru-api: parsed results - {len(page_data)} pages, "
f"{len(all_images)} images, {len(all_tables)} tables")
return markdown_content, all_images, all_tables, page_data, full_md_content
def _parse_markdown_to_page_data(self, markdown_content: str) -> Dict[int, Dict]:
"""
Parse markdown content to create page data structure.
This is a fallback when content_list is not available.
"""
page_data = {}
# Split by page markers
import re
page_pattern = r'## Page (\d+)'
parts = re.split(page_pattern, markdown_content)
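# re.split with a capture group keeps the captured page numbers, e.g.
# ['<preamble>', '1', '<page 1 text>', '2', '<page 2 text>', ...]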
if len(parts) > 1:
# Skip the first part (before first page marker)
for i in range(1, len(parts), 2):
if i < len(parts) - 1:
page_num = int(parts[i])
page_content = parts[i + 1].strip()
page_idx = page_num - 1
page_data[page_idx] = {
'content': page_content,
'images': [],
'tables': [],
'metadata': {'page_num': page_num}
}
else:
# No page markers, treat as single page
page_data[0] = {
'content': markdown_content,
'images': [],
'tables': [],
'metadata': {'page_num': 1}
}
return page_data
async def _download_and_extract_results(self, result_url: str, temp_dir: str,
src_fileid: str) -> Tuple[str, List[str], List[Dict]]:
"""
Download and extract MinerU processing results.
Args:
result_url: URL to result ZIP file
temp_dir: Temporary directory for extraction
src_fileid: Source file ID for logging
Returns:
Tuple of (content, images, tables)
"""
if not self.session:
raise RuntimeError("API client not initialized")
# Download ZIP file
zip_path = os.path.join(temp_dir, f"mineru_result_{src_fileid}.zip")
try:
async with self.session.get(result_url) as response:
if response.status != 200:
error_text = await response.text()
raise Exception(f"Download error: {response.status} - {error_text}")
with open(zip_path, 'wb') as f:
async for chunk in response.content.iter_chunked(8192):
f.write(chunk)
self.logger.info(f"mineru-api: downloaded result ZIP: {zip_path}")
except aiohttp.ClientError as e:
raise Exception(f"Download connection error: {str(e)}")
# Extract ZIP file
extract_dir = os.path.join(temp_dir, f"mineru_extracted_{src_fileid}")
os.makedirs(extract_dir, exist_ok=True)
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
zip_ref.extractall(extract_dir)
self.logger.info(f"mineru-api: extracted results to: {extract_dir}")
# Parse extracted content
content = ""
images = []
tables = []
# Look for markdown file and other assets
for root, _, files in os.walk(extract_dir):
for file in files:
file_path = os.path.join(root, file)
if file.endswith('.md'):
# Read markdown content
with open(file_path, 'r', encoding='utf-8') as f:
content = f.read()
self.logger.info(f"mineru-api: loaded markdown content: {len(content)} chars")
elif file.endswith(('.png', '.jpg', '.jpeg', '.gif', '.webp')):
# Copy image to temp directory and add to list
image_name = f"mineru_{file}"
dest_path = os.path.join(temp_dir, image_name)
import shutil
shutil.copy(file_path, dest_path)
images.append(image_name)
elif file.endswith('.html'):
# Parse HTML for additional table information if needed
with open(file_path, 'r', encoding='utf-8') as f:
html_content = f.read()
# Extract table information from HTML
table_count = html_content.count('<table')
if table_count > 0:
tables.append({
"source": "html",
"table_count": table_count,
"content": "Tables extracted from HTML format"
})
# Clean up ZIP file
os.remove(zip_path)
self.logger.info(f"mineru-api: parsed results - content: {len(content)} chars, images: {len(images)}, tables: {len(tables)}")
return content, images, tables
async def _mock_mineru_processing(self, pdf_path: str, temp_dir: str,
src_fileid: str, is_ppt_converted: bool) -> MinerUResult:
"""
Mock MinerU processing for development/testing.
Provides realistic output structure for development without API calls.
"""
try:
# Simulate processing delay
await asyncio.sleep(0.5)
# Extract basic information from PDF for mock response
content_parts = []
images = []
tables = []
content_list = [] # For context extraction
with fitz.open(pdf_path) as doc:
for page_num, page in enumerate(doc):
# Extract text
page_text = page.get_text()
if page_text.strip():
content_parts.append(f"## Page {page_num + 1}\n\n{page_text}\n")
# Add to content_list
content_list.append({
'page_idx': page_num,
'type': 'text',
'text': page_text,
'metadata': {}
})
# Mock image extraction (would be done by MinerU)
for img in page.get_images(full=True):
xref = img[0]
bbox = page.get_image_bbox(img)
if bbox.width > 0 and bbox.height > 0:
image_filename = f"mineru_image_{xref}.png"
image_path = os.path.join(temp_dir, image_filename)
# Extract and save image
try:
pix = fitz.Pixmap(doc, xref)
if pix.n - pix.alpha < 4: # GRAY or RGB
pix.save(image_path)
images.append(image_filename)
# Add to content_list
content_list.append({
'page_idx': page_num,
'type': 'image',
'img_path': image_filename,
'metadata': {}
})
pix = None # Free memory
except Exception:
pass
# Mock table detection
if "table" in page_text.lower() or "|" in page_text:
tables.append({
"page": page_num,
"content": "Mock table content detected",
"bbox": [0, 0, page.rect.width, page.rect.height]
})
# Add to content_list
content_list.append({
'page_idx': page_num,
'type': 'table',
'content': 'Mock table content',
'metadata': {}
})
# Combine content with mock markdown structure
mock_content = self._create_mock_markdown_content(content_parts, images)
# Detect language from the combined content
detected_language = None
if mock_content and mock_content.strip():
from .language_detector import LanguageDetector
language_code, confidence = LanguageDetector.detect_language(mock_content)
if confidence > 0.7:
detected_language = language_code
self.logger.info(f"mineru-api: detected document language (mock): {detected_language} (confidence: {confidence:.2f})")
# Mock metadata
metadata = {
"processing_time": 0.5,
"pages_processed": len(doc),
"images_found": len(images),
"tables_found": len(tables),
"is_ppt_source": is_ppt_converted,
"api_version": "mock",
"content_list": content_list, # Include content list for context extraction
"detected_language": detected_language # Add detected language to metadata
}
self.logger.info(f"mineru-api: mock processing complete: {metadata}")
return MinerUResult(
success=True,
content=mock_content,
images=images,
tables=tables,
metadata=metadata
)
except Exception as e:
self.logger.error(f"mineru-api: mock processing error: {str(e)}")
raise
def _create_mock_markdown_content(self, content_parts: List[str], images: List[str]) -> str:
"""
Create mock markdown content that simulates MinerU output structure.
"""
mock_parts = []
# Add document header
mock_parts.append("# Document Content (MinerU Mock)")
mock_parts.append("")
# Add content parts
for part in content_parts:
mock_parts.append(part)
# Add image references
if images:
mock_parts.append("## Images")
mock_parts.append("")
for img in images:
mock_parts.append(f"![Image](./{img})")
mock_parts.append("")
return "\n".join(mock_parts)
async def _retry_with_backoff(self, func, *args, **kwargs):
"""
Execute a function with exponential backoff retry logic.
Args:
func: Async function to execute
*args: Positional arguments for the function
**kwargs: Keyword arguments for the function
Returns:
Result from the function
Raises:
Exception from the last retry attempt
"""
max_retries = self.config.api_max_retries
retry_delay = self.config.api_retry_delay
backoff = self.config.api_retry_backoff
max_delay = self.config.api_retry_max_delay
retry_on_errors = self.config.retry_on_errors
last_exception = None
for attempt in range(max_retries + 1):
try:
result = await func(*args, **kwargs)
# Check if result indicates success
if hasattr(result, 'success') and not result.success:
# Check if this is a retryable error
if result.error and retry_on_errors:
should_retry = any(err_type in str(result.error) for err_type in retry_on_errors)
if not should_retry:
self.logger.warning(f"Non-retryable error: {result.error}")
return result
if attempt < max_retries:
self.logger.warning(f"API call failed (attempt {attempt + 1}/{max_retries + 1}): {result.error}")
last_exception = Exception(result.error or "API call failed")
else:
return result
else:
# Success
if attempt > 0:
self.logger.info(f"API call succeeded after {attempt + 1} attempts")
return result
except (aiohttp.ClientError, asyncio.TimeoutError, ConnectionError) as e:
# Network-related errors are always retryable
last_exception = e
if attempt < max_retries:
self.logger.warning(f"Network error (attempt {attempt + 1}/{max_retries + 1}): {str(e)}")
else:
self.logger.error(f"Network error after {max_retries + 1} attempts: {str(e)}")
raise
except Exception as e:
# Check if this is a retryable error type
if retry_on_errors:
should_retry = any(err_type in str(e) for err_type in retry_on_errors)
if not should_retry:
self.logger.error(f"Non-retryable error: {str(e)}")
raise
last_exception = e
if attempt < max_retries:
self.logger.warning(f"API error (attempt {attempt + 1}/{max_retries + 1}): {str(e)}")
else:
self.logger.error(f"API error after {max_retries + 1} attempts: {str(e)}")
raise
# If we need to retry, wait with exponential backoff
if attempt < max_retries:
delay = min(retry_delay * (backoff ** attempt), max_delay)
self.logger.info(f"Retrying in {delay:.1f} seconds...")
await asyncio.sleep(delay)
# Should not reach here, but just in case
if last_exception:
raise last_exception
else:
raise Exception("Maximum retries exceeded")
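# Usage sketch (illustrative; note the batch methods below deliberately bypass
# this wrapper): any async API call can be wrapped, e.g.
#
#     result = await self._retry_with_backoff(
#         self._process_document_full, pdf_path, temp_dir, src_fileid, False
#     )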
async def _process_batch_self_hosted_impl(self, batch_pdf_path: str, temp_dir: str, src_fileid: str,
batch_idx: int, start_page: int, end_page: int,
is_ppt_converted: bool) -> MinerUResult:
"""
Process a batch of pages using self-hosted MinerU API.
Args:
batch_pdf_path: Path to batch PDF file
temp_dir: Temporary directory
src_fileid: Source file ID
batch_idx: Batch index
start_page: Original start page index
end_page: Original end page index
is_ppt_converted: Whether PDF is from PPT
Returns:
MinerUResult for this batch
"""
try:
if not self.session:
raise RuntimeError("API client not initialized")
# Prepare multipart form data
form_data = aiohttp.FormData()
# API parameters
form_data.add_field('return_middle_json', 'false')
form_data.add_field('return_model_output', 'false')
form_data.add_field('return_md', 'true')
form_data.add_field('return_images', 'true')
form_data.add_field('return_content_list', 'true')
form_data.add_field('end_page_id', str(end_page - start_page))
form_data.add_field('parse_method', 'auto')
form_data.add_field('start_page_id', '0')
form_data.add_field('output_dir', './output')
form_data.add_field('server_url', 'string')
form_data.add_field('backend', 'pipeline')
form_data.add_field('table_enable', 'true')
form_data.add_field('formula_enable', 'true')
# Add the batch PDF file; read the bytes up front so the upload does not
# depend on a file handle that may already be closed when the request is sent
with open(batch_pdf_path, 'rb') as f:
    pdf_bytes = f.read()
form_data.add_field('files', pdf_bytes, filename=f"batch_{batch_idx}.pdf",
content_type='application/pdf')
# Make API request
async with self.session.post(
f"{self.config.mineru_api_url}/file_parse",
data=form_data,
headers={'accept': 'application/json'}
) as response:
if response.status != 200:
error_text = await response.text()
raise Exception(f"Self-hosted MinerU API error: {response.status} - {error_text}")
result = await response.json()
# Process the batch result
results = result.get('results', {})
if not results:
raise Exception("No results in API response")
file_result = next(iter(results.values()))
content_list_str = file_result.get('content_list', '')
if not content_list_str:
raise Exception("No content_list in API response")
# Parse content_list with adjusted page indices
markdown_content, images, tables, page_data = self._parse_content_list_to_markdown_batch(
content_list_str, temp_dir, src_fileid, start_page
)
# Save batch images if provided
images_data = file_result.get('images', {})
if images_data and isinstance(images_data, dict):
saved_images = self._save_base64_images(images_data, temp_dir,
f"{src_fileid}_batch_{batch_idx}")
images.extend([img for img in saved_images if img not in images])
metadata = {
"batch_idx": batch_idx,
"start_page": start_page,
"end_page": end_page,
"pages_in_batch": end_page - start_page,
"is_ppt_source": is_ppt_converted,
"page_data": page_data # Add page_data so it can be extracted per page
}
return MinerUResult(
success=True,
content=markdown_content,
images=images,
tables=tables,
metadata=metadata,
page_results=None
)
except Exception as e:
self.logger.error(f"mineru-api: batch {batch_idx} processing failed: {str(e)}")
return MinerUResult(
success=False,
content="",
images=[],
tables=[],
metadata={"batch_idx": batch_idx, "error": str(e)},
error=str(e)
)
async def _process_batch_self_hosted(self, batch_pdf_path: str, temp_dir: str, src_fileid: str,
batch_idx: int, start_page: int, end_page: int,
is_ppt_converted: bool) -> MinerUResult:
"""
Process a batch WITHOUT retry logic.
Batch failures fall back to single-page processing, where the retry happens.
"""
# Direct call without retry wrapper
return await self._process_batch_self_hosted_impl(
batch_pdf_path, temp_dir, src_fileid, batch_idx, start_page, end_page, is_ppt_converted
)
async def _process_batch_cloud_impl(self, batch_pdf_path: str, temp_dir: str, src_fileid: str,
batch_idx: int, start_page: int, end_page: int,
is_ppt_converted: bool) -> MinerUResult:
"""
Implementation of batch processing using cloud MinerU API.
"""
try:
# Upload batch PDF
file_url = await self._upload_file_to_accessible_url(batch_pdf_path,
f"{src_fileid}_batch_{batch_idx}")
# Create task for batch
task_id = await self._create_mineru_task_full_document(file_url,
f"{src_fileid}_batch_{batch_idx}")
# Poll for completion
result_url = await self._poll_task_completion(task_id, src_fileid, max_wait_time=300)
# Download and extract results
markdown_content, images, tables, page_data, _ = await self._download_and_extract_results_with_content_list(
result_url, temp_dir, f"{src_fileid}_batch_{batch_idx}"
)
# Adjust page indices to match original document
adjusted_page_data = {}
for page_idx, pdata in page_data.items():
adjusted_idx = page_idx + start_page
adjusted_page_data[adjusted_idx] = pdata
adjusted_page_data[adjusted_idx]['metadata']['original_page_num'] = adjusted_idx + 1
metadata = {
"batch_idx": batch_idx,
"start_page": start_page,
"end_page": end_page,
"pages_in_batch": end_page - start_page,
"is_ppt_source": is_ppt_converted,
"page_data": adjusted_page_data
}
return MinerUResult(
success=True,
content=markdown_content,
images=images,
tables=tables,
metadata=metadata,
page_results=None
)
except Exception as e:
self.logger.error(f"mineru-api: cloud batch {batch_idx} processing failed: {str(e)}")
return MinerUResult(
success=False,
content="",
images=[],
tables=[],
metadata={"batch_idx": batch_idx, "error": str(e)},
error=str(e)
)
async def _process_batch_cloud(self, batch_pdf_path: str, temp_dir: str, src_fileid: str,
batch_idx: int, start_page: int, end_page: int,
is_ppt_converted: bool) -> MinerUResult:
"""
Process a batch using the cloud API WITHOUT retry logic.
Batch failures fall back to single-page processing, where the retry happens.
"""
# Direct call without retry wrapper
return await self._process_batch_cloud_impl(
batch_pdf_path, temp_dir, src_fileid, batch_idx, start_page, end_page, is_ppt_converted
)
def _merge_batch_results(self, batch_results: List[Tuple[int, MinerUResult]],
total_pages: int, is_ppt_converted: bool) -> MinerUResult:
"""
Merge results from multiple batches into a single MinerUResult.
Args:
batch_results: List of (start_page, MinerUResult) tuples
total_pages: Total number of pages in original document
is_ppt_converted: Whether the PDF was converted from PPT
Returns:
Merged MinerUResult
"""
if not batch_results:
return MinerUResult(
success=False,
content="",
images=[],
tables=[],
metadata={},
error="No successful batches"
)
# Sort batches by start page
batch_results.sort(key=lambda x: x[0])
# Merge content
merged_content_parts = []
all_images = []
all_tables = []
all_page_data = {}
all_page_results = []
for start_page, batch_result in batch_results:
# Add batch content
merged_content_parts.append(batch_result.content)
# Collect images (avoid duplicates)
for img in batch_result.images:
if img not in all_images:
all_images.append(img)
# Collect tables
all_tables.extend(batch_result.tables)
# Merge page data if available
if batch_result.metadata and 'page_data' in batch_result.metadata:
all_page_data.update(batch_result.metadata['page_data'])
# Create page results if needed
if batch_result.page_results:
all_page_results.extend(batch_result.page_results)
# Join content with page separators
merged_content = "\n\n".join(merged_content_parts)
# Create merged metadata
merged_metadata = {
"processing_mode": "batch_processing",
"total_pages": total_pages,
"batch_count": len(batch_results),
"images_found": len(all_images),
"tables_found": len(all_tables),
"is_ppt_source": is_ppt_converted,
"page_data": all_page_data if all_page_data else None
}
self.logger.info(f"mineru-api: merged {len(batch_results)} batches - "
f"{len(all_images)} images, {len(all_tables)} tables")
return MinerUResult(
success=True,
content=merged_content,
images=all_images,
tables=all_tables,
metadata=merged_metadata,
page_results=all_page_results if all_page_results else None
)
def _parse_content_list_to_markdown_batch(self, content_list: Any, temp_dir: str,
src_fileid: str, page_offset: int) -> Tuple[str, List[str], List[Dict], Dict[int, Dict]]:
"""
Parse content_list for a batch with page offset adjustment.
Args:
content_list: Content list from API
temp_dir: Temporary directory
src_fileid: Source file ID
page_offset: Offset to add to page indices
Returns:
Tuple of (markdown, images, tables, page_data)
"""
# Parse normally first
markdown, images, tables, page_data = self._parse_content_list_to_markdown(
content_list, temp_dir, src_fileid
)
# Adjust page indices in page_data
adjusted_page_data = {}
for page_idx, pdata in page_data.items():
adjusted_idx = page_idx + page_offset
adjusted_page_data[adjusted_idx] = pdata
# Update page number in metadata
if 'metadata' in pdata:
pdata['metadata']['page_num'] = adjusted_idx + 1
# Adjust page numbers in tables
for table in tables:
if 'page' in table:
table['page'] += page_offset
return markdown, images, tables, adjusted_page_data
def _parse_content_list_to_markdown(self, content_list: List[Dict], temp_dir: str, src_fileid: str) -> Tuple[str, List[str], List[Dict], Dict[int, Dict]]:
"""
Parse content_list JSON format to markdown organized by pages.
Args:
content_list: List of content items with page_idx, type, and content
temp_dir: Temporary directory for saving images
src_fileid: Source file ID for logging
Returns:
Tuple of (markdown_content, image_list, table_list, page_data_dict)
"""
try:
import json
import base64
# If content_list is a string, parse it as JSON
if isinstance(content_list, str):
self.logger.info(f"mineru-api: Parsing content_list string of length {len(content_list)}")
try:
content_list = json.loads(content_list)
self.logger.info(f"mineru-api: Parsed content_list to {type(content_list)} with {len(content_list) if isinstance(content_list, list) else 'N/A'} items")
except json.JSONDecodeError as e:
self.logger.error(f"mineru-api: Failed to parse content_list JSON: {str(e)}")
self.logger.error(f"mineru-api: Content_list sample: {content_list[:500]}")
raise
# Log content_list structure
if isinstance(content_list, list):
self.logger.info(f"mineru-api: Content list has {len(content_list)} items")
if content_list:
self.logger.debug(f"mineru-api: First item sample: {content_list[0] if content_list else 'None'}")
else:
self.logger.warning(f"mineru-api: Content list is not a list, type: {type(content_list)}")
# Group content by page
page_groups = {}
for item in content_list:
page_idx = item.get('page_idx', 0)
if page_idx not in page_groups:
page_groups[page_idx] = []
page_groups[page_idx].append(item)
# Sort pages
sorted_pages = sorted(page_groups.keys())
# Build markdown and collect resources
markdown_parts = []
all_images = []
all_tables = []
page_data = {}
for page_idx in sorted_pages:
page_num = page_idx + 1 # Convert to 1-based
page_items = page_groups[page_idx]
# Page header - only add if there's content
# We'll add this after checking for content
page_content_parts = []
page_images = []
page_tables = []
for item in page_items:
item_type = item.get('type', 'text')
if item_type == 'text':
text = item.get('text', '').strip()
text_level = item.get('text_level', 0)
if text:
# Apply heading levels
if text_level > 0:
# Convert to markdown heading
heading_prefix = '#' * min(text_level, 6)
page_content_parts.append(f"{heading_prefix} {text}")
else:
page_content_parts.append(text)
elif item_type == 'image':
img_path = item.get('img_path', '')
img_caption = item.get('img_caption', [])
img_footnote = item.get('img_footnote', [])
self.logger.info(f"mineru-api: processing image item - img_path: {img_path[:100] if img_path else 'None'}")
# Handle image path/data
if img_path:
if img_path.startswith('data:'):
# Base64 encoded image
try:
# Extract format and data
header, data = img_path.split(',', 1)
fmt = header.split('/')[1].split(';')[0]
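# e.g. "data:image/png;base64,...." -> fmt == "png"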
# Decode and save
img_data = base64.b64decode(data)
img_filename = f"content_list_img_{src_fileid}_p{page_num}_{len(page_images)}.{fmt}"
img_file_path = os.path.join(temp_dir, img_filename)
with open(img_file_path, 'wb') as f:
f.write(img_data)
page_images.append(img_filename)
all_images.append(img_filename)
# Add to markdown
page_content_parts.append(f"![Image]({img_filename})")
except Exception as e:
self.logger.error(f"Failed to decode base64 image: {str(e)}")
else:
# Regular image path - need to check if it's a file that needs to be created
img_filename = os.path.basename(img_path)
# For self-hosted API, images might be referenced but not yet saved
# We'll add them to the list and expect them to be in the 'images' field
page_images.append(img_filename)
all_images.append(img_filename)
# Use relative path for image reference
img_ref = f"images/{img_filename}" if not img_path.startswith('images/') else img_path
page_content_parts.append(f"![Image]({img_ref})")
self.logger.info(f"mineru-api: added image reference: ![Image]({img_ref})")
self.logger.info(f"mineru-api: expecting image file: {img_filename} to be provided by API")
# Add captions if present
if img_caption:
caption_text = ' '.join(img_caption)
page_content_parts.append(f"*{caption_text}*")
if img_footnote:
footnote_text = ' '.join(img_footnote)
page_content_parts.append(f"**Note:** {footnote_text}")
elif item_type == 'table':
table_body = item.get('table_body', '')
table_caption = item.get('table_caption', [])
table_footnote = item.get('table_footnote', [])
# Add table caption
if table_caption:
caption_text = ' '.join(table_caption)
page_content_parts.append(f"**{caption_text}**")
# Add table content
if table_body:
# If HTML table, add directly
if table_body.strip().startswith('<'):
page_content_parts.append(table_body)
else:
page_content_parts.append(f"```\n{table_body}\n```")
# Add footnote
if table_footnote:
footnote_text = ' '.join(table_footnote)
page_content_parts.append(f"*Note: {footnote_text}*")
# Store table data
table_data = {
'page': page_num,
'content': table_body,
'caption': table_caption,
'footnote': table_footnote
}
page_tables.append(table_data)
all_tables.append(table_data)
elif item_type == 'equation':
eq_text = item.get('text', '')
eq_format = item.get('text_format', 'latex')
if eq_text:
if eq_format == 'latex':
# Use display math for equations
page_content_parts.append(f"$$\n{eq_text}\n$$")
else:
page_content_parts.append(f"```{eq_format}\n{eq_text}\n```")
# Combine page content
# Filter out empty parts to avoid excessive newlines
non_empty_parts = [part for part in page_content_parts if part.strip()]
page_content = '\n'.join(non_empty_parts) if non_empty_parts else ''
# Only add page header and content if there's actual content
if page_content:
markdown_parts.append(f"\n\n## Page {page_num}\n\n")
markdown_parts.append(page_content)
# Store page data
page_data[page_idx] = {
'content': page_content,
'images': page_images,
'tables': page_tables,
'metadata': {'page_num': page_num}
}
# Combine all markdown
final_markdown = ''.join(markdown_parts)
self.logger.info(f"mineru-api: parsed content_list - {len(sorted_pages)} pages, "
f"{len(all_images)} images, {len(all_tables)} tables")
return final_markdown, all_images, all_tables, page_data
except Exception as e:
self.logger.error(f"mineru-api: content_list parsing failed: {str(e)}")
raise
def detect_tables(self, content: str) -> bool:
"""
Detect if content contains table structures.
Based on gzero.py's table detection logic.
"""
table_indicators = [
'<table>', '<tr>', '<td>', '|---|',
'表格', 'Table', '| ', ' |',
'─', '│', '┼', '┬'  # Table border (box-drawing) characters
]
content_lower = content.lower()
for indicator in table_indicators:
if indicator.lower() in content_lower:
return True
# Check for pipe-separated table format
lines = content.split('\n')
pipe_lines = [line for line in lines if line.count('|') >= 2]
if len(pipe_lines) >= 2: # At least header and one data row
return True
return False
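# Example (illustrative): the snippet below is flagged as a table both by the
# '| ' indicator and by the pipe-separated check (two lines with >= 2 pipes):
#
#     | Name | Value |
#     |------|-------|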
async def extract_plain_text(self, pdf_path: str, src_fileid: str) -> str:
"""
Extract plain text from PDF using PyMuPDF.
This provides text content for comparison with MinerU results.
"""
try:
text_parts = []
with fitz.open(pdf_path) as doc:
for page in doc:
page_text = page.get_text()
if page_text.strip():
text_parts.append(page_text)
plain_text = '\n\n'.join(text_parts)
self.logger.info(f"mineru-api: extracted {len(plain_text)} characters of plain text")
return plain_text
except Exception as e:
self.logger.error(f"mineru-api: plain text extraction failed: {str(e)}")
return ""
def merge_content(self, plain_text: str, mineru_content: str, src_fileid: str) -> str:
"""
Merge plain text with MinerU structured content.
This combines the reliability of plain text extraction with
MinerU's structured parsing capabilities.
"""
try:
# Simple merge strategy - could be enhanced with more sophisticated logic
if not mineru_content.strip():
self.logger.warning(f"mineru-api: MinerU content empty, using plain text")
return plain_text
if not plain_text.strip():
self.logger.warning(f"mineru-api: plain text empty, using MinerU content")
return mineru_content
# For now, prefer MinerU content as it should be more structured
# In practice, you might want more sophisticated merging logic
self.logger.info(f"mineru-api: using MinerU structured content")
return mineru_content
except Exception as e:
self.logger.error(f"mineru-api: content merge failed: {str(e)}")
# Fallback to plain text
return plain_text if plain_text else mineru_content