"""
|
|
MinerU API client module.
|
|
|
|
This module handles communication with the MinerU service for document parsing,
|
|
following the architecture patterns from gzero.py and implementing the real API.
|
|
"""
|
|
|
|
import os
|
|
import asyncio
|
|
import aiohttp
|
|
import zipfile
|
|
import fitz
|
|
import time
|
|
from typing import Dict, List, Optional, Any, Tuple
|
|
from dataclasses import dataclass
|
|
from .logger import get_module_logger
|
|
logger = get_module_logger('api_client')
|
|
|
|
from .config_base import MinerUConfig


@dataclass
class MinerUPageResult:
    """Result from MinerU processing for a single page."""
    page_idx: int
    success: bool
    content: str
    images: List[str]
    tables: List[Dict]
    metadata: Dict[str, Any]
    error: Optional[str] = None


@dataclass
class MinerUResult:
    """Result from MinerU processing."""
    success: bool
    content: str  # Backward compatibility - merged content
    images: List[str]  # All images
    tables: List[Dict]  # All tables
    metadata: Dict[str, Any]
    error: Optional[str] = None
    page_results: Optional[List[MinerUPageResult]] = None  # Individual page results
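
# Consumption sketch (illustrative): callers can work page by page when
# page_results is populated, or fall back to the merged content. handle() and
# handle_merged() are hypothetical callbacks, not part of this module.
#
#   if result.success and result.page_results:
#       for page in result.page_results:
#           handle(page.page_idx, page.content)
#   elif result.success:
#       handle_merged(result.content)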


class MinerUAPIClient:
    """Client for interacting with the MinerU API."""

    def __init__(self, config: MinerUConfig, platform_adapter=None):
        self.config = config
        self.session: Optional[aiohttp.ClientSession] = None
        self.logger = logger
        self.platform_adapter = platform_adapter

    async def __aenter__(self):
        """Async context manager entry."""
        self.session = aiohttp.ClientSession(
            timeout=aiohttp.ClientTimeout(total=600)  # 10 minute timeout
        )
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        """Async context manager exit."""
        if self.session:
            await self.session.close()
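
    # Usage sketch (illustrative only): the client is meant to be driven as an
    # async context manager so the aiohttp session is always closed. The config
    # values and paths below are hypothetical.
    #
    #   async def parse_one(config: MinerUConfig) -> MinerUResult:
    #       async with MinerUAPIClient(config) as client:
    #           return await client.process_document(
    #               "/tmp/input.pdf", "/tmp/work", src_fileid="file-123"
    #           )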

    async def process_document(self, pdf_path: str, temp_dir: str, src_fileid: str,
                               is_ppt_converted: bool = False, batch_size: int = 20) -> MinerUResult:
        """
        Process a document using the MinerU API with batch processing support.

        Can process the entire document at once or in batches for large PDFs.

        Args:
            pdf_path: Path to PDF file
            temp_dir: Temporary directory for processing
            src_fileid: Source file ID for logging/tracing
            is_ppt_converted: Whether the PDF was converted from PPT
            batch_size: Maximum pages per batch (0 for no batching)

        Returns:
            MinerUResult with parsed content
        """
        try:
            # Check the PDF page count
            page_count = self._get_pdf_page_count(pdf_path)
            self.logger.info(f"mineru-api: PDF has {page_count} pages, batch_size={batch_size}")

            # Decide whether to use batch processing
            if batch_size > 0 and page_count > batch_size:
                self.logger.info(f"mineru-api: using batch processing (batch_size={batch_size})")
                return await self._process_document_in_batches(
                    pdf_path, temp_dir, src_fileid, is_ppt_converted, batch_size
                )
            else:
                self.logger.info("mineru-api: processing full document at once")
                return await self._process_document_full(
                    pdf_path, temp_dir, src_fileid, is_ppt_converted
                )

        except Exception as e:
            self.logger.error(f"mineru-api: document processing failed: {str(e)}")
            return MinerUResult(
                success=False,
                content="",
                images=[],
                tables=[],
                metadata={},
                error=str(e)
            )

    async def _process_document_full(self, pdf_path: str, temp_dir: str, src_fileid: str,
                                     is_ppt_converted: bool) -> MinerUResult:
        """Process the full document at once (original implementation)."""
        self.logger.info("mineru-api: starting full document processing")

        # Choose the processing method based on API type
        if self.config.mineru_api_type == "self_hosted":
            self.logger.info("mineru-api: using self-hosted MinerU API")
            result = await self._process_full_document_self_hosted(pdf_path, temp_dir, src_fileid, is_ppt_converted)
        elif self.config.mineru_api_key and self.config.mineru_api_key.strip():
            self.logger.info("mineru-api: using cloud MinerU API")
            result = await self._process_full_document_cloud(pdf_path, temp_dir, src_fileid, is_ppt_converted)
        else:
            self.logger.warning("mineru-api: no API configuration, using mock processing")
            result = await self._mock_mineru_processing(pdf_path, temp_dir, src_fileid, is_ppt_converted)

        self.logger.info("mineru-api: document processing completed")
        return result

    async def _process_document_in_batches(self, pdf_path: str, temp_dir: str, src_fileid: str,
                                           is_ppt_converted: bool, batch_size: int) -> MinerUResult:
        """
        Process the document in batches of pages.

        Args:
            pdf_path: Path to PDF file
            temp_dir: Temporary directory for processing
            src_fileid: Source file ID for logging/tracing
            is_ppt_converted: Whether the PDF was converted from PPT
            batch_size: Maximum pages per batch

        Returns:
            MinerUResult with merged results from all batches
        """
        try:
            page_count = self._get_pdf_page_count(pdf_path)
            num_batches = (page_count + batch_size - 1) // batch_size

            self.logger.info(f"mineru-api: splitting {page_count} pages into {num_batches} batches")

            # Process each batch
            batch_results = []
            for batch_idx in range(num_batches):
                start_page = batch_idx * batch_size
                end_page = min(start_page + batch_size, page_count)

                self.logger.info(f"mineru-api: processing batch {batch_idx + 1}/{num_batches} "
                                 f"(pages {start_page + 1}-{end_page})")

                # Split the PDF for this batch
                batch_pdf_path = await self._split_pdf(
                    pdf_path, temp_dir, start_page, end_page, batch_idx
                )

                # Process the batch based on API type
                if self.config.mineru_api_type == "self_hosted":
                    batch_result = await self._process_batch_self_hosted(
                        batch_pdf_path, temp_dir, src_fileid, batch_idx,
                        start_page, end_page, is_ppt_converted
                    )
                elif self.config.mineru_api_key and self.config.mineru_api_key.strip():
                    batch_result = await self._process_batch_cloud(
                        batch_pdf_path, temp_dir, src_fileid, batch_idx,
                        start_page, end_page, is_ppt_converted
                    )
                else:
                    batch_result = await self._mock_mineru_processing(
                        batch_pdf_path, temp_dir, src_fileid, is_ppt_converted
                    )

                if batch_result.success:
                    batch_results.append((start_page, batch_result))
                else:
                    # Continue with the other batches even if one fails
                    self.logger.error(f"mineru-api: batch {batch_idx + 1} failed: {batch_result.error}")

            # Merge all batch results
            return self._merge_batch_results(batch_results, page_count, is_ppt_converted)

        except Exception as e:
            self.logger.error(f"mineru-api: batch processing failed: {str(e)}")
            raise
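
    # Batch arithmetic sketch: num_batches is a ceiling division, so e.g. a
    # 45-page PDF with batch_size=20 gives (45 + 19) // 20 == 3 batches
    # covering pages 1-20, 21-40 and 41-45.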

    def _get_pdf_page_count(self, pdf_path: str) -> int:
        """Get the number of pages in a PDF."""
        with fitz.open(pdf_path) as doc:
            return len(doc)

    async def _split_pdf(self, pdf_path: str, temp_dir: str, start_page: int,
                         end_page: int, batch_idx: int) -> str:
        """
        Split the PDF to extract specific pages for batch processing.

        Args:
            pdf_path: Original PDF path
            temp_dir: Temporary directory
            start_page: Start page index (0-based)
            end_page: End page index (exclusive)
            batch_idx: Batch index for naming

        Returns:
            Path to the split PDF file
        """
        batch_pdf_path = os.path.join(temp_dir, f"batch_{batch_idx}.pdf")

        with fitz.open(pdf_path) as src_doc:
            batch_doc = fitz.open()  # Create a new, empty PDF

            # Copy the pages into the new document
            for page_idx in range(start_page, end_page):
                batch_doc.insert_pdf(src_doc, from_page=page_idx, to_page=page_idx)

            batch_doc.save(batch_pdf_path)
            batch_doc.close()

        self.logger.info(f"mineru-api: created batch PDF with {end_page - start_page} pages: {batch_pdf_path}")
        return batch_pdf_path

    async def _process_full_document_self_hosted(self, pdf_path: str, temp_dir: str, src_fileid: str,
                                                 is_ppt_converted: bool) -> MinerUResult:
        """
        Process the full PDF document at once using the self-hosted MinerU API
        with content_list support.

        This method uploads the entire document and gets back content_list for all pages.
        """
        try:
            self.logger.info("mineru-api: processing full document with self-hosted API")

            if not self.session:
                raise RuntimeError("API client not initialized")

            start_time = asyncio.get_event_loop().time()

            # Prepare multipart form data
            form_data = aiohttp.FormData()

            # API parameters - enable content_list for full document processing
            form_data.add_field('return_middle_json', 'false')
            form_data.add_field('return_model_output', 'false')
            form_data.add_field('return_md', 'true')
            form_data.add_field('return_images', 'true')
            form_data.add_field('return_content_list', 'true')  # Enable content_list
            form_data.add_field('end_page_id', '99999')
            form_data.add_field('parse_method', 'auto')
            form_data.add_field('start_page_id', '0')
            form_data.add_field('output_dir', './output')
            form_data.add_field('server_url', 'string')
            form_data.add_field('backend', 'pipeline')
            form_data.add_field('table_enable', 'true')
            form_data.add_field('formula_enable', 'true')

            # Add the PDF file. Read the bytes eagerly so the handle can be
            # closed before aiohttp serializes the form at request time;
            # passing the open handle would fail once the with-block exits.
            with open(pdf_path, 'rb') as f:
                form_data.add_field('files', f.read(), filename=os.path.basename(pdf_path),
                                    content_type='application/pdf')

            # Make the API request
            async with self.session.post(
                f"{self.config.mineru_api_url}/file_parse",
                data=form_data,
                headers={'accept': 'application/json'}
            ) as response:

                if response.status != 200:
                    error_text = await response.text()
                    raise Exception(f"Self-hosted MinerU API error: {response.status} - {error_text}")

                result = await response.json()

            # Log the top-level keys to understand the response structure
            self.logger.info(f"mineru-api: response top-level keys: {list(result.keys())}")

            # Extract content_list from the response
            results = result.get('results', {})
            if not results:
                raise Exception("No results in API response")

            # Get the first result (should be our PDF file)
            file_result = next(iter(results.values()))
            self.logger.info(f"mineru-api: file_result keys: {list(file_result.keys())}")

            # Extract content_list
            content_list_str = file_result.get('content_list', '')
            self.logger.info(f"mineru-api: content_list type: {type(content_list_str)}, length: {len(str(content_list_str))}")
            if not content_list_str:
                self.logger.error(f"mineru-api: no content_list in API response; file result keys: {list(file_result.keys())}")
                # Log a sample of the file_result to understand what we're getting
                sample_result = str(file_result)[:500] if file_result else 'None'
                self.logger.error(f"mineru-api: file result sample: {sample_result}")
                raise Exception("No content_list in API response")

            # Parse content_list into markdown
            markdown_content, all_images, all_tables, page_data = self._parse_content_list_to_markdown(
                content_list_str, temp_dir, src_fileid
            )

            # Also get the markdown content, if available, for language detection
            md_content = file_result.get('md_content', '')

            # Extract base64 images if provided
            images_data = file_result.get('images', {})
            if images_data and isinstance(images_data, dict):
                self.logger.info(f"mineru-api: saving {len(images_data)} base64 images")
                saved_images = self._save_base64_images(images_data, temp_dir, src_fileid)
                self.logger.info(f"mineru-api: saved images: {saved_images}")
                # Merge with the images from content_list
                for img in saved_images:
                    if img not in all_images:
                        all_images.append(img)
            else:
                self.logger.info("mineru-api: no base64 images in 'images' field")

            # Check whether there is an 'images' field at the top level of the result
            if 'images' in result and isinstance(result['images'], dict):
                self.logger.info(f"mineru-api: found images at top level: {len(result['images'])} images")
                # Save these images
                for img_name, img_data in result['images'].items():
                    if img_name not in all_images:
                        try:
                            if isinstance(img_data, str) and img_data.startswith('data:'):
                                # Base64 encoded
                                saved = self._save_base64_images({img_name: img_data}, temp_dir, src_fileid)
                                all_images.extend(saved)
                                self.logger.info(f"mineru-api: saved top-level image: {img_name}")
                        except Exception as e:
                            self.logger.error(f"mineru-api: failed to save top-level image {img_name}: {e}")

            processing_time = asyncio.get_event_loop().time() - start_time

            # Detect the language from the combined md_content
            detected_language = None
            if md_content and md_content.strip():
                from .language_detector import LanguageDetector
                language_code, confidence = LanguageDetector.detect_language(md_content)
                if confidence > 0.7:
                    detected_language = language_code
                    self.logger.info(f"mineru-api: detected document language: {detected_language} (confidence: {confidence:.2f})")

            # If there is no md_content, detect from markdown_content as a fallback
            if not detected_language and markdown_content and markdown_content.strip():
                from .language_detector import LanguageDetector
                language_code, confidence = LanguageDetector.detect_language(markdown_content)
                if confidence > 0.7:
                    detected_language = language_code
                    self.logger.info(f"mineru-api: detected document language from content_list: {detected_language} (confidence: {confidence:.2f})")

            # Create metadata
            import json
            metadata = {
                "processing_time": processing_time,
                "total_pages": len(page_data),
                "images_found": len(all_images),
                "tables_found": len(all_tables),
                "is_ppt_source": is_ppt_converted,
                "processing_mode": "full_document_self_hosted",
                "api_version": "self_hosted",
                "api_type": "self_hosted",
                "page_data": page_data,
                "content_list": json.loads(content_list_str) if isinstance(content_list_str, str) else content_list_str,
                "detected_language": detected_language  # Add the detected language to metadata
            }

            # Create page results for compatibility
            mineru_page_results = []
            for page_idx, pdata in page_data.items():
                mineru_page_results.append(MinerUPageResult(
                    page_idx=page_idx,
                    success=True,
                    content=pdata['content'],
                    images=pdata['images'],
                    tables=pdata['tables'],
                    metadata=pdata['metadata']
                ))

            self.logger.info(f"mineru-api: full document self-hosted processing completed in {processing_time:.2f}s")

            return MinerUResult(
                success=True,
                content=markdown_content,
                images=all_images,
                tables=all_tables,
                metadata=metadata,
                page_results=mineru_page_results
            )

        except Exception as e:
            self.logger.error(f"mineru-api: full document self-hosted processing failed: {str(e)}")
            raise
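
    # Response-shape sketch for the self-hosted /file_parse endpoint, as this
    # client consumes it. The field names are inferred from the parsing code
    # above and should be treated as assumptions, not as a spec:
    #
    #   {
    #     "results": {
    #       "<filename>": {
    #         "md_content": "# ...",
    #         "content_list": "[{\"page_idx\": 0, \"type\": \"text\", ...}]",
    #         "images": {"img_0.jpg": "data:image/jpeg;base64,..."}
    #       }
    #     }
    #   }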

    def _parse_self_hosted_response(self, api_response: Dict, temp_dir: str, page_id: str,
                                    page_num: int) -> Tuple[str, List[str], List[Dict]]:
        """
        Parse a response from the self-hosted MinerU API.

        Expected response format:
        {
            "backend": "pipeline",
            "version": "2.1.10",
            "results": {
                "page_xxx": {
                    "md_content": "# Content...",
                    "images": {
                        "filename.jpg": "data:image/jpeg;base64,xxx...",
                        ...
                    },
                    "middle_json": {...},
                    "model_output": {...}
                }
            }
        }
        """
        try:
            content = ""
            images = []
            tables = []

            # Extract the results
            results = api_response.get('results', {})

            if results:
                # Get the first result (should be our PDF file)
                file_result = next(iter(results.values())) if results else {}

                # Extract the markdown content
                md_content = file_result.get('md_content', '')
                if md_content:
                    content = md_content
                else:
                    content = f"# Page {page_num}\n\nNo content extracted from self-hosted API"

                # Extract images from the images field (base64 encoded)
                images_data = file_result.get('images', {})
                if images_data and isinstance(images_data, dict):
                    images = self._save_base64_images(images_data, temp_dir, page_id)
                else:
                    # Fall back to extracting from markdown if there is no images field
                    images = self._extract_images_from_markdown(md_content, temp_dir, page_id)

                # Extract table information if available in middle_json
                middle_json = file_result.get('middle_json', {})
                if middle_json and isinstance(middle_json, dict):
                    tables = self._extract_tables_from_middle_json(middle_json, page_num)

            self.logger.debug(f"mineru-api: [{page_id}] parsed self-hosted response - content: {len(content)} chars, images: {len(images)}, tables: {len(tables)}")

            return content, images, tables

        except Exception as e:
            self.logger.error(f"mineru-api: [{page_id}] failed to parse self-hosted response: {str(e)}")
            return f"# Page {page_num}\n\nError parsing API response: {str(e)}", [], []

    def _save_base64_images(self, images_data: Dict[str, str], temp_dir: str, page_id: str) -> List[str]:
        """
        Save base64-encoded images to files.

        Args:
            images_data: Dictionary with image filename as key and base64 data as value
            temp_dir: Directory to save images
            page_id: Page identifier for logging

        Returns:
            List of saved image filenames
        """
        import base64

        saved_images = []

        for filename, base64_data in images_data.items():
            try:
                # Extract the base64 payload (remove the data-URI prefix if present)
                if base64_data.startswith('data:'):
                    # Format: data:image/jpeg;base64,xxx...
                    base64_data = base64_data.split(',', 1)[1]

                # Decode base64 to binary
                image_data = base64.b64decode(base64_data)

                # Use the original filename without a prefix so it matches content_list references
                image_filename = filename
                image_path = os.path.join(temp_dir, image_filename)

                # Save the image file
                with open(image_path, 'wb') as f:
                    f.write(image_data)

                saved_images.append(image_filename)
                self.logger.info(f"mineru-api: [{page_id}] saved base64 image: {image_filename} ({len(image_data)} bytes)")

            except Exception as e:
                self.logger.error(f"mineru-api: [{page_id}] failed to save base64 image {filename}: {str(e)}")

        return saved_images
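
    # Data-URI sketch: a value such as "data:image/jpeg;base64,/9j/4AAQ..."
    # splits on the first comma into a media-type header and the base64
    # payload; only the payload is decoded and written to disk above.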

    def _extract_images_from_markdown(self, md_content: str, temp_dir: str, page_id: str) -> List[str]:
        """
        Extract image references from markdown content and handle them.

        Self-hosted MinerU typically includes images as markdown references
        like ![caption](images/example.jpg).
        """
        import re

        images = []

        # Find all markdown image references
        image_pattern = r'!\[([^\]]*)\]\(([^)]+)\)'
        matches = re.findall(image_pattern, md_content)

        for _, image_path in matches:
            try:
                # Handle the different image path formats
                if image_path.startswith('./') or image_path.startswith('../'):
                    # Relative path - assume it lives in the same directory structure
                    actual_path = os.path.normpath(os.path.join(temp_dir, image_path))
                elif os.path.isabs(image_path):
                    # Absolute path
                    actual_path = image_path
                else:
                    # Relative to the temp directory
                    actual_path = os.path.join(temp_dir, image_path)

                # Check whether the image file exists and copy it into our temp directory if needed
                if os.path.exists(actual_path):
                    # Generate a new filename for our processing
                    image_filename = f"self_hosted_{page_id}_{os.path.basename(image_path)}"
                    dest_path = os.path.join(temp_dir, image_filename)

                    if actual_path != dest_path:
                        import shutil
                        shutil.copy(actual_path, dest_path)

                    images.append(image_filename)
                    self.logger.debug(f"mineru-api: [{page_id}] extracted image: {image_filename}")
                else:
                    self.logger.warning(f"mineru-api: [{page_id}] image file not found: {actual_path}")

            except Exception as e:
                self.logger.error(f"mineru-api: [{page_id}] error processing image {image_path}: {str(e)}")

        return images

    def _extract_tables_from_middle_json(self, middle_json: Dict, page_num: int) -> List[Dict]:
        """
        Extract table information from middle_json if available.
        """
        tables = []

        try:
            # This structure depends on the actual format returned by
            # self-hosted MinerU; adjust based on the actual response structure.
            if 'tables' in middle_json:
                table_data = middle_json['tables']
                if isinstance(table_data, list):
                    for i, table in enumerate(table_data):
                        tables.append({
                            'page': page_num,
                            'table_id': i,
                            'content': str(table),
                            'source': 'self_hosted_middle_json'
                        })
                elif isinstance(table_data, dict):
                    tables.append({
                        'page': page_num,
                        'content': str(table_data),
                        'source': 'self_hosted_middle_json'
                    })

        except Exception as e:
            self.logger.debug(f"mineru-api: page {page_num} table extraction from middle_json failed: {str(e)}")

        return tables

    async def _upload_file_to_accessible_url(self, pdf_path: str, src_fileid: str) -> str:
        """
        Upload the file to a publicly accessible URL for MinerU processing.

        Uses the platform adapter's upload_file method.
        """
        try:
            # Use the platform adapter for the upload, if available
            if hasattr(self, 'platform_adapter') and self.platform_adapter:
                # The adapter will handle the upload
                return await self.platform_adapter.upload_file(pdf_path, {
                    'src_fileid': src_fileid
                })

            # Fallback: return the local path if there is no adapter
            self.logger.warning("No platform adapter available for upload, returning local path")
            return pdf_path

        except Exception as e:
            self.logger.error(f"mineru-api: file upload failed: {str(e)}")
            raise

    async def _poll_task_completion(self, task_id: str, src_fileid: str,
                                    max_wait_time: int = 600) -> str:
        """
        Poll a MinerU task until completion.

        Args:
            task_id: Task ID to poll
            src_fileid: Source file ID for logging
            max_wait_time: Maximum wait time in seconds

        Returns:
            Result ZIP file URL
        """
        if not self.session:
            raise RuntimeError("API client not initialized")

        headers = {
            'Authorization': f'Bearer {self.config.mineru_api_key}',
            'Accept': '*/*'
        }

        start_time = asyncio.get_event_loop().time()
        poll_interval = 5  # Start with 5 seconds
        max_poll_interval = 30  # At most 30 seconds between polls

        while True:
            current_time = asyncio.get_event_loop().time()
            if current_time - start_time > max_wait_time:
                raise Exception(f"Task polling timeout after {max_wait_time} seconds")

            try:
                async with self.session.get(
                    f"{self.config.mineru_api_url}/api/v4/extract/task/{task_id}",
                    headers=headers
                ) as response:

                    if response.status != 200:
                        error_text = await response.text()
                        raise Exception(f"MinerU polling error: {response.status} - {error_text}")

                    result = await response.json()

                    if result.get('code') != 0:
                        raise Exception(f"MinerU polling error: {result.get('msg', 'Unknown error')}")

                    data = result['data']
                    state = data['state']

                    if state == 'done':
                        full_zip_url = data['full_zip_url']
                        self.logger.info(f"mineru-api: task completed: {task_id}")
                        return full_zip_url

                    elif state == 'failed':
                        error_msg = data.get('err_msg', 'Unknown error')
                        raise Exception(f"MinerU task failed: {error_msg}")

                    elif state in ['pending', 'running', 'converting']:
                        # Log progress if available
                        if 'extract_progress' in data:
                            progress = data['extract_progress']
                            extracted = progress.get('extracted_pages', 0)
                            total = progress.get('total_pages', 0)
                            start_time_str = progress.get('start_time', 'N/A')
                            self.logger.info(f"mineru-api: task {state}: {extracted}/{total} pages (started: {start_time_str})")
                        else:
                            self.logger.info(f"mineru-api: task {state}")

                        # Wait before the next poll
                        await asyncio.sleep(poll_interval)

                        # Gradually increase the poll interval
                        poll_interval = min(poll_interval * 1.2, max_poll_interval)

                    else:
                        raise Exception(f"Unknown task state: {state}")

            except aiohttp.ClientError as e:
                self.logger.warning(f"mineru-api: polling connection error: {str(e)}")
                await asyncio.sleep(poll_interval)
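
    # Polling-cadence sketch: the interval starts at 5s and is multiplied by
    # 1.2 after each poll (5.0, 6.0, 7.2, 8.64, ...) until it is capped at
    # 30s; polling aborts once max_wait_time has elapsed.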

    async def _process_full_document_cloud(self, pdf_path: str, temp_dir: str, src_fileid: str,
                                           is_ppt_converted: bool) -> MinerUResult:
        """
        Process the full PDF document at once using the cloud MinerU API.

        This method uploads the entire document and extracts content_list from the result.
        """
        try:
            self.logger.info("mineru-api: processing full document with cloud API")

            start_time = asyncio.get_event_loop().time()

            # Step 1: Upload the file to an accessible URL
            file_url = await self._upload_file_to_accessible_url(pdf_path, src_fileid)
            self.logger.info(f"mineru-api: uploaded file URL: {file_url}")
            if not file_url.startswith(('http://', 'https://')):
                self.logger.warning(f"mineru-api: URL may not be valid for the cloud API: {file_url}")

            # Step 2: Create a task for the full document
            task_id = await self._create_mineru_task_full_document(file_url, src_fileid)

            # Step 3: Poll for completion
            result_url = await self._poll_task_completion(task_id, src_fileid, max_wait_time=900)  # 15 min for a full doc

            # Step 4: Download and extract the results, including content_list
            markdown_content, all_images, all_tables, page_data, full_md_content = await self._download_and_extract_results_with_content_list(
                result_url, temp_dir, src_fileid
            )

            processing_time = asyncio.get_event_loop().time() - start_time

            # Detect the language from full.md first, then fall back to markdown_content
            detected_language = None
            if full_md_content and full_md_content.strip():
                from .language_detector import LanguageDetector
                language_code, confidence = LanguageDetector.detect_language(full_md_content)
                if confidence > 0.7:
                    detected_language = language_code
                    self.logger.info(f"mineru-api: detected document language from full.md: {detected_language} (confidence: {confidence:.2f})")

            # Fall back to the content_list markdown if there is no full.md or confidence was low
            if not detected_language and markdown_content and markdown_content.strip():
                from .language_detector import LanguageDetector
                language_code, confidence = LanguageDetector.detect_language(markdown_content)
                if confidence > 0.7:
                    detected_language = language_code
                    self.logger.info(f"mineru-api: detected document language from content_list: {detected_language} (confidence: {confidence:.2f})")

            # Create metadata
            metadata = {
                "processing_time": processing_time,
                "total_pages": len(page_data),
                "images_found": len(all_images),
                "tables_found": len(all_tables),
                "is_ppt_source": is_ppt_converted,
                "processing_mode": "full_document_cloud",
                "api_version": "v4",
                "api_type": "cloud",
                "page_data": page_data,
                "detected_language": detected_language  # Add the detected language to metadata
            }

            # Create page results for compatibility
            mineru_page_results = []
            for page_idx, pdata in page_data.items():
                mineru_page_results.append(MinerUPageResult(
                    page_idx=page_idx,
                    success=True,
                    content=pdata['content'],
                    images=pdata['images'],
                    tables=pdata['tables'],
                    metadata=pdata['metadata']
                ))

            self.logger.info(f"mineru-api: full document cloud processing completed in {processing_time:.2f}s")

            return MinerUResult(
                success=True,
                content=markdown_content,
                images=all_images,
                tables=all_tables,
                metadata=metadata,
                page_results=mineru_page_results
            )

        except Exception as e:
            self.logger.error(f"mineru-api: full document cloud processing failed: {str(e)}")
            raise

    async def _create_mineru_task_full_document(self, file_url: str, src_fileid: str) -> str:
        """
        Create a MinerU task for full document processing.
        """
        if not self.session:
            raise RuntimeError("API client not initialized")

        headers = {
            'Authorization': f'Bearer {self.config.mineru_api_key}',
            'Content-Type': 'application/json',
            'Accept': '*/*'
        }

        # Configure the processing options for the full document
        payload = {
            'url': file_url,
            'is_ocr': True,
            'enable_formula': True,
            'enable_table': True,
            'language': 'auto',
            'data_id': src_fileid,
            'model_version': 'v1',
            'extra_formats': ['html']  # Request content_list format
        }

        try:
            async with self.session.post(
                f"{self.config.mineru_api_url}/api/v4/extract/task",
                headers=headers,
                json=payload
            ) as response:

                if response.status != 200:
                    error_text = await response.text()
                    raise Exception(f"MinerU API error: {response.status} - {error_text}")

                result = await response.json()

                if result.get('code') != 0:
                    raise Exception(f"MinerU API error: {result.get('msg', 'Unknown error')}")

                task_id = result['data']['task_id']
                self.logger.info(f"mineru-api: full document task created: {task_id}")

                return task_id

        except aiohttp.ClientError as e:
            raise Exception(f"MinerU API connection error: {str(e)}")
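
    # Task-API sketch: the v4 endpoints used above follow a create-then-poll
    # flow. Judging from the handling code (an inference, not a spec), a
    # successful create returns {"code": 0, "data": {"task_id": "..."}} and
    # polling returns {"code": 0, "data": {"state": "pending" | "running" |
    # "converting" | "done" | "failed", ...}}, with "full_zip_url" populated
    # once the state is "done".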

    async def _download_and_extract_results_with_content_list(self, result_url: str, temp_dir: str,
                                                              src_fileid: str) -> Tuple[str, List[str], List[Dict], Dict[int, Dict], str]:
        """
        Download and extract MinerU processing results, including content_list.

        Returns:
            Tuple of (markdown_content, images, tables, page_data, full_md_content)
        """
        import json
        import shutil

        if not self.session:
            raise RuntimeError("API client not initialized")

        # Download the ZIP file
        zip_path = os.path.join(temp_dir, f"mineru_result_{src_fileid}.zip")

        try:
            async with self.session.get(result_url) as response:
                if response.status != 200:
                    error_text = await response.text()
                    raise Exception(f"Download error: {response.status} - {error_text}")

                with open(zip_path, 'wb') as f:
                    async for chunk in response.content.iter_chunked(8192):
                        f.write(chunk)

            self.logger.info(f"mineru-api: downloaded result ZIP: {zip_path}")

        except aiohttp.ClientError as e:
            raise Exception(f"Download connection error: {str(e)}")

        # Extract the ZIP file
        extract_dir = os.path.join(temp_dir, f"mineru_extracted_{src_fileid}")
        os.makedirs(extract_dir, exist_ok=True)

        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall(extract_dir)

        self.logger.info(f"mineru-api: extracted results to: {extract_dir}")

        # Look for the content_list file and full.md
        content_list = None
        markdown_content = ""
        full_md_content = ""  # For language detection
        images = []
        tables = []

        for root, _, files in os.walk(extract_dir):
            for file in files:
                file_path = os.path.join(root, file)

                # Look for the content_list JSON file
                if file.endswith('_content_list.json'):
                    with open(file_path, 'r', encoding='utf-8') as f:
                        content_list = json.load(f)
                    self.logger.info(f"mineru-api: found content_list with {len(content_list)} items")

                elif file == 'full.md':
                    # Read full.md for language detection
                    with open(file_path, 'r', encoding='utf-8') as f:
                        full_md_content = f.read()
                    self.logger.info(f"mineru-api: found full.md with {len(full_md_content)} characters")

                elif file.endswith('.md') and not markdown_content:
                    # Backup: use other markdown files if there is no content_list
                    with open(file_path, 'r', encoding='utf-8') as f:
                        markdown_content = f.read()

        # Parse content_list if found
        if content_list:
            markdown_content, all_images, all_tables, page_data = self._parse_content_list_to_markdown(
                content_list, temp_dir, src_fileid
            )

            self.logger.info(f"mineru-api: all_images from content_list: {all_images[:5]}...")  # Show the first 5 images

            # Copy the images referenced in content_list from the images/ directory
            images_dir = os.path.join(extract_dir, 'images')
            self.logger.info(f"mineru-api: checking for images directory: {images_dir}")

            # List all directories in extract_dir for debugging
            if os.path.exists(extract_dir):
                self.logger.info(f"mineru-api: contents of extract_dir: {os.listdir(extract_dir)}")

            if os.path.exists(images_dir):
                self.logger.info(f"mineru-api: found images directory: {images_dir}")
                # List the files in the images directory
                image_files = os.listdir(images_dir)
                self.logger.info(f"mineru-api: found {len(image_files)} files in images directory")
                self.logger.info(f"mineru-api: image files in directory: {image_files[:10]}")  # Show the first 10 files

                # Copy ALL image files from the images directory to temp_dir
                for img_file in image_files:
                    if img_file.endswith(('.png', '.jpg', '.jpeg', '.gif', '.webp')):
                        src_img_path = os.path.join(images_dir, img_file)
                        dest_img_path = os.path.join(temp_dir, img_file)
                        shutil.copy(src_img_path, dest_img_path)
                        self.logger.info(f"mineru-api: copied image {img_file} to temp_dir")

                # Also try to copy the specific images referenced in content_list
                for img_filename in all_images:
                    # Try the different possible paths and names
                    possible_names = [
                        img_filename,
                        img_filename.replace('.png', '.jpg'),
                        img_filename.replace('.jpg', '.png'),
                        os.path.basename(img_filename)  # Just the filename without the path
                    ]

                    copied = False
                    for name in possible_names:
                        src_img_path = os.path.join(images_dir, name)
                        if os.path.exists(src_img_path):
                            dest_img_path = os.path.join(temp_dir, img_filename)
                            if not os.path.exists(dest_img_path):
                                shutil.copy(src_img_path, dest_img_path)
                                self.logger.info(f"mineru-api: copied referenced image {name} as {img_filename}")
                            copied = True
                            break

                    if not copied:
                        # Try to find similar files
                        base_name = os.path.splitext(img_filename)[0]
                        matching_files = [f for f in image_files if base_name in f]
                        if matching_files:
                            self.logger.warning(f"mineru-api: image {img_filename} not found, but similar files exist: {matching_files}")
                        else:
                            self.logger.warning(f"mineru-api: image {img_filename} not found in images dir")
            else:
                self.logger.warning(f"mineru-api: images directory not found: {images_dir}")

            # For single-page documents, assign any unassigned images to page 0
            if len(page_data) == 1 and 0 in page_data:
                # Check whether any images are not yet assigned to pages
                assigned_images = set()
                for pd in page_data.values():
                    assigned_images.update(pd.get('images', []))

                unassigned_images = [img for img in all_images if img not in assigned_images]
                if unassigned_images:
                    self.logger.info(f"mineru-api: assigning {len(unassigned_images)} unassigned images to page 0")
                    page_data[0]['images'].extend(unassigned_images)
        else:
            # Fallback: parse the markdown to create the page data
            self.logger.warning("mineru-api: no content_list found, using markdown fallback")
            page_data = self._parse_markdown_to_page_data(markdown_content)

            # Copy all images from extract_dir to temp_dir (without a mineru_ prefix)
            for root, _, files in os.walk(extract_dir):
                for file in files:
                    if file.endswith(('.png', '.jpg', '.jpeg', '.gif', '.webp')):
                        file_path = os.path.join(root, file)
                        dest_path = os.path.join(temp_dir, file)

                        shutil.copy(file_path, dest_path)
                        images.append(file)
                        self.logger.info(f"mineru-api: copied image {file} to temp_dir (fallback)")

            all_images = images
            all_tables = tables

        # Clean up the ZIP file
        os.remove(zip_path)

        self.logger.info(f"mineru-api: parsed results - {len(page_data)} pages, "
                         f"{len(all_images)} images, {len(all_tables)} tables")

        return markdown_content, all_images, all_tables, page_data, full_md_content
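
    # Result-archive sketch: the downloaded ZIP is expected to contain a
    # *_content_list.json file, a full.md, possibly other .md files, and an
    # images/ directory. The walk above is deliberately defensive because the
    # exact layout is not guaranteed (an assumption based on the fallbacks in
    # this method).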

    def _parse_markdown_to_page_data(self, markdown_content: str) -> Dict[int, Dict]:
        """
        Parse markdown content to create the page data structure.

        This is a fallback for when content_list is not available.
        """
        page_data = {}

        # Split by page markers
        import re
        page_pattern = r'## Page (\d+)'
        parts = re.split(page_pattern, markdown_content)

        if len(parts) > 1:
            # Skip the first part (before the first page marker)
            for i in range(1, len(parts), 2):
                if i < len(parts) - 1:
                    page_num = int(parts[i])
                    page_content = parts[i + 1].strip()
                    page_idx = page_num - 1

                    page_data[page_idx] = {
                        'content': page_content,
                        'images': [],
                        'tables': [],
                        'metadata': {'page_num': page_num}
                    }
        else:
            # No page markers, so treat it as a single page
            page_data[0] = {
                'content': markdown_content,
                'images': [],
                'tables': [],
                'metadata': {'page_num': 1}
            }

        return page_data
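
    # Split sketch: re.split(r'## Page (\d+)', "intro\n## Page 1\nA\n## Page 2\nB")
    # yields ['intro\n', '1', '\nA\n', '2', '\nB'], so the loop above pairs each
    # captured page number (odd index) with the content that follows it.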

    async def _download_and_extract_results(self, result_url: str, temp_dir: str,
                                            src_fileid: str) -> Tuple[str, List[str], List[Dict]]:
        """
        Download and extract MinerU processing results.

        Args:
            result_url: URL to the result ZIP file
            temp_dir: Temporary directory for extraction
            src_fileid: Source file ID for logging

        Returns:
            Tuple of (content, images, tables)
        """
        import shutil

        if not self.session:
            raise RuntimeError("API client not initialized")

        # Download the ZIP file
        zip_path = os.path.join(temp_dir, f"mineru_result_{src_fileid}.zip")

        try:
            async with self.session.get(result_url) as response:
                if response.status != 200:
                    error_text = await response.text()
                    raise Exception(f"Download error: {response.status} - {error_text}")

                with open(zip_path, 'wb') as f:
                    async for chunk in response.content.iter_chunked(8192):
                        f.write(chunk)

            self.logger.info(f"mineru-api: downloaded result ZIP: {zip_path}")

        except aiohttp.ClientError as e:
            raise Exception(f"Download connection error: {str(e)}")

        # Extract the ZIP file
        extract_dir = os.path.join(temp_dir, f"mineru_extracted_{src_fileid}")
        os.makedirs(extract_dir, exist_ok=True)

        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall(extract_dir)

        self.logger.info(f"mineru-api: extracted results to: {extract_dir}")

        # Parse the extracted content
        content = ""
        images = []
        tables = []

        # Look for the markdown file and other assets
        for root, _, files in os.walk(extract_dir):
            for file in files:
                file_path = os.path.join(root, file)

                if file.endswith('.md'):
                    # Read the markdown content
                    with open(file_path, 'r', encoding='utf-8') as f:
                        content = f.read()
                    self.logger.info(f"mineru-api: loaded markdown content: {len(content)} chars")

                elif file.endswith(('.png', '.jpg', '.jpeg', '.gif', '.webp')):
                    # Copy the image to the temp directory and add it to the list
                    image_name = f"mineru_{file}"
                    dest_path = os.path.join(temp_dir, image_name)

                    shutil.copy(file_path, dest_path)
                    images.append(image_name)

                elif file.endswith('.html'):
                    # Parse the HTML for additional table information if needed
                    with open(file_path, 'r', encoding='utf-8') as f:
                        html_content = f.read()

                    # Extract table information from the HTML
                    table_count = html_content.count('<table')
                    if table_count > 0:
                        tables.append({
                            "source": "html",
                            "table_count": table_count,
                            "content": "Tables extracted from HTML format"
                        })

        # Clean up the ZIP file
        os.remove(zip_path)

        self.logger.info(f"mineru-api: parsed results - content: {len(content)} chars, images: {len(images)}, tables: {len(tables)}")

        return content, images, tables

    async def _mock_mineru_processing(self, pdf_path: str, temp_dir: str,
                                      src_fileid: str, is_ppt_converted: bool) -> MinerUResult:
        """
        Mock MinerU processing for development/testing.

        Provides a realistic output structure for development without API calls.
        """
        try:
            # Simulate a processing delay
            await asyncio.sleep(0.5)

            # Extract basic information from the PDF for the mock response
            content_parts = []
            images = []
            tables = []
            content_list = []  # For context extraction

            with fitz.open(pdf_path) as doc:
                # Capture the page count while the document is still open;
                # it is closed once the with-block exits
                page_count = len(doc)

                for page_num, page in enumerate(doc):
                    # Extract text
                    page_text = page.get_text()
                    if page_text.strip():
                        content_parts.append(f"## Page {page_num + 1}\n\n{page_text}\n")
                        # Add to content_list
                        content_list.append({
                            'page_idx': page_num,
                            'type': 'text',
                            'text': page_text,
                            'metadata': {}
                        })

                    # Mock image extraction (would be done by MinerU)
                    for img in page.get_images(full=True):
                        xref = img[0]
                        bbox = page.get_image_bbox(img)
                        if bbox.width > 0 and bbox.height > 0:
                            image_filename = f"mineru_image_{xref}.png"
                            image_path = os.path.join(temp_dir, image_filename)

                            # Extract and save the image
                            try:
                                pix = fitz.Pixmap(doc, xref)
                                if pix.n - pix.alpha < 4:  # GRAY or RGB
                                    pix.save(image_path)
                                    images.append(image_filename)
                                    # Add to content_list
                                    content_list.append({
                                        'page_idx': page_num,
                                        'type': 'image',
                                        'img_path': image_filename,
                                        'metadata': {}
                                    })
                                pix = None  # Free memory
                            except Exception:
                                pass

                    # Mock table detection
                    if "table" in page_text.lower() or "|" in page_text:
                        tables.append({
                            "page": page_num,
                            "content": "Mock table content detected",
                            "bbox": [0, 0, page.rect.width, page.rect.height]
                        })
                        # Add to content_list
                        content_list.append({
                            'page_idx': page_num,
                            'type': 'table',
                            'content': 'Mock table content',
                            'metadata': {}
                        })

            # Combine the content with a mock markdown structure
            mock_content = self._create_mock_markdown_content(content_parts, images)

            # Detect the language from the combined content
            detected_language = None
            if mock_content and mock_content.strip():
                from .language_detector import LanguageDetector
                language_code, confidence = LanguageDetector.detect_language(mock_content)
                if confidence > 0.7:
                    detected_language = language_code
                    self.logger.info(f"mineru-api: detected document language (mock): {detected_language} (confidence: {confidence:.2f})")

            # Mock metadata
            metadata = {
                "processing_time": 0.5,
                "pages_processed": page_count,
                "images_found": len(images),
                "tables_found": len(tables),
                "is_ppt_source": is_ppt_converted,
                "api_version": "mock",
                "content_list": content_list,  # Include the content list for context extraction
                "detected_language": detected_language  # Add the detected language to metadata
            }

            self.logger.info(f"mineru-api: mock processing complete: {metadata}")

            return MinerUResult(
                success=True,
                content=mock_content,
                images=images,
                tables=tables,
                metadata=metadata
            )

        except Exception as e:
            self.logger.error(f"mineru-api: mock processing error: {str(e)}")
            raise

    def _create_mock_markdown_content(self, content_parts: List[str], images: List[str]) -> str:
        """
        Create mock markdown content that simulates the MinerU output structure.
        """
        mock_parts = []

        # Add the document header
        mock_parts.append("# Document Content (MinerU Mock)")
        mock_parts.append("")

        # Add the content parts
        for part in content_parts:
            mock_parts.append(part)

        # Add image references
        if images:
            mock_parts.append("## Images")
            mock_parts.append("")
            for img in images:
                mock_parts.append(f"![{img}]({img})")
                mock_parts.append("")

        return "\n".join(mock_parts)

    async def _retry_with_backoff(self, func, *args, **kwargs):
        """
        Execute a function with exponential-backoff retry logic.

        Args:
            func: Async function to execute
            *args: Positional arguments for the function
            **kwargs: Keyword arguments for the function

        Returns:
            Result from the function

        Raises:
            Exception from the last retry attempt
        """
        max_retries = self.config.api_max_retries
        retry_delay = self.config.api_retry_delay
        backoff = self.config.api_retry_backoff
        max_delay = self.config.api_retry_max_delay
        retry_on_errors = self.config.retry_on_errors

        last_exception = None

        for attempt in range(max_retries + 1):
            try:
                result = await func(*args, **kwargs)

                # Check whether the result indicates success
                if hasattr(result, 'success') and not result.success:
                    # Check whether this is a retryable error
                    if result.error and retry_on_errors:
                        should_retry = any(err_type in str(result.error) for err_type in retry_on_errors)
                        if not should_retry and attempt < max_retries:
                            self.logger.warning(f"Non-retryable error: {result.error}")
                            return result

                    if attempt < max_retries:
                        self.logger.warning(f"API call failed (attempt {attempt + 1}/{max_retries + 1}): {result.error}")
                        last_exception = Exception(result.error or "API call failed")
                    else:
                        return result
                else:
                    # Success
                    if attempt > 0:
                        self.logger.info(f"API call succeeded after {attempt + 1} attempts")
                    return result

            except (aiohttp.ClientError, asyncio.TimeoutError, ConnectionError) as e:
                # Network-related errors are always retryable
                last_exception = e
                if attempt < max_retries:
                    self.logger.warning(f"Network error (attempt {attempt + 1}/{max_retries + 1}): {str(e)}")
                else:
                    self.logger.error(f"Network error after {max_retries + 1} attempts: {str(e)}")
                    raise

            except Exception as e:
                # Check whether this is a retryable error type
                if retry_on_errors:
                    should_retry = any(err_type in str(e) for err_type in retry_on_errors)
                    if not should_retry:
                        self.logger.error(f"Non-retryable error: {str(e)}")
                        raise

                last_exception = e
                if attempt < max_retries:
                    self.logger.warning(f"API error (attempt {attempt + 1}/{max_retries + 1}): {str(e)}")
                else:
                    self.logger.error(f"API error after {max_retries + 1} attempts: {str(e)}")
                    raise

            # If we need to retry, wait with exponential backoff
            if attempt < max_retries:
                delay = min(retry_delay * (backoff ** attempt), max_delay)
                self.logger.info(f"Retrying in {delay:.1f} seconds...")
                await asyncio.sleep(delay)

        # Should not reach here, but just in case
        if last_exception:
            raise last_exception
        else:
            raise Exception("Maximum retries exceeded")
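
    # Backoff sketch: delay = min(retry_delay * backoff ** attempt, max_delay).
    # With hypothetical config values retry_delay=1.0, backoff=2.0 and
    # max_delay=30.0, the waits between attempts are 1.0s, 2.0s, 4.0s, 8.0s,
    # 16.0s, then capped at 30.0s.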

    async def _process_batch_self_hosted_impl(self, batch_pdf_path: str, temp_dir: str, src_fileid: str,
                                              batch_idx: int, start_page: int, end_page: int,
                                              is_ppt_converted: bool) -> MinerUResult:
        """
        Process a batch of pages using the self-hosted MinerU API.

        Args:
            batch_pdf_path: Path to the batch PDF file
            temp_dir: Temporary directory
            src_fileid: Source file ID
            batch_idx: Batch index
            start_page: Original start page index
            end_page: Original end page index
            is_ppt_converted: Whether the PDF is from PPT

        Returns:
            MinerUResult for this batch
        """
        try:
            if not self.session:
                raise RuntimeError("API client not initialized")

            # Prepare multipart form data
            form_data = aiohttp.FormData()

            # API parameters
            form_data.add_field('return_middle_json', 'false')
            form_data.add_field('return_model_output', 'false')
            form_data.add_field('return_md', 'true')
            form_data.add_field('return_images', 'true')
            form_data.add_field('return_content_list', 'true')
            form_data.add_field('end_page_id', str(end_page - start_page))
            form_data.add_field('parse_method', 'auto')
            form_data.add_field('start_page_id', '0')
            form_data.add_field('output_dir', './output')
            form_data.add_field('server_url', 'string')
            form_data.add_field('backend', 'pipeline')
            form_data.add_field('table_enable', 'true')
            form_data.add_field('formula_enable', 'true')

            # Add the batch PDF file. Read the bytes eagerly so the handle can
            # be closed before aiohttp serializes the form at request time.
            with open(batch_pdf_path, 'rb') as f:
                form_data.add_field('files', f.read(), filename=f"batch_{batch_idx}.pdf",
                                    content_type='application/pdf')

            # Make the API request
            async with self.session.post(
                f"{self.config.mineru_api_url}/file_parse",
                data=form_data,
                headers={'accept': 'application/json'}
            ) as response:

                if response.status != 200:
                    error_text = await response.text()
                    raise Exception(f"Self-hosted MinerU API error: {response.status} - {error_text}")

                result = await response.json()

            # Process the batch result
            results = result.get('results', {})
            if not results:
                raise Exception("No results in API response")

            file_result = next(iter(results.values()))
            content_list_str = file_result.get('content_list', '')

            if not content_list_str:
                raise Exception("No content_list in API response")

            # Parse content_list with adjusted page indices
            markdown_content, images, tables, page_data = self._parse_content_list_to_markdown_batch(
                content_list_str, temp_dir, src_fileid, start_page
            )

            # Save the batch images if provided
            images_data = file_result.get('images', {})
            if images_data and isinstance(images_data, dict):
                saved_images = self._save_base64_images(images_data, temp_dir,
                                                        f"{src_fileid}_batch_{batch_idx}")
                images.extend([img for img in saved_images if img not in images])

            metadata = {
                "batch_idx": batch_idx,
                "start_page": start_page,
                "end_page": end_page,
                "pages_in_batch": end_page - start_page,
                "is_ppt_source": is_ppt_converted,
                "page_data": page_data  # Include page_data so it can be extracted per page
            }

            return MinerUResult(
                success=True,
                content=markdown_content,
                images=images,
                tables=tables,
                metadata=metadata,
                page_results=None
            )

        except Exception as e:
            self.logger.error(f"mineru-api: batch {batch_idx} processing failed: {str(e)}")
            return MinerUResult(
                success=False,
                content="",
                images=[],
                tables=[],
                metadata={"batch_idx": batch_idx, "error": str(e)},
                error=str(e)
            )

    async def _process_batch_self_hosted(self, batch_pdf_path: str, temp_dir: str, src_fileid: str,
                                         batch_idx: int, start_page: int, end_page: int,
                                         is_ppt_converted: bool) -> MinerUResult:
        """
        Process a batch WITHOUT retry logic.

        Batch failures fall back to single-page processing, where the retry happens.
        """
        # Direct call without the retry wrapper
        return await self._process_batch_self_hosted_impl(
            batch_pdf_path, temp_dir, src_fileid, batch_idx, start_page, end_page, is_ppt_converted
        )

    async def _process_batch_cloud_impl(self, batch_pdf_path: str, temp_dir: str, src_fileid: str,
                                        batch_idx: int, start_page: int, end_page: int,
                                        is_ppt_converted: bool) -> MinerUResult:
        """
        Implementation of batch processing using the cloud MinerU API.
        """
        try:
            # Upload the batch PDF
            file_url = await self._upload_file_to_accessible_url(batch_pdf_path,
                                                                 f"{src_fileid}_batch_{batch_idx}")

            # Create a task for the batch
            task_id = await self._create_mineru_task_full_document(file_url,
                                                                   f"{src_fileid}_batch_{batch_idx}")

            # Poll for completion
            result_url = await self._poll_task_completion(task_id, src_fileid, max_wait_time=300)

            # Download and extract the results
            markdown_content, images, tables, page_data, _ = await self._download_and_extract_results_with_content_list(
                result_url, temp_dir, f"{src_fileid}_batch_{batch_idx}"
            )

            # Adjust the page indices to match the original document
            adjusted_page_data = {}
            for page_idx, pdata in page_data.items():
                adjusted_idx = page_idx + start_page
                adjusted_page_data[adjusted_idx] = pdata
                adjusted_page_data[adjusted_idx]['metadata']['original_page_num'] = adjusted_idx + 1

            metadata = {
                "batch_idx": batch_idx,
                "start_page": start_page,
                "end_page": end_page,
                "pages_in_batch": end_page - start_page,
                "is_ppt_source": is_ppt_converted,
                "page_data": adjusted_page_data
            }

            return MinerUResult(
                success=True,
                content=markdown_content,
                images=images,
                tables=tables,
                metadata=metadata,
                page_results=None
            )

        except Exception as e:
            self.logger.error(f"mineru-api: cloud batch {batch_idx} processing failed: {str(e)}")
            return MinerUResult(
                success=False,
                content="",
                images=[],
                tables=[],
                metadata={"batch_idx": batch_idx, "error": str(e)},
                error=str(e)
            )

    async def _process_batch_cloud(self, batch_pdf_path: str, temp_dir: str, src_fileid: str,
                                   batch_idx: int, start_page: int, end_page: int,
                                   is_ppt_converted: bool) -> MinerUResult:
        """
        Process a batch using the cloud API WITHOUT retry logic.

        Batch failures fall back to single-page processing, where the retry happens.
        """
        # Direct call without the retry wrapper
        return await self._process_batch_cloud_impl(
            batch_pdf_path, temp_dir, src_fileid, batch_idx, start_page, end_page, is_ppt_converted
        )

    def _merge_batch_results(self, batch_results: List[Tuple[int, MinerUResult]],
                             total_pages: int, is_ppt_converted: bool) -> MinerUResult:
        """
        Merge the results from multiple batches into a single MinerUResult.

        Args:
            batch_results: List of (start_page, MinerUResult) tuples
            total_pages: Total number of pages in the original document
            is_ppt_converted: Whether the PDF was converted from PPT

        Returns:
            Merged MinerUResult
        """
        if not batch_results:
            return MinerUResult(
                success=False,
                content="",
                images=[],
                tables=[],
                metadata={},
                error="No successful batches"
            )

        # Sort the batches by start page
        batch_results.sort(key=lambda x: x[0])

        # Merge the content
        merged_content_parts = []
        all_images = []
        all_tables = []
        all_page_data = {}
        all_page_results = []

        for start_page, batch_result in batch_results:
            # Add the batch content
            merged_content_parts.append(batch_result.content)

            # Collect images (avoiding duplicates)
            for img in batch_result.images:
                if img not in all_images:
                    all_images.append(img)

            # Collect tables
            all_tables.extend(batch_result.tables)

            # Merge page data if available
            if batch_result.metadata and 'page_data' in batch_result.metadata:
                all_page_data.update(batch_result.metadata['page_data'])

            # Collect page results if present
            if batch_result.page_results:
                all_page_results.extend(batch_result.page_results)

        # Join the content with page separators
        merged_content = "\n\n".join(merged_content_parts)

        # Create the merged metadata
        merged_metadata = {
            "processing_mode": "batch_processing",
            "total_pages": total_pages,
            "batch_count": len(batch_results),
            "images_found": len(all_images),
            "tables_found": len(all_tables),
            "is_ppt_source": is_ppt_converted,
            "page_data": all_page_data if all_page_data else None
        }

        self.logger.info(f"mineru-api: merged {len(batch_results)} batches - "
                         f"{len(all_images)} images, {len(all_tables)} tables")

        return MinerUResult(
            success=True,
            content=merged_content,
            images=all_images,
            tables=all_tables,
            metadata=merged_metadata,
            page_results=all_page_results if all_page_results else None
        )

    def _parse_content_list_to_markdown_batch(self, content_list: Any, temp_dir: str, src_fileid: str,
                                              page_offset: int) -> Tuple[str, List[str], List[Dict], Dict[int, Dict]]:
        """
        Parse the content_list for a batch, adjusting page indices by an offset.

        Args:
            content_list: Content list from the API
            temp_dir: Temporary directory
            src_fileid: Source file ID
            page_offset: Offset to add to the page indices

        Returns:
            Tuple of (markdown, images, tables, page_data)
        """
        # Parse normally first
        markdown, images, tables, page_data = self._parse_content_list_to_markdown(
            content_list, temp_dir, src_fileid
        )

        # Adjust the page indices in page_data
        adjusted_page_data = {}
        for page_idx, pdata in page_data.items():
            adjusted_idx = page_idx + page_offset
            adjusted_page_data[adjusted_idx] = pdata
            # Update the page number in the metadata
            if 'metadata' in pdata:
                pdata['metadata']['page_num'] = adjusted_idx + 1

        # Adjust the page numbers in the tables
        for table in tables:
            if 'page' in table:
                table['page'] += page_offset

        return markdown, images, tables, adjusted_page_data
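
    # Offset sketch: a batch covering original pages 21-40 is parsed with
    # local indices 0-19; adding page_offset=20 restores document-level
    # indices, e.g. local page_idx 3 becomes document page_idx 23 (page 24).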
|
|
|
|
    def _parse_content_list_to_markdown(self, content_list: List[Dict], temp_dir: str, src_fileid: str) -> Tuple[str, List[str], List[Dict], Dict[int, Dict]]:
        """
        Parse content_list JSON format to markdown organized by pages.

        Args:
            content_list: List of content items with page_idx, type, and content
            temp_dir: Temporary directory for saving images
            src_fileid: Source file ID for logging

        Returns:
            Tuple of (markdown_content, image_list, table_list, page_data_dict)
        """
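        # Item shapes consumed below, inferred from the fields this parser
        # reads (the concrete values are illustrative only):
        #
        #   {"type": "text", "page_idx": 0, "text": "Introduction", "text_level": 1}
        #   {"type": "image", "page_idx": 1, "img_path": "images/fig1.png",
        #    "img_caption": ["Figure 1"], "img_footnote": []}
        #   {"type": "table", "page_idx": 2, "table_body": "<table>...</table>",
        #    "table_caption": ["Results"], "table_footnote": []}
        #   {"type": "equation", "page_idx": 3, "text": "E = mc^2", "text_format": "latex"}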
        try:
            import json
            import base64

            # If content_list is a string, parse it as JSON
            if isinstance(content_list, str):
                self.logger.info(f"mineru-api: parsing content_list string of length {len(content_list)}")
                try:
                    content_list = json.loads(content_list)
                    self.logger.info(f"mineru-api: parsed content_list to {type(content_list)} with {len(content_list) if isinstance(content_list, list) else 'N/A'} items")
                except json.JSONDecodeError as e:
                    self.logger.error(f"mineru-api: failed to parse content_list JSON: {str(e)}")
                    self.logger.error(f"mineru-api: content_list sample: {content_list[:500]}")
                    raise

            # Log content_list structure
            if isinstance(content_list, list):
                self.logger.info(f"mineru-api: content list has {len(content_list)} items")
                if content_list:
                    self.logger.debug(f"mineru-api: first item sample: {content_list[0]}")
            else:
                self.logger.warning(f"mineru-api: content list is not a list, type: {type(content_list)}")

            # Group content by page
            page_groups = {}
            for item in content_list:
                page_idx = item.get('page_idx', 0)
                if page_idx not in page_groups:
                    page_groups[page_idx] = []
                page_groups[page_idx].append(item)

            # Sort pages
            sorted_pages = sorted(page_groups.keys())

            # Build markdown and collect resources
            markdown_parts = []
            all_images = []
            all_tables = []
            page_data = {}

            for page_idx in sorted_pages:
                page_num = page_idx + 1  # Convert to 1-based
                page_items = page_groups[page_idx]

                # Page header is added later, only if the page turns out to
                # have content
                page_content_parts = []
                page_images = []
                page_tables = []

                for item in page_items:
                    item_type = item.get('type', 'text')

                    if item_type == 'text':
                        text = item.get('text', '').strip()
                        text_level = item.get('text_level', 0)

                        if text:
                            # Apply heading levels
                            if text_level > 0:
                                # Convert to markdown heading (capped at h6)
                                heading_prefix = '#' * min(text_level, 6)
                                page_content_parts.append(f"{heading_prefix} {text}")
                            else:
                                page_content_parts.append(text)

                    elif item_type == 'image':
                        img_path = item.get('img_path', '')
                        img_caption = item.get('img_caption', [])
                        img_footnote = item.get('img_footnote', [])

                        self.logger.info(f"mineru-api: processing image item - img_path: {img_path[:100] if img_path else 'None'}")

                        # Handle image path/data
                        if img_path:
                            if img_path.startswith('data:'):
                                # Base64-encoded image: decode and save to disk
                                try:
                                    # Extract format and data from the data URI
                                    header, data = img_path.split(',', 1)
                                    fmt = header.split('/')[1].split(';')[0]

                                    # Decode and save
                                    img_data = base64.b64decode(data)
                                    img_filename = f"content_list_img_{src_fileid}_p{page_num}_{len(page_images)}.{fmt}"
                                    img_file_path = os.path.join(temp_dir, img_filename)

                                    with open(img_file_path, 'wb') as f:
                                        f.write(img_data)

                                    page_images.append(img_filename)
                                    all_images.append(img_filename)

                                    # Add an image reference to the markdown
                                    page_content_parts.append(f"![{img_filename}]({img_filename})")

                                except Exception as e:
                                    self.logger.error(f"Failed to decode base64 image: {str(e)}")
                            else:
                                # Regular image path - the referenced file may not
                                # exist locally yet
                                img_filename = os.path.basename(img_path)

                                # For the self-hosted API, images may be referenced but
                                # not yet saved; record them and expect the files to
                                # arrive in the 'images' field of the response
                                page_images.append(img_filename)
                                all_images.append(img_filename)

                                # Use a relative path for the image reference
                                img_ref = img_path if img_path.startswith('images/') else f"images/{img_filename}"
                                page_content_parts.append(f"![{img_filename}]({img_ref})")
                                self.logger.info(f"mineru-api: added image reference: {img_ref}")
                                self.logger.info(f"mineru-api: expecting image file {img_filename} to be provided by API")

                        # Add captions if present
                        if img_caption:
                            caption_text = ' '.join(img_caption)
                            page_content_parts.append(f"*{caption_text}*")

                        if img_footnote:
                            footnote_text = ' '.join(img_footnote)
                            page_content_parts.append(f"**Note:** {footnote_text}")

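                    # A hypothetical data-URI img_path of
                    # "data:image/png;base64,iVBORw0..." is split above into
                    # fmt == "png", with `data` holding the base64 payload.
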
                    elif item_type == 'table':
                        table_body = item.get('table_body', '')
                        table_caption = item.get('table_caption', [])
                        table_footnote = item.get('table_footnote', [])

                        # Add table caption
                        if table_caption:
                            caption_text = ' '.join(table_caption)
                            page_content_parts.append(f"**{caption_text}**")

                        # Add table content
                        if table_body:
                            # If HTML table, add directly
                            if table_body.strip().startswith('<'):
                                page_content_parts.append(table_body)
                            else:
                                page_content_parts.append(f"```\n{table_body}\n```")

                        # Add footnote
                        if table_footnote:
                            footnote_text = ' '.join(table_footnote)
                            page_content_parts.append(f"*Note: {footnote_text}*")

                        # Store table data
                        table_data = {
                            'page': page_num,
                            'content': table_body,
                            'caption': table_caption,
                            'footnote': table_footnote
                        }
                        page_tables.append(table_data)
                        all_tables.append(table_data)

                    elif item_type == 'equation':
                        eq_text = item.get('text', '')
                        eq_format = item.get('text_format', 'latex')

                        if eq_text:
                            if eq_format == 'latex':
                                # Use display math for equations
                                page_content_parts.append(f"$$\n{eq_text}\n$$")
                            else:
                                page_content_parts.append(f"```{eq_format}\n{eq_text}\n```")

                # Combine page content, filtering out empty parts to avoid
                # excessive newlines
                non_empty_parts = [part for part in page_content_parts if part.strip()]
                page_content = '\n'.join(non_empty_parts) if non_empty_parts else ''

                # Only add page header and content if there's actual content
                if page_content:
                    markdown_parts.append(f"\n\n## Page {page_num}\n\n")
                    markdown_parts.append(page_content)

                # Store page data
                page_data[page_idx] = {
                    'content': page_content,
                    'images': page_images,
                    'tables': page_tables,
                    'metadata': {'page_num': page_num}
                }

            # Combine all markdown
            final_markdown = ''.join(markdown_parts)

            self.logger.info(f"mineru-api: parsed content_list - {len(sorted_pages)} pages, "
                             f"{len(all_images)} images, {len(all_tables)} tables")

            return final_markdown, all_images, all_tables, page_data

        except Exception as e:
            self.logger.error(f"mineru-api: content_list parsing failed: {str(e)}")
            raise

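    # Output shape in brief (input hypothetical): a lone item
    # {"type": "text", "page_idx": 0, "text": "Hello"} yields the markdown
    # "\n\n## Page 1\n\nHello", empty image and table lists, and page_data
    # {0: {'content': 'Hello', 'images': [], 'tables': [],
    #      'metadata': {'page_num': 1}}}.
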
    def detect_tables(self, content: str) -> bool:
        """
        Detect if content contains table structures.

        Based on gzero.py's table detection logic.
        """
        table_indicators = [
            '<table>', '<tr>', '<td>', '|---|',
            '表格', 'Table', '| ', ' |',
            '┌', '└', '├', '┤'  # Table border characters
        ]

        content_lower = content.lower()
        for indicator in table_indicators:
            if indicator.lower() in content_lower:
                return True

        # Check for pipe-separated table format
        lines = content.split('\n')
        pipe_lines = [line for line in lines if line.count('|') >= 2]
        if len(pipe_lines) >= 2:  # At least a header and one data row
            return True

        return False

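    # Detection sketch (`client` is a MinerUAPIClient instance; sample strings
    # hypothetical):
    #
    #   client.detect_tables("| a | b |\n|---|---|\n| 1 | 2 |")  # True (pipe rows)
    #   client.detect_tables("plain prose paragraph")            # False
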
    async def extract_plain_text(self, pdf_path: str, src_fileid: str) -> str:
        """
        Extract plain text from PDF using PyMuPDF.

        This provides text content for comparison with MinerU results.
        """
        try:
            text_parts = []
            with fitz.open(pdf_path) as doc:
                for page in doc:
                    page_text = page.get_text()
                    if page_text.strip():
                        text_parts.append(page_text)

            plain_text = '\n\n'.join(text_parts)

            self.logger.info(f"mineru-api: extracted {len(plain_text)} characters of plain text")

            return plain_text

        except Exception as e:
            self.logger.error(f"mineru-api: plain text extraction failed: {str(e)}")
            return ""

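    # Usage sketch (path and file ID hypothetical), inside the client's async
    # context:
    #
    #   text = await client.extract_plain_text("/tmp/input.pdf", "file-123")
    #
    # which returns the concatenated page text, or "" on extraction failure.
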
    def merge_content(self, plain_text: str, mineru_content: str, src_fileid: str) -> str:
        """
        Merge plain text with MinerU structured content.

        This combines the reliability of plain text extraction with
        MinerU's structured parsing capabilities.
        """
        try:
            # Simple merge strategy - could be enhanced with more sophisticated logic
            if not mineru_content.strip():
                self.logger.warning(f"mineru-api: MinerU content empty, using plain text")
                return plain_text

            if not plain_text.strip():
                self.logger.warning(f"mineru-api: plain text empty, using MinerU content")
                return mineru_content

            # For now, prefer MinerU content as it should be more structured;
            # in practice, more sophisticated merging logic may be warranted
            self.logger.info(f"mineru-api: using MinerU structured content")
            return mineru_content

        except Exception as e:
            self.logger.error(f"mineru-api: content merge failed: {str(e)}")
            # Fall back to plain text
            return plain_text if plain_text else mineru_content
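
    # Fallback behavior of merge_content in brief (argument values
    # illustrative):
    #
    #   client.merge_content("raw text", "", "file-123")           # -> plain text
    #   client.merge_content("", "## Page 1\n\nText", "file-123")  # -> MinerU content
    #   client.merge_content("raw text", "## Page 1", "file-123")  # -> MinerU content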