"""
|
||
Base parser module for MinerU - 公共代码基类
|
||
|
||
这个模块包含所有公共的处理逻辑,不依赖任何特定平台的代码。
|
||
通过适配器模式,支持不同平台的差异化实现。
|
||
"""
|
||
|
||
import os
import json
import hashlib
import shutil
import time
from pathlib import Path
from typing import List, Dict, Optional, Tuple, Any, Protocol
from dataclasses import dataclass
from abc import ABC, abstractmethod

from langchain.docstore.document import Document

from .logger import get_module_logger

logger = get_module_logger('base_parser')

from .config_base import MinerUConfig
from .converter import DocumentConverter
from .api_client import MinerUAPIClient, MinerUResult
from .image_processor import MinerUImageProcessor
from .content_processor import MinerUContentProcessor
from .context_types import EnhancedProcessingResult, PageContext, ContextMode
from .parallel_processor import ParallelMinerUProcessor, PageTask
from .parallel_processor_pool import get_parallel_processor
from .utils import get_file_hash, get_temp_dir


class PlatformAdapter(ABC):
    """Platform adapter interface - declares the methods each platform must implement."""

    @abstractmethod
    async def trace_context(self, trace_id: str):
        """Enter the tracing context (used via ``async with``)."""
        pass

    @abstractmethod
    async def lock_enter(self, temp_dir: str):
        """Acquire the file lock for the given temp directory."""
        pass

    @abstractmethod
    async def lock_release(self, temp_dir: str):
        """Release the file lock for the given temp directory."""
        pass

    @abstractmethod
    async def upload_file(self, file_path: str, options: Any = None) -> str:
        """Upload a file and return its URL."""
        pass

    @abstractmethod
    def get_logger(self):
        """Return the logger instance."""
        return logger

    @abstractmethod
    def get_settings(self) -> Dict[str, Any]:
        """Return the configuration settings."""
        return {}

    @abstractmethod
    def get_learn_type(self, params: Dict[str, Any]) -> int:
        """Return the learn_type parameter."""
        return params.get('learn_type', 9)

    @abstractmethod
    def set_trace_id(self, trace_id: str):
        """Set the trace ID used for log correlation."""
        pass

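
# --- Example (illustrative sketch only; not part of the original module) ---
# A minimal adapter showing how the PlatformAdapter contract above might be
# satisfied for single-process, local use. The no-op locks, the
# asynccontextmanager-based trace context and the "file://" upload URL are
# assumptions for illustration, not behaviour defined elsewhere in MinerU.
from contextlib import asynccontextmanager


class ExampleLocalAdapter(PlatformAdapter):
    """Illustrative local adapter: no locking, no real uploads."""

    def __init__(self):
        self._trace_id = None

    @asynccontextmanager
    async def trace_context(self, trace_id: str):
        # Tag subsequent log lines with the trace id, then yield control.
        self.set_trace_id(trace_id)
        yield

    async def lock_enter(self, temp_dir: str):
        pass  # single-process: no cross-process lock needed

    async def lock_release(self, temp_dir: str):
        pass

    async def upload_file(self, file_path: str, options: Any = None) -> str:
        # Hypothetical "upload": hand back a local file URL.
        return f"file://{file_path}"

    def get_logger(self):
        return logger

    def get_settings(self) -> Dict[str, Any]:
        return {}

    def get_learn_type(self, params: Dict[str, Any]) -> int:
        return params.get('learn_type', 9)

    def set_trace_id(self, trace_id: str):
        self._trace_id = trace_id
# ---------------------------------------------------------------------------
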

@dataclass
class ProcessingResult:
    """Main processing result data class - shared data structure."""
    success: bool
    content: str
    images: List[str]
    pages: List[Dict]
    advanced_parser: Dict
    error: Optional[str] = None
    page_contexts: Optional[List[PageContext]] = None
    content_list: Optional[List[Dict]] = None
    page_chunks: Optional[List[Dict]] = None


class BaseMinerUExtractor:
    """
    Base MinerU parser - contains all of the shared logic.

    This class holds the platform-independent processing logic and relies on
    the adapter pattern for platform-specific behaviour.
    """

    def __init__(self, adapter: PlatformAdapter, config=None, **kwargs):
        """
        Initialize the base parser.

        Args:
            adapter: Platform adapter instance.
            config: Configuration instance (optional; a default config is
                created when omitted).
            **kwargs: Platform-specific parameters.
        """
        self.adapter = adapter
        self.config = config if config else MinerUConfig()
        self.logger = adapter.get_logger()

        # Obtain learn_type from the adapter
        self.learn_type = adapter.get_learn_type(kwargs)

        # Keep the remaining parameters for subclasses
        self.platform_params = kwargs

        # Also expose the adapter as platform_adapter (for compatibility)
        self.platform_adapter = adapter

        # Initialize components
        self.converter = DocumentConverter(self.config)
        self.image_processor = MinerUImageProcessor(self.config)
        self.content_processor = MinerUContentProcessor(self.config)

        # Obtain the parallel processor, passing the platform adapter and config
        self.parallel_processor = get_parallel_processor(self.learn_type, self.platform_adapter, self.config)

    async def process_file(self, filepath: str, src_name: str = None,
                           upload_options: Tuple = None) -> List[Document]:
        """
        Main entry point - runs the full document processing flow.

        This is the shared pipeline; platform-specific behaviour is reached
        through the adapter.
        """
        # Generate the file ID
        src_fileid = get_file_hash(filepath)

        # Use the adapter's tracing context
        async with self.adapter.trace_context(src_fileid):
            try:
                self.logger.info(f"mineru-parser: starting file processing: {filepath}, file hash: {src_fileid}")

                # Set up the processing environment
                temp_dir = get_temp_dir(src_fileid, self.learn_type, self.config.cache_version)
                await self.adapter.lock_enter(temp_dir)

                try:
                    # Check the cache
                    if self.config.enable_cache:
                        cached_docs = await self._load_from_cache(temp_dir, filepath, src_name)
                        if cached_docs:
                            self.logger.info(f"mineru-parser: loaded {len(cached_docs)} documents from cache")
                            return cached_docs

                    # Process the document
                    result = await self._process_document_pipeline(
                        filepath, src_name, temp_dir, src_fileid, upload_options
                    )

                    self.logger.info(f"mineru-parser: pipeline returned result with success={result.success}")

                    if result.success:
                        if result.page_chunks:
                            docs = self._create_page_documents(result, filepath, src_name, temp_dir)
                            self.logger.info(f"mineru-parser: processing completed - returning {len(docs)} page documents")
                            return docs
                        else:
                            doc = self._create_document(result, filepath, src_name, temp_dir)
                            return [doc]
                    else:
                        raise Exception(result.error or "Processing failed")

                finally:
                    await self.adapter.lock_release(temp_dir)

                    # Clean up the temp directory
                    if not self.config.enable_cache:
                        shutil.rmtree(temp_dir, ignore_errors=True)

            except Exception as e:
                self.logger.error(f"mineru-parser: processing failed: {str(e)}")
                raise

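    # Illustrative usage sketch (an assumption, not defined in this module): a
    # caller constructs the extractor with a concrete adapter and awaits
    # process_file, e.g.
    #
    #     extractor = BaseMinerUExtractor(ExampleLocalAdapter(), learn_type=9)
    #     docs = asyncio.run(extractor.process_file("/path/to/input.pdf"))
    #
    # where ExampleLocalAdapter stands for any PlatformAdapter implementation
    # and the returned docs are langchain Document objects (one per page when
    # per-page processing is enabled).
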
    async def _process_document_pipeline(self, filepath: str, src_name: str,
                                         temp_dir: str, src_fileid: str,
                                         upload_options: Tuple) -> ProcessingResult:
        """Document processing pipeline - shared logic."""
        try:
            start_time = time.time()

            # Step 1: file type detection and conversion
            pdf_path, is_ppt_source = await self.converter.handle_file_input(
                filepath, temp_dir, src_fileid
            )

            # Step 2: PDF format detection
            if not is_ppt_source:
                is_ppt_format, metadata = self.converter.detect_pdf_format(pdf_path, src_fileid)
                self.logger.info(f"mineru-parser: PDF format detection: is_ppt={is_ppt_format}")
            else:
                is_ppt_format = True

            # Step 3: extract PDF pages
            pages_info = self.converter.extract_pdf_pages(pdf_path, src_fileid)

            # Step 4: process the document
            result = await self._process_document_format(
                pdf_path, pages_info, temp_dir, src_fileid, upload_options,
                is_ppt_format=(is_ppt_format or is_ppt_source)
            )

            processing_duration = time.time() - start_time

            # Attach processing metadata
            result.advanced_parser.update({
                'processing_duration': processing_duration,
                'is_ppt_source': is_ppt_source,
                'is_ppt_format': is_ppt_format,
                'learn_type': self.learn_type,
                'mineru_version': self.config.cache_version
            })

            return result

        except Exception as e:
            self.logger.error(f"mineru-parser: pipeline failed: {str(e)}")
            return ProcessingResult(
                success=False,
                content="",
                images=[],
                pages=[],
                advanced_parser={},
                error=str(e)
            )

    async def _process_document_format(self, pdf_path: str, pages_info: List, temp_dir: str,
                                       src_fileid: str, upload_options: Tuple,
                                       is_ppt_format: bool) -> ProcessingResult:
        """Unified document processing method - uses parallel processing."""
        # Determine the format label before the try block so the except handler
        # can always reference it.
        format_type = "PPT" if is_ppt_format else "non-PPT"
        try:
            self.logger.info(f"mineru-parser: processing as {format_type} format")

            if not self.parallel_processor:
                raise RuntimeError("Parallel processor not initialized")

            result = await self._process_document_parallel(
                pdf_path, pages_info, temp_dir, src_fileid,
                upload_options, is_ppt_format
            )
            return result

        except Exception as e:
            self.logger.error(f"mineru-parser: {format_type} format processing failed: {str(e)}")
            raise

    async def _process_document_parallel(self, pdf_path: str, pages_info: List,
                                         temp_dir: str, src_fileid: str,
                                         upload_options: Tuple,
                                         is_ppt_format: bool) -> ProcessingResult:
        """Process the document in parallel - with cache support."""
        try:
            self.logger.info("mineru-parser: using parallel processing pipeline")

            # Load pages that are already cached
            cached_pages = self._load_cached_pages(temp_dir)
            if cached_pages:
                self.logger.info(f"mineru-parser: found {len(cached_pages)} cached pages")

            # Build the upload callback (routed through the adapter).
            # Always create upload_callback and let the adapter decide how to handle it.
            # upload_func must accept 4 arguments: filepath, filename, upload_options, binary_data
            async def upload_callback(filepath, filename, options, binary_data=None):
                # If binary_data is present the image was compressed, so write it
                # to a temporary file before uploading.
                if binary_data:
                    import tempfile
                    import os as _os  # Import os locally to avoid scope issues
                    with tempfile.NamedTemporaryFile(suffix=_os.path.splitext(filename)[1], delete=False) as tmp:
                        tmp.write(binary_data)
                        tmp_path = tmp.name

                    result = await self.adapter.upload_file(tmp_path, upload_options)
                    # Clean up the temporary file
                    try:
                        _os.unlink(tmp_path)
                    except OSError:
                        pass
                elif filepath:
                    result = await self.adapter.upload_file(filepath, upload_options)
                else:
                    self.logger.warning(f"No file path or binary data provided for {filename}")
                    return None, None

                # Return in (url, upload_key) form
                return result, None

            # Process the document in parallel
            completed_tasks = await self.parallel_processor.process_document_with_cache(
                pdf_path, temp_dir, src_fileid, is_ppt_format,
                len(pages_info), upload_callback, upload_options,
                cached_pages=cached_pages,
                save_callback=lambda idx, data: self._save_page_cache(temp_dir, idx, data)
            )

            self.logger.info(f"mineru-parser: received {len(completed_tasks)} completed tasks")

            # Convert the results
            result = self._convert_parallel_results(completed_tasks, pages_info)

            # Mark the cache as complete
            if len(result.page_chunks) == len(pages_info):
                self._mark_cache_complete(temp_dir, len(pages_info))

            return result

        except Exception as e:
            self.logger.error(f"mineru-parser: parallel processing failed: {str(e)}")
            raise

    def _build_final_content(self, pages: List[Dict]) -> str:
        """Build the final document content - shared method."""
        content_parts = []
        for index, page in enumerate(pages):
            content_parts.append(f"__PAGE_OF_PORTION_{index + 1}__\n\n")
            content_parts.append(page['content'])
            content_parts.append('\n\n')
        return ''.join(content_parts)

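    # For example (illustrative), a two-page document yields:
    # "__PAGE_OF_PORTION_1__\n\n<page 1 content>\n\n__PAGE_OF_PORTION_2__\n\n<page 2 content>\n\n"
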
    def _create_document(self, result: ProcessingResult, filepath: str,
                         src_name: str, temp_dir: str) -> Document:
        """Create the Document object - shared method."""
        # Persist the result as JSON
        json_filepath = os.path.join(temp_dir, 'mineru_result.json')
        with open(json_filepath, 'w', encoding='utf-8') as file:
            json.dump(result.advanced_parser, file, ensure_ascii=False, indent=2)

        # Persist the content
        content_filepath = os.path.join(temp_dir, 'content.txt')
        with open(content_filepath, 'w', encoding='utf-8') as file:
            file.write(result.content)

        # Prepare the metadata
        doc_src_name = src_name if src_name else filepath
        doc_src_name = os.path.splitext(os.path.basename(doc_src_name))[0] + Path(filepath).suffix

        # Create the document
        doc = Document(
            page_content=result.content,
            metadata={
                'title': doc_src_name,
                'source': filepath,
                'outlines': '',
                'advanced_parser': json.dumps(result.advanced_parser, ensure_ascii=False),
                'resources': result.images,
                'parser_type': 'mineru',
                'parser_version': self.config.cache_version,
                'context_aware': self.config.enable_context_extraction
            }
        )

        return doc

    def _create_page_documents(self, result: ProcessingResult, filepath: str,
                               src_name: str, temp_dir: str) -> List[Document]:
        """Create per-page documents - shared method."""
        docs = []
        base_name = src_name if src_name else filepath
        base_name = os.path.splitext(os.path.basename(base_name))[0]
        file_ext = Path(filepath).suffix

        # Persist the main result
        main_advanced_parser = result.advanced_parser.copy()
        main_advanced_parser['per_page_processing'] = True
        main_advanced_parser['total_pages'] = len(result.page_chunks)

        main_json_filepath = os.path.join(temp_dir, 'mineru_result.json')
        with open(main_json_filepath, 'w', encoding='utf-8') as file:
            json.dump(main_advanced_parser, file, ensure_ascii=False, indent=2)

        # Track empty pages
        empty_pages = []

        for chunk in result.page_chunks:
            page_num = chunk['page_idx'] + 1

            # Check whether the page is empty
            page_content = chunk.get('content', '').strip()
            page_images = chunk.get('images', [])

            if not page_content and not page_images:
                self.logger.error(f"mineru-parser: Page {page_num} has no content and no images - skipping")
                empty_pages.append(page_num)
                continue

            # Build the page metadata
            page_advanced_parser = {
                'learn_type': self.learn_type,
                'page_number': page_num,
                'total_pages': len(result.page_chunks),
                'has_tables': chunk.get('has_tables', False),
                'images': chunk.get('images', []),
                'processing_metadata': chunk.get('processing_metadata', {}),
                'mineru_metadata': result.advanced_parser.get('mineru_metadata', {}),
                'per_page_processing': True
            }

            # Persist the page result
            page_json_filepath = os.path.join(temp_dir, f'mineru_result_page_{page_num}.json')
            with open(page_json_filepath, 'w', encoding='utf-8') as file:
                json.dump(page_advanced_parser, file, ensure_ascii=False, indent=2)

            # Persist the page content
            page_content_filepath = os.path.join(temp_dir, f'page_{page_num}_content.txt')
            with open(page_content_filepath, 'w', encoding='utf-8') as file:
                file.write(chunk['content'])

            # Create the document
            doc = Document(
                page_content=chunk['content'],
                metadata={
                    'title': f"{base_name}_page_{page_num}{file_ext}",
                    'source': filepath,
                    'page': page_num,
                    'total_pages': len(result.page_chunks),
                    'outlines': '',
                    'advanced_parser': json.dumps(page_advanced_parser, ensure_ascii=False),
                    'resources': chunk.get('images', []),
                    'parser_type': 'mineru',
                    'parser_version': self.config.cache_version,
                    'context_aware': self.config.enable_context_extraction,
                    'has_tables': chunk.get('has_tables', False)
                }
            )

            docs.append(doc)

        # Report empty pages
        if empty_pages:
            self.logger.error(f"mineru-parser: Found {len(empty_pages)} empty pages: {empty_pages}")

        return docs

    def _convert_parallel_results(self, completed_tasks: List[PageTask],
                                  pages_info: List) -> ProcessingResult:
        """Convert parallel processing results - shared method."""
        page_chunks = []
        all_processed_images = {}

        for task in completed_tasks:
            # Fetch the content and normalize it
            content = task.refined_content or task.content or ''

            if content and not content.strip():
                self.logger.info(f"mineru-parser: page {task.page_idx} contains only whitespace")
                content = ''

            # Clean the content
            content = self.content_processor._clean_hallucination_patterns(content)
            MAX_CONTENT_LENGTH = 30000
            if len(content) > MAX_CONTENT_LENGTH:
                self.logger.warning(f"mineru-parser: page {task.page_idx} content too long, truncating")
                content = content[:MAX_CONTENT_LENGTH] + "..."

            page_chunk = {
                'page_idx': task.page_idx,
                'content': content,
                'images': task.images,
                'processed_images': task.processed_images,
                'image_descriptions': task.image_descriptions,
                'has_tables': task.metadata.get('has_tables', False),
                'processing_metadata': task.metadata
            }
            page_chunks.append(page_chunk)
            all_processed_images.update(task.processed_images)

        # Build the page structures
        pages = []
        for chunk in page_chunks:
            pages.append({
                'index': chunk['page_idx'],
                'content': chunk['content'],
                'image_map': {},
                'summary': '',
                'input_tokens': 0,
                'output_tokens': 0,
                'dura': 0.0,
                'has_tables': chunk.get('has_tables', False),
                'images': chunk.get('images', [])
            })

        # Build the final content
        final_content = self._build_final_content(pages)

        # Build the metadata
        advanced_parser = {
            'learn_type': self.learn_type,
            'input_tokens': 0,
            'output_tokens': 0,
            'dura': 0,
            'pages': pages,
            'parallel_processing': True,
            'page_processing_metadata': [chunk.get('processing_metadata', {}) for chunk in page_chunks],
            'image_count': len(all_processed_images),
            'context_extraction_enabled': self.config.enable_context_extraction,
            'multimodal_refinement_enabled': self.config.enable_multimodal_refinement,
            'per_page_processing': True
        }

        return ProcessingResult(
            success=True,
            content=final_content,
            images=list(all_processed_images.values()),
            pages=pages,
            advanced_parser=advanced_parser,
            page_chunks=page_chunks
        )

    # ========== Cache-related methods ==========

    async def _load_from_cache(self, temp_dir: str, filepath: str, src_name: str) -> Optional[List[Document]]:
        """Load documents from the cache - shared method."""
        try:
            result_json_path = os.path.join(temp_dir, 'mineru_result.json')
            cache_status_path = os.path.join(temp_dir, 'cache_status.json')

            # Check the cache status
            if os.path.exists(cache_status_path):
                with open(cache_status_path, 'r', encoding='utf-8') as f:
                    cache_status = json.load(f)

                if cache_status.get('status') == 'partial':
                    self.logger.info("mineru-parser: found partial cache")
                    return None
                elif cache_status.get('status') != 'complete':
                    return None

            if not os.path.exists(result_json_path):
                return None

            # Load the main result
            with open(result_json_path, 'r', encoding='utf-8') as f:
                advanced_parser = json.load(f)

            # Check whether per-page processing was used
            if advanced_parser.get('per_page_processing'):
                docs = []
                total_pages = advanced_parser.get('total_pages', 0)

                for page_num in range(1, total_pages + 1):
                    page_json_path = os.path.join(temp_dir, f'mineru_result_page_{page_num}.json')
                    page_content_path = os.path.join(temp_dir, f'page_{page_num}_content.txt')

                    if not os.path.exists(page_json_path) or not os.path.exists(page_content_path):
                        return None

                    with open(page_content_path, 'r', encoding='utf-8') as f:
                        page_content = f.read()

                    if page_content == '[EMPTY_PAGE]':
                        page_content = ''

                    with open(page_json_path, 'r', encoding='utf-8') as f:
                        page_advanced_parser = json.load(f)

                    base_name = src_name if src_name else filepath
                    base_name = os.path.splitext(os.path.basename(base_name))[0]
                    file_ext = Path(filepath).suffix

                    doc = Document(
                        page_content=page_content,
                        metadata={
                            'title': f"{base_name}_page_{page_num}{file_ext}",
                            'source': filepath,
                            'page': page_num,
                            'total_pages': total_pages,
                            'outlines': '',
                            'advanced_parser': json.dumps(page_advanced_parser, ensure_ascii=False),
                            'resources': page_advanced_parser.get('images', []),
                            'parser_type': 'mineru',
                            'parser_version': self.config.cache_version,
                            'context_aware': self.config.enable_context_extraction,
                            'has_tables': page_advanced_parser.get('has_tables', False)
                        }
                    )
                    docs.append(doc)

                self.logger.info(f"mineru-parser: cache hit - loaded {len(docs)} page documents")
                return docs
            else:
                # Single-document result
                content_path = os.path.join(temp_dir, 'content.txt')
                if not os.path.exists(content_path):
                    return None

                with open(content_path, 'r', encoding='utf-8') as f:
                    content = f.read()

                doc_src_name = src_name if src_name else filepath
                doc_src_name = os.path.splitext(os.path.basename(doc_src_name))[0] + Path(filepath).suffix

                doc = Document(
                    page_content=content,
                    metadata={
                        'title': doc_src_name,
                        'source': filepath,
                        'outlines': '',
                        'advanced_parser': json.dumps(advanced_parser, ensure_ascii=False),
                        'resources': advanced_parser.get('pages', [{}])[0].get('images', []),
                        'parser_type': 'mineru',
                        'parser_version': self.config.cache_version,
                        'context_aware': self.config.enable_context_extraction
                    }
                )

                self.logger.info("mineru-parser: cache hit - loaded single document")
                return [doc]

        except Exception as e:
            self.logger.error(f"mineru-parser: error loading from cache: {str(e)}")
            return None

    def _save_page_cache(self, temp_dir: str, page_idx: int, page_data: Dict):
        """Save a single page to the cache - shared method."""
        try:
            page_num = page_idx + 1

            # Persist the page content
            page_content_path = os.path.join(temp_dir, f'page_{page_num}_content.txt')
            content = page_data.get('content', '')
            page_images = page_data.get('images', [])

            if (not content or not content.strip()) and not page_images:
                self.logger.error(f"mineru-parser: Page {page_num} has no content and no images")
                return

            if not content or not content.strip():
                content = '[EMPTY_PAGE]'

            with open(page_content_path, 'w', encoding='utf-8') as f:
                f.write(content)

            # Persist the page metadata
            page_meta_path = os.path.join(temp_dir, f'page_{page_num}_meta.json')
            page_metadata = {
                'page_idx': page_idx,
                'page_num': page_num,
                'has_tables': page_data.get('has_tables', False),
                'images': page_data.get('images', []),
                'processed_images': page_data.get('processed_images', {}),
                'image_descriptions': page_data.get('image_descriptions', {}),
                'processing_metadata': page_data.get('processing_metadata', {}),
                'cached_at': time.time()
            }
            with open(page_meta_path, 'w', encoding='utf-8') as f:
                json.dump(page_metadata, f, ensure_ascii=False, indent=2)

            # Update the cache status
            self._update_cache_status(temp_dir, page_idx)

            self.logger.debug(f"mineru-parser: saved page {page_num} to cache")

        except Exception as e:
            self.logger.error(f"mineru-parser: failed to save page cache: {str(e)}")

    def _update_cache_status(self, temp_dir: str, completed_page_idx: int):
        """Update the cache status file - shared method."""
        cache_status_path = os.path.join(temp_dir, 'cache_status.json')

        try:
            if os.path.exists(cache_status_path):
                with open(cache_status_path, 'r', encoding='utf-8') as f:
                    status = json.load(f)
            else:
                status = {
                    'status': 'partial',
                    'completed_pages': 0,
                    'completed_indices': [],
                    'total_pages': None,
                    'started_at': time.time(),
                    'updated_at': time.time()
                }

            if completed_page_idx not in status['completed_indices']:
                status['completed_indices'].append(completed_page_idx)
                status['completed_pages'] = len(status['completed_indices'])

            status['updated_at'] = time.time()

            with open(cache_status_path, 'w', encoding='utf-8') as f:
                json.dump(status, f, ensure_ascii=False, indent=2)

        except Exception as e:
            self.logger.error(f"mineru-parser: failed to update cache status: {str(e)}")

    def _mark_cache_complete(self, temp_dir: str, total_pages: int):
        """Mark the cache as complete - shared method."""
        cache_status_path = os.path.join(temp_dir, 'cache_status.json')

        try:
            status = {
                'status': 'complete',
                'completed_pages': total_pages,
                'total_pages': total_pages,
                'completed_at': time.time()
            }

            with open(cache_status_path, 'w', encoding='utf-8') as f:
                json.dump(status, f, ensure_ascii=False, indent=2)

            self.logger.info("mineru-parser: marked cache as complete")

        except Exception as e:
            self.logger.error(f"mineru-parser: failed to mark cache complete: {str(e)}")

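    # On-disk cache layout under temp_dir, as written by the methods above
    # (file names are taken from this module; the JSON shapes shown are a sketch):
    #
    #     cache_status.json              {"status": "partial" | "complete", "completed_indices": [...], ...}
    #     page_{n}_content.txt           page content, or the sentinel "[EMPTY_PAGE]"
    #     page_{n}_meta.json             per-page metadata (images, has_tables, processing_metadata, ...)
    #     mineru_result.json             main result metadata (advanced_parser)
    #     mineru_result_page_{n}.json    per-page advanced_parser metadata
    #     content.txt                    full content for single-document results
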
    def _load_cached_pages(self, temp_dir: str) -> Dict[int, Dict]:
        """Load cached pages - shared method."""
        cached_pages = {}

        try:
            cache_status_path = os.path.join(temp_dir, 'cache_status.json')
            if os.path.exists(cache_status_path):
                with open(cache_status_path, 'r', encoding='utf-8') as f:
                    status = json.load(f)

                for page_idx in status.get('completed_indices', []):
                    page_num = page_idx + 1

                    content_path = os.path.join(temp_dir, f'page_{page_num}_content.txt')
                    meta_path = os.path.join(temp_dir, f'page_{page_num}_meta.json')

                    if os.path.exists(content_path) and os.path.exists(meta_path):
                        with open(content_path, 'r', encoding='utf-8') as f:
                            content = f.read()

                        if content == '[EMPTY_PAGE]':
                            content = ''

                        with open(meta_path, 'r', encoding='utf-8') as f:
                            metadata = json.load(f)

                        cached_pages[page_idx] = {
                            'content': content,
                            'metadata': metadata
                        }

                self.logger.info(f"mineru-parser: loaded {len(cached_pages)} cached pages")

        except Exception as e:
            self.logger.error(f"mineru-parser: failed to load cached pages: {str(e)}")

        return cached_pages