"""
|
||
Content processing module for MinerU-based parsing.
|
||
|
||
This module handles table detection, content fusion, and LLM-based
|
||
content refinement, following patterns from gzero.py.
|
||
"""
|
||
|
||
from typing import Dict, List, Optional
|
||
from dataclasses import dataclass
|
||
from .logger import get_module_logger
|
||
logger = get_module_logger('content_processor')
|
||
import tiktoken
|
||
|
||
import json
|
||
import os
|
||
import re
|
||
import io
|
||
from bs4 import BeautifulSoup
|
||
from .config_base import MinerUConfig
|
||
from .api_client import MinerUResult
|
||
from .context_types import ContentElement, PageContext, ContextMode
|
||
from .prompts.markdown_generation import format_markdown_generation_prompt
|
||
from .language_detector import LanguageDetector
|
||
|
||
|
||
@dataclass
class ContentProcessingResult:
    """Result from content processing operations"""
    success: bool
    processed_content: str
    has_tables: bool
    processing_metadata: Dict
    error: Optional[str] = None


class MinerUContentProcessor:
    """Content processor for handling tables and content fusion"""

    def __init__(self, config: MinerUConfig):
        self.config = config
        self.logger = logger

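    # Illustrative usage (a minimal sketch, not part of the module; assumes a
    # configured MinerUConfig instance named `config` and MinerU markdown in
    # `md_text` — both hypothetical here):
    #
    #     processor = MinerUContentProcessor(config)
    #     if processor.detect_tables(md_text, "file-123"):
    #         ...  # route the document through table-aware refinement
    #
    # The processor holds no state beyond the config and logger, so one
    # instance can be reused across documents.
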
    def detect_tables(self, content: str, _src_fileid: str) -> bool:
        """
        Detect if content contains table structures.

        Based on gzero.py's table detection logic.
        """
        table_indicators = [
            '<table>', '<tr>', '<td>', '|---|',
            '表格', 'Table', '| ', ' |',
            '┌', '└', '├', '┤',  # Table border characters
            '═', '║', '╔', '╗', '╚', '╝'  # More table border characters
        ]

        content_lower = content.lower()
        found_indicators = []

        for indicator in table_indicators:
            if indicator.lower() in content_lower:
                found_indicators.append(indicator)

        # Check for pipe-separated table format
        lines = content.split('\n')
        pipe_lines = [line for line in lines if line.count('|') >= 2]
        has_pipe_table = len(pipe_lines) >= 2  # At least a header and one data row

        has_tables = bool(found_indicators) or has_pipe_table

        if has_tables:
            self.logger.info(f"mineru-content: tables detected - indicators: {found_indicators}, pipe_lines: {len(pipe_lines)}")
        else:
            self.logger.info("mineru-content: no tables detected")

        return has_tables

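    # For reference, the pipe-table heuristic above treats any two lines with at
    # least two '|' characters as a table, e.g. (illustrative input only):
    #
    #     | Name  | Value |
    #     |-------|-------|
    #
    # yields pipe_lines of length 2 and therefore has_pipe_table == True, while
    # a single stray '|' on one line would not trigger detection.
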
    async def process_content(self, mineru_result: MinerUResult, pdf_path: str,
                              temp_dir: str, src_fileid: str, learn_type: int,
                              has_tables: bool) -> ContentProcessingResult:
        """
        [DEPRECATED] Process content with optional multimodal refinement.

        This method is deprecated. Use process_page_content() instead for page-by-page processing.

        Args:
            mineru_result: Result from MinerU processing
            pdf_path: Path to original PDF
            temp_dir: Temporary directory
            src_fileid: Source file ID
            learn_type: Model type for LLM processing
            has_tables: Whether the content contains tables

        Returns:
            ContentProcessingResult with processed content
        """
        try:
            process_type = "with tables" if has_tables else "without tables"
            self.logger.info(f"mineru-content: processing content {process_type}")

            # Check if multimodal refinement is enabled
            if not self.config.enable_multimodal_refinement:
                self.logger.info("mineru-content: multimodal refinement disabled, returning original content")
                processing_metadata = {
                    'mineru_content_length': len(mineru_result.content),
                    'image_count': len(mineru_result.images),
                    'table_count': len(mineru_result.tables),
                    'multimodal_refinement': False
                }

                return ContentProcessingResult(
                    success=True,
                    processed_content=mineru_result.content,
                    has_tables=has_tables,
                    processing_metadata=processing_metadata
                )

            # Step 1: Extract plain text from the PDF (only if the content has tables)
            plain_text = ""
            if has_tables:
                plain_text = await self._extract_plain_text(pdf_path, src_fileid)

            # Step 2: Extract PDF page images for multimodal processing
            pdf_page_images = await self._extract_pdf_page_images(pdf_path, temp_dir, src_fileid)

            # Step 3: Use the LLM to refine content with the PDF page images
            refined_content = await self._llm_refine_content(
                mineru_result.content, plain_text, pdf_page_images, temp_dir, src_fileid, learn_type, None
            )

            processing_metadata = {
                'mineru_content_length': len(mineru_result.content),
                'refined_content_length': len(refined_content),
                'image_count': len(mineru_result.images),
                'table_count': len(mineru_result.tables),
                'pdf_page_images_count': len(pdf_page_images),
                'multimodal_refinement': True
            }

            if has_tables:
                processing_metadata['plain_text_length'] = len(plain_text)

            self.logger.info(f"mineru-content: content processing completed: {processing_metadata}")

            return ContentProcessingResult(
                success=True,
                processed_content=refined_content,
                has_tables=has_tables,
                processing_metadata=processing_metadata
            )

        except Exception as e:
            self.logger.error(f"mineru-content: content processing failed: {str(e)}")
            return ContentProcessingResult(
                success=False,
                processed_content=mineru_result.content,  # Fall back to the original content
                has_tables=has_tables,
                processing_metadata={},
                error=str(e)
            )

    async def _extract_plain_text(self, pdf_path: str, _src_fileid: str) -> str:
        """
        Extract plain text from a PDF using PyMuPDF.
        """
        try:
            import fitz

            text_parts = []
            with fitz.open(pdf_path) as doc:
                for page_num, page in enumerate(doc):
                    page_text = page.get_text('text')  # Plain text extraction
                    if page_text.strip():
                        text_parts.append(f"=== Page {page_num + 1} ===\n{page_text}")

            plain_text = '\n\n'.join(text_parts)

            self.logger.info(f"mineru-content: extracted {len(plain_text)} characters of plain text")

            return plain_text

        except Exception as e:
            self.logger.error(f"mineru-content: plain text extraction failed: {str(e)}")
            return ""

    async def _extract_pdf_page_images(self, pdf_path: str, temp_dir: str, _src_fileid: str,
                                       max_pages: int = 10) -> List[str]:
        """
        Extract PDF pages as images for multimodal processing.

        Args:
            pdf_path: Path to PDF file
            temp_dir: Temporary directory for saving images
            _src_fileid: Source file ID
            max_pages: Maximum number of pages to extract (to avoid token limits)

        Returns:
            List of paths to page image files
        """
        try:
            import pdf2image

            self.logger.info(f"mineru-content: extracting PDF page images from {pdf_path}")

            # Get the page count first
            import fitz
            with fitz.open(pdf_path) as doc:
                total_pages = len(doc)

            # Determine how many pages to extract
            pages_to_extract = min(total_pages, max_pages)

            # Configure pdf2image options
            options = {
                'pdf_path': pdf_path,
                'dpi': 150,  # Lower DPI is sufficient for multimodal processing
                'fmt': 'png',
                'output_folder': temp_dir,
                'use_pdftocairo': True,
                'paths_only': True,
                'first_page': 1,
                'last_page': pages_to_extract
            }

            # Convert PDF pages to images
            image_paths = pdf2image.convert_from_path(**options)

            # Rename the images to a standard format
            page_images = []
            for idx, img_path in enumerate(image_paths):
                new_name = f"pdf_page_{idx + 1:03d}.png"
                new_path = os.path.join(temp_dir, new_name)
                os.rename(img_path, new_path)
                page_images.append(new_path)

            self.logger.info(f"mineru-content: extracted {len(page_images)} page images")

            return page_images

        except Exception as e:
            self.logger.error(f"mineru-content: PDF page image extraction failed: {str(e)}")
            return []


    # def _extract_html_tables(self, content: str) -> List[Tuple[str, int, int]]:
    #     """
    #     Extract HTML tables from content with their positions.
    #
    #     Returns:
    #         List of tuples: (table_html, start_pos, end_pos)
    #     """
    #     tables = []
    #
    #     # Find all HTML table tags
    #     table_pattern = r'<table[^>]*>.*?</table>'
    #     matches = re.finditer(table_pattern, content, re.DOTALL | re.IGNORECASE)
    #
    #     for match in matches:
    #         table_html = match.group(0)
    #         start_pos = match.start()
    #         end_pos = match.end()
    #         tables.append((table_html, start_pos, end_pos))
    #
    #     self.logger.info(f"mineru-content: found {len(tables)} HTML tables")
    #     return tables

    def _find_related_pdf_content(self, plain_text: str, table_html: str, context_size: int = 500) -> str:
        """
        Find related content in PDF text based on table content.

        Args:
            plain_text: Full PDF plain text
            table_html: HTML table to find context for
            context_size: Number of characters before/after to include

        Returns:
            Related PDF text chunk
        """
        try:
            # Extract text from the HTML table
            soup = BeautifulSoup(table_html, 'html.parser')
            table_text = soup.get_text(separator=' ', strip=True)

            # Extract key words from the table (first few non-numeric words).
            # Split the text on punctuation; the '-' is escaped so the character
            # class does not form an unintended range.
            words = re.split(r'[,.!:?;*\-,。!?;、/()()\s]+', table_text)
            key_words = []
            for word in words:
                # Skip pure numbers, short words and duplicates
                if len(word) > 3 and not word.isdigit() and word not in key_words:
                    key_words.append(word)

            if not key_words:
                self.logger.warning("mineru-content: no key words found in table")
                return ""

            # Search for the key words in the PDF text
            best_match_pos = -1
            best_match_score = 0

            # Use a sliding window to find the best match
            window_size = len(table_text) * 3  # Look for similarly sized content
            for i in range(0, len(plain_text) - window_size, 100):  # Step by 100 chars
                window_text = plain_text[i:i + window_size]

                # Count matching key words
                match_score = sum(1 for word in key_words if word.lower() in window_text.lower())

                if match_score > best_match_score:
                    best_match_score = match_score
                    best_match_pos = i

            if best_match_pos >= 0 and best_match_score >= len(key_words) * 0.3:
                # Extract context around the match
                start = max(0, best_match_pos - context_size)
                end = min(len(plain_text), best_match_pos + window_size + context_size)

                context = plain_text[start:end]
                self.logger.info(f"mineru-content: found related PDF content with score {best_match_score}/{len(key_words)}")
                return context
            else:
                self.logger.warning(f"mineru-content: no good match found (best score: {best_match_score}/{len(key_words)})")
                return ""

        except Exception as e:
            self.logger.error(f"mineru-content: error finding related PDF content: {str(e)}")
            return ""

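    # Sketch of the scoring idea used above (illustrative numbers only): with
    # key_words = ["Revenue", "Quarter", "Growth"] and a window containing
    # "revenue" and "growth", match_score is 2; since 2 >= 3 * 0.3, that window
    # (plus `context_size` characters on each side) is returned as the related
    # PDF context. Windows are compared case-insensitively and the
    # best-scoring window wins.
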
    async def _llm_refine_content(self, content: str, plain_text: str, pdf_page_images: List[str],
                                  temp_dir: str, src_fileid: str, learn_type: int, language_code: Optional[str] = None) -> str:
        """
        Use LLM to refine content by:
        1. Removing HTML tags from content
        2. Combining content + plain_text + PDF page images
        3. Using AI model to generate markdown output
        4. Storing table existence in metadata

        Args:
            content: MinerU extracted content with HTML
            plain_text: Plain text extracted from PDF
            pdf_page_images: List of PDF page image paths (full page screenshots)
            temp_dir: Temporary directory
            src_fileid: Source file ID
            learn_type: Model type for LLM
            language_code: Optional language code for the generated output
        """
        import base64
        try:
            # Remove HTML tags from the content
            soup = BeautifulSoup(content, 'html.parser')
            content_text = soup.get_text(separator='\n', strip=True)

            # Detect whether the original content had tables (for cache metadata only)
            has_tables_in_html = bool(soup.find_all('table'))

            # Use the provided language code or detect one from the content
            if not language_code:
                combined_text = content_text + "\n" + plain_text
                if combined_text.strip():
                    detected_code, confidence = LanguageDetector.detect_language(combined_text)
                    if confidence > 0.7:
                        language_code = detected_code
                        self.logger.info(f"mineru-refine-content: detected language: {language_code} (confidence: {confidence:.2f})")
                    else:
                        self.logger.info(f"mineru-refine-content: language detection confidence too low ({confidence:.2f})")

            if language_code:
                self.logger.info(f"mineru-refine-content: will generate content in {LanguageDetector.get_language_name(language_code)}")
            else:
                self.logger.info("mineru-refine-content: no language specified, will use default")

            # Cache file path for persisting the refinement result
            cache_filepath = os.path.join(temp_dir, f"content_refinement_markdown_v1_{src_fileid}.json")

            # Use the imported prompt for markdown generation, with the target language
            markdown_generation_prompt = format_markdown_generation_prompt(language_code)

            # Prepare messages with multimodal content
            messages = [
                {"role": "system", "content": markdown_generation_prompt}
            ]

            # Create the user message with text and images
            user_content = []

            # Add the text content
            combined_input = f"""## OCR提取的文本:
{content_text}

## PDF原始文本:
{plain_text}

请基于提供的PDF页面图片和以上文本源,生成准确、完整的Markdown格式文档。特别注意识别和重建表格内容。"""

            user_content.append({
                "type": "text",
                "text": combined_input
            })

            # Add PDF page images if available
            if pdf_page_images:
                self.logger.info(f"mineru-content: including {len(pdf_page_images)} PDF page images in multimodal request")
                # Limit the number of pages to avoid token limits and model constraints;
                # most models accept at most 4 images per request
                max_pages = min(len(pdf_page_images), 4)
                for idx, page_img_path in enumerate(pdf_page_images[:max_pages]):
                    if os.path.exists(page_img_path):
                        try:
                            with open(page_img_path, 'rb') as img_file:
                                # Read the image bytes and base64-encode them for the data URL
                                img_data = img_file.read()
                                img_buffer = io.BytesIO(img_data)
                                img_base64 = base64.b64encode(img_buffer.getvalue()).decode('utf-8')
                                user_content.append({
                                    "type": "image_url",
                                    "image_url": {
                                        "url": f"data:image/png;base64,{img_base64}"
                                    }
                                })
                                self.logger.info(f"mineru-refine-content: added PDF page {idx + 1} image")
                        except Exception as e:
                            self.logger.warning(f"mineru-refine-content: failed to read PDF page image {page_img_path}: {str(e)}")
                    else:
                        self.logger.warning(f"mineru-refine-content: PDF page image not found: {page_img_path}")
            else:
                self.logger.warning("mineru-refine-content: no PDF page images provided for multimodal processing")

            messages.append({
                "role": "user",
                "content": user_content
            })

            self.logger.info("mineru-refine-content: processing content with LLM for markdown generation")

            # Use the unified litellm helper for the multimodal request
            response = await self.config.call_litellm(
                model_type=learn_type,
                messages=messages,
                temperature=0.1
            )

            # Process the response
            refined_content = content
            total_prompt_tokens = 0
            total_completion_tokens = 0

            try:
                if (response.choices and
                        len(response.choices) > 0 and
                        response.choices[0].message and
                        response.choices[0].message.content):
                    refined_content = response.choices[0].message.content.strip()
                    # Extract the markdown content from the response
                    markdown_start = refined_content.find("```markdown")
                    markdown_end = refined_content.rfind("```")

                    if markdown_start >= 0 and markdown_end > markdown_start:
                        # Extract the markdown body and strip the surrounding ``` markers
                        refined_content = refined_content[markdown_start + len("```markdown"):markdown_end].strip()
                    else:
                        # No fenced markdown block found; keep the response as-is
                        self.logger.warning("mineru-refine-content: no markdown block found in response")

                    # Clean hallucination patterns first
                    refined_content = self._clean_hallucination_patterns(refined_content)

                    # Limit content length to prevent OpenSearch indexing errors:
                    # OpenSearch has a max field length of 32766 bytes, so
                    # 30000 characters is used as a safe limit (considering UTF-8 encoding)
                    MAX_CONTENT_LENGTH = 30000
                    if len(refined_content) > MAX_CONTENT_LENGTH:
                        self.logger.warning(f"mineru-refine-content: content too long ({len(refined_content)} chars), truncating to {MAX_CONTENT_LENGTH}")
                        # Try to truncate at a sentence boundary
                        truncated = refined_content[:MAX_CONTENT_LENGTH]
                        # Find the last complete sentence
                        for sep in ['. ', '。', '! ', '? ', '\n\n', '\n']:
                            last_sep = truncated.rfind(sep)
                            if last_sep > MAX_CONTENT_LENGTH * 0.9:  # Within the last 10%
                                refined_content = truncated[:last_sep + len(sep)]
                                break
                        else:
                            # No good sentence boundary found; truncate hard
                            refined_content = truncated + "..."

                    # Track token usage
                    if hasattr(response, 'usage') and response.usage:
                        total_prompt_tokens = response.usage.prompt_tokens
                        total_completion_tokens = response.usage.completion_tokens

                    self.logger.info(
                        f"mineru-refine-content: markdown generation completed - "
                        f"tokens: {total_prompt_tokens}/{total_completion_tokens}"
                    )
                else:
                    self.logger.warning("mineru-refine-content: empty response from LLM, using original content")
                    refined_content = content_text
                    # Also clean the fallback content
                    refined_content = self._clean_hallucination_patterns(refined_content)

            except (AttributeError, IndexError, ValueError) as e:
                self.logger.error(f"mineru-refine-content: LLM response parsing failed: {str(e)}")
                refined_content = content_text
                # Clean and apply the length limit to the fallback content
                refined_content = self._clean_hallucination_patterns(refined_content)
                MAX_CONTENT_LENGTH = 30000
                if len(refined_content) > MAX_CONTENT_LENGTH:
                    self.logger.warning(f"mineru-refine-content: fallback content too long ({len(refined_content)} chars), truncating")
                    refined_content = refined_content[:MAX_CONTENT_LENGTH] + "..."

            # Save the result to the cache file
            try:
                cache_data = {
                    "refined_content": refined_content,
                    "model": "llm",  # Generic model name since we don't have model_config anymore
                    "input_length": len(content_text) + len(plain_text),
                    "output_length": len(refined_content),
                    "prompt_tokens": total_prompt_tokens,
                    "completion_tokens": total_completion_tokens,
                    "has_tables": has_tables_in_html,
                    "used_pdf_page_images": len(pdf_page_images) if pdf_page_images else 0
                }
                with open(cache_filepath, 'w', encoding='utf-8') as file:
                    json.dump(cache_data, file, ensure_ascii=False, indent=2)
            except Exception as e:
                self.logger.warning(f"mineru-refine-content: cache write failed: {str(e)}")

            return refined_content

        except Exception as e:
            self.logger.error(f"mineru-refine-content: LLM refinement failed: {str(e)}")
            # Fallback: return the text without HTML tags
            try:
                soup = BeautifulSoup(content, 'html.parser')
                return soup.get_text(separator='\n', strip=True)
            except Exception:
                return content

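    # The multimodal request assembled above follows the OpenAI-style chat
    # format that litellm accepts: a system prompt plus one user turn whose
    # content mixes text and base64 data URLs (shape shown for illustration only):
    #
    #     messages = [
    #         {"role": "system", "content": "<markdown generation prompt>"},
    #         {"role": "user", "content": [
    #             {"type": "text", "text": "<OCR text + PDF text>"},
    #             {"type": "image_url", "image_url": {"url": "data:image/png;base64,..."}},
    #         ]},
    #     ]
    #
    # At most four page images are attached per request to stay within typical
    # model limits, and the reply is expected to carry a ```markdown fenced block.
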
    def split_content_by_pages(self, mineru_result: MinerUResult) -> Dict[int, Dict]:
        """
        Split MinerU results by page index.

        Args:
            mineru_result: MinerU processing result with content_list

        Returns:
            Dictionary mapping page_idx to page data
        """
        page_data = {}
        content_list = mineru_result.metadata.get('content_list', [])

        # Group content by page
        for item in content_list:
            page_idx = item.get('page_idx', 0)
            if page_idx not in page_data:
                page_data[page_idx] = {
                    'content_items': [],
                    'images': [],
                    'tables': [],
                    'page_idx': page_idx
                }

            # Add the item to the appropriate list
            if item['type'] == 'image':
                page_data[page_idx]['images'].append(item.get('img_path', ''))
            elif item['type'] == 'table':
                page_data[page_idx]['tables'].append(item.get('metadata', {}))

            page_data[page_idx]['content_items'].append(item)

        # Extract page content from the merged content.
        # Try to split by different page markers.
        content = mineru_result.content

        # Method 1: Split by "## Page X" markers
        if '\n\n## Page ' in content:
            content_parts = content.split('\n\n## Page ')
            for i, part in enumerate(content_parts):
                if i == 0 and not part.startswith('## Page'):
                    # Handle content before the first page marker
                    if part.strip() and 0 in page_data:
                        page_data[0]['content'] = part.strip()
                    continue

                # Extract the page number
                page_match = re.match(r'^(\d+)', part)
                if page_match:
                    page_num = int(page_match.group(1)) - 1  # Convert to 0-based index
                    if page_num in page_data:
                        # Remove the page number line and keep the remaining content
                        lines = part.split('\n', 1)
                        if len(lines) > 1:
                            page_data[page_num]['content'] = lines[1].strip()
        else:
            # Method 2: No clear page markers, so reconstruct from content_list
            current_content = []
            current_page = -1

            for item in content_list:
                item_page = item.get('page_idx', 0)

                # If we moved to a new page, save the previous page's content
                if item_page != current_page and current_page >= 0:
                    if current_page in page_data:
                        page_data[current_page]['content'] = '\n\n'.join(current_content)
                    current_content = []

                current_page = item_page

                # Add text content
                if item['type'] in ['text', 'title']:
                    text = item.get('text', '') or item.get('content', '')
                    if text.strip():
                        if item['type'] == 'title':
                            current_content.append(f"## {text.strip()}")
                        else:
                            current_content.append(text.strip())
                elif item['type'] == 'table':
                    current_content.append("[Table content]")  # Placeholder for the table

            # Save the last page's content
            if current_page >= 0 and current_page in page_data:
                page_data[current_page]['content'] = '\n\n'.join(current_content)

        return page_data

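    # Example of the mapping produced above (illustrative data only): a
    # content_list such as
    #
    #     [{'type': 'title', 'text': 'Intro', 'page_idx': 0},
    #      {'type': 'text', 'text': 'Hello', 'page_idx': 0},
    #      {'type': 'table', 'metadata': {}, 'page_idx': 1}]
    #
    # yields page_data keyed by 0 and 1, where page 1 carries the table
    # metadata in 'tables' and, when no "## Page N" markers exist in the merged
    # content, page text is reconstructed from the items themselves with
    # "[Table content]" as a table placeholder.
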
    async def process_page_content(self, page_content: str, page_images: List[str],
                                   pdf_path: str, page_idx: int, temp_dir: str,
                                   src_fileid: str, learn_type: int, language_code: Optional[str] = None) -> Dict:
        """
        Process content for a single page.

        Args:
            page_content: Content text for the page
            page_images: Images found on the page
            pdf_path: Path to PDF file
            page_idx: Page index (0-based)
            temp_dir: Temporary directory
            src_fileid: Source file ID
            learn_type: Model type for LLM
            language_code: Optional language code for the generated output

        Returns:
            Dictionary with processed page data
        """
        try:
            # Handle empty content gracefully
            if not page_content or not page_content.strip():
                self.logger.info(f"mineru-content: page {page_idx} has no text content")
                return {
                    'page_idx': page_idx,
                    'content': '',
                    'images': page_images,
                    'has_tables': False,
                    'processing_metadata': {
                        'original_length': 0,
                        'refined_length': 0,
                        'multimodal_used': False,
                        'empty_page': True
                    }
                }

            # Detect tables in this page's content
            has_tables = self.detect_tables(page_content, f"{src_fileid}_page_{page_idx}")

            # Extract a single page image if multimodal refinement is enabled
            pdf_page_images = []
            if self.config.enable_multimodal_refinement:
                # Extract just this page as an image
                pdf_page_images = await self._extract_single_pdf_page_image(
                    pdf_path, page_idx, temp_dir, src_fileid
                )
            plain_text = ""
            if self.config.enable_multimodal_refinement and has_tables:
                plain_text = await self._extract_page_plain_text(pdf_path, page_idx, src_fileid)

            # Run multimodal refinement when it is enabled and the page has no
            # extracted plain text, contains tables, or has images
            if self.config.enable_multimodal_refinement and \
                    (plain_text == "" or has_tables or len(page_images) > 0):
                # Refine the content for this page
                refined_content = await self._llm_refine_content(
                    page_content, plain_text, pdf_page_images, temp_dir,
                    f"{src_fileid}_page_{page_idx}", learn_type, language_code
                )
            else:
                refined_content = page_content

            # Clean hallucination patterns and apply the length limit to all content
            refined_content = self._clean_hallucination_patterns(refined_content)
            MAX_CONTENT_LENGTH = 30000
            if len(refined_content) > MAX_CONTENT_LENGTH:
                self.logger.warning(f"mineru-content: page {page_idx} content too long ({len(refined_content)} chars), truncating")
                refined_content = refined_content[:MAX_CONTENT_LENGTH] + "..."

            return {
                'page_idx': page_idx,
                'content': refined_content,
                'images': page_images,
                'has_tables': has_tables,
                'processing_metadata': {
                    'original_length': len(page_content),
                    'refined_length': len(refined_content),
                    'multimodal_used': self.config.enable_multimodal_refinement and (has_tables or len(page_images) > 0)
                }
            }

        except Exception as e:
            self.logger.error(f"mineru-content: page {page_idx} processing failed: {str(e)}")
            return {
                'page_idx': page_idx,
                'content': page_content,  # Fall back to the original content
                'images': page_images,
                'has_tables': False,
                'processing_metadata': {'error': str(e)}
            }

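    # Typical per-page call (a sketch; `processor`, `page_text`, and the paths
    # are hypothetical and must come from the surrounding pipeline):
    #
    #     result = await processor.process_page_content(
    #         page_content=page_text,
    #         page_images=["images/p3_fig1.png"],
    #         pdf_path="/tmp/input.pdf",
    #         page_idx=2,
    #         temp_dir="/tmp/work",
    #         src_fileid="file-123",
    #         learn_type=1,
    #     )
    #
    # result['content'] holds the (possibly LLM-refined) page markdown, while
    # result['has_tables'] and result['processing_metadata'] describe how the
    # page was handled.
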
    async def _extract_single_pdf_page_image(self, pdf_path: str, page_idx: int,
                                             temp_dir: str, _src_fileid: str) -> List[str]:
        """Extract a single PDF page as an image."""
        try:
            import pdf2image

            options = {
                'pdf_path': pdf_path,
                'dpi': 150,
                'fmt': 'png',
                'output_folder': temp_dir,
                'use_pdftocairo': True,
                'paths_only': True,
                'first_page': page_idx + 1,  # pdf2image uses 1-based indexing
                'last_page': page_idx + 1
            }

            image_paths = pdf2image.convert_from_path(**options)

            if image_paths:
                new_name = f"pdf_page_{page_idx + 1:03d}.png"
                new_path = os.path.join(temp_dir, new_name)
                os.rename(image_paths[0], new_path)
                return [new_path]

            return []

        except Exception as e:
            self.logger.error(f"mineru-content: failed to extract page {page_idx} image: {str(e)}")
            return []

    async def _extract_page_plain_text(self, pdf_path: str, page_idx: int, _src_fileid: str) -> str:
        """Extract plain text from a specific PDF page."""
        try:
            import fitz

            with fitz.open(pdf_path) as doc:
                if page_idx < len(doc):
                    page = doc[page_idx]
                    return page.get_text('text')

            return ""

        except Exception as e:
            self.logger.error(f"mineru-content: failed to extract page {page_idx} text: {str(e)}")
            return ""

    def create_page_chunks(self, content: str, _src_fileid: str) -> List[Dict]:
        """
        Split content into page-based chunks for compatibility with the gzero.py format.

        Args:
            content: Processed content
            _src_fileid: Source file ID (unused)

        Returns:
            List of page dictionaries
        """
        try:
            # Split content by page markers
            page_separators = ['=== Page ', '__PAGE_OF_PORTION_']

            pages = []
            current_content = content

            # Try to split by existing page markers
            page_parts = []
            for separator in page_separators:
                if separator in current_content:
                    page_parts = current_content.split(separator)
                    break

            if len(page_parts) > 1:
                # Content already has page separators
                for i, part in enumerate(page_parts):
                    if i == 0 and not part.strip():
                        continue  # Skip the empty first part

                    pages.append({
                        'index': len(pages),
                        'content': part.strip(),
                        'image_map': {},
                        'summary': '',
                        'input_tokens': 0,
                        'output_tokens': 0,
                        'dura': 0.0
                    })
            else:
                # Single-page content
                pages.append({
                    'index': 0,
                    'content': content,
                    'image_map': {},
                    'summary': '',
                    'input_tokens': 0,
                    'output_tokens': 0,
                    'dura': 0.0
                })

            self.logger.info(f"mineru-content: created {len(pages)} page chunks")

            return pages

        except Exception as e:
            self.logger.error(f"mineru-content: page chunking failed: {str(e)}")
            # Fall back to a single page
            return [{
                'index': 0,
                'content': content,
                'image_map': {},
                'summary': '',
                'input_tokens': 0,
                'output_tokens': 0,
                'dura': 0.0
            }]

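    # The chunking above only splits on markers that upstream steps may have
    # inserted ('=== Page ' or '__PAGE_OF_PORTION_'); content without either
    # marker becomes a single chunk. Each chunk mirrors the gzero.py page dict
    # shape, e.g. (illustrative):
    #
    #     {'index': 0, 'content': '...', 'image_map': {}, 'summary': '',
    #      'input_tokens': 0, 'output_tokens': 0, 'dura': 0.0}
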
    def extract_page_contexts(self, content_list: List[Dict], _src_fileid: str) -> List[PageContext]:
        """
        Extract page context information from the MinerU content list.

        Args:
            content_list: List of content items from MinerU with page_idx and type
            _src_fileid: Source file ID (unused)

        Returns:
            List of PageContext objects
        """
        try:
            self.logger.info(f"mineru-content: extracting page contexts from {len(content_list)} items")

            # Group content by page
            page_groups = {}
            for idx, item in enumerate(content_list):
                page_idx = item.get('page_idx', 0)
                if page_idx not in page_groups:
                    page_groups[page_idx] = []
                page_groups[page_idx].append((idx, item))

            # Create a PageContext for each page
            page_contexts = []
            for page_idx in sorted(page_groups.keys()):
                page_items = page_groups[page_idx]

                # Determine the page type
                has_title = any(item[1].get('type') == 'title' for item in page_items)
                text_count = sum(1 for item in page_items if item[1].get('type') == 'text')

                if has_title and text_count > 0:
                    page_type = 'mixed'
                elif has_title:
                    page_type = 'title'
                else:
                    page_type = 'content'

                # Extract the title if available
                page_title = None
                for _, item in page_items:
                    if item.get('type') == 'title':
                        page_title = item.get('text', '').strip()
                        break

                # Create content elements
                content_elements = []
                text_parts = []

                for position, item in page_items:
                    element_type = item.get('type', 'unknown')
                    content = item.get('text', '') or item.get('content', '')

                    if element_type == 'text' and content:
                        text_parts.append(content)

                    content_elements.append(ContentElement(
                        type=element_type,
                        content=content,
                        page_idx=page_idx,
                        position=position,
                        bbox=item.get('bbox'),
                        metadata=item.get('metadata', {})
                    ))

                # Join the text content
                text_content = '\n'.join(text_parts)

                # Count tokens
                token_count = self._count_tokens(text_content)

                page_context = PageContext(
                    page_idx=page_idx,
                    page_type=page_type,
                    title=page_title,
                    content_elements=content_elements,
                    text_content=text_content,
                    token_count=token_count
                )

                page_contexts.append(page_context)

            self.logger.info(f"mineru-content: extracted {len(page_contexts)} page contexts")
            return page_contexts

        except Exception as e:
            self.logger.error(f"mineru-content: page context extraction failed: {str(e)}")
            return []

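    # For a page whose only item is {'type': 'title', 'text': 'Overview'}, the
    # logic above yields page_type == 'title', title == 'Overview', and an
    # empty text_content; a page mixing a title with text items is classified
    # as 'mixed'. (Illustrative data only.)
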
    def extract_context_for_position(self, content_list: List[Dict], position: int,
                                     mode: ContextMode = ContextMode.PAGE,
                                     window_size: int = 2,
                                     max_tokens: int = 1000) -> str:
        """
        Extract context around a specific position in the content list.

        Args:
            content_list: MinerU content list
            position: Position in the content list
            mode: Context extraction mode (PAGE or CHUNK)
            window_size: Number of pages/chunks to include
            max_tokens: Maximum tokens to include

        Returns:
            Extracted context string
        """
        try:
            if position >= len(content_list):
                return ""

            target_item = content_list[position]

            if mode == ContextMode.PAGE:
                # Extract based on page boundaries
                target_page_idx = target_item.get('page_idx', 0)
                min_page = max(0, target_page_idx - window_size)
                max_page = target_page_idx + window_size

                context_items = []
                for item in content_list:
                    item_page = item.get('page_idx', 0)
                    if min_page <= item_page <= max_page and item.get('type') in ['text', 'title']:
                        # Add a page marker
                        if item_page != target_page_idx:
                            page_marker = f"[Page {item_page + 1}]"
                            if not context_items or context_items[-1] != page_marker:
                                context_items.append(page_marker)

                        text = item.get('text', '').strip()
                        if text:
                            context_items.append(text)

            else:  # ContextMode.CHUNK
                # Extract based on chunk position
                min_pos = max(0, position - window_size)
                max_pos = min(len(content_list) - 1, position + window_size)

                context_items = []
                for idx in range(min_pos, max_pos + 1):
                    if idx == position:
                        continue  # Skip the target item itself

                    item = content_list[idx]
                    if item.get('type') in ['text', 'title', 'table']:
                        text = item.get('text', '').strip()
                        if text:
                            if idx < position:
                                context_items.append(f"[Before] {text}")
                            else:
                                context_items.append(f"[After] {text}")

            # Join and truncate by tokens
            context_text = '\n'.join(context_items)
            return self._truncate_by_tokens(context_text, max_tokens)

        except Exception as e:
            self.logger.error(f"mineru-content: context extraction failed: {str(e)}")
            return ""

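    # Mode comparison (illustrative): with window_size=1, PAGE mode gathers all
    # text/title items whose page_idx is within one page of the target item and
    # prefixes other pages with "[Page N]"; CHUNK mode instead takes the single
    # neighbouring item on each side of `position` and labels it "[Before]" or
    # "[After]". Both results are finally trimmed by _truncate_by_tokens() to
    # `max_tokens`.
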
    def _count_tokens(self, text: str) -> int:
        """Count tokens in text using tiktoken."""
        try:
            encoding = tiktoken.get_encoding("cl100k_base")
            return len(encoding.encode(text))
        except Exception:
            # Fall back to a character-based estimation
            return len(text) // 4

    def _truncate_by_tokens(self, text: str, max_tokens: int) -> str:
        """Truncate text to fit within a token limit while preserving sentence boundaries."""
        if not text:
            return ""

        try:
            encoding = tiktoken.get_encoding("cl100k_base")
            tokens = encoding.encode(text)

            if len(tokens) <= max_tokens:
                return text

            # Truncate to the maximum number of tokens
            truncated_tokens = tokens[:max_tokens]
            truncated_text = encoding.decode(truncated_tokens)

            # Try to find a sentence boundary
            for sep in ['. ', '。', '! ', '? ', '\n']:
                last_sep = truncated_text.rfind(sep)
                if last_sep > len(truncated_text) * 0.8:  # Within the last 20%
                    return truncated_text[:last_sep + len(sep)]

            return truncated_text + "..."

        except Exception:
            # Fall back to character-based truncation
            char_limit = max_tokens * 4
            if len(text) > char_limit:
                return text[:char_limit] + "..."
            return text

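    # A standalone sketch of the same truncation idea (assumes tiktoken is
    # installed; not used by the class directly):
    #
    #     enc = tiktoken.get_encoding("cl100k_base")
    #     tokens = enc.encode(text)
    #     short = enc.decode(tokens[:max_tokens])
    #
    # i.e. decode the first `max_tokens` tokens, then back off to the last
    # sentence separator if one falls in the final 20% of the decoded text.
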
    def _clean_hallucination_patterns(self, content: str) -> str:
        """
        Clean common AI hallucination patterns from content.

        Args:
            content: The content to clean

        Returns:
            Cleaned content
        """
        if not content:
            return content

        original_length = len(content)

        # Pattern 1: Collapse excessive dots (more than 10 consecutive dots)
        content = re.sub(r'\.{10,}', '...', content)

        # Pattern 2: Collapse other excessively repeated characters (more than 10);
        # this handles patterns like "--------" or "========="
        content = re.sub(r'(.)\1{9,}', r'\1\1\1', content)

        # Pattern 3: Collapse excessively repeated dot groups, e.g. "..."
        # repeated many times across lines
        content = re.sub(r'(\.{3,}[\s\n]*){5,}', '...\n', content)

        # Pattern 4: Remove number sequences that appear to be counting,
        # like ", 68\n, 72\n, 73\n, 73\n, 73\n" repeated many times
        content = re.sub(r'(,\s*\d+\s*\n?\s*){20,}', '', content)

        # Pattern 5: Strip table-of-contents dot leaders, like
        # "8.2 Installing the wireless driver on macOS............................"
        content = re.sub(r'([^\n]+)(\.{20,})', r'\1', content)

        # Pattern 6: Clean up multiple consecutive empty lines
        content = re.sub(r'\n{4,}', '\n\n\n', content)

        # Pattern 7: Remove trailing runs of dots or comma-separated numbers,
        # while preserving normal punctuation
        content = re.sub(r'[\s\n]*[,\d\s]{10,}$', '', content)
        content = re.sub(r'\.{4,}$', '...', content)

        cleaned_length = len(content)
        if cleaned_length < original_length:
            reduction = original_length - cleaned_length
            self.logger.info(f"mineru-content: cleaned {reduction} characters of hallucination patterns "
                             f"(from {original_length} to {cleaned_length})")

        return content.strip()
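
    # Example of the cleanup above (illustrative input/output): a TOC-style
    # line such as
    #
    #     "8.2 macOS wireless driver installation......................... 42"
    #
    # has its dot leader collapsed to "..." by Pattern 1, long runs of repeated
    # characters and comma-separated number sequences are trimmed by Patterns
    # 2-4, and four or more consecutive newlines shrink to three (Pattern 6).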