# maxkb/apps/common/handle/impl/mineru/content_processor.py
"""
Content processing module for MinerU-based parsing.
This module handles table detection, content fusion, and LLM-based
content refinement, following patterns from gzero.py.
"""
import io
import json
import os
import re
from dataclasses import dataclass
from typing import Dict, List, Optional

import tiktoken
from bs4 import BeautifulSoup

from .api_client import MinerUResult
from .config_base import MinerUConfig
from .context_types import ContentElement, PageContext, ContextMode
from .language_detector import LanguageDetector
from .logger import get_module_logger
from .prompts.markdown_generation import format_markdown_generation_prompt

logger = get_module_logger('content_processor')
@dataclass
class ContentProcessingResult:
"""Result from content processing operations"""
success: bool
processed_content: str
has_tables: bool
processing_metadata: Dict
error: Optional[str] = None
class MinerUContentProcessor:
"""Content processor for handling tables and content fusion"""
def __init__(self, config: MinerUConfig):
self.config = config
self.logger = logger
def detect_tables(self, content: str, _src_fileid: str) -> bool:
"""
Detect if content contains table structures.
Based on gzero.py's table detection logic.
"""
        table_indicators = [
            '<table>', '<tr>', '<td>', '|---|',
            '表格', 'Table', '| ', ' |',
            '┌', '┐', '└', '┘',  # Table border characters
            '─', '│', '├', '┤', '┬', '┴'  # More table characters
        ]
content_lower = content.lower()
found_indicators = []
for indicator in table_indicators:
if indicator.lower() in content_lower:
found_indicators.append(indicator)
# Check for pipe-separated table format
lines = content.split('\n')
pipe_lines = [line for line in lines if line.count('|') >= 2]
has_pipe_table = len(pipe_lines) >= 2 # At least header and one data row
has_tables = bool(found_indicators) or has_pipe_table
if has_tables:
self.logger.info(f"mineru-content: tables detected - indicators: {found_indicators}, pipe_lines: {len(pipe_lines)}")
else:
self.logger.info(f"mineru-content: no tables detected")
return has_tables
async def process_content(self, mineru_result: MinerUResult, pdf_path: str,
temp_dir: str, src_fileid: str, learn_type: int,
has_tables: bool) -> ContentProcessingResult:
"""
[DEPRECATED] Process content with optional multimodal refinement.
This method is deprecated. Use process_page_content() instead for page-by-page processing.
Args:
mineru_result: Result from MinerU processing
pdf_path: Path to original PDF
temp_dir: Temporary directory
src_fileid: Source file ID
learn_type: Model type for LLM processing
has_tables: Whether the content contains tables
Returns:
ContentProcessingResult with processed content
"""
try:
process_type = "with tables" if has_tables else "without tables"
self.logger.info(f"mineru-content: processing content {process_type}")
# Check if multimodal refinement is enabled
if not self.config.enable_multimodal_refinement:
self.logger.info(f"mineru-content: multimodal refinement disabled, returning original content")
processing_metadata = {
'mineru_content_length': len(mineru_result.content),
'image_count': len(mineru_result.images),
'table_count': len(mineru_result.tables),
'multimodal_refinement': False
}
return ContentProcessingResult(
success=True,
processed_content=mineru_result.content,
has_tables=has_tables,
processing_metadata=processing_metadata
)
# Step 1: Extract plain text from PDF (only if has tables)
plain_text = ""
if has_tables:
plain_text = await self._extract_plain_text(pdf_path, src_fileid)
# Step 2: Extract PDF page images for multimodal processing
pdf_page_images = await self._extract_pdf_page_images(pdf_path, temp_dir, src_fileid)
# Step 3: Use LLM to refine content with PDF page images
refined_content = await self._llm_refine_content(
mineru_result.content, plain_text, pdf_page_images, temp_dir, src_fileid, learn_type, None
)
processing_metadata = {
'mineru_content_length': len(mineru_result.content),
'refined_content_length': len(refined_content),
'image_count': len(mineru_result.images),
'table_count': len(mineru_result.tables),
'pdf_page_images_count': len(pdf_page_images),
'multimodal_refinement': True
}
if has_tables:
processing_metadata['plain_text_length'] = len(plain_text)
self.logger.info(f"mineru-content: content processing completed: {processing_metadata}")
return ContentProcessingResult(
success=True,
processed_content=refined_content,
has_tables=has_tables,
processing_metadata=processing_metadata
)
except Exception as e:
self.logger.error(f"mineru-content: content processing failed: {str(e)}")
return ContentProcessingResult(
success=False,
processed_content=mineru_result.content, # Fallback to original
has_tables=has_tables,
processing_metadata={},
error=str(e)
)
async def _extract_plain_text(self, pdf_path: str, _src_fileid: str) -> str:
"""
Extract plain text from PDF using PyMuPDF.
"""
try:
import fitz
text_parts = []
with fitz.open(pdf_path) as doc:
for page_num, page in enumerate(doc):
page_text = page.get_text('text') # Plain text extraction
if page_text.strip():
text_parts.append(f"=== Page {page_num + 1} ===\n{page_text}")
plain_text = '\n\n'.join(text_parts)
self.logger.info(f"mineru-content: extracted {len(plain_text)} characters of plain text")
return plain_text
except Exception as e:
self.logger.error(f"mineru-content: plain text extraction failed: {str(e)}")
return ""
async def _extract_pdf_page_images(self, pdf_path: str, temp_dir: str, _src_fileid: str,
max_pages: int = 10) -> List[str]:
"""
Extract PDF pages as images for multimodal processing.
Args:
pdf_path: Path to PDF file
temp_dir: Temporary directory for saving images
_src_fileid: Source file ID
max_pages: Maximum number of pages to extract (to avoid token limits)
Returns:
List of paths to page image files
"""
try:
import pdf2image
import os
self.logger.info(f"mineru-content: extracting PDF page images from {pdf_path}")
# Get page count first
import fitz
with fitz.open(pdf_path) as doc:
total_pages = len(doc)
# Determine pages to extract
pages_to_extract = min(total_pages, max_pages)
# Configure pdf2image options
options = {
'pdf_path': pdf_path,
'dpi': 150, # Lower DPI for multimodal processing
'fmt': 'png',
'output_folder': temp_dir,
'use_pdftocairo': True,
'paths_only': True,
'first_page': 1,
'last_page': pages_to_extract
}
# Convert PDF pages to images
image_paths = pdf2image.convert_from_path(**options)
# Rename images to standard format
page_images = []
for idx, img_path in enumerate(image_paths):
new_name = f"pdf_page_{idx + 1:03d}.png"
new_path = os.path.join(temp_dir, new_name)
os.rename(img_path, new_path)
page_images.append(new_path)
self.logger.info(f"mineru-content: extracted {len(page_images)} page images")
return page_images
except Exception as e:
self.logger.error(f"mineru-content: PDF page image extraction failed: {str(e)}")
return []
# def _extract_html_tables(self, content: str) -> List[Tuple[str, int, int]]:
# """
# Extract HTML tables from content with their positions.
# Returns:
# List of tuples: (table_html, start_pos, end_pos)
# """
# tables = []
# # Find all HTML table tags
# table_pattern = r'<table[^>]*>.*?</table>'
# matches = re.finditer(table_pattern, content, re.DOTALL | re.IGNORECASE)
# for match in matches:
# table_html = match.group(0)
# start_pos = match.start()
# end_pos = match.end()
# tables.append((table_html, start_pos, end_pos))
# self.logger.info(f"mineru-content: found {len(tables)} HTML tables")
# return tables
def _find_related_pdf_content(self, plain_text: str, table_html: str, context_size: int = 500) -> str:
"""
Find related content in PDF text based on table content.
Args:
plain_text: Full PDF plain text
table_html: HTML table to find context for
context_size: Number of characters before/after to include
Returns:
Related PDF text chunk
"""
try:
# Extract text from HTML table
soup = BeautifulSoup(table_html, 'html.parser')
table_text = soup.get_text(separator=' ', strip=True)
# Extract key words from table (first few non-numeric words)
            # Split the text on punctuation marks (ASCII and CJK)
            # The escaped '-' avoids the character-range syntax error this regex previously had
            words = re.split(r'[,.!:?;*\-,。!?;、/()\s]+', table_text)
key_words = []
for word in words:
# Skip pure numbers and short words
if len(word) > 3 and word not in key_words:
key_words.append(word)
if not key_words:
self.logger.warning("mineru-content: no key words found in table")
return ""
# Search for key words in PDF text
best_match_pos = -1
best_match_score = 0
# Use sliding window to find best match
window_size = len(table_text) * 3 # Look for similar sized content
for i in range(0, len(plain_text) - window_size, 100): # Step by 100 chars
window_text = plain_text[i:i + window_size]
# Count matching key words
match_score = sum(1 for word in key_words if word.lower() in window_text.lower())
if match_score > best_match_score:
best_match_score = match_score
best_match_pos = i
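            # Accept the best window only if at least ~30% of the table's key words appear in it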
if best_match_pos >= 0 and best_match_score >= len(key_words) * 0.3:
# Extract context around the match
start = max(0, best_match_pos - context_size)
end = min(len(plain_text), best_match_pos + window_size + context_size)
context = plain_text[start:end]
self.logger.info(f"mineru-content: found related PDF content with score {best_match_score}/{len(key_words)}")
return context
else:
self.logger.warning(f"mineru-content: no good match found (best score: {best_match_score}/{len(key_words)})")
return ""
except Exception as e:
self.logger.error(f"mineru-content: error finding related PDF content: {str(e)}")
return ""
async def _llm_refine_content(self, content: str, plain_text: str, pdf_page_images: List[str],
temp_dir: str, src_fileid: str, learn_type: int, language_code: Optional[str] = None) -> str:
"""
Use LLM to refine content by:
1. Removing HTML tags from content
2. Combining content + plain_text + PDF page images
3. Using AI model to generate markdown output
4. Storing table existence in metadata
Args:
content: MinerU extracted content with HTML
plain_text: Plain text extracted from PDF
pdf_page_images: List of PDF page image paths (full page screenshots)
temp_dir: Temporary directory
src_fileid: Source file ID
learn_type: Model type for LLM
"""
import base64
try:
# Remove HTML tags from content
soup = BeautifulSoup(content, 'html.parser')
content_text = soup.get_text(separator='\n', strip=True)
# Detect if original content had tables (for cache metadata only)
has_tables_in_html = bool(soup.find_all('table'))
# Use provided language code or detect from content
if not language_code:
combined_text = content_text + "\n" + plain_text
if combined_text.strip():
detected_code, confidence = LanguageDetector.detect_language(combined_text)
if confidence > 0.7:
language_code = detected_code
self.logger.info(f"mineru-refine-content: detected language: {language_code} (confidence: {confidence:.2f})")
else:
self.logger.info(f"mineru-refine-content: language detection confidence too low ({confidence:.2f})")
if language_code:
self.logger.info(f"mineru-refine-content: will generate content in {LanguageDetector.get_language_name(language_code)}")
else:
self.logger.info(f"mineru-refine-content: no language specified, will use default")
            # Path of the cache file that stores the refinement result (written below)
cache_filepath = os.path.join(temp_dir, f"content_refinement_markdown_v1_{src_fileid}.json")
# Use imported prompt for markdown generation with language
markdown_generation_prompt = format_markdown_generation_prompt(language_code)
# Prepare messages with multimodal content
messages = [
{"role": "system", "content": markdown_generation_prompt}
]
# Create user message with text and images
user_content = []
# Add text content
combined_input = f"""## OCR提取的文本
{content_text}
## PDF原始文本
{plain_text}
请基于提供的PDF页面图片和以上文本源生成准确、完整的Markdown格式文档。特别注意识别和重建表格内容。"""
user_content.append({
"type": "text",
"text": combined_input
})
# Add PDF page images if available
if pdf_page_images:
self.logger.info(f"mineru-content: including {len(pdf_page_images)} PDF page images in multimodal request")
# Limit pages to avoid token limits and model constraints
# Most models have a limit of 4 images per request
max_pages = min(len(pdf_page_images), 4)
for idx, page_img_path in enumerate(pdf_page_images[:max_pages]):
if os.path.exists(page_img_path):
try:
with open(page_img_path, 'rb') as img_file:
                                # Read the image bytes and base64-encode them for the data URL
                                # (note: this file read is synchronous and may block briefly for large pages)
                                img_data = img_file.read()
                                img_base64 = base64.b64encode(img_data).decode('utf-8')
user_content.append({
"type": "image_url",
"image_url": {
"url": f"data:image/png;base64,{img_base64}"
}
})
self.logger.info(f"mineru-refine-content: added PDF page {idx + 1} image")
except Exception as e:
self.logger.warning(f"mineru-refine-content: failed to read PDF page image {page_img_path}: {str(e)}")
else:
self.logger.warning(f"mineru-refine-content: PDF page image not found: {page_img_path}")
else:
self.logger.warning("mineru-refine-content: no PDF page images provided for multimodal processing")
messages.append({
"role": "user",
"content": user_content
})
self.logger.info(f"mineru-refine-content: processing content with LLM for markdown generation")
# Use the unified litellm helper for multimodal request
response = await self.config.call_litellm(
model_type=learn_type,
messages=messages,
temperature=0.1
)
# Process response
refined_content = content
total_prompt_tokens = 0
total_completion_tokens = 0
try:
if (response.choices and
len(response.choices) > 0 and
response.choices[0].message and
response.choices[0].message.content):
refined_content = response.choices[0].message.content.strip()
                    # Extract the fenced markdown block from the response
                    markdown_start = refined_content.find("```markdown")
                    markdown_end = refined_content.rfind("```")
                    if markdown_start >= 0 and markdown_end > markdown_start:
                        # Strip the opening ```markdown fence (11 characters) and the closing ``` fence
                        refined_content = refined_content[markdown_start + 11:markdown_end].strip()
                    else:
                        # No fenced markdown block was found; keep the raw response content
self.logger.warning("mineru-refine-content: no markdown block found in response")
# Clean hallucination patterns first
refined_content = self._clean_hallucination_patterns(refined_content)
# Limit content length to prevent OpenSearch indexing errors
# OpenSearch has a max field length of 32766 bytes
# We use 30000 characters as a safe limit (considering UTF-8 encoding)
MAX_CONTENT_LENGTH = 30000
if len(refined_content) > MAX_CONTENT_LENGTH:
self.logger.warning(f"mineru-refine-content: content too long ({len(refined_content)} chars), truncating to {MAX_CONTENT_LENGTH}")
# Try to truncate at a sentence boundary
truncated = refined_content[:MAX_CONTENT_LENGTH]
# Find last complete sentence
                        for sep in ['. ', '。', '! ', '? ', '\n\n', '\n']:
last_sep = truncated.rfind(sep)
if last_sep > MAX_CONTENT_LENGTH * 0.9: # Within last 10%
refined_content = truncated[:last_sep + len(sep)]
break
else:
# If no good sentence boundary, just truncate
refined_content = truncated + "..."
# Track token usage
if hasattr(response, 'usage') and response.usage:
total_prompt_tokens = response.usage.prompt_tokens
total_completion_tokens = response.usage.completion_tokens
self.logger.info(
f"mineru-refine-content: markdown generation completed - "
f"tokens: {total_prompt_tokens}/{total_completion_tokens}"
)
else:
self.logger.warning("mineru-refine-content: empty response from LLM, using original content")
refined_content = content_text
# Also clean fallback content
refined_content = self._clean_hallucination_patterns(refined_content)
except (AttributeError, IndexError, ValueError) as e:
self.logger.error(f"mineru-refine-content: LLM response parsing failed: {str(e)}")
refined_content = content_text
# Clean and apply length limit to fallback content
refined_content = self._clean_hallucination_patterns(refined_content)
MAX_CONTENT_LENGTH = 30000
if len(refined_content) > MAX_CONTENT_LENGTH:
self.logger.warning(f"mineru-refine-content: fallback content too long ({len(refined_content)} chars), truncating")
refined_content = refined_content[:MAX_CONTENT_LENGTH] + "..."
# Save to cache
try:
cache_data = {
"refined_content": refined_content,
"model": "llm", # Generic model name since we don't have model_config anymore
"input_length": len(content_text) + len(plain_text),
"output_length": len(refined_content),
"prompt_tokens": total_prompt_tokens,
"completion_tokens": total_completion_tokens,
"has_tables": has_tables_in_html,
"used_pdf_page_images": len(pdf_page_images) if pdf_page_images else 0
}
with open(cache_filepath, 'w', encoding='utf-8') as file:
json.dump(cache_data, file, ensure_ascii=False, indent=2)
except Exception as e:
self.logger.warning(f"mineru-refine-content: cache write failed: {str(e)}")
return refined_content
except Exception as e:
self.logger.error(f"mineru-refine-content: LLM refinement failed: {str(e)}")
# Fallback: return text without HTML tags
try:
soup = BeautifulSoup(content, 'html.parser')
return soup.get_text(separator='\n', strip=True)
            except Exception:
return content
def split_content_by_pages(self, mineru_result: MinerUResult) -> Dict[int, Dict]:
"""
Split MinerU results by page index.
Args:
mineru_result: MinerU processing result with content_list
Returns:
Dictionary mapping page_idx to page data
"""
page_data = {}
content_list = mineru_result.metadata.get('content_list', [])
# Group content by page
for item in content_list:
page_idx = item.get('page_idx', 0)
if page_idx not in page_data:
page_data[page_idx] = {
'content_items': [],
'images': [],
'tables': [],
'page_idx': page_idx
}
# Add item to appropriate list
if item['type'] == 'image':
page_data[page_idx]['images'].append(item.get('img_path', ''))
elif item['type'] == 'table':
page_data[page_idx]['tables'].append(item.get('metadata', {}))
page_data[page_idx]['content_items'].append(item)
# Extract page content from the merged content
# Try to split by different page markers
content = mineru_result.content
# Method 1: Split by "## Page X" markers
if '\n\n## Page ' in content:
content_parts = content.split('\n\n## Page ')
for i, part in enumerate(content_parts):
if i == 0 and not part.startswith('## Page'):
# Handle content before first page marker
if part.strip() and 0 in page_data:
page_data[0]['content'] = part.strip()
continue
# Extract page number
page_match = re.match(r'^(\d+)', part)
if page_match:
page_num = int(page_match.group(1)) - 1 # Convert to 0-based index
if page_num in page_data:
# Remove page number line and get content
lines = part.split('\n', 1)
if len(lines) > 1:
page_data[page_num]['content'] = lines[1].strip()
else:
# Method 2: If no clear page markers, try to reconstruct from content_list
current_content = []
current_page = -1
for item in content_list:
item_page = item.get('page_idx', 0)
# If we moved to a new page, save previous content
if item_page != current_page and current_page >= 0:
if current_page in page_data:
page_data[current_page]['content'] = '\n\n'.join(current_content)
current_content = []
current_page = item_page
# Add text content
if item['type'] in ['text', 'title']:
text = item.get('text', '') or item.get('content', '')
if text.strip():
if item['type'] == 'title':
current_content.append(f"## {text.strip()}")
else:
current_content.append(text.strip())
elif item['type'] == 'table':
current_content.append("[Table content]") # Placeholder for table
# Save last page content
if current_page >= 0 and current_page in page_data:
page_data[current_page]['content'] = '\n\n'.join(current_content)
return page_data
async def process_page_content(self, page_content: str, page_images: List[str],
pdf_path: str, page_idx: int, temp_dir: str,
src_fileid: str, learn_type: int, language_code: Optional[str] = None) -> Dict:
"""
Process content for a single page.
Args:
page_content: Content text for the page
page_images: Images found on the page
pdf_path: Path to PDF file
page_idx: Page index (0-based)
temp_dir: Temporary directory
src_fileid: Source file ID
learn_type: Model type for LLM
Returns:
Dictionary with processed page data
"""
try:
# Handle empty content gracefully
if not page_content or not page_content.strip():
self.logger.info(f"mineru-content: page {page_idx} has no text content")
return {
'page_idx': page_idx,
'content': '',
'images': page_images,
'has_tables': False,
'processing_metadata': {
'original_length': 0,
'refined_length': 0,
'multimodal_used': False,
'empty_page': True
}
}
# Detect tables in this page's content
has_tables = self.detect_tables(page_content, f"{src_fileid}_page_{page_idx}")
# Extract single page image if multimodal is enabled
pdf_page_images = []
if self.config.enable_multimodal_refinement:
# Extract just this page as image
pdf_page_images = await self._extract_single_pdf_page_image(
pdf_path, page_idx, temp_dir, src_fileid
)
plain_text = ""
if self.config.enable_multimodal_refinement and has_tables:
plain_text = await self._extract_page_plain_text(pdf_path, page_idx, src_fileid)
            # Run multimodal refinement only if it is enabled and the page has no extracted
            # plain text, contains tables, or contains images
if self.config.enable_multimodal_refinement and \
(plain_text == "" or has_tables or len(page_images) > 0):
# Refine content for this page
refined_content = await self._llm_refine_content(
page_content, plain_text, pdf_page_images, temp_dir,
f"{src_fileid}_page_{page_idx}", learn_type, language_code
)
else:
refined_content = page_content
# Clean hallucination patterns and apply length limit to all content
refined_content = self._clean_hallucination_patterns(refined_content)
MAX_CONTENT_LENGTH = 30000
if len(refined_content) > MAX_CONTENT_LENGTH:
self.logger.warning(f"mineru-content: page {page_idx} content too long ({len(refined_content)} chars), truncating")
refined_content = refined_content[:MAX_CONTENT_LENGTH] + "..."
return {
'page_idx': page_idx,
'content': refined_content,
'images': page_images,
'has_tables': has_tables,
'processing_metadata': {
'original_length': len(page_content),
'refined_length': len(refined_content),
'multimodal_used': self.config.enable_multimodal_refinement and (has_tables or len(page_images) > 0)
}
}
except Exception as e:
self.logger.error(f"mineru-content: page {page_idx} processing failed: {str(e)}")
return {
'page_idx': page_idx,
'content': page_content, # Fallback to original
'images': page_images,
'has_tables': False,
'processing_metadata': {'error': str(e)}
}
async def _extract_single_pdf_page_image(self, pdf_path: str, page_idx: int,
temp_dir: str, _src_fileid: str) -> List[str]:
"""Extract a single PDF page as image."""
try:
import pdf2image
options = {
'pdf_path': pdf_path,
'dpi': 150,
'fmt': 'png',
'output_folder': temp_dir,
'use_pdftocairo': True,
'paths_only': True,
'first_page': page_idx + 1, # pdf2image uses 1-based indexing
'last_page': page_idx + 1
}
image_paths = pdf2image.convert_from_path(**options)
if image_paths:
new_name = f"pdf_page_{page_idx + 1:03d}.png"
new_path = os.path.join(temp_dir, new_name)
os.rename(image_paths[0], new_path)
return [new_path]
return []
except Exception as e:
self.logger.error(f"mineru-content: failed to extract page {page_idx} image: {str(e)}")
return []
async def _extract_page_plain_text(self, pdf_path: str, page_idx: int, _src_fileid: str) -> str:
"""Extract plain text from a specific PDF page."""
try:
import fitz
with fitz.open(pdf_path) as doc:
if page_idx < len(doc):
page = doc[page_idx]
return page.get_text('text')
return ""
except Exception as e:
self.logger.error(f"mineru-content: failed to extract page {page_idx} text: {str(e)}")
return ""
def create_page_chunks(self, content: str, _src_fileid: str) -> List[Dict]:
"""
Split content into page-based chunks for compatibility with gzero.py format.
Args:
content: Processed content
src_fileid: Source file ID
Returns:
List of page dictionaries
"""
try:
# Split content by page markers
page_separators = ['=== Page ', '__PAGE_OF_PORTION_']
pages = []
current_content = content
# Try to split by existing page markers
page_parts = []
for separator in page_separators:
if separator in current_content:
page_parts = current_content.split(separator)
break
if len(page_parts) > 1:
# Content already has page separators
for i, part in enumerate(page_parts):
if i == 0 and not part.strip():
continue # Skip empty first part
pages.append({
'index': len(pages),
'content': part.strip(),
'image_map': {},
'summary': '',
'input_tokens': 0,
'output_tokens': 0,
'dura': 0.0
})
else:
# Single page content
pages.append({
'index': 0,
'content': content,
'image_map': {},
'summary': '',
'input_tokens': 0,
'output_tokens': 0,
'dura': 0.0
})
self.logger.info(f"mineru-content: created {len(pages)} page chunks")
return pages
except Exception as e:
self.logger.error(f"mineru-content: page chunking failed: {str(e)}")
# Fallback to single page
return [{
'index': 0,
'content': content,
'image_map': {},
'summary': '',
'input_tokens': 0,
'output_tokens': 0,
'dura': 0.0
}]
def extract_page_contexts(self, content_list: List[Dict], _src_fileid: str) -> List[PageContext]:
"""
Extract page context information from MinerU content list.
Args:
content_list: List of content items from MinerU with page_idx and type
src_fileid: Source file ID
Returns:
List of PageContext objects
"""
try:
self.logger.info(f"mineru-content: extracting page contexts from {len(content_list)} items")
# Group content by page
page_groups = {}
for idx, item in enumerate(content_list):
page_idx = item.get('page_idx', 0)
if page_idx not in page_groups:
page_groups[page_idx] = []
page_groups[page_idx].append((idx, item))
# Create PageContext for each page
page_contexts = []
for page_idx in sorted(page_groups.keys()):
page_items = page_groups[page_idx]
# Determine page type
has_title = any(item[1].get('type') == 'title' for item in page_items)
text_count = sum(1 for item in page_items if item[1].get('type') == 'text')
if has_title and text_count > 0:
page_type = 'mixed'
elif has_title:
page_type = 'title'
else:
page_type = 'content'
# Extract title if available
page_title = None
for _, item in page_items:
if item.get('type') == 'title':
page_title = item.get('text', '').strip()
break
# Create content elements
content_elements = []
text_parts = []
for position, item in page_items:
element_type = item.get('type', 'unknown')
content = item.get('text', '') or item.get('content', '')
if element_type == 'text' and content:
text_parts.append(content)
content_elements.append(ContentElement(
type=element_type,
content=content,
page_idx=page_idx,
position=position,
bbox=item.get('bbox'),
metadata=item.get('metadata', {})
))
# Join text content
text_content = '\n'.join(text_parts)
# Count tokens
token_count = self._count_tokens(text_content)
page_context = PageContext(
page_idx=page_idx,
page_type=page_type,
title=page_title,
content_elements=content_elements,
text_content=text_content,
token_count=token_count
)
page_contexts.append(page_context)
self.logger.info(f"mineru-content: extracted {len(page_contexts)} page contexts")
return page_contexts
except Exception as e:
self.logger.error(f"mineru-content: page context extraction failed: {str(e)}")
return []
def extract_context_for_position(self, content_list: List[Dict], position: int,
mode: ContextMode = ContextMode.PAGE,
window_size: int = 2,
max_tokens: int = 1000) -> str:
"""
Extract context around a specific position in the content list.
Args:
content_list: MinerU content list
position: Position in the content list
mode: Context extraction mode (PAGE or CHUNK)
window_size: Number of pages/chunks to include
max_tokens: Maximum tokens to include
Returns:
Extracted context string
"""
try:
if position >= len(content_list):
return ""
target_item = content_list[position]
if mode == ContextMode.PAGE:
# Extract based on page boundaries
target_page_idx = target_item.get('page_idx', 0)
min_page = max(0, target_page_idx - window_size)
max_page = target_page_idx + window_size
context_items = []
for idx, item in enumerate(content_list):
item_page = item.get('page_idx', 0)
if min_page <= item_page <= max_page and item.get('type') in ['text', 'title']:
# Add page marker
if item_page != target_page_idx:
page_marker = f"[Page {item_page + 1}]"
if not context_items or context_items[-1] != page_marker:
context_items.append(page_marker)
text = item.get('text', '').strip()
if text:
context_items.append(text)
else: # ContextMode.CHUNK
# Extract based on chunk position
min_pos = max(0, position - window_size)
max_pos = min(len(content_list) - 1, position + window_size)
context_items = []
for idx in range(min_pos, max_pos + 1):
if idx == position:
continue # Skip the target item itself
item = content_list[idx]
if item.get('type') in ['text', 'title', 'table']:
text = item.get('text', '').strip()
if text:
if idx < position:
context_items.append(f"[Before] {text}")
else:
context_items.append(f"[After] {text}")
# Join and truncate by tokens
context_text = '\n'.join(context_items)
return self._truncate_by_tokens(context_text, max_tokens)
except Exception as e:
self.logger.error(f"mineru-content: context extraction failed: {str(e)}")
return ""
def _count_tokens(self, text: str) -> int:
"""Count tokens in text using tiktoken."""
try:
encoding = tiktoken.get_encoding("cl100k_base")
return len(encoding.encode(text))
        except Exception:
            # Fallback: rough character-based estimate (~4 characters per token)
return len(text) // 4
def _truncate_by_tokens(self, text: str, max_tokens: int) -> str:
"""Truncate text to fit within token limit while preserving sentence boundaries."""
if not text:
return ""
try:
encoding = tiktoken.get_encoding("cl100k_base")
tokens = encoding.encode(text)
if len(tokens) <= max_tokens:
return text
# Truncate to max tokens
truncated_tokens = tokens[:max_tokens]
truncated_text = encoding.decode(truncated_tokens)
# Try to find a sentence boundary
            for sep in ['. ', '。', '! ', '? ', '\n']:
last_sep = truncated_text.rfind(sep)
if last_sep > len(truncated_text) * 0.8: # Within last 20%
return truncated_text[:last_sep + len(sep)]
return truncated_text + "..."
        except Exception:
# Fallback to character-based truncation
char_limit = max_tokens * 4
if len(text) > char_limit:
return text[:char_limit] + "..."
return text
def _clean_hallucination_patterns(self, content: str) -> str:
"""
Clean common AI hallucination patterns from content.
Args:
content: The content to clean
Returns:
Cleaned content
"""
if not content:
return content
original_length = len(content)
# Pattern 1: Remove excessive dots (more than 10 consecutive dots)
content = re.sub(r'\.{10,}', '...', content)
# Pattern 2: Remove other excessive repeated characters (more than 10)
# This handles patterns like "--------" or "========="
content = re.sub(r'(.)\1{9,}', r'\1\1\1', content)
# Pattern 3: Remove excessive repeated words or patterns
# For example: "................................................................"
# repeated many times
content = re.sub(r'(\.{3,}[\s\n]*){5,}', '...\n', content)
# Pattern 4: Remove number sequences that appear to be counting
# Like ", 68\n, 72\n, 73\n, 73\n, 73\n" repeated many times
content = re.sub(r'(,\s*\d+\s*\n?\s*){20,}', '', content)
        # Pattern 5: Strip dot leaders from table-of-contents style lines while keeping the text,
        # e.g. "8.2 macOS 系统安装无线驱动程序................................................"
content = re.sub(r'([^\n]+)(\.{20,})', r'\1', content)
# Pattern 6: Clean up multiple consecutive empty lines
content = re.sub(r'\n{4,}', '\n\n\n', content)
# Pattern 7: Remove excessive dots or comma-separated numbers at the end
# But preserve normal punctuation
content = re.sub(r'[\s\n]*[,\d\s]{10,}$', '', content)
content = re.sub(r'\.{4,}$', '...', content)
cleaned_length = len(content)
if cleaned_length < original_length:
reduction = original_length - cleaned_length
self.logger.info(f"mineru-content: cleaned {reduction} characters of hallucination patterns "
f"(from {original_length} to {cleaned_length})")
return content.strip()