', '| ', '|---|',
'表格', 'Table', '| ', ' |',
'┌', '└', '├', '┤', # Table border characters
'═', '║', '╔', '╗', '╚', '╝' # More table characters
]
content_lower = content.lower()
found_indicators = []
for indicator in table_indicators:
if indicator.lower() in content_lower:
found_indicators.append(indicator)
# Check for pipe-separated table format
lines = content.split('\n')
pipe_lines = [line for line in lines if line.count('|') >= 2]
has_pipe_table = len(pipe_lines) >= 2 # At least header and one data row
has_tables = bool(found_indicators) or has_pipe_table
if has_tables:
self.logger.info(f"mineru-content: tables detected - indicators: {found_indicators}, pipe_lines: {len(pipe_lines)}")
else:
self.logger.info(f"mineru-content: no tables detected")
return has_tables
async def process_content(self, mineru_result: MinerUResult, pdf_path: str,
temp_dir: str, src_fileid: str, learn_type: int,
has_tables: bool) -> ContentProcessingResult:
"""
[DEPRECATED] Process content with optional multimodal refinement.
This method is deprecated. Use process_page_content() instead for page-by-page processing.
Args:
mineru_result: Result from MinerU processing
pdf_path: Path to original PDF
temp_dir: Temporary directory
src_fileid: Source file ID
learn_type: Model type for LLM processing
has_tables: Whether the content contains tables
Returns:
ContentProcessingResult with processed content
"""
try:
process_type = "with tables" if has_tables else "without tables"
self.logger.info(f"mineru-content: processing content {process_type}")
# Check if multimodal refinement is enabled
if not self.config.enable_multimodal_refinement:
self.logger.info(f"mineru-content: multimodal refinement disabled, returning original content")
processing_metadata = {
'mineru_content_length': len(mineru_result.content),
'image_count': len(mineru_result.images),
'table_count': len(mineru_result.tables),
'multimodal_refinement': False
}
return ContentProcessingResult(
success=True,
processed_content=mineru_result.content,
has_tables=has_tables,
processing_metadata=processing_metadata
)
# Step 1: Extract plain text from PDF (only if has tables)
plain_text = ""
if has_tables:
plain_text = await self._extract_plain_text(pdf_path, src_fileid)
# Step 2: Extract PDF page images for multimodal processing
pdf_page_images = await self._extract_pdf_page_images(pdf_path, temp_dir, src_fileid)
# Step 3: Use LLM to refine content with PDF page images
refined_content = await self._llm_refine_content(
mineru_result.content, plain_text, pdf_page_images, temp_dir, src_fileid, learn_type, None
)
processing_metadata = {
'mineru_content_length': len(mineru_result.content),
'refined_content_length': len(refined_content),
'image_count': len(mineru_result.images),
'table_count': len(mineru_result.tables),
'pdf_page_images_count': len(pdf_page_images),
'multimodal_refinement': True
}
if has_tables:
processing_metadata['plain_text_length'] = len(plain_text)
self.logger.info(f"mineru-content: content processing completed: {processing_metadata}")
return ContentProcessingResult(
success=True,
processed_content=refined_content,
has_tables=has_tables,
processing_metadata=processing_metadata
)
except Exception as e:
self.logger.error(f"mineru-content: content processing failed: {str(e)}")
return ContentProcessingResult(
success=False,
processed_content=mineru_result.content, # Fallback to original
has_tables=has_tables,
processing_metadata={},
error=str(e)
)
async def _extract_plain_text(self, pdf_path: str, _src_fileid: str) -> str:
"""
Extract plain text from PDF using PyMuPDF.
"""
try:
import fitz
text_parts = []
with fitz.open(pdf_path) as doc:
for page_num, page in enumerate(doc):
page_text = page.get_text('text') # Plain text extraction
if page_text.strip():
text_parts.append(f"=== Page {page_num + 1} ===\n{page_text}")
plain_text = '\n\n'.join(text_parts)
self.logger.info(f"mineru-content: extracted {len(plain_text)} characters of plain text")
return plain_text
except Exception as e:
self.logger.error(f"mineru-content: plain text extraction failed: {str(e)}")
return ""
async def _extract_pdf_page_images(self, pdf_path: str, temp_dir: str, _src_fileid: str,
max_pages: int = 10) -> List[str]:
"""
Extract PDF pages as images for multimodal processing.
Args:
pdf_path: Path to PDF file
temp_dir: Temporary directory for saving images
_src_fileid: Source file ID
max_pages: Maximum number of pages to extract (to avoid token limits)
Returns:
List of paths to page image files
"""
try:
import pdf2image
import os
self.logger.info(f"mineru-content: extracting PDF page images from {pdf_path}")
# Get page count first
import fitz
with fitz.open(pdf_path) as doc:
total_pages = len(doc)
# Determine pages to extract
pages_to_extract = min(total_pages, max_pages)
# Configure pdf2image options
options = {
'pdf_path': pdf_path,
'dpi': 150, # Lower DPI for multimodal processing
'fmt': 'png',
'output_folder': temp_dir,
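# pdftocairo backend tends to be faster and more reliable than the default pdftoppm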
'use_pdftocairo': True,
'paths_only': True,
'first_page': 1,
'last_page': pages_to_extract
}
# Convert PDF pages to images
image_paths = pdf2image.convert_from_path(**options)
# Rename images to standard format
page_images = []
for idx, img_path in enumerate(image_paths):
new_name = f"pdf_page_{idx + 1:03d}.png"
new_path = os.path.join(temp_dir, new_name)
os.rename(img_path, new_path)
page_images.append(new_path)
self.logger.info(f"mineru-content: extracted {len(page_images)} page images")
return page_images
except Exception as e:
self.logger.error(f"mineru-content: PDF page image extraction failed: {str(e)}")
return []
# def _extract_html_tables(self, content: str) -> List[Tuple[str, int, int]]:
# """
# Extract HTML tables from content with their positions.
# Returns:
# List of tuples: (table_html, start_pos, end_pos)
# """
# tables = []
# # Find all HTML table tags
#         table_pattern = r'<table[^>]*>.*?</table>'
# matches = re.finditer(table_pattern, content, re.DOTALL | re.IGNORECASE)
# for match in matches:
# table_html = match.group(0)
# start_pos = match.start()
# end_pos = match.end()
# tables.append((table_html, start_pos, end_pos))
# self.logger.info(f"mineru-content: found {len(tables)} HTML tables")
# return tables
def _find_related_pdf_content(self, plain_text: str, table_html: str, context_size: int = 500) -> str:
"""
Find related content in PDF text based on table content.
Args:
plain_text: Full PDF plain text
table_html: HTML table to find context for
context_size: Number of characters before/after to include
Returns:
Related PDF text chunk
"""
try:
# Extract text from HTML table
soup = BeautifulSoup(table_html, 'html.parser')
table_text = soup.get_text(separator=' ', strip=True)
# Extract key words from table (first few non-numeric words)
# Split the text on punctuation (ASCII and CJK) and whitespace
# The hyphen is escaped to avoid creating an unintended character range in the regex
words = re.split(r'[,.!:?;*\-,。!?;、/()()\s]+', table_text)
key_words = []
for word in words:
# Skip pure numbers and short words
if len(word) > 3 and not word.isdigit() and word not in key_words:
key_words.append(word)
if not key_words:
self.logger.warning("mineru-content: no key words found in table")
return ""
# Search for key words in PDF text
best_match_pos = -1
best_match_score = 0
# Use sliding window to find best match
window_size = len(table_text) * 3 # Look for similar sized content
for i in range(0, max(1, len(plain_text) - window_size + 1), 100):  # Step by 100 chars; keep at least one window for short texts
window_text = plain_text[i:i + window_size]
# Count matching key words
match_score = sum(1 for word in key_words if word.lower() in window_text.lower())
if match_score > best_match_score:
best_match_score = match_score
best_match_pos = i
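# Accept the match only if at least ~30% of the table's key words were found in the best window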
if best_match_pos >= 0 and best_match_score >= len(key_words) * 0.3:
# Extract context around the match
start = max(0, best_match_pos - context_size)
end = min(len(plain_text), best_match_pos + window_size + context_size)
context = plain_text[start:end]
self.logger.info(f"mineru-content: found related PDF content with score {best_match_score}/{len(key_words)}")
return context
else:
self.logger.warning(f"mineru-content: no good match found (best score: {best_match_score}/{len(key_words)})")
return ""
except Exception as e:
self.logger.error(f"mineru-content: error finding related PDF content: {str(e)}")
return ""
async def _llm_refine_content(self, content: str, plain_text: str, pdf_page_images: List[str],
temp_dir: str, src_fileid: str, learn_type: int, language_code: Optional[str] = None) -> str:
"""
Use LLM to refine content by:
1. Removing HTML tags from content
2. Combining content + plain_text + PDF page images
3. Using AI model to generate markdown output
4. Storing table existence in metadata
Args:
content: MinerU extracted content with HTML
plain_text: Plain text extracted from PDF
pdf_page_images: List of PDF page image paths (full page screenshots)
temp_dir: Temporary directory
src_fileid: Source file ID
learn_type: Model type for LLM
language_code: Optional language code for the generated markdown; auto-detected from the content when not provided
Returns:
Refined markdown content string
"""
import base64
try:
# Remove HTML tags from content
soup = BeautifulSoup(content, 'html.parser')
content_text = soup.get_text(separator='\n', strip=True)
# Detect if original content had tables (for cache metadata only)
has_tables_in_html = bool(soup.find_all('table'))
# Use provided language code or detect from content
if not language_code:
combined_text = content_text + "\n" + plain_text
if combined_text.strip():
detected_code, confidence = LanguageDetector.detect_language(combined_text)
if confidence > 0.7:
language_code = detected_code
self.logger.info(f"mineru-refine-content: detected language: {language_code} (confidence: {confidence:.2f})")
else:
self.logger.info(f"mineru-refine-content: language detection confidence too low ({confidence:.2f})")
if language_code:
self.logger.info(f"mineru-refine-content: will generate content in {LanguageDetector.get_language_name(language_code)}")
else:
self.logger.info(f"mineru-refine-content: no language specified, will use default")
# Cache file path used to persist the refinement result (written after processing)
cache_filepath = os.path.join(temp_dir, f"content_refinement_markdown_v1_{src_fileid}.json")
# Use imported prompt for markdown generation with language
markdown_generation_prompt = format_markdown_generation_prompt(language_code)
# Prepare messages with multimodal content
messages = [
{"role": "system", "content": markdown_generation_prompt}
]
# Create user message with text and images
user_content = []
# Add text content
combined_input = f"""## OCR提取的文本:
{content_text}
## PDF原始文本:
{plain_text}
请基于提供的PDF页面图片和以上文本源,生成准确、完整的Markdown格式文档。特别注意识别和重建表格内容。"""
user_content.append({
"type": "text",
"text": combined_input
})
# Add PDF page images if available
if pdf_page_images:
self.logger.info(f"mineru-content: including {len(pdf_page_images)} PDF page images in multimodal request")
# Limit pages to avoid token limits and model constraints
# Cap at 4 images to stay within typical per-request image limits
max_pages = min(len(pdf_page_images), 4)
for idx, page_img_path in enumerate(pdf_page_images[:max_pages]):
if os.path.exists(page_img_path):
try:
with open(page_img_path, 'rb') as img_file:
img_data = img_file.read()
# Encode the raw bytes directly (the intermediate BytesIO buffer was redundant)
img_base64 = base64.b64encode(img_data).decode('utf-8')
user_content.append({
"type": "image_url",
"image_url": {
"url": f"data:image/png;base64,{img_base64}"
}
})
self.logger.info(f"mineru-refine-content: added PDF page {idx + 1} image")
except Exception as e:
self.logger.warning(f"mineru-refine-content: failed to read PDF page image {page_img_path}: {str(e)}")
else:
self.logger.warning(f"mineru-refine-content: PDF page image not found: {page_img_path}")
else:
self.logger.warning("mineru-refine-content: no PDF page images provided for multimodal processing")
messages.append({
"role": "user",
"content": user_content
})
self.logger.info(f"mineru-refine-content: processing content with LLM for markdown generation")
# Use the unified litellm helper for multimodal request
response = await self.config.call_litellm(
model_type=learn_type,
messages=messages,
temperature=0.1
)
# Process response
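# Default to the unrefined input; overwritten below when the LLM returns usable text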
refined_content = content
total_prompt_tokens = 0
total_completion_tokens = 0
try:
if (response.choices and
len(response.choices) > 0 and
response.choices[0].message and
response.choices[0].message.content):
refined_content = response.choices[0].message.content.strip()
# Extract the markdown block from the LLM response
markdown_start = refined_content.find("```markdown")
markdown_end = refined_content.rfind("```")
if markdown_start >= 0 and markdown_end > markdown_start:
# Strip the opening ```markdown fence (11 characters) and the trailing ```
refined_content = refined_content[markdown_start + 11:markdown_end].strip()
else:
# No fenced markdown block found; keep the full LLM response
self.logger.warning("mineru-refine-content: no markdown block found in response")
# Clean hallucination patterns first
refined_content = self._clean_hallucination_patterns(refined_content)
# Limit content length to prevent OpenSearch indexing errors
# OpenSearch has a max field length of 32766 bytes
# We use 30000 characters as a safe limit (considering UTF-8 encoding)
MAX_CONTENT_LENGTH = 30000
if len(refined_content) > MAX_CONTENT_LENGTH:
self.logger.warning(f"mineru-refine-content: content too long ({len(refined_content)} chars), truncating to {MAX_CONTENT_LENGTH}")
# Try to truncate at a sentence boundary
truncated = refined_content[:MAX_CONTENT_LENGTH]
# Find last complete sentence
for sep in ['. ', '。', '! ', '? ', '\n\n', '\n']:
last_sep = truncated.rfind(sep)
if last_sep > MAX_CONTENT_LENGTH * 0.9: # Within last 10%
refined_content = truncated[:last_sep + len(sep)]
break
else:
# If no good sentence boundary, just truncate
refined_content = truncated + "..."
# Track token usage
if hasattr(response, 'usage') and response.usage:
total_prompt_tokens = response.usage.prompt_tokens
total_completion_tokens = response.usage.completion_tokens
self.logger.info(
f"mineru-refine-content: markdown generation completed - "
f"tokens: {total_prompt_tokens}/{total_completion_tokens}"
)
else:
self.logger.warning("mineru-refine-content: empty response from LLM, using original content")
refined_content = content_text
# Also clean fallback content
refined_content = self._clean_hallucination_patterns(refined_content)
except (AttributeError, IndexError, ValueError) as e:
self.logger.error(f"mineru-refine-content: LLM response parsing failed: {str(e)}")
refined_content = content_text
# Clean and apply length limit to fallback content
refined_content = self._clean_hallucination_patterns(refined_content)
MAX_CONTENT_LENGTH = 30000
if len(refined_content) > MAX_CONTENT_LENGTH:
self.logger.warning(f"mineru-refine-content: fallback content too long ({len(refined_content)} chars), truncating")
refined_content = refined_content[:MAX_CONTENT_LENGTH] + "..."
# Save to cache
try:
cache_data = {
"refined_content": refined_content,
"model": "llm", # Generic model name since we don't have model_config anymore
"input_length": len(content_text) + len(plain_text),
"output_length": len(refined_content),
"prompt_tokens": total_prompt_tokens,
"completion_tokens": total_completion_tokens,
"has_tables": has_tables_in_html,
"used_pdf_page_images": len(pdf_page_images) if pdf_page_images else 0
}
with open(cache_filepath, 'w', encoding='utf-8') as file:
json.dump(cache_data, file, ensure_ascii=False, indent=2)
except Exception as e:
self.logger.warning(f"mineru-refine-content: cache write failed: {str(e)}")
return refined_content
except Exception as e:
self.logger.error(f"mineru-refine-content: LLM refinement failed: {str(e)}")
# Fallback: return text without HTML tags
try:
soup = BeautifulSoup(content, 'html.parser')
return soup.get_text(separator='\n', strip=True)
except Exception:
return content
def split_content_by_pages(self, mineru_result: MinerUResult) -> Dict[int, Dict]:
"""
Split MinerU results by page index.
Args:
mineru_result: MinerU processing result with content_list
Returns:
Dictionary mapping page_idx to page data
"""
page_data = {}
content_list = mineru_result.metadata.get('content_list', [])
# Group content by page
for item in content_list:
page_idx = item.get('page_idx', 0)
if page_idx not in page_data:
page_data[page_idx] = {
'content_items': [],
'images': [],
'tables': [],
'page_idx': page_idx
}
# Add item to appropriate list
if item['type'] == 'image':
page_data[page_idx]['images'].append(item.get('img_path', ''))
elif item['type'] == 'table':
page_data[page_idx]['tables'].append(item.get('metadata', {}))
page_data[page_idx]['content_items'].append(item)
# Extract page content from the merged content
# Try to split by different page markers
content = mineru_result.content
# Method 1: Split by "## Page X" markers
if '\n\n## Page ' in content:
content_parts = content.split('\n\n## Page ')
for i, part in enumerate(content_parts):
if i == 0 and not part.startswith('## Page'):
# Handle content before first page marker
if part.strip() and 0 in page_data:
page_data[0]['content'] = part.strip()
continue
# Extract page number
page_match = re.match(r'^(\d+)', part)
if page_match:
page_num = int(page_match.group(1)) - 1 # Convert to 0-based index
if page_num in page_data:
# Remove page number line and get content
lines = part.split('\n', 1)
if len(lines) > 1:
page_data[page_num]['content'] = lines[1].strip()
else:
# Method 2: If no clear page markers, try to reconstruct from content_list
current_content = []
current_page = -1
for item in content_list:
item_page = item.get('page_idx', 0)
# If we moved to a new page, save previous content
if item_page != current_page and current_page >= 0:
if current_page in page_data:
page_data[current_page]['content'] = '\n\n'.join(current_content)
current_content = []
current_page = item_page
# Add text content
if item['type'] in ['text', 'title']:
text = item.get('text', '') or item.get('content', '')
if text.strip():
if item['type'] == 'title':
current_content.append(f"## {text.strip()}")
else:
current_content.append(text.strip())
elif item['type'] == 'table':
current_content.append("[Table content]") # Placeholder for table
# Save last page content
if current_page >= 0 and current_page in page_data:
page_data[current_page]['content'] = '\n\n'.join(current_content)
return page_data
async def process_page_content(self, page_content: str, page_images: List[str],
pdf_path: str, page_idx: int, temp_dir: str,
src_fileid: str, learn_type: int, language_code: Optional[str] = None) -> Dict:
"""
Process content for a single page.
Args:
page_content: Content text for the page
page_images: Images found on the page
pdf_path: Path to PDF file
page_idx: Page index (0-based)
temp_dir: Temporary directory
src_fileid: Source file ID
learn_type: Model type for LLM
language_code: Optional language code for the refined output; auto-detected when not provided
Returns:
Dictionary with processed page data
"""
try:
# Handle empty content gracefully
if not page_content or not page_content.strip():
self.logger.info(f"mineru-content: page {page_idx} has no text content")
return {
'page_idx': page_idx,
'content': '',
'images': page_images,
'has_tables': False,
'processing_metadata': {
'original_length': 0,
'refined_length': 0,
'multimodal_used': False,
'empty_page': True
}
}
# Detect tables in this page's content
has_tables = self.detect_tables(page_content, f"{src_fileid}_page_{page_idx}")
# Extract single page image if multimodal is enabled
pdf_page_images = []
if self.config.enable_multimodal_refinement:
# Extract just this page as image
pdf_page_images = await self._extract_single_pdf_page_image(
pdf_path, page_idx, temp_dir, src_fileid
)
plain_text = ""
if self.config.enable_multimodal_refinement and has_tables:
plain_text = await self._extract_page_plain_text(pdf_path, page_idx, src_fileid)
# Use multimodal refinement when it is enabled and the page has no extractable text, contains tables, or contains images
use_multimodal = self.config.enable_multimodal_refinement and \
(plain_text == "" or has_tables or len(page_images) > 0)
if use_multimodal:
# Refine content for this page
refined_content = await self._llm_refine_content(
page_content, plain_text, pdf_page_images, temp_dir,
f"{src_fileid}_page_{page_idx}", learn_type, language_code
)
else:
refined_content = page_content
# Clean hallucination patterns and apply length limit to all content
refined_content = self._clean_hallucination_patterns(refined_content)
MAX_CONTENT_LENGTH = 30000
if len(refined_content) > MAX_CONTENT_LENGTH:
self.logger.warning(f"mineru-content: page {page_idx} content too long ({len(refined_content)} chars), truncating")
refined_content = refined_content[:MAX_CONTENT_LENGTH] + "..."
return {
'page_idx': page_idx,
'content': refined_content,
'images': page_images,
'has_tables': has_tables,
'processing_metadata': {
'original_length': len(page_content),
'refined_length': len(refined_content),
'multimodal_used': use_multimodal
}
}
except Exception as e:
self.logger.error(f"mineru-content: page {page_idx} processing failed: {str(e)}")
return {
'page_idx': page_idx,
'content': page_content, # Fallback to original
'images': page_images,
'has_tables': False,
'processing_metadata': {'error': str(e)}
}
async def _extract_single_pdf_page_image(self, pdf_path: str, page_idx: int,
temp_dir: str, _src_fileid: str) -> List[str]:
"""Extract a single PDF page as image."""
try:
import pdf2image
options = {
'pdf_path': pdf_path,
'dpi': 150,
'fmt': 'png',
'output_folder': temp_dir,
'use_pdftocairo': True,
'paths_only': True,
'first_page': page_idx + 1, # pdf2image uses 1-based indexing
'last_page': page_idx + 1
}
image_paths = pdf2image.convert_from_path(**options)
if image_paths:
new_name = f"pdf_page_{page_idx + 1:03d}.png"
new_path = os.path.join(temp_dir, new_name)
os.rename(image_paths[0], new_path)
return [new_path]
return []
except Exception as e:
self.logger.error(f"mineru-content: failed to extract page {page_idx} image: {str(e)}")
return []
async def _extract_page_plain_text(self, pdf_path: str, page_idx: int, _src_fileid: str) -> str:
"""Extract plain text from a specific PDF page."""
try:
import fitz
with fitz.open(pdf_path) as doc:
if page_idx < len(doc):
page = doc[page_idx]
return page.get_text('text')
return ""
except Exception as e:
self.logger.error(f"mineru-content: failed to extract page {page_idx} text: {str(e)}")
return ""
def create_page_chunks(self, content: str, _src_fileid: str) -> List[Dict]:
"""
Split content into page-based chunks for compatibility with gzero.py format.
Args:
content: Processed content
_src_fileid: Source file ID (unused)
Returns:
List of page dictionaries
"""
try:
# Split content by page markers
page_separators = ['=== Page ', '__PAGE_OF_PORTION_']
pages = []
current_content = content
# Try to split by existing page markers
page_parts = []
for separator in page_separators:
if separator in current_content:
page_parts = current_content.split(separator)
break
if len(page_parts) > 1:
# Content already has page separators
for i, part in enumerate(page_parts):
if i == 0 and not part.strip():
continue # Skip empty first part
pages.append({
'index': len(pages),
'content': part.strip(),
'image_map': {},
'summary': '',
'input_tokens': 0,
'output_tokens': 0,
'dura': 0.0
})
else:
# Single page content
pages.append({
'index': 0,
'content': content,
'image_map': {},
'summary': '',
'input_tokens': 0,
'output_tokens': 0,
'dura': 0.0
})
self.logger.info(f"mineru-content: created {len(pages)} page chunks")
return pages
except Exception as e:
self.logger.error(f"mineru-content: page chunking failed: {str(e)}")
# Fallback to single page
return [{
'index': 0,
'content': content,
'image_map': {},
'summary': '',
'input_tokens': 0,
'output_tokens': 0,
'dura': 0.0
}]
def extract_page_contexts(self, content_list: List[Dict], _src_fileid: str) -> List[PageContext]:
"""
Extract page context information from MinerU content list.
Args:
content_list: List of content items from MinerU with page_idx and type
_src_fileid: Source file ID (unused)
Returns:
List of PageContext objects
"""
try:
self.logger.info(f"mineru-content: extracting page contexts from {len(content_list)} items")
# Group content by page
page_groups = {}
for idx, item in enumerate(content_list):
page_idx = item.get('page_idx', 0)
if page_idx not in page_groups:
page_groups[page_idx] = []
page_groups[page_idx].append((idx, item))
# Create PageContext for each page
page_contexts = []
for page_idx in sorted(page_groups.keys()):
page_items = page_groups[page_idx]
# Determine page type
has_title = any(item[1].get('type') == 'title' for item in page_items)
text_count = sum(1 for item in page_items if item[1].get('type') == 'text')
if has_title and text_count > 0:
page_type = 'mixed'
elif has_title:
page_type = 'title'
else:
page_type = 'content'
# Extract title if available
page_title = None
for _, item in page_items:
if item.get('type') == 'title':
page_title = item.get('text', '').strip()
break
# Create content elements
content_elements = []
text_parts = []
for position, item in page_items:
element_type = item.get('type', 'unknown')
content = item.get('text', '') or item.get('content', '')
if element_type == 'text' and content:
text_parts.append(content)
content_elements.append(ContentElement(
type=element_type,
content=content,
page_idx=page_idx,
position=position,
bbox=item.get('bbox'),
metadata=item.get('metadata', {})
))
# Join text content
text_content = '\n'.join(text_parts)
# Count tokens
token_count = self._count_tokens(text_content)
page_context = PageContext(
page_idx=page_idx,
page_type=page_type,
title=page_title,
content_elements=content_elements,
text_content=text_content,
token_count=token_count
)
page_contexts.append(page_context)
self.logger.info(f"mineru-content: extracted {len(page_contexts)} page contexts")
return page_contexts
except Exception as e:
self.logger.error(f"mineru-content: page context extraction failed: {str(e)}")
return []
def extract_context_for_position(self, content_list: List[Dict], position: int,
mode: ContextMode = ContextMode.PAGE,
window_size: int = 2,
max_tokens: int = 1000) -> str:
"""
Extract context around a specific position in the content list.
Args:
content_list: MinerU content list
position: Position in the content list
mode: Context extraction mode (PAGE or CHUNK)
window_size: Number of pages/chunks to include
max_tokens: Maximum tokens to include
Returns:
Extracted context string
"""
try:
if position >= len(content_list):
return ""
target_item = content_list[position]
if mode == ContextMode.PAGE:
# Extract based on page boundaries
target_page_idx = target_item.get('page_idx', 0)
min_page = max(0, target_page_idx - window_size)
max_page = target_page_idx + window_size
context_items = []
for idx, item in enumerate(content_list):
item_page = item.get('page_idx', 0)
if min_page <= item_page <= max_page and item.get('type') in ['text', 'title']:
# Add page marker
if item_page != target_page_idx:
page_marker = f"[Page {item_page + 1}]"
if not context_items or context_items[-1] != page_marker:
context_items.append(page_marker)
text = item.get('text', '').strip()
if text:
context_items.append(text)
else: # ContextMode.CHUNK
# Extract based on chunk position
min_pos = max(0, position - window_size)
max_pos = min(len(content_list) - 1, position + window_size)
context_items = []
for idx in range(min_pos, max_pos + 1):
if idx == position:
continue # Skip the target item itself
item = content_list[idx]
if item.get('type') in ['text', 'title', 'table']:
text = item.get('text', '').strip()
if text:
if idx < position:
context_items.append(f"[Before] {text}")
else:
context_items.append(f"[After] {text}")
# Join and truncate by tokens
context_text = '\n'.join(context_items)
return self._truncate_by_tokens(context_text, max_tokens)
except Exception as e:
self.logger.error(f"mineru-content: context extraction failed: {str(e)}")
return ""
def _count_tokens(self, text: str) -> int:
"""Count tokens in text using tiktoken."""
try:
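# cl100k_base is the encoding used by OpenAI's GPT-3.5/GPT-4 family; adequate for rough token counts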
encoding = tiktoken.get_encoding("cl100k_base")
return len(encoding.encode(text))
except Exception:
# Fallback to character-based estimation
return len(text) // 4
def _truncate_by_tokens(self, text: str, max_tokens: int) -> str:
"""Truncate text to fit within token limit while preserving sentence boundaries."""
if not text:
return ""
try:
encoding = tiktoken.get_encoding("cl100k_base")
tokens = encoding.encode(text)
if len(tokens) <= max_tokens:
return text
# Truncate to max tokens
truncated_tokens = tokens[:max_tokens]
truncated_text = encoding.decode(truncated_tokens)
# Try to find a sentence boundary
for sep in ['. ', '。', '! ', '? ', '\n']:
last_sep = truncated_text.rfind(sep)
if last_sep > len(truncated_text) * 0.8: # Within last 20%
return truncated_text[:last_sep + len(sep)]
return truncated_text + "..."
except Exception:
# Fallback to character-based truncation
char_limit = max_tokens * 4
if len(text) > char_limit:
return text[:char_limit] + "..."
return text
def _clean_hallucination_patterns(self, content: str) -> str:
"""
Clean common AI hallucination patterns from content.
Args:
content: The content to clean
Returns:
Cleaned content
"""
if not content:
return content
original_length = len(content)
# Pattern 1: Remove excessive dots (more than 10 consecutive dots)
content = re.sub(r'\.{10,}', '...', content)
# Pattern 2: Remove other excessive repeated characters (more than 10)
# This handles patterns like "--------" or "========="
content = re.sub(r'(.)\1{9,}', r'\1\1\1', content)
# Pattern 3: Remove excessive repeated words or patterns
# For example: "................................................................"
# repeated many times
content = re.sub(r'(\.{3,}[\s\n]*){5,}', '...\n', content)
# Pattern 4: Remove number sequences that appear to be counting
# Like ", 68\n, 72\n, 73\n, 73\n, 73\n" repeated many times
content = re.sub(r'(,\s*\d+\s*\n?\s*){20,}', '', content)
# Pattern 5: Remove table of contents with excessive dots
# Like "8.2 macOS 系统安装无线驱动程序................................................"
content = re.sub(r'([^\n]+)(\.{20,})', r'\1', content)
# Pattern 6: Clean up multiple consecutive empty lines
content = re.sub(r'\n{4,}', '\n\n\n', content)
# Pattern 7: Remove excessive dots or comma-separated numbers at the end
# But preserve normal punctuation
content = re.sub(r'[\s\n]*[,\d\s]{10,}$', '', content)
content = re.sub(r'\.{4,}$', '...', content)
cleaned_length = len(content)
if cleaned_length < original_length:
reduction = original_length - cleaned_length
self.logger.info(f"mineru-content: cleaned {reduction} characters of hallucination patterns "
f"(from {original_length} to {cleaned_length})")
return content.strip() |