""" Content processing module for MinerU-based parsing. This module handles table detection, content fusion, and LLM-based content refinement, following patterns from gzero.py. """ from typing import Dict, List, Optional from dataclasses import dataclass from .logger import get_module_logger logger = get_module_logger('content_processor') import tiktoken import json import os import re import io from bs4 import BeautifulSoup from .config_base import MinerUConfig from .api_client import MinerUResult from .context_types import ContentElement, PageContext, ContextMode from .prompts.markdown_generation import format_markdown_generation_prompt from .language_detector import LanguageDetector @dataclass class ContentProcessingResult: """Result from content processing operations""" success: bool processed_content: str has_tables: bool processing_metadata: Dict error: Optional[str] = None class MinerUContentProcessor: """Content processor for handling tables and content fusion""" def __init__(self, config: MinerUConfig): self.config = config self.logger = logger def detect_tables(self, content: str, _src_fileid: str) -> bool: """ Detect if content contains table structures. Based on gzero.py's table detection logic. """ table_indicators = [ '', '', '
            '|---|', '表格', 'Table', '| ', ' |',
            '┌', '└', '├', '┤',  # Table border characters
            '═', '║', '╔', '╗', '╚', '╝'  # More table characters
        ]

        content_lower = content.lower()
        found_indicators = []
        for indicator in table_indicators:
            if indicator.lower() in content_lower:
                found_indicators.append(indicator)

        # Check for pipe-separated table format
        lines = content.split('\n')
        pipe_lines = [line for line in lines if line.count('|') >= 2]
        has_pipe_table = len(pipe_lines) >= 2  # At least header and one data row

        has_tables = bool(found_indicators) or has_pipe_table

        if has_tables:
            self.logger.info(f"mineru-content: tables detected - indicators: {found_indicators}, pipe_lines: {len(pipe_lines)}")
        else:
            self.logger.info("mineru-content: no tables detected")

        return has_tables

    async def process_content(self, mineru_result: MinerUResult, pdf_path: str, temp_dir: str,
                              src_fileid: str, learn_type: int, has_tables: bool) -> ContentProcessingResult:
        """
        [DEPRECATED] Process content with optional multimodal refinement.

        This method is deprecated. Use process_page_content() instead for
        page-by-page processing.

        Args:
            mineru_result: Result from MinerU processing
            pdf_path: Path to original PDF
            temp_dir: Temporary directory
            src_fileid: Source file ID
            learn_type: Model type for LLM processing
            has_tables: Whether the content contains tables

        Returns:
            ContentProcessingResult with processed content
        """
        try:
            process_type = "with tables" if has_tables else "without tables"
            self.logger.info(f"mineru-content: processing content {process_type}")

            # Check if multimodal refinement is enabled
            if not self.config.enable_multimodal_refinement:
                self.logger.info("mineru-content: multimodal refinement disabled, returning original content")
                processing_metadata = {
                    'mineru_content_length': len(mineru_result.content),
                    'image_count': len(mineru_result.images),
                    'table_count': len(mineru_result.tables),
                    'multimodal_refinement': False
                }
                return ContentProcessingResult(
                    success=True,
                    processed_content=mineru_result.content,
                    has_tables=has_tables,
                    processing_metadata=processing_metadata
                )

            # Step 1: Extract plain text from PDF (only if has tables)
            plain_text = ""
            if has_tables:
                plain_text = await self._extract_plain_text(pdf_path, src_fileid)

            # Step 2: Extract PDF page images for multimodal processing
            pdf_page_images = await self._extract_pdf_page_images(pdf_path, temp_dir, src_fileid)

            # Step 3: Use LLM to refine content with PDF page images
            refined_content = await self._llm_refine_content(
                mineru_result.content, plain_text, pdf_page_images,
                temp_dir, src_fileid, learn_type, None
            )

            processing_metadata = {
                'mineru_content_length': len(mineru_result.content),
                'refined_content_length': len(refined_content),
                'image_count': len(mineru_result.images),
                'table_count': len(mineru_result.tables),
                'pdf_page_images_count': len(pdf_page_images),
                'multimodal_refinement': True
            }
            if has_tables:
                processing_metadata['plain_text_length'] = len(plain_text)

            self.logger.info(f"mineru-content: content processing completed: {processing_metadata}")

            return ContentProcessingResult(
                success=True,
                processed_content=refined_content,
                has_tables=has_tables,
                processing_metadata=processing_metadata
            )

        except Exception as e:
            self.logger.error(f"mineru-content: content processing failed: {str(e)}")
            return ContentProcessingResult(
                success=False,
                processed_content=mineru_result.content,  # Fallback to original
                has_tables=has_tables,
                processing_metadata={},
                error=str(e)
            )

    async def _extract_plain_text(self, pdf_path: str, _src_fileid: str) -> str:
        """Extract plain text from PDF using PyMuPDF."""
PyMuPDF. """ try: import fitz text_parts = [] with fitz.open(pdf_path) as doc: for page_num, page in enumerate(doc): page_text = page.get_text('text') # Plain text extraction if page_text.strip(): text_parts.append(f"=== Page {page_num + 1} ===\n{page_text}") plain_text = '\n\n'.join(text_parts) self.logger.info(f"mineru-content: extracted {len(plain_text)} characters of plain text") return plain_text except Exception as e: self.logger.error(f"mineru-content: plain text extraction failed: {str(e)}") return "" async def _extract_pdf_page_images(self, pdf_path: str, temp_dir: str, _src_fileid: str, max_pages: int = 10) -> List[str]: """ Extract PDF pages as images for multimodal processing. Args: pdf_path: Path to PDF file temp_dir: Temporary directory for saving images _src_fileid: Source file ID max_pages: Maximum number of pages to extract (to avoid token limits) Returns: List of paths to page image files """ try: import pdf2image import os self.logger.info(f"mineru-content: extracting PDF page images from {pdf_path}") # Get page count first import fitz with fitz.open(pdf_path) as doc: total_pages = len(doc) # Determine pages to extract pages_to_extract = min(total_pages, max_pages) # Configure pdf2image options options = { 'pdf_path': pdf_path, 'dpi': 150, # Lower DPI for multimodal processing 'fmt': 'png', 'output_folder': temp_dir, 'use_pdftocairo': True, 'paths_only': True, 'first_page': 1, 'last_page': pages_to_extract } # Convert PDF pages to images image_paths = pdf2image.convert_from_path(**options) # Rename images to standard format page_images = [] for idx, img_path in enumerate(image_paths): new_name = f"pdf_page_{idx + 1:03d}.png" new_path = os.path.join(temp_dir, new_name) os.rename(img_path, new_path) page_images.append(new_path) self.logger.info(f"mineru-content: extracted {len(page_images)} page images") return page_images except Exception as e: self.logger.error(f"mineru-content: PDF page image extraction failed: {str(e)}") return [] # def _extract_html_tables(self, content: str) -> List[Tuple[str, int, int]]: # """ # Extract HTML tables from content with their positions. # Returns: # List of tuples: (table_html, start_pos, end_pos) # """ # tables = [] # # Find all HTML table tags # table_pattern = r']*>.*?
    #     matches = re.finditer(table_pattern, content, re.DOTALL | re.IGNORECASE)
    #
    #     for match in matches:
    #         table_html = match.group(0)
    #         start_pos = match.start()
    #         end_pos = match.end()
    #         tables.append((table_html, start_pos, end_pos))
    #
    #     self.logger.info(f"mineru-content: found {len(tables)} HTML tables")
    #     return tables

    def _find_related_pdf_content(self, plain_text: str, table_html: str, context_size: int = 500) -> str:
        """
        Find related content in PDF text based on table content.

        Args:
            plain_text: Full PDF plain text
            table_html: HTML table to find context for
            context_size: Number of characters before/after to include

        Returns:
            Related PDF text chunk
        """
        try:
            # Extract text from HTML table
            soup = BeautifulSoup(table_html, 'html.parser')
            table_text = soup.get_text(separator=' ', strip=True)

            # Extract key words from table (first few non-numeric words)
            # Split the text on punctuation marks (ASCII and full-width)
            words = re.split(r'[,.!:?;*\-,。!?;、/()()\s]+', table_text)
            key_words = []
            for word in words:
                # Skip pure numbers and short words
                if len(word) > 3 and not word.isdigit() and word not in key_words:
                    key_words.append(word)

            if not key_words:
                self.logger.warning("mineru-content: no key words found in table")
                return ""

            # Search for key words in PDF text
            best_match_pos = -1
            best_match_score = 0

            # Use sliding window to find best match
            window_size = len(table_text) * 3  # Look for similar sized content

            for i in range(0, len(plain_text) - window_size, 100):  # Step by 100 chars
                window_text = plain_text[i:i + window_size]

                # Count matching key words
                match_score = sum(1 for word in key_words if word.lower() in window_text.lower())

                if match_score > best_match_score:
                    best_match_score = match_score
                    best_match_pos = i

            if best_match_pos >= 0 and best_match_score >= len(key_words) * 0.3:
                # Extract context around the match
                start = max(0, best_match_pos - context_size)
                end = min(len(plain_text), best_match_pos + window_size + context_size)
                context = plain_text[start:end]

                self.logger.info(f"mineru-content: found related PDF content with score {best_match_score}/{len(key_words)}")
                return context
            else:
                self.logger.warning(f"mineru-content: no good match found (best score: {best_match_score}/{len(key_words)})")
                return ""

        except Exception as e:
            self.logger.error(f"mineru-content: error finding related PDF content: {str(e)}")
            return ""

    async def _llm_refine_content(self, content: str, plain_text: str, pdf_page_images: List[str],
                                  temp_dir: str, src_fileid: str, learn_type: int,
                                  language_code: Optional[str] = None) -> str:
        """
        Use LLM to refine content by:
        1. Removing HTML tags from content
        2. Combining content + plain_text + PDF page images
        3. Using AI model to generate markdown output
        4. Storing table existence in metadata

        Args:
            content: MinerU extracted content with HTML
            plain_text: Plain text extracted from PDF
            pdf_page_images: List of PDF page image paths (full page screenshots)
            temp_dir: Temporary directory
            src_fileid: Source file ID
            learn_type: Model type for LLM
        """
        import base64

        try:
            # Remove HTML tags from content
            soup = BeautifulSoup(content, 'html.parser')
            content_text = soup.get_text(separator='\n', strip=True)

            # Detect if original content had tables (for cache metadata only)
            has_tables_in_html = bool(soup.find_all('table'))

            # Use provided language code or detect from content
            if not language_code:
                combined_text = content_text + "\n" + plain_text
                if combined_text.strip():
                    detected_code, confidence = LanguageDetector.detect_language(combined_text)
                    if confidence > 0.7:
                        language_code = detected_code
                        self.logger.info(f"mineru-refine-content: detected language: {language_code} (confidence: {confidence:.2f})")
                    else:
                        self.logger.info(f"mineru-refine-content: language detection confidence too low ({confidence:.2f})")

            if language_code:
                self.logger.info(f"mineru-refine-content: will generate content in {LanguageDetector.get_language_name(language_code)}")
            else:
                self.logger.info("mineru-refine-content: no language specified, will use default")

            # Cache file path for the refinement result
            cache_filepath = os.path.join(temp_dir, f"content_refinement_markdown_v1_{src_fileid}.json")

            # Use imported prompt for markdown generation with language
            markdown_generation_prompt = format_markdown_generation_prompt(language_code)

            # Prepare messages with multimodal content
            messages = [
                {"role": "system", "content": markdown_generation_prompt}
            ]

            # Create user message with text and images
            user_content = []

            # Add text content. The prompt body is in Chinese: it labels the OCR text and the
            # original PDF text, then asks the model to generate an accurate, complete Markdown
            # document based on the page images and text sources, paying special attention to
            # recognizing and rebuilding tables.
            combined_input = f"""## OCR提取的文本:
{content_text}

## PDF原始文本:
{plain_text}

请基于提供的PDF页面图片和以上文本源,生成准确、完整的Markdown格式文档。特别注意识别和重建表格内容。"""

            user_content.append({
                "type": "text",
                "text": combined_input
            })

            # Add PDF page images if available
            if pdf_page_images:
                self.logger.info(f"mineru-content: including {len(pdf_page_images)} PDF page images in multimodal request")

                # Limit pages to avoid token limits and model constraints
                # Most models have a limit of 4 images per request
                max_pages = min(len(pdf_page_images), 4)

                for idx, page_img_path in enumerate(pdf_page_images[:max_pages]):
                    if os.path.exists(page_img_path):
                        try:
                            with open(page_img_path, 'rb') as img_file:
                                # Read the page image and encode it as base64 for the request
                                img_data = img_file.read()
                                img_base64 = base64.b64encode(img_data).decode('utf-8')

                            user_content.append({
                                "type": "image_url",
                                "image_url": {
                                    "url": f"data:image/png;base64,{img_base64}"
                                }
                            })
                            self.logger.info(f"mineru-refine-content: added PDF page {idx + 1} image")
                        except Exception as e:
                            self.logger.warning(f"mineru-refine-content: failed to read PDF page image {page_img_path}: {str(e)}")
                    else:
                        self.logger.warning(f"mineru-refine-content: PDF page image not found: {page_img_path}")
            else:
                self.logger.warning("mineru-refine-content: no PDF page images provided for multimodal processing")

            messages.append({
                "role": "user",
                "content": user_content
            })

            self.logger.info("mineru-refine-content: processing content with LLM for markdown generation")

            # Use the unified litellm helper for the multimodal request
            response = await self.config.call_litellm(
                model_type=learn_type,
                messages=messages,
                temperature=0.1
            )

            # Process response
            refined_content = content
            total_prompt_tokens = 0
            total_completion_tokens = 0

            try:
                if (response.choices and len(response.choices) > 0 and
                        response.choices[0].message and response.choices[0].message.content):
                    refined_content = response.choices[0].message.content.strip()

                    # Extract the markdown content from the response
                    markdown_start = refined_content.find("```markdown")
                    markdown_end = refined_content.rfind("```")

                    if markdown_start >= 0 and markdown_end > markdown_start:
                        # Extract the markdown body and strip the surrounding ``` markers
                        refined_content = refined_content[markdown_start + len("```markdown"):markdown_end].strip()
                    else:
                        # No markdown block markers found; keep the full response text
                        self.logger.warning("mineru-refine-content: no markdown block found in response")

                    # Clean hallucination patterns first
                    refined_content = self._clean_hallucination_patterns(refined_content)

                    # Limit content length to prevent OpenSearch indexing errors
                    # OpenSearch has a max field length of 32766 bytes
                    # We use 30000 characters as a safe limit (considering UTF-8 encoding)
                    MAX_CONTENT_LENGTH = 30000
                    if len(refined_content) > MAX_CONTENT_LENGTH:
                        self.logger.warning(f"mineru-refine-content: content too long ({len(refined_content)} chars), truncating to {MAX_CONTENT_LENGTH}")
                        # Try to truncate at a sentence boundary
                        truncated = refined_content[:MAX_CONTENT_LENGTH]
                        # Find last complete sentence
                        for sep in ['. ', '。', '! ', '? ', '\n\n', '\n']:
                            last_sep = truncated.rfind(sep)
                            if last_sep > MAX_CONTENT_LENGTH * 0.9:  # Within last 10%
                                refined_content = truncated[:last_sep + len(sep)]
                                break
                        else:
                            # If no good sentence boundary, just truncate
                            refined_content = truncated + "..."

                    # Track token usage
                    if hasattr(response, 'usage') and response.usage:
                        total_prompt_tokens = response.usage.prompt_tokens
                        total_completion_tokens = response.usage.completion_tokens

                    self.logger.info(
                        f"mineru-refine-content: markdown generation completed - "
                        f"tokens: {total_prompt_tokens}/{total_completion_tokens}"
                    )
                else:
                    self.logger.warning("mineru-refine-content: empty response from LLM, using original content")
                    refined_content = content_text
                    # Also clean fallback content
                    refined_content = self._clean_hallucination_patterns(refined_content)

            except (AttributeError, IndexError, ValueError) as e:
                self.logger.error(f"mineru-refine-content: LLM response parsing failed: {str(e)}")
                refined_content = content_text
                # Clean and apply length limit to fallback content
                refined_content = self._clean_hallucination_patterns(refined_content)
                MAX_CONTENT_LENGTH = 30000
                if len(refined_content) > MAX_CONTENT_LENGTH:
                    self.logger.warning(f"mineru-refine-content: fallback content too long ({len(refined_content)} chars), truncating")
                    refined_content = refined_content[:MAX_CONTENT_LENGTH] + "..."
            # Save to cache
            try:
                cache_data = {
                    "refined_content": refined_content,
                    "model": "llm",  # Generic model name since we don't have model_config anymore
                    "input_length": len(content_text) + len(plain_text),
                    "output_length": len(refined_content),
                    "prompt_tokens": total_prompt_tokens,
                    "completion_tokens": total_completion_tokens,
                    "has_tables": has_tables_in_html,
                    "used_pdf_page_images": len(pdf_page_images) if pdf_page_images else 0
                }
                with open(cache_filepath, 'w', encoding='utf-8') as file:
                    json.dump(cache_data, file, ensure_ascii=False, indent=2)
            except Exception as e:
                self.logger.warning(f"mineru-refine-content: cache write failed: {str(e)}")

            return refined_content

        except Exception as e:
            self.logger.error(f"mineru-refine-content: LLM refinement failed: {str(e)}")
            # Fallback: return text without HTML tags
            try:
                soup = BeautifulSoup(content, 'html.parser')
                return soup.get_text(separator='\n', strip=True)
            except Exception:
                return content

    def split_content_by_pages(self, mineru_result: MinerUResult) -> Dict[int, Dict]:
        """
        Split MinerU results by page index.

        Args:
            mineru_result: MinerU processing result with content_list

        Returns:
            Dictionary mapping page_idx to page data
        """
        page_data = {}
        content_list = mineru_result.metadata.get('content_list', [])

        # Group content by page
        for item in content_list:
            page_idx = item.get('page_idx', 0)
            if page_idx not in page_data:
                page_data[page_idx] = {
                    'content_items': [],
                    'images': [],
                    'tables': [],
                    'page_idx': page_idx
                }

            # Add item to appropriate list
            if item['type'] == 'image':
                page_data[page_idx]['images'].append(item.get('img_path', ''))
            elif item['type'] == 'table':
                page_data[page_idx]['tables'].append(item.get('metadata', {}))

            page_data[page_idx]['content_items'].append(item)

        # Extract page content from the merged content
        # Try to split by different page markers
        content = mineru_result.content

        # Method 1: Split by "## Page X" markers
        if '\n\n## Page ' in content:
            content_parts = content.split('\n\n## Page ')
            for i, part in enumerate(content_parts):
                if i == 0 and not part.startswith('## Page'):
                    # Handle content before first page marker
                    if part.strip() and 0 in page_data:
                        page_data[0]['content'] = part.strip()
                    continue

                # Extract page number
                page_match = re.match(r'^(\d+)', part)
                if page_match:
                    page_num = int(page_match.group(1)) - 1  # Convert to 0-based index
                    if page_num in page_data:
                        # Remove page number line and get content
                        lines = part.split('\n', 1)
                        if len(lines) > 1:
                            page_data[page_num]['content'] = lines[1].strip()
        else:
            # Method 2: If no clear page markers, try to reconstruct from content_list
            current_content = []
            current_page = -1

            for item in content_list:
                item_page = item.get('page_idx', 0)

                # If we moved to a new page, save previous content
                if item_page != current_page and current_page >= 0:
                    if current_page in page_data:
                        page_data[current_page]['content'] = '\n\n'.join(current_content)
                    current_content = []

                current_page = item_page

                # Add text content
                if item['type'] in ['text', 'title']:
                    text = item.get('text', '') or item.get('content', '')
                    if text.strip():
                        if item['type'] == 'title':
                            current_content.append(f"## {text.strip()}")
                        else:
                            current_content.append(text.strip())
                elif item['type'] == 'table':
                    current_content.append("[Table content]")  # Placeholder for table

            # Save last page content
            if current_page >= 0 and current_page in page_data:
                page_data[current_page]['content'] = '\n\n'.join(current_content)

        return page_data

    async def process_page_content(self, page_content: str, page_images: List[str], pdf_path: str,
                                   page_idx: int, temp_dir: str, src_fileid: str, learn_type: int,
                                   language_code: Optional[str] = None) -> Dict:
        """
        Process content for a single page.

        Args:
            page_content: Content text for the page
            page_images: Images found on the page
            pdf_path: Path to PDF file
            page_idx: Page index (0-based)
            temp_dir: Temporary directory
            src_fileid: Source file ID
            learn_type: Model type for LLM

        Returns:
            Dictionary with processed page data
        """
        try:
            # Handle empty content gracefully
            if not page_content or not page_content.strip():
                self.logger.info(f"mineru-content: page {page_idx} has no text content")
                return {
                    'page_idx': page_idx,
                    'content': '',
                    'images': page_images,
                    'has_tables': False,
                    'processing_metadata': {
                        'original_length': 0,
                        'refined_length': 0,
                        'multimodal_used': False,
                        'empty_page': True
                    }
                }

            # Detect tables in this page's content
            has_tables = self.detect_tables(page_content, f"{src_fileid}_page_{page_idx}")

            # Extract single page image if multimodal is enabled
            pdf_page_images = []
            if self.config.enable_multimodal_refinement:
                # Extract just this page as image
                pdf_page_images = await self._extract_single_pdf_page_image(
                    pdf_path, page_idx, temp_dir, src_fileid
                )

            plain_text = ""
            if self.config.enable_multimodal_refinement and has_tables:
                plain_text = await self._extract_page_plain_text(pdf_path, page_idx, src_fileid)

            # Refine with the LLM when multimodal refinement is enabled and the page
            # has tables, has images, or has no extracted plain text
            if self.config.enable_multimodal_refinement and \
                    (plain_text == "" or has_tables or len(page_images) > 0):
                # Refine content for this page
                refined_content = await self._llm_refine_content(
                    page_content, plain_text, pdf_page_images,
                    temp_dir, f"{src_fileid}_page_{page_idx}", learn_type, language_code
                )
            else:
                refined_content = page_content

            # Clean hallucination patterns and apply length limit to all content
            refined_content = self._clean_hallucination_patterns(refined_content)
            MAX_CONTENT_LENGTH = 30000
            if len(refined_content) > MAX_CONTENT_LENGTH:
                self.logger.warning(f"mineru-content: page {page_idx} content too long ({len(refined_content)} chars), truncating")
                refined_content = refined_content[:MAX_CONTENT_LENGTH] + "..."
            return {
                'page_idx': page_idx,
                'content': refined_content,
                'images': page_images,
                'has_tables': has_tables,
                'processing_metadata': {
                    'original_length': len(page_content),
                    'refined_length': len(refined_content),
                    'multimodal_used': self.config.enable_multimodal_refinement and (has_tables or len(page_images) > 0)
                }
            }

        except Exception as e:
            self.logger.error(f"mineru-content: page {page_idx} processing failed: {str(e)}")
            return {
                'page_idx': page_idx,
                'content': page_content,  # Fallback to original
                'images': page_images,
                'has_tables': False,
                'processing_metadata': {'error': str(e)}
            }

    async def _extract_single_pdf_page_image(self, pdf_path: str, page_idx: int,
                                             temp_dir: str, _src_fileid: str) -> List[str]:
        """Extract a single PDF page as image."""
        try:
            import pdf2image

            options = {
                'pdf_path': pdf_path,
                'dpi': 150,
                'fmt': 'png',
                'output_folder': temp_dir,
                'use_pdftocairo': True,
                'paths_only': True,
                'first_page': page_idx + 1,  # pdf2image uses 1-based indexing
                'last_page': page_idx + 1
            }

            image_paths = pdf2image.convert_from_path(**options)
            if image_paths:
                new_name = f"pdf_page_{page_idx + 1:03d}.png"
                new_path = os.path.join(temp_dir, new_name)
                os.rename(image_paths[0], new_path)
                return [new_path]
            return []

        except Exception as e:
            self.logger.error(f"mineru-content: failed to extract page {page_idx} image: {str(e)}")
            return []

    async def _extract_page_plain_text(self, pdf_path: str, page_idx: int, _src_fileid: str) -> str:
        """Extract plain text from a specific PDF page."""
        try:
            import fitz

            with fitz.open(pdf_path) as doc:
                if page_idx < len(doc):
                    page = doc[page_idx]
                    return page.get_text('text')
            return ""

        except Exception as e:
            self.logger.error(f"mineru-content: failed to extract page {page_idx} text: {str(e)}")
            return ""

    def create_page_chunks(self, content: str, _src_fileid: str) -> List[Dict]:
        """
        Split content into page-based chunks for compatibility with gzero.py format.

        Args:
            content: Processed content
            _src_fileid: Source file ID (unused)

        Returns:
            List of page dictionaries
        """
        try:
            # Split content by page markers
            page_separators = ['=== Page ', '__PAGE_OF_PORTION_']
            pages = []
            current_content = content

            # Try to split by existing page markers
            page_parts = []
            for separator in page_separators:
                if separator in current_content:
                    page_parts = current_content.split(separator)
                    break

            if len(page_parts) > 1:
                # Content already has page separators
                for i, part in enumerate(page_parts):
                    if i == 0 and not part.strip():
                        continue  # Skip empty first part
                    pages.append({
                        'index': len(pages),
                        'content': part.strip(),
                        'image_map': {},
                        'summary': '',
                        'input_tokens': 0,
                        'output_tokens': 0,
                        'dura': 0.0
                    })
            else:
                # Single page content
                pages.append({
                    'index': 0,
                    'content': content,
                    'image_map': {},
                    'summary': '',
                    'input_tokens': 0,
                    'output_tokens': 0,
                    'dura': 0.0
                })

            self.logger.info(f"mineru-content: created {len(pages)} page chunks")
            return pages

        except Exception as e:
            self.logger.error(f"mineru-content: page chunking failed: {str(e)}")
            # Fallback to single page
            return [{
                'index': 0,
                'content': content,
                'image_map': {},
                'summary': '',
                'input_tokens': 0,
                'output_tokens': 0,
                'dura': 0.0
            }]

    def extract_page_contexts(self, content_list: List[Dict], _src_fileid: str) -> List[PageContext]:
        """
        Extract page context information from MinerU content list.

        Args:
            content_list: List of content items from MinerU with page_idx and type
            _src_fileid: Source file ID (unused)

        Returns:
            List of PageContext objects
        """
        try:
            self.logger.info(f"mineru-content: extracting page contexts from {len(content_list)} items")

            # Group content by page
            page_groups = {}
            for idx, item in enumerate(content_list):
                page_idx = item.get('page_idx', 0)
                if page_idx not in page_groups:
                    page_groups[page_idx] = []
                page_groups[page_idx].append((idx, item))

            # Create PageContext for each page
            page_contexts = []
            for page_idx in sorted(page_groups.keys()):
                page_items = page_groups[page_idx]

                # Determine page type
                has_title = any(item[1].get('type') == 'title' for item in page_items)
                text_count = sum(1 for item in page_items if item[1].get('type') == 'text')

                if has_title and text_count > 0:
                    page_type = 'mixed'
                elif has_title:
                    page_type = 'title'
                else:
                    page_type = 'content'

                # Extract title if available
                page_title = None
                for _, item in page_items:
                    if item.get('type') == 'title':
                        page_title = item.get('text', '').strip()
                        break

                # Create content elements
                content_elements = []
                text_parts = []
                for position, item in page_items:
                    element_type = item.get('type', 'unknown')
                    content = item.get('text', '') or item.get('content', '')

                    if element_type == 'text' and content:
                        text_parts.append(content)

                    content_elements.append(ContentElement(
                        type=element_type,
                        content=content,
                        page_idx=page_idx,
                        position=position,
                        bbox=item.get('bbox'),
                        metadata=item.get('metadata', {})
                    ))

                # Join text content
                text_content = '\n'.join(text_parts)

                # Count tokens
                token_count = self._count_tokens(text_content)

                page_context = PageContext(
                    page_idx=page_idx,
                    page_type=page_type,
                    title=page_title,
                    content_elements=content_elements,
                    text_content=text_content,
                    token_count=token_count
                )
                page_contexts.append(page_context)

            self.logger.info(f"mineru-content: extracted {len(page_contexts)} page contexts")
            return page_contexts

        except Exception as e:
            self.logger.error(f"mineru-content: page context extraction failed: {str(e)}")
            return []

    def extract_context_for_position(self, content_list: List[Dict], position: int,
                                     mode: ContextMode = ContextMode.PAGE,
                                     window_size: int = 2, max_tokens: int = 1000) -> str:
        """
        Extract context around a specific position in the content list.

        Args:
            content_list: MinerU content list
            position: Position in the content list
            mode: Context extraction mode (PAGE or CHUNK)
            window_size: Number of pages/chunks to include
            max_tokens: Maximum tokens to include

        Returns:
            Extracted context string
        """
        try:
            if position >= len(content_list):
                return ""

            target_item = content_list[position]

            if mode == ContextMode.PAGE:
                # Extract based on page boundaries
                target_page_idx = target_item.get('page_idx', 0)
                min_page = max(0, target_page_idx - window_size)
                max_page = target_page_idx + window_size

                context_items = []
                for item in content_list:
                    item_page = item.get('page_idx', 0)
                    if min_page <= item_page <= max_page and item.get('type') in ['text', 'title']:
                        # Add page marker
                        if item_page != target_page_idx:
                            page_marker = f"[Page {item_page + 1}]"
                            if not context_items or context_items[-1] != page_marker:
                                context_items.append(page_marker)
                        text = item.get('text', '').strip()
                        if text:
                            context_items.append(text)
            else:  # ContextMode.CHUNK
                # Extract based on chunk position
                min_pos = max(0, position - window_size)
                max_pos = min(len(content_list) - 1, position + window_size)

                context_items = []
                for idx in range(min_pos, max_pos + 1):
                    if idx == position:
                        continue  # Skip the target item itself
                    item = content_list[idx]
                    if item.get('type') in ['text', 'title', 'table']:
                        text = item.get('text', '').strip()
                        if text:
                            if idx < position:
                                context_items.append(f"[Before] {text}")
                            else:
                                context_items.append(f"[After] {text}")

            # Join and truncate by tokens
            context_text = '\n'.join(context_items)
            return self._truncate_by_tokens(context_text, max_tokens)

        except Exception as e:
            self.logger.error(f"mineru-content: context extraction failed: {str(e)}")
            return ""

    def _count_tokens(self, text: str) -> int:
        """Count tokens in text using tiktoken."""
        try:
            encoding = tiktoken.get_encoding("cl100k_base")
            return len(encoding.encode(text))
        except Exception:
            # Fallback to character-based estimation
            return len(text) // 4

    def _truncate_by_tokens(self, text: str, max_tokens: int) -> str:
        """Truncate text to fit within token limit while preserving sentence boundaries."""
        if not text:
            return ""

        try:
            encoding = tiktoken.get_encoding("cl100k_base")
            tokens = encoding.encode(text)

            if len(tokens) <= max_tokens:
                return text

            # Truncate to max tokens
            truncated_tokens = tokens[:max_tokens]
            truncated_text = encoding.decode(truncated_tokens)

            # Try to find a sentence boundary
            for sep in ['. ', '。', '! ', '? ', '\n']:
                last_sep = truncated_text.rfind(sep)
                if last_sep > len(truncated_text) * 0.8:  # Within last 20%
                    return truncated_text[:last_sep + len(sep)]

            return truncated_text + "..."

        except Exception:
            # Fallback to character-based truncation
            char_limit = max_tokens * 4
            if len(text) > char_limit:
                return text[:char_limit] + "..."
            return text

    def _clean_hallucination_patterns(self, content: str) -> str:
        """
        Clean common AI hallucination patterns from content.

        Args:
            content: The content to clean

        Returns:
            Cleaned content
        """
        if not content:
            return content

        original_length = len(content)

        # Pattern 1: Remove excessive dots (more than 10 consecutive dots)
        content = re.sub(r'\.{10,}', '...', content)

        # Pattern 2: Remove other excessive repeated characters (more than 10)
        # This handles patterns like "--------" or "========="
        content = re.sub(r'(.)\1{9,}', r'\1\1\1', content)

        # Pattern 3: Remove excessive repeated words or patterns
        # For example: "................................................................"
        # repeated many times
        content = re.sub(r'(\.{3,}[\s\n]*){5,}', '...\n', content)

        # Pattern 4: Remove number sequences that appear to be counting
        # Like ", 68\n, 72\n, 73\n, 73\n, 73\n" repeated many times
        content = re.sub(r'(,\s*\d+\s*\n?\s*){20,}', '', content)

        # Pattern 5: Remove table-of-contents lines padded with excessive dots
        # Like "8.2 Installing the wireless driver on macOS ................................"
        content = re.sub(r'([^\n]+)(\.{20,})', r'\1', content)

        # Pattern 6: Clean up multiple consecutive empty lines
        content = re.sub(r'\n{4,}', '\n\n\n', content)

        # Pattern 7: Remove excessive dots or comma-separated numbers at the end
        # But preserve normal punctuation
        content = re.sub(r'[\s\n]*[,\d\s]{10,}$', '', content)
        content = re.sub(r'\.{4,}$', '...', content)

        cleaned_length = len(content)
        if cleaned_length < original_length:
            reduction = original_length - cleaned_length
            self.logger.info(f"mineru-content: cleaned {reduction} characters of hallucination patterns "
                             f"(from {original_length} to {cleaned_length})")

        return content.strip()
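
# Minimal usage sketch (illustrative only). It assumes an already-initialised
# MinerUConfig, a MinerUResult from the MinerU API client, and pdf_path /
# temp_dir / src_fileid / learn_type values supplied by the calling pipeline;
# those names are placeholders, not part of this module.
#
#   processor = MinerUContentProcessor(config)
#   page_data = processor.split_content_by_pages(mineru_result)
#   for page_idx, page in sorted(page_data.items()):
#       result = await processor.process_page_content(
#           page_content=page.get('content', ''),
#           page_images=page['images'],
#           pdf_path=pdf_path,
#           page_idx=page_idx,
#           temp_dir=temp_dir,
#           src_fileid=src_fileid,
#           learn_type=learn_type,
#       )
#       # result['content'] holds the (optionally LLM-refined) markdown for the page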