""" Utility functions for MinerU-based parsing. This module provides helper functions for file detection, conversion, and common operations used throughout the parsing system. """ import os import subprocess import tempfile import fitz import hashlib from pathlib import Path from typing import Tuple, Optional, Dict, Any from .logger import get_module_logger logger = get_module_logger('utils') # Platform-specific utils import (if available) try: from loader.train import utils as train_utils except ImportError: train_utils = None class PDFDetector: """PDF format detection utilities""" @staticmethod def is_pdf_from_ppt(pdf_path: str) -> Tuple[bool, Dict[str, Any]]: """ Detect if PDF was converted from PowerPoint presentation. Based on gzero.py's is_pdf_from_ppt detection logic. Args: pdf_path: Path to PDF file Returns: Tuple of (is_from_ppt, metadata_dict) """ try: with fitz.open(pdf_path) as doc: metadata = doc.metadata # Check Creator/Producer fields for PPT indicators creator = metadata.get('creator', '').lower() producer = metadata.get('producer', '').lower() ppt_indicators = [ 'powerpoint', 'microsoft office', 'presentation', 'impress', 'libreoffice', 'openoffice' ] is_from_ppt = any(indicator in creator or indicator in producer for indicator in ppt_indicators) # Check page aspect ratios (PPT typically uses 16:9 or 4:3) if not is_from_ppt and len(doc) > 0: page = doc[0] aspect_ratio = page.rect.width / page.rect.height if page.rect.height > 0 else 0 # Common PPT aspect ratios: 16:9 ≈ 1.78, 4:3 ≈ 1.33 ppt_ratios = [16/9, 4/3] # Include portrait orientations tolerance = 0.1 is_from_ppt = any(abs(aspect_ratio - ratio) < tolerance for ratio in ppt_ratios) return is_from_ppt, { 'creator': metadata.get('creator', ''), 'producer': metadata.get('producer', ''), 'page_count': len(doc), 'aspect_ratio': page.rect.width / page.rect.height if len(doc) > 0 and page.rect.height > 0 else 0 } except Exception as e: logger.error(f"Error detecting PPT format from PDF {pdf_path}: {str(e)}") return False, {} @staticmethod def extract_pdf_pages(pdf_path: str) -> list: """ Extract page information from PDF file. Based on gzero.py's page extraction logic. Args: pdf_path: Path to PDF file Returns: List of page information dictionaries """ pages = [] try: with fitz.open(pdf_path) as doc: for page_index, page in enumerate(doc): page_info = { 'index': page_index, 'rotation': page.rotation, 'text': page.get_text('text'), 'width': page.rect.width, 'height': page.rect.height, 'images': [] } # Extract image information for img in page.get_images(full=True): bbox = page.get_image_bbox(img) if bbox.width > 0 and bbox.height > 0: img_info = { 'xref': img[0], 'smask': img[1], 'width': img[2], 'height': img[3], 'bbox_x': bbox.x0, 'bbox_y': bbox.y0, 'bbox_w': bbox.width, 'bbox_h': bbox.height, 'bbox_r': (bbox.width * bbox.height) / (page_info['width'] * page_info['height']) } page_info['images'].append(img_info) pages.append(page_info) except Exception as e: logger.error(f"Error extracting pages from PDF {pdf_path}: {str(e)}") return pages class FileConverter: """File conversion utilities""" @staticmethod async def convert_ppt_to_pdf(ppt_path: str, output_dir: Optional[str] = None) -> str: """ Convert PPT/PPTX file to PDF format. Based on gzero.py's conversion logic with LibreOffice. Args: ppt_path: Path to PPT/PPTX file output_dir: Output directory (defaults to temp directory) Returns: Path to converted PDF file """ if output_dir is None: output_dir = tempfile.gettempdir() pdf_filename = Path(ppt_path).stem + "_converted.pdf" pdf_path = os.path.join(output_dir, pdf_filename) try: # Use LibreOffice for conversion (primary method) cmd = [ 'libreoffice', '--headless', '--convert-to', 'pdf', '--outdir', output_dir, ppt_path ] logger.info(f"mineru-converter: converting PPT to PDF: {ppt_path} -> {pdf_path}") result = subprocess.run(cmd, capture_output=True, text=True, timeout=300) if result.returncode != 0: raise Exception(f"LibreOffice conversion failed: {result.stderr}") # LibreOffice generates file with original name generated_pdf = os.path.join(output_dir, Path(ppt_path).stem + ".pdf") if os.path.exists(generated_pdf) and generated_pdf != pdf_path: os.rename(generated_pdf, pdf_path) if not os.path.exists(pdf_path): raise Exception("PDF file was not generated") logger.info(f"mineru-converter: PPT conversion successful: {pdf_path}") return pdf_path except subprocess.TimeoutExpired: raise Exception("PPT conversion timeout") except FileNotFoundError: # LibreOffice not available, try alternative method logger.warning("LibreOffice not found, trying alternative conversion method") return await FileConverter._convert_ppt_alternative(ppt_path, output_dir) except Exception as e: logger.error(f"mineru-converter: PPT conversion failed: {str(e)}") raise @staticmethod async def _convert_ppt_alternative(ppt_path: str, output_dir: str) -> str: """ Alternative PPT to PDF conversion method using python-pptx + reportlab. Args: ppt_path: Path to PPT file output_dir: Output directory Returns: Path to converted PDF file """ try: from pptx import Presentation from reportlab.pdfgen import canvas from reportlab.lib.pagesizes import A4 pdf_filename = Path(ppt_path).stem + "_converted.pdf" pdf_path = os.path.join(output_dir, pdf_filename) # Read PPT file prs = Presentation(ppt_path) # Create PDF c = canvas.Canvas(pdf_path, pagesize=A4) for slide_idx, slide in enumerate(prs.slides): if slide_idx > 0: c.showPage() # Extract slide content (simplified version) y_position = 750 for shape in slide.shapes: if hasattr(shape, "text") and shape.text.strip(): # Truncate long text to fit on page text = shape.text[:100] c.drawString(50, y_position, text) y_position -= 30 if y_position < 50: break c.save() logger.info(f"mineru-converter: alternative PPT conversion successful: {pdf_path}") return pdf_path except ImportError: raise Exception("Alternative PPT conversion requires python-pptx and reportlab libraries") except Exception as e: raise Exception(f"Alternative PPT conversion failed: {str(e)}") @staticmethod async def convert_doc_to_pdf(doc_path: str, output_dir: Optional[str] = None) -> str: """ Convert DOC/DOCX file to PDF format. Args: doc_path: Path to DOC/DOCX file output_dir: Output directory (defaults to temp directory) Returns: Path to converted PDF file """ if output_dir is None: output_dir = tempfile.gettempdir() pdf_filename = Path(doc_path).stem + "_converted.pdf" pdf_path = os.path.join(output_dir, pdf_filename) try: # Use LibreOffice for conversion (same as PPT) cmd = [ 'libreoffice', '--headless', '--convert-to', 'pdf', '--outdir', output_dir, doc_path ] logger.info(f"mineru-converter: converting DOC to PDF: {doc_path} -> {pdf_path}") result = subprocess.run(cmd, capture_output=True, text=True, timeout=300) if result.returncode != 0: raise Exception(f"LibreOffice conversion failed: {result.stderr}") # LibreOffice generates file with original name generated_pdf = os.path.join(output_dir, Path(doc_path).stem + ".pdf") if os.path.exists(generated_pdf) and generated_pdf != pdf_path: os.rename(generated_pdf, pdf_path) if not os.path.exists(pdf_path): raise Exception("PDF file was not generated") logger.info(f"mineru-converter: DOC conversion successful: {pdf_path}") return pdf_path except subprocess.TimeoutExpired: raise Exception("DOC conversion timeout") except FileNotFoundError: # LibreOffice not available, try alternative method logger.warning("LibreOffice not found, trying alternative conversion method") return await FileConverter._convert_doc_alternative(doc_path, output_dir) except Exception as e: logger.error(f"mineru-converter: DOC conversion failed: {str(e)}") raise @staticmethod async def _convert_doc_alternative(doc_path: str, output_dir: str) -> str: """ Alternative DOC to PDF conversion method using python-docx. Args: doc_path: Path to DOC file output_dir: Output directory Returns: Path to converted PDF file """ try: from docx import Document from reportlab.pdfgen import canvas from reportlab.lib.pagesizes import A4 from reportlab.lib.units import inch pdf_filename = Path(doc_path).stem + "_converted.pdf" pdf_path = os.path.join(output_dir, pdf_filename) # Read DOC file doc = Document(doc_path) # Create PDF c = canvas.Canvas(pdf_path, pagesize=A4) width, height = A4 y_position = height - inch for paragraph in doc.paragraphs: if paragraph.text.strip(): # Handle text wrapping text = paragraph.text if len(text) > 80: # Approximate line length lines = [text[i:i+80] for i in range(0, len(text), 80)] for line in lines: if y_position < inch: c.showPage() y_position = height - inch c.drawString(inch, y_position, line) y_position -= 20 else: if y_position < inch: c.showPage() y_position = height - inch c.drawString(inch, y_position, text) y_position -= 20 c.save() logger.info(f"mineru-converter: alternative DOC conversion successful: {pdf_path}") return pdf_path except ImportError: raise Exception("Alternative DOC conversion requires python-docx and reportlab libraries") except Exception as e: raise Exception(f"Alternative DOC conversion failed: {str(e)}") @staticmethod async def convert_image_to_pdf(image_path: str, output_dir: Optional[str] = None) -> str: """ Convert image file to PDF format. Args: image_path: Path to image file output_dir: Output directory (defaults to temp directory) Returns: Path to converted PDF file """ if output_dir is None: output_dir = tempfile.gettempdir() pdf_filename = Path(image_path).stem + "_converted.pdf" pdf_path = os.path.join(output_dir, pdf_filename) try: from PIL import Image logger.info(f"mineru-converter: converting image to PDF: {image_path} -> {pdf_path}") # Open image img = Image.open(image_path) # Convert to RGB if necessary (for PNG with transparency, etc.) if img.mode in ('RGBA', 'LA'): # Create a white background background = Image.new('RGB', img.size, (255, 255, 255)) if img.mode == 'RGBA': background.paste(img, mask=img.split()[3]) # Use alpha channel as mask else: background.paste(img, mask=img.split()[1]) # LA mode img = background elif img.mode not in ('RGB', 'L'): img = img.convert('RGB') # Save as PDF img.save(pdf_path, 'PDF', resolution=100.0) logger.info(f"mineru-converter: image conversion successful: {pdf_path}") return pdf_path except ImportError: raise Exception("Image conversion requires Pillow library") except Exception as e: logger.error(f"mineru-converter: image conversion failed: {str(e)}") raise @staticmethod def detect_file_type(file_path: str) -> str: """ Detect file type from file extension. Args: file_path: Path to file Returns: File type string ('pdf', 'ppt', 'pptx', 'doc', 'docx', 'image', etc.) """ extension = Path(file_path).suffix.lower() type_map = { '.pdf': 'pdf', '.ppt': 'ppt', '.pptx': 'pptx', '.doc': 'doc', '.docx': 'docx', '.png': 'image', '.jpg': 'image', '.jpeg': 'image', '.gif': 'image', '.bmp': 'image', '.tiff': 'image', '.tif': 'image', '.webp': 'image', '.svg': 'image' } return type_map.get(extension, 'unknown') @staticmethod def validate_file_size(file_path: str, max_size_mb: float) -> bool: """ Validate file size against maximum allowed size. Args: file_path: Path to file max_size_mb: Maximum size in MB Returns: True if file size is acceptable """ try: file_size = os.path.getsize(file_path) max_size_bytes = max_size_mb * 1024 * 1024 return file_size <= max_size_bytes except Exception as e: logger.error(f"Error checking file size for {file_path}: {str(e)}") return False def clean_filename(filename: str) -> str: """ Clean filename for safe file system operations. Args: filename: Original filename Returns: Cleaned filename """ # Remove invalid characters invalid_chars = '<>:"/\\|?*' cleaned = filename for char in invalid_chars: cleaned = cleaned.replace(char, '_') # Limit length if len(cleaned) > 200: name, ext = os.path.splitext(cleaned) cleaned = name[:200-len(ext)] + ext return cleaned def create_temp_directory(base_dir: str, prefix: str = "mineru_") -> str: """ Create temporary directory for processing. Args: base_dir: Base directory path prefix: Prefix for temp directory name Returns: Path to created temporary directory """ temp_dir = tempfile.mkdtemp(prefix=prefix, dir=base_dir) logger.debug(f"mineru-utils: created temporary directory: {temp_dir}") return temp_dir def get_file_hash(filepath: str) -> str: """ Generate MD5 hash of a file for tracing and identification. Args: filepath: Path to the file Returns: MD5 hash string of the file """ hash_obj = hashlib.md5() with open(filepath, 'rb') as file: while chunk := file.read(8192): hash_obj.update(chunk) return hash_obj.hexdigest() def get_temp_dir(src_fileid: str, learn_type: int, cache_version: str = 'v1') -> str: """ Setup processing environment with standardized directory structure. Args: src_fileid: Source file ID (typically file hash) learn_type: Model type index for processing cache_version: Cache version string (default: 'v1') Returns: Path to the created temporary directory """ # Get cache directory - use platform utils if available, else use temp if train_utils: cache_dir = train_utils.parser_cache_dir() else: import tempfile cache_dir = tempfile.gettempdir() temp_dir = os.path.join( cache_dir, cache_version, f"{src_fileid}_{learn_type}_mineru" ) os.makedirs(temp_dir, exist_ok=True) return temp_dir