"""
File conversion module for MinerU-based parsing.

This module handles file type detection, conversion of PPT, DOC and image
files to PDF, and related file processing operations.
"""

import asyncio
import os
import shutil
import subprocess
from pathlib import Path
from typing import Tuple, Optional

from .logger import get_module_logger
from .utils import FileConverter, PDFDetector
from .config_base import MinerUConfig

logger = get_module_logger('converter')


class DocumentConverter:
    """Document conversion handler with PPT support.

    Detects the type of an input file and produces a PDF inside a caller
    supplied temporary directory. PPT/PPTX and DOC/DOCX inputs are converted
    with LibreOffice first and fall back to a secondary converter when that
    fails; images are delegated to ``FileConverter``; PDFs are copied as-is.
    Conversions performed directly by this class write ``source.pdf`` in the
    temporary directory.
    """

    # File name used for the PDF this class writes into the temp directory.
    _OUTPUT_NAME = "source.pdf"

    # Layout constants for the python-pptx + reportlab fallback renderer.
    _PAGE_MARGIN = 50        # points; also the distance of the first line from the top
    _LINE_CHAR_LIMIT = 80    # approximate character limit per rendered line
    _LINE_STEP = 15          # vertical step between wrapped lines of one text block
    _BLOCK_GAP = 20          # vertical step after the last line of a text block

    def __init__(self, config: "MinerUConfig"):
        """Store the configuration and bind the module-level logger.

        Args:
            config: Configuration object. This class reads ``max_file_size``,
                ``libreoffice_path`` and ``conversion_timeout`` from it.
        """
        self.config = config
        self.logger = logger

    async def handle_file_input(self, file_path: str, temp_dir: str, src_fileid: str) -> Tuple[str, bool]:
        """
        Handle input file processing - detect type and convert if needed.

        Args:
            file_path: Input file path
            temp_dir: Temporary directory for processing
            src_fileid: Source file ID (accepted for interface compatibility;
                not currently included in the log messages)

        Returns:
            Tuple of (pdf_path, is_ppt_source). ``is_ppt_source`` is True for
            PPT/PPTX and DOC/DOCX inputs and False for images and PDFs.

        Raises:
            ValueError: If the file exceeds the configured size limit or its
                detected type is not supported.
            Exception: Any conversion error, logged and re-raised unchanged.
        """
        try:
            file_type = FileConverter.detect_file_type(file_path)
            self.logger.info(f"mineru-converter: detected file type: {file_type}")

            # The configured limit is divided by 1024 * 1024 and handed to the
            # validator as a megabyte value; the same number is reported on failure.
            max_size_mb = self.config.max_file_size / (1024 * 1024)
            if not FileConverter.validate_file_size(file_path, max_size_mb):
                raise ValueError(f"File size exceeds maximum allowed size of {max_size_mb:.1f}MB")

            if file_type in ('ppt', 'pptx'):
                self.logger.info("mineru-converter: converting PPT to PDF")
                pdf_path = await self._convert_ppt_to_pdf(file_path, temp_dir, src_fileid)
                return pdf_path, True  # is_ppt_source = True

            if file_type in ('doc', 'docx'):
                self.logger.info("mineru-converter: converting DOC to PDF")
                pdf_path = await self._convert_doc_to_pdf(file_path, temp_dir, src_fileid)
                return pdf_path, True  # is_ppt_source = True (DOC treated like PPT)

            if file_type == 'image':
                self.logger.info("mineru-converter: converting image to PDF")
                pdf_path = await self._convert_image_to_pdf(file_path, temp_dir, src_fileid)
                return pdf_path, False  # is_ppt_source = False (images are not PPT-like)

            if file_type == 'pdf':
                self.logger.info("mineru-converter: processing PDF directly")
                pdf_path = os.path.join(temp_dir, self._OUTPUT_NAME)
                try:
                    shutil.copy(file_path, pdf_path)
                except shutil.SameFileError:
                    # The input already is <temp_dir>/source.pdf; nothing to copy.
                    pass
                return pdf_path, False  # is_ppt_source = False

            raise ValueError(f"Unsupported file type: {file_type}")

        except Exception as e:
            self.logger.error(f"mineru-converter: file input handling failed: {str(e)}")
            raise

    def _collect_libreoffice_output(self, input_path: str, temp_dir: str) -> str:
        """Move the PDF LibreOffice produced to the standard output name.

        LibreOffice names its output after the input file's stem, so the
        result is looked up as ``<temp_dir>/<stem>.pdf`` and moved to
        ``<temp_dir>/source.pdf``.

        Args:
            input_path: Path of the document that was converted.
            temp_dir: Directory passed to LibreOffice as ``--outdir``.

        Returns:
            Path to ``source.pdf`` inside ``temp_dir``.

        Raises:
            Exception: If neither the generated file nor ``source.pdf`` exists.
        """
        pdf_path = os.path.join(temp_dir, self._OUTPUT_NAME)
        generated_pdf = os.path.join(temp_dir, Path(input_path).stem + ".pdf")

        if os.path.exists(generated_pdf):
            if generated_pdf != pdf_path:
                # os.replace overwrites an existing destination on every
                # platform, whereas os.rename fails on Windows in that case.
                os.replace(generated_pdf, pdf_path)
        elif not os.path.exists(pdf_path):
            raise Exception("PDF file was not generated by LibreOffice")
        # Otherwise source.pdf is already present under the correct name.
        return pdf_path

    async def _convert_ppt_to_pdf(self, ppt_path: str, temp_dir: str, src_fileid: str) -> str:
        """
        Convert PPT file to PDF using LibreOffice or alternative method.

        Args:
            ppt_path: Path to PPT file
            temp_dir: Temporary directory
            src_fileid: Source file ID (forwarded to the helpers)

        Returns:
            Path to converted PDF file

        Raises:
            Exception: If both conversion paths fail or no output file exists;
                the underlying error is attached as ``__cause__``.
        """
        try:
            if await self._try_libreoffice_conversion(ppt_path, temp_dir, src_fileid):
                pdf_path = self._collect_libreoffice_output(ppt_path, temp_dir)
            else:
                self.logger.warning("mineru-converter: LibreOffice failed, trying alternative conversion")
                pdf_path = await self._try_alternative_conversion(ppt_path, temp_dir, src_fileid)

            if not os.path.exists(pdf_path):
                raise Exception("PDF conversion failed - no output file generated")

            self.logger.info(f"mineru-converter: PPT to PDF conversion successful: {pdf_path}")
            return pdf_path

        except Exception as e:
            self.logger.error(f"mineru-converter: PPT conversion error: {str(e)}")
            raise Exception(f"PPT to PDF conversion failed: {str(e)}") from e

    @staticmethod
    async def _kill_and_reap(process: "asyncio.subprocess.Process") -> None:
        """Kill a subprocess and wait for it so it does not linger unreaped."""
        try:
            process.kill()
        except ProcessLookupError:
            # The process exited between the timeout firing and the kill.
            pass
        await process.wait()

    async def _try_libreoffice_conversion(self, ppt_path: str, temp_dir: str, src_fileid: str) -> bool:
        """
        Try converting a document to PDF using LibreOffice in headless mode.

        Despite the parameter name, this is used for both PPT and DOC inputs.
        The executable is ``config.libreoffice_path`` and the run is limited
        to ``config.conversion_timeout``. All failures are logged as warnings
        and reported through the return value; nothing is raised.

        Args:
            ppt_path: Path to the document to convert
            temp_dir: Output directory passed as ``--outdir``
            src_fileid: Source file ID (not currently used)

        Returns:
            True if conversion successful, False otherwise
        """
        try:
            cmd = [
                self.config.libreoffice_path,
                '--headless',
                '--convert-to', 'pdf',
                '--outdir', temp_dir,
                ppt_path,
            ]

            self.logger.info("mineru-converter: executing LibreOffice conversion")

            process = await asyncio.create_subprocess_exec(
                *cmd,
                stdout=asyncio.subprocess.PIPE,
                stderr=asyncio.subprocess.PIPE,
            )

            try:
                _stdout, stderr = await asyncio.wait_for(
                    process.communicate(),
                    timeout=self.config.conversion_timeout,
                )
            except asyncio.TimeoutError:
                self.logger.warning("mineru-converter: LibreOffice conversion timeout")
                await self._kill_and_reap(process)
                return False

            if process.returncode != 0:
                # Tool output is not guaranteed to be valid UTF-8; never let
                # decoding mask the real failure.
                stderr_text = stderr.decode(errors="replace")
                self.logger.warning(f"mineru-converter: LibreOffice returned non-zero: {stderr_text}")
                return False

            self.logger.info("mineru-converter: LibreOffice conversion completed successfully")
            return True

        except FileNotFoundError:
            self.logger.warning(f"mineru-converter: LibreOffice not found at: {self.config.libreoffice_path}")
            return False
        except Exception as e:
            self.logger.warning(f"mineru-converter: LibreOffice conversion error: {str(e)}")
            return False

    @staticmethod
    def _wrap_words(text: str, limit: int):
        """Yield greedy word-wrapped lines of at most ``limit`` characters.

        Words are never split: a single word longer than ``limit`` is yielded
        on a line of its own.
        """
        line = ""
        for word in text.split():
            candidate = f"{line} {word}".strip()
            if len(candidate) > limit:
                if line:
                    yield line
                line = word
            else:
                line = candidate
        if line:
            yield line

    async def _try_alternative_conversion(self, ppt_path: str, temp_dir: str, src_fileid: str) -> str:
        """
        Try alternative PPT conversion method using python libraries.

        Renders one A4 page per slide containing only the text of the slide's
        shapes plus a "Slide N" footer. Text that does not fit above the
        bottom margin is dropped, so the page count always equals the slide
        count.

        Args:
            ppt_path: Path to the presentation
            temp_dir: Directory in which ``source.pdf`` is written
            src_fileid: Source file ID (not currently used)

        Returns:
            Path to converted PDF file

        Raises:
            Exception: If python-pptx / reportlab are missing or rendering
                fails; the underlying error is attached as ``__cause__``.
        """
        try:
            from pptx import Presentation
            from reportlab.pdfgen import canvas
            from reportlab.lib.pagesizes import A4

            pdf_path = os.path.join(temp_dir, self._OUTPUT_NAME)

            self.logger.info("mineru-converter: using python-pptx + reportlab conversion")

            prs = Presentation(ppt_path)

            c = canvas.Canvas(pdf_path, pagesize=A4)
            page_width, page_height = A4
            margin = self._PAGE_MARGIN

            for slide_idx, slide in enumerate(prs.slides):
                if slide_idx > 0:
                    c.showPage()

                y_position = page_height - margin  # Start from top

                slide_texts = [
                    shape.text.strip()
                    for shape in slide.shapes
                    if hasattr(shape, "text") and shape.text.strip()
                ]

                for text in slide_texts:
                    if y_position < margin:
                        break  # Avoid writing below page margin

                    lines = list(self._wrap_words(text, self._LINE_CHAR_LIMIT))
                    last_idx = len(lines) - 1
                    for line_idx, line in enumerate(lines):
                        if y_position < margin:
                            break
                        c.drawString(margin, y_position, line)
                        # Extra space after the final line separates text blocks.
                        y_position -= self._BLOCK_GAP if line_idx == last_idx else self._LINE_STEP

                # Add slide number
                c.drawString(page_width - 100, 30, f"Slide {slide_idx + 1}")

            c.save()

            self.logger.info(f"mineru-converter: alternative conversion completed: {pdf_path}")
            return pdf_path

        except ImportError as e:
            raise Exception(f"Alternative conversion requires python-pptx and reportlab: {str(e)}") from e
        except Exception as e:
            raise Exception(f"Alternative PPT conversion failed: {str(e)}") from e

    async def _convert_doc_to_pdf(self, doc_path: str, temp_dir: str, src_fileid: str) -> str:
        """
        Convert DOC file to PDF using LibreOffice or alternative method.

        Args:
            doc_path: Path to DOC file
            temp_dir: Temporary directory
            src_fileid: Source file ID (forwarded to the LibreOffice helper)

        Returns:
            Path to converted PDF file. On the fallback path this is whatever
            ``FileConverter.convert_doc_to_pdf`` returns.

        Raises:
            Exception: If both conversion paths fail or no output file exists;
                the underlying error is attached as ``__cause__``.
        """
        try:
            if await self._try_libreoffice_conversion(doc_path, temp_dir, src_fileid):
                pdf_path = self._collect_libreoffice_output(doc_path, temp_dir)
            else:
                self.logger.warning("mineru-converter: LibreOffice failed, using FileConverter for DOC")
                pdf_path = await FileConverter.convert_doc_to_pdf(doc_path, temp_dir)

            if not os.path.exists(pdf_path):
                raise Exception("PDF conversion failed - no output file generated")

            self.logger.info(f"mineru-converter: DOC to PDF conversion successful: {pdf_path}")
            return pdf_path

        except Exception as e:
            self.logger.error(f"mineru-converter: DOC conversion error: {str(e)}")
            raise Exception(f"DOC to PDF conversion failed: {str(e)}") from e

    async def _convert_image_to_pdf(self, image_path: str, temp_dir: str, src_fileid: str) -> str:
        """
        Convert image file to PDF.

        Args:
            image_path: Path to image file
            temp_dir: Temporary directory
            src_fileid: Source file ID (not currently used)

        Returns:
            Path to converted PDF file (``source.pdf`` inside ``temp_dir``)

        Raises:
            Exception: If conversion fails or no output file exists; the
                underlying error is attached as ``__cause__``.
        """
        try:
            pdf_path = os.path.join(temp_dir, self._OUTPUT_NAME)

            converted_path = await FileConverter.convert_image_to_pdf(image_path, temp_dir)

            # Rename to the standard name if the helper chose a different one.
            if converted_path != pdf_path and os.path.exists(converted_path):
                os.replace(converted_path, pdf_path)

            if not os.path.exists(pdf_path):
                raise Exception("PDF conversion failed - no output file generated")

            self.logger.info(f"mineru-converter: image to PDF conversion successful: {pdf_path}")
            return pdf_path

        except Exception as e:
            self.logger.error(f"mineru-converter: image conversion error: {str(e)}")
            raise Exception(f"Image to PDF conversion failed: {str(e)}") from e

    def detect_pdf_format(self, pdf_path: str, src_fileid: str) -> Tuple[bool, dict]:
        """
        Detect if PDF was converted from PowerPoint.

        Args:
            pdf_path: Path to PDF file
            src_fileid: Source file ID (not currently used)

        Returns:
            Tuple of (is_ppt_format, metadata). On any detection error the
            failure is logged and ``(False, {})`` is returned.
        """
        try:
            is_ppt, metadata = PDFDetector.is_pdf_from_ppt(pdf_path)

            self.logger.info(f"mineru-converter: PDF format detection: is_ppt={is_ppt}, metadata={metadata}")

            return is_ppt, metadata

        except Exception as e:
            self.logger.error(f"mineru-converter: PDF format detection failed: {str(e)}")
            return False, {}

    def extract_pdf_pages(self, pdf_path: str, src_fileid: str) -> list:
        """
        Extract page information from PDF.

        Args:
            pdf_path: Path to PDF file
            src_fileid: Source file ID (not currently used)

        Returns:
            List of page information as produced by
            ``PDFDetector.extract_pdf_pages``. On any extraction error the
            failure is logged and an empty list is returned.
        """
        try:
            pages = PDFDetector.extract_pdf_pages(pdf_path)

            self.logger.info(f"mineru-converter: extracted {len(pages)} pages from PDF")

            return pages

        except Exception as e:
            self.logger.error(f"mineru-converter: page extraction failed: {str(e)}")
            return []