#!/usr/bin/env python3
"""
Single-file processing functions for handling individual files.
"""

import os
import shutil
import tempfile
import zipfile
import logging
from typing import Dict, List, Tuple, Optional
from pathlib import Path

# Configure logger
logger = logging.getLogger('app')

from utils.file_utils import download_file

# Try to import the Excel/CSV processor, but fall back gracefully
# if its dependencies are missing.
try:
    from utils.excel_csv_processor import (
        is_excel_file, is_csv_file, process_excel_file, process_csv_file
    )
    EXCEL_CSV_SUPPORT = True
except ImportError as e:
    logger.warning(f"Excel/CSV processing not available: {e}")
    EXCEL_CSV_SUPPORT = False

    # Fallback functions: detect file types by extension and return empty
    # results so callers can proceed without the optional dependencies.
    # Defined inside the except block so they do not shadow the real
    # implementations when the import succeeds.
    def is_excel_file(file_path):
        return file_path.lower().endswith(('.xlsx', '.xls'))

    def is_csv_file(file_path):
        return file_path.lower().endswith('.csv')

    def process_excel_file(file_path):
        return "", []

    def process_csv_file(file_path):
        return "", []

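
# Each processed file gets its own output directory (paths as used below):
#   projects/data/<unique_id>/processed/<group_name>/<filename stem>/
#       document.txt     extracted text content
#       pagination.txt   page boundary lines
#       embedding.pkl    pickled embedding chunks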
async def process_single_file(
    unique_id: str,
    group_name: str,
    filename: str,
    original_path: str,
    local_path: str
) -> Dict:
    """
    Process a single file and generate document.txt, pagination.txt, and embedding.pkl.

    Returns:
        Dict with processing results and file paths
    """
    # Create the output directory for this file
    filename_stem = Path(filename).stem
    output_dir = os.path.join("projects", "data", unique_id, "processed", group_name, filename_stem)
    os.makedirs(output_dir, exist_ok=True)

    result = {
        "success": False,
        "filename": filename,
        "group": group_name,
        "output_dir": output_dir,
        "document_path": os.path.join(output_dir, "document.txt"),
        "pagination_path": os.path.join(output_dir, "pagination.txt"),
        "embedding_path": os.path.join(output_dir, "embedding.pkl"),
        "error": None,
        "content_size": 0,
        "pagination_lines": 0,
        "embedding_chunks": 0
    }

    try:
        # Download the file if it's remote and not yet downloaded
        if original_path.startswith(('http://', 'https://')):
            if not os.path.exists(local_path):
                logger.info(f"Downloading {original_path} -> {local_path}")
                success = await download_file(original_path, local_path)
                if not success:
                    result["error"] = "Failed to download file"
                    return result

        # Extract content from the file
        content, pagination_lines = await extract_file_content(local_path, filename)

        if not content or not content.strip():
            result["error"] = "No content extracted from file"
            return result

        # Write document.txt
        with open(result["document_path"], 'w', encoding='utf-8') as f:
            f.write(content)
        result["content_size"] = len(content)

        # Write pagination.txt
        if pagination_lines:
            with open(result["pagination_path"], 'w', encoding='utf-8') as f:
                for line in pagination_lines:
                    if line.strip():
                        f.write(f"{line}\n")
            result["pagination_lines"] = len(pagination_lines)
        else:
            # No pagination came from extraction; derive it from the text
            pagination_lines = generate_pagination_from_text(result["document_path"],
                                                             result["pagination_path"])
            result["pagination_lines"] = len(pagination_lines)

        # Generate embeddings
        try:
            embedding_chunks = await generate_embeddings_for_file(
                result["document_path"], result["embedding_path"]
            )
            result["embedding_chunks"] = len(embedding_chunks) if embedding_chunks else 0
            result["success"] = True

        except Exception as e:
            result["error"] = f"Embedding generation failed: {str(e)}"
            logger.error(f"Failed to generate embeddings for {filename}: {str(e)}")

    except Exception as e:
        result["error"] = f"File processing failed: {str(e)}"
        logger.error(f"Error processing file {filename}: {str(e)}")

    return result

async def extract_file_content(file_path: str, filename: str) -> Tuple[str, List[str]]:
    """Extract content from various file formats."""

    # Handle zip archives
    if filename.lower().endswith('.zip'):
        return await extract_from_zip(file_path, filename)

    # Handle Excel files
    elif is_excel_file(file_path):
        return await extract_from_excel(file_path, filename)

    # Handle CSV files
    elif is_csv_file(file_path):
        return await extract_from_csv(file_path, filename)

    # Fall back to plain text
    else:
        return await extract_from_text(file_path, filename)

async def extract_from_zip(zip_path: str, filename: str) -> Tuple[str, List[str]]:
    """Extract content from a zip archive."""
    content_parts = []
    pagination_lines = []

    try:
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            # Extract to a temporary directory
            temp_dir = tempfile.mkdtemp(prefix=f"extract_{Path(filename).stem}_")
            try:
                zip_ref.extractall(temp_dir)

                # Process the extracted files
                for root, dirs, files in os.walk(temp_dir):
                    for file in files:
                        if file.lower().endswith(('.txt', '.md', '.xlsx', '.xls', '.csv')):
                            file_path = os.path.join(root, file)
                            try:
                                file_content, file_pagination = await extract_file_content(file_path, file)
                                if file_content:
                                    content_parts.append(f"# Page {file}")
                                    content_parts.append(file_content)
                                    pagination_lines.extend(file_pagination)
                            except Exception as e:
                                logger.error(f"Error processing extracted file {file}: {str(e)}")
            finally:
                # Clean up the temporary directory even if processing fails
                shutil.rmtree(temp_dir, ignore_errors=True)

    except Exception as e:
        logger.error(f"Error extracting zip file {filename}: {str(e)}")
        return "", []

    return '\n\n'.join(content_parts), pagination_lines

async def extract_from_excel(file_path: str, filename: str) -> Tuple[str, List[str]]:
    """Extract content from an Excel file."""
    try:
        document_content, pagination_lines = process_excel_file(file_path)

        if document_content:
            content = f"# Page {filename}\n{document_content}"
            return content, pagination_lines
        else:
            return "", []

    except Exception as e:
        logger.error(f"Error processing Excel file {filename}: {str(e)}")
        return "", []

async def extract_from_csv(file_path: str, filename: str) -> Tuple[str, List[str]]:
    """Extract content from a CSV file."""
    try:
        document_content, pagination_lines = process_csv_file(file_path)

        if document_content:
            content = f"# Page {filename}\n{document_content}"
            return content, pagination_lines
        else:
            return "", []

    except Exception as e:
        logger.error(f"Error processing CSV file {filename}: {str(e)}")
        return "", []

async def extract_from_text(file_path: str, filename: str) -> Tuple[str, List[str]]:
    """Extract content from a plain-text file."""
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read().strip()

        # An empty file yields empty content and no pagination
        return content, []

    except Exception as e:
        logger.error(f"Error reading text file {filename}: {str(e)}")
        return "", []

def generate_pagination_from_text(document_path: str, pagination_path: str) -> List[str]:
    """Generate pagination from a text document."""
    try:
        # Import the embedding module for pagination
        import sys
        sys.path.append(os.path.join(os.path.dirname(__file__), '..', 'embedding'))
        from embedding import split_document_by_pages

        # Writes the pagination file as a side effect
        split_document_by_pages(str(document_path), str(pagination_path))

        # Read back the non-empty pagination lines
        pagination_lines = []
        with open(pagination_path, 'r', encoding='utf-8') as f:
            for line in f:
                if line.strip():
                    pagination_lines.append(line.strip())

        return pagination_lines

    except Exception as e:
        logger.error(f"Error generating pagination from text: {str(e)}")
        return []

async def generate_embeddings_for_file(document_path: str, embedding_path: str) -> Optional[List]:
    """Generate embeddings for a document."""
    try:
        # Import the embedding module
        import sys
        sys.path.append(os.path.join(os.path.dirname(__file__), '..', 'embedding'))
        from embedding import embed_document

        # Generate embeddings using paragraph chunking
        embedding_data = embed_document(
            str(document_path),
            str(embedding_path),
            chunking_strategy='paragraph'
        )

        if embedding_data and 'chunks' in embedding_data:
            return embedding_data['chunks']
        else:
            return None

    except Exception as e:
        logger.error(f"Error generating embeddings: {str(e)}")
        return None

def check_file_already_processed(unique_id: str, group_name: str, filename: str) -> bool:
    """Check if a file has already been processed."""
    filename_stem = Path(filename).stem
    output_dir = os.path.join("projects", "data", unique_id, "processed", group_name, filename_stem)

    document_path = os.path.join(output_dir, "document.txt")
    pagination_path = os.path.join(output_dir, "pagination.txt")
    embedding_path = os.path.join(output_dir, "embedding.pkl")

    # All three outputs must exist and be non-empty
    return all(
        os.path.exists(path) and os.path.getsize(path) > 0
        for path in (document_path, pagination_path, embedding_path)
    )
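

# Minimal usage sketch (illustrative only; the id, group, and paths below
# are hypothetical examples, not part of the module's original surface):
if __name__ == "__main__":
    import asyncio

    async def _demo():
        # Process one remote file end to end and report the outcome
        result = await process_single_file(
            unique_id="demo-0001",        # hypothetical project id
            group_name="reports",         # hypothetical group name
            filename="example.csv",       # hypothetical source file
            original_path="https://example.com/example.csv",
            local_path="/tmp/example.csv",
        )
        print(result["success"], result.get("error"))

    asyncio.run(_demo())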