qwen_agent/utils/file_utils.py

#!/usr/bin/env python3
"""
File utility functions for file processing, downloading, and management.
"""
import os
import hashlib
import json
import time
import aiofiles
import aiohttp
import shutil
import zipfile
import tempfile
import logging
from typing import Dict, List, Optional
from pathlib import Path

# Configure logging
logger = logging.getLogger('app')


async def download_file(url: str, destination_path: str) -> bool:
    """Download file from URL asynchronously"""
    try:
        async with aiohttp.ClientSession() as session:
            async with session.get(url) as response:
                if response.status == 200:
                    async with aiofiles.open(destination_path, 'wb') as f:
                        async for chunk in response.content.iter_chunked(8192):
                            await f.write(chunk)
                    return True
                else:
                    logger.error(f"Failed to download {url}, status code: {response.status}")
                    return False
    except Exception as e:
        logger.error(f"Error downloading {url}: {str(e)}")
        return False
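
# Illustrative usage sketch for download_file (not part of the module's API).
# The URL and destination below are placeholders:
#
#     import asyncio
#     ok = asyncio.run(download_file("https://example.com/sample.zip", "/tmp/sample.zip"))
#     if not ok:
#         print("download failed; see the 'app' logger for details")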


def get_file_hash(file_path: str) -> str:
    """Calculate MD5 hash for a file path/URL"""
    return hashlib.md5(file_path.encode('utf-8')).hexdigest()


def remove_file_or_directory(path: str):
    """Remove file or directory recursively"""
    try:
        if os.path.exists(path):
            if os.path.isfile(path):
                os.remove(path)
            elif os.path.isdir(path):
                shutil.rmtree(path)
            logger.info(f"Removed: {path}")
        else:
            logger.warning(f"Path does not exist: {path}")
    except Exception as e:
        logger.error(f"Error removing {path}: {str(e)}")


def extract_zip_file(zip_path: str, extract_dir: str) -> List[str]:
    """Extract a zip file and return the list of extracted txt/md/xlsx/xls/csv files"""
    extracted_files = []
    try:
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall(extract_dir)
        # Find all extracted txt, md, xlsx, xls, and csv files
        for root, dirs, files in os.walk(extract_dir):
            for file in files:
                if file.lower().endswith(('.txt', '.md', '.xlsx', '.xls', '.csv')):
                    extracted_files.append(os.path.join(root, file))
        logger.info(f"Extracted {len(extracted_files)} txt/md/xlsx/csv files from {zip_path}")
        return extracted_files
    except Exception as e:
        logger.error(f"Error extracting zip file {zip_path}: {str(e)}")
        return []
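
# Illustrative sketch (placeholder paths): extracting an archive into a temporary
# directory and iterating over the supported files:
#
#     with tempfile.TemporaryDirectory() as tmp_dir:
#         for path in extract_zip_file("/tmp/sample.zip", tmp_dir):
#             print(path, get_file_hash(path))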


def get_document_preview(document_path: str, max_lines: int = 10) -> str:
    """Get preview of document content"""
    try:
        with open(document_path, 'r', encoding='utf-8') as f:
            lines = []
            for i, line in enumerate(f):
                if i >= max_lines:
                    break
                lines.append(line.rstrip())
            return '\n'.join(lines)
    except Exception as e:
        return f"Error reading document: {str(e)}"


def is_file_already_processed(target_file: Path, pagination_file: Path, embeddings_file: Path) -> bool:
    """Check if a file has already been processed (document.txt, pagination.txt, and embeddings exist)"""
    if not target_file.exists():
        return False
    # Check if pagination and embeddings files exist and are not empty
    if pagination_file.exists() and embeddings_file.exists():
        # Check file sizes to ensure they're not empty
        if pagination_file.stat().st_size > 0 and embeddings_file.stat().st_size > 0:
            return True
    return False
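
# Illustrative sketch, with assumed (not authoritative) file names for one document's
# processed artifacts:
#
#     doc_dir = Path("projects/data/demo-id/processed/doc1")
#     done = is_file_already_processed(
#         doc_dir / "document.txt",
#         doc_dir / "pagination.txt",
#         doc_dir / "embeddings.json",
#     )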


def load_processed_files_log(unique_id: str) -> Dict[str, Dict]:
    """Load processed files log for a project"""
    log_file = os.path.join("projects", "data", unique_id, "processed_files.json")
    if os.path.exists(log_file):
        try:
            with open(log_file, 'r', encoding='utf-8') as f:
                return json.load(f)
        except Exception as e:
            logger.error(f"Error loading processed files log: {e}")
    return {}


def save_processed_files_log(unique_id: str, processed_log: Dict[str, Dict]):
    """Save processed files log for a project (legacy function)"""
    log_file = os.path.join("projects", "data", unique_id, "processed_files.json")
    try:
        os.makedirs(os.path.dirname(log_file), exist_ok=True)
        with open(log_file, 'w', encoding='utf-8') as f:
            json.dump(processed_log, f, ensure_ascii=False, indent=2)
    except Exception as e:
        logger.error(f"Error saving processed files log: {e}")


def get_processing_log(unique_id: str) -> Dict:
    """Get the comprehensive processing log for a project"""
    log_file = os.path.join("projects", "data", unique_id, "processing_log.json")
    if os.path.exists(log_file):
        try:
            with open(log_file, 'r', encoding='utf-8') as f:
                return json.load(f)
        except Exception as e:
            logger.error(f"Error loading processing log: {e}")
    return {}


def save_project_status(unique_id: str, status: Dict):
    """Save project processing status"""
    status_file = os.path.join("projects", "data", unique_id, "status.json")
    try:
        os.makedirs(os.path.dirname(status_file), exist_ok=True)
        with open(status_file, 'w', encoding='utf-8') as f:
            json.dump(status, f, ensure_ascii=False, indent=2)
    except Exception as e:
        logger.error(f"Error saving project status: {e}")


def load_project_status(unique_id: str) -> Dict:
    """Load project processing status"""
    status_file = os.path.join("projects", "data", unique_id, "status.json")
    if os.path.exists(status_file):
        try:
            with open(status_file, 'r', encoding='utf-8') as f:
                return json.load(f)
        except Exception as e:
            logger.error(f"Error loading project status: {e}")
    return {}
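
# Illustrative sketch of a status round trip (the status keys shown are placeholders,
# not a schema defined by this module):
#
#     save_project_status("demo-id", {"stage": "processing", "progress": 0.5})
#     current = load_project_status("demo-id")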


def get_file_metadata(file_path: str) -> Dict:
    """Get metadata for a file"""
    try:
        if not os.path.exists(file_path):
            return {"exists": False}
        stat = os.stat(file_path)
        return {
            "exists": True,
            "size": stat.st_size,
            "modified_time": stat.st_mtime,
            "created_time": stat.st_ctime,
            "is_file": os.path.isfile(file_path),
            "is_directory": os.path.isdir(file_path)
        }
    except Exception as e:
        return {"exists": False, "error": str(e)}


def update_file_processing_status(unique_id: str, group_name: str, filename: str, status: Dict):
    """Update processing status for a specific file"""
    status_file = os.path.join("projects", "data", unique_id, "file_status.json")
    try:
        # Load existing status
        if os.path.exists(status_file):
            with open(status_file, 'r', encoding='utf-8') as f:
                file_status = json.load(f)
        else:
            file_status = {}
        # Ensure structure exists
        if group_name not in file_status:
            file_status[group_name] = {}
        # Update status, recording when this entry was written
        file_status[group_name][filename] = {
            **status,
            "updated_at": str(time.time())
        }
        # Save updated status
        os.makedirs(os.path.dirname(status_file), exist_ok=True)
        with open(status_file, 'w', encoding='utf-8') as f:
            json.dump(file_status, f, ensure_ascii=False, indent=2)
    except Exception as e:
        logger.error(f"Error updating file processing status: {e}")


def get_file_processing_status(unique_id: str, group_name: Optional[str] = None, filename: Optional[str] = None) -> Dict:
    """Get processing status for files"""
    status_file = os.path.join("projects", "data", unique_id, "file_status.json")
    if not os.path.exists(status_file):
        return {}
    try:
        with open(status_file, 'r', encoding='utf-8') as f:
            file_status = json.load(f)
        # Filter by group and filename if provided
        if group_name:
            if group_name not in file_status:
                return {}
            if filename:
                return file_status[group_name].get(filename, {})
            else:
                return file_status[group_name]
        return file_status
    except Exception as e:
        logger.error(f"Error getting file processing status: {e}")
        return {}
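
# Illustrative sketch of updating and querying per-file status (group and file names
# are placeholders):
#
#     update_file_processing_status("demo-id", "group_a", "report.txt", {"state": "done"})
#     one_file = get_file_processing_status("demo-id", "group_a", "report.txt")
#     whole_group = get_file_processing_status("demo-id", "group_a")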


def calculate_directory_size(directory_path: str) -> int:
    """Calculate total size of a directory recursively"""
    total_size = 0
    try:
        for dirpath, dirnames, filenames in os.walk(directory_path):
            for filename in filenames:
                file_path = os.path.join(dirpath, filename)
                if os.path.exists(file_path):
                    total_size += os.path.getsize(file_path)
    except Exception as e:
        logger.error(f"Error calculating directory size: {e}")
    return total_size


def get_project_statistics(unique_id: str) -> Dict:
    """Get comprehensive statistics for a project"""
    project_dir = os.path.join("projects", "data", unique_id)
    if not os.path.exists(project_dir):
        return {"project_exists": False}
    stats = {
        "project_exists": True,
        "unique_id": unique_id,
        "directories": {},
        "total_files": 0,
        "total_size": 0
    }
    # Check each directory
    directories = ["files", "processed", "dataset"]
    for dir_name in directories:
        dir_path = os.path.join(project_dir, dir_name)
        if os.path.exists(dir_path):
            dir_size = calculate_directory_size(dir_path)
            dir_files = 0
            for root, dirs, files in os.walk(dir_path):
                dir_files += len(files)
            stats["directories"][dir_name] = {
                "exists": True,
                "size": dir_size,
                "files": dir_files
            }
            stats["total_files"] += dir_files
            stats["total_size"] += dir_size
        else:
            stats["directories"][dir_name] = {
                "exists": False,
                "size": 0,
                "files": 0
            }
    return stats
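

if __name__ == "__main__":
    # Minimal read-only self-check sketch: the project id below is a placeholder, so
    # on most machines this simply reports that the project directory does not exist.
    import pprint
    demo_id = "demo-id"
    pprint.pprint(get_project_statistics(demo_id))
    pprint.pprint(get_file_metadata(__file__))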