# qwen_agent/zip_project_handler.py

#!/usr/bin/env python3
"""
ZIP project handler.
Responsible for downloading ZIP files from URLs and extracting them into project directories.
"""

import os
import hashlib
import zipfile
import requests
import tempfile
import logging
from typing import List, Optional
from urllib.parse import urlparse
from pathlib import Path

# Module-wide logger bound to the 'app' logger name; this module attaches no
# handlers of its own.
logger = logging.getLogger("app")


class ZipProjectHandler:
    """Fetch ZIP archives (by URL or local path) and unpack them into project directories.

    Layout under ``projects_dir``:

    * ``_cache/`` holds the downloaded/copied ZIP files.
    * ``<unique_id>/`` or ``<url hash>/`` holds the extracted project.
    """

    def __init__(self, projects_dir: str = "./projects"):
        """Create the projects directory and its ``_cache`` subdirectory.

        Args:
            projects_dir: Root directory for extracted projects. Missing
                parent directories are created as well.
        """
        self.projects_dir = Path(projects_dir).resolve()
        # parents=True so a nested root such as "./data/projects" works on first run.
        self.projects_dir.mkdir(parents=True, exist_ok=True)
        self.cache_dir = self.projects_dir / "_cache"
        self.cache_dir.mkdir(exist_ok=True)

    def _get_url_hash(self, url: str) -> str:
        """Return the first 16 hex characters of the MD5 of ``url`` (used as a cache key)."""
        return hashlib.md5(url.encode('utf-8')).hexdigest()[:16]

    @staticmethod
    def _is_url(path: str) -> bool:
        """Return True when ``path`` parses with both a scheme and a network location."""
        try:
            parsed = urlparse(path)
        except (ValueError, TypeError, AttributeError):
            return False
        return bool(parsed.scheme and parsed.netloc)

    @staticmethod
    def _remove_quietly(path) -> None:
        """Delete a file, ignoring the case where it is missing or cannot be removed."""
        try:
            os.remove(path)
        except OSError:
            pass

    def _is_safe_dir_name(self, name: str) -> bool:
        """Return True when ``name`` is a single path component usable under ``projects_dir``.

        Rejects anything containing a path separator, the ``..`` component, and
        the name of the internal cache directory.
        """
        if name == ".." or name == self.cache_dir.name:
            return False
        # Path(name).name differs from name as soon as name has a separator,
        # is absolute, or is "." -- all of which could leave projects_dir.
        return Path(name).name == name

    def _is_valid_url_or_path(self, path: str) -> bool:
        """Return True when ``path`` is a URL or an existing local path."""
        if self._is_url(path):
            return True

        # Not a URL: treat it as a local path.
        try:
            return Path(path).exists()
        except (OSError, TypeError, ValueError):
            return False

    def _download_file(self, url: str, local_path: str) -> bool:
        """Download ``url`` to ``local_path``.

        The body is streamed into ``<local_path>.part`` and only renamed to
        ``local_path`` once complete, so a failed download never leaves a
        truncated file that would later be mistaken for a cached ZIP.

        Returns:
            True on success, False on any failure (the error is logged).
        """
        part_path = f"{local_path}.part"
        try:
            # Context manager releases the streamed connection in all cases.
            with requests.get(url, stream=True, timeout=30) as response:
                response.raise_for_status()

                with open(part_path, 'wb') as f:
                    for chunk in response.iter_content(chunk_size=8192):
                        if chunk:
                            f.write(chunk)

            os.replace(part_path, local_path)
            return True
        except Exception as e:
            logger.error(f"Failed to download file: {e}")
            self._remove_quietly(part_path)
            return False

    def _copy_local_file(self, local_path: str, target_path: str) -> bool:
        """Copy the file ``local_path`` to the file path ``target_path``.

        Copies via ``<target_path>.part`` and renames on completion so an
        interrupted copy does not leave a partial file at ``target_path``.

        Returns:
            True on success, False on any failure (the error is logged).
        """
        part_path = f"{target_path}.part"
        try:
            import shutil
            shutil.copy2(local_path, part_path)
            os.replace(part_path, target_path)
            return True
        except Exception as e:
            logger.error(f"Failed to copy local file: {e}")
            self._remove_quietly(part_path)
            return False

    def _extract_zip(self, zip_path: str, extract_to: str) -> bool:
        """Extract the ZIP file ``zip_path`` into the directory ``extract_to``.

        Returns:
            True on success, False on any failure (the error is logged).
        """
        try:
            with zipfile.ZipFile(zip_path, 'r') as zip_ref:
                zip_ref.extractall(extract_to)
            return True
        except Exception as e:
            logger.error(f"Failed to extract ZIP file: {e}")
            return False

    def get_project_from_zip(self, zip_url: str, unique_id: Optional[str] = None) -> Optional[str]:
        """
        Get project data from a ZIP URL or local path

        Without ``unique_id`` the project directory is named after the hash of
        ``zip_url`` and an existing directory is returned as-is. With
        ``unique_id`` the archive is always (re-)extracted into
        ``projects_dir/<unique_id>``.

        Args:
            zip_url: URL of the ZIP file or local relative path
            unique_id: Optional unique identifier used as the folder name. Must be
                a single path component and must not be the cache directory name.

        Returns:
            Optional[str]: Returns the project directory path on success, or None on failure
        """
        if not self._is_valid_url_or_path(zip_url):
            logger.error(f"Invalid URL or path: {zip_url}")
            return None

        url_hash = self._get_url_hash(zip_url)

        if unique_id:
            # unique_id becomes a directory name and a filename prefix; refuse
            # values that would escape projects_dir or clobber the cache.
            if not self._is_safe_dir_name(unique_id):
                logger.error(f"Invalid unique_id: {unique_id!r}")
                return None
            project_dir = self.projects_dir / unique_id
            # Prefix the ZIP filename with unique_id to avoid conflicts.
            zip_filename = f"{unique_id}_{url_hash}.zip"
        else:
            project_dir = self.projects_dir / url_hash
            if project_dir.exists():
                logger.info(f"Using cached project directory: {project_dir}")
                return str(project_dir)
            zip_filename = f"{url_hash}.zip"

        zip_path = self.cache_dir / zip_filename

        # Download or copy the ZIP file unless it is already cached.
        if zip_path.exists():
            logger.info(f"Using cached ZIP file: {zip_path}")
        elif self._is_url(zip_url):
            logger.info(f"Downloading ZIP file: {zip_url}")
            if not self._download_file(zip_url, str(zip_path)):
                return None
        else:
            logger.info(f"Copying local ZIP file: {zip_url}")
            # Resolve the relative path
            local_path = Path(zip_url).resolve()
            if not self._copy_local_file(str(local_path), str(zip_path)):
                return None

        # Extract into the project directory
        existed_before = project_dir.exists()
        logger.info(f"Extracting ZIP file to: {project_dir}")
        if not self._extract_zip(str(zip_path), str(project_dir)):
            # Drop the cached ZIP so a corrupt archive is fetched again next
            # time, and remove a directory this call created so a half
            # extracted project is not returned later as a cache hit.
            self._remove_quietly(zip_path)
            if not existed_before:
                import shutil
                shutil.rmtree(project_dir, ignore_errors=True)
            return None

        logger.info(f"Project is ready: {project_dir}")
        return str(project_dir)

    def collect_document_files(self, project_dir: str) -> List[str]:
        """
        Collect all document.txt files under the project directory

        Args:
            project_dir: Project directory path

        Returns:
            List[str]: Full path list of all document.txt files (empty if the
            directory does not exist)
        """
        project_path = Path(project_dir)

        if not project_path.exists():
            logger.error(f"Project directory does not exist: {project_dir}")
            return []

        # Recursively search for all document.txt files
        document_files = [
            str(file_path)
            for file_path in project_path.rglob("document.txt")
            if file_path.is_file()
        ]

        logger.info(f"Found {len(document_files)} document.txt files in project directory {project_dir}")
        for file_path in document_files[:5]:  # Only print the first 5 file paths as examples
            logger.info(f"  - {file_path}")
        if len(document_files) > 5:
            logger.info(f"  ... and {len(document_files) - 5} more files")

        return document_files

    def cleanup_cache(self):
        """Delete every cached ZIP file and leave an empty cache directory behind."""
        try:
            import shutil
            if self.cache_dir.exists():
                shutil.rmtree(self.cache_dir)
            # Recreate unconditionally so later downloads have somewhere to go.
            self.cache_dir.mkdir(parents=True, exist_ok=True)
            logger.info("Cache cleanup completed")
        except Exception as e:
            logger.error(f"Failed to clean up cache: {e}")


# Shared module-level handler. Constructing it creates the "./projects"
# directory and its "_cache" subdirectory as soon as this module is imported.
zip_handler = ZipProjectHandler(projects_dir="./projects")