#!/usr/bin/env python3
"""
ZIP project handler.

Responsible for downloading ZIP files from URLs (or copying them from local
paths) and extracting them into project directories.
"""

import hashlib
import logging
import os
import shutil
import tempfile
import zipfile
from pathlib import Path
from typing import List, Optional
from urllib.parse import urlparse

import requests

# Configure logging
logger = logging.getLogger('app')


class ZipProjectHandler:
    """Fetch ZIP archives and unpack them into per-project directories.

    Directory layout below ``projects_dir``:

    * ``<projects_dir>/<unique_id or url hash>/`` - extracted project
    * ``<projects_dir>/_cache/`` - downloaded / copied ZIP files
    """

    def __init__(self, projects_dir: str = "./projects"):
        """Create the projects directory and its ``_cache`` subdirectory.

        Args:
            projects_dir: Root directory for extracted projects. It is
                resolved to an absolute path and created (including missing
                parents) when it does not exist.
        """
        self.projects_dir = Path(projects_dir).resolve()
        self.projects_dir.mkdir(parents=True, exist_ok=True)
        self.cache_dir = self.projects_dir / "_cache"
        self.cache_dir.mkdir(exist_ok=True)

    def _get_url_hash(self, url: str) -> str:
        """Return the first 16 hex characters of the MD5 of ``url``.

        The value is only used as a cache key / directory name, not for
        any security purpose.
        """
        return hashlib.md5(url.encode('utf-8')).hexdigest()[:16]

    @staticmethod
    def _is_url(path: str) -> bool:
        """Return True when ``path`` parses with both a scheme and a netloc."""
        try:
            parsed = urlparse(path)
        except (ValueError, TypeError, AttributeError):
            return False
        return bool(parsed.scheme and parsed.netloc)

    def _is_valid_url_or_path(self, path: str) -> bool:
        """Return True if ``path`` looks like a URL or is an existing local path."""
        # First try validating it as a URL
        if self._is_url(path):
            return True

        # Then try validating it as a local path
        try:
            return Path(path).exists()
        except (OSError, ValueError, TypeError):
            return False

    @staticmethod
    def _discard_partial(path: str) -> None:
        """Best-effort removal of a leftover ``.part`` staging file."""
        try:
            os.remove(path)
        except FileNotFoundError:
            pass  # Nothing had been written yet.
        except OSError as e:
            logger.warning(f"Could not remove partial file {path}: {e}")

    def _download_file(self, url: str, local_path: str) -> bool:
        """Download ``url`` to ``local_path``.

        The body is streamed into ``<local_path>.part`` and only moved to
        ``local_path`` once the transfer completed, so an interrupted
        download never leaves a truncated file at the final location.

        Args:
            url: URL to fetch.
            local_path: Destination file path.

        Returns:
            bool: True on success, False on any failure (the error is logged).
        """
        partial_path = f"{local_path}.part"
        try:
            # The context manager releases the streamed connection even when
            # the transfer fails half-way.
            with requests.get(url, stream=True, timeout=30) as response:
                response.raise_for_status()
                with open(partial_path, 'wb') as f:
                    for chunk in response.iter_content(chunk_size=8192):
                        if chunk:
                            f.write(chunk)
            os.replace(partial_path, local_path)
            return True
        except Exception as e:
            logger.error(f"Failed to download file: {e}")
            self._discard_partial(partial_path)
            return False

    def _copy_local_file(self, local_path: str, target_path: str) -> bool:
        """Copy a local file to ``target_path`` (a file path, not a directory).

        The copy is staged in ``<target_path>.part`` and moved into place
        afterwards so a failed copy cannot leave a truncated target behind.

        Returns:
            bool: True on success, False on any failure (the error is logged).
        """
        partial_path = f"{target_path}.part"
        try:
            shutil.copy2(local_path, partial_path)
            os.replace(partial_path, target_path)
            return True
        except Exception as e:
            logger.error(f"Failed to copy local file: {e}")
            self._discard_partial(partial_path)
            return False

    def _extract_zip(self, zip_path: str, extract_to: str) -> bool:
        """Extract a ZIP file to the specified directory.

        Returns:
            bool: True on success, False on any failure (the error is logged).
        """
        try:
            with zipfile.ZipFile(zip_path, 'r') as zip_ref:
                zip_ref.extractall(extract_to)
            return True
        except Exception as e:
            logger.error(f"Failed to extract ZIP file: {e}")
            return False

    def _is_inside_projects_dir(self, candidate: Path) -> bool:
        """Return True when ``candidate`` lies strictly below ``projects_dir``.

        The check is lexical (``..`` segments are collapsed, symlinks are not
        followed); it rejects absolute identifiers and ``..`` escapes.
        """
        normalized = Path(os.path.normpath(candidate))
        return self.projects_dir in normalized.parents

    def _fetch_zip(self, zip_url: str, zip_path: Path) -> bool:
        """Ensure the archive for ``zip_url`` is present at ``zip_path``.

        An existing file at ``zip_path`` is reused; otherwise the archive is
        downloaded (URL) or copied (local path).
        """
        if zip_path.exists():
            logger.info(f"Using cached ZIP file: {zip_path}")
            return True

        if self._is_url(zip_url):
            logger.info(f"Downloading ZIP file: {zip_url}")
            return self._download_file(zip_url, str(zip_path))

        logger.info(f"Copying local ZIP file: {zip_url}")
        # Resolve the relative path
        local_path = Path(zip_url).resolve()
        return self._copy_local_file(str(local_path), str(zip_path))

    def get_project_from_zip(self, zip_url: str, unique_id: Optional[str] = None) -> Optional[str]:
        """
        Get project data from a ZIP URL or local path

        Without ``unique_id`` the project directory is named after the URL
        hash and an existing directory is returned as-is. With ``unique_id``
        the archive is always re-extracted into ``<projects_dir>/<unique_id>``.

        Args:
            zip_url: URL of the ZIP file or local relative path
            unique_id: Optional unique identifier used as the folder name.
                It must name a location inside the projects directory.

        Returns:
            Optional[str]: Returns the project directory path on success, or None on failure
        """
        if not self._is_valid_url_or_path(zip_url):
            logger.error(f"Invalid URL or path: {zip_url}")
            return None

        url_hash = self._get_url_hash(zip_url)

        # Use unique_id as the directory name, or url_hash if not provided
        project_dir_name = unique_id if unique_id else url_hash
        cached_project_dir = self.projects_dir / project_dir_name

        # An absolute unique_id or one containing ".." would otherwise make
        # the archive extract outside of the projects directory.
        if not self._is_inside_projects_dir(cached_project_dir):
            logger.error(f"Project directory escapes {self.projects_dir}: {cached_project_dir}")
            return None

        # When using unique_id, skip the cache and re-extract directly to
        # ensure the project structure is correct
        if not unique_id and cached_project_dir.exists():
            logger.info(f"Using cached project directory: {cached_project_dir}")
            return str(cached_project_dir)

        # When using unique_id, use it as the ZIP filename prefix to avoid conflicts
        zip_filename = f"{unique_id}_{url_hash}.zip" if unique_id else f"{url_hash}.zip"
        zip_path = self.cache_dir / zip_filename

        # Download or copy the ZIP file
        if not self._fetch_zip(zip_url, zip_path):
            return None

        # Extract into the project directory
        existed_before = cached_project_dir.exists()
        logger.info(f"Extracting ZIP file to: {cached_project_dir}")
        if not self._extract_zip(str(zip_path), str(cached_project_dir)):
            # Do not leave a half-extracted directory behind: the next call
            # would mistake it for a valid cached project.
            if not existed_before:
                shutil.rmtree(cached_project_dir, ignore_errors=True)
            return None

        logger.info(f"Project is ready: {cached_project_dir}")
        return str(cached_project_dir)

    def collect_document_files(self, project_dir: str) -> List[str]:
        """
        Collect all document.txt files under the project directory

        Args:
            project_dir: Project directory path

        Returns:
            List[str]: Full path list of all document.txt files (empty when
            the directory does not exist)
        """
        project_path = Path(project_dir)

        if not project_path.exists():
            logger.error(f"Project directory does not exist: {project_dir}")
            return []

        # Recursively search for all document.txt files
        document_files = [
            str(file_path)
            for file_path in project_path.rglob("document.txt")
            if file_path.is_file()
        ]

        logger.info(f"Found {len(document_files)} document.txt files in project directory {project_dir}")
        for file_path in document_files[:5]:  # Only print the first 5 file paths as examples
            logger.info(f" - {file_path}")
        if len(document_files) > 5:
            logger.info(f" ... and {len(document_files) - 5} more files")

        return document_files

    def cleanup_cache(self):
        """Delete the ZIP cache directory and recreate it empty.

        Failures are logged and not raised.
        """
        try:
            if self.cache_dir.exists():
                shutil.rmtree(self.cache_dir)
            self.cache_dir.mkdir(exist_ok=True)
            logger.info("Cache cleanup completed")
        except Exception as e:
            logger.error(f"Failed to clean up cache: {e}")


# Global ZIP project handler instance
zip_handler = ZipProjectHandler()