Convert all Chinese comments, docstrings, logger/print output, HTTPException detail messages, and API response messages to English across the entire codebase. Functional zh/ja localized strings (e.g. prompt templates, timezone display names, date formats) are preserved as-is. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
195 lines
7.0 KiB
Python
195 lines
7.0 KiB
Python
#!/usr/bin/env python3
"""
ZIP project handler.

Downloads ZIP files from URLs (or copies them from local paths) and extracts them into project directories.
"""

import hashlib
import logging
import os
import shutil
import tempfile
import zipfile
from pathlib import Path
from typing import List, Optional
from urllib.parse import urlparse

import requests

# Configure logging
logger = logging.getLogger('app')


class ZipProjectHandler:
    """Fetch ZIP archives and unpack them into per-project directories.

    Directory layout under ``projects_dir``:

    * ``_cache/`` stores the fetched ZIP files, named after the hash of their
      source (prefixed with ``unique_id`` when one is given).
    * Every other directory is an extracted project, named after the caller's
      ``unique_id`` or, without one, after the hash of the source URL/path.
    """

    def __init__(self, projects_dir: str = "./projects"):
        """Create the projects directory and its ``_cache`` sub-directory.

        Args:
            projects_dir: Root directory for extracted projects; resolved to an
                absolute path. Missing parent directories are created as well.
        """
        self.projects_dir = Path(projects_dir).resolve()
        # parents=True so a nested, not-yet-existing location works too.
        self.projects_dir.mkdir(parents=True, exist_ok=True)
        self.cache_dir = self.projects_dir / "_cache"
        self.cache_dir.mkdir(exist_ok=True)

    def _get_url_hash(self, url: str) -> str:
        """Return the first 16 hex digits of the MD5 of *url* (a cache key, not a security hash)."""
        return hashlib.md5(url.encode('utf-8')).hexdigest()[:16]

    @staticmethod
    def _is_remote_url(location: str) -> bool:
        """Return True when *location* parses as a URL with both a scheme and a network location."""
        try:
            parsed = urlparse(location)
        except (ValueError, TypeError, AttributeError):
            # urlparse rejects malformed input (e.g. a broken IPv6 literal) and non-strings.
            return False
        return bool(parsed.scheme and parsed.netloc)

    @staticmethod
    def _is_safe_dir_name(name: str) -> bool:
        """Return True when *name* is a single path component that cannot escape its parent directory."""
        return name not in ("", ".", "..") and Path(name).name == name

    @staticmethod
    def _discard_file(path: Path) -> None:
        """Best-effort removal of *path*; a missing or unremovable file is ignored."""
        try:
            path.unlink()
        except OSError:
            # Cleanup only: there is nothing useful to do if the file is already gone.
            pass

    def _is_valid_url_or_path(self, path: str) -> bool:
        """Return True when *path* is a URL (scheme + netloc) or an existing local path."""
        if self._is_remote_url(path):
            return True
        try:
            return Path(path).exists()
        except (OSError, ValueError, TypeError):
            return False

    def _download_file(self, url: str, local_path: str) -> bool:
        """Stream *url* into *local_path*.

        The data is written to ``<local_path>.part`` and renamed only after the
        whole body has been received, so an interrupted download never leaves a
        truncated file at *local_path*.

        Returns:
            bool: True on success, False on any failure (the error is logged).
        """
        partial = Path(f"{local_path}.part")
        try:
            # The context manager releases the connection even when streaming fails midway.
            with requests.get(url, stream=True, timeout=30) as response:
                response.raise_for_status()
                with open(partial, 'wb') as out:
                    for chunk in response.iter_content(chunk_size=8192):
                        if chunk:
                            out.write(chunk)
            partial.replace(local_path)
            return True
        except Exception as e:
            # Boundary of the "return False on failure" contract: network and I/O errors alike.
            logger.error(f"Failed to download file: {e}")
            self._discard_file(partial)
            return False

    def _copy_local_file(self, local_path: str, target_path: str) -> bool:
        """Copy *local_path* (with metadata) to *target_path*.

        Returns:
            bool: True on success, False on any failure (the error is logged).
        """
        try:
            shutil.copy2(local_path, target_path)
            return True
        except Exception as e:
            logger.error(f"Failed to copy local file: {e}")
            return False

    def _extract_zip(self, zip_path: str, extract_to: str) -> bool:
        """Extract every member of *zip_path* into *extract_to*.

        Returns:
            bool: True on success, False on any failure (the error is logged).
                On failure, members extracted before the error remain on disk.
        """
        try:
            with zipfile.ZipFile(zip_path, 'r') as archive:
                archive.extractall(extract_to)
            return True
        except Exception as e:
            # Corrupt archives, unsupported compression and I/O errors all end up here.
            logger.error(f"Failed to extract ZIP file: {e}")
            return False

    def _fetch_zip(self, zip_url: str, zip_path: Path) -> bool:
        """Download or copy *zip_url* to the not-yet-existing cache file *zip_path*."""
        if self._is_remote_url(zip_url):
            logger.info(f"Downloading ZIP file: {zip_url}")
            fetched = self._download_file(zip_url, str(zip_path))
        else:
            logger.info(f"Copying local ZIP file: {zip_url}")
            # Resolve the relative path
            fetched = self._copy_local_file(str(Path(zip_url).resolve()), str(zip_path))
        if not fetched:
            # Never leave a partial archive behind: it would later be reused as "cached".
            self._discard_file(zip_path)
        return fetched

    def get_project_from_zip(self, zip_url: str, unique_id: Optional[str] = None) -> Optional[str]:
        """
        Get project data from a ZIP URL or local path

        Args:
            zip_url: URL of the ZIP file or local relative path
            unique_id: Optional unique identifier used as the folder name. It must be
                a single path component (no separators, not "." or ".."); anything
                else is rejected so the project cannot land outside the projects directory.

        Returns:
            Optional[str]: Returns the project directory path on success, or None on failure
        """
        if not self._is_valid_url_or_path(zip_url):
            logger.error(f"Invalid URL or path: {zip_url}")
            return None

        url_hash = self._get_url_hash(zip_url)

        if unique_id:
            if not self._is_safe_dir_name(unique_id):
                logger.error(f"Invalid unique_id: {unique_id}")
                return None
            # With a unique_id the directory cache is skipped: always re-extract
            # so the project structure is guaranteed to be correct.
            project_dir = self.projects_dir / unique_id
            # Prefix the ZIP filename with unique_id to avoid conflicts.
            zip_filename = f"{unique_id}_{url_hash}.zip"
        else:
            project_dir = self.projects_dir / url_hash
            if project_dir.exists():
                logger.info(f"Using cached project directory: {project_dir}")
                return str(project_dir)
            zip_filename = f"{url_hash}.zip"

        # Download or copy the ZIP file unless it is already cached.
        zip_path = self.cache_dir / zip_filename
        if zip_path.exists():
            logger.info(f"Using cached ZIP file: {zip_path}")
        elif not self._fetch_zip(zip_url, zip_path):
            return None

        # Extract into the project directory
        existed_before = project_dir.exists()
        logger.info(f"Extracting ZIP file to: {project_dir}")
        if not self._extract_zip(str(zip_path), str(project_dir)):
            if not existed_before:
                # A half-extracted directory would be served as a valid cached
                # project on the next call, so remove what this call created.
                shutil.rmtree(project_dir, ignore_errors=True)
            return None

        logger.info(f"Project is ready: {project_dir}")
        return str(project_dir)

    def collect_document_files(self, project_dir: str) -> List[str]:
        """
        Collect all document.txt files under the project directory

        Args:
            project_dir: Project directory path

        Returns:
            List[str]: Full path list of all document.txt files (empty when the
                directory does not exist)
        """
        project_path = Path(project_dir)
        if not project_path.exists():
            logger.error(f"Project directory does not exist: {project_dir}")
            return []

        # Recursively search for all document.txt files
        document_files = [
            str(candidate)
            for candidate in project_path.rglob("document.txt")
            if candidate.is_file()
        ]

        logger.info(f"Found {len(document_files)} document.txt files in project directory {project_dir}")
        for file_path in document_files[:5]:  # Only print the first 5 file paths as examples
            logger.info(f" - {file_path}")
        if len(document_files) > 5:
            logger.info(f" ... and {len(document_files) - 5} more files")

        return document_files

    def cleanup_cache(self):
        """Delete every cached ZIP file and leave an empty cache directory behind."""
        try:
            if self.cache_dir.exists():
                shutil.rmtree(self.cache_dir)
            # Recreate the directory so later fetches have somewhere to write.
            self.cache_dir.mkdir(parents=True, exist_ok=True)
            logger.info("Cache cleanup completed")
        except Exception as e:
            logger.error(f"Failed to clean up cache: {e}")
# Global ZIP project handler instance.
# NOTE: created at import time with the default "./projects" root, so importing
# this module creates that directory (and its "_cache" sub-directory) relative
# to the current working directory.
zip_handler = ZipProjectHandler()