#!/usr/bin/env python3
"""
File processing tasks for the queue system.
"""

import json
import logging
import os
import shutil
import time
from pathlib import Path
from typing import Any, Dict, List, Optional

from huey import crontab

from .config import huey
from utils.file_utils import (
    extract_zip_file,
    get_file_hash,
    load_processed_files_log,
    save_processed_files_log,
    get_document_preview
)

# Configure logging
logger = logging.getLogger('app')

# File extensions accepted by _process_single_file.
_SUPPORTED_EXTENSIONS = ['.txt', '.md', '.csv', '.xlsx', '.zip']

# Extensions for which a text preview is generated after copying.
_PREVIEW_EXTENSIONS = ['.txt', '.md']


def _queued_task_info(handle: Any, **context: Any) -> Any:
    """
    Describe an enqueued task as plain, serializable data.

    Calling a huey task enqueues it and returns a result handle rather than
    the task's return value. The handle references the huey instance, so it
    should not itself be returned as a task result.

    Args:
        handle: Value returned by calling a task function
        context: Extra key/value pairs to include in the description

    Returns:
        ``handle`` unchanged when it is already a dict or list, otherwise a
        dictionary with ``status``, ``task_id`` and the given context
    """
    if isinstance(handle, (dict, list)):
        # The callee ran synchronously and already produced its final result.
        return handle

    # ``handle`` may be None when result storage is disabled; task_id is then None.
    info = {"status": "queued", "task_id": getattr(handle, "id", None)}
    info.update(context)
    return info


@huey.task()
def process_file_async(
    project_id: str,
    file_path: str,
    original_filename: Optional[str] = None,
    target_directory: str = "files"
) -> Dict[str, Any]:
    """
    Asynchronously process a single file.

    The file is identified by its hash; a file whose hash is already in the
    project's processed-files log is skipped. A successful run is recorded in
    that log.

    Args:
        project_id: Project ID
        file_path: File path
        original_filename: Original filename; defaults to the basename of
            ``file_path``
        target_directory: Target directory inside the project directory

    Returns:
        Processing result dictionary whose ``status`` is ``"success"``,
        ``"skipped"`` or ``"error"``
    """
    try:
        logger.info(f"Starting file processing: {file_path}")

        display_name = original_filename or os.path.basename(file_path)

        # Ensure project directory exists
        project_dir = os.path.join("projects", project_id)
        files_dir = os.path.join(project_dir, target_directory)
        os.makedirs(files_dir, exist_ok=True)

        # Get file hash as identifier
        file_hash = get_file_hash(file_path)

        # Check if file has already been processed
        processed_log = load_processed_files_log(project_id)
        if file_hash in processed_log:
            logger.info(f"File already processed, skipping: {file_path}")
            return {
                "status": "skipped",
                "message": "File already processed",
                "file_hash": file_hash,
                "project_id": project_id
            }

        # Process the file
        result = _process_single_file(file_path, files_dir, display_name)

        # Update processing log (only successful runs are recorded)
        if result["status"] == "success":
            processed_log[file_hash] = {
                "original_path": file_path,
                "original_filename": display_name,
                "processed_at": str(time.time()),
                "status": "processed",
                "result": result
            }
            save_processed_files_log(project_id, processed_log)

        result["file_hash"] = file_hash
        result["project_id"] = project_id

        logger.info(f"File processing complete: {file_path}, status: {result['status']}")
        return result

    except Exception as e:
        error_msg = f"Error processing file: {str(e)}"
        # logger.exception keeps the traceback, which the message alone loses.
        logger.exception(error_msg)
        return {
            "status": "error",
            "message": error_msg,
            "file_path": file_path,
            "project_id": project_id
        }


@huey.task()
def process_multiple_files_async(
    project_id: str,
    file_paths: List[str],
    original_filenames: Optional[List[str]] = None
) -> List[Dict[str, Any]]:
    """
    Asynchronously process multiple files in batch.

    One ``process_file_async`` task is submitted per file. The files are not
    processed inside this task, so the returned entries describe the
    submitted tasks rather than their final outcome.

    Args:
        project_id: Project ID
        file_paths: List of file paths
        original_filenames: List of original filenames, matched to
            ``file_paths`` by position; missing entries fall back to the
            default of ``process_file_async``

    Returns:
        List with one dictionary per submitted file (``status``, ``task_id``,
        ``file_path``, ``project_id``), or a single error dictionary
    """
    try:
        logger.info(f"Starting batch processing of {len(file_paths)} files")

        results = []
        for i, file_path in enumerate(file_paths):
            if original_filenames and i < len(original_filenames):
                original_filename = original_filenames[i]
            else:
                original_filename = None

            # Create async task for each file
            handle = process_file_async(project_id, file_path, original_filename)
            results.append(
                _queued_task_info(handle, file_path=file_path, project_id=project_id)
            )

        logger.info(f"Batch file processing tasks submitted, total {len(results)} files")
        return results

    except Exception as e:
        error_msg = f"Error during batch file processing: {str(e)}"
        logger.exception(error_msg)
        return [{
            "status": "error",
            "message": error_msg,
            "project_id": project_id
        }]


@huey.task()
def process_zip_file_async(
    project_id: str,
    zip_path: str,
    extract_to: Optional[str] = None
) -> Dict[str, Any]:
    """
    Asynchronously process a zip archive file.

    The archive is extracted and the extracted files are handed to
    ``process_multiple_files_async``.

    Args:
        project_id: Project ID
        zip_path: Zip file path
        extract_to: Extraction target directory; defaults to
            ``projects/<project_id>/extracted/<zip file name>``

    Returns:
        Processing result dictionary; on success ``batch_task_result``
        describes the submitted batch task
    """
    try:
        logger.info(f"Starting zip file processing: {zip_path}")

        # Set extraction directory
        if extract_to is None:
            extract_to = os.path.join(
                "projects", project_id, "extracted", os.path.basename(zip_path)
            )

        os.makedirs(extract_to, exist_ok=True)

        # Extract files
        extracted_files = extract_zip_file(zip_path, extract_to)

        if not extracted_files:
            return {
                "status": "error",
                "message": "Extraction failed or no supported files found",
                "zip_path": zip_path,
                "project_id": project_id
            }

        # Batch process extracted files
        batch_handle = process_multiple_files_async(project_id, extracted_files)

        return {
            "status": "success",
            "message": f"Zip file processing complete, extracted {len(extracted_files)} files",
            "zip_path": zip_path,
            "extract_to": extract_to,
            "extracted_files": extracted_files,
            "project_id": project_id,
            "batch_task_result": _queued_task_info(batch_handle, project_id=project_id)
        }

    except Exception as e:
        error_msg = f"Error processing zip file: {str(e)}"
        logger.exception(error_msg)
        return {
            "status": "error",
            "message": error_msg,
            "zip_path": zip_path,
            "project_id": project_id
        }


@huey.task()
def cleanup_processed_files(
    project_id: str,
    older_than_days: int = 30
) -> Dict[str, Any]:
    """
    Clean up old processed files.

    Every file under ``projects/<project_id>`` whose modification time is
    older than the cutoff is deleted, then directories left empty are
    removed. A file or directory that cannot be handled is logged and
    skipped; it does not abort the run.

    Args:
        project_id: Project ID
        older_than_days: Clean files older than this many days; must not be
            negative

    Returns:
        Cleanup result dictionary
    """
    try:
        logger.info(f"Starting cleanup of files older than {older_than_days} days in project {project_id}")

        # A negative age would put the cutoff in the future and delete everything.
        if older_than_days < 0:
            return {
                "status": "error",
                "message": "older_than_days must not be negative",
                "project_id": project_id
            }

        project_dir = os.path.join("projects", project_id)
        if not os.path.exists(project_dir):
            return {
                "status": "error",
                "message": "Project directory does not exist",
                "project_id": project_id
            }

        current_time = time.time()
        cutoff_time = current_time - (older_than_days * 24 * 3600)
        cleaned_files = []

        # Walk through project directory
        for root, _dirs, files in os.walk(project_dir):
            for file_name in files:
                file_path = os.path.join(root, file_name)
                try:
                    # getmtime is inside the try: a file can vanish (or be a
                    # broken symlink) between the directory scan and this call.
                    if os.path.getmtime(file_path) >= cutoff_time:
                        continue
                    os.remove(file_path)
                except OSError as e:
                    logger.error(f"Failed to delete file {file_path}: {str(e)}")
                    continue
                cleaned_files.append(file_path)
                logger.info(f"Deleted old file: {file_path}")

        # Clean up empty directories (bottom-up so nested empties are removed first)
        for root, dirs, _files in os.walk(project_dir, topdown=False):
            for dir_name in dirs:
                dir_path = os.path.join(root, dir_name)
                try:
                    if not os.listdir(dir_path):
                        os.rmdir(dir_path)
                        logger.info(f"Deleted empty directory: {dir_path}")
                except OSError as e:
                    logger.error(f"Failed to delete directory {dir_path}: {str(e)}")

        return {
            "status": "success",
            "message": f"Cleanup complete, deleted {len(cleaned_files)} files",
            "project_id": project_id,
            "cleaned_files": cleaned_files,
            "older_than_days": older_than_days
        }

    except Exception as e:
        error_msg = f"Error during file cleanup: {str(e)}"
        logger.exception(error_msg)
        return {
            "status": "error",
            "message": error_msg,
            "project_id": project_id
        }


def _process_single_file(
    file_path: str,
    target_dir: str,
    original_filename: str
) -> Dict[str, Any]:
    """
    Internal method for processing a single file.

    Validates the extension, copies the file into ``target_dir`` and, for
    text files, collects a short preview.

    Args:
        file_path: Source file path
        target_dir: Target directory
        original_filename: Original filename; only its final path component
            is used for the stored file

    Returns:
        Processing result dictionary
    """
    try:
        # Check if file exists
        if not os.path.exists(file_path):
            return {
                "status": "error",
                "message": "Source file does not exist",
                "file_path": file_path
            }

        # The name may come from an upload or a zip entry. Keep only the last
        # path component so "../x.txt" or an absolute path cannot escape
        # target_dir. Backslashes are normalized so Windows-style paths are
        # stripped on POSIX hosts as well.
        safe_name = os.path.basename(original_filename.replace("\\", "/"))

        # Get file info
        file_size = os.path.getsize(file_path)
        file_ext = os.path.splitext(safe_name)[1].lower()

        # Different processing based on file type
        if file_ext not in _SUPPORTED_EXTENSIONS:
            return {
                "status": "error",
                "message": f"Unsupported file type: {file_ext}",
                "file_path": file_path,
                "supported_extensions": list(_SUPPORTED_EXTENSIONS)
            }

        # Copy file to target directory
        target_file_path = os.path.join(target_dir, safe_name)

        # If target file already exists, add timestamp
        if os.path.exists(target_file_path):
            name, ext = os.path.splitext(safe_name)
            timestamp = int(time.time())
            target_file_path = os.path.join(target_dir, f"{name}_{timestamp}{ext}")

            # Two collisions within the same second would otherwise overwrite
            # each other; add a counter until the name is free.
            counter = 1
            while os.path.exists(target_file_path):
                target_file_path = os.path.join(
                    target_dir, f"{name}_{timestamp}_{counter}{ext}"
                )
                counter += 1

        shutil.copy2(file_path, target_file_path)

        # Get file preview (if it's a text file)
        preview = None
        if file_ext in _PREVIEW_EXTENSIONS:
            preview = get_document_preview(target_file_path, max_lines=5)

        return {
            "status": "success",
            "message": "File processed successfully",
            "original_path": file_path,
            "target_path": target_file_path,
            "file_size": file_size,
            "file_extension": file_ext,
            "preview": preview
        }

    except Exception as e:
        return {
            "status": "error",
            "message": f"Error processing file: {str(e)}",
            "file_path": file_path
        }


# Periodic task example: scheduled daily at 2 AM
@huey.periodic_task(crontab(hour=2, minute=0))
def daily_cleanup():
    """
    Daily cleanup task.

    Placeholder: it only logs and returns a status dictionary. No files are
    deleted until cleanup logic is added here.
    """
    logger.info("Running daily cleanup task")
    # Add cleanup logic here
    return {"status": "completed", "message": "Daily cleanup task completed"}