#!/usr/bin/env python3
"""
File processing tasks for the queue system.
"""

import os
import json
import time
import shutil
import logging
from pathlib import Path
from typing import Dict, List, Optional, Any

from huey import crontab

# Configure logging
logger = logging.getLogger('app')

from .config import huey
from utils.file_utils import (
    extract_zip_file,
    get_file_hash,
    is_file_already_processed,
    load_processed_files_log,
    save_processed_files_log,
    get_document_preview
)
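
# These tasks run inside a huey consumer process. With huey 2.x the consumer
# is started via the bundled CLI, pointed at the Huey instance imported above;
# the exact module path depends on this package's layout and is assumed here:
#
#   huey_consumer.py app.tasks.config.huey -w 4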


@huey.task()
def process_file_async(
    project_id: str,
    file_path: str,
    original_filename: Optional[str] = None,
    target_directory: str = "files"
) -> Dict[str, Any]:
    """
    Process a single file asynchronously.

    Args:
        project_id: Project ID
        file_path: Path to the file
        original_filename: Original filename
        target_directory: Target directory

    Returns:
        Result dictionary
    """
    try:
        logger.info(f"Started processing file: {file_path}")

        # Make sure the project directory exists
        project_dir = os.path.join("projects", project_id)
        files_dir = os.path.join(project_dir, target_directory)
        os.makedirs(files_dir, exist_ok=True)

        # Use the file hash as its identifier
        file_hash = get_file_hash(file_path)

        # Skip files that have already been processed
        processed_log = load_processed_files_log(project_id)
        if file_hash in processed_log:
            logger.info(f"File already processed, skipping: {file_path}")
            return {
                "status": "skipped",
                "message": "File already processed",
                "file_hash": file_hash,
                "project_id": project_id
            }

        # Process the file
        result = _process_single_file(
            file_path,
            files_dir,
            original_filename or os.path.basename(file_path)
        )

        # Update the processed-files log
        if result["status"] == "success":
            processed_log[file_hash] = {
                "original_path": file_path,
                "original_filename": original_filename or os.path.basename(file_path),
                "processed_at": str(time.time()),
                "status": "processed",
                "result": result
            }
            save_processed_files_log(project_id, processed_log)

        result["file_hash"] = file_hash
        result["project_id"] = project_id

        logger.info(f"Finished processing file: {file_path}, status: {result['status']}")
        return result

    except Exception as e:
        error_msg = f"Error while processing file: {str(e)}"
        logger.error(error_msg)
        return {
            "status": "error",
            "message": error_msg,
            "file_path": file_path,
            "project_id": project_id
        }
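
# Usage sketch (project ID and path are placeholders): calling the task
# enqueues it and immediately returns a huey Result handle; get(blocking=True)
# waits for a worker to finish and returns the dict built above.
#
#   res = process_file_async("demo-project", "/tmp/uploads/report.txt")
#   outcome = res.get(blocking=True)  # e.g. {"status": "success", ...}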


@huey.task()
def process_multiple_files_async(
    project_id: str,
    file_paths: List[str],
    original_filenames: Optional[List[str]] = None
) -> List[Dict[str, Any]]:
    """
    Process multiple files asynchronously in a batch.

    Args:
        project_id: Project ID
        file_paths: List of file paths
        original_filenames: List of original filenames

    Returns:
        List of result dictionaries
    """
    try:
        logger.info(f"Started batch processing of {len(file_paths)} files")

        results = []
        for i, file_path in enumerate(file_paths):
            original_filename = original_filenames[i] if original_filenames and i < len(original_filenames) else None

            # Create an async task for each file; calling a huey task enqueues
            # it, so `result` is a Result handle rather than a finished dict
            result = process_file_async(project_id, file_path, original_filename)
            results.append(result)

        logger.info(f"Batch file-processing tasks submitted, {len(results)} files in total")
        return results

    except Exception as e:
        error_msg = f"Error while batch processing files: {str(e)}"
        logger.error(error_msg)
        return [{
            "status": "error",
            "message": error_msg,
            "project_id": project_id
        }]
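
# Note that `results` above holds huey Result handles, not finished dicts,
# because invoking a task only enqueues it. A caller that wants the final
# outcomes could fan out directly instead of going through this wrapper task
# (placeholder IDs and paths):
#
#   handles = [process_file_async("demo-project", p) for p in ("/tmp/a.txt", "/tmp/b.md")]
#   outcomes = [h.get(blocking=True) for h in handles]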


@huey.task()
def process_zip_file_async(
    project_id: str,
    zip_path: str,
    extract_to: Optional[str] = None
) -> Dict[str, Any]:
    """
    Process a zip archive asynchronously.

    Args:
        project_id: Project ID
        zip_path: Path to the zip file
        extract_to: Directory to extract into

    Returns:
        Result dictionary
    """
    try:
        logger.info(f"Started processing zip file: {zip_path}")

        # Determine the extraction directory
        if extract_to is None:
            extract_to = os.path.join("projects", project_id, "extracted", os.path.basename(zip_path))

        os.makedirs(extract_to, exist_ok=True)

        # Extract the archive
        extracted_files = extract_zip_file(zip_path, extract_to)

        if not extracted_files:
            return {
                "status": "error",
                "message": "Extraction failed or no supported files were found",
                "zip_path": zip_path,
                "project_id": project_id
            }

        # Batch-process the extracted files; this enqueues another task, so
        # `result` is a huey Result handle for the batch
        result = process_multiple_files_async(project_id, extracted_files)

        return {
            "status": "success",
            "message": f"Zip file processed, {len(extracted_files)} files extracted",
            "zip_path": zip_path,
            "extract_to": extract_to,
            "extracted_files": extracted_files,
            "project_id": project_id,
            "batch_task_result": result
        }

    except Exception as e:
        error_msg = f"Error while processing zip file: {str(e)}"
        logger.error(error_msg)
        return {
            "status": "error",
            "message": error_msg,
            "zip_path": zip_path,
            "project_id": project_id
        }


@huey.task()
def cleanup_processed_files(
    project_id: str,
    older_than_days: int = 30
) -> Dict[str, Any]:
    """
    Clean up old processed files.

    Args:
        project_id: Project ID
        older_than_days: Delete files older than this many days

    Returns:
        Cleanup result dictionary
    """
    try:
        logger.info(f"Started cleaning files older than {older_than_days} days in project {project_id}")

        project_dir = os.path.join("projects", project_id)
        if not os.path.exists(project_dir):
            return {
                "status": "error",
                "message": "Project directory does not exist",
                "project_id": project_id
            }

        current_time = time.time()
        cutoff_time = current_time - (older_than_days * 24 * 3600)
        cleaned_files = []

        # Walk the project directory and delete files past the cutoff
        for root, dirs, files in os.walk(project_dir):
            for file in files:
                file_path = os.path.join(root, file)
                file_mtime = os.path.getmtime(file_path)

                if file_mtime < cutoff_time:
                    try:
                        os.remove(file_path)
                        cleaned_files.append(file_path)
                        logger.info(f"Deleted old file: {file_path}")
                    except Exception as e:
                        logger.error(f"Failed to delete file {file_path}: {str(e)}")

        # Remove empty directories (bottom-up, so nested empties go first)
        for root, dirs, files in os.walk(project_dir, topdown=False):
            for dir_name in dirs:
                dir_path = os.path.join(root, dir_name)
                try:
                    if not os.listdir(dir_path):
                        os.rmdir(dir_path)
                        logger.info(f"Deleted empty directory: {dir_path}")
                except Exception as e:
                    logger.error(f"Failed to delete directory {dir_path}: {str(e)}")

        return {
            "status": "success",
            "message": f"Cleanup finished, {len(cleaned_files)} files deleted",
            "project_id": project_id,
            "cleaned_files": cleaned_files,
            "older_than_days": older_than_days
        }

    except Exception as e:
        error_msg = f"Error while cleaning up files: {str(e)}"
        logger.error(error_msg)
        return {
            "status": "error",
            "message": error_msg,
            "project_id": project_id
        }
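
# Besides being enqueued immediately, the cleanup task can be deferred with
# huey's schedule() API (project ID and delay are placeholders):
#
#   cleanup_processed_files.schedule(args=("demo-project",), delay=3600)  # run in ~1 hour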


def _process_single_file(
    file_path: str,
    target_dir: str,
    original_filename: str
) -> Dict[str, Any]:
    """
    Internal helper that processes a single file.

    Args:
        file_path: Source file path
        target_dir: Target directory
        original_filename: Original filename

    Returns:
        Result dictionary
    """
    try:
        # Make sure the source file exists
        if not os.path.exists(file_path):
            return {
                "status": "error",
                "message": "Source file does not exist",
                "file_path": file_path
            }

        # Collect file metadata
        file_size = os.path.getsize(file_path)
        file_ext = os.path.splitext(original_filename)[1].lower()

        # Handle files differently depending on their type
        supported_extensions = ['.txt', '.md', '.csv', '.xlsx', '.zip']

        if file_ext not in supported_extensions:
            return {
                "status": "error",
                "message": f"Unsupported file type: {file_ext}",
                "file_path": file_path,
                "supported_extensions": supported_extensions
            }

        # Copy the file into the target directory
        target_file_path = os.path.join(target_dir, original_filename)

        # If the target file already exists, append a timestamp to the name
        if os.path.exists(target_file_path):
            name, ext = os.path.splitext(original_filename)
            timestamp = int(time.time())
            target_file_path = os.path.join(target_dir, f"{name}_{timestamp}{ext}")

        shutil.copy2(file_path, target_file_path)

        # Generate a preview for text files
        preview = None
        if file_ext in ['.txt', '.md']:
            preview = get_document_preview(target_file_path, max_lines=5)

        return {
            "status": "success",
            "message": "File processed successfully",
            "original_path": file_path,
            "target_path": target_file_path,
            "file_size": file_size,
            "file_extension": file_ext,
            "preview": preview
        }

    except Exception as e:
        return {
            "status": "error",
            "message": f"Error while processing file: {str(e)}",
            "file_path": file_path
        }


# Example periodic task: clean up 30-day-old files at 2:00 AM every day
@huey.periodic_task(crontab(hour=2, minute=0))
def daily_cleanup():
    """Daily cleanup task."""
    logger.info("Running daily cleanup task")
    # Cleanup logic can be added here
    return {"status": "completed", "message": "Daily cleanup task finished"}
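
# A minimal sketch of cleanup logic this stub could grow into, assuming each
# subdirectory of projects/ is a project (kept as a comment since the policy
# is intentionally left open above):
#
#   if os.path.isdir("projects"):
#       for project_id in os.listdir("projects"):
#           cleanup_processed_files(project_id, older_than_days=30)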