remove process file

朱潮 2025-10-19 22:12:27 +08:00
parent 1f81bef8c6
commit 213ed20502
4 changed files with 3 additions and 106 deletions

View File

@@ -278,7 +278,6 @@ curl -X POST "http://localhost:8001/api/v1/tasks/cleanup?older_than_days=7"
 - `POST /api/v1/chat/completions` - OpenAI-compatible chat completions endpoint
 
 ### File Processing Endpoints
-- `POST /api/v1/files/process` - synchronous file processing
 - `POST /api/v1/files/process/async` - asynchronous file processing
 - `GET /api/v1/files/{unique_id}/status` - file processing status
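
With the synchronous route removed, callers are left with the async endpoint plus status polling. A minimal client sketch in Python, assuming the async endpoint accepts a payload like the one the deleted sync endpoint took (unique_id plus a key-grouped files dict; the actual QueueTaskRequest schema may differ) and assuming "completed"/"failed" as terminal status values:

import time
import requests

BASE = "http://localhost:8001"  # host/port as used in the README's curl examples
HEADERS = {"Authorization": "Bearer <API_KEY>"}

# Submit an async processing task (payload shape is an assumption, see note above).
resp = requests.post(
    f"{BASE}/api/v1/files/process/async",
    json={"unique_id": "demo-project", "files": {"docs": ["https://example.com/a.txt"]}},
    headers=HEADERS,
)
resp.raise_for_status()

# Poll the status endpoint until the task settles (status values assumed).
while True:
    status = requests.get(f"{BASE}/api/v1/files/demo-project/status", headers=HEADERS).json()
    if status.get("status") in ("completed", "failed"):
        break
    time.sleep(2)
print(status)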

View File

@@ -16,8 +16,7 @@ from pydantic import BaseModel, Field
 # Import utility modules
 from utils import (
     # Models
-    Message, DatasetRequest, ChatRequest, FileProcessRequest,
-    FileProcessResponse, ChatResponse, QueueTaskRequest, QueueTaskResponse,
+    Message, DatasetRequest, ChatRequest, ChatResponse, QueueTaskRequest, QueueTaskResponse,
     QueueStatusResponse, TaskStatusResponse,
     # File utilities
@@ -178,103 +177,6 @@ async def generate_stream_response(agent, messages, request) -> AsyncGenerator[s
 # Models are now imported from utils module
-@app.post("/api/v1/files/process")
-async def process_files(request: FileProcessRequest, authorization: Optional[str] = Header(None)):
-    """
-    Process dataset files for a given unique_id.
-    Files are organized by key groups, and each group is combined into a single document.txt file.
-    Supports zip files, which will be extracted and their txt/md contents combined.
-
-    Args:
-        request: FileProcessRequest containing unique_id, files (key-grouped dict), system_prompt, and mcp_settings
-        authorization: Authorization header containing API key (Bearer <API_KEY>)
-
-    Returns:
-        FileProcessResponse: Processing result with file list
-    """
-    try:
-        unique_id = request.unique_id
-        if not unique_id:
-            raise HTTPException(status_code=400, detail="unique_id is required")
-
-        # Process files using the key-grouped format
-        processed_files_by_key = {}
-        if request.files:
-            # Use the key-grouped files from the request
-            processed_files_by_key = await download_dataset_files(unique_id, request.files)
-            total_files = sum(len(files) for files in processed_files_by_key.values())
-            print(f"Processed {total_files} dataset files across {len(processed_files_by_key)} keys for unique_id: {unique_id}")
-        else:
-            print(f"No files provided in request for unique_id: {unique_id}")
-
-        # Resolve the project directory from unique_id
-        project_dir = os.path.join("projects", unique_id)
-        if not os.path.exists(project_dir):
-            raise HTTPException(status_code=400, detail=f"Project directory not found for unique_id: {unique_id}")
-
-        # Collect all document.txt files under the project directory
-        document_files = []
-        for root, dirs, files in os.walk(project_dir):
-            for file in files:
-                if file == "document.txt":
-                    document_files.append(os.path.join(root, file))
-
-        # Merge all processed files, including the newly key-grouped ones
-        all_files = document_files.copy()
-        for key, files in processed_files_by_key.items():
-            all_files.extend(files)
-        if not all_files:
-            print(f"Warning: no document.txt files found in project directory {project_dir}")
-
-        # Save system_prompt and mcp_settings to the project directory, if provided
-        if request.system_prompt:
-            system_prompt_file = os.path.join(project_dir, "system_prompt.md")
-            with open(system_prompt_file, 'w', encoding='utf-8') as f:
-                f.write(request.system_prompt)
-            print(f"Saved system_prompt for unique_id: {unique_id}")
-        if request.mcp_settings:
-            mcp_settings_file = os.path.join(project_dir, "mcp_settings.json")
-            with open(mcp_settings_file, 'w', encoding='utf-8') as f:
-                json.dump(request.mcp_settings, f, ensure_ascii=False, indent=2)
-            print(f"Saved mcp_settings for unique_id: {unique_id}")
-
-        # Generate the project's README.md
-        try:
-            save_project_readme(unique_id)
-            print(f"Generated README.md for unique_id: {unique_id}")
-        except Exception as e:
-            print(f"Failed to generate README.md for unique_id: {unique_id}, error: {str(e)}")
-            # Do not interrupt the main processing flow; continue
-
-        # Build the response, including the key-grouped file info
-        result_files = []
-        for key in processed_files_by_key.keys():
-            # Add the corresponding dataset document.txt path
-            document_path = os.path.join("projects", unique_id, "dataset", key, "document.txt")
-            if os.path.exists(document_path):
-                result_files.append(document_path)
-
-        # Also add existing document.txt files not already covered by processed_files_by_key
-        existing_document_paths = set(result_files)  # avoid duplicates
-        for doc_file in document_files:
-            if doc_file not in existing_document_paths:
-                result_files.append(doc_file)
-
-        return FileProcessResponse(
-            success=True,
-            message=f"Successfully processed {len(result_files)} document files across {len(processed_files_by_key)} keys",
-            unique_id=unique_id,
-            processed_files=result_files
-        )
-    except HTTPException:
-        raise
-    except Exception as e:
-        print(f"Error processing files: {str(e)}")
-        raise HTTPException(status_code=500, detail=f"Internal server error: {str(e)}")
 @app.post("/api/v1/files/process/async")
 async def process_files_async_endpoint(request: QueueTaskRequest, authorization: Optional[str] = Header(None)):
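
For reference, here is the request shape the deleted endpoint accepted and the on-disk layout its body produced, as a Python sketch (field names and paths are taken from the removed code above; the example values and the exact format of the file references are illustrative assumptions):

# Illustrative payload for the removed POST /api/v1/files/process endpoint.
example_request = {
    "unique_id": "demo-project",                      # required
    "files": {                                        # key-grouped file references, fetched by download_dataset_files
        "docs": ["report.txt", "notes.md"],
        "data": ["archive.zip"],
    },
    "system_prompt": "You are a helpful assistant.",  # optional, written to system_prompt.md
    "mcp_settings": {"servers": {}},                  # optional, written to mcp_settings.json
}

# Layout the handler built under projects/<unique_id>/:
#   projects/demo-project/system_prompt.md
#   projects/demo-project/mcp_settings.json
#   projects/demo-project/dataset/<key>/document.txt   (one combined file per key)
#   projects/demo-project/README.md                    (generated by save_project_readme)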

View File

@@ -303,7 +303,7 @@ def _process_single_file(
     file_ext = os.path.splitext(original_filename)[1].lower()
     # Handle the file differently depending on its type
-    supported_extensions = ['.txt', '.md', '.pdf', '.doc', '.docx', '.zip']
+    supported_extensions = ['.txt', '.md', '.csv', '.xlsx', '.zip']
     if file_ext not in supported_extensions:
         return {
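
The only change in this file is the supported-extension list. A minimal standalone sketch of that gate, using the new list from the hunk above (the truncated rejection payload is not reproduced):

import os

SUPPORTED_EXTENSIONS = {'.txt', '.md', '.csv', '.xlsx', '.zip'}

def is_supported(original_filename: str) -> bool:
    # Mirrors the check in _process_single_file: compare the lowercased extension.
    return os.path.splitext(original_filename)[1].lower() in SUPPORTED_EXTENSIONS

print(is_supported("data.csv"))    # True after this commit
print(is_supported("report.pdf"))  # False: .pdf/.doc/.docx were dropped in favor of .csv/.xlsx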

View File

@@ -56,11 +56,9 @@ from .api_models import (
     Message,
     DatasetRequest,
     ChatRequest,
-    FileProcessRequest,
     DatasetResponse,
     ChatCompletionResponse,
     ChatResponse,
-    FileProcessResponse,
     ErrorResponse,
     HealthCheckResponse,
     SystemStatusResponse,
@@ -131,11 +129,9 @@ __all__ = [
     'Message',
     'DatasetRequest',
     'ChatRequest',
-    'FileProcessRequest',
     'DatasetResponse',
     'ChatCompletionResponse',
     'ChatResponse',
-    'FileProcessResponse',
     'ErrorResponse',
     'HealthCheckResponse',
     'SystemStatusResponse',
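
Net effect for importers of the utils package after this commit: the FileProcess* models are gone from both the re-exports and __all__, so, for example:

# These imports keep working (names taken from the surviving export list):
from utils import Message, DatasetRequest, ChatRequest, ChatResponse, QueueTaskRequest

# These now raise ImportError, since the models were removed along with the endpoint:
# from utils import FileProcessRequest, FileProcessResponse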