Merge branch 'onprem'

朱潮 2025-10-28 14:48:45 +08:00
commit b05aee38e9
11 changed files with 93 additions and 54 deletions

View File

@ -2,11 +2,12 @@ import json
import os
import tempfile
import shutil
import uuid
from typing import AsyncGenerator, Dict, List, Optional, Union, Any
from datetime import datetime
import uvicorn
from fastapi import FastAPI, HTTPException, Depends, Header
from fastapi import FastAPI, HTTPException, Depends, Header, UploadFile, File
from fastapi.responses import StreamingResponse, HTMLResponse, FileResponse
from fastapi.staticfiles import StaticFiles
from fastapi.middleware.cors import CORSMiddleware
@ -440,7 +441,7 @@ async def chat_completions(request: ChatRequest, authorization: Optional[str] =
raise HTTPException(status_code=400, detail="unique_id is required")
# Use unique_id to get the project directory
project_dir = os.path.join("projects", unique_id)
project_dir = os.path.join("projects", "data", unique_id)
if not os.path.exists(project_dir):
project_dir = ""
@ -531,6 +532,44 @@ async def chat_completions(request: ChatRequest, authorization: Optional[str] =
raise HTTPException(status_code=500, detail=f"Internal server error: {str(e)}")
@app.post("/api/v1/upload")
async def upload_file(file: UploadFile = File(...)):
"""
文件上传API接口上传文件到 ./projects/uploads 目录
Args:
file: 上传的文件
Returns:
dict: 包含文件路径和文件名的响应
"""
try:
# Ensure the upload directory exists
upload_dir = os.path.join("projects", "uploads")
os.makedirs(upload_dir, exist_ok=True)
# Generate a unique filename
file_extension = os.path.splitext(file.filename)[1] if file.filename else ""
unique_filename = f"{uuid.uuid4()}{file_extension}"
file_path = os.path.join(upload_dir, unique_filename)
# Save the file
with open(file_path, "wb") as buffer:
shutil.copyfileobj(file.file, buffer)
return {
"success": True,
"message": "文件上传成功",
"filename": unique_filename,
"original_filename": file.filename,
"file_path": file_path
}
except Exception as e:
print(f"Error uploading file: {str(e)}")
raise HTTPException(status_code=500, detail=f"文件上传失败: {str(e)}")
@app.get("/api/health")
async def health_check():
"""Health check endpoint"""
@ -617,7 +656,7 @@ async def get_files_processing_status(unique_id: str):
processed_log = load_processed_files_log(unique_id)
# Get project directory info
project_dir = os.path.join("projects", unique_id)
project_dir = os.path.join("projects", "data", unique_id)
project_exists = os.path.exists(project_dir)
# Collect document.txt files
@ -635,7 +674,7 @@ async def get_files_processing_status(unique_id: str):
"processed_files": processed_log,
"document_files_count": len(document_files),
"document_files": document_files,
"log_file_exists": os.path.exists(os.path.join("projects", unique_id, "processed_files.json"))
"log_file_exists": os.path.exists(os.path.join("projects", "data", unique_id, "processed_files.json"))
}
except Exception as e:
raise HTTPException(status_code=500, detail=f"获取文件处理状态失败: {str(e)}")
@ -645,8 +684,8 @@ async def get_files_processing_status(unique_id: str):
async def reset_files_processing(unique_id: str):
"""重置项目的文件处理状态,删除处理日志和所有文件"""
try:
project_dir = os.path.join("projects", unique_id)
log_file = os.path.join("projects", unique_id, "processed_files.json")
project_dir = os.path.join("projects", "data", unique_id)
log_file = os.path.join("projects", "data", unique_id, "processed_files.json")
# Load processed log to know what files to remove
processed_log = load_processed_files_log(unique_id)
@ -668,7 +707,7 @@ async def reset_files_processing(unique_id: str):
elif 'filename' in file_info:
# Fallback to old filename-based structure
filename_without_ext = os.path.splitext(file_info['filename'])[0]
dataset_dir = os.path.join("projects", unique_id, "dataset", filename_without_ext)
dataset_dir = os.path.join("projects", "data", unique_id, "dataset", filename_without_ext)
if remove_file_or_directory(dataset_dir):
removed_files.append(dataset_dir)

View File

@ -20,7 +20,7 @@ def get_allowed_directory():
return os.path.abspath(dataset_dir)
# Read the project data directory from the environment variable
project_dir = os.getenv("PROJECT_DATA_DIR", "./projects")
project_dir = os.getenv("PROJECT_DATA_DIR", "./projects/data")
return os.path.abspath(project_dir)
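
Because the default project data directory now resolves to ./projects/data, an on-prem deployment that keeps data elsewhere can still point the service at another location through PROJECT_DATA_DIR. A minimal sketch of the override behavior; the path value is an illustrative assumption:

# Sketch: PROJECT_DATA_DIR overrides the ./projects/data default (example path only).
import os

os.environ["PROJECT_DATA_DIR"] = "/srv/onprem/data"  # hypothetical deployment path
project_dir = os.getenv("PROJECT_DATA_DIR", "./projects/data")
print(os.path.abspath(project_dir))  # resolves to the absolute form of the override
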

View File

@ -143,7 +143,7 @@ def search_count(patterns: List[Dict[str, Any]], file_paths: List[str],
"content": [
{
"type": "text",
"text": f"Error: Specified files not found in project directory {project_data_dir}"
"text": f"Error: Specified files not found in project directory {get_allowed_directory()}"
}
]
}
@ -328,7 +328,7 @@ def search(patterns: List[Dict[str, Any]], file_paths: List[str],
"content": [
{
"type": "text",
"text": f"Error: Specified files not found in project directory {project_data_dir}"
"text": f"Error: Specified files not found in project directory {get_allowed_directory()}"
}
]
}
@ -565,7 +565,7 @@ def regex_grep(patterns: Union[str, List[str]], file_paths: List[str], context_l
"content": [
{
"type": "text",
"text": f"Error: Specified files not found in project directory {project_data_dir}"
"text": f"Error: Specified files not found in project directory {get_allowed_directory()}"
}
]
}
@ -713,7 +713,7 @@ def regex_grep_count(patterns: Union[str, List[str]], file_paths: List[str],
"content": [
{
"type": "text",
"text": f"Error: Specified files not found in project directory {project_data_dir}"
"text": f"Error: Specified files not found in project directory {get_allowed_directory()}"
}
]
}

View File

@ -47,7 +47,7 @@ def process_files_async(
)
# Ensure the project directory exists
project_dir = os.path.join("projects", unique_id)
project_dir = os.path.join("projects", "data", unique_id)
if not os.path.exists(project_dir):
os.makedirs(project_dir, exist_ok=True)
@ -102,7 +102,7 @@ def process_files_async(
result_files = []
for key in processed_files_by_key.keys():
# Add the corresponding dataset document.txt path
document_path = os.path.join("projects", unique_id, "dataset", key, "document.txt")
document_path = os.path.join("projects", "data", unique_id, "dataset", key, "document.txt")
if os.path.exists(document_path):
result_files.append(document_path)
@ -172,7 +172,7 @@ def cleanup_project_async(
try:
print(f"开始异步清理项目项目ID: {unique_id}")
project_dir = os.path.join("projects", unique_id)
project_dir = os.path.join("projects", "data", unique_id)
removed_items = []
if remove_all and os.path.exists(project_dir):

View File

@ -20,8 +20,8 @@ except ImportError:
def merge_documents_by_group(unique_id: str, group_name: str) -> Dict:
"""Merge all document.txt files in a group into a single document."""
processed_group_dir = os.path.join("projects", unique_id, "processed", group_name)
dataset_group_dir = os.path.join("projects", unique_id, "dataset", group_name)
processed_group_dir = os.path.join("projects", "data", unique_id, "processed", group_name)
dataset_group_dir = os.path.join("projects", "data", unique_id, "dataset", group_name)
os.makedirs(dataset_group_dir, exist_ok=True)
merged_document_path = os.path.join(dataset_group_dir, "document.txt")
@ -91,8 +91,8 @@ def merge_documents_by_group(unique_id: str, group_name: str) -> Dict:
def merge_paginations_by_group(unique_id: str, group_name: str) -> Dict:
"""Merge all pagination.txt files in a group."""
processed_group_dir = os.path.join("projects", unique_id, "processed", group_name)
dataset_group_dir = os.path.join("projects", unique_id, "dataset", group_name)
processed_group_dir = os.path.join("projects", "data", unique_id, "processed", group_name)
dataset_group_dir = os.path.join("projects", "data", unique_id, "dataset", group_name)
os.makedirs(dataset_group_dir, exist_ok=True)
merged_pagination_path = os.path.join(dataset_group_dir, "pagination.txt")
@ -161,8 +161,8 @@ def merge_paginations_by_group(unique_id: str, group_name: str) -> Dict:
def merge_embeddings_by_group(unique_id: str, group_name: str) -> Dict:
"""Merge all embedding.pkl files in a group."""
processed_group_dir = os.path.join("projects", unique_id, "processed", group_name)
dataset_group_dir = os.path.join("projects", unique_id, "dataset", group_name)
processed_group_dir = os.path.join("projects", "data", unique_id, "processed", group_name)
dataset_group_dir = os.path.join("projects", "data", unique_id, "dataset", group_name)
os.makedirs(dataset_group_dir, exist_ok=True)
merged_embedding_path = os.path.join(dataset_group_dir, "embedding.pkl")
@ -296,7 +296,7 @@ def merge_all_data_by_group(unique_id: str, group_name: str) -> Dict:
def get_group_merge_status(unique_id: str, group_name: str) -> Dict:
"""Get the status of merged data for a group."""
dataset_group_dir = os.path.join("projects", unique_id, "dataset", group_name)
dataset_group_dir = os.path.join("projects", "data", unique_id, "dataset", group_name)
status = {
"group_name": group_name,
@ -340,7 +340,7 @@ def get_group_merge_status(unique_id: str, group_name: str) -> Dict:
def cleanup_dataset_group(unique_id: str, group_name: str) -> bool:
"""Clean up merged dataset files for a group."""
dataset_group_dir = os.path.join("projects", unique_id, "dataset", group_name)
dataset_group_dir = os.path.join("projects", "data", unique_id, "dataset", group_name)
try:
if os.path.exists(dataset_group_dir):

View File

@ -63,7 +63,7 @@ async def download_dataset_files(unique_id: str, files: Dict[str, List[str]]) ->
filename = os.path.basename(file_path)
# Get local file path
local_path = os.path.join("projects", unique_id, "files", group_name, filename)
local_path = os.path.join("projects", "data", unique_id, "files", group_name, filename)
# Skip if file doesn't exist (might be remote file that failed to download)
if not os.path.exists(local_path) and not file_path.startswith(('http://', 'https://')):
@ -144,7 +144,7 @@ async def save_processing_log(
}
}
log_file_path = os.path.join("projects", unique_id, "processing_log.json")
log_file_path = os.path.join("projects", "data", unique_id, "processing_log.json")
try:
with open(log_file_path, 'w', encoding='utf-8') as f:
json.dump(log_data, f, ensure_ascii=False, indent=2)
@ -155,7 +155,7 @@ async def save_processing_log(
def generate_dataset_structure(unique_id: str) -> str:
"""Generate a string representation of the dataset structure"""
project_dir = os.path.join("projects", unique_id)
project_dir = os.path.join("projects", "data", unique_id)
structure = []
def add_directory_contents(dir_path: str, prefix: str = ""):
@ -198,7 +198,7 @@ def generate_dataset_structure(unique_id: str) -> str:
def get_processing_status(unique_id: str) -> Dict:
"""Get comprehensive processing status for a project."""
project_dir = os.path.join("projects", unique_id)
project_dir = os.path.join("projects", "data", unique_id)
if not os.path.exists(project_dir):
return {
@ -261,7 +261,7 @@ def get_processing_status(unique_id: str) -> Dict:
def remove_dataset_directory(unique_id: str, filename_without_ext: str):
"""Remove a specific dataset directory (deprecated - use new structure)"""
# This function is kept for compatibility but delegates to new structure
dataset_path = os.path.join("projects", unique_id, "processed", filename_without_ext)
dataset_path = os.path.join("projects", "data", unique_id, "processed", filename_without_ext)
if os.path.exists(dataset_path):
import shutil
shutil.rmtree(dataset_path)
@ -270,13 +270,13 @@ def remove_dataset_directory(unique_id: str, filename_without_ext: str):
def remove_dataset_directory_by_key(unique_id: str, key: str):
"""Remove dataset directory by key (group name)"""
# Remove files directory
files_group_path = os.path.join("projects", unique_id, "files", key)
files_group_path = os.path.join("projects", "data", unique_id, "files", key)
if os.path.exists(files_group_path):
import shutil
shutil.rmtree(files_group_path)
# Remove processed directory
processed_group_path = os.path.join("projects", unique_id, "processed", key)
processed_group_path = os.path.join("projects", "data", unique_id, "processed", key)
if os.path.exists(processed_group_path):
import shutil
shutil.rmtree(processed_group_path)

View File

@ -13,7 +13,7 @@ from pathlib import Path
def get_existing_files(unique_id: str) -> Dict[str, Set[str]]:
"""Get existing files organized by group."""
existing_files = {}
files_dir = os.path.join("projects", unique_id, "files")
files_dir = os.path.join("projects", "data", unique_id, "files")
if not os.path.exists(files_dir):
return existing_files
@ -83,7 +83,7 @@ def sync_files_to_group(unique_id: str, files: Dict[str, List[str]]) -> Tuple[Di
Returns:
Tuple of (synced_files, failed_files)
"""
project_dir = os.path.join("projects", unique_id)
project_dir = os.path.join("projects", "data", unique_id)
files_dir = os.path.join(project_dir, "files")
# Create files directory
@ -164,7 +164,7 @@ def sync_files_to_group(unique_id: str, files: Dict[str, List[str]]) -> Tuple[Di
def cleanup_orphaned_files(unique_id: str, changes: Dict) -> Dict[str, List[str]]:
"""Remove files and their processing results that are no longer needed."""
removed_files = {}
project_dir = os.path.join("projects", unique_id)
project_dir = os.path.join("projects", "data", unique_id)
# Handle individual file removals
for group_name, removed_filenames in changes["removed"].items():
@ -225,7 +225,7 @@ def cleanup_orphaned_files(unique_id: str, changes: Dict) -> Dict[str, List[str]
def get_group_files_list(unique_id: str, group_name: str) -> List[str]:
"""Get list of files in a specific group."""
group_dir = os.path.join("projects", unique_id, "files", group_name)
group_dir = os.path.join("projects", "data", unique_id, "files", group_name)
if not os.path.exists(group_dir):
return []
@ -241,7 +241,7 @@ def get_group_files_list(unique_id: str, group_name: str) -> List[str]:
def ensure_directories(unique_id: str):
"""Ensure all necessary directories exist for a project."""
base_dir = os.path.join("projects", unique_id)
base_dir = os.path.join("projects", "data", unique_id)
directories = [
"files",
"processed",

View File

@ -103,7 +103,7 @@ def is_file_already_processed(target_file: Path, pagination_file: Path, embeddin
def load_processed_files_log(unique_id: str) -> Dict[str, Dict]:
"""Load processed files log for a project"""
log_file = os.path.join("projects", unique_id, "processed_files.json")
log_file = os.path.join("projects", "data", unique_id, "processed_files.json")
if os.path.exists(log_file):
try:
import json
@ -116,7 +116,7 @@ def load_processed_files_log(unique_id: str) -> Dict[str, Dict]:
def save_processed_files_log(unique_id: str, processed_log: Dict[str, Dict]):
"""Save processed files log for a project (legacy function)"""
log_file = os.path.join("projects", unique_id, "processed_files.json")
log_file = os.path.join("projects", "data", unique_id, "processed_files.json")
try:
os.makedirs(os.path.dirname(log_file), exist_ok=True)
import json
@ -128,7 +128,7 @@ def save_processed_files_log(unique_id: str, processed_log: Dict[str, Dict]):
def get_processing_log(unique_id: str) -> Dict:
"""Get the comprehensive processing log for a project"""
log_file = os.path.join("projects", unique_id, "processing_log.json")
log_file = os.path.join("projects", "data", unique_id, "processing_log.json")
if os.path.exists(log_file):
try:
import json
@ -141,7 +141,7 @@ def get_processing_log(unique_id: str) -> Dict:
def save_project_status(unique_id: str, status: Dict):
"""Save project processing status"""
status_file = os.path.join("projects", unique_id, "status.json")
status_file = os.path.join("projects", "data", unique_id, "status.json")
try:
os.makedirs(os.path.dirname(status_file), exist_ok=True)
import json
@ -153,7 +153,7 @@ def save_project_status(unique_id: str, status: Dict):
def load_project_status(unique_id: str) -> Dict:
"""Load project processing status"""
status_file = os.path.join("projects", unique_id, "status.json")
status_file = os.path.join("projects", "data", unique_id, "status.json")
if os.path.exists(status_file):
try:
import json
@ -185,7 +185,7 @@ def get_file_metadata(file_path: str) -> Dict:
def update_file_processing_status(unique_id: str, group_name: str, filename: str, status: Dict):
"""Update processing status for a specific file"""
status_file = os.path.join("projects", unique_id, "file_status.json")
status_file = os.path.join("projects", "data", unique_id, "file_status.json")
try:
# Load existing status
@ -217,7 +217,7 @@ def update_file_processing_status(unique_id: str, group_name: str, filename: str
def get_file_processing_status(unique_id: str, group_name: str = None, filename: str = None) -> Dict:
"""Get processing status for files"""
status_file = os.path.join("projects", unique_id, "file_status.json")
status_file = os.path.join("projects", "data", unique_id, "file_status.json")
if not os.path.exists(status_file):
return {}
@ -261,7 +261,7 @@ def calculate_directory_size(directory_path: str) -> int:
def get_project_statistics(unique_id: str) -> Dict:
"""Get comprehensive statistics for a project"""
project_dir = os.path.join("projects", unique_id)
project_dir = os.path.join("projects", "data", unique_id)
if not os.path.exists(project_dir):
return {"project_exists": False}

View File

@ -17,9 +17,9 @@ def is_file_already_processed(target_file: Path, pagination_file: Path, embeddin
return False
def organize_single_project_files(unique_id: str, skip_processed=True):
"""Organize files for a single project from projects/{unique_id}/files to projects/{unique_id}/dataset/{file_name}/document.txt"""
"""Organize files for a single project from projects/data/{unique_id}/files to projects/data/{unique_id}/dataset/{file_name}/document.txt"""
project_dir = Path("projects") / unique_id
project_dir = Path("projects") / "data" / unique_id
if not project_dir.exists():
print(f"Project directory not found: {project_dir}")
@ -120,9 +120,9 @@ def organize_single_project_files(unique_id: str, skip_processed=True):
def organize_dataset_files():
"""Move files from projects/{unique_id}/files to projects/{unique_id}/dataset/{file_name}/document.txt"""
"""Move files from projects/data/{unique_id}/files to projects/data/{unique_id}/dataset/{file_name}/document.txt"""
projects_dir = Path("projects")
projects_dir = Path("projects") / "data"
if not projects_dir.exists():
print("Projects directory not found")

View File

@ -113,7 +113,7 @@ def generate_directory_tree(project_dir: str, unique_id: str, max_depth: int = 3
def generate_project_readme(unique_id: str) -> str:
"""Generate README.md content for a project"""
project_dir = os.path.join("projects", unique_id)
project_dir = os.path.join("projects", "data", unique_id)
readme_content = f"""# Project: {unique_id}
## Project Overview
@ -192,7 +192,7 @@ This project contains processed documents and their associated embeddings for se
def save_project_readme(unique_id: str):
"""Save README.md for a project"""
readme_content = generate_project_readme(unique_id)
readme_path = os.path.join("projects", unique_id, "README.md")
readme_path = os.path.join("projects", "data", unique_id, "README.md")
try:
os.makedirs(os.path.dirname(readme_path), exist_ok=True)
@ -207,7 +207,7 @@ def save_project_readme(unique_id: str):
def get_project_status(unique_id: str) -> Dict:
"""Get comprehensive status of a project"""
project_dir = os.path.join("projects", unique_id)
project_dir = os.path.join("projects", "data", unique_id)
project_exists = os.path.exists(project_dir)
if not project_exists:
@ -259,7 +259,7 @@ def get_project_status(unique_id: str) -> Dict:
def remove_project(unique_id: str) -> bool:
"""Remove entire project directory"""
project_dir = os.path.join("projects", unique_id)
project_dir = os.path.join("projects", "data", unique_id)
try:
if os.path.exists(project_dir):
import shutil
@ -326,7 +326,7 @@ def get_project_stats(unique_id: str) -> Dict:
# Check embeddings files
embedding_files = []
dataset_dir = os.path.join("projects", unique_id, "dataset")
dataset_dir = os.path.join("projects", "data", unique_id, "dataset")
if os.path.exists(dataset_dir):
for root, dirs, files in os.walk(dataset_dir):
for file in files:

View File

@ -50,7 +50,7 @@ async def process_single_file(
"""
# Create output directory for this file
filename_stem = Path(filename).stem
output_dir = os.path.join("projects", unique_id, "processed", group_name, filename_stem)
output_dir = os.path.join("projects", "data", unique_id, "processed", group_name, filename_stem)
os.makedirs(output_dir, exist_ok=True)
result = {
@ -280,7 +280,7 @@ async def generate_embeddings_for_file(document_path: str, embedding_path: str)
def check_file_already_processed(unique_id: str, group_name: str, filename: str) -> bool:
"""Check if a file has already been processed."""
filename_stem = Path(filename).stem
output_dir = os.path.join("projects", unique_id, "processed", group_name, filename_stem)
output_dir = os.path.join("projects", "data", unique_id, "processed", group_name, filename_stem)
document_path = os.path.join(output_dir, "document.txt")
pagination_path = os.path.join(output_dir, "pagination.txt")
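
Taken together, these hunks move every per-project path from projects/{unique_id}/... to projects/data/{unique_id}/.... The layout they converge on can be summarized with a small helper; this is an illustration only, no such helper exists in the codebase:

# Hypothetical helper (not part of this commit) summarizing the new directory layout.
import os

def project_path(unique_id: str, *parts: str) -> str:
    # All project artifacts now live under projects/data/<unique_id>/...
    return os.path.join("projects", "data", unique_id, *parts)

# e.g. project_path(uid, "processed", group, stem, "document.txt")
#   -> projects/data/<uid>/processed/<group>/<stem>/document.txt
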