add upload
parent 70dcffbb7a
commit f13b1aaec9
@@ -2,11 +2,12 @@ import json
 import os
 import tempfile
 import shutil
+import uuid
 from typing import AsyncGenerator, Dict, List, Optional, Union, Any
 from datetime import datetime
 
 import uvicorn
-from fastapi import FastAPI, HTTPException, Depends, Header
+from fastapi import FastAPI, HTTPException, Depends, Header, UploadFile, File
 from fastapi.responses import StreamingResponse, HTMLResponse, FileResponse
 from fastapi.staticfiles import StaticFiles
 from fastapi.middleware.cors import CORSMiddleware
@@ -440,7 +441,7 @@ async def chat_completions(request: ChatRequest, authorization: Optional[str] =
         raise HTTPException(status_code=400, detail="unique_id is required")
 
     # Use unique_id to resolve the project directory
-    project_dir = os.path.join("projects", unique_id)
+    project_dir = os.path.join("projects", "data", unique_id)
     if not os.path.exists(project_dir):
         project_dir = ""
 
@@ -531,6 +532,44 @@ async def chat_completions(request: ChatRequest, authorization: Optional[str] =
         raise HTTPException(status_code=500, detail=f"Internal server error: {str(e)}")
 
 
+@app.post("/api/v1/upload")
+async def upload_file(file: UploadFile = File(...)):
+    """
+    File upload API endpoint: saves the uploaded file to the ./projects/uploads directory.
+
+    Args:
+        file: the uploaded file
+
+    Returns:
+        dict: response containing the file path and filename
+    """
+    try:
+        # Make sure the upload directory exists
+        upload_dir = os.path.join("projects", "uploads")
+        os.makedirs(upload_dir, exist_ok=True)
+
+        # Generate a unique filename
+        file_extension = os.path.splitext(file.filename)[1] if file.filename else ""
+        unique_filename = f"{uuid.uuid4()}{file_extension}"
+        file_path = os.path.join(upload_dir, unique_filename)
+
+        # Save the file
+        with open(file_path, "wb") as buffer:
+            shutil.copyfileobj(file.file, buffer)
+
+        return {
+            "success": True,
+            "message": "File uploaded successfully",
+            "filename": unique_filename,
+            "original_filename": file.filename,
+            "file_path": file_path
+        }
+
+    except Exception as e:
+        print(f"Error uploading file: {str(e)}")
+        raise HTTPException(status_code=500, detail=f"File upload failed: {str(e)}")
+
+
 @app.get("/api/health")
 async def health_check():
     """Health check endpoint"""
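A minimal client-side sketch of how the new endpoint might be exercised. The base URL http://localhost:8000 and the use of the requests package are assumptions for illustration, not part of this commit; only the route /api/v1/upload, the multipart field name "file", and the response keys come from the code above.

import requests

# Hypothetical usage of the upload endpoint added in this commit.
# The multipart field must be named "file" to match the UploadFile parameter.
with open("example.pdf", "rb") as fh:  # "example.pdf" is a placeholder file
    response = requests.post(
        "http://localhost:8000/api/v1/upload",  # assumed host and port
        files={"file": fh},
    )
response.raise_for_status()
# Expected keys: success, message, filename, original_filename, file_path
print(response.json())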
@@ -617,7 +656,7 @@ async def get_files_processing_status(unique_id: str):
         processed_log = load_processed_files_log(unique_id)
 
         # Get project directory info
-        project_dir = os.path.join("projects", unique_id)
+        project_dir = os.path.join("projects", "data", unique_id)
         project_exists = os.path.exists(project_dir)
 
         # Collect document.txt files
@@ -635,7 +674,7 @@ async def get_files_processing_status(unique_id: str):
             "processed_files": processed_log,
             "document_files_count": len(document_files),
             "document_files": document_files,
-            "log_file_exists": os.path.exists(os.path.join("projects", unique_id, "processed_files.json"))
+            "log_file_exists": os.path.exists(os.path.join("projects", "data", unique_id, "processed_files.json"))
         }
     except Exception as e:
         raise HTTPException(status_code=500, detail=f"Failed to get file processing status: {str(e)}")
@@ -645,8 +684,8 @@ async def get_files_processing_status(unique_id: str):
 async def reset_files_processing(unique_id: str):
     """Reset the file-processing state of a project: delete the processing log and all files."""
     try:
-        project_dir = os.path.join("projects", unique_id)
-        log_file = os.path.join("projects", unique_id, "processed_files.json")
+        project_dir = os.path.join("projects", "data", unique_id)
+        log_file = os.path.join("projects", "data", unique_id, "processed_files.json")
 
         # Load processed log to know what files to remove
         processed_log = load_processed_files_log(unique_id)
@@ -668,7 +707,7 @@ async def reset_files_processing(unique_id: str):
             elif 'filename' in file_info:
                 # Fallback to old filename-based structure
                 filename_without_ext = os.path.splitext(file_info['filename'])[0]
-                dataset_dir = os.path.join("projects", unique_id, "dataset", filename_without_ext)
+                dataset_dir = os.path.join("projects", "data", unique_id, "dataset", filename_without_ext)
                 if remove_file_or_directory(dataset_dir):
                     removed_files.append(dataset_dir)
 

@@ -20,7 +20,7 @@ def get_allowed_directory():
         return os.path.abspath(dataset_dir)
 
     # Read the project data directory from the environment variable
-    project_dir = os.getenv("PROJECT_DATA_DIR", "./projects")
+    project_dir = os.getenv("PROJECT_DATA_DIR", "./projects/data")
     return os.path.abspath(project_dir)
 
 
@@ -143,7 +143,7 @@ def search_count(patterns: List[Dict[str, Any]], file_paths: List[str],
             "content": [
                 {
                     "type": "text",
-                    "text": f"Error: Specified files not found in project directory {project_data_dir}"
+                    "text": f"Error: Specified files not found in project directory {get_allowed_directory()}"
                 }
             ]
         }
@@ -328,7 +328,7 @@ def search(patterns: List[Dict[str, Any]], file_paths: List[str],
             "content": [
                 {
                     "type": "text",
-                    "text": f"Error: Specified files not found in project directory {project_data_dir}"
+                    "text": f"Error: Specified files not found in project directory {get_allowed_directory()}"
                 }
             ]
         }
@@ -565,7 +565,7 @@ def regex_grep(patterns: Union[str, List[str]], file_paths: List[str], context_l
             "content": [
                 {
                     "type": "text",
-                    "text": f"Error: Specified files not found in project directory {project_data_dir}"
+                    "text": f"Error: Specified files not found in project directory {get_allowed_directory()}"
                 }
             ]
         }
@@ -713,7 +713,7 @@ def regex_grep_count(patterns: Union[str, List[str]], file_paths: List[str],
             "content": [
                 {
                     "type": "text",
-                    "text": f"Error: Specified files not found in project directory {project_data_dir}"
+                    "text": f"Error: Specified files not found in project directory {get_allowed_directory()}"
                 }
             ]
         }

@@ -47,7 +47,7 @@ def process_files_async(
     )
 
     # Make sure the project directory exists
-    project_dir = os.path.join("projects", unique_id)
+    project_dir = os.path.join("projects", "data", unique_id)
     if not os.path.exists(project_dir):
         os.makedirs(project_dir, exist_ok=True)
 
@@ -102,7 +102,7 @@ def process_files_async(
     result_files = []
    for key in processed_files_by_key.keys():
         # Add the corresponding dataset document.txt path
-        document_path = os.path.join("projects", unique_id, "dataset", key, "document.txt")
+        document_path = os.path.join("projects", "data", unique_id, "dataset", key, "document.txt")
         if os.path.exists(document_path):
             result_files.append(document_path)
 
@@ -172,7 +172,7 @@ def cleanup_project_async(
     try:
         print(f"Starting asynchronous project cleanup, project ID: {unique_id}")
 
-        project_dir = os.path.join("projects", unique_id)
+        project_dir = os.path.join("projects", "data", unique_id)
         removed_items = []
 
         if remove_all and os.path.exists(project_dir):

@@ -20,8 +20,8 @@ except ImportError:
 def merge_documents_by_group(unique_id: str, group_name: str) -> Dict:
     """Merge all document.txt files in a group into a single document."""
 
-    processed_group_dir = os.path.join("projects", unique_id, "processed", group_name)
-    dataset_group_dir = os.path.join("projects", unique_id, "dataset", group_name)
+    processed_group_dir = os.path.join("projects", "data", unique_id, "processed", group_name)
+    dataset_group_dir = os.path.join("projects", "data", unique_id, "dataset", group_name)
     os.makedirs(dataset_group_dir, exist_ok=True)
 
     merged_document_path = os.path.join(dataset_group_dir, "document.txt")
@@ -91,8 +91,8 @@ def merge_documents_by_group(unique_id: str, group_name: str) -> Dict:
 def merge_paginations_by_group(unique_id: str, group_name: str) -> Dict:
     """Merge all pagination.txt files in a group."""
 
-    processed_group_dir = os.path.join("projects", unique_id, "processed", group_name)
-    dataset_group_dir = os.path.join("projects", unique_id, "dataset", group_name)
+    processed_group_dir = os.path.join("projects", "data", unique_id, "processed", group_name)
+    dataset_group_dir = os.path.join("projects", "data", unique_id, "dataset", group_name)
     os.makedirs(dataset_group_dir, exist_ok=True)
 
     merged_pagination_path = os.path.join(dataset_group_dir, "pagination.txt")
@@ -161,8 +161,8 @@ def merge_paginations_by_group(unique_id: str, group_name: str) -> Dict:
 def merge_embeddings_by_group(unique_id: str, group_name: str) -> Dict:
     """Merge all embedding.pkl files in a group."""
 
-    processed_group_dir = os.path.join("projects", unique_id, "processed", group_name)
-    dataset_group_dir = os.path.join("projects", unique_id, "dataset", group_name)
+    processed_group_dir = os.path.join("projects", "data", unique_id, "processed", group_name)
+    dataset_group_dir = os.path.join("projects", "data", unique_id, "dataset", group_name)
     os.makedirs(dataset_group_dir, exist_ok=True)
 
     merged_embedding_path = os.path.join(dataset_group_dir, "embedding.pkl")
@@ -296,7 +296,7 @@ def merge_all_data_by_group(unique_id: str, group_name: str) -> Dict:
 def get_group_merge_status(unique_id: str, group_name: str) -> Dict:
     """Get the status of merged data for a group."""
 
-    dataset_group_dir = os.path.join("projects", unique_id, "dataset", group_name)
+    dataset_group_dir = os.path.join("projects", "data", unique_id, "dataset", group_name)
 
     status = {
         "group_name": group_name,
@@ -340,7 +340,7 @@ def get_group_merge_status(unique_id: str, group_name: str) -> Dict:
 def cleanup_dataset_group(unique_id: str, group_name: str) -> bool:
     """Clean up merged dataset files for a group."""
 
-    dataset_group_dir = os.path.join("projects", unique_id, "dataset", group_name)
+    dataset_group_dir = os.path.join("projects", "data", unique_id, "dataset", group_name)
 
     try:
         if os.path.exists(dataset_group_dir):

@@ -63,7 +63,7 @@ async def download_dataset_files(unique_id: str, files: Dict[str, List[str]]) ->
             filename = os.path.basename(file_path)
 
             # Get local file path
-            local_path = os.path.join("projects", unique_id, "files", group_name, filename)
+            local_path = os.path.join("projects", "data", unique_id, "files", group_name, filename)
 
             # Skip if file doesn't exist (might be remote file that failed to download)
             if not os.path.exists(local_path) and not file_path.startswith(('http://', 'https://')):
@@ -144,7 +144,7 @@ async def save_processing_log(
         }
     }
 
-    log_file_path = os.path.join("projects", unique_id, "processing_log.json")
+    log_file_path = os.path.join("projects", "data", unique_id, "processing_log.json")
     try:
         with open(log_file_path, 'w', encoding='utf-8') as f:
             json.dump(log_data, f, ensure_ascii=False, indent=2)
@@ -155,7 +155,7 @@ async def save_processing_log(
 
 def generate_dataset_structure(unique_id: str) -> str:
     """Generate a string representation of the dataset structure"""
-    project_dir = os.path.join("projects", unique_id)
+    project_dir = os.path.join("projects", "data", unique_id)
     structure = []
 
     def add_directory_contents(dir_path: str, prefix: str = ""):
@@ -198,7 +198,7 @@ def generate_dataset_structure(unique_id: str) -> str:
 def get_processing_status(unique_id: str) -> Dict:
     """Get comprehensive processing status for a project."""
 
-    project_dir = os.path.join("projects", unique_id)
+    project_dir = os.path.join("projects", "data", unique_id)
 
     if not os.path.exists(project_dir):
         return {
@@ -261,7 +261,7 @@ def get_processing_status(unique_id: str) -> Dict:
 def remove_dataset_directory(unique_id: str, filename_without_ext: str):
     """Remove a specific dataset directory (deprecated - use new structure)"""
     # This function is kept for compatibility but delegates to new structure
-    dataset_path = os.path.join("projects", unique_id, "processed", filename_without_ext)
+    dataset_path = os.path.join("projects", "data", unique_id, "processed", filename_without_ext)
     if os.path.exists(dataset_path):
         import shutil
         shutil.rmtree(dataset_path)
@@ -270,13 +270,13 @@ def remove_dataset_directory(unique_id: str, filename_without_ext: str):
 def remove_dataset_directory_by_key(unique_id: str, key: str):
     """Remove dataset directory by key (group name)"""
     # Remove files directory
-    files_group_path = os.path.join("projects", unique_id, "files", key)
+    files_group_path = os.path.join("projects", "data", unique_id, "files", key)
     if os.path.exists(files_group_path):
         import shutil
         shutil.rmtree(files_group_path)
 
     # Remove processed directory
-    processed_group_path = os.path.join("projects", unique_id, "processed", key)
+    processed_group_path = os.path.join("projects", "data", unique_id, "processed", key)
     if os.path.exists(processed_group_path):
         import shutil
         shutil.rmtree(processed_group_path)

@@ -13,7 +13,7 @@ from pathlib import Path
 def get_existing_files(unique_id: str) -> Dict[str, Set[str]]:
     """Get existing files organized by group."""
     existing_files = {}
-    files_dir = os.path.join("projects", unique_id, "files")
+    files_dir = os.path.join("projects", "data", unique_id, "files")
 
     if not os.path.exists(files_dir):
         return existing_files
@@ -83,7 +83,7 @@ def sync_files_to_group(unique_id: str, files: Dict[str, List[str]]) -> Tuple[Di
     Returns:
         Tuple of (synced_files, failed_files)
     """
-    project_dir = os.path.join("projects", unique_id)
+    project_dir = os.path.join("projects", "data", unique_id)
     files_dir = os.path.join(project_dir, "files")
 
     # Create files directory
@@ -164,7 +164,7 @@ def sync_files_to_group(unique_id: str, files: Dict[str, List[str]]) -> Tuple[Di
 def cleanup_orphaned_files(unique_id: str, changes: Dict) -> Dict[str, List[str]]:
     """Remove files and their processing results that are no longer needed."""
     removed_files = {}
-    project_dir = os.path.join("projects", unique_id)
+    project_dir = os.path.join("projects", "data", unique_id)
 
     # Handle individual file removals
     for group_name, removed_filenames in changes["removed"].items():
@@ -225,7 +225,7 @@ def cleanup_orphaned_files(unique_id: str, changes: Dict) -> Dict[str, List[str]
 
 def get_group_files_list(unique_id: str, group_name: str) -> List[str]:
     """Get list of files in a specific group."""
-    group_dir = os.path.join("projects", unique_id, "files", group_name)
+    group_dir = os.path.join("projects", "data", unique_id, "files", group_name)
 
     if not os.path.exists(group_dir):
         return []
@@ -241,7 +241,7 @@ def get_group_files_list(unique_id: str, group_name: str) -> List[str]:
 
 def ensure_directories(unique_id: str):
     """Ensure all necessary directories exist for a project."""
-    base_dir = os.path.join("projects", unique_id)
+    base_dir = os.path.join("projects", "data", unique_id)
     directories = [
         "files",
         "processed",

@@ -103,7 +103,7 @@ def is_file_already_processed(target_file: Path, pagination_file: Path, embeddin
 
 def load_processed_files_log(unique_id: str) -> Dict[str, Dict]:
     """Load processed files log for a project"""
-    log_file = os.path.join("projects", unique_id, "processed_files.json")
+    log_file = os.path.join("projects", "data", unique_id, "processed_files.json")
     if os.path.exists(log_file):
         try:
             import json
@@ -116,7 +116,7 @@ def load_processed_files_log(unique_id: str) -> Dict[str, Dict]:
 
 def save_processed_files_log(unique_id: str, processed_log: Dict[str, Dict]):
     """Save processed files log for a project (legacy function)"""
-    log_file = os.path.join("projects", unique_id, "processed_files.json")
+    log_file = os.path.join("projects", "data", unique_id, "processed_files.json")
     try:
         os.makedirs(os.path.dirname(log_file), exist_ok=True)
         import json
@@ -128,7 +128,7 @@ def save_processed_files_log(unique_id: str, processed_log: Dict[str, Dict]):
 
 def get_processing_log(unique_id: str) -> Dict:
     """Get the comprehensive processing log for a project"""
-    log_file = os.path.join("projects", unique_id, "processing_log.json")
+    log_file = os.path.join("projects", "data", unique_id, "processing_log.json")
     if os.path.exists(log_file):
         try:
             import json
@@ -141,7 +141,7 @@ def get_processing_log(unique_id: str) -> Dict:
 
 def save_project_status(unique_id: str, status: Dict):
     """Save project processing status"""
-    status_file = os.path.join("projects", unique_id, "status.json")
+    status_file = os.path.join("projects", "data", unique_id, "status.json")
     try:
         os.makedirs(os.path.dirname(status_file), exist_ok=True)
         import json
@@ -153,7 +153,7 @@ def save_project_status(unique_id: str, status: Dict):
 
 def load_project_status(unique_id: str) -> Dict:
     """Load project processing status"""
-    status_file = os.path.join("projects", unique_id, "status.json")
+    status_file = os.path.join("projects", "data", unique_id, "status.json")
     if os.path.exists(status_file):
         try:
             import json
@@ -185,7 +185,7 @@ def get_file_metadata(file_path: str) -> Dict:
 
 def update_file_processing_status(unique_id: str, group_name: str, filename: str, status: Dict):
     """Update processing status for a specific file"""
-    status_file = os.path.join("projects", unique_id, "file_status.json")
+    status_file = os.path.join("projects", "data", unique_id, "file_status.json")
 
     try:
         # Load existing status
@@ -217,7 +217,7 @@ def update_file_processing_status(unique_id: str, group_name: str, filename: str
 
 def get_file_processing_status(unique_id: str, group_name: str = None, filename: str = None) -> Dict:
     """Get processing status for files"""
-    status_file = os.path.join("projects", unique_id, "file_status.json")
+    status_file = os.path.join("projects", "data", unique_id, "file_status.json")
 
     if not os.path.exists(status_file):
         return {}
@@ -261,7 +261,7 @@ def calculate_directory_size(directory_path: str) -> int:
 
 def get_project_statistics(unique_id: str) -> Dict:
     """Get comprehensive statistics for a project"""
-    project_dir = os.path.join("projects", unique_id)
+    project_dir = os.path.join("projects", "data", unique_id)
 
     if not os.path.exists(project_dir):
         return {"project_exists": False}

@@ -17,9 +17,9 @@ def is_file_already_processed(target_file: Path, pagination_file: Path, embeddin
     return False
 
 def organize_single_project_files(unique_id: str, skip_processed=True):
-    """Organize files for a single project from projects/{unique_id}/files to projects/{unique_id}/dataset/{file_name}/document.txt"""
+    """Organize files for a single project from projects/data/{unique_id}/files to projects/data/{unique_id}/dataset/{file_name}/document.txt"""
 
-    project_dir = Path("projects") / unique_id
+    project_dir = Path("projects") / "data" / unique_id
 
     if not project_dir.exists():
         print(f"Project directory not found: {project_dir}")
@@ -120,9 +120,9 @@ def organize_single_project_files(unique_id: str, skip_processed=True):
 
 
 def organize_dataset_files():
-    """Move files from projects/{unique_id}/files to projects/{unique_id}/dataset/{file_name}/document.txt"""
+    """Move files from projects/data/{unique_id}/files to projects/data/{unique_id}/dataset/{file_name}/document.txt"""
 
-    projects_dir = Path("projects")
+    projects_dir = Path("projects") / "data"
 
     if not projects_dir.exists():
         print("Projects directory not found")

@@ -113,7 +113,7 @@ def generate_directory_tree(project_dir: str, unique_id: str, max_depth: int = 3
 
 def generate_project_readme(unique_id: str) -> str:
     """Generate README.md content for a project"""
-    project_dir = os.path.join("projects", unique_id)
+    project_dir = os.path.join("projects", "data", unique_id)
     readme_content = f"""# Project: {unique_id}
 
 ## Project Overview
@@ -192,7 +192,7 @@ This project contains processed documents and their associated embeddings for se
 def save_project_readme(unique_id: str):
     """Save README.md for a project"""
     readme_content = generate_project_readme(unique_id)
-    readme_path = os.path.join("projects", unique_id, "README.md")
+    readme_path = os.path.join("projects", "data", unique_id, "README.md")
 
     try:
         os.makedirs(os.path.dirname(readme_path), exist_ok=True)
@@ -207,7 +207,7 @@ def save_project_readme(unique_id: str):
 
 def get_project_status(unique_id: str) -> Dict:
     """Get comprehensive status of a project"""
-    project_dir = os.path.join("projects", unique_id)
+    project_dir = os.path.join("projects", "data", unique_id)
     project_exists = os.path.exists(project_dir)
 
     if not project_exists:
@@ -259,7 +259,7 @@ def get_project_status(unique_id: str) -> Dict:
 
 def remove_project(unique_id: str) -> bool:
     """Remove entire project directory"""
-    project_dir = os.path.join("projects", unique_id)
+    project_dir = os.path.join("projects", "data", unique_id)
     try:
         if os.path.exists(project_dir):
             import shutil
@@ -326,7 +326,7 @@ def get_project_stats(unique_id: str) -> Dict:
 
     # Check embeddings files
     embedding_files = []
-    dataset_dir = os.path.join("projects", unique_id, "dataset")
+    dataset_dir = os.path.join("projects", "data", unique_id, "dataset")
     if os.path.exists(dataset_dir):
         for root, dirs, files in os.walk(dataset_dir):
             for file in files:

@@ -50,7 +50,7 @@ async def process_single_file(
     """
     # Create output directory for this file
     filename_stem = Path(filename).stem
-    output_dir = os.path.join("projects", unique_id, "processed", group_name, filename_stem)
+    output_dir = os.path.join("projects", "data", unique_id, "processed", group_name, filename_stem)
     os.makedirs(output_dir, exist_ok=True)
 
     result = {
@@ -280,7 +280,7 @@ async def generate_embeddings_for_file(document_path: str, embedding_path: str)
 def check_file_already_processed(unique_id: str, group_name: str, filename: str) -> bool:
     """Check if a file has already been processed."""
     filename_stem = Path(filename).stem
-    output_dir = os.path.join("projects", unique_id, "processed", group_name, filename_stem)
+    output_dir = os.path.join("projects", "data", unique_id, "processed", group_name, filename_stem)
 
     document_path = os.path.join(output_dir, "document.txt")
     pagination_path = os.path.join(output_dir, "pagination.txt")
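
Taken together, the hunks above move all per-project data from projects/{unique_id}/ to projects/data/{unique_id}/. A minimal sketch of the resulting path convention, assuming one wanted to build such paths in a single place; the helper name get_project_dir is hypothetical and not part of this commit.

import os

def get_project_dir(unique_id: str, *parts: str) -> str:
    """Hypothetical helper: build a path under projects/data/<unique_id>."""
    return os.path.join("projects", "data", unique_id, *parts)

# Example: get_project_dir(uid, "processed_files.json") yields the same path as
# os.path.join("projects", "data", uid, "processed_files.json") used throughout the diff.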