Optimize file upload handling: implement per-group deletion and on-demand embedding

Author: 朱潮, 2025-10-25 10:09:11 +08:00
parent 418b5eb891
commit cec83ac4a9
6 changed files with 1317 additions and 368 deletions

(deleted file: test script for the multi-query / multi-pattern search tools)
@@ -1,98 +0,0 @@
#!/usr/bin/env python3
"""
Test script to verify the multi-query and multi-pattern search features
"""
import sys
import os
sys.path.append('/Users/moshui/Documents/felo/qwen-agent/mcp')
from semantic_search_server import semantic_search
from multi_keyword_search_server import regex_grep, regex_grep_count
def test_semantic_search():
"""测试语义搜索的多查询功能"""
print("=== 测试语义搜索多查询功能 ===")
# 测试数据(模拟)
# 注意这里需要实际的embedding文件才能测试
print("语义搜索功能已修改,支持多查询输入")
print("参数格式:")
print(" - 单查询queries='查询内容' 或 query='查询内容'")
print(" - 多查询queries=['查询1', '查询2', '查询3']")
print()
def test_regex_grep():
"""测试正则表达式的多模式搜索功能"""
print("=== 测试正则表达式多模式搜索功能 ===")
# 创建测试文件
test_file = "/tmp/test_regex.txt"
with open(test_file, 'w') as f:
f.write("""def hello_world():
print("Hello, World!")
return "success"
def hello_python():
print("Hello, Python!")
return 42
class HelloWorld:
def __init__(self):
self.name = "World"
def greet(self):
return f"Hello, {self.name}!'
# 测试数字模式
version = "1.2.3"
count = 42
""")
# Test multi-pattern search
print("Testing multi-pattern search: ['def.*hello', 'class.*World', '\\d+\\.\\d+\\.\\d+']")
result = regex_grep(
patterns=['def.*hello', 'class.*World', r'\d+\.\d+\.\d+'],
file_paths=[test_file],
case_sensitive=False
)
if "content" in result:
print("搜索结果:")
print(result["content"][0]["text"])
print()
# Test multi-pattern counting
print("Testing multi-pattern counting: ['def', 'class', 'Hello', '\\d+']")
result = regex_grep_count(
patterns=['def', 'class', 'Hello', r'\d+'],
file_paths=[test_file],
case_sensitive=False
)
if "content" in result:
print("统计结果:")
print(result["content"][0]["text"])
# Clean up the test file
os.remove(test_file)
def main():
"""主测试函数"""
print("开始测试多查询和多模式搜索功能...")
print()
test_semantic_search()
test_regex_grep()
print("=== 测试完成 ===")
print("所有功能已成功修改:")
print("1. ✅ semantic_search 支持多查询 (queries 参数)")
print("2. ✅ regex_grep 支持多模式 (patterns 参数)")
print("3. ✅ regex_grep_count 支持多模式 (patterns 参数)")
print("4. ✅ 保持向后兼容性 (仍支持单查询/模式)")
print("5. ✅ 更新了工具定义 JSON 文件")
if __name__ == "__main__":
main()

utils/data_merger.py (new file, +356)
@@ -0,0 +1,356 @@
#!/usr/bin/env python3
"""
Data merging functions for combining processed file results.
"""
import os
import pickle
import time
from typing import Dict, List, Optional, Tuple
import json
# Try to import numpy, but handle if missing
try:
import numpy as np
NUMPY_SUPPORT = True
except ImportError:
print("NumPy not available, some embedding features may be limited")
NUMPY_SUPPORT = False
def merge_documents_by_group(unique_id: str, group_name: str) -> Dict:
"""Merge all document.txt files in a group into a single document."""
processed_group_dir = os.path.join("projects", unique_id, "processed", group_name)
dataset_group_dir = os.path.join("projects", unique_id, "dataset", group_name)
os.makedirs(dataset_group_dir, exist_ok=True)
merged_document_path = os.path.join(dataset_group_dir, "document.txt")
result = {
"success": False,
"merged_document_path": merged_document_path,
"source_files": [],
"total_pages": 0,
"total_characters": 0,
"error": None
}
try:
# Find all document.txt files in the processed directory
document_files = []
if os.path.exists(processed_group_dir):
for item in os.listdir(processed_group_dir):
item_path = os.path.join(processed_group_dir, item)
if os.path.isdir(item_path):
document_path = os.path.join(item_path, "document.txt")
if os.path.exists(document_path) and os.path.getsize(document_path) > 0:
document_files.append((item, document_path))
if not document_files:
result["error"] = "No document files found to merge"
return result
# Merge all documents with page separators
merged_content = []
total_characters = 0
for filename_stem, document_path in sorted(document_files):
try:
with open(document_path, 'r', encoding='utf-8') as f:
content = f.read().strip()
if content:
merged_content.append(f"# Page {filename_stem}")
merged_content.append(content)
total_characters += len(content)
result["source_files"].append(filename_stem)
except Exception as e:
print(f"Error reading document file {document_path}: {str(e)}")
continue
if merged_content:
# Write merged document
with open(merged_document_path, 'w', encoding='utf-8') as f:
f.write('\n\n'.join(merged_content))
result["total_pages"] = len(document_files)
result["total_characters"] = total_characters
result["success"] = True
else:
result["error"] = "No valid content found in document files"
except Exception as e:
result["error"] = f"Document merging failed: {str(e)}"
print(f"Error merging documents for group {group_name}: {str(e)}")
return result
def merge_paginations_by_group(unique_id: str, group_name: str) -> Dict:
"""Merge all pagination.txt files in a group."""
processed_group_dir = os.path.join("projects", unique_id, "processed", group_name)
dataset_group_dir = os.path.join("projects", unique_id, "dataset", group_name)
os.makedirs(dataset_group_dir, exist_ok=True)
merged_pagination_path = os.path.join(dataset_group_dir, "pagination.txt")
result = {
"success": False,
"merged_pagination_path": merged_pagination_path,
"source_files": [],
"total_lines": 0,
"error": None
}
try:
# Find all pagination.txt files
pagination_files = []
if os.path.exists(processed_group_dir):
for item in os.listdir(processed_group_dir):
item_path = os.path.join(processed_group_dir, item)
if os.path.isdir(item_path):
pagination_path = os.path.join(item_path, "pagination.txt")
if os.path.exists(pagination_path) and os.path.getsize(pagination_path) > 0:
pagination_files.append((item, pagination_path))
if not pagination_files:
result["error"] = "No pagination files found to merge"
return result
# Merge all pagination files
merged_lines = []
for filename_stem, pagination_path in sorted(pagination_files):
try:
with open(pagination_path, 'r', encoding='utf-8') as f:
lines = f.readlines()
for line in lines:
line = line.strip()
if line:
merged_lines.append(line)
result["source_files"].append(filename_stem)
except Exception as e:
print(f"Error reading pagination file {pagination_path}: {str(e)}")
continue
if merged_lines:
# Write merged pagination
with open(merged_pagination_path, 'w', encoding='utf-8') as f:
for line in merged_lines:
f.write(f"{line}\n")
result["total_lines"] = len(merged_lines)
result["success"] = True
else:
result["error"] = "No valid pagination data found"
except Exception as e:
result["error"] = f"Pagination merging failed: {str(e)}"
print(f"Error merging paginations for group {group_name}: {str(e)}")
return result
def merge_embeddings_by_group(unique_id: str, group_name: str) -> Dict:
"""Merge all embedding.pkl files in a group."""
processed_group_dir = os.path.join("projects", unique_id, "processed", group_name)
dataset_group_dir = os.path.join("projects", unique_id, "dataset", group_name)
os.makedirs(dataset_group_dir, exist_ok=True)
merged_embedding_path = os.path.join(dataset_group_dir, "embedding.pkl")
result = {
"success": False,
"merged_embedding_path": merged_embedding_path,
"source_files": [],
"total_chunks": 0,
"total_dimensions": 0,
"error": None
}
try:
# Find all embedding.pkl files
embedding_files = []
if os.path.exists(processed_group_dir):
for item in os.listdir(processed_group_dir):
item_path = os.path.join(processed_group_dir, item)
if os.path.isdir(item_path):
embedding_path = os.path.join(item_path, "embedding.pkl")
if os.path.exists(embedding_path) and os.path.getsize(embedding_path) > 0:
embedding_files.append((item, embedding_path))
if not embedding_files:
result["error"] = "No embedding files found to merge"
return result
# Load and merge all embedding data
all_chunks = []
total_chunks = 0
dimensions = 0
for filename_stem, embedding_path in sorted(embedding_files):
try:
with open(embedding_path, 'rb') as f:
embedding_data = pickle.load(f)
if isinstance(embedding_data, dict) and 'chunks' in embedding_data:
chunks = embedding_data['chunks']
# Add source file metadata to each chunk
for chunk in chunks:
if isinstance(chunk, dict):
chunk['source_file'] = filename_stem
chunk['source_group'] = group_name
all_chunks.extend(chunks)
total_chunks += len(chunks)
# Get dimensions from first chunk if available
if dimensions == 0 and chunks and isinstance(chunks[0], dict):
if 'embedding' in chunks[0] and hasattr(chunks[0]['embedding'], 'shape'):
dimensions = chunks[0]['embedding'].shape[0]
result["source_files"].append(filename_stem)
except Exception as e:
print(f"Error loading embedding file {embedding_path}: {str(e)}")
continue
if all_chunks:
# Create merged embedding data structure
merged_embedding_data = {
'chunks': all_chunks,
'total_chunks': total_chunks,
'dimensions': dimensions,
'source_files': result["source_files"],
'group_name': group_name,
'merged_at': str(time.time())  # record when this merge was produced
}
# Save merged embeddings
with open(merged_embedding_path, 'wb') as f:
pickle.dump(merged_embedding_data, f)
result["total_chunks"] = total_chunks
result["total_dimensions"] = dimensions
result["success"] = True
else:
result["error"] = "No valid embedding data found"
except Exception as e:
result["error"] = f"Embedding merging failed: {str(e)}"
print(f"Error merging embeddings for group {group_name}: {str(e)}")
return result
def merge_all_data_by_group(unique_id: str, group_name: str) -> Dict:
"""Merge documents, paginations, and embeddings for a group."""
merge_results = {
"group_name": group_name,
"unique_id": unique_id,
"success": True,
"document_merge": None,
"pagination_merge": None,
"embedding_merge": None,
"errors": []
}
# Merge documents
document_result = merge_documents_by_group(unique_id, group_name)
merge_results["document_merge"] = document_result
if not document_result["success"]:
merge_results["success"] = False
merge_results["errors"].append(f"Document merge failed: {document_result['error']}")
# Merge paginations
pagination_result = merge_paginations_by_group(unique_id, group_name)
merge_results["pagination_merge"] = pagination_result
if not pagination_result["success"]:
merge_results["success"] = False
merge_results["errors"].append(f"Pagination merge failed: {pagination_result['error']}")
# Merge embeddings
embedding_result = merge_embeddings_by_group(unique_id, group_name)
merge_results["embedding_merge"] = embedding_result
if not embedding_result["success"]:
merge_results["success"] = False
merge_results["errors"].append(f"Embedding merge failed: {embedding_result['error']}")
return merge_results
def get_group_merge_status(unique_id: str, group_name: str) -> Dict:
"""Get the status of merged data for a group."""
dataset_group_dir = os.path.join("projects", unique_id, "dataset", group_name)
status = {
"group_name": group_name,
"unique_id": unique_id,
"dataset_dir_exists": os.path.exists(dataset_group_dir),
"document_exists": False,
"document_size": 0,
"pagination_exists": False,
"pagination_size": 0,
"embedding_exists": False,
"embedding_size": 0,
"merge_complete": False
}
if os.path.exists(dataset_group_dir):
document_path = os.path.join(dataset_group_dir, "document.txt")
pagination_path = os.path.join(dataset_group_dir, "pagination.txt")
embedding_path = os.path.join(dataset_group_dir, "embedding.pkl")
if os.path.exists(document_path):
status["document_exists"] = True
status["document_size"] = os.path.getsize(document_path)
if os.path.exists(pagination_path):
status["pagination_exists"] = True
status["pagination_size"] = os.path.getsize(pagination_path)
if os.path.exists(embedding_path):
status["embedding_exists"] = True
status["embedding_size"] = os.path.getsize(embedding_path)
# Check if all files exist and are not empty
if (status["document_exists"] and status["document_size"] > 0 and
status["pagination_exists"] and status["pagination_size"] > 0 and
status["embedding_exists"] and status["embedding_size"] > 0):
status["merge_complete"] = True
return status
def cleanup_dataset_group(unique_id: str, group_name: str) -> bool:
"""Clean up merged dataset files for a group."""
dataset_group_dir = os.path.join("projects", unique_id, "dataset", group_name)
try:
if os.path.exists(dataset_group_dir):
import shutil
shutil.rmtree(dataset_group_dir)
print(f"Cleaned up dataset group: {group_name}")
return True
else:
return True # Nothing to clean up
except Exception as e:
print(f"Error cleaning up dataset group {group_name}: {str(e)}")
return False
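
A minimal usage sketch of the merge helpers above, assuming a project that already has per-file results under projects/{unique_id}/processed/; the project ID and group name are illustrative:

from utils.data_merger import merge_all_data_by_group, get_group_merge_status

# Merge documents, paginations, and embeddings for one group
result = merge_all_data_by_group("demo-project", "contracts")
if not result["success"]:
    print("merge errors:", result["errors"])

# Inspect the merged artifacts in dataset/{group}/
status = get_group_merge_status("demo-project", "contracts")
print("merge complete:", status["merge_complete"])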

(modified file: dataset management module)
@@ -1,293 +1,169 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
""" """
Dataset management functions for organizing and processing datasets. Dataset management functions for organizing and processing datasets.
New implementation with per-file processing and group merging.
""" """
import os import os
import shutil
import json import json
import tempfile from typing import Dict, List
import zipfile
from typing import Dict, List, Optional
from pathlib import Path
from utils.file_utils import ( # Import new modules
download_file, extract_zip_file, get_file_hash, from utils.file_manager import (
load_processed_files_log, save_processed_files_log, ensure_directories, sync_files_to_group, cleanup_orphaned_files,
remove_file_or_directory get_group_files_list
) )
from utils.excel_csv_processor import ( from utils.single_file_processor import (
is_excel_file, is_csv_file, process_excel_file, process_csv_file process_single_file, check_file_already_processed
)
from utils.data_merger import (
merge_all_data_by_group, cleanup_dataset_group
) )
async def download_dataset_files(unique_id: str, files: Dict[str, List[str]]) -> Dict[str, List[str]]: async def download_dataset_files(unique_id: str, files: Dict[str, List[str]]) -> Dict[str, List[str]]:
"""Download or copy dataset files and organize them by key into dataset/{key}/document.txt. """
Supports zip file extraction and combines content using '# Page' separators.""" Process dataset files with new architecture:
1. Sync files to group directories
2. Process each file individually
3. Merge results by group
4. Clean up orphaned files
"""
if not files: if not files:
return {} return {}
# Set up directories
project_dir = os.path.join("projects", unique_id)
files_dir = os.path.join(project_dir, "files")
dataset_dir = os.path.join(project_dir, "dataset")
# Create directories if they don't exist
os.makedirs(files_dir, exist_ok=True)
os.makedirs(dataset_dir, exist_ok=True)
processed_files_by_key = {} print(f"Starting new file processing for project: {unique_id}")
def extract_zip_file_func(zip_path: str, extract_dir: str) -> List[str]: # Ensure project directories exist
"""Extract zip file and return list of extracted txt/md files""" ensure_directories(unique_id)
extracted_files = []
try:
import zipfile
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
zip_ref.extractall(extract_dir)
# Find all extracted txt, md, xlsx, xls, and csv files
for root, dirs, files in os.walk(extract_dir):
for file in files:
if file.lower().endswith(('.txt', '.md', '.xlsx', '.xls', '.csv')):
extracted_files.append(os.path.join(root, file))
print(f"Extracted {len(extracted_files)} txt/md/xlsx/csv files from {zip_path}")
return extracted_files
except Exception as e:
print(f"Error extracting zip file {zip_path}: {str(e)}")
return []
# Process each key and its associated files # Step 1: Sync files to group directories
for key, file_list in files.items(): print("Step 1: Syncing files to group directories...")
print(f"Processing key '{key}' with {len(file_list)} files") synced_files, failed_files = sync_files_to_group(unique_id, files)
processed_files_by_key[key] = []
# Create target directory for this key
target_dir = os.path.join(dataset_dir, key)
os.makedirs(target_dir, exist_ok=True)
# Check if files are already processed before doing any work
document_file = os.path.join(target_dir, "document.txt")
pagination_file = os.path.join(target_dir, "pagination.txt")
embeddings_file = os.path.join(target_dir, "document_embeddings.pkl")
already_processed = (
os.path.exists(document_file) and
os.path.exists(pagination_file) and
os.path.exists(embeddings_file) and
os.path.getsize(document_file) > 0 and
os.path.getsize(pagination_file) > 0 and
os.path.getsize(embeddings_file) > 0
)
if already_processed:
print(f" Skipping already processed files for {key}")
processed_files_by_key[key].append(document_file)
continue # Skip to next key
# Read and combine all files for this key
combined_content = []
pagination_lines = [] # Collect pagination lines from all files
all_processed_files = []
for file_path in file_list:
# Check if it's a URL (remote file) or local file
is_remote = file_path.startswith(('http://', 'https://'))
filename = file_path.split("/")[-1] if file_path else f"file_{len(all_processed_files)}"
# Create temporary extraction directory for zip files
temp_extract_dir = None
files_to_process = []
try:
if is_remote:
# Handle remote file
temp_file = os.path.join(files_dir, filename)
print(f"Downloading {file_path} -> {temp_file}")
success = await download_file(file_path, temp_file)
if not success:
print(f"Failed to download {file_path}")
continue
# Check if it's a zip file
if filename.lower().endswith('.zip'):
temp_extract_dir = tempfile.mkdtemp(prefix=f"extract_{key}_")
print(f"Extracting zip to temporary directory: {temp_extract_dir}")
extracted_files = extract_zip_file_func(temp_file, temp_extract_dir)
files_to_process.extend(extracted_files)
# Copy the zip file to project files directory
zip_dest = os.path.join(files_dir, filename)
shutil.copy2(temp_file, zip_dest)
print(f"Copied local zip file: {temp_file} -> {zip_dest}")
else:
files_to_process.append(temp_file)
else:
# Handle local file
if not os.path.exists(file_path):
print(f"Local file not found: {file_path}")
continue
if filename.lower().endswith('.zip'):
# Copy to project directory first
local_zip_path = os.path.join(files_dir, filename)
shutil.copy2(file_path, local_zip_path)
print(f"Copied local zip file: {file_path} -> {local_zip_path}")
# Extract zip file
temp_extract_dir = tempfile.mkdtemp(prefix=f"extract_{key}_")
print(f"Extracting local zip to temporary directory: {temp_extract_dir}")
extracted_files = extract_zip_file_func(local_zip_path, temp_extract_dir)
files_to_process.extend(extracted_files)
else:
# Copy non-zip file directly
dest_file = os.path.join(files_dir, filename)
shutil.copy2(file_path, dest_file)
files_to_process.append(dest_file)
print(f"Copied local file: {file_path} -> {dest_file}")
# Process all files (extracted from zip or single file)
for process_file_path in files_to_process:
try:
base_filename = os.path.basename(process_file_path)
# Check if it's an Excel file
if is_excel_file(process_file_path):
print(f"Processing Excel file: {base_filename}")
document_content, excel_pagination_lines = process_excel_file(process_file_path)
if document_content:
combined_content.append(f"# Page {base_filename}")
combined_content.append(document_content)
# Collect pagination lines from Excel files
pagination_lines.extend(excel_pagination_lines)
# Check if it's a CSV file
elif is_csv_file(process_file_path):
print(f"Processing CSV file: {base_filename}")
document_content, csv_pagination_lines = process_csv_file(process_file_path)
if document_content:
combined_content.append(f"# Page {base_filename}")
combined_content.append(document_content)
# Collect pagination lines from CSV files
pagination_lines.extend(csv_pagination_lines)
# Handle text files (original logic)
else:
with open(process_file_path, 'r', encoding='utf-8') as f:
content = f.read().strip()
if content:
# Add file content with page separator
combined_content.append(f"# Page {base_filename}")
combined_content.append(content)
except Exception as e:
print(f"Failed to read file content from {process_file_path}: {str(e)}")
except Exception as e:
print(f"Error processing file {file_path}: {str(e)}")
finally:
# Clean up temporary extraction directory
if temp_extract_dir and os.path.exists(temp_extract_dir):
try:
shutil.rmtree(temp_extract_dir)
print(f"Cleaned up temporary directory: {temp_extract_dir}")
except Exception as e:
print(f"Failed to clean up temporary directory {temp_extract_dir}: {str(e)}")
# Write combined content to dataset/{key}/document.txt
if combined_content:
try:
with open(document_file, 'w', encoding='utf-8') as f:
f.write('\n\n'.join(combined_content))
print(f"Created combined document: {document_file}")
# Generate pagination and embeddings for the combined document
try:
import sys
sys.path.append(os.path.join(os.path.dirname(__file__), '..', 'embedding'))
from embedding import embed_document
# Generate pagination file from collected pagination lines
# For Excel/CSV files, use the pagination format we collected
# For text files, fall back to the original pagination generation
if pagination_lines:
print(f" Writing pagination data from Excel/CSV files for {key}")
with open(pagination_file, 'w', encoding='utf-8') as f:
for line in pagination_lines:
if line.strip():
f.write(f"{line}\n")
print(f" Generated {len(pagination_lines)} pagination lines")
else:
# For text-only files, use the original pagination generation
from embedding import split_document_by_pages
print(f" Generating pagination from text files for {key}")
pages = split_document_by_pages(str(document_file), str(pagination_file))
print(f" Generated {len(pages)} pages")
# Generate embeddings
print(f" Generating embeddings for {key}")
# Use paragraph chunking strategy with default settings
embedding_data = embed_document(
str(document_file),
str(embeddings_file),
chunking_strategy='paragraph'
)
if embedding_data:
print(f" Generated embeddings for {len(embedding_data['chunks'])} chunks")
# Add to processed files only after successful embedding
processed_files_by_key[key].append(document_file)
else:
print(f" Failed to generate embeddings")
except Exception as e:
print(f" Failed to generate pagination/embeddings for {key}: {str(e)}")
except Exception as e:
print(f"Failed to write combined document: {str(e)}")
# Load existing log # Step 2: Detect changes and cleanup orphaned files
processed_log = load_processed_files_log(unique_id) from utils.file_manager import detect_file_changes
changes = detect_file_changes(unique_id, files)
# Update log with newly processed files if any(changes["removed"].values()):
for key, file_list in files.items(): print("Step 2: Cleaning up orphaned files...")
if key not in processed_log: removed_files = cleanup_orphaned_files(unique_id, changes)
processed_log[key] = {} print(f"Removed orphaned files: {removed_files}")
# Step 3: Process individual files
print("Step 3: Processing individual files...")
processed_files_by_group = {}
processing_results = {}
for group_name, file_list in files.items():
processed_files_by_group[group_name] = []
processing_results[group_name] = []
for file_path in file_list: for file_path in file_list:
filename = os.path.basename(file_path) filename = os.path.basename(file_path)
processed_log[key][filename] = {
"original_path": file_path, # Get local file path
"processed_at": str(os.path.getmtime(document_file) if os.path.exists(document_file) else 0), local_path = os.path.join("projects", unique_id, "files", group_name, filename)
"status": "processed" if key in processed_files_by_key and processed_files_by_key[key] else "failed"
} # Skip if file doesn't exist (might be remote file that failed to download)
if not os.path.exists(local_path) and not file_path.startswith(('http://', 'https://')):
print(f"Skipping non-existent file: {filename}")
continue
# Check if already processed
if check_file_already_processed(unique_id, group_name, filename):
print(f"Skipping already processed file: {filename}")
processed_files_by_group[group_name].append(filename)
processing_results[group_name].append({
"filename": filename,
"status": "existing"
})
continue
# Process the file
print(f"Processing file: {filename} (group: {group_name})")
result = await process_single_file(unique_id, group_name, filename, file_path, local_path)
processing_results[group_name].append(result)
if result["success"]:
processed_files_by_group[group_name].append(filename)
print(f" Successfully processed {filename}")
else:
print(f" Failed to process {filename}: {result['error']}")
# Save the updated processed log # Step 4: Merge results by group
save_processed_files_log(unique_id, processed_log) print("Step 4: Merging results by group...")
merge_results = {}
return processed_files_by_key for group_name in processed_files_by_group.keys():
# Get all files in the group (including existing ones)
group_files = get_group_files_list(unique_id, group_name)
if group_files:
print(f"Merging group: {group_name} with {len(group_files)} files")
merge_result = merge_all_data_by_group(unique_id, group_name)
merge_results[group_name] = merge_result
if merge_result["success"]:
print(f" Successfully merged group {group_name}")
else:
print(f" Failed to merge group {group_name}: {merge_result['errors']}")
# Step 5: Save processing log
print("Step 5: Saving processing log...")
await save_processing_log(unique_id, files, synced_files, processing_results, merge_results)
print(f"File processing completed for project: {unique_id}")
return processed_files_by_group
async def save_processing_log(
unique_id: str,
requested_files: Dict[str, List[str]],
synced_files: Dict,
processing_results: Dict,
merge_results: Dict
):
"""Save comprehensive processing log."""
import time  # local import, matching other local imports in this module
log_data = {
"unique_id": unique_id,
"timestamp": str(time.time()),
"requested_files": requested_files,
"synced_files": synced_files,
"processing_results": processing_results,
"merge_results": merge_results,
"summary": {
"total_groups": len(requested_files),
"total_files_requested": sum(len(files) for files in requested_files.values()),
"total_files_processed": sum(
len([r for r in results if r.get("success", False)])
for results in processing_results.values()
),
"total_groups_merged": len([r for r in merge_results.values() if r.get("success", False)])
}
}
log_file_path = os.path.join("projects", unique_id, "processing_log.json")
try:
with open(log_file_path, 'w', encoding='utf-8') as f:
json.dump(log_data, f, ensure_ascii=False, indent=2)
print(f"Processing log saved to: {log_file_path}")
except Exception as e:
print(f"Error saving processing log: {str(e)}")
def generate_dataset_structure(unique_id: str) -> str: def generate_dataset_structure(unique_id: str) -> str:
"""Generate a string representation of the dataset structure""" """Generate a string representation of the dataset structure"""
dataset_dir = os.path.join("projects", unique_id, "dataset") project_dir = os.path.join("projects", unique_id)
structure = [] structure = []
def add_directory_contents(dir_path: str, prefix: str = ""): def add_directory_contents(dir_path: str, prefix: str = ""):
try: try:
if not os.path.exists(dir_path):
structure.append(f"{prefix}└── (not found)")
return
items = sorted(os.listdir(dir_path)) items = sorted(os.listdir(dir_path))
for i, item in enumerate(items): for i, item in enumerate(items):
item_path = os.path.join(dir_path, item) item_path = os.path.join(dir_path, item)
@@ -301,22 +177,109 @@ def generate_dataset_structure(unique_id: str) -> str:
except Exception as e: except Exception as e:
structure.append(f"{prefix}└── Error: {str(e)}") structure.append(f"{prefix}└── Error: {str(e)}")
if os.path.exists(dataset_dir): # Add files directory structure
structure.append(f"dataset/") files_dir = os.path.join(project_dir, "files")
add_directory_contents(dataset_dir, "") structure.append("files/")
else: add_directory_contents(files_dir, "")
structure.append("dataset/ (not found)")
# Add processed directory structure
processed_dir = os.path.join(project_dir, "processed")
structure.append("\nprocessed/")
add_directory_contents(processed_dir, "")
# Add dataset directory structure
dataset_dir = os.path.join(project_dir, "dataset")
structure.append("\ndataset/")
add_directory_contents(dataset_dir, "")
return "\n".join(structure) return "\n".join(structure)
def get_processing_status(unique_id: str) -> Dict:
"""Get comprehensive processing status for a project."""
project_dir = os.path.join("projects", unique_id)
if not os.path.exists(project_dir):
return {
"project_exists": False,
"unique_id": unique_id
}
status = {
"project_exists": True,
"unique_id": unique_id,
"directories": {
"files": os.path.exists(os.path.join(project_dir, "files")),
"processed": os.path.exists(os.path.join(project_dir, "processed")),
"dataset": os.path.exists(os.path.join(project_dir, "dataset"))
},
"groups": {},
"processing_log_exists": os.path.exists(os.path.join(project_dir, "processing_log.json"))
}
# Check each group's status
files_dir = os.path.join(project_dir, "files")
if os.path.exists(files_dir):
for group_name in os.listdir(files_dir):
group_path = os.path.join(files_dir, group_name)
if os.path.isdir(group_path):
status["groups"][group_name] = {
"files_count": len([
f for f in os.listdir(group_path)
if os.path.isfile(os.path.join(group_path, f))
]),
"merge_status": "pending"
}
# Check merge status for each group
dataset_dir = os.path.join(project_dir, "dataset")
if os.path.exists(dataset_dir):
for group_name in os.listdir(dataset_dir):
group_path = os.path.join(dataset_dir, group_name)
if os.path.isdir(group_path):
if group_name in status["groups"]:
# Check if merge is complete
document_path = os.path.join(group_path, "document.txt")
pagination_path = os.path.join(group_path, "pagination.txt")
embedding_path = os.path.join(group_path, "embedding.pkl")
if (os.path.exists(document_path) and os.path.exists(pagination_path) and
os.path.exists(embedding_path)):
status["groups"][group_name]["merge_status"] = "completed"
else:
status["groups"][group_name]["merge_status"] = "incomplete"
else:
status["groups"][group_name] = {
"files_count": 0,
"merge_status": "completed"
}
return status
def remove_dataset_directory(unique_id: str, filename_without_ext: str): def remove_dataset_directory(unique_id: str, filename_without_ext: str):
"""Remove a specific dataset directory""" """Remove a specific dataset directory (deprecated - use new structure)"""
dataset_path = os.path.join("projects", unique_id, "dataset", filename_without_ext) # This function is kept for compatibility but delegates to new structure
remove_file_or_directory(dataset_path) dataset_path = os.path.join("projects", unique_id, "processed", filename_without_ext)
if os.path.exists(dataset_path):
import shutil
shutil.rmtree(dataset_path)
def remove_dataset_directory_by_key(unique_id: str, key: str): def remove_dataset_directory_by_key(unique_id: str, key: str):
"""Remove dataset directory by key""" """Remove dataset directory by key (group name)"""
dataset_path = os.path.join("projects", unique_id, "dataset", key) # Remove files directory
remove_file_or_directory(dataset_path) files_group_path = os.path.join("projects", unique_id, "files", key)
if os.path.exists(files_group_path):
import shutil
shutil.rmtree(files_group_path)
# Remove processed directory
processed_group_path = os.path.join("projects", unique_id, "processed", key)
if os.path.exists(processed_group_path):
import shutil
shutil.rmtree(processed_group_path)
# Remove dataset directory
cleanup_dataset_group(unique_id, key)
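
A hedged, end-to-end sketch of driving the new pipeline described above. The module path utils.dataset_manager is an assumption (this file's name is not shown in the diff); the project ID, group names, and file paths are illustrative:

import asyncio
from utils.dataset_manager import download_dataset_files, get_processing_status  # path assumed

files = {
    "contracts": ["/data/contracts/2024.xlsx", "https://example.com/terms.zip"],
    "faq": ["/data/faq.md"],
}

async def main():
    # Sync, per-file processing, group merge, and orphan cleanup in one call
    processed = await download_dataset_files("demo-project", files)
    print(processed)                              # {"contracts": [...], "faq": [...]}
    print(get_processing_status("demo-project"))  # per-group merge status

asyncio.run(main())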

utils/file_manager.py (new file, +253)
@@ -0,0 +1,253 @@
#!/usr/bin/env python3
"""
File management functions for syncing files and detecting changes.
"""
import os
import json
import shutil
from typing import Dict, List, Set, Tuple
from pathlib import Path
def get_existing_files(unique_id: str) -> Dict[str, Set[str]]:
"""Get existing files organized by group."""
existing_files = {}
files_dir = os.path.join("projects", unique_id, "files")
if not os.path.exists(files_dir):
return existing_files
for group_name in os.listdir(files_dir):
group_dir = os.path.join(files_dir, group_name)
if os.path.isdir(group_dir):
existing_files[group_name] = set()
for file_name in os.listdir(group_dir):
file_path = os.path.join(group_dir, file_name)
if os.path.isfile(file_path):
existing_files[group_name].add(file_name)
return existing_files
def detect_file_changes(unique_id: str, new_files: Dict[str, List[str]]) -> Dict:
"""Detect file changes: added, removed, and existing files."""
existing_files = get_existing_files(unique_id)
changes = {
"added": {},
"removed": {},
"existing": {},
"removed_groups": set() # Track completely removed groups
}
# Convert new_files to sets for comparison
new_files_sets = {}
for group, file_list in new_files.items():
# Extract filenames from paths
filenames = set()
for file_path in file_list:
filename = os.path.basename(file_path)
filenames.add(filename)
new_files_sets[group] = filenames
# Detect added and existing files
for group, new_filenames in new_files_sets.items():
existing_filenames = existing_files.get(group, set())
changes["added"][group] = new_filenames - existing_filenames
changes["existing"][group] = new_filenames & existing_filenames
# For removed files, check against existing files in this group
if group not in changes["removed"]:
changes["removed"][group] = set()
# Detect removed files (files that exist but not in new request)
for group, existing_filenames in existing_files.items():
if group in new_files_sets:
# Group exists in new request, check for individual file removals
new_filenames = new_files_sets[group]
changes["removed"][group] = existing_filenames - new_filenames
else:
# Group completely removed from new request
changes["removed_groups"].add(group)
changes["removed"][group] = existing_filenames
return changes
def sync_files_to_group(unique_id: str, files: Dict[str, List[str]]) -> Tuple[Dict, Dict]:
"""
Sync files to group directories and return sync results.
Returns:
Tuple of (synced_files, failed_files)
"""
project_dir = os.path.join("projects", unique_id)
files_dir = os.path.join(project_dir, "files")
# Create files directory
os.makedirs(files_dir, exist_ok=True)
# Detect changes first
changes = detect_file_changes(unique_id, files)
synced_files = {}
failed_files = {}
# Process each group
for group_name, file_list in files.items():
group_dir = os.path.join(files_dir, group_name)
os.makedirs(group_dir, exist_ok=True)
synced_files[group_name] = []
failed_files[group_name] = []
# Only process files that are new or need updating
filenames_to_sync = changes["added"].get(group_name, set())
for file_path in file_list:
filename = os.path.basename(file_path)
# Skip if file already exists and is in the "existing" category
if filename in changes["existing"].get(group_name, set()):
synced_files[group_name].append({
"original_path": file_path,
"local_path": os.path.join(group_dir, filename),
"status": "existing"
})
continue
# Only process files that are actually new
if filename not in filenames_to_sync:
continue
try:
# Check if it's a URL (remote file) or local file
is_remote = file_path.startswith(('http://', 'https://'))
if is_remote:
# For remote files, we'll download them in the processing stage
synced_files[group_name].append({
"original_path": file_path,
"local_path": os.path.join(group_dir, filename),
"status": "pending_download"
})
else:
# Handle local file
if not os.path.exists(file_path):
failed_files[group_name].append({
"original_path": file_path,
"error": "Local file not found"
})
continue
# Copy file to group directory
dest_path = os.path.join(group_dir, filename)
shutil.copy2(file_path, dest_path)
synced_files[group_name].append({
"original_path": file_path,
"local_path": dest_path,
"status": "copied"
})
except Exception as e:
failed_files[group_name].append({
"original_path": file_path,
"error": str(e)
})
return synced_files, failed_files
def cleanup_orphaned_files(unique_id: str, changes: Dict) -> Dict[str, List[str]]:
"""Remove files and their processing results that are no longer needed."""
removed_files = {}
project_dir = os.path.join("projects", unique_id)
# Handle individual file removals
for group_name, removed_filenames in changes["removed"].items():
if not removed_filenames:
continue
removed_files[group_name] = []
for filename in removed_filenames:
try:
# Remove original file
file_path = os.path.join(project_dir, "files", group_name, filename)
if os.path.exists(file_path):
os.remove(file_path)
removed_files[group_name].append(f"file: {filename}")
# Remove processed directory for this file
processed_dir = os.path.join(project_dir, "processed", group_name,
Path(filename).stem)
if os.path.exists(processed_dir):
shutil.rmtree(processed_dir)
removed_files[group_name].append(f"processed: {Path(filename).stem}")
except Exception as e:
print(f"Error cleaning up {filename}: {str(e)}")
# Handle completely removed groups
for group_name in changes.get("removed_groups", set()):
print(f"Cleaning up completely removed group: {group_name}")
removed_files[group_name] = []
try:
# Remove entire files/group directory
files_group_dir = os.path.join(project_dir, "files", group_name)
if os.path.exists(files_group_dir):
shutil.rmtree(files_group_dir)
removed_files[group_name].append("files group directory")
# Remove entire processed/group directory
processed_group_dir = os.path.join(project_dir, "processed", group_name)
if os.path.exists(processed_group_dir):
shutil.rmtree(processed_group_dir)
removed_files[group_name].append("processed group directory")
# Remove entire dataset/group directory
dataset_group_dir = os.path.join(project_dir, "dataset", group_name)
if os.path.exists(dataset_group_dir):
shutil.rmtree(dataset_group_dir)
removed_files[group_name].append("dataset group directory")
print(f"Completely removed group '{group_name}' and all its data")
except Exception as e:
print(f"Error cleaning up group {group_name}: {str(e)}")
return removed_files
def get_group_files_list(unique_id: str, group_name: str) -> List[str]:
"""Get list of files in a specific group."""
group_dir = os.path.join("projects", unique_id, "files", group_name)
if not os.path.exists(group_dir):
return []
files = []
for filename in os.listdir(group_dir):
file_path = os.path.join(group_dir, filename)
if os.path.isfile(file_path):
files.append(filename)
return sorted(files)
def ensure_directories(unique_id: str):
"""Ensure all necessary directories exist for a project."""
base_dir = os.path.join("projects", unique_id)
directories = [
"files",
"processed",
"dataset"
]
for dir_name in directories:
dir_path = os.path.join(base_dir, dir_name)
os.makedirs(dir_path, exist_ok=True)
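
A short sketch of how the change-detection and cleanup helpers above behave when a later request drops files or whole groups (project ID and paths are illustrative):

from utils.file_manager import detect_file_changes, cleanup_orphaned_files

# A follow-up request that drops the "faq" group and keeps only one contracts file
new_request = {"contracts": ["/data/contracts/2024.xlsx"]}

changes = detect_file_changes("demo-project", new_request)
# changes["added"] / ["existing"] / ["removed"] hold per-group filename sets;
# changes["removed_groups"] holds groups present on disk but absent from the request.

removed = cleanup_orphaned_files("demo-project", changes)
print(removed)  # e.g. {"faq": ["files group directory", ...], "contracts": [...]}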

utils/file_utils.py (modified)

@@ -115,7 +115,7 @@ def load_processed_files_log(unique_id: str) -> Dict[str, Dict]:
def save_processed_files_log(unique_id: str, processed_log: Dict[str, Dict]): def save_processed_files_log(unique_id: str, processed_log: Dict[str, Dict]):
"""Save processed files log for a project""" """Save processed files log for a project (legacy function)"""
log_file = os.path.join("projects", unique_id, "processed_files.json") log_file = os.path.join("projects", unique_id, "processed_files.json")
try: try:
os.makedirs(os.path.dirname(log_file), exist_ok=True) os.makedirs(os.path.dirname(log_file), exist_ok=True)
@@ -123,4 +123,182 @@ def save_processed_files_log(unique_id: str, processed_log: Dict[str, Dict]):
with open(log_file, 'w', encoding='utf-8') as f: with open(log_file, 'w', encoding='utf-8') as f:
json.dump(processed_log, f, ensure_ascii=False, indent=2) json.dump(processed_log, f, ensure_ascii=False, indent=2)
except Exception as e: except Exception as e:
print(f"Error saving processed files log: {e}") print(f"Error saving processed files log: {e}")
def get_processing_log(unique_id: str) -> Dict:
"""Get the comprehensive processing log for a project"""
log_file = os.path.join("projects", unique_id, "processing_log.json")
if os.path.exists(log_file):
try:
import json
with open(log_file, 'r', encoding='utf-8') as f:
return json.load(f)
except Exception as e:
print(f"Error loading processing log: {e}")
return {}
def save_project_status(unique_id: str, status: Dict):
"""Save project processing status"""
status_file = os.path.join("projects", unique_id, "status.json")
try:
os.makedirs(os.path.dirname(status_file), exist_ok=True)
import json
with open(status_file, 'w', encoding='utf-8') as f:
json.dump(status, f, ensure_ascii=False, indent=2)
except Exception as e:
print(f"Error saving project status: {e}")
def load_project_status(unique_id: str) -> Dict:
"""Load project processing status"""
status_file = os.path.join("projects", unique_id, "status.json")
if os.path.exists(status_file):
try:
import json
with open(status_file, 'r', encoding='utf-8') as f:
return json.load(f)
except Exception as e:
print(f"Error loading project status: {e}")
return {}
def get_file_metadata(file_path: str) -> Dict:
"""Get metadata for a file"""
try:
if not os.path.exists(file_path):
return {"exists": False}
stat = os.stat(file_path)
return {
"exists": True,
"size": stat.st_size,
"modified_time": stat.st_mtime,
"created_time": stat.st_ctime,
"is_file": os.path.isfile(file_path),
"is_directory": os.path.isdir(file_path)
}
except Exception as e:
return {"exists": False, "error": str(e)}
def update_file_processing_status(unique_id: str, group_name: str, filename: str, status: Dict):
"""Update processing status for a specific file"""
status_file = os.path.join("projects", unique_id, "file_status.json")
try:
# Load existing status
if os.path.exists(status_file):
import json
with open(status_file, 'r', encoding='utf-8') as f:
file_status = json.load(f)
else:
file_status = {}
# Ensure structure exists
if group_name not in file_status:
file_status[group_name] = {}
# Update status; resolve the source file path so its mtime can be recorded
file_path = os.path.join("projects", unique_id, "files", group_name, filename)
file_status[group_name][filename] = {
**status,
"updated_at": str(os.path.getmtime(file_path) if os.path.exists(file_path) else 0)
}
# Save updated status
os.makedirs(os.path.dirname(status_file), exist_ok=True)
with open(status_file, 'w', encoding='utf-8') as f:
json.dump(file_status, f, ensure_ascii=False, indent=2)
except Exception as e:
print(f"Error updating file processing status: {e}")
def get_file_processing_status(unique_id: str, group_name: str = None, filename: str = None) -> Dict:
"""Get processing status for files"""
status_file = os.path.join("projects", unique_id, "file_status.json")
if not os.path.exists(status_file):
return {}
try:
import json
with open(status_file, 'r', encoding='utf-8') as f:
file_status = json.load(f)
# Filter by group and filename if provided
if group_name:
if group_name not in file_status:
return {}
if filename:
return file_status[group_name].get(filename, {})
else:
return file_status[group_name]
return file_status
except Exception as e:
print(f"Error getting file processing status: {e}")
return {}
def calculate_directory_size(directory_path: str) -> int:
"""Calculate total size of a directory recursively"""
total_size = 0
try:
for dirpath, dirnames, filenames in os.walk(directory_path):
for filename in filenames:
file_path = os.path.join(dirpath, filename)
if os.path.exists(file_path):
total_size += os.path.getsize(file_path)
except Exception as e:
print(f"Error calculating directory size: {e}")
return total_size
def get_project_statistics(unique_id: str) -> Dict:
"""Get comprehensive statistics for a project"""
project_dir = os.path.join("projects", unique_id)
if not os.path.exists(project_dir):
return {"project_exists": False}
stats = {
"project_exists": True,
"unique_id": unique_id,
"directories": {},
"total_files": 0,
"total_size": 0
}
# Check each directory
directories = ["files", "processed", "dataset"]
for dir_name in directories:
dir_path = os.path.join(project_dir, dir_name)
if os.path.exists(dir_path):
dir_size = calculate_directory_size(dir_path)
dir_files = 0
for root, dirs, files in os.walk(dir_path):
dir_files += len(files)
stats["directories"][dir_name] = {
"exists": True,
"size": dir_size,
"files": dir_files
}
stats["total_files"] += dir_files
stats["total_size"] += dir_size
else:
stats["directories"][dir_name] = {
"exists": False,
"size": 0,
"files": 0
}
return stats
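
A small usage sketch of the status and statistics helpers added above (the utils.file_utils module path comes from the existing imports; project, group, and file names are illustrative):

from utils.file_utils import (
    update_file_processing_status,
    get_file_processing_status,
    get_project_statistics,
)

update_file_processing_status("demo-project", "contracts", "2024.xlsx",
                              {"status": "processed", "embedding_chunks": 42})
print(get_file_processing_status("demo-project", "contracts", "2024.xlsx"))

stats = get_project_statistics("demo-project")
print(stats["total_files"], stats["total_size"])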

utils/single_file_processor.py (new file, +297)
@@ -0,0 +1,297 @@
#!/usr/bin/env python3
"""
Single file processing functions for handling individual files.
"""
import os
import tempfile
import zipfile
from typing import Dict, List, Tuple, Optional
from pathlib import Path
from utils.file_utils import download_file
# Try to import excel/csv processor, but handle if dependencies are missing
try:
from utils.excel_csv_processor import (
is_excel_file, is_csv_file, process_excel_file, process_csv_file
)
EXCEL_CSV_SUPPORT = True
except ImportError as e:
print(f"Excel/CSV processing not available: {e}")
EXCEL_CSV_SUPPORT = False
# Fallback functions
def is_excel_file(file_path):
return file_path.lower().endswith(('.xlsx', '.xls'))
def is_csv_file(file_path):
return file_path.lower().endswith('.csv')
def process_excel_file(file_path):
return "", []
def process_csv_file(file_path):
return "", []
async def process_single_file(
unique_id: str,
group_name: str,
filename: str,
original_path: str,
local_path: str
) -> Dict:
"""
Process a single file and generate document.txt, pagination.txt, and embedding.pkl.
Returns:
Dict with processing results and file paths
"""
# Create output directory for this file
filename_stem = Path(filename).stem
output_dir = os.path.join("projects", unique_id, "processed", group_name, filename_stem)
os.makedirs(output_dir, exist_ok=True)
result = {
"success": False,
"filename": filename,
"group": group_name,
"output_dir": output_dir,
"document_path": os.path.join(output_dir, "document.txt"),
"pagination_path": os.path.join(output_dir, "pagination.txt"),
"embedding_path": os.path.join(output_dir, "embedding.pkl"),
"error": None,
"content_size": 0,
"pagination_lines": 0,
"embedding_chunks": 0
}
try:
# Download file if it's remote and not yet downloaded
if original_path.startswith(('http://', 'https://')):
if not os.path.exists(local_path):
print(f"Downloading {original_path} -> {local_path}")
success = await download_file(original_path, local_path)
if not success:
result["error"] = "Failed to download file"
return result
# Extract content from file
content, pagination_lines = await extract_file_content(local_path, filename)
if not content or not content.strip():
result["error"] = "No content extracted from file"
return result
# Write document.txt
with open(result["document_path"], 'w', encoding='utf-8') as f:
f.write(content)
result["content_size"] = len(content)
# Write pagination.txt
if pagination_lines:
with open(result["pagination_path"], 'w', encoding='utf-8') as f:
for line in pagination_lines:
if line.strip():
f.write(f"{line}\n")
result["pagination_lines"] = len(pagination_lines)
else:
# Generate pagination from text content
pagination_lines = generate_pagination_from_text(result["document_path"],
result["pagination_path"])
result["pagination_lines"] = len(pagination_lines)
# Generate embeddings
try:
embedding_chunks = await generate_embeddings_for_file(
result["document_path"], result["embedding_path"]
)
result["embedding_chunks"] = len(embedding_chunks) if embedding_chunks else 0
result["success"] = True
except Exception as e:
result["error"] = f"Embedding generation failed: {str(e)}"
print(f"Failed to generate embeddings for {filename}: {str(e)}")
except Exception as e:
result["error"] = f"File processing failed: {str(e)}"
print(f"Error processing file {filename}: {str(e)}")
return result
async def extract_file_content(file_path: str, filename: str) -> Tuple[str, List[str]]:
"""Extract content from various file formats."""
# Handle zip files
if filename.lower().endswith('.zip'):
return await extract_from_zip(file_path, filename)
# Handle Excel files
elif is_excel_file(file_path):
return await extract_from_excel(file_path, filename)
# Handle CSV files
elif is_csv_file(file_path):
return await extract_from_csv(file_path, filename)
# Handle text files
else:
return await extract_from_text(file_path, filename)
async def extract_from_zip(zip_path: str, filename: str) -> Tuple[str, List[str]]:
"""Extract content from zip file."""
content_parts = []
pagination_lines = []
try:
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
# Extract to temporary directory
temp_dir = tempfile.mkdtemp(prefix=f"extract_{Path(filename).stem}_")
zip_ref.extractall(temp_dir)
# Process extracted files
for root, dirs, files in os.walk(temp_dir):
for file in files:
if file.lower().endswith(('.txt', '.md', '.xlsx', '.xls', '.csv')):
file_path = os.path.join(root, file)
try:
file_content, file_pagination = await extract_file_content(file_path, file)
if file_content:
content_parts.append(f"# Page {file}")
content_parts.append(file_content)
pagination_lines.extend(file_pagination)
except Exception as e:
print(f"Error processing extracted file {file}: {str(e)}")
# Clean up temporary directory
import shutil
shutil.rmtree(temp_dir)
except Exception as e:
print(f"Error extracting zip file {filename}: {str(e)}")
return "", []
return '\n\n'.join(content_parts), pagination_lines
async def extract_from_excel(file_path: str, filename: str) -> Tuple[str, List[str]]:
"""Extract content from Excel file."""
try:
document_content, pagination_lines = process_excel_file(file_path)
if document_content:
content = f"# Page {filename}\n{document_content}"
return content, pagination_lines
else:
return "", []
except Exception as e:
print(f"Error processing Excel file {filename}: {str(e)}")
return "", []
async def extract_from_csv(file_path: str, filename: str) -> Tuple[str, List[str]]:
"""Extract content from CSV file."""
try:
document_content, pagination_lines = process_csv_file(file_path)
if document_content:
content = f"# Page {filename}\n{document_content}"
return content, pagination_lines
else:
return "", []
except Exception as e:
print(f"Error processing CSV file {filename}: {str(e)}")
return "", []
async def extract_from_text(file_path: str, filename: str) -> Tuple[str, List[str]]:
"""Extract content from text file."""
try:
with open(file_path, 'r', encoding='utf-8') as f:
content = f.read().strip()
if content:
return content, []
else:
return "", []
except Exception as e:
print(f"Error reading text file {filename}: {str(e)}")
return "", []
def generate_pagination_from_text(document_path: str, pagination_path: str) -> List[str]:
"""Generate pagination from text document."""
try:
# Import embedding module for pagination
import sys
sys.path.append(os.path.join(os.path.dirname(__file__), '..', 'embedding'))
from embedding import split_document_by_pages
pages = split_document_by_pages(str(document_path), str(pagination_path))
# Return pagination lines
pagination_lines = []
with open(pagination_path, 'r', encoding='utf-8') as f:
for line in f:
if line.strip():
pagination_lines.append(line.strip())
return pagination_lines
except Exception as e:
print(f"Error generating pagination from text: {str(e)}")
return []
async def generate_embeddings_for_file(document_path: str, embedding_path: str) -> Optional[List]:
"""Generate embeddings for a document."""
try:
# Import embedding module
import sys
sys.path.append(os.path.join(os.path.dirname(__file__), '..', 'embedding'))
from embedding import embed_document
# Generate embeddings using paragraph chunking
embedding_data = embed_document(
str(document_path),
str(embedding_path),
chunking_strategy='paragraph'
)
if embedding_data and 'chunks' in embedding_data:
return embedding_data['chunks']
else:
return None
except Exception as e:
print(f"Error generating embeddings: {str(e)}")
return None
def check_file_already_processed(unique_id: str, group_name: str, filename: str) -> bool:
"""Check if a file has already been processed."""
filename_stem = Path(filename).stem
output_dir = os.path.join("projects", unique_id, "processed", group_name, filename_stem)
document_path = os.path.join(output_dir, "document.txt")
pagination_path = os.path.join(output_dir, "pagination.txt")
embedding_path = os.path.join(output_dir, "embedding.pkl")
# Check if all files exist and are not empty
if (os.path.exists(document_path) and os.path.exists(pagination_path) and
os.path.exists(embedding_path)):
if (os.path.getsize(document_path) > 0 and os.path.getsize(pagination_path) > 0 and
os.path.getsize(embedding_path) > 0):
return True
return False
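
A hedged sketch of processing one file directly with the helpers above (the utils.single_file_processor module path comes from the dataset-manager imports; the project ID and paths are illustrative):

import asyncio
from utils.single_file_processor import process_single_file, check_file_already_processed

async def main():
    if check_file_already_processed("demo-project", "contracts", "2024.xlsx"):
        print("already processed, skipping")
        return
    # Produces document.txt, pagination.txt, and embedding.pkl under processed/{group}/{stem}/
    result = await process_single_file(
        "demo-project", "contracts", "2024.xlsx",
        original_path="/data/contracts/2024.xlsx",
        local_path="projects/demo-project/files/contracts/2024.xlsx",
    )
    print(result["success"], result["content_size"], result["embedding_chunks"])

asyncio.run(main())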