#!/usr/bin/env python3
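"""Organize uploaded project files into a per-document dataset layout.

Each file under projects/{unique_id}/files/ is copied to
projects/{unique_id}/dataset/{file_stem}/document.txt; each document can
then be split into pages (pagination.txt) and embedded (embedding.pkl).
"""
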
import os
import shutil
from pathlib import Path


def is_file_already_processed(target_file: Path, pagination_file: Path, embeddings_file: Path) -> bool:
    """Check whether a file has already been processed: document.txt,
    pagination.txt, and embedding.pkl all exist and are non-empty."""
    if not target_file.exists():
        return False
    # Pagination and embeddings files must exist...
    if pagination_file.exists() and embeddings_file.exists():
        # ...and be non-empty
        if pagination_file.stat().st_size > 0 and embeddings_file.stat().st_size > 0:
            return True
    return False
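

# Expected per-document layout under projects/{unique_id}/dataset/{file_stem}/:
#   document.txt   - the copied source file
#   pagination.txt - page boundaries written by split_document_by_pages
#   embedding.pkl  - pickled embeddings written by embed_document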
def organize_single_project_files(unique_id: str, skip_processed=True):
    """Organize files for a single project from projects/{unique_id}/files
    to projects/{unique_id}/dataset/{file_name}/document.txt."""
    project_dir = Path("projects") / unique_id
    if not project_dir.exists():
        print(f"Project directory not found: {project_dir}")
        return
    print(f"Organizing files for project: {unique_id} (skip_processed={skip_processed})")
    files_dir = project_dir / "files"
    dataset_dir = project_dir / "dataset"
    # Check if the files directory exists and has files
    if not files_dir.exists():
        print(" No files directory found, skipping...")
        return
    files = list(files_dir.glob("*"))
    if not files:
        print(" Files directory is empty, skipping...")
        return
    # Create the dataset directory if it doesn't exist
    dataset_dir.mkdir(exist_ok=True)
    # Copy each file into its own directory
    for file_path in files:
        if file_path.is_file():
            # Use the filename without extension as the directory name
            file_name_without_ext = file_path.stem
            target_dir = dataset_dir / file_name_without_ext
            target_file = target_dir / "document.txt"
            pagination_file = target_dir / "pagination.txt"
            embeddings_file = target_dir / "embedding.pkl"
            # Skip files that are already fully processed
            if skip_processed and is_file_already_processed(target_file, pagination_file, embeddings_file):
                print(f" Skipping already processed file: {file_path.name}")
                continue
            print(f" Copying {file_path.name} -> {target_file.relative_to(project_dir)}")
            # Create the target directory
            target_dir.mkdir(exist_ok=True)
            # Copy and rename the file
            shutil.copy2(str(file_path), str(target_file))
    print(" Files remain in original location (copied to dataset structure)")
    # Process each document.txt file: split pages and generate embeddings
    if not skip_processed:
        import sys
        sys.path.append(os.path.join(os.path.dirname(__file__), 'embedding'))
        from embedding import split_document_by_pages, embed_document
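        # Assumed helper signatures, inferred from the calls below:
        #   split_document_by_pages(document_path, pagination_path) -> list of pages
        #   embed_document(document_path, embeddings_path, chunking_strategy=...) -> dict with a 'chunks' key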
        for file_path in files:
            if file_path.is_file():
                file_name_without_ext = file_path.stem
                target_dir = dataset_dir / file_name_without_ext
                document_file = target_dir / "document.txt"
                pagination_file = target_dir / "pagination.txt"
                embeddings_file = target_dir / "embedding.pkl"
                # Skip if already processed
                if is_file_already_processed(document_file, pagination_file, embeddings_file):
                    print(f" Skipping document processing for already processed file: {file_path.name}")
                    continue
                # Split the document by pages
                print(f" Splitting pages for {document_file.name}")
                try:
                    pages = split_document_by_pages(str(document_file), str(pagination_file))
                    print(f" Generated {len(pages)} pages")
                except Exception as e:
                    print(f" Failed to split pages: {e}")
                    continue
                # Generate embeddings
                print(f" Generating embeddings for {document_file.name}")
                try:
                    # Use the paragraph chunking strategy with default settings
                    embedding_data = embed_document(
                        str(document_file),
                        str(embeddings_file),
                        chunking_strategy='paragraph'
                    )
                    if embedding_data:
                        print(f" Generated embeddings for {len(embedding_data['chunks'])} chunks")
                    else:
                        print(" Failed to generate embeddings")
                except Exception as e:
                    print(f" Failed to generate embeddings: {e}")
        print(f" Document processing completed for project {unique_id}")
    else:
        print(" Skipping document processing (skip_processed=True)")


def organize_dataset_files():
    """Copy files from projects/{unique_id}/files to
    projects/{unique_id}/dataset/{file_name}/document.txt for every project."""
    projects_dir = Path("projects")
    if not projects_dir.exists():
        print("Projects directory not found")
        return
    # Get all project directories (exclude cache and other non-project dirs)
    project_dirs = [d for d in projects_dir.iterdir()
                    if d.is_dir() and d.name != "_cache" and not d.name.startswith(".")]
    for project_dir in project_dirs:
        print(f"\nProcessing project: {project_dir.name}")
        files_dir = project_dir / "files"
        dataset_dir = project_dir / "dataset"
        # Check if the files directory exists and has files
        if not files_dir.exists():
            print(" No files directory found, skipping...")
            continue
        files = list(files_dir.glob("*"))
        if not files:
            print(" Files directory is empty, skipping...")
            continue
        # Create the dataset directory if it doesn't exist
        dataset_dir.mkdir(exist_ok=True)
        # Copy each file into its own directory
        for file_path in files:
            if file_path.is_file():
                # Use the filename without extension as the directory name
                file_name_without_ext = file_path.stem
                target_dir = dataset_dir / file_name_without_ext
                target_file = target_dir / "document.txt"
                print(f" Copying {file_path.name} -> {target_file.relative_to(project_dir)}")
                # Create the target directory
                target_dir.mkdir(exist_ok=True)
                # Copy and rename the file
                shutil.copy2(str(file_path), str(target_file))
        print(" Files remain in original location (copied to dataset structure)")
    print("\nFile organization complete!")


if __name__ == "__main__":
    organize_dataset_files()
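    # To (re)process a single project end-to-end (copy + page split + embeddings),
    # one could instead call, with a hypothetical project id:
    # organize_single_project_files("demo_project", skip_processed=False)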