#!/usr/bin/env python3
"""
Dataset management functions for organizing and processing datasets.
"""

import os
import shutil
import tempfile
import zipfile
from typing import Dict, List

from utils.file_utils import (
    download_file,
    load_processed_files_log,
    save_processed_files_log,
    remove_file_or_directory
)


async def download_dataset_files(unique_id: str, files: Dict[str, List[str]]) -> Dict[str, List[str]]:
    """Download or copy dataset files and organize them by key into
    dataset/{key}/document.txt. Supports zip file extraction and combines
    content using '# Page' separators."""
    if not files:
        return {}

    # Set up directories
    project_dir = os.path.join("projects", unique_id)
    files_dir = os.path.join(project_dir, "files")
    dataset_dir = os.path.join(project_dir, "dataset")

    # Create directories if they don't exist
    os.makedirs(files_dir, exist_ok=True)
    os.makedirs(dataset_dir, exist_ok=True)

    processed_files_by_key = {}

    def extract_zip_file_func(zip_path: str, extract_dir: str) -> List[str]:
        """Extract a zip file and return the list of extracted txt/md files."""
        extracted_files = []
        try:
            with zipfile.ZipFile(zip_path, 'r') as zip_ref:
                zip_ref.extractall(extract_dir)

            # Find all extracted txt and md files (walk variable renamed so it
            # does not shadow the enclosing `files` argument)
            for root, dirs, walk_files in os.walk(extract_dir):
                for file in walk_files:
                    if file.lower().endswith(('.txt', '.md')):
                        extracted_files.append(os.path.join(root, file))

            print(f"Extracted {len(extracted_files)} txt/md files from {zip_path}")
            return extracted_files
        except Exception as e:
            print(f"Error extracting zip file {zip_path}: {str(e)}")
            return []

    # Process each key and its associated files
    for key, file_list in files.items():
        print(f"Processing key '{key}' with {len(file_list)} files")
        processed_files_by_key[key] = []

        # Create target directory for this key
        target_dir = os.path.join(dataset_dir, key)
        os.makedirs(target_dir, exist_ok=True)

        # Check if files are already processed before doing any work
        document_file = os.path.join(target_dir, "document.txt")
        pagination_file = os.path.join(target_dir, "pagination.txt")
        embeddings_file = os.path.join(target_dir, "document_embeddings.pkl")

        already_processed = (
            os.path.exists(document_file) and
            os.path.exists(pagination_file) and
            os.path.exists(embeddings_file) and
            os.path.getsize(document_file) > 0 and
            os.path.getsize(pagination_file) > 0 and
            os.path.getsize(embeddings_file) > 0
        )
        if already_processed:
            print(f" Skipping already processed files for {key}")
            processed_files_by_key[key].append(document_file)
            continue  # Skip to next key

        # Read and combine all files for this key
        combined_content = []
        all_processed_files = []

        for file_path in file_list:
            # Check if it's a URL (remote file) or local file
            is_remote = file_path.startswith(('http://', 'https://'))
            filename = file_path.split("/")[-1] if file_path else f"file_{len(all_processed_files)}"

            # Temporary extraction directory for zip files
            temp_extract_dir = None
            files_to_process = []

            try:
                if is_remote:
                    # Handle remote file
                    temp_file = os.path.join(files_dir, filename)
                    print(f"Downloading {file_path} -> {temp_file}")
                    success = await download_file(file_path, temp_file)
                    if not success:
                        print(f"Failed to download {file_path}")
                        continue

                    # Check if it's a zip file
                    if filename.lower().endswith('.zip'):
                        temp_extract_dir = tempfile.mkdtemp(prefix=f"extract_{key}_")
                        print(f"Extracting zip to temporary directory: {temp_extract_dir}")
                        extracted_files = extract_zip_file_func(temp_file, temp_extract_dir)
                        files_to_process.extend(extracted_files)
                        # The downloaded zip already sits in files_dir, so no
                        # extra copy into the project files directory is needed
                        # (copying a file onto itself raises SameFileError)
                    else:
                        files_to_process.append(temp_file)
                else:
                    # Handle local file
                    if not os.path.exists(file_path):
                        print(f"Local file not found: {file_path}")
                        continue

                    if filename.lower().endswith('.zip'):
                        # Copy to project directory first
                        local_zip_path = os.path.join(files_dir, filename)
                        shutil.copy2(file_path, local_zip_path)
                        print(f"Copied local zip file: {file_path} -> {local_zip_path}")

                        # Extract zip file
                        temp_extract_dir = tempfile.mkdtemp(prefix=f"extract_{key}_")
                        print(f"Extracting local zip to temporary directory: {temp_extract_dir}")
                        extracted_files = extract_zip_file_func(local_zip_path, temp_extract_dir)
                        files_to_process.extend(extracted_files)
                    else:
                        # Copy non-zip file directly
                        dest_file = os.path.join(files_dir, filename)
                        shutil.copy2(file_path, dest_file)
                        files_to_process.append(dest_file)
                        print(f"Copied local file: {file_path} -> {dest_file}")

                # Process all files (extracted from zip or single file)
                for process_file_path in files_to_process:
                    try:
                        with open(process_file_path, 'r', encoding='utf-8') as f:
                            content = f.read().strip()
                        if content:
                            # Add file content with page separator
                            base_filename = os.path.basename(process_file_path)
                            combined_content.append(f"# Page {base_filename}")
                            combined_content.append(content)
                    except Exception as e:
                        print(f"Failed to read file content from {process_file_path}: {str(e)}")

            except Exception as e:
                print(f"Error processing file {file_path}: {str(e)}")
            finally:
                # Clean up temporary extraction directory
                if temp_extract_dir and os.path.exists(temp_extract_dir):
                    try:
                        shutil.rmtree(temp_extract_dir)
                        print(f"Cleaned up temporary directory: {temp_extract_dir}")
                    except Exception as e:
                        print(f"Failed to clean up temporary directory {temp_extract_dir}: {str(e)}")

        # Write combined content to dataset/{key}/document.txt
        if combined_content:
            try:
                with open(document_file, 'w', encoding='utf-8') as f:
                    f.write('\n\n'.join(combined_content))
                print(f"Created combined document: {document_file}")

                # Generate pagination and embeddings for the combined document
                try:
                    import sys
                    sys.path.append(os.path.join(os.path.dirname(__file__), '..', 'embedding'))
                    from embedding import split_document_by_pages, embed_document

                    # Generate pagination
                    print(f" Generating pagination for {key}")
                    pages = split_document_by_pages(document_file, pagination_file)
                    print(f" Generated {len(pages)} pages")

                    # Generate embeddings
                    print(f" Generating embeddings for {key}")
                    local_model_path = "./models/paraphrase-multilingual-MiniLM-L12-v2"
                    if not os.path.exists(local_model_path):
                        local_model_path = None  # Fall back to the HuggingFace model

                    # Use paragraph chunking strategy with default settings
                    embedding_data = embed_document(
                        document_file,
                        embeddings_file,
                        chunking_strategy='paragraph',
                        model_path=local_model_path
                    )
                    if embedding_data:
                        print(f" Generated embeddings for {len(embedding_data['chunks'])} chunks")
                        # Add to processed files only after successful embedding
                        processed_files_by_key[key].append(document_file)
                    else:
                        print(" Failed to generate embeddings")
                except Exception as e:
                    print(f" Failed to generate pagination/embeddings for {key}: {str(e)}")
            except Exception as e:
                print(f"Failed to write combined document: {str(e)}")

    # Load existing log
    processed_log = load_processed_files_log(unique_id)

    # Update log with newly processed files
    for key, file_list in files.items():
        if key not in processed_log:
            processed_log[key] = {}
        # Use this key's own document file for the timestamp, not the stale
        # `document_file` left over from the processing loop above
        key_document = os.path.join(dataset_dir, key, "document.txt")
        for file_path in file_list:
            filename = os.path.basename(file_path)
            processed_log[key][filename] = {
                "original_path": file_path,
                "processed_at": str(os.path.getmtime(key_document) if os.path.exists(key_document) else 0),
                "status": "processed" if processed_files_by_key.get(key) else "failed"
            }

    # Save the updated processed log
    save_processed_files_log(unique_id, processed_log)

    return processed_files_by_key
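
# For reference, the combined document.txt written above concatenates each
# input file behind a '# Page <filename>' separator, joined by blank lines.
# A hypothetical two-file example (file names and contents are illustrative):
#
#   # Page chapter1.txt
#
#   First file's content...
#
#   # Page chapter2.md
#
#   Second file's content...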


def generate_dataset_structure(unique_id: str) -> str:
    """Generate a string representation of the dataset structure."""
    dataset_dir = os.path.join("projects", unique_id, "dataset")
    structure = []

    def add_directory_contents(dir_path: str, prefix: str = ""):
        try:
            items = sorted(os.listdir(dir_path))
            for i, item in enumerate(items):
                item_path = os.path.join(dir_path, item)
                is_last = i == len(items) - 1
                current_prefix = "└── " if is_last else "├── "
                structure.append(f"{prefix}{current_prefix}{item}")

                if os.path.isdir(item_path):
                    next_prefix = prefix + ("    " if is_last else "│   ")
                    add_directory_contents(item_path, next_prefix)
        except Exception as e:
            structure.append(f"{prefix}└── Error: {str(e)}")

    if os.path.exists(dataset_dir):
        structure.append("dataset/")
        add_directory_contents(dataset_dir, "")
    else:
        structure.append("dataset/ (not found)")

    return "\n".join(structure)
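
# Example of the rendered tree (directory contents are hypothetical; the
# per-key files match what download_dataset_files writes):
#
#   dataset/
#   ├── notes
#   │   ├── document.txt
#   │   ├── document_embeddings.pkl
#   │   └── pagination.txt
#   └── report
#       └── document.txt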


def remove_dataset_directory(unique_id: str, filename_without_ext: str):
    """Remove a specific dataset directory."""
    dataset_path = os.path.join("projects", unique_id, "dataset", filename_without_ext)
    remove_file_or_directory(dataset_path)


def remove_dataset_directory_by_key(unique_id: str, key: str):
    """Remove a dataset directory by key."""
    dataset_path = os.path.join("projects", unique_id, "dataset", key)
    remove_file_or_directory(dataset_path)
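

if __name__ == "__main__":
    # Minimal smoke-test sketch, assuming this runs from the repository root so
    # that "projects/", "utils/", and the sibling embedding module resolve. The
    # project id and input file below are hypothetical placeholders.
    import asyncio

    demo_id = "demo-project"
    demo_files = {"notes": ["/tmp/notes.txt"]}  # hypothetical local text file

    # download_dataset_files is a coroutine, so drive it with asyncio.run
    processed = asyncio.run(download_dataset_files(demo_id, demo_files))
    print(f"Processed files by key: {processed}")
    print(generate_dataset_structure(demo_id))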