#!/usr/bin/env python3
"""
File utility functions for file processing, downloading, and management.
"""

import hashlib
import json
import os
import shutil
import time
import zipfile
from pathlib import Path
from typing import Dict, List, Optional

import aiofiles
import aiohttp


async def download_file(url: str, destination_path: str) -> bool:
    """Download file from URL asynchronously"""
    try:
        async with aiohttp.ClientSession() as session:
            async with session.get(url) as response:
                if response.status == 200:
                    async with aiofiles.open(destination_path, 'wb') as f:
                        async for chunk in response.content.iter_chunked(8192):
                            await f.write(chunk)
                    return True
                else:
                    print(f"Failed to download {url}, status code: {response.status}")
                    return False
    except Exception as e:
        print(f"Error downloading {url}: {str(e)}")
        return False


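# Illustrative usage (not executed on import): download_file is a coroutine, so it
# must be awaited or driven with asyncio.run(). The URL and destination path below
# are placeholders, not values used elsewhere in this module.
#
#     import asyncio
#     ok = asyncio.run(download_file("https://example.com/report.pdf", "/tmp/report.pdf"))
#     print("downloaded" if ok else "download failed")
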
def get_file_hash(file_path: str) -> str:
    """Return the MD5 hex digest of a file path/URL string (not of the file contents)"""
    return hashlib.md5(file_path.encode('utf-8')).hexdigest()


def remove_file_or_directory(path: str):
    """Remove file or directory recursively"""
    try:
        if os.path.exists(path):
            if os.path.isfile(path):
                os.remove(path)
            elif os.path.isdir(path):
                shutil.rmtree(path)
            print(f"Removed: {path}")
        else:
            print(f"Path does not exist: {path}")
    except Exception as e:
        print(f"Error removing {path}: {str(e)}")


def extract_zip_file(zip_path: str, extract_dir: str) -> List[str]:
    """Extract a zip archive and return the list of extracted txt/md/xlsx/xls/csv files"""
    extracted_files = []
    try:
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall(extract_dir)

        # Find all extracted txt, md, xlsx, xls, and csv files
        for root, dirs, files in os.walk(extract_dir):
            for file in files:
                if file.lower().endswith(('.txt', '.md', '.xlsx', '.xls', '.csv')):
                    extracted_files.append(os.path.join(root, file))

        print(f"Extracted {len(extracted_files)} txt/md/xlsx/csv files from {zip_path}")
        return extracted_files

    except Exception as e:
        print(f"Error extracting zip file {zip_path}: {str(e)}")
        return []


def get_document_preview(document_path: str, max_lines: int = 10) -> str:
    """Get preview of document content"""
    try:
        with open(document_path, 'r', encoding='utf-8') as f:
            lines = []
            for i, line in enumerate(f):
                if i >= max_lines:
                    break
                lines.append(line.rstrip())
            return '\n'.join(lines)
    except Exception as e:
        return f"Error reading document: {str(e)}"


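# Illustrative usage: unpack an uploaded archive and preview the first extracted
# document. The archive name is a placeholder, and tempfile is used only in this sketch.
#
#     import tempfile
#     with tempfile.TemporaryDirectory() as tmp:
#         docs = extract_zip_file("upload.zip", tmp)
#         if docs:
#             print(get_document_preview(docs[0], max_lines=5))
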
def is_file_already_processed(target_file: Path, pagination_file: Path, embeddings_file: Path) -> bool:
    """Check if a file has already been processed (document.txt, pagination.txt, and embeddings exist)"""
    if not target_file.exists():
        return False

    # Check if pagination and embeddings files exist and are not empty
    if pagination_file.exists() and embeddings_file.exists():
        # Check file sizes to ensure they're not empty
        if pagination_file.stat().st_size > 0 and embeddings_file.stat().st_size > 0:
            return True

    return False


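# Illustrative usage: the check takes pathlib.Path objects pointing at the processed
# artifacts of a single document. The directory layout and file names shown here are
# assumed for the example only.
#
#     base = Path("projects/data/demo-project/processed/report")
#     done = is_file_already_processed(base / "document.txt",
#                                      base / "pagination.txt",
#                                      base / "embeddings.npy")
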
def load_processed_files_log(unique_id: str) -> Dict[str, Dict]:
    """Load processed files log for a project"""
    log_file = os.path.join("projects", "data", unique_id, "processed_files.json")
    if os.path.exists(log_file):
        try:
            with open(log_file, 'r', encoding='utf-8') as f:
                return json.load(f)
        except Exception as e:
            print(f"Error loading processed files log: {e}")
    return {}


def save_processed_files_log(unique_id: str, processed_log: Dict[str, Dict]):
    """Save processed files log for a project (legacy function)"""
    log_file = os.path.join("projects", "data", unique_id, "processed_files.json")
    try:
        os.makedirs(os.path.dirname(log_file), exist_ok=True)
        with open(log_file, 'w', encoding='utf-8') as f:
            json.dump(processed_log, f, ensure_ascii=False, indent=2)
    except Exception as e:
        print(f"Error saving processed files log: {e}")


def get_processing_log(unique_id: str) -> Dict:
    """Get the comprehensive processing log for a project"""
    log_file = os.path.join("projects", "data", unique_id, "processing_log.json")
    if os.path.exists(log_file):
        try:
            with open(log_file, 'r', encoding='utf-8') as f:
                return json.load(f)
        except Exception as e:
            print(f"Error loading processing log: {e}")
    return {}


def save_project_status(unique_id: str, status: Dict):
    """Save project processing status"""
    status_file = os.path.join("projects", "data", unique_id, "status.json")
    try:
        os.makedirs(os.path.dirname(status_file), exist_ok=True)
        with open(status_file, 'w', encoding='utf-8') as f:
            json.dump(status, f, ensure_ascii=False, indent=2)
    except Exception as e:
        print(f"Error saving project status: {e}")


def load_project_status(unique_id: str) -> Dict:
    """Load project processing status"""
    status_file = os.path.join("projects", "data", unique_id, "status.json")
    if os.path.exists(status_file):
        try:
            with open(status_file, 'r', encoding='utf-8') as f:
                return json.load(f)
        except Exception as e:
            print(f"Error loading project status: {e}")
    return {}


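# Illustrative usage: the status helpers persist plain dictionaries as JSON under
# projects/data/<unique_id>/status.json. The project id and payload below are
# placeholders.
#
#     save_project_status("demo-project", {"stage": "embedding", "progress": 0.5})
#     print(load_project_status("demo-project").get("stage"))
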
def get_file_metadata(file_path: str) -> Dict:
    """Get metadata for a file"""
    try:
        if not os.path.exists(file_path):
            return {"exists": False}

        stat = os.stat(file_path)
        return {
            "exists": True,
            "size": stat.st_size,
            "modified_time": stat.st_mtime,
            "created_time": stat.st_ctime,
            "is_file": os.path.isfile(file_path),
            "is_directory": os.path.isdir(file_path)
        }
    except Exception as e:
        return {"exists": False, "error": str(e)}


def update_file_processing_status(unique_id: str, group_name: str, filename: str, status: Dict):
    """Update processing status for a specific file"""
    status_file = os.path.join("projects", "data", unique_id, "file_status.json")

    try:
        # Load existing status
        if os.path.exists(status_file):
            with open(status_file, 'r', encoding='utf-8') as f:
                file_status = json.load(f)
        else:
            file_status = {}

        # Ensure structure exists
        if group_name not in file_status:
            file_status[group_name] = {}

        # Update status, stamping the entry with the time of this update
        file_status[group_name][filename] = {
            **status,
            "updated_at": str(time.time())
        }

        # Save updated status
        os.makedirs(os.path.dirname(status_file), exist_ok=True)
        with open(status_file, 'w', encoding='utf-8') as f:
            json.dump(file_status, f, ensure_ascii=False, indent=2)

    except Exception as e:
        print(f"Error updating file processing status: {e}")


def get_file_processing_status(unique_id: str, group_name: Optional[str] = None, filename: Optional[str] = None) -> Dict:
    """Get processing status for all files, one group, or a single file"""
    status_file = os.path.join("projects", "data", unique_id, "file_status.json")

    if not os.path.exists(status_file):
        return {}

    try:
        with open(status_file, 'r', encoding='utf-8') as f:
            file_status = json.load(f)

        # Filter by group and filename if provided
        if group_name:
            if group_name not in file_status:
                return {}

            if filename:
                return file_status[group_name].get(filename, {})
            else:
                return file_status[group_name]

        return file_status

    except Exception as e:
        print(f"Error getting file processing status: {e}")
        return {}


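# Illustrative usage: record and read back per-file progress. The project id,
# group, and filename are placeholders.
#
#     update_file_processing_status("demo-project", "contracts", "lease.txt",
#                                   {"stage": "embedded", "ok": True})
#     print(get_file_processing_status("demo-project", "contracts", "lease.txt"))
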
def calculate_directory_size(directory_path: str) -> int:
    """Calculate total size of a directory recursively, in bytes"""
    total_size = 0
    try:
        for dirpath, dirnames, filenames in os.walk(directory_path):
            for filename in filenames:
                file_path = os.path.join(dirpath, filename)
                if os.path.exists(file_path):
                    total_size += os.path.getsize(file_path)
    except Exception as e:
        print(f"Error calculating directory size: {e}")

    return total_size


def get_project_statistics(unique_id: str) -> Dict:
    """Get comprehensive statistics for a project"""
    project_dir = os.path.join("projects", "data", unique_id)

    if not os.path.exists(project_dir):
        return {"project_exists": False}

    stats = {
        "project_exists": True,
        "unique_id": unique_id,
        "directories": {},
        "total_files": 0,
        "total_size": 0
    }

    # Check each directory
    directories = ["files", "processed", "dataset"]

    for dir_name in directories:
        dir_path = os.path.join(project_dir, dir_name)
        if os.path.exists(dir_path):
            dir_size = calculate_directory_size(dir_path)
            dir_files = 0

            for root, dirs, files in os.walk(dir_path):
                dir_files += len(files)

            stats["directories"][dir_name] = {
                "exists": True,
                "size": dir_size,
                "files": dir_files
            }

            stats["total_files"] += dir_files
            stats["total_size"] += dir_size
        else:
            stats["directories"][dir_name] = {
                "exists": False,
                "size": 0,
                "files": 0
            }

    return stats
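

if __name__ == "__main__":
    # Minimal smoke-test sketch: print statistics for a placeholder project id.
    # "demo-project" is an assumed id for illustration; it does not need to exist,
    # in which case the result is simply {"project_exists": False}.
    print(get_project_statistics("demo-project"))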