maxkb/dev/sync_mineru_dirs.py
2025-08-25 01:20:33 +08:00

281 lines
9.4 KiB
Python
Executable File

#!/usr/bin/env python3
"""
双向同步脚本:在两个 mineru 目录之间进行文件同步
目录1: ~/Documents/felo/gptbase-parser/loader/mineru
目录2: /Users/moshui/Documents/felo/moshui/MaxKB/apps/common/handle/impl/mineru
"""
import os
import sys
import time
import shutil
import hashlib
import argparse
from pathlib import Path
from datetime import datetime
from typing import Set, Tuple, Optional
import subprocess
# watchdog is an optional third-party dependency that is only needed for the
# --watch mode.  Record whether it could be imported so the rest of the
# script can degrade gracefully instead of failing at import time.
try:
    from watchdog.observers import Observer
    from watchdog.events import FileSystemEventHandler
except ImportError:
    WATCHDOG_AVAILABLE = False
    print("警告: watchdog 未安装,实时监控功能不可用")
    print("运行 'pip install watchdog' 来启用实时监控功能")
else:
    WATCHDOG_AVAILABLE = True
class DirectorySyncer:
    """Two-way, newest-wins file synchroniser for two directory trees.

    Files that exist on only one side are copied to the other side.  Files
    that exist on both sides with different content are overwritten by the
    copy that has the newer modification time.  Deletions are never
    propagated: a file removed on one side is restored from the other.

    Args:
        dir1: First directory (``~`` is expanded and the path is resolved).
        dir2: Second directory (same treatment as ``dir1``).
        verbose: Print one line for every copy / update / skip decision.
        dry_run: Report what would be copied without creating directories or
            copying any file.
    """

    def __init__(self, dir1: str, dir2: str, verbose: bool = False,
                 dry_run: bool = False):
        self.dir1 = Path(dir1).expanduser().resolve()
        self.dir2 = Path(dir2).expanduser().resolve()
        self.verbose = verbose
        self.dry_run = dry_run
        # Name patterns that are never synchronised.  Supported forms are an
        # exact name, "*suffix" and "prefix*".
        self.exclude_patterns = {
            '__pycache__',
            '.DS_Store',
            '*.pyc',
            '*.pyo',
            '.git',
            '.idea',
            '.vscode',
            '*.swp',
            '*.swo',
            '*~',
        }

    def _name_excluded(self, name: str) -> bool:
        """Return True if one path component matches an exclude pattern."""
        for pattern in self.exclude_patterns:
            if pattern.startswith('*'):
                if name.endswith(pattern[1:]):
                    return True
            elif pattern.endswith('*'):
                if name.startswith(pattern[:-1]):
                    return True
            elif name == pattern:
                return True
        return False

    def _relative_excluded(self, relative_path: Path) -> bool:
        """Return True if ANY component of a root-relative path is excluded.

        Checking every component (not just the file name) is what keeps the
        contents of directories such as ``.git`` out of the sync.
        """
        return any(self._name_excluded(part) for part in relative_path.parts)

    def _event_excluded(self, path: Path) -> bool:
        """Exclusion test for an absolute path reported by the file watcher."""
        for root in (self.dir1, self.dir2):
            try:
                relative_path = path.relative_to(root)
            except ValueError:
                continue
            return self._relative_excluded(relative_path)
        # Path is outside both roots: fall back to the name-only check.
        return self.should_exclude(path)

    def should_exclude(self, path: Path) -> bool:
        """Return True if the final component of ``path`` matches an exclude pattern."""
        return self._name_excluded(path.name)

    def get_file_hash(self, filepath: Path) -> str:
        """Return the MD5 hex digest of a file, or ``""`` if it cannot be read.

        MD5 is used purely as a content fingerprint, not for security.
        """
        digest = hashlib.md5()
        try:
            with open(filepath, "rb") as f:
                for chunk in iter(lambda: f.read(65536), b""):
                    digest.update(chunk)
        except OSError as e:
            if self.verbose:
                print(f"无法计算 {filepath} 的哈希值: {e}")
            return ""
        return digest.hexdigest()

    def get_relative_files(self, directory: Path) -> Set[Path]:
        """Return the set of non-excluded files under ``directory``.

        Paths in the result are relative to ``directory``.  Excluded
        directories are pruned during the walk, so nothing beneath them is
        visited or returned.
        """
        files = set()
        for root, dirnames, filenames in os.walk(directory):
            # In-place assignment tells os.walk not to descend into these.
            dirnames[:] = [d for d in dirnames if not self._name_excluded(d)]
            root_path = Path(root)
            for filename in filenames:
                if self._name_excluded(filename):
                    continue
                full_path = root_path / filename
                # is_file() filters out broken symlinks and special files.
                if full_path.is_file():
                    files.add(full_path.relative_to(directory))
        return files

    def _copy(self, source_file: Path, dest_file: Path) -> None:
        """Copy one file (with metadata), creating parent directories.

        Does nothing at all in dry-run mode.
        """
        if self.dry_run:
            return
        dest_file.parent.mkdir(parents=True, exist_ok=True)
        shutil.copy2(source_file, dest_file)

    def sync_file(self, source: Path, dest: Path, relative_path: Path) -> bool:
        """Bring ``dest / relative_path`` up to date from ``source / relative_path``.

        The file is copied when the destination is missing, or when the
        contents differ and the source has the strictly newer mtime.

        Returns:
            True if the file was copied (or would be, in dry-run mode),
            False if nothing was done or an error occurred.
        """
        source_file = source / relative_path
        dest_file = dest / relative_path
        try:
            if not dest_file.exists():
                self._copy(source_file, dest_file)
                if self.verbose:
                    print(f"复制: {relative_path}")
                return True

            source_hash = self.get_file_hash(source_file)
            dest_hash = self.get_file_hash(dest_file)
            if not source_hash or not dest_hash:
                # An empty digest means the file could not be read; never
                # treat that as "content differs" (or as "content equal").
                print(f"错误同步 {relative_path}: 无法计算文件哈希值")
                return False
            if source_hash == dest_hash:
                return False

            if source_file.stat().st_mtime > dest_file.stat().st_mtime:
                self._copy(source_file, dest_file)
                if self.verbose:
                    print(f"更新: {relative_path} (源文件较新)")
                return True
            if self.verbose:
                print(f"跳过: {relative_path} (目标文件较新或相同)")
            return False
        except OSError as e:
            print(f"错误同步 {relative_path}: {e}")
            return False

    def sync_directories(self) -> Tuple[int, int]:
        """Run one full two-way synchronisation pass.

        Returns:
            ``(synced_count, deleted_count)``.  ``deleted_count`` is always 0
            because deletions are not propagated; it is kept so callers can
            keep unpacking a pair.
        """
        print("\n开始同步...")
        print(f"目录1: {self.dir1}")
        print(f"目录2: {self.dir2}")
        print("-" * 60)

        files1 = self.get_relative_files(self.dir1)
        files2 = self.get_relative_files(self.dir2)
        synced_count = 0
        deleted_count = 0

        # Sorted so the output order is deterministic between runs.
        for rel_path in sorted(files1 | files2):
            file1 = self.dir1 / rel_path
            file2 = self.dir2 / rel_path
            exists1 = file1.exists()
            exists2 = file2.exists()

            if exists1 and not exists2:
                if self.sync_file(self.dir1, self.dir2, rel_path):
                    synced_count += 1
                    print(f"{rel_path}")
            elif exists2 and not exists1:
                if self.sync_file(self.dir2, self.dir1, rel_path):
                    synced_count += 1
                    print(f"{rel_path}")
            elif exists1 and exists2:
                # Only pick the direction here; sync_file does the content
                # comparison, so each file is hashed once per pass.
                try:
                    dir1_newer = file1.stat().st_mtime > file2.stat().st_mtime
                except OSError as e:
                    # The file vanished between the scan and now.
                    print(f"错误同步 {rel_path}: {e}")
                    continue
                if dir1_newer:
                    source, dest = self.dir1, self.dir2
                else:
                    source, dest = self.dir2, self.dir1
                if self.sync_file(source, dest, rel_path):
                    synced_count += 1
                    print(f"{rel_path} (更新)")
        return synced_count, deleted_count

    def watch(self):
        """Watch both directories and re-sync after changes until Ctrl+C.

        The watchdog handler only records that something changed; the actual
        sync runs in this method's loop once no new event has arrived for
        ``sync_delay`` seconds.  That way a burst of events causes a single
        sync and no change is dropped.
        """
        if not WATCHDOG_AVAILABLE:
            print("错误: watchdog 模块未安装")
            print("请运行: pip install watchdog")
            return

        class SyncHandler(FileSystemEventHandler):
            # Event types that mean file content or layout changed.  Pure
            # "opened"/"closed" notifications are ignored, otherwise reading
            # files during a sync would itself schedule another sync.
            RELEVANT = ("created", "modified", "moved", "deleted")

            def __init__(self, syncer):
                super().__init__()
                self.syncer = syncer
                self.sync_delay = 1
                self.pending = False
                self.last_event = 0.0
                self.last_name = ""

            def on_any_event(self, event):
                if event.is_directory:
                    return
                if event.event_type not in self.RELEVANT:
                    return
                path = Path(event.src_path)
                if self.syncer._event_excluded(path):
                    return
                self.last_name = path.name
                self.last_event = time.time()
                self.pending = True

        event_handler = SyncHandler(self)
        observer = Observer()
        observer.schedule(event_handler, str(self.dir1), recursive=True)
        observer.schedule(event_handler, str(self.dir2), recursive=True)
        observer.start()
        print("\n监控模式已启动...")
        print("正在监控两个目录的变化,按 Ctrl+C 退出")
        print("-" * 60)
        try:
            while True:
                time.sleep(1)
                quiet_for = time.time() - event_handler.last_event
                if event_handler.pending and quiet_for >= event_handler.sync_delay:
                    # Clear the flag BEFORE syncing so that changes arriving
                    # during the sync schedule another pass.
                    event_handler.pending = False
                    print(f"\n检测到变化: {event_handler.last_name}")
                    self.sync_directories()
        except KeyboardInterrupt:
            print("\n监控已停止")
        finally:
            # Always shut the observer thread down, whatever ended the loop.
            observer.stop()
            observer.join()
def main():
    """Command-line entry point: parse options and run a one-shot or watched sync.

    Exits with status 1 if either directory is missing, or if ``--dry-run``
    is requested but the syncer cannot honour it (so a "simulation" can never
    modify files).
    """
    parser = argparse.ArgumentParser(
        description="双向同步两个 mineru 目录",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
示例:
%(prog)s # 执行一次同步
%(prog)s --watch # 启动实时监控模式
%(prog)s --verbose # 显示详细信息
%(prog)s --dry-run # 模拟运行,不实际同步
"""
    )
    parser.add_argument(
        "--watch", "-w",
        action="store_true",
        help="启动监控模式,实时同步文件变化"
    )
    parser.add_argument(
        "--verbose", "-v",
        action="store_true",
        help="显示详细输出"
    )
    parser.add_argument(
        "--dry-run", "-n",
        action="store_true",
        help="模拟运行,只显示将要执行的操作"
    )
    parser.add_argument(
        "--dir1",
        default="~/Documents/felo/gptbase-parser/loader/mineru",
        help="第一个目录路径 (默认: ~/Documents/felo/gptbase-parser/loader/mineru)"
    )
    parser.add_argument(
        "--dir2",
        default="/Users/moshui/Documents/felo/moshui/MaxKB/apps/common/handle/impl/mineru",
        help="第二个目录路径"
    )
    args = parser.parse_args()

    syncer = DirectorySyncer(args.dir1, args.dir2, verbose=args.verbose)

    # is_dir() also rejects a path that exists but is a regular file.
    for directory in (syncer.dir1, syncer.dir2):
        if not directory.is_dir():
            print(f"错误: 目录不存在 - {directory}")
            sys.exit(1)

    if args.dry_run:
        # Previously this branch only turned on verbose output and then ran a
        # real sync.  Refuse to continue unless the syncer really supports a
        # dry run, so a simulation never modifies files.
        if not hasattr(syncer, "dry_run"):
            print("错误: 当前 DirectorySyncer 不支持 --dry-run,已中止以免修改文件")
            sys.exit(1)
        print("模拟运行模式 - 不会实际修改文件")
        syncer.dry_run = True
        syncer.verbose = True

    if args.watch:
        # Bring both sides in line once before reacting to live changes.
        syncer.sync_directories()
        syncer.watch()
    else:
        synced, deleted = syncer.sync_directories()
        print("-" * 60)
        if args.dry_run:
            print(f"模拟完成: {synced} 个文件将被同步")
        else:
            print(f"同步完成: {synced} 个文件已同步")
        if deleted > 0:
            print(f" {deleted} 个文件已删除")
# Run the CLI only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()