qwen_agent/skills/developing/poem-storyboard/scripts/merge_videos.py
2026-06-14 15:27:59 +08:00

174 lines
5.9 KiB
Python

#!/usr/bin/env python3
"""Merge per-shot videos into a single film, ordered by storyboard.json.

Because adjacent shots share keyframes (shot i's last frame == shot i+1's first
frame), a plain hard concat is already seamless. An optional crossfade mode is
provided for cases where soft dissolves are explicitly wanted.

Requires ffmpeg on PATH; crossfade mode also calls ffprobe to read clip
durations.

Examples:
    # Hard concat in storyboard order (default, seamless via shared frames)
    python merge_videos.py --storyboard storyboard.json --dir ./outputs --out final.mp4

    # Crossfade between every shot (0.5s dissolve)
    python merge_videos.py --storyboard storyboard.json --dir ./outputs \
        --out final.mp4 --crossfade 0.5
"""
import argparse
import json
import os
import shutil
import subprocess
import sys
import tempfile
def log(msg):
    """Print *msg* as a single line on standard error.

    Diagnostics go to stderr so that stdout carries only the final
    ``SAVED: ...`` line printed by the script.
    """
    stream = sys.stderr
    print(msg, file=stream)
def ensure_ffmpeg():
    """Abort the process with exit status 1 unless ``ffmpeg`` is on PATH.

    Returns ``None`` when the executable is found; otherwise logs an error
    message and calls ``sys.exit(1)``.
    """
    # Guard clause: nothing to do when the executable can be located.
    if shutil.which("ffmpeg") is not None:
        return
    log("ERROR: ffmpeg not found on PATH. Install it first "
        "(macOS: brew install ffmpeg).")
    sys.exit(1)
def probe_duration(path):
    """Return the duration of the clip at *path*, in seconds, via ffprobe.

    Raises:
        subprocess.CalledProcessError: If ffprobe exits with a non-zero status.
        ValueError: If ffprobe's output cannot be parsed as a float.
    """
    command = [
        "ffprobe", "-v", "error",
        # Ask only for the container-level duration ...
        "-show_entries", "format=duration",
        # ... printed as a bare value, without section wrappers or a key.
        "-of", "default=noprint_wrappers=1:nokey=1",
        path,
    ]
    completed = subprocess.run(
        command, capture_output=True, text=True, check=True,
    )
    seconds_text = completed.stdout.strip()
    return float(seconds_text)
def resolve_shot_files(storyboard, video_dir):
    """Map each shot to its video file in storyboard order.

    Args:
        storyboard: Parsed storyboard mapping. ``storyboard["shots"]`` is
            walked in order and each shot's ``"id"`` selects a file name.
        video_dir: Directory that is searched for the per-shot videos.

    Returns:
        A list with one existing file path per shot, in storyboard order.

    If no candidate file exists for a shot, an error is logged and the
    process exits with status 1.
    """
    files = []
    for shot in storyboard["shots"]:
        sid = shot["id"]
        # Accept common naming patterns produced by the agnes step.
        candidates = [f"shot_{sid}.mp4", f"shot{sid}.mp4", f"{sid}.mp4"]
        # The zero-padded variant only makes sense for numeric ids. The ``d``
        # format code raises ValueError for anything that is not an int, so
        # string ids (common in JSON) must not reach it unconverted.
        if isinstance(sid, int):
            candidates.append(f"shot_{sid:02d}.mp4")
        elif isinstance(sid, str) and sid.isdecimal():
            candidates.append(f"shot_{int(sid):02d}.mp4")

        paths = (os.path.join(video_dir, name) for name in candidates)
        found = next((p for p in paths if os.path.exists(p)), None)
        if found is None:
            log(f"ERROR: no video file for shot id={sid} in {video_dir}. "
                f"Tried: {candidates}")
            sys.exit(1)
        files.append(found)
    return files
def hard_concat(files, out, width, height, fps):
    """Re-encode every clip to a common format, then concat losslessly.

    Each input is encoded once (libx264, yuv420p, audio dropped) after being
    scaled to fit ``width`` x ``height`` with its aspect ratio preserved,
    padded to exactly that size with the picture centered, and resampled to
    ``fps``. The intermediates are then joined with ffmpeg's concat demuxer
    using a stream copy, so the video is not encoded a second time.

    Args:
        files: Paths of the input clips, in playback order.
        out: Path of the merged video; an existing file is overwritten.
        width: Target frame width in pixels.
        height: Target frame height in pixels.
        fps: Target frame rate.

    Raises:
        ValueError: If ``files`` is empty.
        subprocess.CalledProcessError: If an ffmpeg invocation fails.
    """
    if not files:
        raise ValueError("hard_concat needs at least one input clip")

    video_filter = (
        f"scale={width}:{height}:force_original_aspect_ratio=decrease,"
        f"pad={width}:{height}:(ow-iw)/2:(oh-ih)/2,fps={fps}"
    )
    tmpdir = tempfile.mkdtemp(prefix="poem_merge_")
    try:
        normalized = []
        for i, src in enumerate(files):
            norm_path = os.path.join(tmpdir, f"n_{i:03d}.mp4")
            subprocess.run(
                ["ffmpeg", "-y", "-i", src,
                 "-vf", video_filter,
                 "-c:v", "libx264", "-pix_fmt", "yuv420p", "-an", norm_path],
                check=True,
            )
            normalized.append(norm_path)

        listfile = os.path.join(tmpdir, "list.txt")
        # Written as UTF-8 rather than the locale encoding so that non-ASCII
        # temp paths survive on every platform.
        with open(listfile, "w", encoding="utf-8") as fh:
            for norm_path in normalized:
                # Inside a single-quoted concat entry a literal quote has to
                # be written as '\'' (close quote, escaped quote, reopen).
                escaped = norm_path.replace("'", "'\\''")
                fh.write(f"file '{escaped}'\n")

        # All intermediates share codec, size, pixel format and frame rate,
        # so they can be stream-copied instead of encoded a second time.
        subprocess.run(
            ["ffmpeg", "-y", "-f", "concat", "-safe", "0", "-i", listfile,
             "-c", "copy", out],
            check=True,
        )
    finally:
        # Best-effort cleanup: never mask the real error with a cleanup one.
        shutil.rmtree(tmpdir, ignore_errors=True)
def crossfade_concat(files, out, width, height, fps, fade):
    """Chain clips with an xfade dissolve of `fade` seconds between each.

    Every input is first normalized inside the filter graph (scaled to fit
    ``width`` x ``height`` with its aspect ratio preserved, padded to that
    size with the picture centered, resampled to ``fps``, common timebase).
    The normalized streams are then folded left to right with ``xfade``.
    A single clip is just normalized and written out, since there is
    nothing to dissolve into.

    Args:
        files: Paths of the input clips, in playback order.
        out: Path of the merged video; an existing file is overwritten.
        width: Target frame width in pixels.
        height: Target frame height in pixels.
        fps: Target frame rate.
        fade: Length of each dissolve in seconds.

    Raises:
        ValueError: If ``files`` is empty.
        subprocess.CalledProcessError: If ffprobe or ffmpeg fails.
    """
    if not files:
        raise ValueError("crossfade_concat needs at least one input clip")

    inputs = []
    for path in files:
        inputs += ["-i", path]

    # Normalize each input stream.
    norm_filters = [
        f"[{i}:v]scale={width}:{height}:force_original_aspect_ratio=decrease,"
        f"pad={width}:{height}:(ow-iw)/2:(oh-ih)/2,fps={fps},"
        f"settb=AVTB[v{i}]"
        for i in range(len(files))
    ]

    # A transition's offset depends only on the clips *before* it, so the
    # final clip's duration is never needed and is not probed.
    lead_durations = [probe_duration(path) for path in files[:-1]]

    chain = []
    prev = "[v0]"
    offset = 0.0
    last = len(files) - 1
    for i, lead in enumerate(lead_durations, start=1):
        # Each dissolve overlaps the running output by `fade` seconds, so
        # the next transition starts `fade` earlier than a plain sum would.
        offset += lead - fade
        out_label = f"[x{i}]" if i < last else "[vout]"
        chain.append(
            f"{prev}[v{i}]xfade=transition=fade:duration={fade}:"
            f"offset={offset:.3f}{out_label}"
        )
        prev = out_label

    filter_complex = ";".join(norm_filters + chain)
    # `prev` is "[vout]" after at least one xfade and "[v0]" for a lone clip;
    # mapping it (rather than a hard-coded "[vout]") keeps the one-clip case
    # from referencing a label that was never produced.
    subprocess.run(
        ["ffmpeg", "-y", *inputs, "-filter_complex", filter_complex,
         "-map", prev, "-c:v", "libx264", "-pix_fmt", "yuv420p", out],
        check=True,
    )
def _resolution_from_aspect(aspect):
    """Parse a ``"<width>x<height>"`` string into a pair of positive ints.

    The separator is matched case-insensitively. Returns ``None`` when
    *aspect* is not a string of that form.
    """
    if not isinstance(aspect, str):
        return None
    parts = aspect.lower().split("x")
    if len(parts) != 2:
        return None
    try:
        width, height = (int(part) for part in parts)
    except ValueError:
        return None
    if width <= 0 or height <= 0:
        return None
    return width, height


def main():
    """Command-line entry point: merge the storyboard's shots into one video.

    Reads the storyboard JSON, resolves one video file per shot inside
    ``--dir``, and writes the merged result to ``--out`` using a hard concat
    or, when ``--crossfade`` is positive, xfade dissolves. A storyboard
    ``"aspect"`` of the form ``"1152x768"`` replaces ``--width``/``--height``.

    Progress and errors go to stderr; on success ``SAVED: <out>`` is printed
    to stdout. The process exits with status 1 when the storyboard has no
    shots or an ffmpeg/ffprobe call fails.
    """
    ap = argparse.ArgumentParser(description="Merge per-shot videos by storyboard order.")
    ap.add_argument("--storyboard", required=True, help="Path to storyboard.json")
    ap.add_argument("--dir", required=True, help="Directory holding shot_<id>.mp4 files")
    ap.add_argument("--out", required=True, help="Output merged video path")
    ap.add_argument("--crossfade", type=float, default=0.0,
                    help="Crossfade seconds between shots (0 = hard concat, default)")
    ap.add_argument("--width", type=int, default=1152)
    ap.add_argument("--height", type=int, default=768)
    ap.add_argument("--fps", type=int, default=24)
    args = ap.parse_args()

    ensure_ffmpeg()

    # Read as UTF-8 explicitly instead of relying on the platform's locale
    # encoding, which may not be able to decode non-ASCII storyboard text.
    with open(args.storyboard, encoding="utf-8") as fh:
        storyboard = json.load(fh)

    # Derive resolution from aspect if present (e.g. "1152x768").
    aspect = storyboard.get("aspect")
    if aspect:
        resolution = _resolution_from_aspect(aspect)
        if resolution is not None:
            args.width, args.height = resolution
        elif isinstance(aspect, str) and "x" in aspect.lower():
            # Looks like a WxH value but is malformed: say so rather than
            # silently falling back to the command-line resolution.
            log(f"WARNING: ignoring unparsable aspect {aspect!r}; "
                f"using {args.width}x{args.height}")

    files = resolve_shot_files(storyboard, args.dir)
    if not files:
        log("ERROR: storyboard contains no shots; nothing to merge.")
        sys.exit(1)

    mode = f"crossfade {args.crossfade}s" if args.crossfade > 0 else "hard concat"
    log(f"Merging {len(files)} shots -> {args.out} "
        f"({args.width}x{args.height}@{args.fps}fps, "
        f"{mode})")

    try:
        if args.crossfade > 0:
            crossfade_concat(files, args.out, args.width, args.height, args.fps, args.crossfade)
        else:
            hard_concat(files, args.out, args.width, args.height, args.fps)
    except subprocess.CalledProcessError as err:
        # Report the failing tool and exit code instead of a raw traceback.
        log(f"ERROR: {err.cmd[0]} failed with exit status {err.returncode}")
        if err.stderr:
            log(err.stderr.strip())
        sys.exit(1)

    print(f"SAVED: {args.out}")
# Run the CLI only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()