qwen_agent/skills/developing/poem-storyboard/scripts/merge_videos.py

#!/usr/bin/env python3
"""Merge per-shot videos into a single film, ordered by storyboard.json.

Because adjacent shots share keyframes (shot i's last frame == shot i+1's first
frame), a plain hard concat is already seamless. An optional crossfade mode is
provided for cases where soft dissolves are explicitly wanted.

Requires ffmpeg on PATH.

Examples:
    # Hard concat in storyboard order (default, seamless via shared frames)
    python merge_videos.py --storyboard storyboard.json --dir ./outputs --out final.mp4

    # Crossfade between every shot (0.5s dissolve)
    python merge_videos.py --storyboard storyboard.json --dir ./outputs \
        --out final.mp4 --crossfade 0.5
"""

import argparse
import json
import os
import shutil
import subprocess
import sys
import tempfile


def log(msg):
    """Write a diagnostic line to stderr so stdout stays free for results."""
    sys.stderr.write(str(msg) + "\n")


def ensure_ffmpeg():
    """Exit with status 1 (after logging a hint) unless ffmpeg is on PATH."""
    ffmpeg_path = shutil.which("ffmpeg")
    if ffmpeg_path is not None:
        return
    log("ERROR: ffmpeg not found on PATH. Install it first "
        "(macOS: brew install ffmpeg).")
    sys.exit(1)


def probe_duration(path):
    """Ask ffprobe for the container duration of `path`, in seconds.

    Raises subprocess.CalledProcessError when ffprobe exits non-zero and
    ValueError when its output cannot be parsed as a float.
    """
    command = [
        "ffprobe",
        "-v", "error",
        "-show_entries", "format=duration",
        # Print only the bare value: no section wrappers, no "duration=" key.
        "-of", "default=noprint_wrappers=1:nokey=1",
        path,
    ]
    result = subprocess.run(command, capture_output=True, text=True, check=True)
    seconds_text = result.stdout.strip()
    return float(seconds_text)


def resolve_shot_files(storyboard, video_dir):
    """Map each shot to its video file in storyboard order.

    Args:
        storyboard: Parsed storyboard mapping. ``storyboard["shots"]`` is
            iterated in order and each shot's ``"id"`` is used to build the
            candidate file names.
        video_dir: Directory searched for the per-shot videos.

    Returns:
        A list of paths, one per shot, in storyboard order.

    Logs an error and exits with status 1 as soon as a shot has no matching
    file in ``video_dir``.
    """
    files = []
    for shot in storyboard["shots"]:
        sid = shot["id"]
        # Accept common naming patterns produced by the agnes step.
        candidates = [f"shot_{sid}.mp4", f"shot{sid}.mp4", f"{sid}.mp4"]

        # The zero-padded variant only makes sense for integer-like ids.
        # Formatting a non-numeric id with ":02d" raises ValueError, which
        # previously aborted the lookup even when another candidate existed.
        numeric_id = None
        if isinstance(sid, int):
            numeric_id = sid
        elif isinstance(sid, str) and sid.isascii() and sid.isdigit():
            numeric_id = int(sid)
        if numeric_id is not None:
            candidates.append(f"shot_{numeric_id:02d}.mp4")

        found = None
        for name in candidates:
            path = os.path.join(video_dir, name)
            if os.path.exists(path):
                found = path
                break
        if not found:
            log(f"ERROR: no video file for shot id={sid} in {video_dir}. "
                f"Tried: {candidates}")
            sys.exit(1)
        files.append(found)
    return files


def hard_concat(files, out, width, height, fps):
    """Re-encode every clip to a common format, then concat losslessly.

    Each input is scaled to fit ``width`` x ``height`` (keeping its aspect
    ratio), padded to exactly that size, resampled to ``fps`` and encoded as
    H.264/yuv420p without audio. The normalized clips are then joined with
    ffmpeg's concat demuxer using stream copy, so the video is encoded only
    once.

    Args:
        files: Input video paths, in playback order.
        out: Output path for the merged video.
        width: Target frame width in pixels.
        height: Target frame height in pixels.
        fps: Target frame rate.

    Raises:
        ValueError: If ``files`` is empty.
        subprocess.CalledProcessError: If any ffmpeg invocation exits non-zero.
    """
    if not files:
        raise ValueError("hard_concat needs at least one input file")

    video_filter = (
        f"scale={width}:{height}:force_original_aspect_ratio=decrease,"
        f"pad={width}:{height}:(ow-iw)/2:(oh-ih)/2,fps={fps}"
    )
    tmpdir = tempfile.mkdtemp(prefix="poem_merge_")
    try:
        normalized = []
        for i, src in enumerate(files):
            norm_path = os.path.join(tmpdir, f"n_{i:03d}.mp4")
            subprocess.run(
                ["ffmpeg", "-y", "-i", src,
                 "-vf", video_filter,
                 "-c:v", "libx264", "-pix_fmt", "yuv420p", "-an", norm_path],
                check=True,
            )
            normalized.append(norm_path)

        listfile = os.path.join(tmpdir, "list.txt")
        with open(listfile, "w", encoding="utf-8") as fh:
            for norm_path in normalized:
                # Inside single quotes the concat list treats everything
                # literally, so a quote in the path must close the quoted
                # run, emit an escaped quote, and reopen it.
                escaped = norm_path.replace("'", "'\\''")
                fh.write(f"file '{escaped}'\n")

        # All normalized clips share codec, size, pixel format and frame
        # rate, so they can be stream-copied instead of re-encoded again.
        subprocess.run(
            ["ffmpeg", "-y", "-f", "concat", "-safe", "0", "-i", listfile,
             "-c", "copy", out],
            check=True,
        )
    finally:
        shutil.rmtree(tmpdir, ignore_errors=True)


def _xfade_chain(durations, fade):
    """Return the xfade filter strings that dissolve [v0]..[vN] into [vout]."""
    chain = []
    last = len(durations) - 1
    prev = "[v0]"
    # Each transition starts `fade` seconds before the end of the stream
    # built so far; every earlier dissolve already shortened it by `fade`.
    offset = durations[0] - fade
    for i in range(1, len(durations)):
        out_label = "[vout]" if i == last else f"[x{i}]"
        chain.append(
            f"{prev}[v{i}]xfade=transition=fade:duration={fade}:"
            f"offset={offset:.3f}{out_label}"
        )
        prev = out_label
        offset += durations[i] - fade
    return chain


def crossfade_concat(files, out, width, height, fps, fade):
    """Chain clips with an xfade dissolve of `fade` seconds between each.

    Every input is scaled/padded to ``width`` x ``height``, resampled to
    ``fps`` and given a common timebase before being fed to xfade. With a
    single input there is nothing to dissolve, so the normalized stream is
    written out directly.

    Args:
        files: Input video paths, in playback order.
        out: Output path for the merged video.
        width: Target frame width in pixels.
        height: Target frame height in pixels.
        fps: Target frame rate.
        fade: Dissolve length in seconds between adjacent clips.

    Raises:
        ValueError: If ``files`` is empty, or a clip is shorter than ``fade``.
        subprocess.CalledProcessError: If ffprobe or ffmpeg exits non-zero.
    """
    if not files:
        raise ValueError("crossfade_concat needs at least one input file")

    single = len(files) == 1
    inputs = []
    for f in files:
        inputs += ["-i", f]

    # Normalize each input stream.
    normalize = (
        f"scale={width}:{height}:force_original_aspect_ratio=decrease,"
        f"pad={width}:{height}:(ow-iw)/2:(oh-ih)/2,fps={fps},"
        f"settb=AVTB"
    )
    filters = []
    for i, _ in enumerate(files):
        # A lone clip feeds the output label directly; otherwise "[vout]"
        # would never be defined and the -map below would fail.
        label = "[vout]" if single else f"[v{i}]"
        filters.append(f"[{i}:v]{normalize}{label}")

    if not single:
        durations = [probe_duration(f) for f in files]
        too_short = [f for f, d in zip(files, durations) if d < fade]
        if too_short:
            raise ValueError(
                f"crossfade of {fade}s is longer than clip(s): "
                + ", ".join(too_short)
            )
        filters += _xfade_chain(durations, fade)

    filter_complex = ";".join(filters)
    subprocess.run(
        ["ffmpeg", "-y", *inputs, "-filter_complex", filter_complex,
         "-map", "[vout]", "-c:v", "libx264", "-pix_fmt", "yuv420p", out],
        check=True,
    )


def main():
    """CLI entry point: merge the storyboard's shot videos into one file.

    Parses the command line, loads the storyboard JSON, optionally overrides
    the output resolution from the storyboard's ``aspect`` field (a
    ``"<width>x<height>"`` string), resolves one video per shot and runs
    either the hard concat or the crossfade merge. Prints ``SAVED: <out>`` on
    stdout when done; diagnostics go to stderr. Exits with status 1 when no
    shots are found or the merge step fails.
    """
    ap = argparse.ArgumentParser(description="Merge per-shot videos by storyboard order.")
    ap.add_argument("--storyboard", required=True, help="Path to storyboard.json")
    ap.add_argument("--dir", required=True, help="Directory holding shot_<id>.mp4 files")
    ap.add_argument("--out", required=True, help="Output merged video path")
    ap.add_argument("--crossfade", type=float, default=0.0,
                    help="Crossfade seconds between shots (0 = hard concat, default)")
    ap.add_argument("--width", type=int, default=1152)
    ap.add_argument("--height", type=int, default=768)
    ap.add_argument("--fps", type=int, default=24)
    args = ap.parse_args()

    ensure_ffmpeg()
    # JSON is UTF-8 by specification; do not depend on the locale encoding.
    with open(args.storyboard, encoding="utf-8") as fh:
        storyboard = json.load(fh)

    # Derive resolution from aspect if present (e.g. "1152x768").
    aspect = storyboard.get("aspect")
    if isinstance(aspect, str) and "x" in aspect.lower():
        try:
            w, h = (int(part) for part in aspect.lower().split("x"))
        except ValueError:
            log(f"WARNING: ignoring unparseable aspect {aspect!r}; "
                f"using {args.width}x{args.height}")
        else:
            if w > 0 and h > 0:
                args.width, args.height = w, h
            else:
                log(f"WARNING: ignoring non-positive aspect {aspect!r}; "
                    f"using {args.width}x{args.height}")

    files = resolve_shot_files(storyboard, args.dir)
    if not files:
        log("ERROR: storyboard contains no shots; nothing to merge.")
        sys.exit(1)

    use_crossfade = args.crossfade > 0
    mode = f"crossfade {args.crossfade}s" if use_crossfade else "hard concat"
    log(f"Merging {len(files)} shots -> {args.out} "
        f"({args.width}x{args.height}@{args.fps}fps, {mode})")

    try:
        if use_crossfade:
            crossfade_concat(files, args.out, args.width, args.height, args.fps, args.crossfade)
        else:
            hard_concat(files, args.out, args.width, args.height, args.fps)
    except subprocess.CalledProcessError as exc:
        # ffmpeg/ffprobe already printed details; add a short summary
        # instead of a Python traceback.
        log(f"ERROR: command failed with exit code {exc.returncode}: {exc.cmd}")
        sys.exit(1)
    except ValueError as exc:
        log(f"ERROR: {exc}")
        sys.exit(1)

    print(f"SAVED: {args.out}")


if __name__ == "__main__":
    main()