# Dentre eles estão recursos de adição do faster-whisper, geração de legenda
# e integração com Gemini e OpenRouter.
"""Rendering logic for producing vertical clips with dynamic captions.
|
|
|
|
This module defines a single function ``render_clip`` which takes a video
|
|
segment and produces a vertical clip suitable for social media. Each clip
|
|
contains three regions:
|
|
|
|
* A top region (480px high) showing a title generated by an LLM.
|
|
* A middle region (960px high) containing the original video, scaled to
|
|
fit horizontally while preserving aspect ratio and centred vertically.
|
|
* A bottom region (480px high) showing a dynamic caption. The caption
|
|
displays a sliding window of three to five words from the transcript,
|
|
colouring the currently spoken word differently to draw the viewer's
|
|
attention.
|
|
|
|
The function uses the MoviePy library to compose the various elements and
|
|
writes the resulting video to disk. It returns the path to the created
|
|
file.
|
|
"""
|
|
|
|
from __future__ import annotations

import os
from typing import Dict, List

import numpy as np
from moviepy.video.VideoClip import ColorClip, TextClip, VideoClip
from moviepy.video.compositing.CompositeVideoClip import CompositeVideoClip
from moviepy.video.io.VideoFileClip import VideoFileClip
from PIL import Image, ImageDraw, ImageFont

from .utils import wrap_text
def render_clip(
    video_path: str,
    start: float,
    end: float,
    top_text: str,
    words: List[Dict[str, float]],
    out_dir: str,
    base_name: str,
    idx: int,
    # Use a widely available system font by default. DejaVuSans is installed
    # in most Debian-based containers. The caller can override this path.
    font_path: str = "/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf",
    final_width: int = 1080,
    final_height: int = 1920,
    top_h: int = 480,
    middle_h: int = 960,
    bottom_h: int = 480,
    video_codec: str = "libx264",
    bitrate: str = "3000k",
) -> str:
    """Render a single clip with title and dynamic caption.

    Parameters
    ----------
    video_path: str
        Path to the source video file.
    start: float
        Start time of the clip in seconds.
    end: float
        End time of the clip in seconds.
    top_text: str
        The title to display in the top region.
    words: List[Dict[str, float]]
        List of word-level timestamps for this clip. Each dict must have
        ``start``, ``end`` and ``word`` keys. The start and end values
        should be relative to the beginning of this clip (i.e. start at 0).
        May be empty, in which case the caption region stays blank.
    out_dir: str
        Directory where the output file should be saved. The function
        creates this directory if it doesn't exist.
    base_name: str
        Base name of the original video (sanitized). Currently unused by
        this function (output is named from ``idx`` only); kept for
        interface compatibility with callers.
    idx: int
        Index of the clip. Output will be named ``clip_{idx}.mp4``.
    font_path: str
        Path to the TrueType font to use for both title and caption.
    final_width: int
        Width of the final video in pixels.
    final_height: int
        Height of the final video in pixels.
    top_h: int
        Height of the title area in pixels.
    middle_h: int
        Height of the video area in pixels.
    bottom_h: int
        Height of the caption area in pixels.
    video_codec: str
        FFmpeg codec to use when writing the video.
    bitrate: str
        Bitrate for the output video.

    Returns
    -------
    str
        The path to the rendered video file.
    """
    os.makedirs(out_dir, exist_ok=True)
    # Keep the source clip open for the whole render: MoviePy reads frames
    # lazily, so the reader must stay alive until write_videofile finishes.
    with VideoFileClip(video_path) as clip:
        segment = clip.subclip(start, end)
        dur = segment.duration

        # Black background covering the full vertical frame.
        bg = ColorClip(size=(final_width, final_height), color=(0, 0, 0), duration=dur)

        # Scale the video to the full output width, then centre it
        # vertically inside the middle region.
        video_resized = segment.resize(width=final_width)
        y = top_h + (middle_h - video_resized.h) // 2
        video_resized = video_resized.set_position((0, y))

        # Title: wrap to avoid horizontal overflow, then render as a
        # caption-sized TextClip pinned to the top region.
        wrapped_title = "\n".join(wrap_text(top_text, max_chars=40))
        title_clip = TextClip(
            wrapped_title,
            font=font_path,
            fontsize=70,
            color="white",
            method="caption",
            size=(final_width, top_h),
            align="center",
        ).set_duration(dur).set_position((0, 0))

        # Font and colours for the dynamic caption.
        pil_font = ImageFont.truetype(font_path, size=60)
        default_color = (255, 255, 255)  # white
        highlight_color = (255, 215, 0)  # gold-like yellow

        # Use the advance width for spacing: getbbox() measures ink extent
        # and is zero (or near-zero) for a space with many fonts, which
        # would make consecutive words run together. getlength() returns
        # the horizontal advance, which is the correct layout metric.
        space_width = pil_font.getlength(" ")
        # Full line height (ascent + descent) so words with descenders
        # (g, y, p, ...) are vertically centred, not just cap-height text.
        ascent, descent = pil_font.getmetrics()
        text_height = ascent + descent

        def _current_word_index(t: float) -> int:
            """Return the index of the word spoken at clip-relative time t.

            Falls back to the last word whose end has already passed (or 0
            before the first word starts).
            """
            cur = 0
            for i, w in enumerate(words):
                if w["start"] <= t < w["end"]:
                    return i
                if t >= w["end"]:
                    cur = i
            return cur

        def make_caption_frame(t: float):
            """Generate an RGB frame (numpy array) for the caption at time t."""
            img = Image.new("RGB", (final_width, bottom_h), color=(0, 0, 0))
            if not words:
                # No transcript for this clip: leave the caption area blank.
                return np.array(img)
            idx_cur = _current_word_index(t)
            # Sliding window of up to five words centred on the current one.
            start_idx = max(0, idx_cur - 2)
            end_idx = min(len(words), idx_cur + 3)
            window = words[start_idx:end_idx]
            # Advance widths for layout (see space_width note above).
            word_sizes = [pil_font.getlength(w["word"]) for w in window]
            total_width = sum(word_sizes) + space_width * (len(window) - 1)
            draw = ImageDraw.Draw(img)
            x = (final_width - total_width) / 2
            y_pos = (bottom_h - text_height) / 2
            for j, w in enumerate(window):
                # Highlight only the currently spoken word.
                color = highlight_color if (start_idx + j) == idx_cur else default_color
                draw.text((x, y_pos), w["word"], font=pil_font, fill=color)
                x += word_sizes[j] + space_width
            return np.array(img)

        caption_clip = VideoClip(make_frame=make_caption_frame, duration=dur)
        caption_clip = caption_clip.set_position((0, final_height - bottom_h))

        # Compose final clip: background, video, title, caption (in z-order).
        final = CompositeVideoClip(
            [bg, video_resized, title_clip, caption_clip],
            size=(final_width, final_height),
        )
        # Carry over the original audio from the segment, if any.
        if segment.audio is not None:
            final = final.set_audio(segment.audio)

        out_path = os.path.join(out_dir, f"clip_{idx}.mp4")
        final.write_videofile(
            out_path,
            codec=video_codec,
            fps=30,
            bitrate=bitrate,
            audio_codec="aac",
            preset="ultrafast",
            ffmpeg_params=[
                "-tune", "zerolatency",
                "-pix_fmt", "yuv420p",
                "-profile:v", "high",
                "-level", "4.1",
            ],
            threads=4,
        )
        # Close intermediate clips to free ffmpeg readers promptly.
        final.close()
        segment.close()
    return out_path