"""Rendering logic for producing vertical clips with dynamic captions. This module defines a single function ``render_clip`` which takes a video segment and produces a vertical clip suitable for social media. Each clip contains three regions: * A top region (480px high) showing a title generated by an LLM. * A middle region (960px high) containing the original video, scaled to fit horizontally while preserving aspect ratio and centred vertically. * A bottom region (480px high) showing a dynamic caption. The caption displays a sliding window of three to five words from the transcript, colouring the currently spoken word differently to draw the viewer's attention. The function uses the MoviePy library to compose the various elements and writes the resulting video to disk. It returns the path to the created file. """ from __future__ import annotations import os from typing import Dict, List import numpy as np from moviepy.video.io.VideoFileClip import VideoFileClip from moviepy.video.VideoClip import ColorClip, VideoClip from moviepy.video.compositing.CompositeVideoClip import CompositeVideoClip from moviepy.video.VideoClip import TextClip from PIL import Image, ImageDraw, ImageFont from .utils import wrap_text def render_clip( video_path: str, start: float, end: float, top_text: str, words: List[Dict[str, float]], out_dir: str, base_name: str, idx: int, # Use a widely available system font by default. DejaVuSans is installed # in most Debian-based containers. The caller can override this path. font_path: str = "/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf", final_width: int = 1080, final_height: int = 1920, top_h: int = 480, middle_h: int = 960, bottom_h: int = 480, video_codec: str = "libx264", bitrate: str = "3000k", ) -> str: """Render a single clip with title and dynamic caption. Parameters ---------- video_path: str Path to the source video file. start: float Start time of the clip in seconds. end: float End time of the clip in seconds. top_text: str The title to display in the top region. words: List[Dict[str, float]] List of word-level timestamps for this clip. Each dict must have ``start``, ``end`` and ``word`` keys. The start and end values should be relative to the beginning of this clip (i.e. start at 0). out_dir: str Directory where the output file should be saved. The function creates this directory if it doesn't exist. base_name: str Base name of the original video (sanitized). Used to build the output filename. idx: int Index of the clip. Output will be named ``clip_{idx}.mp4``. font_path: str Path to the TrueType font to use for both title and caption. final_width: int Width of the final video in pixels. final_height: int Height of the final video in pixels. top_h: int Height of the title area in pixels. middle_h: int Height of the video area in pixels. bottom_h: int Height of the caption area in pixels. video_codec: str FFmpeg codec to use when writing the video. bitrate: str Bitrate for the output video. Returns ------- str The path to the rendered video file. """ os.makedirs(out_dir, exist_ok=True) # Extract the segment from the source video with VideoFileClip(video_path) as clip: segment = clip.subclip(start, end) dur = segment.duration # Background bg = ColorClip(size=(final_width, final_height), color=(0, 0, 0), duration=dur) # Resize video to fit width video_resized = segment.resize(width=final_width) # Compute vertical position to centre in the middle region y = top_h + (middle_h - video_resized.h) // 2 video_resized = video_resized.set_position((0, y)) # Build title clip # Wrap the title to avoid overflow wrapped_lines = wrap_text(top_text, max_chars=40) wrapped_title = "\n".join(wrapped_lines) title_clip = TextClip( wrapped_title, font=font_path, fontsize=70, color="white", method="caption", size=(final_width, top_h), align="center", ).set_duration(dur).set_position((0, 0)) # Prepare font for caption rendering pil_font = ImageFont.truetype(font_path, size=60) default_color = (255, 255, 255) # white highlight_color = (255, 215, 0) # gold-like yellow # Precompute widths of a space and bounding box height for vertical centering space_width = pil_font.getbbox(" ")[2] - pil_font.getbbox(" ")[0] bbox = pil_font.getbbox("A") text_height = bbox[3] - bbox[1] def make_caption_frame(t: float): """Generate an image for the caption at time t.""" # Determine current word index idx_cur = 0 for i, w in enumerate(words): if w["start"] <= t < w["end"]: idx_cur = i break if t >= w["end"]: idx_cur = i # Define window of words to display: show up to 5 words start_idx = max(0, idx_cur - 2) end_idx = min(len(words), idx_cur + 3) window = words[start_idx:end_idx] # Compute widths for each word word_sizes = [] for w in window: bbox = pil_font.getbbox(w["word"]) word_width = bbox[2] - bbox[0] word_sizes.append(word_width) total_width = sum(word_sizes) + space_width * (len(window) - 1 if window else 0) # Create blank image for caption area img = Image.new("RGB", (final_width, bottom_h), color=(0, 0, 0)) draw = ImageDraw.Draw(img) x = int((final_width - total_width) / 2) y_pos = int((bottom_h - text_height) / 2) for j, w in enumerate(window): color = highlight_color if (start_idx + j) == idx_cur else default_color draw.text((x, y_pos), w["word"], font=pil_font, fill=color) x += word_sizes[j] + space_width return np.array(img) caption_clip = VideoClip(make_frame=make_caption_frame, duration=dur) caption_clip = caption_clip.set_position((0, final_height - bottom_h)) # Compose final clip final = CompositeVideoClip([ bg, video_resized, title_clip, caption_clip, ], size=(final_width, final_height)) # Use the original audio from the video segment final_audio = segment.audio if final_audio is not None: final = final.set_audio(final_audio) # Define output path out_path = os.path.join(out_dir, f"clip_{idx}.mp4") # Write to disk final.write_videofile( out_path, codec=video_codec, fps=30, bitrate=bitrate, audio_codec="aac", preset="ultrafast", ffmpeg_params=[ "-tune", "zerolatency", "-pix_fmt", "yuv420p", "-profile:v", "high", "-level", "4.1", ], threads=4, ) # Close clips to free resources final.close() segment.close() return out_path