# video-render/render.py
# Provenance: commit 0c0a9c3b5c (LeoMortari, 2025-10-17 -03:00) — "Inicia novos recursos":
# adds faster-whisper support, caption generation, and Gemini / OpenRouter integration.
"""Rendering logic for producing vertical clips with dynamic captions.
This module defines a single function ``render_clip`` which takes a video
segment and produces a vertical clip suitable for social media. Each clip
contains three regions:
* A top region (480px high) showing a title generated by an LLM.
* A middle region (960px high) containing the original video, scaled to
fit horizontally while preserving aspect ratio and centred vertically.
* A bottom region (480px high) showing a dynamic caption. The caption
displays a sliding window of three to five words from the transcript,
colouring the currently spoken word differently to draw the viewer's
attention.
The function uses the MoviePy library to compose the various elements and
writes the resulting video to disk. It returns the path to the created
file.
"""
from __future__ import annotations
import os
from typing import Dict, List
import numpy as np
from moviepy.video.io.VideoFileClip import VideoFileClip
from moviepy.video.VideoClip import ColorClip, VideoClip
from moviepy.video.compositing.CompositeVideoClip import CompositeVideoClip
from moviepy.video.VideoClip import TextClip
from PIL import Image, ImageDraw, ImageFont
from .utils import wrap_text
def render_clip(
    video_path: str,
    start: float,
    end: float,
    top_text: str,
    words: List[Dict[str, float]],
    out_dir: str,
    base_name: str,
    idx: int,
    # DejaVuSans-Bold ships with most Debian-based containers; callers may
    # override this path for other environments.
    font_path: str = "/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf",
    final_width: int = 1080,
    final_height: int = 1920,
    top_h: int = 480,
    middle_h: int = 960,
    bottom_h: int = 480,
    video_codec: str = "libx264",
    bitrate: str = "3000k",
) -> str:
    """Render a single vertical clip with a title and a dynamic caption.

    The output frame is split into three stacked regions: a title band of
    ``top_h`` pixels, a video band of ``middle_h`` pixels (source scaled to
    fit, centred), and a caption band of ``bottom_h`` pixels showing a
    sliding window of up to five transcript words with the currently spoken
    word highlighted.

    Parameters
    ----------
    video_path: str
        Path to the source video file.
    start, end: float
        Segment boundaries in seconds within the source video.
    top_text: str
        Title to display in the top region (wrapped to avoid overflow).
    words: List[Dict[str, float]]
        Word-level timestamps for this clip; each dict has ``start``,
        ``end`` and ``word`` keys, with times relative to the clip start.
    out_dir: str
        Output directory; created if it does not exist.
    base_name: str
        Sanitized base name of the original video. Accepted for interface
        compatibility; the output is currently named ``clip_{idx}.mp4``.
    idx: int
        Clip index used to build the output filename.
    font_path: str
        TrueType font used for both the title and the caption.
    final_width, final_height: int
        Dimensions of the final video in pixels.
    top_h, middle_h, bottom_h: int
        Heights of the title, video and caption regions in pixels.
    video_codec: str
        FFmpeg codec used when writing the video.
    bitrate: str
        Target bitrate for the output video.

    Returns
    -------
    str
        Path to the rendered video file.
    """
    os.makedirs(out_dir, exist_ok=True)
    out_path = os.path.join(out_dir, f"clip_{idx}.mp4")

    with VideoFileClip(video_path) as clip:
        segment = clip.subclip(start, end)
        dur = segment.duration

        # Solid black canvas behind all three regions.
        bg = ColorClip(size=(final_width, final_height), color=(0, 0, 0), duration=dur)

        # Scale to fill the width; if that makes the video taller than the
        # middle band (e.g. a portrait source), scale by height instead so
        # it never bleeds into the title/caption regions. For the common
        # landscape case this is a no-op relative to width-only scaling.
        video_resized = segment.resize(width=final_width)
        if video_resized.h > middle_h:
            video_resized = segment.resize(height=middle_h)
        x_video = (final_width - video_resized.w) // 2
        y_video = top_h + (middle_h - video_resized.h) // 2
        video_resized = video_resized.set_position((x_video, y_video))

        # Title: wrap to avoid horizontal overflow, then centre in the top band.
        wrapped_title = "\n".join(wrap_text(top_text, max_chars=40))
        title_clip = (
            TextClip(
                wrapped_title,
                font=font_path,
                fontsize=70,
                color="white",
                method="caption",
                size=(final_width, top_h),
                align="center",
            )
            .set_duration(dur)
            .set_position((0, 0))
        )

        # Caption rendering via Pillow (per-word colouring is awkward in TextClip).
        pil_font = ImageFont.truetype(font_path, size=60)
        default_color = (255, 255, 255)   # white
        highlight_color = (255, 215, 0)   # gold-like yellow
        space_width = pil_font.getbbox(" ")[2] - pil_font.getbbox(" ")[0]
        ref_bbox = pil_font.getbbox("A")
        text_height = ref_bbox[3] - ref_bbox[1]
        # Word pixel widths are invariant across frames — compute them once
        # instead of on every frame (30 fps) as the previous version did.
        word_widths: List[int] = []
        for w in words:
            wb = pil_font.getbbox(w["word"])
            word_widths.append(wb[2] - wb[0])

        def make_caption_frame(t: float):
            """Return the caption image (numpy array) for time ``t``."""
            # Index of the word being spoken at t, falling back to the most
            # recently finished word once t passes a word's end.
            idx_cur = 0
            for i, w in enumerate(words):
                if w["start"] <= t < w["end"]:
                    idx_cur = i
                    break
                if t >= w["end"]:
                    idx_cur = i
            # Sliding window of up to 5 words centred on the current one.
            start_idx = max(0, idx_cur - 2)
            end_idx = min(len(words), idx_cur + 3)
            window = words[start_idx:end_idx]
            sizes = word_widths[start_idx:end_idx]
            total_width = sum(sizes) + space_width * max(len(window) - 1, 0)
            img = Image.new("RGB", (final_width, bottom_h), color=(0, 0, 0))
            draw = ImageDraw.Draw(img)
            x = int((final_width - total_width) / 2)
            y_pos = int((bottom_h - text_height) / 2)
            for j, w in enumerate(window):
                color = highlight_color if (start_idx + j) == idx_cur else default_color
                draw.text((x, y_pos), w["word"], font=pil_font, fill=color)
                x += sizes[j] + space_width
            return np.array(img)

        caption_clip = VideoClip(make_frame=make_caption_frame, duration=dur)
        caption_clip = caption_clip.set_position((0, final_height - bottom_h))

        final = CompositeVideoClip(
            [bg, video_resized, title_clip, caption_clip],
            size=(final_width, final_height),
        )
        # Carry over the original audio track, when present.
        if segment.audio is not None:
            final = final.set_audio(segment.audio)

        try:
            final.write_videofile(
                out_path,
                codec=video_codec,
                fps=30,
                bitrate=bitrate,
                audio_codec="aac",
                preset="ultrafast",
                ffmpeg_params=[
                    "-tune", "zerolatency",
                    "-pix_fmt", "yuv420p",
                    "-profile:v", "high",
                    "-level", "4.1",
                ],
                threads=4,
            )
        finally:
            # Release ffmpeg readers/writers even if encoding fails;
            # previously a write error leaked these resources.
            final.close()
            segment.close()

    return out_path