# Dentre eles estão recursos de adição do faster-whisper, geração de legenda
# e integração com Gemini e OpenRouter.
"""Rendering logic for producing vertical clips with dynamic captions.
|
|
|
|
This module defines a single function ``render_clip`` which takes a video
|
|
segment and produces a vertical clip suitable for social media. Each clip
|
|
contains three regions:
|
|
|
|
* A top region (480px high) showing a title generated by an LLM.
|
|
* A middle region (960px high) containing the original video, scaled to
|
|
fit horizontally while preserving aspect ratio and centred vertically.
|
|
* A bottom region (480px high) showing a dynamic caption. The caption
|
|
displays a sliding window of three to five words from the transcript,
|
|
colouring the currently spoken word differently to draw the viewer's
|
|
attention.
|
|
|
|
The function uses the MoviePy library to compose the various elements and
|
|
writes the resulting video to disk. It returns the path to the created
|
|
file.
|
|
"""
|
|
|
|
from __future__ import annotations

import os
from typing import Dict, List

import numpy as np
from moviepy.video.VideoClip import ColorClip, TextClip, VideoClip
from moviepy.video.compositing.CompositeVideoClip import CompositeVideoClip
from moviepy.video.io.VideoFileClip import VideoFileClip
from PIL import Image, ImageDraw, ImageFont

from .utils import wrap_text
def render_clip(
    video_path: str,
    start: float,
    end: float,
    top_text: str,
    words: List[Dict[str, float]],
    out_dir: str,
    base_name: str,
    idx: int,
    # Use a widely available system font by default. DejaVuSans is installed
    # in most Debian-based containers. The caller can override this path.
    font_path: str = "/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf",
    final_width: int = 1080,
    final_height: int = 1920,
    top_h: int = 480,
    middle_h: int = 960,
    bottom_h: int = 480,
    video_codec: str = "libx264",
    bitrate: str = "3000k",
) -> str:
    """Render a single clip with title and dynamic caption.

    Parameters
    ----------
    video_path: str
        Path to the source video file.
    start: float
        Start time of the clip in seconds.
    end: float
        End time of the clip in seconds.
    top_text: str
        The title to display in the top region.
    words: List[Dict[str, float]]
        List of word-level timestamps for this clip. Each dict must have
        ``start``, ``end`` and ``word`` keys. The start and end values
        should be relative to the beginning of this clip (i.e. start at 0).
        May be empty, in which case the caption region stays blank.
    out_dir: str
        Directory where the output file should be saved. The function
        creates this directory if it doesn't exist.
    base_name: str
        Base name of the original video (sanitized). Currently unused by
        this function (output is named from ``idx`` only); kept for
        interface compatibility with callers.
    idx: int
        Index of the clip. Output will be named ``clip_{idx}.mp4``.
    font_path: str
        Path to the TrueType font to use for both title and caption.
    final_width: int
        Width of the final video in pixels.
    final_height: int
        Height of the final video in pixels.
    top_h: int
        Height of the title area in pixels.
    middle_h: int
        Height of the video area in pixels.
    bottom_h: int
        Height of the caption area in pixels.
    video_codec: str
        FFmpeg codec to use when writing the video.
    bitrate: str
        Bitrate for the output video.

    Returns
    -------
    str
        The path to the rendered video file.
    """
    os.makedirs(out_dir, exist_ok=True)
    # Keep the source clip open for the whole render: MoviePy reads frames
    # lazily, so the reader must stay alive until write_videofile finishes.
    with VideoFileClip(video_path) as clip:
        segment = clip.subclip(start, end)
        dur = segment.duration

        # Black background covering the full vertical frame.
        bg = ColorClip(size=(final_width, final_height), color=(0, 0, 0), duration=dur)

        # Scale the video to the full output width, then centre it
        # vertically inside the middle region.
        video_resized = segment.resize(width=final_width)
        y = top_h + (middle_h - video_resized.h) // 2
        video_resized = video_resized.set_position((0, y))

        # Title: wrap to avoid horizontal overflow, then render as a
        # caption-sized TextClip pinned to the top region.
        wrapped_title = "\n".join(wrap_text(top_text, max_chars=40))
        title_clip = TextClip(
            wrapped_title,
            font=font_path,
            fontsize=70,
            color="white",
            method="caption",
            size=(final_width, top_h),
            align="center",
        ).set_duration(dur).set_position((0, 0))

        # Font and colours for the dynamic caption.
        pil_font = ImageFont.truetype(font_path, size=60)
        default_color = (255, 255, 255)  # white
        highlight_color = (255, 215, 0)  # gold-like yellow

        # Use the advance width for spacing: getbbox() measures ink extent
        # and is zero (or near-zero) for a space with many fonts, which
        # would make consecutive words run together. getlength() returns
        # the horizontal advance, which is the correct layout metric.
        space_width = pil_font.getlength(" ")
        # Full line height (ascent + descent) so words with descenders
        # (g, y, p, ...) are vertically centred, not just cap-height text.
        ascent, descent = pil_font.getmetrics()
        text_height = ascent + descent

        def _current_word_index(t: float) -> int:
            """Return the index of the word spoken at clip-relative time t.

            Falls back to the last word whose end has already passed (or 0
            before the first word starts).
            """
            cur = 0
            for i, w in enumerate(words):
                if w["start"] <= t < w["end"]:
                    return i
                if t >= w["end"]:
                    cur = i
            return cur

        def make_caption_frame(t: float):
            """Generate an RGB frame (numpy array) for the caption at time t."""
            img = Image.new("RGB", (final_width, bottom_h), color=(0, 0, 0))
            if not words:
                # No transcript for this clip: leave the caption area blank.
                return np.array(img)
            idx_cur = _current_word_index(t)
            # Sliding window of up to five words centred on the current one.
            start_idx = max(0, idx_cur - 2)
            end_idx = min(len(words), idx_cur + 3)
            window = words[start_idx:end_idx]
            # Advance widths for layout (see space_width note above).
            word_sizes = [pil_font.getlength(w["word"]) for w in window]
            total_width = sum(word_sizes) + space_width * (len(window) - 1)
            draw = ImageDraw.Draw(img)
            x = (final_width - total_width) / 2
            y_pos = (bottom_h - text_height) / 2
            for j, w in enumerate(window):
                # Highlight only the currently spoken word.
                color = highlight_color if (start_idx + j) == idx_cur else default_color
                draw.text((x, y_pos), w["word"], font=pil_font, fill=color)
                x += word_sizes[j] + space_width
            return np.array(img)

        caption_clip = VideoClip(make_frame=make_caption_frame, duration=dur)
        caption_clip = caption_clip.set_position((0, final_height - bottom_h))

        # Compose final clip: background, video, title, caption (in z-order).
        final = CompositeVideoClip(
            [bg, video_resized, title_clip, caption_clip],
            size=(final_width, final_height),
        )
        # Carry over the original audio from the segment, if any.
        if segment.audio is not None:
            final = final.set_audio(segment.audio)

        out_path = os.path.join(out_dir, f"clip_{idx}.mp4")
        final.write_videofile(
            out_path,
            codec=video_codec,
            fps=30,
            bitrate=bitrate,
            audio_codec="aac",
            preset="ultrafast",
            ffmpeg_params=[
                "-tune", "zerolatency",
                "-pix_fmt", "yuv420p",
                "-profile:v", "high",
                "-level", "4.1",
            ],
            threads=4,
        )
        # Close intermediate clips to free ffmpeg readers promptly.
        final.close()
        segment.close()
    return out_path