Inicia novos recursos
Dentre eles estão os recursos de adição do faster-whisper, geração de legenda e integração com Gemini e Open Router
This commit is contained in:
205
render.py
Normal file
205
render.py
Normal file
@@ -0,0 +1,205 @@
|
||||
"""Rendering logic for producing vertical clips with dynamic captions.
|
||||
|
||||
This module defines a single function ``render_clip`` which takes a video
|
||||
segment and produces a vertical clip suitable for social media. Each clip
|
||||
contains three regions:
|
||||
|
||||
* A top region (480px high) showing a title generated by an LLM.
|
||||
* A middle region (960px high) containing the original video, scaled to
|
||||
fit horizontally while preserving aspect ratio and centred vertically.
|
||||
* A bottom region (480px high) showing a dynamic caption. The caption
|
||||
displays a sliding window of three to five words from the transcript,
|
||||
colouring the currently spoken word differently to draw the viewer's
|
||||
attention.
|
||||
|
||||
The function uses the MoviePy library to compose the various elements and
|
||||
writes the resulting video to disk. It returns the path to the created
|
||||
file.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
from typing import Dict, List
|
||||
|
||||
import numpy as np
|
||||
from moviepy.video.io.VideoFileClip import VideoFileClip
|
||||
from moviepy.video.VideoClip import ColorClip, VideoClip
|
||||
from moviepy.video.compositing.CompositeVideoClip import CompositeVideoClip
|
||||
from moviepy.video.VideoClip import TextClip
|
||||
from PIL import Image, ImageDraw, ImageFont
|
||||
|
||||
from .utils import wrap_text
|
||||
|
||||
|
||||
def render_clip(
    video_path: str,
    start: float,
    end: float,
    top_text: str,
    words: List[Dict[str, float]],
    out_dir: str,
    base_name: str,
    idx: int,
    # Use a widely available system font by default. DejaVuSans is installed
    # in most Debian-based containers. The caller can override this path.
    font_path: str = "/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf",
    final_width: int = 1080,
    final_height: int = 1920,
    top_h: int = 480,
    middle_h: int = 960,
    bottom_h: int = 480,
    video_codec: str = "libx264",
    bitrate: str = "3000k",
) -> str:
    """Render a single clip with title and dynamic caption.

    The output is a ``final_width`` x ``final_height`` vertical video with a
    title band on top, the source video centred in the middle band, and a
    dynamic word-highlight caption band at the bottom.

    Parameters
    ----------
    video_path: str
        Path to the source video file.
    start: float
        Start time of the clip in seconds.
    end: float
        End time of the clip in seconds. Clamped to the source duration so
        slightly over-long (rounded) timestamps do not raise.
    top_text: str
        The title to display in the top region.
    words: List[Dict[str, float]]
        List of word-level timestamps for this clip. Each dict must have
        ``start``, ``end`` and ``word`` keys. The start and end values
        should be relative to the beginning of this clip (i.e. start at 0).
    out_dir: str
        Directory where the output file should be saved. The function
        creates this directory if it doesn't exist.
    base_name: str
        Base name of the original video (sanitized). Currently unused; the
        output filename is derived from ``idx`` only. Kept for
        backward-compatible call sites and future naming schemes.
    idx: int
        Index of the clip. Output will be named ``clip_{idx}.mp4``.
    font_path: str
        Path to the TrueType font to use for both title and caption.
    final_width: int
        Width of the final video in pixels.
    final_height: int
        Height of the final video in pixels.
    top_h: int
        Height of the title area in pixels.
    middle_h: int
        Height of the video area in pixels.
    bottom_h: int
        Height of the caption area in pixels.
    video_codec: str
        FFmpeg codec to use when writing the video.
    bitrate: str
        Bitrate for the output video.

    Returns
    -------
    str
        The path to the rendered video file.
    """
    os.makedirs(out_dir, exist_ok=True)
    # Extract the segment from the source video. Everything derived from
    # ``clip`` must be built and written before the ``with`` block closes it.
    with VideoFileClip(video_path) as clip:
        # Clamp so a rounded-up ``end`` slightly past the source duration
        # does not make ``subclip`` raise.
        end = min(end, clip.duration)
        segment = clip.subclip(start, end)
        dur = segment.duration
        final = None
        try:
            # Black canvas for the full vertical frame.
            bg = ColorClip(size=(final_width, final_height), color=(0, 0, 0), duration=dur)
            # Fit the video inside the middle region: scale to the full width,
            # but fall back to scaling by height when that would overflow the
            # middle band (e.g. portrait sources) and overlap title/caption.
            video_resized = segment.resize(width=final_width)
            if video_resized.h > middle_h:
                video_resized = segment.resize(height=middle_h)
            # Centre horizontally (a no-op for full-width landscape sources)
            # and vertically within the middle region.
            x_video = (final_width - video_resized.w) // 2
            y_video = top_h + (middle_h - video_resized.h) // 2
            video_resized = video_resized.set_position((x_video, y_video))

            # Build title clip; wrap the title to avoid overflow.
            wrapped_title = "\n".join(wrap_text(top_text, max_chars=40))
            title_clip = TextClip(
                wrapped_title,
                font=font_path,
                fontsize=70,
                color="white",
                method="caption",
                size=(final_width, top_h),
                align="center",
            ).set_duration(dur).set_position((0, 0))

            # Prepare font for caption rendering.
            pil_font = ImageFont.truetype(font_path, size=60)
            default_color = (255, 255, 255)  # white
            highlight_color = (255, 215, 0)  # gold-like yellow

            # Precompute the width of a space and a representative glyph
            # height for vertical centering; done once outside the per-frame
            # callback for speed.
            space_bbox = pil_font.getbbox(" ")
            space_width = space_bbox[2] - space_bbox[0]
            ref_bbox = pil_font.getbbox("A")
            text_height = ref_bbox[3] - ref_bbox[1]

            def make_caption_frame(t: float):
                """Generate an RGB frame (H x W x 3 uint8) for the caption at time t."""
                img = Image.new("RGB", (final_width, bottom_h), color=(0, 0, 0))
                if not words:
                    return np.array(img)
                # Find the word being spoken at ``t``; if ``t`` falls in a gap
                # or past the end, keep the last word already finished.
                idx_cur = 0
                for i, w in enumerate(words):
                    if w["start"] <= t < w["end"]:
                        idx_cur = i
                        break
                    if t >= w["end"]:
                        idx_cur = i
                # Sliding window of up to 5 words centred on the current one.
                start_idx = max(0, idx_cur - 2)
                end_idx = min(len(words), idx_cur + 3)
                window = words[start_idx:end_idx]
                # Measure each word so the whole window can be centred.
                word_sizes = []
                for w in window:
                    bbox = pil_font.getbbox(w["word"])
                    word_sizes.append(bbox[2] - bbox[0])
                total_width = sum(word_sizes) + space_width * (len(window) - 1 if window else 0)
                draw = ImageDraw.Draw(img)
                x = int((final_width - total_width) / 2)
                y_pos = int((bottom_h - text_height) / 2)
                for j, w in enumerate(window):
                    # Highlight only the currently spoken word.
                    color = highlight_color if (start_idx + j) == idx_cur else default_color
                    draw.text((x, y_pos), w["word"], font=pil_font, fill=color)
                    x += word_sizes[j] + space_width
                return np.array(img)

            caption_clip = VideoClip(make_frame=make_caption_frame, duration=dur)
            caption_clip = caption_clip.set_position((0, final_height - bottom_h))

            # Compose final clip (back-to-front layer order).
            final = CompositeVideoClip([
                bg,
                video_resized,
                title_clip,
                caption_clip,
            ], size=(final_width, final_height))
            # Carry over the original audio from the video segment, if any.
            final_audio = segment.audio
            if final_audio is not None:
                final = final.set_audio(final_audio)

            out_path = os.path.join(out_dir, f"clip_{idx}.mp4")
            final.write_videofile(
                out_path,
                codec=video_codec,
                fps=30,
                bitrate=bitrate,
                audio_codec="aac",
                preset="ultrafast",
                ffmpeg_params=[
                    "-tune", "zerolatency",
                    "-pix_fmt", "yuv420p",
                    "-profile:v", "high",
                    "-level", "4.1",
                ],
                threads=4,
            )
        finally:
            # Release ffmpeg readers/writers even if composition or the
            # write fails partway through.
            if final is not None:
                final.close()
            segment.close()
        return out_path
|
||||
Reference in New Issue
Block a user