Inicia novos recursos
Dentre eles estão os recursos de adição do faster-whisper, geração de legenda e integração com Gemini e Open Router
This commit is contained in:
205
render.py
Normal file
205
render.py
Normal file
@@ -0,0 +1,205 @@
|
||||
"""Rendering logic for producing vertical clips with dynamic captions.
|
||||
|
||||
This module defines a single function ``render_clip`` which takes a video
|
||||
segment and produces a vertical clip suitable for social media. Each clip
|
||||
contains three regions:
|
||||
|
||||
* A top region (480px high) showing a title generated by an LLM.
|
||||
* A middle region (960px high) containing the original video, scaled to
|
||||
fit horizontally while preserving aspect ratio and centred vertically.
|
||||
* A bottom region (480px high) showing a dynamic caption. The caption
|
||||
displays a sliding window of three to five words from the transcript,
|
||||
colouring the currently spoken word differently to draw the viewer's
|
||||
attention.
|
||||
|
||||
The function uses the MoviePy library to compose the various elements and
|
||||
writes the resulting video to disk. It returns the path to the created
|
||||
file.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
from typing import Dict, List
|
||||
|
||||
import numpy as np
|
||||
from moviepy.video.io.VideoFileClip import VideoFileClip
|
||||
from moviepy.video.VideoClip import ColorClip, VideoClip
|
||||
from moviepy.video.compositing.CompositeVideoClip import CompositeVideoClip
|
||||
from moviepy.video.VideoClip import TextClip
|
||||
from PIL import Image, ImageDraw, ImageFont
|
||||
|
||||
from .utils import wrap_text
|
||||
|
||||
|
||||
def render_clip(
    video_path: str,
    start: float,
    end: float,
    top_text: str,
    words: List[Dict[str, float]],
    out_dir: str,
    base_name: str,
    idx: int,
    # Use a widely available system font by default. DejaVuSans is installed
    # in most Debian-based containers. The caller can override this path.
    font_path: str = "/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf",
    final_width: int = 1080,
    final_height: int = 1920,
    top_h: int = 480,
    middle_h: int = 960,
    bottom_h: int = 480,
    video_codec: str = "libx264",
    bitrate: str = "3000k",
) -> str:
    """Render a single clip with title and dynamic caption.

    The output is a ``final_width`` x ``final_height`` vertical video with a
    title band on top, the source video centred in the middle band, and a
    dynamic word-highlight caption band at the bottom.

    Parameters
    ----------
    video_path: str
        Path to the source video file.
    start: float
        Start time of the clip in seconds.
    end: float
        End time of the clip in seconds. Clamped to the source duration so
        slightly over-long (rounded) timestamps do not raise.
    top_text: str
        The title to display in the top region.
    words: List[Dict[str, float]]
        List of word-level timestamps for this clip. Each dict must have
        ``start``, ``end`` and ``word`` keys. The start and end values
        should be relative to the beginning of this clip (i.e. start at 0).
    out_dir: str
        Directory where the output file should be saved. The function
        creates this directory if it doesn't exist.
    base_name: str
        Base name of the original video (sanitized). Currently unused; the
        output filename is derived from ``idx`` only. Kept for
        backward-compatible call sites and future naming schemes.
    idx: int
        Index of the clip. Output will be named ``clip_{idx}.mp4``.
    font_path: str
        Path to the TrueType font to use for both title and caption.
    final_width: int
        Width of the final video in pixels.
    final_height: int
        Height of the final video in pixels.
    top_h: int
        Height of the title area in pixels.
    middle_h: int
        Height of the video area in pixels.
    bottom_h: int
        Height of the caption area in pixels.
    video_codec: str
        FFmpeg codec to use when writing the video.
    bitrate: str
        Bitrate for the output video.

    Returns
    -------
    str
        The path to the rendered video file.
    """
    os.makedirs(out_dir, exist_ok=True)
    # Extract the segment from the source video. Everything derived from
    # ``clip`` must be built and written before the ``with`` block closes it.
    with VideoFileClip(video_path) as clip:
        # Clamp so a rounded-up ``end`` slightly past the source duration
        # does not make ``subclip`` raise.
        end = min(end, clip.duration)
        segment = clip.subclip(start, end)
        dur = segment.duration
        final = None
        try:
            # Black canvas for the full vertical frame.
            bg = ColorClip(size=(final_width, final_height), color=(0, 0, 0), duration=dur)
            # Fit the video inside the middle region: scale to the full width,
            # but fall back to scaling by height when that would overflow the
            # middle band (e.g. portrait sources) and overlap title/caption.
            video_resized = segment.resize(width=final_width)
            if video_resized.h > middle_h:
                video_resized = segment.resize(height=middle_h)
            # Centre horizontally (a no-op for full-width landscape sources)
            # and vertically within the middle region.
            x_video = (final_width - video_resized.w) // 2
            y_video = top_h + (middle_h - video_resized.h) // 2
            video_resized = video_resized.set_position((x_video, y_video))

            # Build title clip; wrap the title to avoid overflow.
            wrapped_title = "\n".join(wrap_text(top_text, max_chars=40))
            title_clip = TextClip(
                wrapped_title,
                font=font_path,
                fontsize=70,
                color="white",
                method="caption",
                size=(final_width, top_h),
                align="center",
            ).set_duration(dur).set_position((0, 0))

            # Prepare font for caption rendering.
            pil_font = ImageFont.truetype(font_path, size=60)
            default_color = (255, 255, 255)  # white
            highlight_color = (255, 215, 0)  # gold-like yellow

            # Precompute the width of a space and a representative glyph
            # height for vertical centering; done once outside the per-frame
            # callback for speed.
            space_bbox = pil_font.getbbox(" ")
            space_width = space_bbox[2] - space_bbox[0]
            ref_bbox = pil_font.getbbox("A")
            text_height = ref_bbox[3] - ref_bbox[1]

            def make_caption_frame(t: float):
                """Generate an RGB frame (H x W x 3 uint8) for the caption at time t."""
                img = Image.new("RGB", (final_width, bottom_h), color=(0, 0, 0))
                if not words:
                    return np.array(img)
                # Find the word being spoken at ``t``; if ``t`` falls in a gap
                # or past the end, keep the last word already finished.
                idx_cur = 0
                for i, w in enumerate(words):
                    if w["start"] <= t < w["end"]:
                        idx_cur = i
                        break
                    if t >= w["end"]:
                        idx_cur = i
                # Sliding window of up to 5 words centred on the current one.
                start_idx = max(0, idx_cur - 2)
                end_idx = min(len(words), idx_cur + 3)
                window = words[start_idx:end_idx]
                # Measure each word so the whole window can be centred.
                word_sizes = []
                for w in window:
                    bbox = pil_font.getbbox(w["word"])
                    word_sizes.append(bbox[2] - bbox[0])
                total_width = sum(word_sizes) + space_width * (len(window) - 1 if window else 0)
                draw = ImageDraw.Draw(img)
                x = int((final_width - total_width) / 2)
                y_pos = int((bottom_h - text_height) / 2)
                for j, w in enumerate(window):
                    # Highlight only the currently spoken word.
                    color = highlight_color if (start_idx + j) == idx_cur else default_color
                    draw.text((x, y_pos), w["word"], font=pil_font, fill=color)
                    x += word_sizes[j] + space_width
                return np.array(img)

            caption_clip = VideoClip(make_frame=make_caption_frame, duration=dur)
            caption_clip = caption_clip.set_position((0, final_height - bottom_h))

            # Compose final clip (back-to-front layer order).
            final = CompositeVideoClip([
                bg,
                video_resized,
                title_clip,
                caption_clip,
            ], size=(final_width, final_height))
            # Carry over the original audio from the video segment, if any.
            final_audio = segment.audio
            if final_audio is not None:
                final = final.set_audio(final_audio)

            out_path = os.path.join(out_dir, f"clip_{idx}.mp4")
            final.write_videofile(
                out_path,
                codec=video_codec,
                fps=30,
                bitrate=bitrate,
                audio_codec="aac",
                preset="ultrafast",
                ffmpeg_params=[
                    "-tune", "zerolatency",
                    "-pix_fmt", "yuv420p",
                    "-profile:v", "high",
                    "-level", "4.1",
                ],
                threads=4,
            )
        finally:
            # Release ffmpeg readers/writers even if composition or the
            # write fails partway through.
            if final is not None:
                final.close()
            segment.close()
        return out_path
|
||||
Reference in New Issue
Block a user