Create new components
video_render/rendering.py (new file, 406 lines)
@@ -0,0 +1,406 @@
from __future__ import annotations

import logging
import re
from dataclasses import dataclass
from typing import Iterable, List, Sequence, Tuple

import numpy as np
from moviepy import (
    ColorClip,
    CompositeVideoClip,
    ImageClip,
    TextClip,
    VideoFileClip,
)
from PIL import Image, ImageColor, ImageDraw, ImageFont

from .config import Settings
from .transcription import TranscriptionResult, WordTiming

logger = logging.getLogger(__name__)


def clamp_time(value: float, minimum: float = 0.0) -> float:
    return max(minimum, float(value))


@dataclass
class CaptionClipSet:
    base: ImageClip
    highlights: List[ImageClip]


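# Renders one PIL image per caption group (the full line in the base color)
# plus one overlay image per word in the highlight color, then turns both
# into timed ImageClips for karaoke-style captions.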
class CaptionBuilder:
    def __init__(self, settings: Settings) -> None:
        self.settings = settings
        self.font_path = settings.rendering.font_path
        if not self.font_path.exists():
            raise FileNotFoundError(f"Fonte nao encontrada: {self.font_path}")

        self.font = ImageFont.truetype(
            str(self.font_path), settings.rendering.subtitle_font_size
        )
        self.base_color = ImageColor.getrgb(settings.rendering.base_color)
        self.highlight_color = ImageColor.getrgb(settings.rendering.highlight_color)
        self.canvas_width = settings.rendering.frame_width - 160
        self.canvas_height = int(settings.rendering.subtitle_font_size * 2.2)
        self.min_words = settings.rendering.caption_min_words
        self.max_words = settings.rendering.caption_max_words

        bbox = self.font.getbbox("Ay")
        self.text_height = bbox[3] - bbox[1]
        self.baseline = (self.canvas_height - self.text_height) // 2 - bbox[1]
        self.space_width = self.font.getbbox(" ")[2] - self.font.getbbox(" ")[0]

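    # Word timings are absolute to the source video; the resulting clips are
    # composited onto a subclip, so every start/end is shifted by `clip_start`.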
    def build(self, words: Sequence[WordTiming], clip_start: float) -> List[CaptionClipSet]:
        grouped = self._group_words(words)
        clip_sets: List[CaptionClipSet] = []

        for group in grouped:
            group_start = clamp_time(group[0].start, minimum=clip_start)
            group_end = clamp_time(group[-1].end, minimum=group_start + 0.05)
            duration = max(0.05, group_end - group_start)
            start_offset = group_start - clip_start

            base_image, highlight_images = self._render_group(group)

            base_clip = (
                ImageClip(np.array(base_image))
                .with_start(start_offset)
                .with_duration(duration)
            )

            highlight_clips: List[ImageClip] = []
            for word, image in zip(group, highlight_images):
                h_start = clamp_time(word.start, minimum=clip_start) - clip_start
                h_end = clamp_time(word.end, minimum=word.start + 0.02) - clip_start
                h_duration = max(0.05, h_end - h_start)
                highlight_clip = (
                    ImageClip(np.array(image))
                    .with_start(h_start)
                    .with_duration(h_duration)
                )
                highlight_clips.append(highlight_clip)

            clip_sets.append(CaptionClipSet(base=base_clip, highlights=highlight_clips))

        return clip_sets

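    # Single centered line: draw all words once in the base color, and emit a
    # same-sized transparent image per word drawn in the highlight color.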
    def _render_group(self, group: Sequence[WordTiming]) -> Tuple[Image.Image, List[Image.Image]]:
        texts = [self._clean_word(word.word) for word in group]

        widths = []
        for text in texts:
            bbox = self.font.getbbox(text)
            widths.append(bbox[2] - bbox[0])

        total_width = sum(widths)
        if len(widths) > 1:
            total_width += self.space_width * (len(widths) - 1)

        start_x = max(0, (self.canvas_width - total_width) // 2)

        base_image = Image.new("RGBA", (self.canvas_width, self.canvas_height), (0, 0, 0, 0))
        base_draw = ImageDraw.Draw(base_image)
        highlight_images: List[Image.Image] = []

        x = start_x
        for text, width in zip(texts, widths):
            base_draw.text((x, self.baseline), text, font=self.font, fill=self.base_color)

            highlight_image = Image.new("RGBA", base_image.size, (0, 0, 0, 0))
            highlight_draw = ImageDraw.Draw(highlight_image)
            highlight_draw.text(
                (x, self.baseline), text, font=self.font, fill=self.highlight_color
            )
            highlight_images.append(highlight_image)

            x += width + self.space_width

        return base_image, highlight_images

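    # Chunk words into runs of `max_words`; a trailing single word is merged
    # into the previous group, and undersized groups borrow from their
    # successor so groups stay at or above `min_words` when possible.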
    def _group_words(self, words: Sequence[WordTiming]) -> List[List[WordTiming]]:
        if not words:
            return []

        grouped: List[List[WordTiming]] = []
        buffer: List[WordTiming] = []

        for word in words:
            buffer.append(word)
            if len(buffer) == self.max_words:
                grouped.append(buffer)
                buffer = []

        if buffer:
            if len(buffer) == 1 and grouped:
                grouped[-1].extend(buffer)
            else:
                grouped.append(buffer)

        # Rebalance groups to respect minimum size when possible
        for idx, group in enumerate(grouped[:-1]):
            if len(group) < self.min_words and len(grouped[idx + 1]) > self.min_words:
                deficit = self.min_words - len(group)
                transfer = grouped[idx + 1][:deficit]
                grouped[idx] = group + transfer
                grouped[idx + 1] = grouped[idx + 1][deficit:]

        grouped = [grp for grp in grouped if grp]
        return grouped

    @staticmethod
    def _clean_word(text: str) -> str:
        text = text.strip()
        text = re.sub(r"\s+", " ", text)
        return text or "..."


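# Composites each highlight window into a framed clip: title panel on top,
# the source video centered in the middle band, and word-synced captions at
# the bottom.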
class VideoRenderer:
    def __init__(self, settings: Settings) -> None:
        self.settings = settings
        self.captions = CaptionBuilder(settings)

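    # Returns one tuple per rendered window:
    # (output_path, start, end, title, summary, index).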
    def render(
        self,
        workspace_path: str,
        highlight_windows: Sequence,
        transcription: TranscriptionResult,
        titles: Sequence[str],
        output_dir,
    ) -> List[Tuple[str, float, float, str, str, int]]:
        results: List[Tuple[str, float, float, str, str, int]] = []

        with VideoFileClip(workspace_path) as base_clip:
            video_duration = base_clip.duration or 0
            for index, window in enumerate(highlight_windows, start=1):
                start = clamp_time(window.start)
                end = clamp_time(window.end)
                start = min(start, video_duration)
                end = min(end, video_duration)
                if end <= start:
                    logger.info("Janela ignorada por intervalo invalido: %s", window)
                    continue

                subclip = base_clip.subclipped(start, end)
                try:
                    rendered_path = self._render_single_clip(
                        subclip=subclip,
                        start=start,
                        end=end,
                        title=titles[index - 1] if index - 1 < len(titles) else window.summary,
                        summary=window.summary,
                        index=index,
                        transcription=transcription,
                        output_dir=output_dir,
                    )
                finally:
                    subclip.close()

                results.append(
                    (
                        rendered_path,
                        float(start),
                        float(end),
                        titles[index - 1] if index - 1 < len(titles) else window.summary,
                        window.summary,
                        index,
                    )
                )

        return results

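    # Layout: top 18% of the frame for the title panel, bottom 20% for
    # captions, and the subclip scaled to fit the band in between.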
    def _render_single_clip(
        self,
        subclip: VideoFileClip,
        start: float,
        end: float,
        title: str,
        summary: str,
        index: int,
        transcription: TranscriptionResult,
        output_dir,
    ) -> str:
        duration = end - start
        frame_w = self.settings.rendering.frame_width
        frame_h = self.settings.rendering.frame_height
        top_h = int(frame_h * 0.18)
        bottom_h = int(frame_h * 0.20)
        video_area_h = frame_h - top_h - bottom_h

        scale_factor = min(
            frame_w / subclip.w,
            video_area_h / subclip.h,
        )
        resized_clip = subclip.resized(scale_factor)
        video_y = top_h + (video_area_h - resized_clip.h) // 2

        video_clip = resized_clip.with_position(
            ((frame_w - resized_clip.w) // 2, video_y)
        )

        background = ColorClip(size=(frame_w, frame_h), color=(0, 0, 0)).with_duration(duration)
        top_panel = (
            ColorClip(size=(frame_w, top_h), color=(12, 12, 12))
            .with_duration(duration)
            .with_opacity(0.85)
        )
        bottom_panel = (
            ColorClip(size=(frame_w, bottom_h), color=(12, 12, 12))
            .with_position((0, frame_h - bottom_h))
            .with_duration(duration)
            .with_opacity(0.85)
        )

        title_text = title or summary
        wrapped_title = self._wrap_text(title_text, max_width=frame_w - 160)
        title_clip = (
            TextClip(
                text=wrapped_title,
                font=str(self.settings.rendering.font_path),
                font_size=self.settings.rendering.title_font_size,
                color=self.settings.rendering.base_color,
                method="caption",
                size=(frame_w - 160, top_h - 40),
            )
            .with_duration(duration)
        )
        title_clip = title_clip.with_position(
            ((frame_w - title_clip.w) // 2, (top_h - title_clip.h) // 2)
        )

        words = self._collect_words(transcription, start, end)
        caption_sets = self.captions.build(words, clip_start=start)

        caption_clips = []
        caption_resources: List[ImageClip] = []
        caption_y = frame_h - bottom_h + (bottom_h - self.captions.canvas_height) // 2
        for clip_set in caption_sets:
            base_positioned = clip_set.base.with_position(("center", caption_y))
            caption_clips.append(base_positioned)
            caption_resources.append(clip_set.base)
            for highlight in clip_set.highlights:
                positioned = highlight.with_position(("center", caption_y))
                caption_clips.append(positioned)
                caption_resources.append(highlight)

        if not caption_clips:
            fallback_text = self._wrap_text(summary or title, max_width=frame_w - 160)
            caption_clips.append(
                TextClip(
                    text=fallback_text,
                    font=str(self.settings.rendering.font_path),
                    font_size=self.settings.rendering.subtitle_font_size,
                    color=self.settings.rendering.base_color,
                    method="caption",
                    size=(frame_w - 160, bottom_h - 40),
                )
                .with_duration(duration)
                .with_position(("center", caption_y))
            )

        composite = CompositeVideoClip(
            [background, top_panel, bottom_panel, video_clip, title_clip, *caption_clips],
            size=(frame_w, frame_h),
        )

        output_path = output_dir / f"clip_{index:02d}.mp4"
        composite.write_videofile(
            str(output_path),
            codec=self.settings.rendering.video_codec,
            audio_codec=self.settings.rendering.audio_codec,
            fps=self.settings.rendering.fps,
            bitrate=self.settings.rendering.bitrate,
            ffmpeg_params=[
                "-preset",
                self.settings.rendering.preset,
                "-pix_fmt",
                "yuv420p",
            ],
            temp_audiofile=str(output_dir / f"temp_audio_{index:02d}.m4a"),
            remove_temp=True,
            threads=4,
        )

        composite.close()
        resized_clip.close()
        video_clip.close()
        title_clip.close()
        background.close()
        top_panel.close()
        bottom_panel.close()
        for clip in caption_clips:
            clip.close()
        for clip in caption_resources:
            clip.close()

        return str(output_path)

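    # Collects word timings that overlap [start, end], clamping each word to
    # the window; segments without word-level timings fall back to evenly
    # spaced estimates.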
    def _collect_words(
        self, transcription: TranscriptionResult, start: float, end: float
    ) -> List[WordTiming]:
        collected: List[WordTiming] = []
        for segment in transcription.segments:
            if segment.end < start or segment.start > end:
                continue

            if segment.words:
                for word in segment.words:
                    if word.end < start or word.start > end:
                        continue
                    collected.append(
                        WordTiming(
                            start=max(start, word.start),
                            end=min(end, word.end),
                            word=word.word,
                        )
                    )
            else:
                collected.extend(self._fallback_words(segment.text, segment.start, segment.end, start, end))

        collected.sort(key=lambda w: w.start)
        return collected

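    # No word-level timings: split the segment text on whitespace and spread
    # the words uniformly over the segment's overlap with the window.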
    def _fallback_words(
        self,
        text: str,
        segment_start: float,
        segment_end: float,
        window_start: float,
        window_end: float,
    ) -> Iterable[WordTiming]:
        words = [w for w in re.split(r"\s+", text.strip()) if w]
        if not words:
            return []

        seg_start = max(segment_start, window_start)
        seg_end = min(segment_end, window_end)
        duration = max(0.01, seg_end - seg_start)
        step = duration / len(words)

        timings: List[WordTiming] = []
        for idx, word in enumerate(words):
            w_start = seg_start + idx * step
            w_end = min(seg_end, w_start + step)
            timings.append(WordTiming(start=w_start, end=w_end, word=word))
        return timings

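    # Greedy line wrap on an estimated character budget: `max_width` is in
    # pixels, and `max_width // 18` approximates the characters that fit per
    # line.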
    @staticmethod
    def _wrap_text(text: str, max_width: int) -> str:
        text = text.strip()
        if not text:
            return ""

        words = text.split()
        lines: List[str] = []
        current: List[str] = []
        for word in words:
            current.append(word)
            if len(current) > 1 and len(" ".join(current)) > max_width // 18:
                lines.append(" ".join(current[:-1]))
                current = [current[-1]]
        if current:
            lines.append(" ".join(current))
        return "\n".join(lines)