#v2 - Inicia testes da v2

- Adiciona rastreamento de objetos - Facial detection - Legenda interativa - Cortes mais precisos - Refinamento do Prompt
2025-11-12 11:38:09 -03:00
parent 87c6a5e27c
commit c5d3e83a5f
15 changed files with 1739 additions and 313 deletions
--- a/video_render/rendering.py
+++ b/video_render/rendering.py
@@ -15,6 +15,7 @@ from PIL import Image, ImageColor, ImageDraw, ImageFont

 from video_render.config import Settings
 from video_render.transcription import TranscriptionResult, WordTiming
+from video_render.smart_framing import SmartFramer, extract_audio_samples

 logger = logging.getLogger(__name__)

@@ -54,7 +55,41 @@ class CaptionBuilder:
        self.space_width = self.font.getbbox(" ")[2] - self.font.getbbox(" ")[0]

    def build(self, words: Sequence[WordTiming], clip_start: float) -> List[CaptionClipSet]:
-        grouped = self._group_words(words)
+        # Filter out empty, whitespace-only, or very short words (likely noise)
+        valid_words = [
+            w for w in words
+            if w.word
+            and w.word.strip()
+            and len(w.word.strip()) >= 2  # At least 2 characters
+            and not w.word.strip() in ['...', '..', '.', ',', '-', 'hmm', 'hm', 'ah', 'eh', 'uh']  # Not just punctuation or filler
+        ]
+
+        # Note: We don't filter out words based on gaps here
+        # Gap detection is handled in _group_words_with_gaps
+        # This ensures captions disappear during silence naturally
+        filtered_words = valid_words
+
+        # Calculate speech density (words per second)
+        # If density is too low, it's likely just noise/silence being misinterpreted
+        if filtered_words:
+            first_word_time = filtered_words[0].start
+            last_word_time = filtered_words[-1].end
+            duration = last_word_time - first_word_time
+
+            if duration > 0:
+                words_per_second = len(filtered_words) / duration
+                # Typical speech is 2-3 words per second
+                # If less than 0.5 words/second, it's probably silence/noise
+                if words_per_second < 0.5:
+                    logger.debug(f"Captions suprimidas: densidade muito baixa ({words_per_second:.2f} palavras/seg)")
+                    return []
+
+        # Only show captions if we have at least 3 valid words (reduced from 5 for 2-word groups)
+        # This prevents showing captions for noise/mumbling
+        if len(filtered_words) < 3:
+            return []
+
+        grouped = self._group_words_with_gaps(filtered_words)
        clip_sets: List[CaptionClipSet] = []

        for group in grouped:
@@ -101,6 +136,92 @@ class CaptionBuilder:
        if len(widths) > 1:
            total_width += self.space_width * (len(widths) - 1)

+        # Check if text needs to wrap to multiple lines
+        # If total width exceeds canvas width, break into 2 lines
+        needs_wrap = total_width > self.canvas_width
+
+        if needs_wrap:
+            # Split into 2 lines - try to balance the lines
+            mid_point = len(texts) // 2
+            line1_texts = texts[:mid_point]
+            line2_texts = texts[mid_point:]
+            line1_widths = widths[:mid_point]
+            line2_widths = widths[mid_point:]
+
+            # Calculate widths for each line
+            line1_width = sum(line1_widths)
+            if len(line1_widths) > 1:
+                line1_width += self.space_width * (len(line1_widths) - 1)
+
+            line2_width = sum(line2_widths)
+            if len(line2_widths) > 1:
+                line2_width += self.space_width * (len(line2_widths) - 1)
+
+            # Double the canvas height for 2 lines
+            canvas_height = self.canvas_height * 2
+            base_image = Image.new("RGBA", (self.canvas_width, canvas_height), (0, 0, 0, 0))
+            base_draw = ImageDraw.Draw(base_image)
+            highlight_images: List[Image.Image] = []
+
+            # Stroke settings: 8px black stroke for better readability
+            stroke_width = 8
+            stroke_color = (0, 0, 0, 255)  # Black
+
+            # Draw line 1
+            x = max(0, (self.canvas_width - line1_width) // 2)
+            y = self.baseline
+            for i, (text, width) in enumerate(zip(line1_texts, line1_widths)):
+                base_draw.text(
+                    (x, y),
+                    text,
+                    font=self.font,
+                    fill=self.base_color,
+                    stroke_width=stroke_width,
+                    stroke_fill=stroke_color
+                )
+
+                highlight_image = Image.new("RGBA", base_image.size, (0, 0, 0, 0))
+                highlight_draw = ImageDraw.Draw(highlight_image)
+                highlight_draw.text(
+                    (x, y),
+                    text,
+                    font=self.font,
+                    fill=self.highlight_color,
+                    stroke_width=stroke_width,
+                    stroke_fill=stroke_color
+                )
+                highlight_images.append(highlight_image)
+                x += width + self.space_width
+
+            # Draw line 2
+            x = max(0, (self.canvas_width - line2_width) // 2)
+            y = self.baseline + self.text_height + 5  # 5px spacing between lines
+            for i, (text, width) in enumerate(zip(line2_texts, line2_widths)):
+                base_draw.text(
+                    (x, y),
+                    text,
+                    font=self.font,
+                    fill=self.base_color,
+                    stroke_width=stroke_width,
+                    stroke_fill=stroke_color
+                )
+
+                highlight_image = Image.new("RGBA", base_image.size, (0, 0, 0, 0))
+                highlight_draw = ImageDraw.Draw(highlight_image)
+                highlight_draw.text(
+                    (x, y),
+                    text,
+                    font=self.font,
+                    fill=self.highlight_color,
+                    stroke_width=stroke_width,
+                    stroke_fill=stroke_color
+                )
+                highlight_images.append(highlight_image)
+                x += width + self.space_width
+
+            return base_image, highlight_images
+
+        # Single line rendering (original code)
        start_x = max(0, (self.canvas_width - total_width) // 2)

        base_image = Image.new("RGBA", (self.canvas_width, self.canvas_height), (0, 0, 0, 0))
@@ -108,13 +229,31 @@ class CaptionBuilder:
        highlight_images: List[Image.Image] = []
        x = start_x

-        for text, width in zip(texts, widths):
-            base_draw.text((x, self.baseline), text, font=self.font, fill=self.base_color)
+        # Stroke settings: 8px black stroke for better readability
+        stroke_width = 8
+        stroke_color = (0, 0, 0, 255)  # Black

+        for text, width in zip(texts, widths):
+            # Draw base text with stroke
+            base_draw.text(
+                (x, self.baseline),
+                text,
+                font=self.font,
+                fill=self.base_color,
+                stroke_width=stroke_width,
+                stroke_fill=stroke_color
+            )
+
+            # Draw highlight text with stroke
            highlight_image = Image.new("RGBA", base_image.size, (0, 0, 0, 0))
            highlight_draw = ImageDraw.Draw(highlight_image)
            highlight_draw.text(
-                (x, self.baseline), text, font=self.font, fill=self.highlight_color
+                (x, self.baseline),
+                text,
+                font=self.font,
+                fill=self.highlight_color,
+                stroke_width=stroke_width,
+                stroke_fill=stroke_color
            )
            highlight_images.append(highlight_image)

@@ -153,6 +292,44 @@ class CaptionBuilder:

        return grouped

+    def _group_words_with_gaps(self, words: Sequence[WordTiming]) -> List[List[WordTiming]]:
+        """Group words into caption chunks of two, never spanning a long pause.
+
+        A silence longer than 1.5s between consecutive words always starts a
+        new group. A single trailing word is folded into the previous group
+        only when no such pause separates them, so a caption never stays on
+        screen across a silent stretch.
+
+        Args:
+            words: Word timings (``start``/``end`` in seconds), in spoken order.
+
+        Returns:
+            Non-empty groups of one to three words, in the original order.
+        """
+        max_gap = 1.5  # seconds of silence that force a caption break
+        grouped: List[List[WordTiming]] = []
+        buffer: List[WordTiming] = []
+
+        for word in words:
+            # A long pause closes the pending group before this word joins it.
+            if buffer and word.start - buffer[-1].end > max_gap:
+                grouped.append(buffer)
+                buffer = []
+            buffer.append(word)
+
+            # Captions show at most two words at a time.
+            if len(buffer) == 2:
+                grouped.append(buffer)
+                buffer = []
+
+        if len(buffer) == 1 and grouped and buffer[0].start - grouped[-1][-1].end <= max_gap:
+            # Avoid a lone one-word caption when the leftover word follows closely.
+            grouped[-1].append(buffer[0])
+        elif buffer:
+            grouped.append(buffer)
+
+        return grouped
+
    @staticmethod
    def _clean_word(text: str) -> str:
        text = text.strip()
@@ -164,6 +341,12 @@ class VideoRenderer:
    def __init__(self, settings: Settings) -> None:
        self.settings = settings
        self.captions = CaptionBuilder(settings)
+        self.smart_framer = SmartFramer(
+            target_width=settings.rendering.frame_width,
+            target_height=settings.rendering.frame_height,
+            frame_skip=settings.rendering.smart_framing_frame_skip,
+            smoothing_window=settings.rendering.smart_framing_smoothing_window
+        )

    def render(
        self,
@@ -234,26 +417,100 @@ class VideoRenderer:
        duration = end - start
        frame_w = self.settings.rendering.frame_width
        frame_h = self.settings.rendering.frame_height
-        top_h = int(frame_h * 0.18)
+        # Removed top panel - no longer showing title
        bottom_h = int(frame_h * 0.20)
-        video_area_h = max(1, frame_h - top_h - bottom_h)

-        scale_factor = min(
-            frame_w / subclip.w,
-            video_area_h / subclip.h,
-        )
-        resized_clip = subclip.resized(scale_factor)
-        video_y = top_h + (video_area_h - resized_clip.h) // 2
-        video_clip = resized_clip.with_position(
-            ((frame_w - resized_clip.w) // 2, video_y)
-        )
+        # Use smart framing to create intelligent 9:16 video (if enabled)
+        if self.settings.rendering.enable_smart_framing:
+            logger.info(f"Creating smart framing plan for clip {index} ({start:.2f}s - {end:.2f}s)")
+
+            try:
+                # Extract audio for speech detection
+                audio_samples = extract_audio_samples(source_path, start, end)
+
+                # Create framing plan
+                framing_plan = self.smart_framer.create_framing_plan(
+                    video_path=source_path,
+                    start_time=start,
+                    end_time=end,
+                    audio_samples=audio_samples
+                )
+
+                # Apply smart framing based on detected layout
+                use_split_screen = framing_plan.layout_mode in ["dual_split", "grid"]
+                video_clip = self.smart_framer.apply_framing(
+                    video_clip=subclip,
+                    framing_plan=framing_plan,
+                    use_split_screen=use_split_screen
+                )
+
+                logger.info(f"Smart framing applied: layout={framing_plan.layout_mode}, "
+                           f"faces_detected={len(framing_plan.frame_contexts[0].detected_faces) if framing_plan.frame_contexts else 0}")
+
+            except Exception as exc:
+                logger.warning(f"Smart framing failed for clip {index}, falling back to center crop: {exc}", exc_info=True)
+
+                # Fallback to center crop (maintains aspect ratio, crops to fit)
+                video_area_h = max(1, frame_h - bottom_h)
+
+                # Use MAX to ensure video covers entire area (will crop excess)
+                scale_factor = max(
+                    frame_w / subclip.w,
+                    video_area_h / subclip.h,
+                )
+
+                # Resize to cover area
+                resized_clip = subclip.resized(scale_factor)
+
+                # Calculate crop region (center crop)
+                crop_x1 = max(0, (resized_clip.w - frame_w) // 2)
+                crop_y1 = max(0, (resized_clip.h - video_area_h) // 2)
+                crop_x2 = crop_x1 + frame_w
+                crop_y2 = crop_y1 + video_area_h
+
+                # Crop to fit target dimensions using MoviePy cropped(x1, y1, x2, y2)
+                cropped_clip = resized_clip.cropped(
+                    x1=crop_x1,
+                    y1=crop_y1,
+                    x2=crop_x2,
+                    y2=crop_y2
+                )
+
+                video_clip = cropped_clip.with_position((0, 0))
+                resized_clip.close()
+        else:
+            # Use center crop (smart framing disabled)
+            logger.info(f"Using center crop for clip {index} (smart framing disabled)")
+            video_area_h = max(1, frame_h - bottom_h)
+
+            # Use MAX to ensure video covers entire area (will crop excess)
+            scale_factor = max(
+                frame_w / subclip.w,
+                video_area_h / subclip.h,
+            )
+
+            # Resize to cover area
+            resized_clip = subclip.resized(scale_factor)
+
+            # Calculate crop region (center crop)
+            crop_x1 = max(0, (resized_clip.w - frame_w) // 2)
+            crop_y1 = max(0, (resized_clip.h - video_area_h) // 2)
+            crop_x2 = crop_x1 + frame_w
+            crop_y2 = crop_y1 + video_area_h
+
+            # Crop to fit target dimensions using MoviePy cropped(x1, y1, x2, y2)
+            cropped_clip = resized_clip.cropped(
+                x1=crop_x1,
+                y1=crop_y1,
+                x2=crop_x2,
+                y2=crop_y2
+            )
+
+            video_clip = cropped_clip.with_position((0, 0))
+            resized_clip.close()

        background = ColorClip(size=(frame_w, frame_h), color=(0, 0, 0)).with_duration(duration)
-        top_panel = (
-            ColorClip(size=(frame_w, top_h), color=(12, 12, 12))
-            .with_duration(duration)
-            .with_opacity(0.85)
-        )
+        # Removed top panel and title - no longer needed
        bottom_panel = (
            ColorClip(size=(frame_w, bottom_h), color=(12, 12, 12))
            .with_position((0, frame_h - bottom_h))
@@ -261,34 +518,42 @@ class VideoRenderer:
            .with_opacity(0.85)
        )

-        title_clip = self._build_title_clip(
-            title=title,
-            summary=summary,
-            duration=duration,
-            frame_width=frame_w,
-            top_panel_height=top_h,
-        )
-        title_clip = title_clip.with_position(
-            ((frame_w - title_clip.w) // 2, (top_h - title_clip.h) // 2)
-        )
-
        words = self._collect_words(transcription, start, end)
-        caption_sets = self.captions.build(words, clip_start=start)
+
+        # Calculate speech coverage: how much of the clip has actual speech?
+        # If less than 30% of the clip has speech, don't show captions
+        clip_duration = end - start
+        if words and clip_duration > 0:
+            # Calculate total time with speech
+            total_speech_time = sum(w.end - w.start for w in words)
+            speech_coverage = total_speech_time / clip_duration
+
+            if speech_coverage < 0.3:  # Less than 30% speech
+                logger.debug(f"Captions suprimidas: cobertura de fala baixa ({speech_coverage:.1%})")
+                words = []  # Clear words to prevent captions
+
+        # Only build captions if there are actual words to display
+        # This prevents empty/placeholder captions from appearing
+        caption_sets = self.captions.build(words, clip_start=start) if words else []

        caption_clips = []
        caption_resources: List[ImageClip] = []
-        caption_area_top = frame_h - bottom_h
-        caption_area_height = bottom_h
+
+        # Position captions 120px below center (for 1920px height, center is 960px, so 1080px)
+        # This ensures they're visible, well-positioned, and don't interfere with faces
+        # Range: 100-150px as requested, using 120px for optimal positioning
+        center_y = frame_h // 2
+        caption_y = center_y + 120
        caption_margin = 20
-        raw_caption_y = caption_area_top + (caption_area_height - self.captions.canvas_height) // 2
-        min_caption_y = caption_area_top + caption_margin
-        max_caption_y = (
-            caption_area_top + caption_area_height - self.captions.canvas_height - caption_margin
-        )
+
+        # Ensure captions stay within reasonable bounds (no top panel now)
+        min_caption_y = caption_margin
+        max_caption_y = frame_h - bottom_h - self.captions.canvas_height - caption_margin
+
        if max_caption_y < min_caption_y:
            caption_y = min_caption_y
        else:
-            caption_y = min(max(raw_caption_y, min_caption_y), max_caption_y)
+            caption_y = min(max(caption_y, min_caption_y), max_caption_y)

        for clip_set in caption_sets:
            base_positioned = clip_set.base.with_position(("center", caption_y))
@@ -299,30 +564,20 @@ class VideoRenderer:
                caption_clips.append(positioned)
                caption_resources.append(highlight)

-        if not caption_clips:
-            fallback_text = self._wrap_text(summary or title, max_width=frame_w - 160)
-            caption_clips.append(
-                self._make_textclip(
-                    text=fallback_text,
-                    font_path=self.settings.rendering.font_path,
-                    font_size=self.settings.rendering.subtitle_font_size,
-                    color=self.settings.rendering.base_color,
-                    size=(frame_w - 160, max(40, self.captions.canvas_height)),
-                )
-                .with_duration(duration)
-                .with_position(("center", caption_y))
-            )
+        # No fallback captions - if there are no dynamic captions, show nothing
+        # This matches Opus Clip behavior where captions only appear when there's actual speech

        audio_clip, audio_needs_close = self._materialize_audio(
            source_path=source_path,
            start=start,
            end=end,
            duration=duration,
-            fallback_audio=video_clip.audio or resized_clip.audio or subclip.audio,
+            fallback_audio=video_clip.audio or subclip.audio,
        )

+        # Composite with background, bottom panel, video, and captions only (no top panel or title)
        composite = CompositeVideoClip(
-            [background, top_panel, bottom_panel, video_clip, title_clip, *caption_clips],
+            [background, bottom_panel, video_clip, *caption_clips],
            size=(frame_w, frame_h),
        )
        if audio_clip is not None:
@@ -337,11 +592,8 @@ class VideoRenderer:
        )

        composite.close()
-        resized_clip.close()
        video_clip.close()
-        title_clip.close()
        background.close()
-        top_panel.close()
        bottom_panel.close()
        for clip in caption_clips:
            clip.close()
@@ -352,95 +604,6 @@ class VideoRenderer:

        return str(output_path)

-    def _build_title_clip(
-        self,
-        *,
-        title: str,
-        summary: str,
-        duration: float,
-        frame_width: int,
-        top_panel_height: int,
-    ) -> ImageClip:
-        text = (title or summary or "").strip()
-        if not text:
-            text = summary or ""
-
-        max_width = max(200, frame_width - 160)
-        font_size = self.settings.rendering.title_font_size
-        min_font_size = max(28, int(font_size * 0.6))
-        target_height = max(80, top_panel_height - 40)
-        title_color = ImageColor.getrgb(self.settings.rendering.base_color)
-        font_path = self.settings.rendering.font_path
-
-        while True:
-            font = ImageFont.truetype(str(font_path), font_size)
-            lines = self._split_title_lines(text, font, max_width)
-            line_height = font.getbbox("Ay")[3] - font.getbbox("Ay")[1]
-            spacing = max(4, int(line_height * 0.25))
-            text_height = self._measure_text_height(len(lines), line_height, spacing)
-
-            if text_height <= target_height or font_size <= min_font_size:
-                break
-
-            font_size = max(min_font_size, font_size - 6)
-
-        # Recompute dimensions with final font size to ensure consistency
-        font = ImageFont.truetype(str(font_path), font_size)
-        lines = self._split_title_lines(text, font, max_width)
-        line_height = font.getbbox("Ay")[3] - font.getbbox("Ay")[1]
-        spacing = max(4, int(line_height * 0.25))
-        text_height = self._measure_text_height(len(lines), line_height, spacing)
-        canvas_height = max(1, text_height)
-
-        image = Image.new("RGBA", (max_width, canvas_height), (0, 0, 0, 0))
-        draw = ImageDraw.Draw(image)
-        y = 0
-        for idx, line in enumerate(lines):
-            bbox = font.getbbox(line)
-            line_width = bbox[2] - bbox[0]
-            x = max(0, (max_width - line_width) // 2)
-            draw.text((x, y - bbox[1]), line, font=font, fill=title_color)
-            y += line_height
-            if idx < len(lines) - 1:
-                y += spacing
-
-        return ImageClip(np.array(image)).with_duration(duration)
-
-    @staticmethod
-    def _measure_text_height(line_count: int, line_height: int, spacing: int) -> int:
-        if line_count <= 0:
-            return line_height
-        return line_count * line_height + max(0, line_count - 1) * spacing
-
-    @staticmethod
-    def _split_title_lines(
-        text: str, font: ImageFont.FreeTypeFont, max_width: int
-    ) -> List[str]:
-        words = text.split()
-        if not words:
-            return [""]
-
-        lines: List[str] = []
-        current: List[str] = []
-        for word in words:
-            test_line = " ".join(current + [word]) if current else word
-            bbox = font.getbbox(test_line)
-            line_width = bbox[2] - bbox[0]
-            if line_width <= max_width or not current:
-                current.append(word)
-                if line_width > max_width and not current[:-1]:
-                    lines.append(" ".join(current))
-                    current = []
-                continue
-
-            lines.append(" ".join(current))
-            current = [word]
-
-        if current:
-            lines.append(" ".join(current))
-
-        return lines
-
    def _materialize_audio(
        self,
        *,