#v2 - Start v2 testing
- Adds object tracking
- Facial detection
- Interactive captions
- More precise cuts
- Prompt refinement
video_render/smart_framing.py (new file, 687 lines)
@@ -0,0 +1,687 @@
"""
Smart framing module for intelligent video cropping and composition.

This module provides functionality to create 9:16 vertical videos with
intelligent framing that follows the action and speakers.
"""
from __future__ import annotations

import logging
from dataclasses import dataclass
from typing import List, Optional, Tuple

import cv2
import numpy as np
from moviepy.video.VideoClip import VideoClip
from moviepy.video.io.VideoFileClip import VideoFileClip
from scipy import signal

from video_render.context_detection import ContextAnalyzer, FrameContext, FaceDetection

logger = logging.getLogger(__name__)


@dataclass
class CropRegion:
    """Defines a crop region for a frame."""
    x: int
    y: int
    width: int
    height: int


@dataclass
class FramingPlan:
    """Complete framing plan for a video segment."""
    frame_contexts: List[FrameContext]
    crop_regions: List[CropRegion]
    layout_mode: str
    fps: float


class SmartFramer:
    """Creates intelligent 9:16 framing for horizontal videos."""

    def __init__(
        self,
        target_width: int = 1080,
        target_height: int = 1920,
        frame_skip: int = 2,
        smoothing_window: int = 15
    ):
        self.target_width = target_width
        self.target_height = target_height
        self.target_aspect = target_height / target_width

        # Performance parameters
        self.frame_skip = frame_skip  # Process every Nth frame (CPU optimization)

        # Smoothing parameters
        self.smoothing_window = smoothing_window
        self.max_velocity = 30  # pixels per frame (reduced for smoother transitions)

        logger.info(f"Smart framer initialized (target: {target_width}x{target_height}, frame_skip={frame_skip})")

    def create_framing_plan(
        self,
        video_path: str,
        start_time: float,
        end_time: float,
        audio_samples: Optional[np.ndarray] = None
    ) -> FramingPlan:
        """
        Analyze video and create a complete framing plan.

        Args:
            video_path: Path to video file
            start_time: Start time in seconds
            end_time: End time in seconds
            audio_samples: Optional audio samples for speech detection

        Returns:
            FramingPlan with all frame contexts and crop regions
        """
        analyzer = ContextAnalyzer()

        # Detect speaking periods from audio if available
        speaking_periods = None
        if audio_samples is not None:
            speaking_periods = analyzer.audio_detector.detect_speaking_periods(audio_samples)

        # Open video with error suppression for AV1 codec warnings
        import os
        os.environ['OPENCV_FFMPEG_CAPTURE_OPTIONS'] = 'loglevel;quiet'

        cap = cv2.VideoCapture(video_path)
        fps = cap.get(cv2.CAP_PROP_FPS)

        # Calculate frame range
        start_frame = int(start_time * fps)
        end_frame = int(end_time * fps)

        # Set to start frame
        cap.set(cv2.CAP_PROP_POS_FRAMES, start_frame)

        frame_contexts = []
        frame_number = start_frame
        processed_count = 0

        logger.info(f"Analyzing frames {start_frame} to {end_frame} (fps={fps}, skip={self.frame_skip})")

        while frame_number < end_frame:
            ret, frame = cap.read()
            if not ret:
                break

            # Only process every Nth frame for performance (CPU optimization)
            if processed_count % self.frame_skip == 0:
                timestamp = frame_number / fps
                context = analyzer.analyze_frame(frame, timestamp, frame_number, speaking_periods)
                frame_contexts.append(context)

            frame_number += 1
            processed_count += 1

        # Get video dimensions before releasing capture
        source_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        source_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

        cap.release()
        analyzer.close()

        # Determine overall layout mode (most common)
        layout_modes = [ctx.layout_mode for ctx in frame_contexts]
        if layout_modes:
            overall_layout = max(set(layout_modes), key=layout_modes.count)
        else:
            overall_layout = "single"

        # Calculate crop regions based on contexts
        crop_regions = self._calculate_crop_regions(
            frame_contexts,
            source_width,
            source_height
        )

        return FramingPlan(
            frame_contexts=frame_contexts,
            crop_regions=crop_regions,
            layout_mode=overall_layout,
            fps=fps
        )

    def _calculate_crop_regions(
        self,
        contexts: List[FrameContext],
        source_width: int,
        source_height: int
    ) -> List[CropRegion]:
        """
        Calculate smooth crop regions for each frame.

        Args:
            contexts: List of frame contexts
            source_width: Source video width
            source_height: Source video height

        Returns:
            List of crop regions
        """
        if not contexts:
            return []

        # Calculate ideal crop dimensions maintaining EXACT 9:16 aspect ratio
        source_aspect = source_width / source_height

        if source_aspect > self.target_aspect:
            # Source is wider - crop horizontally (use full height)
            crop_height = source_height
            crop_width = int(crop_height / self.target_aspect)

            # Ensure crop width fits within source
            if crop_width > source_width:
                crop_width = source_width
                crop_height = int(crop_width * self.target_aspect)
        else:
            # Source is taller - crop vertically (use full width)
            crop_width = source_width
            crop_height = int(crop_width * self.target_aspect)

            # Ensure crop height fits within source
            if crop_height > source_height:
                crop_height = source_height
                crop_width = int(crop_height / self.target_aspect)

        # Calculate center points for each frame
        # Since we now always focus on ONE person directly (not averaging),
        # we can use the focus point directly without complex validation
        center_xs = []
        center_ys = []

        for ctx in contexts:
            if ctx.primary_focus:
                # Primary focus is now always a single person's center, never averaged
                # This means it will never be on the table/empty space
                center_xs.append(ctx.primary_focus[0])
                center_ys.append(ctx.primary_focus[1])
            else:
                # Default to center only if no faces detected at all
                center_xs.append(source_width // 2)
                center_ys.append(source_height // 2)

        # Smooth the center points
        if len(center_xs) > self.smoothing_window:
            kernel_size = min(self.smoothing_window, len(center_xs))
            if kernel_size % 2 == 0:
                kernel_size -= 1

            center_xs = signal.medfilt(center_xs, kernel_size=kernel_size).tolist()
            center_ys = signal.medfilt(center_ys, kernel_size=kernel_size).tolist()

        # Limit velocity (prevent jarring movements)
        center_xs = self._limit_velocity(center_xs, self.max_velocity)
        center_ys = self._limit_velocity(center_ys, self.max_velocity)

        # Convert to crop regions
        crop_regions = []
        for center_x, center_y in zip(center_xs, center_ys):
            # Calculate top-left corner
            x = int(center_x - crop_width // 2)
            y = int(center_y - crop_height // 2)

            # Clamp to valid bounds
            x = max(0, min(x, source_width - crop_width))
            y = max(0, min(y, source_height - crop_height))

            crop_regions.append(CropRegion(
                x=x,
                y=y,
                width=crop_width,
                height=crop_height
            ))

        return crop_regions

    def _limit_velocity(self, positions: List[float], max_velocity: float) -> List[float]:
        """
        Limit the velocity of position changes.

        Args:
            positions: List of positions
            max_velocity: Maximum allowed change per frame

        Returns:
            Smoothed positions
        """
        if len(positions) <= 1:
            return positions

        limited = [positions[0]]

        for i in range(1, len(positions)):
            delta = positions[i] - limited[i - 1]
            if abs(delta) > max_velocity:
                delta = max_velocity if delta > 0 else -max_velocity

            limited.append(limited[i - 1] + delta)

        return limited

    def apply_framing(
        self,
        video_clip: VideoFileClip,
        framing_plan: FramingPlan,
        use_split_screen: bool = False
    ) -> VideoClip:
        """
        Apply smart framing to a video clip.

        Args:
            video_clip: Source video clip
            framing_plan: Framing plan to apply
            use_split_screen: Whether to use split screen for multiple people

        Returns:
            Reframed video clip
        """
        # Handle different layout modes
        if framing_plan.layout_mode in ["single", "single_speaker"]:
            # Single person or single speaker - use focused single framing
            return self._apply_single_framing(video_clip, framing_plan)
        elif framing_plan.layout_mode == "dual_split" and use_split_screen:
            # Two people in conversation - use split screen
            return self._apply_split_screen(video_clip, framing_plan)
        elif framing_plan.layout_mode == "grid" and use_split_screen:
            # 3+ people - use grid layout
            return self._apply_grid_layout(video_clip, framing_plan)
        else:
            # Fallback to single framing
            return self._apply_single_framing(video_clip, framing_plan)

    def _apply_single_framing(
        self,
        video_clip: VideoFileClip,
        framing_plan: FramingPlan
    ) -> VideoClip:
        """
        Apply single-focus framing (following one person or action).

        Args:
            video_clip: Source video clip
            framing_plan: Framing plan

        Returns:
            Reframed video clip
        """
        def make_frame(t):
            # Get the original frame
            frame = video_clip.get_frame(t)

            # Ensure we have valid crop regions
            if not framing_plan.crop_regions:
                # Fallback: return center crop
                h, w = frame.shape[:2]
                crop_h = int(w * self.target_aspect)
                crop_w = w
                if crop_h > h:
                    crop_h = h
                    crop_w = int(h / self.target_aspect)
                y = (h - crop_h) // 2
                x = (w - crop_w) // 2
                cropped = frame[y:y + crop_h, x:x + crop_w]
            else:
                # Calculate exact frame index with decimal precision for interpolation
                exact_frame_idx = (t * framing_plan.fps) / self.frame_skip

                # Get the two adjacent analyzed frames
                idx_floor = int(exact_frame_idx)
                idx_ceil = idx_floor + 1

                # Interpolation factor (0.0 to 1.0)
                alpha = exact_frame_idx - idx_floor

                # Clamp indices to valid range
                idx_floor = max(0, min(idx_floor, len(framing_plan.crop_regions) - 1))
                idx_ceil = max(0, min(idx_ceil, len(framing_plan.crop_regions) - 1))

                # Get crop regions
                crop1 = framing_plan.crop_regions[idx_floor]
                crop2 = framing_plan.crop_regions[idx_ceil]

                # Linear interpolation between crop regions
                x = int(crop1.x * (1 - alpha) + crop2.x * alpha)
                y = int(crop1.y * (1 - alpha) + crop2.y * alpha)
                width = int(crop1.width * (1 - alpha) + crop2.width * alpha)
                height = int(crop1.height * (1 - alpha) + crop2.height * alpha)

                # Ensure crop stays within frame bounds
                h, w = frame.shape[:2]
                x = max(0, min(x, w - width))
                y = max(0, min(y, h - height))
                width = min(width, w - x)
                height = min(height, h - y)

                # Crop the frame
                cropped = frame[y:y + height, x:x + width]

            # Resize to target dimensions
            resized = cv2.resize(
                cropped,
                (self.target_width, self.target_height),
                interpolation=cv2.INTER_LINEAR
            )

            return resized

        # MoviePy 2.x compatible way to create VideoClip
        new_clip = VideoClip(duration=video_clip.duration)
        new_clip.size = (self.target_width, self.target_height)
        new_clip.frame_function = make_frame
        return new_clip

    def _apply_split_screen(
        self,
        video_clip: VideoFileClip,
        framing_plan: FramingPlan
    ) -> VideoClip:
        """
        Apply split screen for two people.

        Args:
            video_clip: Source video clip
            framing_plan: Framing plan

        Returns:
            Split screen video clip
        """
        def make_frame(t):
            frame = video_clip.get_frame(t)
            # Calculate exact frame index with decimal precision for smooth interpolation
            exact_frame_idx = (t * framing_plan.fps) / self.frame_skip
            frame_idx = int(exact_frame_idx)

            # Ensure we have valid contexts
            if not framing_plan.frame_contexts:
                # Fallback to simple center crop
                h, w = frame.shape[:2]
                crop_h = int(w * self.target_aspect)
                crop_w = w
                if crop_h > h:
                    crop_h = h
                    crop_w = int(h / self.target_aspect)
                y = (h - crop_h) // 2
                x = (w - crop_w) // 2
                cropped = frame[y:y + crop_h, x:x + crop_w]
                return cv2.resize(cropped, (self.target_width, self.target_height), interpolation=cv2.INTER_LINEAR)

            # Clamp index to valid range
            frame_idx = max(0, min(frame_idx, len(framing_plan.frame_contexts) - 1))
            context = framing_plan.frame_contexts[frame_idx]

            # Create output frame
            output = np.zeros((self.target_height, self.target_width, 3), dtype=np.uint8)

            if len(context.detected_faces) >= 2:
                # Split vertically 50/50 (two columns)
                half_width = self.target_width // 2

                # Select the 2 most relevant faces
                # Priority: ALWAYS show active speaker first + most confident other person
                if context.active_speakers and len(context.active_speakers) >= 1:
                    # Get the PRIMARY speaker (most confident among active speakers)
                    speaker_faces = [context.detected_faces[i] for i in context.active_speakers
                                     if i < len(context.detected_faces)]

                    primary_speaker = max(speaker_faces, key=lambda f: f.confidence)

                    # Get OTHER faces (not the primary speaker)
                    other_faces = [f for f in context.detected_faces if f != primary_speaker]

                    if len(speaker_faces) >= 2:
                        # Multiple speakers: show primary + second most confident speaker
                        other_speakers = [f for f in speaker_faces if f != primary_speaker]
                        secondary_person = max(other_speakers, key=lambda f: f.confidence)
                    elif other_faces:
                        # One speaker: show speaker + most confident other person
                        secondary_person = max(other_faces, key=lambda f: f.confidence)
                    else:
                        # Fallback: only one person detected
                        secondary_person = primary_speaker

                    selected_faces = [primary_speaker, secondary_person]
                else:
                    # No speakers: take 2 most confident faces
                    selected_faces = sorted(context.detected_faces, key=lambda f: f.confidence, reverse=True)[:2]

                # Sort selected faces by horizontal position for consistent left/right placement
                faces = sorted(selected_faces, key=lambda f: f.center_x)
                left_face = faces[0]
                right_face = faces[1]

                # Process each person's frame
                for idx, face in enumerate([left_face, right_face]):
                    # Calculate crop region focused on this person
                    # Each person gets half the width, full target aspect ratio (9:16)
                    # This ensures NO distortion when resizing

                    # For split screen: each side is half_width x full_height
                    # We need to maintain 9:16 aspect for each half
                    half_width = self.target_width // 2
                    half_aspect = self.target_height / half_width  # Aspect ratio for half

                    # Determine crop size based on face with padding
                    face_width = max(face.width, frame.shape[1] // 4)  # At least 1/4 of frame width
                    crop_width = int(face_width * 2.5)  # Add padding around face
                    crop_height = int(crop_width * half_aspect)  # Maintain correct aspect

                    # Ensure crop fits in frame, maintaining aspect ratio
                    max_crop_width = frame.shape[1] // 2  # Half the source width
                    max_crop_height = frame.shape[0]  # Full source height

                    # If crop is too wide, scale down proportionally
                    if crop_width > max_crop_width:
                        crop_width = max_crop_width
                        crop_height = int(crop_width * half_aspect)

                    # If crop is too tall, scale down proportionally
                    if crop_height > max_crop_height:
                        crop_height = max_crop_height
                        crop_width = int(crop_height / half_aspect)

                    # Center crop on face
                    x = max(0, face.center_x - crop_width // 2)
                    y = max(0, face.center_y - crop_height // 2)

                    # Clamp to frame boundaries
                    x = min(x, frame.shape[1] - crop_width)
                    y = min(y, frame.shape[0] - crop_height)

                    # Extract and resize crop
                    cropped = frame[y:y + crop_height, x:x + crop_width]
                    resized = cv2.resize(
                        cropped,
                        (half_width, self.target_height),
                        interpolation=cv2.INTER_LINEAR
                    )

                    # Place in output at appropriate horizontal position
                    x_offset = idx * half_width
                    output[:, x_offset:x_offset + half_width] = resized
            else:
                # Fall back to single framing
                if framing_plan.crop_regions:
                    crop_idx = max(0, min(frame_idx, len(framing_plan.crop_regions) - 1))
                    crop = framing_plan.crop_regions[crop_idx]
                    cropped = frame[crop.y:crop.y + crop.height, crop.x:crop.x + crop.width]
                else:
                    # Fallback to center crop if no crop regions available
                    h, w = frame.shape[:2]
                    crop_h = int(w * self.target_aspect)
                    crop_w = w
                    if crop_h > h:
                        crop_h = h
                        crop_w = int(h / self.target_aspect)
                    y = (h - crop_h) // 2
                    x = (w - crop_w) // 2
                    cropped = frame[y:y + crop_h, x:x + crop_w]
                output = cv2.resize(
                    cropped,
                    (self.target_width, self.target_height),
                    interpolation=cv2.INTER_LINEAR
                )

            return output

        # MoviePy 2.x compatible way to create VideoClip
        new_clip = VideoClip(duration=video_clip.duration)
        new_clip.size = (self.target_width, self.target_height)
        new_clip.frame_function = make_frame
        return new_clip

    def _apply_grid_layout(
        self,
        video_clip: VideoFileClip,
        framing_plan: FramingPlan
    ) -> VideoClip:
        """
        Apply grid layout for 3+ people.

        Args:
            video_clip: Source video clip
            framing_plan: Framing plan

        Returns:
            Grid layout video clip
        """
        def make_frame(t):
            frame = video_clip.get_frame(t)
            # Calculate exact frame index with decimal precision for smooth interpolation
            exact_frame_idx = (t * framing_plan.fps) / self.frame_skip
            frame_idx = int(exact_frame_idx)

            # Ensure we have valid contexts
            if not framing_plan.frame_contexts:
                # Fallback to simple center crop
                h, w = frame.shape[:2]
                crop_h = int(w * self.target_aspect)
                crop_w = w
                if crop_h > h:
                    crop_h = h
                    crop_w = int(h / self.target_aspect)
                y = (h - crop_h) // 2
                x = (w - crop_w) // 2
                cropped = frame[y:y + crop_h, x:x + crop_w]
                return cv2.resize(cropped, (self.target_width, self.target_height), interpolation=cv2.INTER_LINEAR)

            # Clamp index to valid range
            frame_idx = max(0, min(frame_idx, len(framing_plan.frame_contexts) - 1))
            context = framing_plan.frame_contexts[frame_idx]

            output = np.zeros((self.target_height, self.target_width, 3), dtype=np.uint8)

            num_faces = len(context.detected_faces)

            if num_faces >= 3:
                # Create 2x2 grid
                cell_width = self.target_width // 2
                cell_height = self.target_height // 2

                for idx, face in enumerate(context.detected_faces[:4]):
                    # Calculate grid position
                    row = idx // 2
                    col = idx % 2

                    # Each grid cell keeps the cell aspect ratio (cell_height / cell_width)
                    cell_aspect = cell_height / cell_width

                    # Crop around face with correct aspect ratio
                    crop_width = frame.shape[1] // 2
                    crop_height = int(crop_width * cell_aspect)

                    # Ensure crop fits in frame, maintaining aspect
                    max_crop_width = frame.shape[1] // 2
                    max_crop_height = frame.shape[0] // 2

                    if crop_width > max_crop_width:
                        crop_width = max_crop_width
                        crop_height = int(crop_width * cell_aspect)

                    if crop_height > max_crop_height:
                        crop_height = max_crop_height
                        crop_width = int(crop_height / cell_aspect)

                    # Center crop on face
                    x = max(0, face.center_x - crop_width // 2)
                    y = max(0, face.center_y - crop_height // 2)

                    # Clamp to frame boundaries
                    x = min(x, frame.shape[1] - crop_width)
                    y = min(y, frame.shape[0] - crop_height)

                    cropped = frame[y:y + crop_height, x:x + crop_width]
                    resized = cv2.resize(
                        cropped,
                        (cell_width, cell_height),
                        interpolation=cv2.INTER_LINEAR
                    )

                    # Place in grid
                    y_offset = row * cell_height
                    x_offset = col * cell_width
                    output[y_offset:y_offset + cell_height, x_offset:x_offset + cell_width] = resized
            else:
                # Fall back to single framing
                if framing_plan.crop_regions:
                    crop_idx = max(0, min(frame_idx, len(framing_plan.crop_regions) - 1))
                    crop = framing_plan.crop_regions[crop_idx]
                    cropped = frame[crop.y:crop.y + crop.height, crop.x:crop.x + crop.width]
                else:
                    # Fallback to center crop if no crop regions available
                    h, w = frame.shape[:2]
                    crop_h = int(w * self.target_aspect)
                    crop_w = w
                    if crop_h > h:
                        crop_h = h
                        crop_w = int(h / self.target_aspect)
                    y = (h - crop_h) // 2
                    x = (w - crop_w) // 2
                    cropped = frame[y:y + crop_h, x:x + crop_w]
                output = cv2.resize(
                    cropped,
                    (self.target_width, self.target_height),
                    interpolation=cv2.INTER_LINEAR
                )

            return output

        # MoviePy 2.x compatible way to create VideoClip
        new_clip = VideoClip(duration=video_clip.duration)
        new_clip.size = (self.target_width, self.target_height)
        new_clip.frame_function = make_frame
        return new_clip


def extract_audio_samples(video_path: str, start_time: float, end_time: float) -> Optional[np.ndarray]:
    """
    Extract audio samples from video for speech detection.

    Args:
        video_path: Path to video file
        start_time: Start time in seconds
        end_time: End time in seconds

    Returns:
        Audio samples array or None if no audio
    """
    try:
        from moviepy.audio.io.AudioFileClip import AudioFileClip

        with AudioFileClip(video_path) as audio:
            segment = audio.subclipped(start_time, end_time)
            fps = getattr(segment, 'fps', 44100)
            samples = segment.to_soundarray(fps=fps)
            return samples
    except Exception as exc:
        logger.warning(f"Failed to extract audio: {exc}")
        return None
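
Below is a minimal usage sketch (not part of this commit) of how the new module is meant to be wired together; the input path, clip boundaries, output filename, and writer settings are illustrative assumptions.

```python
# Hypothetical usage sketch: reframe a 20-60 s window of a horizontal video
# into a 9:16 vertical clip with SmartFramer (paths and times are assumed).
from moviepy.video.io.VideoFileClip import VideoFileClip

from video_render.smart_framing import SmartFramer, extract_audio_samples

start, end = 20.0, 60.0  # assumed clip boundaries in seconds

framer = SmartFramer(target_width=1080, target_height=1920, frame_skip=2)

# Optional audio samples let the plan prioritize active speakers
audio = extract_audio_samples("input.mp4", start, end)
plan = framer.create_framing_plan("input.mp4", start, end, audio_samples=audio)

with VideoFileClip("input.mp4") as clip:
    segment = clip.subclipped(start, end)  # MoviePy 2.x subclip API
    vertical = framer.apply_framing(segment, plan, use_split_screen=True)
    vertical.write_videofile("clip_vertical.mp4", fps=plan.fps, codec="libx264")
```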