Makes several adjustments to improve tracking and video rendering

This commit is contained in:
LeoMortari
2025-12-18 02:26:25 -03:00
parent 78e35d65fd
commit 07d301f110
11 changed files with 984 additions and 316 deletions


@@ -46,21 +46,20 @@ class SmartFramer:
self,
target_width: int = 1080,
target_height: int = 1920,
frame_skip: int = 2,
smoothing_window: int = 15
frame_skip: int = 1,
smoothing_window: int = 30,
max_velocity: int = 20,
person_switch_cooldown: int = 999999
):
self.target_width = target_width
self.target_height = target_height
self.target_aspect = target_height / target_width
# Performance parameters
self.frame_skip = frame_skip # Process every Nth frame (CPU optimization)
# Smoothing parameters
self.frame_skip = frame_skip
self.smoothing_window = smoothing_window
self.max_velocity = 30 # pixels per frame (reduced for smoother transitions)
self.max_velocity = max_velocity
self.person_switch_cooldown = person_switch_cooldown
logger.info(f"Smart framer initialized (target: {target_width}x{target_height}, frame_skip={frame_skip})")
logger.info(f"Smart framer initialized (target: {target_width}x{target_height}, frame_skip={frame_skip}, smoothing={smoothing_window}, velocity={max_velocity}, cooldown={person_switch_cooldown})")
def create_framing_plan(
self,
@@ -81,25 +80,21 @@ class SmartFramer:
Returns:
FramingPlan with all frame contexts and crop regions
"""
analyzer = ContextAnalyzer()
analyzer = ContextAnalyzer(person_switch_cooldown=self.person_switch_cooldown)
# Detect speaking periods from audio if available
speaking_periods = None
if audio_samples is not None:
speaking_periods = analyzer.audio_detector.detect_speaking_periods(audio_samples)
# Open video with error suppression for AV1 codec warnings
import os
os.environ['OPENCV_FFMPEG_CAPTURE_OPTIONS'] = 'loglevel;quiet'
cap = cv2.VideoCapture(video_path)
fps = cap.get(cv2.CAP_PROP_FPS)
# Calculate frame range
start_frame = int(start_time * fps)
end_frame = int(end_time * fps)
# Set to start frame
cap.set(cv2.CAP_PROP_POS_FRAMES, start_frame)
frame_contexts = []
@@ -113,7 +108,6 @@ class SmartFramer:
if not ret:
break
# Only process every Nth frame for performance (CPU optimization)
if processed_count % self.frame_skip == 0:
timestamp = frame_number / fps
context = analyzer.analyze_frame(frame, timestamp, frame_number, speaking_periods)
@@ -122,35 +116,36 @@ class SmartFramer:
frame_number += 1
processed_count += 1
# Get video dimensions before releasing capture
source_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
source_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
cap.release()
analyzer.close()
# Determine overall layout mode (most common)
layout_modes = [ctx.layout_mode for ctx in frame_contexts]
if layout_modes:
overall_layout = max(set(layout_modes), key=layout_modes.count)
else:
overall_layout = "single"
# Calculate crop regions based on contexts
crop_regions = self._calculate_crop_regions(
frame_contexts,
source_width,
source_height
)
return FramingPlan(
framing_plan = FramingPlan(
frame_contexts=frame_contexts,
crop_regions=crop_regions,
layout_mode=overall_layout,
fps=fps
)
import gc
gc.collect()
return framing_plan
def _calculate_crop_regions(
self,
contexts: List[FrameContext],
@@ -171,66 +166,122 @@ class SmartFramer:
if not contexts:
return []
# Calculate ideal crop dimensions maintaining EXACT 9:16 aspect ratio
source_aspect = source_width / source_height
if source_aspect > self.target_aspect:
# Source is wider - crop horizontally (use full height)
crop_height = source_height
crop_width = int(crop_height / self.target_aspect)
# Ensure crop width fits within source
if crop_width > source_width:
crop_width = source_width
crop_height = int(crop_width * self.target_aspect)
else:
# Source is taller - crop vertically (use full width)
crop_width = source_width
crop_height = int(crop_width * self.target_aspect)
# Ensure crop height fits within source
if crop_height > source_height:
crop_height = source_height
crop_width = int(crop_height / self.target_aspect)
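# --- Editor's worked example (illustrative numbers, not in the commit) ---
# For a 1920x1080 source and a 1080x1920 target:
#   target_aspect = 1920 / 1080 ≈ 1.7778
#   source_aspect = 1920 / 1080 ≈ 1.7778  -> not greater, so the full-width branch runs
#   crop_width  = 1920
#   crop_height = int(1920 * 1.7778) = 3413  -> exceeds the 1080 px source height, clamp
#   crop_height = 1080
#   crop_width  = int(1080 / 1.7778) = 607   -> final crop window is 607x1080, i.e. 9:16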
# Calculate center points for each frame
# Since we now always focus on ONE person directly (not averaging),
# we can use the focus point directly without complex validation
center_xs = []
center_ys = []
safe_zone_margin_x = crop_width * 0.40
safe_zone_margin_y = crop_height * 0.40
for ctx in contexts:
if ctx.primary_focus:
# Primary focus is now always a single person's center, never averaged
# This means it will never be on the table/empty space
center_xs.append(ctx.primary_focus[0])
center_ys.append(ctx.primary_focus[1])
dead_zone_threshold = 100
if contexts and contexts[0].primary_focus:
current_crop_center_x = contexts[0].primary_focus[0]
current_crop_center_y = contexts[0].primary_focus[1]
else:
current_crop_center_x = source_width // 2
current_crop_center_y = source_height // 2
center_xs = [current_crop_center_x]
center_ys = [current_crop_center_y]
for ctx in contexts[1:]:
if ctx.primary_focus and ctx.selected_people and len(ctx.detected_faces) > 0:
primary_person_idx = ctx.selected_people[0] if ctx.selected_people else 0
if primary_person_idx < len(ctx.detected_faces):
face = ctx.detected_faces[primary_person_idx]
face_left = face.x
face_right = face.x + face.width
face_top = face.y
face_bottom = face.y + face.height
crop_left = current_crop_center_x - crop_width // 2
crop_right = current_crop_center_x + crop_width // 2
crop_top = current_crop_center_y - crop_height // 2
crop_bottom = current_crop_center_y + crop_height // 2
face_rel_left = face_left - crop_left
face_rel_right = face_right - crop_left
face_rel_top = face_top - crop_top
face_rel_bottom = face_bottom - crop_top
face_left_safe = face_rel_left >= safe_zone_margin_x
face_right_safe = face_rel_right <= (crop_width - safe_zone_margin_x)
face_top_safe = face_rel_top >= safe_zone_margin_y
face_bottom_safe = face_rel_bottom <= (crop_height - safe_zone_margin_y)
face_fully_visible = face_left_safe and face_right_safe and face_top_safe and face_bottom_safe
if face_fully_visible:
center_xs.append(current_crop_center_x)
center_ys.append(current_crop_center_y)
else:
shift_x = 0
shift_y = 0
if not face_left_safe:
shift_x = face_rel_left - safe_zone_margin_x
elif not face_right_safe:
shift_x = face_rel_right - (crop_width - safe_zone_margin_x)
if not face_top_safe:
shift_y = face_rel_top - safe_zone_margin_y
elif not face_bottom_safe:
shift_y = face_rel_bottom - (crop_height - safe_zone_margin_y)
if abs(shift_x) > dead_zone_threshold:
current_crop_center_x += shift_x
if abs(shift_y) > dead_zone_threshold:
current_crop_center_y += shift_y
center_xs.append(current_crop_center_x)
center_ys.append(current_crop_center_y)
else:
center_xs.append(current_crop_center_x)
center_ys.append(current_crop_center_y)
else:
# Default to center only if no faces detected at all
center_xs.append(source_width // 2)
center_ys.append(source_height // 2)
center_xs.append(current_crop_center_x)
center_ys.append(current_crop_center_y)
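# --- Editor's sketch (hypothetical helper, not part of this commit) ---
# Condenses the per-axis rule applied by the loop above: keep the crop still while the
# primary face stays inside the 40% safe zone, and only re-center when the required
# shift exceeds the dead-zone threshold (100 px in the code above).
def follow_axis(face_lo: float, face_hi: float, crop_center: float,
                crop_size: int, margin: float, dead_zone: float = 100) -> float:
    crop_lo = crop_center - crop_size // 2
    rel_lo = face_lo - crop_lo                  # face edges relative to the crop window
    rel_hi = face_hi - crop_lo
    shift = 0.0
    if rel_lo < margin:                         # face drifting past the leading margin
        shift = rel_lo - margin                 # negative: pull the crop back
    elif rel_hi > crop_size - margin:           # face drifting past the trailing margin
        shift = rel_hi - (crop_size - margin)   # positive: push the crop forward
    return crop_center + shift if abs(shift) > dead_zone else crop_center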
# Smooth the center points
if len(center_xs) > self.smoothing_window:
kernel_size = min(self.smoothing_window, len(center_xs))
if kernel_size % 2 == 0:
kernel_size -= 1
if len(center_xs) > 1:
alpha = 0.002
smoothed_xs = [center_xs[0]]
smoothed_ys = [center_ys[0]]
for i in range(1, len(center_xs)):
if center_xs[i] != center_xs[i-1] or center_ys[i] != center_ys[i-1]:
smoothed_xs.append(alpha * center_xs[i] + (1 - alpha) * smoothed_xs[i-1])
smoothed_ys.append(alpha * center_ys[i] + (1 - alpha) * smoothed_ys[i-1])
else:
smoothed_xs.append(smoothed_xs[i-1])
smoothed_ys.append(smoothed_ys[i-1])
center_xs = smoothed_xs
center_ys = smoothed_ys
center_xs = signal.medfilt(center_xs, kernel_size=kernel_size).tolist()
center_ys = signal.medfilt(center_ys, kernel_size=kernel_size).tolist()
center_xs = self._limit_velocity(center_xs, 2)
center_ys = self._limit_velocity(center_ys, 2)
# Limit velocity (prevent jarring movements)
center_xs = self._limit_velocity(center_xs, self.max_velocity)
center_ys = self._limit_velocity(center_ys, self.max_velocity)
center_xs = self._apply_dead_zone(center_xs, 5)
center_ys = self._apply_dead_zone(center_ys, 5)
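# --- Editor's note (illustrative arithmetic, not in the commit) ---
# With alpha = 0.002 the exponential smoothing above is extremely heavy: for a sudden
# 500 px jump of the target center, the smoothed value covers only
#   500 * (1 - (1 - 0.002)**30) ≈ 29 px after 30 frames (about 1 s at 30 fps),
# so the virtual camera drifts slowly toward a new position instead of cutting.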
# Convert to crop regions
crop_regions = []
for center_x, center_y in zip(center_xs, center_ys):
# Calculate top-left corner
x = int(center_x - crop_width // 2)
y = int(center_y - crop_height // 2)
# Clamp to valid bounds
x = max(0, min(x, source_width - crop_width))
y = max(0, min(y, source_height - crop_height))
@@ -241,8 +292,37 @@ class SmartFramer:
height=crop_height
))
center_xs.clear()
center_ys.clear()
return crop_regions
def _apply_dead_zone(self, positions: List[float], threshold: float) -> List[float]:
"""
Apply dead zone to eliminate micro-movements.
If change is smaller than threshold, keep previous position.
Args:
positions: List of positions
threshold: Minimum change needed to move (pixels)
Returns:
Positions with dead zone applied
"""
if len(positions) <= 1:
return positions
filtered = [positions[0]]
for i in range(1, len(positions)):
delta = abs(positions[i] - filtered[i - 1])
if delta < threshold:
filtered.append(filtered[i - 1])
else:
filtered.append(positions[i])
return filtered
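# --- Editor's example (illustrative) ---
#   _apply_dead_zone([100, 102, 104, 120], threshold=5) -> [100, 100, 100, 120]
# The 2-4 px jitter is frozen at the first value, while the 20 px move passes through,
# because each delta is measured against the last *filtered* position.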
def _limit_velocity(self, positions: List[float], max_velocity: float) -> List[float]:
"""
Limit the velocity of position changes.
@@ -271,33 +351,20 @@ class SmartFramer:
def apply_framing(
self,
video_clip: VideoFileClip,
framing_plan: FramingPlan,
use_split_screen: bool = False
framing_plan: FramingPlan
) -> VideoClip:
"""
Apply smart framing to a video clip.
Always uses single-person focus (no split screen).
Args:
video_clip: Source video clip
framing_plan: Framing plan to apply
use_split_screen: Whether to use split screen for multiple people
Returns:
Reframed video clip
"""
# Handle different layout modes
if framing_plan.layout_mode in ["single", "single_speaker"]:
# Single person or single speaker - use focused single framing
return self._apply_single_framing(video_clip, framing_plan)
elif framing_plan.layout_mode == "dual_split" and use_split_screen:
# Two people in conversation - use split screen
return self._apply_split_screen(video_clip, framing_plan)
elif framing_plan.layout_mode == "grid" and use_split_screen:
# 3+ people - use grid layout
return self._apply_grid_layout(video_clip, framing_plan)
else:
# Fallback to single framing
return self._apply_single_framing(video_clip, framing_plan)
return self._apply_single_framing(video_clip, framing_plan)
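# --- Editor's usage sketch (hypothetical variable names, not part of this commit) ---
# With the split-screen branches removed, apply_framing always delegates to
# _apply_single_framing, so callers no longer pass use_split_screen.
vertical_clip = framer.apply_framing(source_clip, framing_plan)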
def _apply_single_framing(
self,
@@ -315,12 +382,9 @@ class SmartFramer:
Reframed video clip
"""
def make_frame(t):
# Get the original frame
frame = video_clip.get_frame(t)
# Ensure we have valid crop regions
if not framing_plan.crop_regions:
# Fallback: return center crop
h, w = frame.shape[:2]
crop_h = int(w * self.target_aspect)
crop_w = w
@@ -331,41 +395,32 @@ class SmartFramer:
x = (w - crop_w) // 2
cropped = frame[y:y + crop_h, x:x + crop_w]
else:
# Calculate exact frame index with decimal precision for interpolation
exact_frame_idx = (t * framing_plan.fps) / self.frame_skip
# Get the two adjacent analyzed frames
idx_floor = int(exact_frame_idx)
idx_ceil = idx_floor + 1
# Interpolation factor (0.0 to 1.0)
alpha = exact_frame_idx - idx_floor
# Clamp indices to valid range
idx_floor = max(0, min(idx_floor, len(framing_plan.crop_regions) - 1))
idx_ceil = max(0, min(idx_ceil, len(framing_plan.crop_regions) - 1))
# Get crop regions
crop1 = framing_plan.crop_regions[idx_floor]
crop2 = framing_plan.crop_regions[idx_ceil]
# Linear interpolation between crop regions
x = int(crop1.x * (1 - alpha) + crop2.x * alpha)
y = int(crop1.y * (1 - alpha) + crop2.y * alpha)
width = int(crop1.width * (1 - alpha) + crop2.width * alpha)
height = int(crop1.height * (1 - alpha) + crop2.height * alpha)
# Ensure crop stays within frame bounds
h, w = frame.shape[:2]
x = max(0, min(x, w - width))
y = max(0, min(y, h - height))
width = min(width, w - x)
height = min(height, h - y)
# Crop the frame
cropped = frame[y:y + height, x:x + width]
# Resize to target dimensions
resized = cv2.resize(
cropped,
(self.target_width, self.target_height),
@@ -374,7 +429,6 @@ class SmartFramer:
return resized
# MoviePy 2.x compatible way to create VideoClip
new_clip = VideoClip(duration=video_clip.duration)
new_clip.size = (self.target_width, self.target_height)
new_clip.frame_function = make_frame
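# --- Editor's worked example (illustrative numbers, not in the commit) ---
# With fps = 30 and frame_skip = 2, a request at t = 0.25 s gives
#   exact_frame_idx = (0.25 * 30) / 2 = 3.75,
# so the crop is blended from analyzed frames 3 and 4 with alpha = 0.75:
#   x = int(crop_regions[3].x * 0.25 + crop_regions[4].x * 0.75)
# keeping the crop moving smoothly between sparsely analyzed frames.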
@@ -397,13 +451,10 @@ class SmartFramer:
"""
def make_frame(t):
frame = video_clip.get_frame(t)
# Calculate exact frame index with decimal precision for smooth interpolation
exact_frame_idx = (t * framing_plan.fps) / self.frame_skip
frame_idx = int(exact_frame_idx)
# Ensure we have valid contexts
if not framing_plan.frame_contexts:
# Fallback to simple center crop
h, w = frame.shape[:2]
crop_h = int(w * self.target_aspect)
crop_w = w
@@ -415,107 +466,81 @@ class SmartFramer:
cropped = frame[y:y + crop_h, x:x + crop_w]
return cv2.resize(cropped, (self.target_width, self.target_height), interpolation=cv2.INTER_LINEAR)
# Clamp index to valid range
frame_idx = max(0, min(frame_idx, len(framing_plan.frame_contexts) - 1))
context = framing_plan.frame_contexts[frame_idx]
# Create output frame
output = np.zeros((self.target_height, self.target_width, 3), dtype=np.uint8)
if len(context.detected_faces) >= 2:
# Split vertically 50/50 (two columns)
half_width = self.target_width // 2
if context.selected_people and len(context.selected_people) >= 2:
selected_faces = [context.detected_faces[i] for i in context.selected_people[:2]
if i < len(context.detected_faces)]
# Select the 2 most relevant faces
# Priority: ALWAYS show active speaker first + most confident other person
if context.active_speakers and len(context.active_speakers) >= 1:
# Get the PRIMARY speaker (most confident among active speakers)
speaker_faces = [context.detected_faces[i] for i in context.active_speakers
if i < len(context.detected_faces)]
if len(selected_faces) >= 2:
faces = sorted(selected_faces, key=lambda f: f.center_x)
left_face = faces[0]
right_face = faces[1]
primary_speaker = max(speaker_faces, key=lambda f: f.confidence)
for idx, face in enumerate([left_face, right_face]):
# Get OTHER faces (not the primary speaker)
other_faces = [f for f in context.detected_faces if f != primary_speaker]
half_width = self.target_width // 2
half_aspect = self.target_height / half_width # Aspect ratio for half
if len(speaker_faces) >= 2:
# Multiple speakers: show primary + second most confident speaker
other_speakers = [f for f in speaker_faces if f != primary_speaker]
secondary_person = max(other_speakers, key=lambda f: f.confidence)
elif other_faces:
# One speaker: show speaker + most confident other person
secondary_person = max(other_faces, key=lambda f: f.confidence)
else:
# Fallback: only one person detected
secondary_person = primary_speaker
face_width = max(face.width, frame.shape[1] // 4) # At least 1/4 of frame width
crop_width = int(face_width * 2.5) # Add padding around face
crop_height = int(crop_width * half_aspect) # Maintain correct aspect
selected_faces = [primary_speaker, secondary_person]
max_crop_width = frame.shape[1] // 2 # Half the source width
max_crop_height = frame.shape[0] # Full source height
if crop_width > max_crop_width:
crop_width = max_crop_width
crop_height = int(crop_width * half_aspect)
if crop_height > max_crop_height:
crop_height = max_crop_height
crop_width = int(crop_height / half_aspect)
x = max(0, face.center_x - crop_width // 2)
y = max(0, face.center_y - crop_height // 2)
x = min(x, frame.shape[1] - crop_width)
y = min(y, frame.shape[0] - crop_height)
cropped = frame[y:y + crop_height, x:x + crop_width]
resized = cv2.resize(
cropped,
(half_width, self.target_height),
interpolation=cv2.INTER_LINEAR
)
x_offset = idx * half_width
output[:, x_offset:x_offset + half_width] = resized
else:
# No speakers: take 2 most confident faces
selected_faces = sorted(context.detected_faces, key=lambda f: f.confidence, reverse=True)[:2]
# Sort selected faces by horizontal position for consistent left/right placement
faces = sorted(selected_faces, key=lambda f: f.center_x)
left_face = faces[0]
right_face = faces[1]
# Process each person's frame
for idx, face in enumerate([left_face, right_face]):
# Calculate crop region focused on this person
# Each person gets half the width, full target aspect ratio (9:16)
# This ensures NO distortion when resizing
# For split screen: each side is half_width x full_height
# We need to maintain 9:16 aspect for each half
half_width = self.target_width // 2
half_aspect = self.target_height / half_width # Aspect ratio for half
# Determine crop size based on face with padding
face_width = max(face.width, frame.shape[1] // 4) # At least 1/4 of frame width
crop_width = int(face_width * 2.5) # Add padding around face
crop_height = int(crop_width * half_aspect) # Maintain correct aspect
# Ensure crop fits in frame, maintaining aspect ratio
max_crop_width = frame.shape[1] // 2 # Half the source width
max_crop_height = frame.shape[0] # Full source height
# If crop is too wide, scale down proportionally
if crop_width > max_crop_width:
crop_width = max_crop_width
crop_height = int(crop_width * half_aspect)
# If crop is too tall, scale down proportionally
if crop_height > max_crop_height:
crop_height = max_crop_height
crop_width = int(crop_height / half_aspect)
# Center crop on face
x = max(0, face.center_x - crop_width // 2)
y = max(0, face.center_y - crop_height // 2)
# Clamp to frame boundaries
x = min(x, frame.shape[1] - crop_width)
y = min(y, frame.shape[0] - crop_height)
# Extract and resize crop
cropped = frame[y:y + crop_height, x:x + crop_width]
resized = cv2.resize(
if framing_plan.crop_regions:
crop_idx = max(0, min(frame_idx, len(framing_plan.crop_regions) - 1))
crop = framing_plan.crop_regions[crop_idx]
cropped = frame[crop.y:crop.y + crop.height, crop.x:crop.x + crop.width]
else:
h, w = frame.shape[:2]
crop_h = int(w * self.target_aspect)
crop_w = w
if crop_h > h:
crop_h = h
crop_w = int(h / self.target_aspect)
y = (h - crop_h) // 2
x = (w - crop_w) // 2
cropped = frame[y:y + crop_h, x:x + crop_w]
output = cv2.resize(
cropped,
(half_width, self.target_height),
(self.target_width, self.target_height),
interpolation=cv2.INTER_LINEAR
)
# Place in output at appropriate horizontal position
x_offset = idx * half_width
output[:, x_offset:x_offset + half_width] = resized
else:
# Fall back to single framing
if framing_plan.crop_regions:
crop_idx = max(0, min(frame_idx, len(framing_plan.crop_regions) - 1))
crop = framing_plan.crop_regions[crop_idx]
cropped = frame[crop.y:crop.y + crop.height, crop.x:crop.x + crop.width]
else:
# Fallback to center crop if no crop regions available
h, w = frame.shape[:2]
crop_h = int(w * self.target_aspect)
crop_w = w
@@ -533,7 +558,6 @@ class SmartFramer:
return output
# MoviePy 2.x compatible way to create VideoClip
new_clip = VideoClip(duration=video_clip.duration)
new_clip.size = (self.target_width, self.target_height)
new_clip.frame_function = make_frame
@@ -556,13 +580,10 @@ class SmartFramer:
"""
def make_frame(t):
frame = video_clip.get_frame(t)
# Calculate exact frame index with decimal precision for smooth interpolation
exact_frame_idx = (t * framing_plan.fps) / self.frame_skip
frame_idx = int(exact_frame_idx)
# Ensure we have valid contexts
if not framing_plan.frame_contexts:
# Fallback to simple center crop
h, w = frame.shape[:2]
crop_h = int(w * self.target_aspect)
crop_w = w
@@ -574,7 +595,6 @@ class SmartFramer:
cropped = frame[y:y + crop_h, x:x + crop_w]
return cv2.resize(cropped, (self.target_width, self.target_height), interpolation=cv2.INTER_LINEAR)
# Clamp index to valid range
frame_idx = max(0, min(frame_idx, len(framing_plan.frame_contexts) - 1))
context = framing_plan.frame_contexts[frame_idx]
@@ -583,23 +603,18 @@ class SmartFramer:
num_faces = len(context.detected_faces)
if num_faces >= 3:
# Create 2x2 grid
cell_width = self.target_width // 2
cell_height = self.target_height // 2
for idx, face in enumerate(context.detected_faces[:4]):
# Calculate grid position
row = idx // 2
col = idx % 2
# Each grid cell maintains aspect ratio (square in this case: cell_width = cell_height)
cell_aspect = cell_height / cell_width
# Crop around face with correct aspect ratio
crop_width = frame.shape[1] // 2
crop_height = int(crop_width * cell_aspect)
# Ensure crop fits in frame, maintaining aspect
max_crop_width = frame.shape[1] // 2
max_crop_height = frame.shape[0] // 2
@@ -611,11 +626,9 @@ class SmartFramer:
crop_height = max_crop_height
crop_width = int(crop_height / cell_aspect)
# Center crop on face
x = max(0, face.center_x - crop_width // 2)
y = max(0, face.center_y - crop_height // 2)
# Clamp to frame boundaries
x = min(x, frame.shape[1] - crop_width)
y = min(y, frame.shape[0] - crop_height)
@@ -626,18 +639,15 @@ class SmartFramer:
interpolation=cv2.INTER_LINEAR
)
# Place in grid
y_offset = row * cell_height
x_offset = col * cell_width
output[y_offset:y_offset + cell_height, x_offset:x_offset + cell_width] = resized
else:
# Fall back to single framing
if framing_plan.crop_regions:
crop_idx = max(0, min(frame_idx, len(framing_plan.crop_regions) - 1))
crop = framing_plan.crop_regions[crop_idx]
cropped = frame[crop.y:crop.y + crop.height, crop.x:crop.x + crop.width]
else:
# Fallback to center crop if no crop regions available
h, w = frame.shape[:2]
crop_h = int(w * self.target_aspect)
crop_w = w
@@ -655,7 +665,6 @@ class SmartFramer:
return output
# MoviePy 2.x compatible way to create VideoClip
new_clip = VideoClip(duration=video_clip.duration)
new_clip.size = (self.target_width, self.target_height)
new_clip.frame_function = make_frame