video-render/video_render/smart_framing.py
LeoMortari c5d3e83a5f #v2 - Start v2 tests
- Adds object tracking
- Facial detection
- Interactive captions
- More precise cuts
- Prompt refinement
2025-11-12 11:38:09 -03:00


"""
Smart framing module for intelligent video cropping and composition.
This module provides functionality to create 9:16 vertical videos with
intelligent framing that follows the action and speakers.
"""
from __future__ import annotations
import logging
from dataclasses import dataclass
from typing import List, Optional, Tuple
import cv2
import numpy as np
from moviepy.video.VideoClip import VideoClip
from moviepy.video.io.VideoFileClip import VideoFileClip
from scipy import signal
from video_render.context_detection import ContextAnalyzer, FrameContext, FaceDetection
logger = logging.getLogger(__name__)


@dataclass
class CropRegion:
    """Defines a crop region for a frame."""
    x: int
    y: int
    width: int
    height: int


@dataclass
class FramingPlan:
    """Complete framing plan for a video segment."""
    frame_contexts: List[FrameContext]
    crop_regions: List[CropRegion]
    layout_mode: str
    fps: float
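
# Note (illustrative): for a 1920x1080 source and the default 1080x1920 target, a plan
# typically holds entries like CropRegion(x=657, y=0, width=607, height=1080) - a window
# that is already ~9:16 (up to integer rounding) and only slides across the source.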


class SmartFramer:
    """Creates intelligent 9:16 framing for horizontal videos."""

    def __init__(
        self,
        target_width: int = 1080,
        target_height: int = 1920,
        frame_skip: int = 2,
        smoothing_window: int = 15
    ):
        self.target_width = target_width
        self.target_height = target_height
        self.target_aspect = target_height / target_width
        # Performance parameters
        self.frame_skip = frame_skip  # Process every Nth frame (CPU optimization)
        # Smoothing parameters
        self.smoothing_window = smoothing_window
        self.max_velocity = 30  # pixels per frame (reduced for smoother transitions)
        logger.info(f"Smart framer initialized (target: {target_width}x{target_height}, frame_skip={frame_skip})")

    def create_framing_plan(
        self,
        video_path: str,
        start_time: float,
        end_time: float,
        audio_samples: Optional[np.ndarray] = None
    ) -> FramingPlan:
        """
        Analyze video and create a complete framing plan.

        Args:
            video_path: Path to video file
            start_time: Start time in seconds
            end_time: End time in seconds
            audio_samples: Optional audio samples for speech detection

        Returns:
            FramingPlan with all frame contexts and crop regions
        """
        analyzer = ContextAnalyzer()
        # Detect speaking periods from audio if available
        speaking_periods = None
        if audio_samples is not None:
            speaking_periods = analyzer.audio_detector.detect_speaking_periods(audio_samples)
        # Open video with error suppression for AV1 codec warnings
        import os
        os.environ['OPENCV_FFMPEG_CAPTURE_OPTIONS'] = 'loglevel;quiet'
        cap = cv2.VideoCapture(video_path)
        fps = cap.get(cv2.CAP_PROP_FPS)
        # Calculate frame range
        start_frame = int(start_time * fps)
        end_frame = int(end_time * fps)
        # Set to start frame
        cap.set(cv2.CAP_PROP_POS_FRAMES, start_frame)
        frame_contexts = []
        frame_number = start_frame
        processed_count = 0
        logger.info(f"Analyzing frames {start_frame} to {end_frame} (fps={fps}, skip={self.frame_skip})")
        while frame_number < end_frame:
            ret, frame = cap.read()
            if not ret:
                break
            # Only process every Nth frame for performance (CPU optimization)
            if processed_count % self.frame_skip == 0:
                timestamp = frame_number / fps
                context = analyzer.analyze_frame(frame, timestamp, frame_number, speaking_periods)
                frame_contexts.append(context)
            frame_number += 1
            processed_count += 1
        # Get video dimensions before releasing capture
        source_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        source_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        cap.release()
        analyzer.close()
        # Determine overall layout mode (most common)
        layout_modes = [ctx.layout_mode for ctx in frame_contexts]
        if layout_modes:
            overall_layout = max(set(layout_modes), key=layout_modes.count)
        else:
            overall_layout = "single"
        # Calculate crop regions based on contexts
        crop_regions = self._calculate_crop_regions(
            frame_contexts,
            source_width,
            source_height
        )
        return FramingPlan(
            frame_contexts=frame_contexts,
            crop_regions=crop_regions,
            layout_mode=overall_layout,
            fps=fps
        )
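
    # Note (illustrative): for a 10 s segment at 30 fps with frame_skip=2,
    # create_framing_plan analyzes roughly 150 frames, so the resulting plan holds
    # about 150 frame contexts and 150 matching crop regions.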

    def _calculate_crop_regions(
        self,
        contexts: List[FrameContext],
        source_width: int,
        source_height: int
    ) -> List[CropRegion]:
        """
        Calculate smooth crop regions for each frame.

        Args:
            contexts: List of frame contexts
            source_width: Source video width
            source_height: Source video height

        Returns:
            List of crop regions
        """
        if not contexts:
            return []
        # Calculate ideal crop dimensions maintaining EXACT 9:16 aspect ratio.
        # Compare width/height ratios consistently (target_aspect is height/width,
        # so the target's width/height ratio is its inverse).
        source_aspect = source_width / source_height
        if source_aspect > (self.target_width / self.target_height):
            # Source is wider than 9:16 - crop horizontally (use full height)
            crop_height = source_height
            crop_width = int(crop_height / self.target_aspect)
            # Ensure crop width fits within source
            if crop_width > source_width:
                crop_width = source_width
                crop_height = int(crop_width * self.target_aspect)
        else:
            # Source is narrower than 9:16 - crop vertically (use full width)
            crop_width = source_width
            crop_height = int(crop_width * self.target_aspect)
            # Ensure crop height fits within source
            if crop_height > source_height:
                crop_height = source_height
                crop_width = int(crop_height / self.target_aspect)
        # Calculate center points for each frame
        # Since we now always focus on ONE person directly (not averaging),
        # we can use the focus point directly without complex validation
        center_xs = []
        center_ys = []
        for ctx in contexts:
            if ctx.primary_focus:
                # Primary focus is now always a single person's center, never averaged
                # This means it will never be on the table/empty space
                center_xs.append(ctx.primary_focus[0])
                center_ys.append(ctx.primary_focus[1])
            else:
                # Default to center only if no faces detected at all
                center_xs.append(source_width // 2)
                center_ys.append(source_height // 2)
        # Smooth the center points
        if len(center_xs) > self.smoothing_window:
            kernel_size = min(self.smoothing_window, len(center_xs))
            if kernel_size % 2 == 0:
                kernel_size -= 1
            center_xs = signal.medfilt(center_xs, kernel_size=kernel_size).tolist()
            center_ys = signal.medfilt(center_ys, kernel_size=kernel_size).tolist()
        # Limit velocity (prevent jarring movements)
        center_xs = self._limit_velocity(center_xs, self.max_velocity)
        center_ys = self._limit_velocity(center_ys, self.max_velocity)
        # Convert to crop regions
        crop_regions = []
        for center_x, center_y in zip(center_xs, center_ys):
            # Calculate top-left corner
            x = int(center_x - crop_width // 2)
            y = int(center_y - crop_height // 2)
            # Clamp to valid bounds
            x = max(0, min(x, source_width - crop_width))
            y = max(0, min(y, source_height - crop_height))
            crop_regions.append(CropRegion(
                x=x,
                y=y,
                width=crop_width,
                height=crop_height
            ))
        return crop_regions

    def _limit_velocity(self, positions: List[float], max_velocity: float) -> List[float]:
        """
        Limit the velocity of position changes.

        Args:
            positions: List of positions
            max_velocity: Maximum allowed change per frame

        Returns:
            Smoothed positions
        """
        if len(positions) <= 1:
            return positions
        limited = [positions[0]]
        for i in range(1, len(positions)):
            delta = positions[i] - limited[i - 1]
            if abs(delta) > max_velocity:
                delta = max_velocity if delta > 0 else -max_velocity
            limited.append(limited[i - 1] + delta)
        return limited
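
    # Note (illustrative): with max_velocity=30, _limit_velocity([100, 100, 200, 200], 30)
    # returns [100, 100, 130, 160], so the crop centre glides towards a new focus point
    # instead of snapping to it between analyzed frames.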

    def apply_framing(
        self,
        video_clip: VideoFileClip,
        framing_plan: FramingPlan,
        use_split_screen: bool = False
    ) -> VideoClip:
        """
        Apply smart framing to a video clip.

        Args:
            video_clip: Source video clip
            framing_plan: Framing plan to apply
            use_split_screen: Whether to use split screen for multiple people

        Returns:
            Reframed video clip
        """
        # Handle different layout modes
        if framing_plan.layout_mode in ["single", "single_speaker"]:
            # Single person or single speaker - use focused single framing
            return self._apply_single_framing(video_clip, framing_plan)
        elif framing_plan.layout_mode == "dual_split" and use_split_screen:
            # Two people in conversation - use split screen
            return self._apply_split_screen(video_clip, framing_plan)
        elif framing_plan.layout_mode == "grid" and use_split_screen:
            # 3+ people - use grid layout
            return self._apply_grid_layout(video_clip, framing_plan)
        else:
            # Fallback to single framing
            return self._apply_single_framing(video_clip, framing_plan)

    def _apply_single_framing(
        self,
        video_clip: VideoFileClip,
        framing_plan: FramingPlan
    ) -> VideoClip:
        """
        Apply single-focus framing (following one person or action).

        Args:
            video_clip: Source video clip
            framing_plan: Framing plan

        Returns:
            Reframed video clip
        """
        def make_frame(t):
            # Get the original frame
            frame = video_clip.get_frame(t)
            # Ensure we have valid crop regions
            if not framing_plan.crop_regions:
                # Fallback: return center crop
                h, w = frame.shape[:2]
                crop_h = int(w * self.target_aspect)
                crop_w = w
                if crop_h > h:
                    crop_h = h
                    crop_w = int(h / self.target_aspect)
                y = (h - crop_h) // 2
                x = (w - crop_w) // 2
                cropped = frame[y:y + crop_h, x:x + crop_w]
            else:
                # Calculate exact frame index with decimal precision for interpolation
                exact_frame_idx = (t * framing_plan.fps) / self.frame_skip
                # Get the two adjacent analyzed frames
                idx_floor = int(exact_frame_idx)
                idx_ceil = idx_floor + 1
                # Interpolation factor (0.0 to 1.0)
                alpha = exact_frame_idx - idx_floor
                # Clamp indices to valid range
                idx_floor = max(0, min(idx_floor, len(framing_plan.crop_regions) - 1))
                idx_ceil = max(0, min(idx_ceil, len(framing_plan.crop_regions) - 1))
                # Get crop regions
                crop1 = framing_plan.crop_regions[idx_floor]
                crop2 = framing_plan.crop_regions[idx_ceil]
                # Linear interpolation between crop regions
                x = int(crop1.x * (1 - alpha) + crop2.x * alpha)
                y = int(crop1.y * (1 - alpha) + crop2.y * alpha)
                width = int(crop1.width * (1 - alpha) + crop2.width * alpha)
                height = int(crop1.height * (1 - alpha) + crop2.height * alpha)
                # Ensure crop stays within frame bounds
                h, w = frame.shape[:2]
                x = max(0, min(x, w - width))
                y = max(0, min(y, h - height))
                width = min(width, w - x)
                height = min(height, h - y)
                # Crop the frame
                cropped = frame[y:y + height, x:x + width]
            # Resize to target dimensions
            resized = cv2.resize(
                cropped,
                (self.target_width, self.target_height),
                interpolation=cv2.INTER_LINEAR
            )
            return resized

        # MoviePy 2.x compatible way to create VideoClip
        new_clip = VideoClip(duration=video_clip.duration)
        new_clip.size = (self.target_width, self.target_height)
        new_clip.frame_function = make_frame
        return new_clip
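
    # Note (illustrative): in _apply_single_framing above, with fps=30 and frame_skip=2,
    # a request at t=1.05 s gives exact_frame_idx = (1.05 * 30) / 2 = 15.75, so the crop
    # is blended 25%/75% between analyzed frames 15 and 16 (alpha = 0.75).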

    def _apply_split_screen(
        self,
        video_clip: VideoFileClip,
        framing_plan: FramingPlan
    ) -> VideoClip:
        """
        Apply split screen for two people.

        Args:
            video_clip: Source video clip
            framing_plan: Framing plan

        Returns:
            Split screen video clip
        """
        def make_frame(t):
            frame = video_clip.get_frame(t)
            # Calculate exact frame index with decimal precision for smooth interpolation
            exact_frame_idx = (t * framing_plan.fps) / self.frame_skip
            frame_idx = int(exact_frame_idx)
            # Ensure we have valid contexts
            if not framing_plan.frame_contexts:
                # Fallback to simple center crop
                h, w = frame.shape[:2]
                crop_h = int(w * self.target_aspect)
                crop_w = w
                if crop_h > h:
                    crop_h = h
                    crop_w = int(h / self.target_aspect)
                y = (h - crop_h) // 2
                x = (w - crop_w) // 2
                cropped = frame[y:y + crop_h, x:x + crop_w]
                return cv2.resize(cropped, (self.target_width, self.target_height), interpolation=cv2.INTER_LINEAR)
            # Clamp index to valid range
            frame_idx = max(0, min(frame_idx, len(framing_plan.frame_contexts) - 1))
            context = framing_plan.frame_contexts[frame_idx]
            # Create output frame
            output = np.zeros((self.target_height, self.target_width, 3), dtype=np.uint8)
            if len(context.detected_faces) >= 2:
                # Split vertically 50/50 (two columns)
                half_width = self.target_width // 2
                # Select the 2 most relevant faces
                # Priority: ALWAYS show active speaker first + most confident other person
                if context.active_speakers and len(context.active_speakers) >= 1:
                    # Get the PRIMARY speaker (most confident among active speakers)
                    speaker_faces = [context.detected_faces[i] for i in context.active_speakers
                                     if i < len(context.detected_faces)]
                    primary_speaker = max(speaker_faces, key=lambda f: f.confidence)
                    # Get OTHER faces (not the primary speaker)
                    other_faces = [f for f in context.detected_faces if f != primary_speaker]
                    if len(speaker_faces) >= 2:
                        # Multiple speakers: show primary + second most confident speaker
                        other_speakers = [f for f in speaker_faces if f != primary_speaker]
                        secondary_person = max(other_speakers, key=lambda f: f.confidence)
                    elif other_faces:
                        # One speaker: show speaker + most confident other person
                        secondary_person = max(other_faces, key=lambda f: f.confidence)
                    else:
                        # Fallback: only one person detected
                        secondary_person = primary_speaker
                    selected_faces = [primary_speaker, secondary_person]
                else:
                    # No speakers: take 2 most confident faces
                    selected_faces = sorted(context.detected_faces, key=lambda f: f.confidence, reverse=True)[:2]
                # Sort selected faces by horizontal position for consistent left/right placement
                faces = sorted(selected_faces, key=lambda f: f.center_x)
                left_face = faces[0]
                right_face = faces[1]
                # Process each person's frame
                for idx, face in enumerate([left_face, right_face]):
                    # Calculate crop region focused on this person.
                    # Each person gets half the output width at full output height
                    # (half_width x target_height), and the source crop keeps that same
                    # aspect ratio, so resizing introduces NO distortion.
                    half_width = self.target_width // 2
                    half_aspect = self.target_height / half_width  # Aspect ratio (height/width) of each half
                    # Determine crop size based on face with padding
                    face_width = max(face.width, frame.shape[1] // 4)  # At least 1/4 of frame width
                    crop_width = int(face_width * 2.5)  # Add padding around face
                    crop_height = int(crop_width * half_aspect)  # Maintain the half's aspect
                    # Ensure crop fits in frame, maintaining aspect ratio
                    max_crop_width = frame.shape[1] // 2  # Half the source width
                    max_crop_height = frame.shape[0]  # Full source height
                    # If crop is too wide, scale down proportionally
                    if crop_width > max_crop_width:
                        crop_width = max_crop_width
                        crop_height = int(crop_width * half_aspect)
                    # If crop is too tall, scale down proportionally
                    if crop_height > max_crop_height:
                        crop_height = max_crop_height
                        crop_width = int(crop_height / half_aspect)
                    # Center crop on face
                    x = max(0, face.center_x - crop_width // 2)
                    y = max(0, face.center_y - crop_height // 2)
                    # Clamp to frame boundaries
                    x = min(x, frame.shape[1] - crop_width)
                    y = min(y, frame.shape[0] - crop_height)
                    # Extract and resize crop
                    cropped = frame[y:y + crop_height, x:x + crop_width]
                    resized = cv2.resize(
                        cropped,
                        (half_width, self.target_height),
                        interpolation=cv2.INTER_LINEAR
                    )
                    # Place in output at appropriate horizontal position
                    x_offset = idx * half_width
                    output[:, x_offset:x_offset + half_width] = resized
            else:
                # Fall back to single framing
                if framing_plan.crop_regions:
                    crop_idx = max(0, min(frame_idx, len(framing_plan.crop_regions) - 1))
                    crop = framing_plan.crop_regions[crop_idx]
                    cropped = frame[crop.y:crop.y + crop.height, crop.x:crop.x + crop.width]
                else:
                    # Fallback to center crop if no crop regions available
                    h, w = frame.shape[:2]
                    crop_h = int(w * self.target_aspect)
                    crop_w = w
                    if crop_h > h:
                        crop_h = h
                        crop_w = int(h / self.target_aspect)
                    y = (h - crop_h) // 2
                    x = (w - crop_w) // 2
                    cropped = frame[y:y + crop_h, x:x + crop_w]
                output = cv2.resize(
                    cropped,
                    (self.target_width, self.target_height),
                    interpolation=cv2.INTER_LINEAR
                )
            return output

        # MoviePy 2.x compatible way to create VideoClip
        new_clip = VideoClip(duration=video_clip.duration)
        new_clip.size = (self.target_width, self.target_height)
        new_clip.frame_function = make_frame
        return new_clip
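
    # Note (illustrative): with the default 1080x1920 output, each split-screen half is
    # 540x1920 (half_aspect ≈ 3.56); for a 1920x1080 source the per-person crop ends up
    # clamped to roughly 303x1080 around the face before being resized to 540x1920.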

    def _apply_grid_layout(
        self,
        video_clip: VideoFileClip,
        framing_plan: FramingPlan
    ) -> VideoClip:
        """
        Apply grid layout for 3+ people.

        Args:
            video_clip: Source video clip
            framing_plan: Framing plan

        Returns:
            Grid layout video clip
        """
        def make_frame(t):
            frame = video_clip.get_frame(t)
            # Calculate exact frame index with decimal precision for smooth interpolation
            exact_frame_idx = (t * framing_plan.fps) / self.frame_skip
            frame_idx = int(exact_frame_idx)
            # Ensure we have valid contexts
            if not framing_plan.frame_contexts:
                # Fallback to simple center crop
                h, w = frame.shape[:2]
                crop_h = int(w * self.target_aspect)
                crop_w = w
                if crop_h > h:
                    crop_h = h
                    crop_w = int(h / self.target_aspect)
                y = (h - crop_h) // 2
                x = (w - crop_w) // 2
                cropped = frame[y:y + crop_h, x:x + crop_w]
                return cv2.resize(cropped, (self.target_width, self.target_height), interpolation=cv2.INTER_LINEAR)
            # Clamp index to valid range
            frame_idx = max(0, min(frame_idx, len(framing_plan.frame_contexts) - 1))
            context = framing_plan.frame_contexts[frame_idx]
            output = np.zeros((self.target_height, self.target_width, 3), dtype=np.uint8)
            num_faces = len(context.detected_faces)
            if num_faces >= 3:
                # Create 2x2 grid
                cell_width = self.target_width // 2
                cell_height = self.target_height // 2
                for idx, face in enumerate(context.detected_faces[:4]):
                    # Calculate grid position
                    row = idx // 2
                    col = idx % 2
                    # Each grid cell keeps the output aspect ratio (with the 1080x1920
                    # defaults a cell is 540x960, i.e. 9:16 again, not square)
                    cell_aspect = cell_height / cell_width
                    # Crop around face with correct aspect ratio
                    crop_width = frame.shape[1] // 2
                    crop_height = int(crop_width * cell_aspect)
                    # Ensure crop fits in frame, maintaining aspect
                    max_crop_width = frame.shape[1] // 2
                    max_crop_height = frame.shape[0] // 2
                    if crop_width > max_crop_width:
                        crop_width = max_crop_width
                        crop_height = int(crop_width * cell_aspect)
                    if crop_height > max_crop_height:
                        crop_height = max_crop_height
                        crop_width = int(crop_height / cell_aspect)
                    # Center crop on face
                    x = max(0, face.center_x - crop_width // 2)
                    y = max(0, face.center_y - crop_height // 2)
                    # Clamp to frame boundaries
                    x = min(x, frame.shape[1] - crop_width)
                    y = min(y, frame.shape[0] - crop_height)
                    cropped = frame[y:y + crop_height, x:x + crop_width]
                    resized = cv2.resize(
                        cropped,
                        (cell_width, cell_height),
                        interpolation=cv2.INTER_LINEAR
                    )
                    # Place in grid
                    y_offset = row * cell_height
                    x_offset = col * cell_width
                    output[y_offset:y_offset + cell_height, x_offset:x_offset + cell_width] = resized
            else:
                # Fall back to single framing
                if framing_plan.crop_regions:
                    crop_idx = max(0, min(frame_idx, len(framing_plan.crop_regions) - 1))
                    crop = framing_plan.crop_regions[crop_idx]
                    cropped = frame[crop.y:crop.y + crop.height, crop.x:crop.x + crop.width]
                else:
                    # Fallback to center crop if no crop regions available
                    h, w = frame.shape[:2]
                    crop_h = int(w * self.target_aspect)
                    crop_w = w
                    if crop_h > h:
                        crop_h = h
                        crop_w = int(h / self.target_aspect)
                    y = (h - crop_h) // 2
                    x = (w - crop_w) // 2
                    cropped = frame[y:y + crop_h, x:x + crop_w]
                output = cv2.resize(
                    cropped,
                    (self.target_width, self.target_height),
                    interpolation=cv2.INTER_LINEAR
                )
            return output

        # MoviePy 2.x compatible way to create VideoClip
        new_clip = VideoClip(duration=video_clip.duration)
        new_clip.size = (self.target_width, self.target_height)
        new_clip.frame_function = make_frame
        return new_clip
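
    # Note (illustrative): with the default 1080x1920 output each grid cell is 540x960,
    # so for a 1920x1080 source each face crop is clamped to roughly 303x540 before
    # being resized into its cell; with only 3 faces the fourth cell stays black.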


def extract_audio_samples(video_path: str, start_time: float, end_time: float) -> Optional[np.ndarray]:
    """
    Extract audio samples from video for speech detection.

    Args:
        video_path: Path to video file
        start_time: Start time in seconds
        end_time: End time in seconds

    Returns:
        Audio samples array or None if no audio
    """
    try:
        from moviepy.audio.io.AudioFileClip import AudioFileClip
        with AudioFileClip(video_path) as audio:
            segment = audio.subclipped(start_time, end_time)
            fps = getattr(segment, 'fps', 44100)
            samples = segment.to_soundarray(fps=fps)
            return samples
    except Exception as exc:
        logger.warning(f"Failed to extract audio: {exc}")
        return None
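

if __name__ == "__main__":
    # Minimal usage sketch (not part of the original module): assumes a local
    # "input.mp4" exists and shows how the pieces above are typically wired together.
    source = VideoFileClip("input.mp4")
    samples = extract_audio_samples("input.mp4", start_time=0.0, end_time=10.0)
    framer = SmartFramer(target_width=1080, target_height=1920, frame_skip=2)
    plan = framer.create_framing_plan("input.mp4", start_time=0.0, end_time=10.0, audio_samples=samples)
    vertical = framer.apply_framing(source.subclipped(0.0, 10.0), plan, use_split_screen=True)
    vertical.write_videofile("vertical_preview.mp4", fps=plan.fps, codec="libx264")
    source.close()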