# video-render/video_render/smart_framing.py
"""
Smart framing module for intelligent video cropping and composition.
This module provides functionality to create 9:16 vertical videos with
intelligent framing that follows the action and speakers.
"""
from __future__ import annotations
import logging
from dataclasses import dataclass
from typing import List, Optional, Tuple
import cv2
import numpy as np
from moviepy.video.VideoClip import VideoClip
from moviepy.video.io.VideoFileClip import VideoFileClip
from scipy import signal
from video_render.context_detection import ContextAnalyzer, FrameContext, FaceDetection
logger = logging.getLogger(__name__)
@dataclass
class CropRegion:
"""Defines a crop region for a frame."""
x: int
y: int
width: int
height: int
@dataclass
class FramingPlan:
"""Complete framing plan for a video segment."""
frame_contexts: List[FrameContext]
crop_regions: List[CropRegion]
layout_mode: str
fps: float
class SmartFramer:
"""Creates intelligent 9:16 framing for horizontal videos."""
def __init__(
self,
target_width: int = 1080,
target_height: int = 1920,
frame_skip: int = 1,
smoothing_window: int = 30,
max_velocity: int = 20,
person_switch_cooldown: int = 999999
):
self.target_width = target_width
self.target_height = target_height
self.target_aspect = target_height / target_width
self.frame_skip = frame_skip
self.smoothing_window = smoothing_window
self.max_velocity = max_velocity
self.person_switch_cooldown = person_switch_cooldown
logger.info(f"Smart framer initialized (target: {target_width}x{target_height}, frame_skip={frame_skip}, smoothing={smoothing_window}, velocity={max_velocity}, cooldown={person_switch_cooldown})")
def create_framing_plan(
self,
video_path: str,
start_time: float,
end_time: float,
audio_samples: Optional[np.ndarray] = None
) -> FramingPlan:
"""
Analyze video and create a complete framing plan.
Args:
video_path: Path to video file
start_time: Start time in seconds
end_time: End time in seconds
audio_samples: Optional audio samples for speech detection
Returns:
FramingPlan with all frame contexts and crop regions
"""
analyzer = ContextAnalyzer(person_switch_cooldown=self.person_switch_cooldown)
speaking_periods = None
if audio_samples is not None:
speaking_periods = analyzer.audio_detector.detect_speaking_periods(audio_samples)
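        # Hint OpenCV's FFmpeg backend to stay quiet before the capture is opened,
        # otherwise decoder warnings flood the logs on long clips.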
import os
os.environ['OPENCV_FFMPEG_CAPTURE_OPTIONS'] = 'loglevel;quiet'
cap = cv2.VideoCapture(video_path)
fps = cap.get(cv2.CAP_PROP_FPS)
start_frame = int(start_time * fps)
end_frame = int(end_time * fps)
cap.set(cv2.CAP_PROP_POS_FRAMES, start_frame)
frame_contexts = []
frame_number = start_frame
processed_count = 0
logger.info(f"Analyzing frames {start_frame} to {end_frame} (fps={fps}, skip={self.frame_skip})")
while frame_number < end_frame:
ret, frame = cap.read()
if not ret:
break
if processed_count % self.frame_skip == 0:
timestamp = frame_number / fps
context = analyzer.analyze_frame(frame, timestamp, frame_number, speaking_periods)
frame_contexts.append(context)
frame_number += 1
processed_count += 1
source_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
source_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
cap.release()
analyzer.close()
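        # Pick the clip-wide layout by simple majority vote over the per-frame layout modes.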
layout_modes = [ctx.layout_mode for ctx in frame_contexts]
if layout_modes:
overall_layout = max(set(layout_modes), key=layout_modes.count)
else:
overall_layout = "single"
crop_regions = self._calculate_crop_regions(
frame_contexts,
source_width,
source_height
)
framing_plan = FramingPlan(
frame_contexts=frame_contexts,
crop_regions=crop_regions,
layout_mode=overall_layout,
fps=fps
)
import gc
gc.collect()
return framing_plan
def _calculate_crop_regions(
self,
contexts: List[FrameContext],
source_width: int,
source_height: int
) -> List[CropRegion]:
"""
Calculate smooth crop regions for each frame.
Args:
contexts: List of frame contexts
source_width: Source video width
source_height: Source video height
Returns:
List of crop regions
"""
if not contexts:
return []
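        # Fit the largest window with the target aspect ratio inside the source frame.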
source_aspect = source_width / source_height
if source_aspect > self.target_aspect:
crop_height = source_height
crop_width = int(crop_height / self.target_aspect)
if crop_width > source_width:
crop_width = source_width
crop_height = int(crop_width * self.target_aspect)
else:
crop_width = source_width
crop_height = int(crop_width * self.target_aspect)
if crop_height > source_height:
crop_height = source_height
crop_width = int(crop_height / self.target_aspect)
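        # Safe-zone tracking: the crop centre only moves when the primary face leaves a
        # central region (40% margins on each axis), and only by shifts larger than the
        # dead-zone threshold, so small head movements never nudge the frame.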
safe_zone_margin_x = crop_width * 0.40
safe_zone_margin_y = crop_height * 0.40
dead_zone_threshold = 100
if contexts and contexts[0].primary_focus:
current_crop_center_x = contexts[0].primary_focus[0]
current_crop_center_y = contexts[0].primary_focus[1]
else:
current_crop_center_x = source_width // 2
current_crop_center_y = source_height // 2
center_xs = [current_crop_center_x]
center_ys = [current_crop_center_y]
for ctx in contexts[1:]:
if ctx.primary_focus and ctx.selected_people and len(ctx.detected_faces) > 0:
primary_person_idx = ctx.selected_people[0] if ctx.selected_people else 0
if primary_person_idx < len(ctx.detected_faces):
face = ctx.detected_faces[primary_person_idx]
face_left = face.x
face_right = face.x + face.width
face_top = face.y
face_bottom = face.y + face.height
crop_left = current_crop_center_x - crop_width // 2
crop_right = current_crop_center_x + crop_width // 2
crop_top = current_crop_center_y - crop_height // 2
crop_bottom = current_crop_center_y + crop_height // 2
face_rel_left = face_left - crop_left
face_rel_right = face_right - crop_left
face_rel_top = face_top - crop_top
face_rel_bottom = face_bottom - crop_top
face_left_safe = face_rel_left >= safe_zone_margin_x
face_right_safe = face_rel_right <= (crop_width - safe_zone_margin_x)
face_top_safe = face_rel_top >= safe_zone_margin_y
face_bottom_safe = face_rel_bottom <= (crop_height - safe_zone_margin_y)
face_fully_visible = face_left_safe and face_right_safe and face_top_safe and face_bottom_safe
if face_fully_visible:
center_xs.append(current_crop_center_x)
center_ys.append(current_crop_center_y)
else:
shift_x = 0
shift_y = 0
if not face_left_safe:
shift_x = face_rel_left - safe_zone_margin_x
elif not face_right_safe:
shift_x = face_rel_right - (crop_width - safe_zone_margin_x)
if not face_top_safe:
shift_y = face_rel_top - safe_zone_margin_y
elif not face_bottom_safe:
shift_y = face_rel_bottom - (crop_height - safe_zone_margin_y)
if abs(shift_x) > dead_zone_threshold:
current_crop_center_x += shift_x
if abs(shift_y) > dead_zone_threshold:
current_crop_center_y += shift_y
center_xs.append(current_crop_center_x)
center_ys.append(current_crop_center_y)
else:
center_xs.append(current_crop_center_x)
center_ys.append(current_crop_center_y)
else:
center_xs.append(current_crop_center_x)
center_ys.append(current_crop_center_y)
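        # Smooth the raw centre track with a heavy exponential moving average (alpha=0.002)
        # so remaining jumps become slow drifts; frames where the target did not move keep
        # the previous smoothed value exactly.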
if len(center_xs) > 1:
alpha = 0.002
smoothed_xs = [center_xs[0]]
smoothed_ys = [center_ys[0]]
for i in range(1, len(center_xs)):
if center_xs[i] != center_xs[i-1] or center_ys[i] != center_ys[i-1]:
smoothed_xs.append(alpha * center_xs[i] + (1 - alpha) * smoothed_xs[i-1])
smoothed_ys.append(alpha * center_ys[i] + (1 - alpha) * smoothed_ys[i-1])
else:
smoothed_xs.append(smoothed_xs[i-1])
smoothed_ys.append(smoothed_ys[i-1])
center_xs = smoothed_xs
center_ys = smoothed_ys
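        # Final clean-up passes with tight, hardcoded limits: at most 2 px of movement per
        # analysed frame, and moves under 5 px are ignored outright.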
center_xs = self._limit_velocity(center_xs, 2)
center_ys = self._limit_velocity(center_ys, 2)
center_xs = self._apply_dead_zone(center_xs, 5)
center_ys = self._apply_dead_zone(center_ys, 5)
crop_regions = []
for center_x, center_y in zip(center_xs, center_ys):
x = int(center_x - crop_width // 2)
y = int(center_y - crop_height // 2)
x = max(0, min(x, source_width - crop_width))
y = max(0, min(y, source_height - crop_height))
crop_regions.append(CropRegion(
x=x,
y=y,
width=crop_width,
height=crop_height
))
center_xs.clear()
center_ys.clear()
return crop_regions
def _apply_dead_zone(self, positions: List[float], threshold: float) -> List[float]:
"""
Apply dead zone to eliminate micro-movements.
If change is smaller than threshold, keep previous position.
Args:
positions: List of positions
threshold: Minimum change needed to move (pixels)
Returns:
Positions with dead zone applied
"""
if len(positions) <= 1:
return positions
filtered = [positions[0]]
for i in range(1, len(positions)):
delta = abs(positions[i] - filtered[i - 1])
if delta < threshold:
filtered.append(filtered[i - 1])
else:
filtered.append(positions[i])
return filtered
def _limit_velocity(self, positions: List[float], max_velocity: float) -> List[float]:
"""
Limit the velocity of position changes.
Args:
positions: List of positions
max_velocity: Maximum allowed change per frame
Returns:
Smoothed positions
"""
if len(positions) <= 1:
return positions
limited = [positions[0]]
for i in range(1, len(positions)):
delta = positions[i] - limited[i - 1]
if abs(delta) > max_velocity:
delta = max_velocity if delta > 0 else -max_velocity
limited.append(limited[i - 1] + delta)
return limited
def apply_framing(
self,
video_clip: VideoFileClip,
framing_plan: FramingPlan
) -> VideoClip:
"""
Apply smart framing to a video clip.
Always uses single-person focus (no split screen).
Args:
video_clip: Source video clip
framing_plan: Framing plan to apply
Returns:
Reframed video clip
"""
return self._apply_single_framing(video_clip, framing_plan)
def _apply_single_framing(
self,
video_clip: VideoFileClip,
framing_plan: FramingPlan
) -> VideoClip:
"""
Apply single-focus framing (following one person or action).
Args:
video_clip: Source video clip
framing_plan: Framing plan
Returns:
Reframed video clip
"""
def make_frame(t):
frame = video_clip.get_frame(t)
if not framing_plan.crop_regions:
h, w = frame.shape[:2]
crop_h = int(w * self.target_aspect)
crop_w = w
if crop_h > h:
crop_h = h
crop_w = int(h / self.target_aspect)
y = (h - crop_h) // 2
x = (w - crop_w) // 2
cropped = frame[y:y + crop_h, x:x + crop_w]
else:
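                # Crop regions were sampled every `frame_skip` frames, so rescale the
                # timestamp to that grid and linearly interpolate between the two nearest
                # regions for sub-sample smoothness.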
exact_frame_idx = (t * framing_plan.fps) / self.frame_skip
idx_floor = int(exact_frame_idx)
idx_ceil = idx_floor + 1
alpha = exact_frame_idx - idx_floor
idx_floor = max(0, min(idx_floor, len(framing_plan.crop_regions) - 1))
idx_ceil = max(0, min(idx_ceil, len(framing_plan.crop_regions) - 1))
crop1 = framing_plan.crop_regions[idx_floor]
crop2 = framing_plan.crop_regions[idx_ceil]
x = int(crop1.x * (1 - alpha) + crop2.x * alpha)
y = int(crop1.y * (1 - alpha) + crop2.y * alpha)
width = int(crop1.width * (1 - alpha) + crop2.width * alpha)
height = int(crop1.height * (1 - alpha) + crop2.height * alpha)
h, w = frame.shape[:2]
x = max(0, min(x, w - width))
y = max(0, min(y, h - height))
width = min(width, w - x)
height = min(height, h - y)
cropped = frame[y:y + height, x:x + width]
resized = cv2.resize(
cropped,
(self.target_width, self.target_height),
interpolation=cv2.INTER_LINEAR
)
return resized
new_clip = VideoClip(duration=video_clip.duration)
new_clip.size = (self.target_width, self.target_height)
new_clip.frame_function = make_frame
return new_clip
def _apply_split_screen(
self,
video_clip: VideoFileClip,
framing_plan: FramingPlan
) -> VideoClip:
"""
Apply split screen for two people.
Args:
video_clip: Source video clip
framing_plan: Framing plan
Returns:
Split screen video clip
"""
def make_frame(t):
frame = video_clip.get_frame(t)
exact_frame_idx = (t * framing_plan.fps) / self.frame_skip
frame_idx = int(exact_frame_idx)
if not framing_plan.frame_contexts:
h, w = frame.shape[:2]
crop_h = int(w * self.target_aspect)
crop_w = w
if crop_h > h:
crop_h = h
crop_w = int(h / self.target_aspect)
y = (h - crop_h) // 2
x = (w - crop_w) // 2
cropped = frame[y:y + crop_h, x:x + crop_w]
return cv2.resize(cropped, (self.target_width, self.target_height), interpolation=cv2.INTER_LINEAR)
frame_idx = max(0, min(frame_idx, len(framing_plan.frame_contexts) - 1))
context = framing_plan.frame_contexts[frame_idx]
output = np.zeros((self.target_height, self.target_width, 3), dtype=np.uint8)
if context.selected_people and len(context.selected_people) >= 2:
selected_faces = [context.detected_faces[i] for i in context.selected_people[:2]
if i < len(context.detected_faces)]
if len(selected_faces) >= 2:
faces = sorted(selected_faces, key=lambda f: f.center_x)
left_face = faces[0]
right_face = faces[1]
for idx, face in enumerate([left_face, right_face]):
half_width = self.target_width // 2
half_aspect = self.target_height / half_width # Aspect ratio for half
face_width = max(face.width, frame.shape[1] // 4) # At least 1/4 of frame width
crop_width = int(face_width * 2.5) # Add padding around face
crop_height = int(crop_width * half_aspect) # Maintain correct aspect
max_crop_width = frame.shape[1] // 2 # Half the source width
max_crop_height = frame.shape[0] # Full source height
if crop_width > max_crop_width:
crop_width = max_crop_width
crop_height = int(crop_width * half_aspect)
if crop_height > max_crop_height:
crop_height = max_crop_height
crop_width = int(crop_height / half_aspect)
x = max(0, face.center_x - crop_width // 2)
y = max(0, face.center_y - crop_height // 2)
x = min(x, frame.shape[1] - crop_width)
y = min(y, frame.shape[0] - crop_height)
cropped = frame[y:y + crop_height, x:x + crop_width]
resized = cv2.resize(
cropped,
(half_width, self.target_height),
interpolation=cv2.INTER_LINEAR
)
x_offset = idx * half_width
output[:, x_offset:x_offset + half_width] = resized
else:
if framing_plan.crop_regions:
crop_idx = max(0, min(frame_idx, len(framing_plan.crop_regions) - 1))
crop = framing_plan.crop_regions[crop_idx]
cropped = frame[crop.y:crop.y + crop.height, crop.x:crop.x + crop.width]
else:
h, w = frame.shape[:2]
crop_h = int(w * self.target_aspect)
crop_w = w
if crop_h > h:
crop_h = h
crop_w = int(h / self.target_aspect)
y = (h - crop_h) // 2
x = (w - crop_w) // 2
cropped = frame[y:y + crop_h, x:x + crop_w]
output = cv2.resize(
cropped,
(self.target_width, self.target_height),
interpolation=cv2.INTER_LINEAR
)
else:
if framing_plan.crop_regions:
crop_idx = max(0, min(frame_idx, len(framing_plan.crop_regions) - 1))
crop = framing_plan.crop_regions[crop_idx]
cropped = frame[crop.y:crop.y + crop.height, crop.x:crop.x + crop.width]
else:
h, w = frame.shape[:2]
crop_h = int(w * self.target_aspect)
crop_w = w
if crop_h > h:
crop_h = h
crop_w = int(h / self.target_aspect)
y = (h - crop_h) // 2
x = (w - crop_w) // 2
cropped = frame[y:y + crop_h, x:x + crop_w]
output = cv2.resize(
cropped,
(self.target_width, self.target_height),
interpolation=cv2.INTER_LINEAR
)
return output
new_clip = VideoClip(duration=video_clip.duration)
new_clip.size = (self.target_width, self.target_height)
new_clip.frame_function = make_frame
return new_clip
def _apply_grid_layout(
self,
video_clip: VideoFileClip,
framing_plan: FramingPlan
) -> VideoClip:
"""
Apply grid layout for 3+ people.
Args:
video_clip: Source video clip
framing_plan: Framing plan
Returns:
Grid layout video clip
"""
def make_frame(t):
frame = video_clip.get_frame(t)
exact_frame_idx = (t * framing_plan.fps) / self.frame_skip
frame_idx = int(exact_frame_idx)
if not framing_plan.frame_contexts:
h, w = frame.shape[:2]
crop_h = int(w * self.target_aspect)
crop_w = w
if crop_h > h:
crop_h = h
crop_w = int(h / self.target_aspect)
y = (h - crop_h) // 2
x = (w - crop_w) // 2
cropped = frame[y:y + crop_h, x:x + crop_w]
return cv2.resize(cropped, (self.target_width, self.target_height), interpolation=cv2.INTER_LINEAR)
frame_idx = max(0, min(frame_idx, len(framing_plan.frame_contexts) - 1))
context = framing_plan.frame_contexts[frame_idx]
output = np.zeros((self.target_height, self.target_width, 3), dtype=np.uint8)
num_faces = len(context.detected_faces)
if num_faces >= 3:
cell_width = self.target_width // 2
cell_height = self.target_height // 2
for idx, face in enumerate(context.detected_faces[:4]):
row = idx // 2
col = idx % 2
cell_aspect = cell_height / cell_width
crop_width = frame.shape[1] // 2
crop_height = int(crop_width * cell_aspect)
max_crop_width = frame.shape[1] // 2
max_crop_height = frame.shape[0] // 2
if crop_width > max_crop_width:
crop_width = max_crop_width
crop_height = int(crop_width * cell_aspect)
if crop_height > max_crop_height:
crop_height = max_crop_height
crop_width = int(crop_height / cell_aspect)
x = max(0, face.center_x - crop_width // 2)
y = max(0, face.center_y - crop_height // 2)
x = min(x, frame.shape[1] - crop_width)
y = min(y, frame.shape[0] - crop_height)
cropped = frame[y:y + crop_height, x:x + crop_width]
resized = cv2.resize(
cropped,
(cell_width, cell_height),
interpolation=cv2.INTER_LINEAR
)
y_offset = row * cell_height
x_offset = col * cell_width
output[y_offset:y_offset + cell_height, x_offset:x_offset + cell_width] = resized
else:
if framing_plan.crop_regions:
crop_idx = max(0, min(frame_idx, len(framing_plan.crop_regions) - 1))
crop = framing_plan.crop_regions[crop_idx]
cropped = frame[crop.y:crop.y + crop.height, crop.x:crop.x + crop.width]
else:
h, w = frame.shape[:2]
crop_h = int(w * self.target_aspect)
crop_w = w
if crop_h > h:
crop_h = h
crop_w = int(h / self.target_aspect)
y = (h - crop_h) // 2
x = (w - crop_w) // 2
cropped = frame[y:y + crop_h, x:x + crop_w]
output = cv2.resize(
cropped,
(self.target_width, self.target_height),
interpolation=cv2.INTER_LINEAR
)
return output
new_clip = VideoClip(duration=video_clip.duration)
new_clip.size = (self.target_width, self.target_height)
new_clip.frame_function = make_frame
return new_clip
def extract_audio_samples(video_path: str, start_time: float, end_time: float) -> Optional[np.ndarray]:
"""
Extract audio samples from video for speech detection.
Args:
video_path: Path to video file
start_time: Start time in seconds
end_time: End time in seconds
Returns:
Audio samples array or None if no audio
"""
try:
from moviepy.audio.io.AudioFileClip import AudioFileClip
with AudioFileClip(video_path) as audio:
segment = audio.subclipped(start_time, end_time)
            fps = getattr(segment, 'fps', None) or 44100
samples = segment.to_soundarray(fps=fps)
return samples
except Exception as exc:
logger.warning(f"Failed to extract audio: {exc}")
return None
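# ---------------------------------------------------------------------------
# Minimal usage sketch (illustrative only). The input path, clip window and
# output settings below are placeholder assumptions, not project defaults; the
# MoviePy calls follow the 2.x API already used above (subclipped,
# write_videofile).
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)

    source_path = "input.mp4"          # placeholder path
    clip_start, clip_end = 10.0, 25.0  # placeholder clip window (seconds)

    # Optional speech-detection input; framing still works without audio.
    audio = extract_audio_samples(source_path, clip_start, clip_end)

    framer = SmartFramer(target_width=1080, target_height=1920)
    plan = framer.create_framing_plan(source_path, clip_start, clip_end, audio)

    with VideoFileClip(source_path) as clip:
        segment = clip.subclipped(clip_start, clip_end)
        framed = framer.apply_framing(segment, plan)
        framed.write_videofile("framed_output.mp4", fps=plan.fps)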