""" Smart framing module for intelligent video cropping and composition. This module provides functionality to create 9:16 vertical videos with intelligent framing that follows the action and speakers. """ from __future__ import annotations import logging from dataclasses import dataclass from typing import List, Optional, Tuple import cv2 import numpy as np from moviepy.video.VideoClip import VideoClip from moviepy.video.io.VideoFileClip import VideoFileClip from scipy import signal from video_render.context_detection import ContextAnalyzer, FrameContext, FaceDetection logger = logging.getLogger(__name__) @dataclass class CropRegion: """Defines a crop region for a frame.""" x: int y: int width: int height: int @dataclass class FramingPlan: """Complete framing plan for a video segment.""" frame_contexts: List[FrameContext] crop_regions: List[CropRegion] layout_mode: str fps: float class SmartFramer: """Creates intelligent 9:16 framing for horizontal videos.""" def __init__( self, target_width: int = 1080, target_height: int = 1920, frame_skip: int = 2, smoothing_window: int = 15 ): self.target_width = target_width self.target_height = target_height self.target_aspect = target_height / target_width # Performance parameters self.frame_skip = frame_skip # Process every Nth frame (CPU optimization) # Smoothing parameters self.smoothing_window = smoothing_window self.max_velocity = 30 # pixels per frame (reduced for smoother transitions) logger.info(f"Smart framer initialized (target: {target_width}x{target_height}, frame_skip={frame_skip})") def create_framing_plan( self, video_path: str, start_time: float, end_time: float, audio_samples: Optional[np.ndarray] = None ) -> FramingPlan: """ Analyze video and create a complete framing plan. Args: video_path: Path to video file start_time: Start time in seconds end_time: End time in seconds audio_samples: Optional audio samples for speech detection Returns: FramingPlan with all frame contexts and crop regions """ analyzer = ContextAnalyzer() # Detect speaking periods from audio if available speaking_periods = None if audio_samples is not None: speaking_periods = analyzer.audio_detector.detect_speaking_periods(audio_samples) # Open video with error suppression for AV1 codec warnings import os os.environ['OPENCV_FFMPEG_CAPTURE_OPTIONS'] = 'loglevel;quiet' cap = cv2.VideoCapture(video_path) fps = cap.get(cv2.CAP_PROP_FPS) # Calculate frame range start_frame = int(start_time * fps) end_frame = int(end_time * fps) # Set to start frame cap.set(cv2.CAP_PROP_POS_FRAMES, start_frame) frame_contexts = [] frame_number = start_frame processed_count = 0 logger.info(f"Analyzing frames {start_frame} to {end_frame} (fps={fps}, skip={self.frame_skip})") while frame_number < end_frame: ret, frame = cap.read() if not ret: break # Only process every Nth frame for performance (CPU optimization) if processed_count % self.frame_skip == 0: timestamp = frame_number / fps context = analyzer.analyze_frame(frame, timestamp, frame_number, speaking_periods) frame_contexts.append(context) frame_number += 1 processed_count += 1 # Get video dimensions before releasing capture source_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)) source_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)) cap.release() analyzer.close() # Determine overall layout mode (most common) layout_modes = [ctx.layout_mode for ctx in frame_contexts] if layout_modes: overall_layout = max(set(layout_modes), key=layout_modes.count) else: overall_layout = "single" # Calculate crop regions based on contexts 
        crop_regions = self._calculate_crop_regions(
            frame_contexts, source_width, source_height
        )

        return FramingPlan(
            frame_contexts=frame_contexts,
            crop_regions=crop_regions,
            layout_mode=overall_layout,
            fps=fps,
        )

    def _calculate_crop_regions(
        self,
        contexts: List[FrameContext],
        source_width: int,
        source_height: int,
    ) -> List[CropRegion]:
        """
        Calculate smooth crop regions for each frame.

        Args:
            contexts: List of frame contexts
            source_width: Source video width
            source_height: Source video height

        Returns:
            List of crop regions
        """
        if not contexts:
            return []

        # Calculate ideal crop dimensions maintaining an EXACT 9:16 aspect ratio.
        # Both ratios below are expressed as width / height so the comparison is consistent.
        source_aspect = source_width / source_height
        target_width_over_height = self.target_width / self.target_height

        if source_aspect > target_width_over_height:
            # Source is wider than 9:16 - use the full height and crop horizontally
            crop_height = source_height
            crop_width = int(crop_height / self.target_aspect)
            # Ensure crop width fits within source
            if crop_width > source_width:
                crop_width = source_width
                crop_height = int(crop_width * self.target_aspect)
        else:
            # Source is narrower than 9:16 - use the full width and crop vertically
            crop_width = source_width
            crop_height = int(crop_width * self.target_aspect)
            # Ensure crop height fits within source
            if crop_height > source_height:
                crop_height = source_height
                crop_width = int(crop_height / self.target_aspect)

        # Calculate center points for each frame.
        # Since the focus is always ONE person directly (not an average of faces),
        # the focus point can be used as-is without extra validation.
        center_xs = []
        center_ys = []

        for ctx in contexts:
            if ctx.primary_focus:
                # Primary focus is always a single person's center, never averaged,
                # so it will never land on the table or empty space between people.
                center_xs.append(ctx.primary_focus[0])
                center_ys.append(ctx.primary_focus[1])
            else:
                # Default to the frame center only if no faces were detected at all
                center_xs.append(source_width // 2)
                center_ys.append(source_height // 2)

        # Smooth the center points
        if len(center_xs) > self.smoothing_window:
            kernel_size = min(self.smoothing_window, len(center_xs))
            if kernel_size % 2 == 0:
                kernel_size -= 1
            center_xs = signal.medfilt(center_xs, kernel_size=kernel_size).tolist()
            center_ys = signal.medfilt(center_ys, kernel_size=kernel_size).tolist()

        # Limit velocity (prevent jarring movements)
        center_xs = self._limit_velocity(center_xs, self.max_velocity)
        center_ys = self._limit_velocity(center_ys, self.max_velocity)

        # Convert to crop regions
        crop_regions = []
        for center_x, center_y in zip(center_xs, center_ys):
            # Calculate top-left corner
            x = int(center_x - crop_width // 2)
            y = int(center_y - crop_height // 2)

            # Clamp to valid bounds
            x = max(0, min(x, source_width - crop_width))
            y = max(0, min(y, source_height - crop_height))

            crop_regions.append(CropRegion(
                x=x,
                y=y,
                width=crop_width,
                height=crop_height,
            ))

        return crop_regions

    def _limit_velocity(self, positions: List[float], max_velocity: float) -> List[float]:
        """
        Limit the velocity of position changes.

        Args:
            positions: List of positions
            max_velocity: Maximum allowed change per frame

        Returns:
            Smoothed positions
        """
        if len(positions) <= 1:
            return positions

        limited = [positions[0]]
        for i in range(1, len(positions)):
            delta = positions[i] - limited[i - 1]
            if abs(delta) > max_velocity:
                delta = max_velocity if delta > 0 else -max_velocity
            limited.append(limited[i - 1] + delta)

        return limited

    def apply_framing(
        self,
        video_clip: VideoFileClip,
        framing_plan: FramingPlan,
        use_split_screen: bool = False,
    ) -> VideoClip:
        """
        Apply smart framing to a video clip.

        Args:
            video_clip: Source video clip
            framing_plan: Framing plan to apply
            use_split_screen: Whether to use split screen for multiple people

        Returns:
            Reframed video clip
        """
        # Handle different layout modes
        if framing_plan.layout_mode in ["single", "single_speaker"]:
            # Single person or single speaker - use focused single framing
            return self._apply_single_framing(video_clip, framing_plan)
        elif framing_plan.layout_mode == "dual_split" and use_split_screen:
            # Two people in conversation - use split screen
            return self._apply_split_screen(video_clip, framing_plan)
        elif framing_plan.layout_mode == "grid" and use_split_screen:
            # 3+ people - use grid layout
            return self._apply_grid_layout(video_clip, framing_plan)
        else:
            # Fallback to single framing
            return self._apply_single_framing(video_clip, framing_plan)

    def _apply_single_framing(
        self,
        video_clip: VideoFileClip,
        framing_plan: FramingPlan,
    ) -> VideoClip:
        """
        Apply single-focus framing (following one person or the action).

        Args:
            video_clip: Source video clip
            framing_plan: Framing plan

        Returns:
            Reframed video clip
        """
        def make_frame(t):
            # Get the original frame
            frame = video_clip.get_frame(t)

            # Ensure we have valid crop regions
            if not framing_plan.crop_regions:
                # Fallback: return a center crop
                h, w = frame.shape[:2]
                crop_h = int(w * self.target_aspect)
                crop_w = w
                if crop_h > h:
                    crop_h = h
                    crop_w = int(h / self.target_aspect)
                y = (h - crop_h) // 2
                x = (w - crop_w) // 2
                cropped = frame[y:y + crop_h, x:x + crop_w]
            else:
                # Calculate the exact (fractional) frame index for interpolation
                exact_frame_idx = (t * framing_plan.fps) / self.frame_skip

                # Get the two adjacent analyzed frames
                idx_floor = int(exact_frame_idx)
                idx_ceil = idx_floor + 1

                # Interpolation factor (0.0 to 1.0)
                alpha = exact_frame_idx - idx_floor

                # Clamp indices to the valid range
                idx_floor = max(0, min(idx_floor, len(framing_plan.crop_regions) - 1))
                idx_ceil = max(0, min(idx_ceil, len(framing_plan.crop_regions) - 1))

                # Get crop regions
                crop1 = framing_plan.crop_regions[idx_floor]
                crop2 = framing_plan.crop_regions[idx_ceil]

                # Linear interpolation between crop regions
                x = int(crop1.x * (1 - alpha) + crop2.x * alpha)
                y = int(crop1.y * (1 - alpha) + crop2.y * alpha)
                width = int(crop1.width * (1 - alpha) + crop2.width * alpha)
                height = int(crop1.height * (1 - alpha) + crop2.height * alpha)

                # Ensure the crop stays within frame bounds
                h, w = frame.shape[:2]
                x = max(0, min(x, w - width))
                y = max(0, min(y, h - height))
                width = min(width, w - x)
                height = min(height, h - y)

                # Crop the frame
                cropped = frame[y:y + height, x:x + width]

            # Resize to target dimensions
            resized = cv2.resize(
                cropped,
                (self.target_width, self.target_height),
                interpolation=cv2.INTER_LINEAR,
            )

            return resized

        # MoviePy 2.x compatible way to create a VideoClip
        new_clip = VideoClip(duration=video_clip.duration)
        new_clip.size = (self.target_width, self.target_height)
        new_clip.frame_function = make_frame

        return new_clip

    def _apply_split_screen(
        self,
        video_clip: VideoFileClip,
        framing_plan: FramingPlan,
    ) -> VideoClip:
        """
        Apply split screen for two people.

        Args:
            video_clip: Source video clip
            framing_plan: Framing plan

        Returns:
            Split screen video clip
        """
        def make_frame(t):
            frame = video_clip.get_frame(t)

            # Calculate the exact frame index for this timestamp
            exact_frame_idx = (t * framing_plan.fps) / self.frame_skip
            frame_idx = int(exact_frame_idx)

            # Ensure we have valid contexts
            if not framing_plan.frame_contexts:
                # Fallback to a simple center crop
                h, w = frame.shape[:2]
                crop_h = int(w * self.target_aspect)
                crop_w = w
                if crop_h > h:
                    crop_h = h
                    crop_w = int(h / self.target_aspect)
                y = (h - crop_h) // 2
                x = (w - crop_w) // 2
                cropped = frame[y:y + crop_h, x:x + crop_w]
                return cv2.resize(
                    cropped,
                    (self.target_width, self.target_height),
                    interpolation=cv2.INTER_LINEAR,
                )

            # Clamp index to the valid range
            frame_idx = max(0, min(frame_idx, len(framing_plan.frame_contexts) - 1))
            context = framing_plan.frame_contexts[frame_idx]

            # Create output frame
            output = np.zeros((self.target_height, self.target_width, 3), dtype=np.uint8)

            if len(context.detected_faces) >= 2:
                # Split vertically 50/50 (two columns)
                half_width = self.target_width // 2

                # Select the 2 most relevant faces.
                # Priority: ALWAYS show the active speaker, plus the most confident other person.
                if context.active_speakers:
                    # Get the PRIMARY speaker (most confident among active speakers)
                    speaker_faces = [
                        context.detected_faces[i]
                        for i in context.active_speakers
                        if i < len(context.detected_faces)
                    ]
                    primary_speaker = max(speaker_faces, key=lambda f: f.confidence)

                    # Get OTHER faces (not the primary speaker)
                    other_faces = [f for f in context.detected_faces if f != primary_speaker]

                    if len(speaker_faces) >= 2:
                        # Multiple speakers: show primary + second most confident speaker
                        other_speakers = [f for f in speaker_faces if f != primary_speaker]
                        secondary_person = max(other_speakers, key=lambda f: f.confidence)
                    elif other_faces:
                        # One speaker: show speaker + most confident other person
                        secondary_person = max(other_faces, key=lambda f: f.confidence)
                    else:
                        # Fallback: only one person detected
                        secondary_person = primary_speaker

                    selected_faces = [primary_speaker, secondary_person]
                else:
                    # No speakers: take the 2 most confident faces
                    selected_faces = sorted(
                        context.detected_faces, key=lambda f: f.confidence, reverse=True
                    )[:2]

                # Sort selected faces by horizontal position for consistent left/right placement
                faces = sorted(selected_faces, key=lambda f: f.center_x)
                left_face = faces[0]
                right_face = faces[1]

                # Process each person's half of the frame
                for idx, face in enumerate([left_face, right_face]):
                    # Each person gets half the output width at the full target height,
                    # so each half is cropped at a consistent aspect and is not distorted on resize.
                    half_aspect = self.target_height / half_width  # Aspect ratio (height/width) for each half

                    # Determine crop size based on the face, with padding
                    face_width = max(face.width, frame.shape[1] // 4)  # At least 1/4 of frame width
                    crop_width = int(face_width * 2.5)  # Add padding around the face
                    crop_height = int(crop_width * half_aspect)  # Maintain the correct aspect

                    # Ensure the crop fits in the frame while maintaining the aspect ratio
                    max_crop_width = frame.shape[1] // 2  # Half the source width
                    max_crop_height = frame.shape[0]  # Full source height

                    # If the crop is too wide, scale down proportionally
                    if crop_width > max_crop_width:
                        crop_width = max_crop_width
                        crop_height = int(crop_width * half_aspect)

                    # If the crop is too tall, scale down proportionally
                    if crop_height > max_crop_height:
                        crop_height = max_crop_height
                        crop_width = int(crop_height / half_aspect)

                    # Center the crop on the face
                    x = max(0, face.center_x - crop_width // 2)
                    y = max(0, face.center_y - crop_height // 2)

                    # Clamp to frame boundaries
                    x = min(x, frame.shape[1] - crop_width)
                    y = min(y, frame.shape[0] - crop_height)

                    # Extract and resize the crop
                    cropped = frame[y:y + crop_height, x:x + crop_width]
                    resized = cv2.resize(
                        cropped,
                        (half_width, self.target_height),
                        interpolation=cv2.INTER_LINEAR,
                    )

                    # Place in the output at the appropriate horizontal position
                    x_offset = idx * half_width
                    output[:, x_offset:x_offset + half_width] = resized
            else:
                # Fall back to single framing
                if framing_plan.crop_regions:
                    crop_idx = max(0, min(frame_idx, len(framing_plan.crop_regions) - 1))
                    crop = framing_plan.crop_regions[crop_idx]
                    cropped = frame[crop.y:crop.y + crop.height, crop.x:crop.x + crop.width]
                else:
                    # Fallback to a center crop if no crop regions are available
                    h, w = frame.shape[:2]
                    crop_h = int(w * self.target_aspect)
                    crop_w = w
                    if crop_h > h:
                        crop_h = h
                        crop_w = int(h / self.target_aspect)
                    y = (h - crop_h) // 2
                    x = (w - crop_w) // 2
                    cropped = frame[y:y + crop_h, x:x + crop_w]

                output = cv2.resize(
                    cropped,
                    (self.target_width, self.target_height),
                    interpolation=cv2.INTER_LINEAR,
                )

            return output

        # MoviePy 2.x compatible way to create a VideoClip
        new_clip = VideoClip(duration=video_clip.duration)
        new_clip.size = (self.target_width, self.target_height)
        new_clip.frame_function = make_frame

        return new_clip

    def _apply_grid_layout(
        self,
        video_clip: VideoFileClip,
        framing_plan: FramingPlan,
    ) -> VideoClip:
        """
        Apply grid layout for 3+ people.

        Args:
            video_clip: Source video clip
            framing_plan: Framing plan

        Returns:
            Grid layout video clip
        """
        def make_frame(t):
            frame = video_clip.get_frame(t)

            # Calculate the exact frame index for this timestamp
            exact_frame_idx = (t * framing_plan.fps) / self.frame_skip
            frame_idx = int(exact_frame_idx)

            # Ensure we have valid contexts
            if not framing_plan.frame_contexts:
                # Fallback to a simple center crop
                h, w = frame.shape[:2]
                crop_h = int(w * self.target_aspect)
                crop_w = w
                if crop_h > h:
                    crop_h = h
                    crop_w = int(h / self.target_aspect)
                y = (h - crop_h) // 2
                x = (w - crop_w) // 2
                cropped = frame[y:y + crop_h, x:x + crop_w]
                return cv2.resize(
                    cropped,
                    (self.target_width, self.target_height),
                    interpolation=cv2.INTER_LINEAR,
                )

            # Clamp index to the valid range
            frame_idx = max(0, min(frame_idx, len(framing_plan.frame_contexts) - 1))
            context = framing_plan.frame_contexts[frame_idx]

            output = np.zeros((self.target_height, self.target_width, 3), dtype=np.uint8)
            num_faces = len(context.detected_faces)

            if num_faces >= 3:
                # Create a 2x2 grid
                cell_width = self.target_width // 2
                cell_height = self.target_height // 2

                for idx, face in enumerate(context.detected_faces[:4]):
                    # Calculate grid position
                    row = idx // 2
                    col = idx % 2

                    # Each grid cell keeps the same height/width aspect as the full 9:16 output
                    cell_aspect = cell_height / cell_width

                    # Crop around the face with the correct aspect ratio
                    crop_width = frame.shape[1] // 2
                    crop_height = int(crop_width * cell_aspect)

                    # Ensure the crop fits in the frame while maintaining the aspect
                    max_crop_width = frame.shape[1] // 2
                    max_crop_height = frame.shape[0] // 2

                    if crop_width > max_crop_width:
                        crop_width = max_crop_width
                        crop_height = int(crop_width * cell_aspect)
                    if crop_height > max_crop_height:
                        crop_height = max_crop_height
                        crop_width = int(crop_height / cell_aspect)

                    # Center the crop on the face
                    x = max(0, face.center_x - crop_width // 2)
                    y = max(0, face.center_y - crop_height // 2)
                    # Clamp to frame boundaries
                    x = min(x, frame.shape[1] - crop_width)
                    y = min(y, frame.shape[0] - crop_height)

                    cropped = frame[y:y + crop_height, x:x + crop_width]
                    resized = cv2.resize(
                        cropped,
                        (cell_width, cell_height),
                        interpolation=cv2.INTER_LINEAR,
                    )

                    # Place in the grid
                    y_offset = row * cell_height
                    x_offset = col * cell_width
                    output[y_offset:y_offset + cell_height, x_offset:x_offset + cell_width] = resized
            else:
                # Fall back to single framing
                if framing_plan.crop_regions:
                    crop_idx = max(0, min(frame_idx, len(framing_plan.crop_regions) - 1))
                    crop = framing_plan.crop_regions[crop_idx]
                    cropped = frame[crop.y:crop.y + crop.height, crop.x:crop.x + crop.width]
                else:
                    # Fallback to a center crop if no crop regions are available
                    h, w = frame.shape[:2]
                    crop_h = int(w * self.target_aspect)
                    crop_w = w
                    if crop_h > h:
                        crop_h = h
                        crop_w = int(h / self.target_aspect)
                    y = (h - crop_h) // 2
                    x = (w - crop_w) // 2
                    cropped = frame[y:y + crop_h, x:x + crop_w]

                output = cv2.resize(
                    cropped,
                    (self.target_width, self.target_height),
                    interpolation=cv2.INTER_LINEAR,
                )

            return output

        # MoviePy 2.x compatible way to create a VideoClip
        new_clip = VideoClip(duration=video_clip.duration)
        new_clip.size = (self.target_width, self.target_height)
        new_clip.frame_function = make_frame

        return new_clip


def extract_audio_samples(video_path: str, start_time: float, end_time: float) -> Optional[np.ndarray]:
    """
    Extract audio samples from video for speech detection.

    Args:
        video_path: Path to video file
        start_time: Start time in seconds
        end_time: End time in seconds

    Returns:
        Audio samples array or None if no audio
    """
    try:
        from moviepy.audio.io.AudioFileClip import AudioFileClip

        with AudioFileClip(video_path) as audio:
            segment = audio.subclipped(start_time, end_time)
            # Fall back to 44.1 kHz if the segment does not report a sample rate
            fps = getattr(segment, 'fps', None) or 44100
            samples = segment.to_soundarray(fps=fps)
            return samples
    except Exception as exc:
        logger.warning(f"Failed to extract audio: {exc}")
        return None
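

# A minimal usage sketch, not part of the module's tested API: the input path,
# time range, and output filename below are placeholders, and `subclipped` /
# `write_videofile` follow the MoviePy 2.x API already used above.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)

    source = "input.mp4"      # hypothetical source video
    start, end = 10.0, 25.0   # hypothetical highlight window in seconds

    framer = SmartFramer(target_width=1080, target_height=1920, frame_skip=2)

    # Optional audio extraction improves speaker detection in the framing plan
    audio = extract_audio_samples(source, start, end)
    plan = framer.create_framing_plan(source, start, end, audio_samples=audio)

    with VideoFileClip(source) as clip:
        segment = clip.subclipped(start, end)
        framed = framer.apply_framing(segment, plan, use_split_screen=True)
        framed.write_videofile("output_9x16.mp4", fps=segment.fps)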