""" Smart framing module for intelligent video cropping and composition. This module provides functionality to create 9:16 vertical videos with intelligent framing that follows the action and speakers. """ from __future__ import annotations import logging from dataclasses import dataclass from typing import List, Optional, Tuple import cv2 import numpy as np from moviepy.video.VideoClip import VideoClip from moviepy.video.io.VideoFileClip import VideoFileClip from scipy import signal from video_render.context_detection import ContextAnalyzer, FrameContext, FaceDetection, GroupBoundingBox logger = logging.getLogger(__name__) @dataclass class CropRegion: """Defines a crop region for a frame.""" x: int y: int width: int height: int @dataclass class FramingPlan: """Complete framing plan for a video segment.""" frame_contexts: List[FrameContext] crop_regions: List[CropRegion] layout_mode: str fps: float class SmartFramer: """Creates intelligent 9:16 framing for horizontal videos with multi-person support.""" def __init__( self, target_width: int = 1080, target_height: int = 1920, frame_skip: int = 1, smoothing_window: int = 30, max_velocity: int = 25, person_switch_cooldown: int = 30, response_time: float = 0.6, group_padding: float = 0.15, max_zoom_out: float = 2.0, dead_zone: int = 100, min_face_confidence: float = 0.3 ): self.target_width = target_width self.target_height = target_height self.target_aspect = target_height / target_width self.frame_skip = frame_skip self.smoothing_window = smoothing_window self.max_velocity = max_velocity self.person_switch_cooldown = person_switch_cooldown self.response_time = response_time self.group_padding = group_padding self.max_zoom_out = max_zoom_out self.dead_zone = dead_zone self.min_face_confidence = min_face_confidence self.position_history_size = 45 self.hysteresis_frames = 8 logger.info(f"Smart framer initialized (target: {target_width}x{target_height}, response_time={response_time}s, max_velocity={max_velocity}, dead_zone={dead_zone})") def create_framing_plan( self, video_path: str, start_time: float, end_time: float, audio_samples: Optional[np.ndarray] = None ) -> FramingPlan: """ Analyze video and create a complete framing plan. 

        Args:
            video_path: Path to video file
            start_time: Start time in seconds
            end_time: End time in seconds
            audio_samples: Optional audio samples for speech detection

        Returns:
            FramingPlan with all frame contexts and crop regions
        """
        analyzer = ContextAnalyzer(
            person_switch_cooldown=self.person_switch_cooldown,
            min_face_confidence=self.min_face_confidence,
        )

        speaking_periods = None
        if audio_samples is not None:
            speaking_periods = analyzer.audio_detector.detect_speaking_periods(audio_samples)

        import os
        os.environ['OPENCV_FFMPEG_CAPTURE_OPTIONS'] = 'loglevel;quiet'

        cap = cv2.VideoCapture(video_path)
        fps = cap.get(cv2.CAP_PROP_FPS)
        start_frame = int(start_time * fps)
        end_frame = int(end_time * fps)
        cap.set(cv2.CAP_PROP_POS_FRAMES, start_frame)

        frame_contexts = []
        frame_number = start_frame
        processed_count = 0

        logger.info(f"Analyzing frames {start_frame} to {end_frame} (fps={fps}, skip={self.frame_skip})")

        while frame_number < end_frame:
            ret, frame = cap.read()
            if not ret:
                break

            if processed_count % self.frame_skip == 0:
                timestamp = frame_number / fps
                context = analyzer.analyze_frame(frame, timestamp, frame_number, speaking_periods)
                frame_contexts.append(context)

            frame_number += 1
            processed_count += 1

        source_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        source_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        cap.release()
        analyzer.close()

        layout_modes = [ctx.layout_mode for ctx in frame_contexts]
        if layout_modes:
            overall_layout = max(set(layout_modes), key=layout_modes.count)
        else:
            overall_layout = "single"

        crop_regions = self._calculate_crop_regions(
            frame_contexts, source_width, source_height, fps=fps
        )

        framing_plan = FramingPlan(
            frame_contexts=frame_contexts,
            crop_regions=crop_regions,
            layout_mode=overall_layout,
            fps=fps,
        )

        import gc
        gc.collect()

        return framing_plan

    def _segment_by_face_detection(
        self,
        has_face_flags: List[bool],
        min_segment_frames: int = 10,
    ) -> List[Tuple[int, int, bool]]:
        """
        Segment the video into continuous regions with/without face.

        Returns list of (start_idx, end_idx, has_face) tuples.
        Small segments are merged with neighbors.
        """
        if not has_face_flags:
            return []

        segments = []
        start_idx = 0
        current_state = has_face_flags[0]

        for i in range(1, len(has_face_flags)):
            if has_face_flags[i] != current_state:
                segments.append((start_idx, i - 1, current_state))
                start_idx = i
                current_state = has_face_flags[i]

        segments.append((start_idx, len(has_face_flags) - 1, current_state))

        merged = []
        for seg in segments:
            start, end, has_face = seg
            length = end - start + 1
            if length < min_segment_frames and merged:
                prev_start, prev_end, prev_has_face = merged[-1]
                merged[-1] = (prev_start, end, prev_has_face)
            else:
                merged.append(seg)

        return merged

    def _interpolate_smooth(
        self,
        positions: List[float],
        segments: List[Tuple[int, int, bool]],
        transition_frames: int = 15,
    ) -> List[float]:
        """
        Create smooth transitions between segments using cosine interpolation.

        Within each segment, position is constant. Between segments, smooth transition.
""" if not positions or not segments: return positions result = list(positions) segment_values = [] for start, end, has_face in segments: seg_positions = positions[start:end + 1] if seg_positions: segment_values.append(float(np.median(seg_positions))) else: segment_values.append(positions[start] if start < len(positions) else 0.0) for i, (start, end, has_face) in enumerate(segments): value = segment_values[i] for j in range(start, end + 1): result[j] = value for i in range(len(segments) - 1): seg1_start, seg1_end, _ = segments[i] seg2_start, seg2_end, _ = segments[i + 1] val1 = segment_values[i] val2 = segment_values[i + 1] if abs(val2 - val1) < self.dead_zone * 0.5: continue trans_start = max(seg1_end - transition_frames // 2, seg1_start) trans_end = min(seg2_start + transition_frames // 2, seg2_end) trans_length = trans_end - trans_start + 1 if trans_length < 2: continue for j in range(trans_length): t = j / (trans_length - 1) smooth_t = 0.5 - 0.5 * np.cos(t * np.pi) idx = trans_start + j if 0 <= idx < len(result): result[idx] = val1 + (val2 - val1) * smooth_t return result def _apply_savgol_filter( self, positions: List[float], window_length: int = 61, polyorder: int = 2 ) -> List[float]: """ Apply Savitzky-Golay filter for ultra-smooth position tracking. This is a signal processing filter that preserves trends while removing noise. """ if len(positions) < window_length: window_length = len(positions) if len(positions) % 2 == 1 else len(positions) - 1 if window_length < 3: return positions if window_length % 2 == 0: window_length -= 1 if window_length <= polyorder: polyorder = max(1, window_length - 1) try: smoothed = signal.savgol_filter(positions, window_length, polyorder, mode='nearest') return smoothed.tolist() except Exception as e: logger.warning(f"Savgol filter failed: {e}, returning original positions") return positions def _apply_median_filter(self, positions: List[float], window_size: int = 5) -> List[float]: """ Apply median filter to remove detection noise. Median filter is ideal for removing outliers while preserving edges (real movements). Window size of 5 means each position is replaced by the median of itself and 2 neighbors on each side. Args: positions: Raw positions from detection window_size: Window size (must be odd), default 5 Returns: Filtered positions with noise removed """ if len(positions) < window_size: return positions from scipy.signal import medfilt if window_size % 2 == 0: window_size += 1 filtered = medfilt(positions, kernel_size=window_size) return filtered.tolist() def _is_detection_stable(self, has_face_flags: List[bool], window_size: int = 30) -> bool: """ Check if face detection is stable enough to use smart framing. If detection is too unstable (frequent changes), it's better to use static center crop. Args: has_face_flags: Boolean flags indicating if face was detected per frame window_size: Number of frames to analyze for stability Returns: True if detection is stable, False if too unstable """ if len(has_face_flags) < window_size: window_size = len(has_face_flags) if window_size == 0: return False changes = 0 for i in range(1, len(has_face_flags)): if has_face_flags[i] != has_face_flags[i-1]: changes += 1 change_rate = changes / len(has_face_flags) return change_rate < 0.3 def _stabilize_no_face_sequences( self, positions: List[float], has_face_flags: List[bool], source_center: float = None ) -> List[float]: """ Stabilize positions during sequences without face detection. Uses median of all valid positions for maximum stability. 
""" if len(positions) != len(has_face_flags): return positions fallback = source_center if source_center else (positions[0] if positions else 0.0) face_ratio = sum(has_face_flags) / len(has_face_flags) if has_face_flags else 0 if face_ratio < 0.15: return [fallback] * len(positions) changes = sum(1 for i in range(1, len(has_face_flags)) if has_face_flags[i] != has_face_flags[i-1]) instability_ratio = changes / len(has_face_flags) if has_face_flags else 0 if instability_ratio > 0.25: valid_positions = [positions[i] for i, has_face in enumerate(has_face_flags) if has_face] if valid_positions: return [float(np.median(valid_positions))] * len(positions) return [fallback] * len(positions) valid_positions = [positions[i] for i, has_face in enumerate(has_face_flags) if has_face] if not valid_positions: return [fallback] * len(positions) global_median = float(np.median(valid_positions)) stabilized = list(positions) i = 0 while i < len(has_face_flags): if not has_face_flags[i]: start_idx = i recent_valid = [] for j in range(max(0, start_idx - self.position_history_size), start_idx): if has_face_flags[j]: recent_valid.append(positions[j]) lock_value = float(np.median(recent_valid)) if len(recent_valid) >= 5 else global_median while i < len(has_face_flags) and not has_face_flags[i]: stabilized[i] = lock_value i += 1 else: i += 1 return stabilized def _calculate_crop_regions( self, contexts: List[FrameContext], source_width: int, source_height: int, fps: Optional[float] = None ) -> List[CropRegion]: """ Calculate smooth crop regions for each frame with multi-person support. Args: contexts: List of frame contexts source_width: Source video width source_height: Source video height Returns: List of crop regions """ if not contexts: return [] source_aspect = source_width / source_height # Calculate base crop dimensions for 9:16 if source_aspect > self.target_aspect: base_crop_height = source_height base_crop_width = int(base_crop_height / self.target_aspect) if base_crop_width > source_width: base_crop_width = source_width base_crop_height = int(base_crop_width * self.target_aspect) else: base_crop_width = source_width base_crop_height = int(base_crop_width * self.target_aspect) if base_crop_height > source_height: base_crop_height = source_height base_crop_width = int(base_crop_height / self.target_aspect) center_xs = [] center_ys = [] zoom_factors = [] has_face_flags = [] static_center_x = float(source_width // 2) static_center_y = float(source_height // 2) last_valid_x = static_center_x last_valid_y = static_center_y last_valid_zoom = 1.0 for ctx in contexts: selected_face = None if ctx.selected_people: idx = ctx.selected_people[0] if 0 <= idx < len(ctx.detected_faces): selected_face = ctx.detected_faces[idx] if selected_face: center_x = float(selected_face.center_x) center_y = float(selected_face.center_y) center_xs.append(center_x) center_ys.append(center_y) required_width = selected_face.width * (1 + self.group_padding * 2) required_height = selected_face.height * (1 + self.group_padding * 3) zoom_w = required_width / base_crop_width zoom_h = required_height / base_crop_height zoom = max(zoom_w, zoom_h, 1.0) zoom = min(zoom, self.max_zoom_out) zoom_factors.append(zoom) last_valid_x = center_x last_valid_y = center_y last_valid_zoom = zoom has_face_flags.append(True) elif ctx.group_bounds and ctx.group_bounds.face_count > 0: group = ctx.group_bounds center_x = float(group.center_x) center_y = float(group.center_y) center_xs.append(center_x) center_ys.append(center_y) required_width = group.width * (1 
                required_height = group.height * (1 + self.group_padding * 3)
                zoom_w = required_width / base_crop_width
                zoom_h = required_height / base_crop_height
                zoom = max(zoom_w, zoom_h, 1.0)
                zoom = min(zoom, self.max_zoom_out)
                zoom_factors.append(zoom)

                last_valid_x = center_x
                last_valid_y = center_y
                last_valid_zoom = zoom
                has_face_flags.append(True)
            elif ctx.primary_focus and len(ctx.detected_faces) > 0:
                center_x = float(ctx.primary_focus[0])
                center_y = float(ctx.primary_focus[1])
                center_xs.append(center_x)
                center_ys.append(center_y)
                zoom_factors.append(1.0)

                last_valid_x = center_x
                last_valid_y = center_y
                last_valid_zoom = 1.0
                has_face_flags.append(True)
            else:
                center_xs.append(last_valid_x)
                center_ys.append(last_valid_y)
                zoom_factors.append(last_valid_zoom)
                has_face_flags.append(False)

        center_x_video = float(source_width // 2)
        center_y_video = float(source_height // 2)

        if not self._is_detection_stable(has_face_flags):
            # Detection is too erratic: fall back to a static center crop.
            final_xs = [center_x_video] * len(center_xs)
            final_ys = [center_y_video] * len(center_ys)
            final_zooms = [1.0] * len(zoom_factors)
        else:
            center_xs = self._stabilize_no_face_sequences(
                center_xs, has_face_flags, source_center=center_x_video
            )
            center_ys = self._stabilize_no_face_sequences(
                center_ys, has_face_flags, source_center=center_y_video
            )
            zoom_factors = self._stabilize_no_face_sequences(
                zoom_factors, has_face_flags, source_center=1.0
            )

            face_count = sum(has_face_flags)
            if face_count < len(has_face_flags) * 0.3:
                # Too few frames with faces: a static center crop is safer.
                final_xs = [center_x_video] * len(center_xs)
                final_ys = [center_y_video] * len(center_ys)
                final_zooms = [1.0] * len(zoom_factors)
            else:
                valid_xs = [center_xs[i] for i, has_face in enumerate(has_face_flags) if has_face]
                valid_ys = [center_ys[i] for i, has_face in enumerate(has_face_flags) if has_face]
                valid_zooms = [zoom_factors[i] for i, has_face in enumerate(has_face_flags) if has_face]

                target_x = float(np.median(valid_xs)) if valid_xs else center_x_video
                target_y = float(np.median(valid_ys)) if valid_ys else center_y_video
                target_zoom = float(np.median(valid_zooms)) if valid_zooms else 1.0

                for i in range(len(center_xs)):
                    if not has_face_flags[i]:
                        center_xs[i] = target_x
                        center_ys[i] = target_y
                        zoom_factors[i] = target_zoom

                final_xs = self._apply_savgol_filter(center_xs, window_length=61, polyorder=2)
                final_ys = self._apply_savgol_filter(center_ys, window_length=61, polyorder=2)
                final_zooms = self._apply_savgol_filter(zoom_factors, window_length=61, polyorder=2)

        if fps and self.response_time > 0:
            # First-order low-pass: alpha is chosen so a step change is largely
            # absorbed after roughly response_time seconds of analyzed frames.
            dt = self.frame_skip / fps
            alpha = 1 - np.exp(-dt / self.response_time)
            final_xs = self._apply_exponential_smoothing(final_xs, alpha)
            final_ys = self._apply_exponential_smoothing(final_ys, alpha)
            final_zooms = self._apply_exponential_smoothing(final_zooms, alpha)

        # Generate crop regions
        crop_regions = []
        for cx, cy, zoom in zip(final_xs, final_ys, final_zooms):
            # Calculate actual crop size with zoom
            crop_width = int(base_crop_width * zoom)
            crop_height = int(base_crop_height * zoom)

            # Clamp to source dimensions
            crop_width = min(crop_width, source_width)
            crop_height = min(crop_height, source_height)

            # Maintain aspect ratio after clamping
            if crop_width / crop_height > base_crop_width / base_crop_height:
                crop_width = int(crop_height * base_crop_width / base_crop_height)
            else:
                crop_height = int(crop_width * base_crop_height / base_crop_width)

            # Calculate top-left corner
            x = int(cx - crop_width // 2)
            y = int(cy - crop_height // 2)

            # Keep within bounds
            x = max(0, min(x, source_width - crop_width))
            y = max(0, min(y, source_height - crop_height))

            crop_regions.append(CropRegion(
                x=x,
                y=y,
                width=crop_width,
                height=crop_height,
            ))

        # Clear temporary lists
        center_xs.clear()
        center_ys.clear()
        zoom_factors.clear()

        return crop_regions

    def _apply_exponential_smoothing(self, positions: List[float], alpha: float) -> List[float]:
        """
        Smooth positions with exponential moving average.
        """
        if not positions:
            return positions

        alpha = max(0.0, min(alpha, 1.0))
        smoothed = [positions[0]]
        for i in range(1, len(positions)):
            prev = smoothed[-1]
            smoothed.append(prev + alpha * (positions[i] - prev))
        return smoothed

    def _apply_dead_zone(self, positions: List[float], threshold: float) -> List[float]:
        """
        Apply dead zone to eliminate micro-movements.

        If change is smaller than threshold, keep previous position.

        Args:
            positions: List of positions
            threshold: Minimum change needed to move (pixels)

        Returns:
            Positions with dead zone applied
        """
        if len(positions) <= 1:
            return positions

        filtered = [positions[0]]
        for i in range(1, len(positions)):
            delta = abs(positions[i] - filtered[i - 1])
            if delta < threshold:
                filtered.append(filtered[i - 1])
            else:
                filtered.append(positions[i])
        return filtered

    def _limit_velocity(self, positions: List[float], max_velocity: float) -> List[float]:
        """
        Limit the velocity of position changes.

        Args:
            positions: List of positions
            max_velocity: Maximum allowed change per frame

        Returns:
            Smoothed positions
        """
        if len(positions) <= 1:
            return positions

        limited = [positions[0]]
        for i in range(1, len(positions)):
            delta = positions[i] - limited[i - 1]
            if abs(delta) > max_velocity:
                delta = max_velocity if delta > 0 else -max_velocity
            limited.append(limited[i - 1] + delta)
        return limited

    def apply_framing(
        self,
        video_clip: VideoFileClip,
        framing_plan: FramingPlan,
    ) -> VideoClip:
        """
        Apply smart framing to a video clip.

        Automatically selects a layout based on the number of people detected:
        - 1 person: Single framing (follow person)
        - 2 people: Vertical split screen (side by side)
        - 3 people: 1 on top, 2 on bottom
        - 4 people: 2x2 grid

        Args:
            video_clip: Source video clip
            framing_plan: Framing plan to apply

        Returns:
            Reframed video clip
        """
        # Determine predominant number of faces across all frames
        if not framing_plan.frame_contexts:
            return self._apply_single_framing(video_clip, framing_plan)

        face_counts = []
        for ctx in framing_plan.frame_contexts:
            if ctx.active_speakers:
                face_counts.append(len(ctx.active_speakers))
            elif ctx.group_bounds:
                face_counts.append(ctx.group_bounds.face_count)
            else:
                face_counts.append(len(ctx.detected_faces))

        # Use mode (most common) face count, minimum 1
        if face_counts:
            from collections import Counter

            count_freq = Counter(face_counts)
            # Get the most common count, but ignore 0
            non_zero_counts = {k: v for k, v in count_freq.items() if k > 0}
            if non_zero_counts:
                predominant_faces = max(non_zero_counts, key=non_zero_counts.get)
            else:
                predominant_faces = 1
        else:
            predominant_faces = 1

        logger.info(f"Layout selection: predominant_faces={predominant_faces}")

        if predominant_faces == 1:
            return self._apply_single_framing(video_clip, framing_plan)
        elif predominant_faces == 2:
            return self._apply_split_screen(video_clip, framing_plan)
        elif predominant_faces == 3:
            return self._apply_three_person_layout(video_clip, framing_plan)
        else:
            # 4 or more
            return self._apply_grid_layout(video_clip, framing_plan)

    def _apply_single_framing(
        self,
        video_clip: VideoFileClip,
        framing_plan: FramingPlan,
    ) -> VideoClip:
        """
        Apply single-focus framing (following one person or action).

        Args:
            video_clip: Source video clip
            framing_plan: Framing plan

        Returns:
            Reframed video clip
        """
        def make_frame(t):
            frame = video_clip.get_frame(t)

            if not framing_plan.crop_regions:
                h, w = frame.shape[:2]
                crop_h = int(w * self.target_aspect)
                crop_w = w
                if crop_h > h:
                    crop_h = h
                    crop_w = int(h / self.target_aspect)
                y = (h - crop_h) // 2
                x = (w - crop_w) // 2
                cropped = frame[y:y + crop_h, x:x + crop_w]
            else:
                exact_frame_idx = (t * framing_plan.fps) / self.frame_skip
                last_idx = len(framing_plan.crop_regions) - 1

                if last_idx <= 0:
                    crop = framing_plan.crop_regions[0]
                    x, y, width, height = crop.x, crop.y, crop.width, crop.height
                else:
                    exact_frame_idx = max(0.0, min(exact_frame_idx, float(last_idx)))
                    low_idx = int(np.floor(exact_frame_idx))
                    high_idx = min(low_idx + 1, last_idx)
                    alpha = exact_frame_idx - low_idx

                    crop_a = framing_plan.crop_regions[low_idx]
                    crop_b = framing_plan.crop_regions[high_idx]

                    x = int(round(crop_a.x + (crop_b.x - crop_a.x) * alpha))
                    y = int(round(crop_a.y + (crop_b.y - crop_a.y) * alpha))
                    width = int(round(crop_a.width + (crop_b.width - crop_a.width) * alpha))
                    height = int(round(crop_a.height + (crop_b.height - crop_a.height) * alpha))

                h, w = frame.shape[:2]
                x = max(0, min(x, w - width))
                y = max(0, min(y, h - height))
                width = min(width, w - x)
                height = min(height, h - y)
                cropped = frame[y:y + height, x:x + width]

            resized = cv2.resize(
                cropped,
                (self.target_width, self.target_height),
                interpolation=cv2.INTER_LINEAR,
            )
            return resized

        new_clip = VideoClip(duration=video_clip.duration)
        new_clip.size = (self.target_width, self.target_height)
        new_clip.frame_function = make_frame
        return new_clip

    def _apply_split_screen(
        self,
        video_clip: VideoFileClip,
        framing_plan: FramingPlan,
    ) -> VideoClip:
        """
        Apply split screen for two people (side by side vertical split).

        Args:
            video_clip: Source video clip
            framing_plan: Framing plan

        Returns:
            Split screen video clip
        """
        def make_frame(t):
            frame = video_clip.get_frame(t)
            exact_frame_idx = (t * framing_plan.fps) / self.frame_skip
            frame_idx = int(exact_frame_idx)

            if not framing_plan.frame_contexts:
                h, w = frame.shape[:2]
                crop_h = int(w * self.target_aspect)
                crop_w = w
                if crop_h > h:
                    crop_h = h
                    crop_w = int(h / self.target_aspect)
                y = (h - crop_h) // 2
                x = (w - crop_w) // 2
                cropped = frame[y:y + crop_h, x:x + crop_w]
                return cv2.resize(cropped, (self.target_width, self.target_height), interpolation=cv2.INTER_LINEAR)

            frame_idx = max(0, min(frame_idx, len(framing_plan.frame_contexts) - 1))
            context = framing_plan.frame_contexts[frame_idx]

            output = np.zeros((self.target_height, self.target_width, 3), dtype=np.uint8)

            if context.active_speakers:
                faces = [
                    context.detected_faces[idx]
                    for idx in context.active_speakers
                    if 0 <= idx < len(context.detected_faces)
                ][:2]
            else:
                # Use top faces by confidence for stability
                faces = sorted(context.detected_faces, key=lambda f: f.confidence, reverse=True)[:2]

            if len(faces) >= 2:
                # Sort by X position (left to right)
                faces_sorted = sorted(faces, key=lambda f: f.center_x)
                left_face = faces_sorted[0]
                right_face = faces_sorted[1]

                half_width = self.target_width // 2
                half_aspect = self.target_height / half_width

                for idx, face in enumerate([left_face, right_face]):
                    # Calculate crop region around face
                    crop_width = int(face.width * 3)  # 3x face width for good framing
                    crop_height = int(crop_width * half_aspect)

                    # Clamp to reasonable limits
                    crop_width = max(crop_width, frame.shape[1] // 4)
                    crop_width = min(crop_width, frame.shape[1])
                    crop_height = min(crop_height, frame.shape[0])

                    # Ensure proper aspect ratio
                    if crop_height / crop_width > half_aspect:
                        crop_height = int(crop_width * half_aspect)
                    else:
                        crop_width = int(crop_height / half_aspect)

                    # Center crop on face
                    x = max(0, min(face.center_x - crop_width // 2, frame.shape[1] - crop_width))
                    y = max(0, min(face.center_y - crop_height // 2, frame.shape[0] - crop_height))

                    # Extract and resize
                    cropped = frame[y:y + crop_height, x:x + crop_width]
                    resized = cv2.resize(cropped, (half_width, self.target_height), interpolation=cv2.INTER_LINEAR)

                    x_offset = idx * half_width
                    output[:, x_offset:x_offset + half_width] = resized
            else:
                # Fallback to single framing
                if framing_plan.crop_regions:
                    crop_idx = max(0, min(frame_idx, len(framing_plan.crop_regions) - 1))
                    crop = framing_plan.crop_regions[crop_idx]
                    cropped = frame[crop.y:crop.y + crop.height, crop.x:crop.x + crop.width]
                else:
                    h, w = frame.shape[:2]
                    crop_h = int(w * self.target_aspect)
                    crop_w = w
                    if crop_h > h:
                        crop_h = h
                        crop_w = int(h / self.target_aspect)
                    y = (h - crop_h) // 2
                    x = (w - crop_w) // 2
                    cropped = frame[y:y + crop_h, x:x + crop_w]
                output = cv2.resize(cropped, (self.target_width, self.target_height), interpolation=cv2.INTER_LINEAR)

            return output

        new_clip = VideoClip(duration=video_clip.duration)
        new_clip.size = (self.target_width, self.target_height)
        new_clip.frame_function = make_frame
        return new_clip

    def _apply_three_person_layout(
        self,
        video_clip: VideoFileClip,
        framing_plan: FramingPlan,
    ) -> VideoClip:
        """
        Apply layout for 3 people: 1 on top (full width), 2 on bottom (side by side).

        Args:
            video_clip: Source video clip
            framing_plan: Framing plan

        Returns:
            Three-person layout video clip
        """
        def make_frame(t):
            frame = video_clip.get_frame(t)
            exact_frame_idx = (t * framing_plan.fps) / self.frame_skip
            frame_idx = int(exact_frame_idx)

            if not framing_plan.frame_contexts:
                h, w = frame.shape[:2]
                crop_h = int(w * self.target_aspect)
                crop_w = w
                if crop_h > h:
                    crop_h = h
                    crop_w = int(h / self.target_aspect)
                y = (h - crop_h) // 2
                x = (w - crop_w) // 2
                cropped = frame[y:y + crop_h, x:x + crop_w]
                return cv2.resize(cropped, (self.target_width, self.target_height), interpolation=cv2.INTER_LINEAR)

            frame_idx = max(0, min(frame_idx, len(framing_plan.frame_contexts) - 1))
            context = framing_plan.frame_contexts[frame_idx]

            output = np.zeros((self.target_height, self.target_width, 3), dtype=np.uint8)

            if context.active_speakers:
                faces = [
                    context.detected_faces[idx]
                    for idx in context.active_speakers
                    if 0 <= idx < len(context.detected_faces)
                ][:3]
            else:
                faces = sorted(context.detected_faces, key=lambda f: f.confidence, reverse=True)[:3]  # Max 3 faces

            num_faces = len(faces)

            if num_faces >= 3:
                # Sort faces by Y position (top to bottom), then X for bottom row
                faces_sorted = sorted(faces, key=lambda f: f.center_y)
                top_face = faces_sorted[0]  # Topmost face
                bottom_faces = sorted(faces_sorted[1:], key=lambda f: f.center_x)  # Sort bottom by X

                # Top section: full width, half height
                top_height = self.target_height // 2
                top_width = self.target_width
                top_aspect = top_height / top_width

                # Crop around top face
                crop_w = int(top_face.width * 3)  # 3x face width for context
                crop_h = int(crop_w * top_aspect)
                crop_w = min(crop_w, frame.shape[1])
                crop_h = min(crop_h, frame.shape[0])

                x = max(0, min(top_face.center_x - crop_w // 2, frame.shape[1] - crop_w))
                y = max(0, min(top_face.center_y - crop_h // 2, frame.shape[0] - crop_h))

                cropped_top = frame[y:y + crop_h, x:x + crop_w]
                resized_top = cv2.resize(cropped_top, (top_width, top_height), interpolation=cv2.INTER_LINEAR)
                output[0:top_height, :] = resized_top

                # Bottom section: two halves
                bottom_height = self.target_height - top_height
                half_width = self.target_width // 2
                bottom_aspect = bottom_height / half_width

                for idx, face in enumerate(bottom_faces[:2]):
                    crop_w = int(face.width * 3)
                    crop_h = int(crop_w * bottom_aspect)
                    crop_w = min(crop_w, frame.shape[1] // 2)
                    crop_h = min(crop_h, frame.shape[0])

                    x = max(0, min(face.center_x - crop_w // 2, frame.shape[1] - crop_w))
                    y = max(0, min(face.center_y - crop_h // 2, frame.shape[0] - crop_h))

                    cropped = frame[y:y + crop_h, x:x + crop_w]
                    resized = cv2.resize(cropped, (half_width, bottom_height), interpolation=cv2.INTER_LINEAR)

                    x_offset = idx * half_width
                    output[top_height:, x_offset:x_offset + half_width] = resized
            else:
                # Fallback to single framing
                if framing_plan.crop_regions:
                    crop_idx = max(0, min(frame_idx, len(framing_plan.crop_regions) - 1))
                    crop = framing_plan.crop_regions[crop_idx]
                    cropped = frame[crop.y:crop.y + crop.height, crop.x:crop.x + crop.width]
                else:
                    h, w = frame.shape[:2]
                    crop_h = int(w * self.target_aspect)
                    crop_w = w
                    if crop_h > h:
                        crop_h = h
                        crop_w = int(h / self.target_aspect)
                    y = (h - crop_h) // 2
                    x = (w - crop_w) // 2
                    cropped = frame[y:y + crop_h, x:x + crop_w]
                output = cv2.resize(cropped, (self.target_width, self.target_height), interpolation=cv2.INTER_LINEAR)

            return output

        new_clip = VideoClip(duration=video_clip.duration)
        new_clip.size = (self.target_width, self.target_height)
        new_clip.frame_function = make_frame
        return new_clip

    def _apply_grid_layout(
        self,
        video_clip: VideoFileClip,
        framing_plan: FramingPlan,
    ) -> VideoClip:
        """
        Apply grid layout for 4 people (2x2 grid).

        Layout: top-left, top-right, bottom-left, bottom-right

        Args:
            video_clip: Source video clip
            framing_plan: Framing plan

        Returns:
            Grid layout video clip
        """
        def make_frame(t):
            frame = video_clip.get_frame(t)
            exact_frame_idx = (t * framing_plan.fps) / self.frame_skip
            frame_idx = int(exact_frame_idx)

            if not framing_plan.frame_contexts:
                h, w = frame.shape[:2]
                crop_h = int(w * self.target_aspect)
                crop_w = w
                if crop_h > h:
                    crop_h = h
                    crop_w = int(h / self.target_aspect)
                y = (h - crop_h) // 2
                x = (w - crop_w) // 2
                cropped = frame[y:y + crop_h, x:x + crop_w]
                return cv2.resize(cropped, (self.target_width, self.target_height), interpolation=cv2.INTER_LINEAR)

            frame_idx = max(0, min(frame_idx, len(framing_plan.frame_contexts) - 1))
            context = framing_plan.frame_contexts[frame_idx]

            output = np.zeros((self.target_height, self.target_width, 3), dtype=np.uint8)

            if context.active_speakers:
                faces = [
                    context.detected_faces[idx]
                    for idx in context.active_speakers
                    if 0 <= idx < len(context.detected_faces)
                ][:4]
            else:
                faces = sorted(context.detected_faces, key=lambda f: f.confidence, reverse=True)[:4]  # Max 4 faces

            num_faces = len(faces)

            if num_faces >= 4:
                cell_width = self.target_width // 2
                cell_height = self.target_height // 2
                cell_aspect = cell_height / cell_width

                # Sort faces into grid positions by their actual position:
                # first by Y (top row vs bottom row), then by X within each row.
                sorted_by_y = sorted(faces, key=lambda f: f.center_y)
                top_row = sorted(sorted_by_y[:2], key=lambda f: f.center_x)
                bottom_row = sorted(sorted_by_y[2:], key=lambda f: f.center_x)
                grid_faces = top_row + bottom_row

                for idx, face in enumerate(grid_faces):
                    row = idx // 2
                    col = idx % 2

                    # Calculate crop region centered on face
                    crop_width = int(face.width * 3)  # 3x face width
                    crop_height = int(crop_width * cell_aspect)

                    # Clamp to reasonable limits
                    crop_width = max(crop_width, frame.shape[1] // 4)
                    crop_width = min(crop_width, frame.shape[1])
                    crop_height = min(crop_height, frame.shape[0])

                    # Ensure proper aspect ratio
                    if crop_height / crop_width > cell_aspect:
                        crop_height = int(crop_width * cell_aspect)
                    else:
                        crop_width = int(crop_height / cell_aspect)

                    # Center crop on face
                    x = max(0, min(face.center_x - crop_width // 2, frame.shape[1] - crop_width))
                    y = max(0, min(face.center_y - crop_height // 2, frame.shape[0] - crop_height))

                    cropped = frame[y:y + crop_height, x:x + crop_width]
                    resized = cv2.resize(cropped, (cell_width, cell_height), interpolation=cv2.INTER_LINEAR)

                    y_offset = row * cell_height
                    x_offset = col * cell_width
                    output[y_offset:y_offset + cell_height, x_offset:x_offset + cell_width] = resized
            else:
                if framing_plan.crop_regions:
                    crop_idx = max(0, min(frame_idx, len(framing_plan.crop_regions) - 1))
                    crop = framing_plan.crop_regions[crop_idx]
                    cropped = frame[crop.y:crop.y + crop.height, crop.x:crop.x + crop.width]
                else:
                    h, w = frame.shape[:2]
                    crop_h = int(w * self.target_aspect)
                    crop_w = w
                    if crop_h > h:
                        crop_h = h
                        crop_w = int(h / self.target_aspect)
                    y = (h - crop_h) // 2
                    x = (w - crop_w) // 2
                    cropped = frame[y:y + crop_h, x:x + crop_w]
                output = cv2.resize(
                    cropped,
                    (self.target_width, self.target_height),
                    interpolation=cv2.INTER_LINEAR,
                )

            return output

        new_clip = VideoClip(duration=video_clip.duration)
        new_clip.size = (self.target_width, self.target_height)
        new_clip.frame_function = make_frame
        return new_clip


def extract_audio_samples(video_path: str, start_time: float, end_time: float) -> Optional[np.ndarray]:
    """
    Extract audio samples from video for speech detection.

    Args:
        video_path: Path to video file
        start_time: Start time in seconds
        end_time: End time in seconds

    Returns:
        Audio samples array or None if no audio
    """
    try:
        from moviepy.audio.io.AudioFileClip import AudioFileClip

        with AudioFileClip(video_path) as audio:
            segment = audio.subclipped(start_time, end_time)
            fps = getattr(segment, 'fps', 44100)
            samples = segment.to_soundarray(fps=fps)
            return samples
    except Exception as exc:
        logger.warning(f"Failed to extract audio: {exc}")
        return None
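

# ---------------------------------------------------------------------------
# Minimal usage sketch (illustrative only, not executed on import). It assumes
# MoviePy 2.x is installed and that "input.mp4" is a local 16:9 recording; the
# file names and the 0-30 s window below are placeholders, not part of this
# module's API.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)

    source_path = "input.mp4"  # placeholder path
    start, end = 0.0, 30.0     # placeholder segment

    framer = SmartFramer(target_width=1080, target_height=1920, frame_skip=2)

    # Optional speech-detection input; returns None if the clip has no audio.
    audio = extract_audio_samples(source_path, start, end)

    # Analyze the segment and build crop regions plus the layout decision.
    plan = framer.create_framing_plan(source_path, start, end, audio_samples=audio)

    # Apply the plan to the matching sub-clip and render a 9:16 output.
    with VideoFileClip(source_path) as clip:
        segment = clip.subclipped(start, end)
        framed = framer.apply_framing(segment, plan)
        framed.write_videofile("framed_9x16.mp4", fps=plan.fps, codec="libx264", audio=False)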