- Adds object tracking - Facial detection - Interactive captions - More precise cuts - Prompt refinement
"""
|
|
Smart framing module for intelligent video cropping and composition.
|
|
|
|
This module provides functionality to create 9:16 vertical videos with
|
|
intelligent framing that follows the action and speakers.
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import logging
|
|
from dataclasses import dataclass
|
|
from typing import List, Optional, Tuple
|
|
|
|
import cv2
|
|
import numpy as np
|
|
from moviepy.video.VideoClip import VideoClip
|
|
from moviepy.video.io.VideoFileClip import VideoFileClip
|
|
from scipy import signal
|
|
|
|
from video_render.context_detection import ContextAnalyzer, FrameContext, FaceDetection
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
@dataclass
class CropRegion:
    """Defines a crop region for a frame."""
    x: int
    y: int
    width: int
    height: int


@dataclass
class FramingPlan:
    """Complete framing plan for a video segment."""
    frame_contexts: List[FrameContext]
    crop_regions: List[CropRegion]
    layout_mode: str
    fps: float

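# Illustrative value (not from the source): for a 1920x1080 source cropped to
# 9:16, a region such as CropRegion(x=656, y=0, width=607, height=1080) describes
# one vertical window that the framer slides horizontally across the frame.
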
class SmartFramer:
    """Creates intelligent 9:16 framing for horizontal videos."""

    def __init__(
        self,
        target_width: int = 1080,
        target_height: int = 1920,
        frame_skip: int = 2,
        smoothing_window: int = 15
    ):
        self.target_width = target_width
        self.target_height = target_height
        self.target_aspect = target_height / target_width

        # Performance parameters
        self.frame_skip = frame_skip  # Process every Nth frame (CPU optimization)

        # Smoothing parameters
        self.smoothing_window = smoothing_window
        self.max_velocity = 30  # pixels per frame (reduced for smoother transitions)

        logger.info(f"Smart framer initialized (target: {target_width}x{target_height}, frame_skip={frame_skip})")

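    # Rough intuition for the defaults set above (illustrative numbers, not from
    # the source): frame_skip=2 halves the number of analyzed frames, and at a
    # 30 fps source a smoothing_window of 15 analyzed samples spans roughly one
    # second of video (30 / 2 = 15), which is what the median filter smooths over.
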
    def create_framing_plan(
        self,
        video_path: str,
        start_time: float,
        end_time: float,
        audio_samples: Optional[np.ndarray] = None
    ) -> FramingPlan:
        """
        Analyze video and create a complete framing plan.

        Args:
            video_path: Path to video file
            start_time: Start time in seconds
            end_time: End time in seconds
            audio_samples: Optional audio samples for speech detection

        Returns:
            FramingPlan with all frame contexts and crop regions
        """
        analyzer = ContextAnalyzer()

        # Detect speaking periods from audio if available
        speaking_periods = None
        if audio_samples is not None:
            speaking_periods = analyzer.audio_detector.detect_speaking_periods(audio_samples)

        # Open video with error suppression for AV1 codec warnings
        import os
        os.environ['OPENCV_FFMPEG_CAPTURE_OPTIONS'] = 'loglevel;quiet'

        cap = cv2.VideoCapture(video_path)
        fps = cap.get(cv2.CAP_PROP_FPS)

        # Calculate frame range
        start_frame = int(start_time * fps)
        end_frame = int(end_time * fps)

        # Set to start frame
        cap.set(cv2.CAP_PROP_POS_FRAMES, start_frame)

        frame_contexts = []
        frame_number = start_frame
        processed_count = 0

        logger.info(f"Analyzing frames {start_frame} to {end_frame} (fps={fps}, skip={self.frame_skip})")

        while frame_number < end_frame:
            ret, frame = cap.read()
            if not ret:
                break

            # Only process every Nth frame for performance (CPU optimization)
            if processed_count % self.frame_skip == 0:
                timestamp = frame_number / fps
                context = analyzer.analyze_frame(frame, timestamp, frame_number, speaking_periods)
                frame_contexts.append(context)

            frame_number += 1
            processed_count += 1

        # Get video dimensions before releasing capture
        source_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        source_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

        cap.release()
        analyzer.close()

        # Determine overall layout mode (most common)
        layout_modes = [ctx.layout_mode for ctx in frame_contexts]
        if layout_modes:
            overall_layout = max(set(layout_modes), key=layout_modes.count)
        else:
            overall_layout = "single"

        # Calculate crop regions based on contexts
        crop_regions = self._calculate_crop_regions(
            frame_contexts,
            source_width,
            source_height
        )

        return FramingPlan(
            frame_contexts=frame_contexts,
            crop_regions=crop_regions,
            layout_mode=overall_layout,
            fps=fps
        )

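    # Sampling sketch for create_framing_plan above (illustrative numbers, not
    # from the source): with fps=30 and frame_skip=2, a 10-second window covers
    # 300 source frames but only ~150 of them are analyzed, so frame_contexts
    # and the crop_regions derived from them each hold ~150 entries indexed in
    # analyzed-frame order.
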
    def _calculate_crop_regions(
        self,
        contexts: List[FrameContext],
        source_width: int,
        source_height: int
    ) -> List[CropRegion]:
        """
        Calculate smooth crop regions for each frame.

        Args:
            contexts: List of frame contexts
            source_width: Source video width
            source_height: Source video height

        Returns:
            List of crop regions
        """
        if not contexts:
            return []

        # Calculate ideal crop dimensions maintaining EXACT 9:16 aspect ratio
        source_aspect = source_width / source_height

        if source_aspect > self.target_aspect:
            # Source is wider - crop horizontally (use full height)
            crop_height = source_height
            crop_width = int(crop_height / self.target_aspect)

            # Ensure crop width fits within source
            if crop_width > source_width:
                crop_width = source_width
                crop_height = int(crop_width * self.target_aspect)
        else:
            # Source is taller - crop vertically (use full width)
            crop_width = source_width
            crop_height = int(crop_width * self.target_aspect)

            # Ensure crop height fits within source
            if crop_height > source_height:
                crop_height = source_height
                crop_width = int(crop_height / self.target_aspect)

        # Calculate center points for each frame
        # Since we now always focus on ONE person directly (not averaging),
        # we can use the focus point directly without complex validation
        center_xs = []
        center_ys = []

        for ctx in contexts:
            if ctx.primary_focus:
                # Primary focus is now always a single person's center, never averaged
                # This means it will never be on the table/empty space
                center_xs.append(ctx.primary_focus[0])
                center_ys.append(ctx.primary_focus[1])
            else:
                # Default to center only if no faces detected at all
                center_xs.append(source_width // 2)
                center_ys.append(source_height // 2)

        # Smooth the center points
        if len(center_xs) > self.smoothing_window:
            kernel_size = min(self.smoothing_window, len(center_xs))
            if kernel_size % 2 == 0:
                kernel_size -= 1

            center_xs = signal.medfilt(center_xs, kernel_size=kernel_size).tolist()
            center_ys = signal.medfilt(center_ys, kernel_size=kernel_size).tolist()

        # Limit velocity (prevent jarring movements)
        center_xs = self._limit_velocity(center_xs, self.max_velocity)
        center_ys = self._limit_velocity(center_ys, self.max_velocity)

        # Convert to crop regions
        crop_regions = []
        for center_x, center_y in zip(center_xs, center_ys):
            # Calculate top-left corner
            x = int(center_x - crop_width // 2)
            y = int(center_y - crop_height // 2)

            # Clamp to valid bounds
            x = max(0, min(x, source_width - crop_width))
            y = max(0, min(y, source_height - crop_height))

            crop_regions.append(CropRegion(
                x=x,
                y=y,
                width=crop_width,
                height=crop_height
            ))

        return crop_regions

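    # Worked example for _calculate_crop_regions above (illustrative, not from
    # the source): for a 1920x1080 source and the default 1080x1920 target,
    # target_aspect = 1920/1080 ~ 1.778, so the aspect branch settles on a
    # 607x1080 crop (1080 / 1.778 ~ 607) whose x offset then follows the
    # smoothed, velocity-limited focus point.
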
    def _limit_velocity(self, positions: List[float], max_velocity: float) -> List[float]:
        """
        Limit the velocity of position changes.

        Args:
            positions: List of positions
            max_velocity: Maximum allowed change per frame

        Returns:
            Smoothed positions
        """
        if len(positions) <= 1:
            return positions

        limited = [positions[0]]

        for i in range(1, len(positions)):
            delta = positions[i] - limited[i - 1]
            if abs(delta) > max_velocity:
                delta = max_velocity if delta > 0 else -max_velocity

            limited.append(limited[i - 1] + delta)

        return limited

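    # Numeric example for _limit_velocity above (illustrative): with
    # max_velocity=30, the positions [0, 100, 100] become [0, 30, 60] - each step
    # is capped at 30 px, so the crop window glides toward a new focus point
    # instead of jumping to it.
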
    def apply_framing(
        self,
        video_clip: VideoFileClip,
        framing_plan: FramingPlan,
        use_split_screen: bool = False
    ) -> VideoClip:
        """
        Apply smart framing to a video clip.

        Args:
            video_clip: Source video clip
            framing_plan: Framing plan to apply
            use_split_screen: Whether to use split screen for multiple people

        Returns:
            Reframed video clip
        """
        # Handle different layout modes
        if framing_plan.layout_mode in ["single", "single_speaker"]:
            # Single person or single speaker - use focused single framing
            return self._apply_single_framing(video_clip, framing_plan)
        elif framing_plan.layout_mode == "dual_split" and use_split_screen:
            # Two people in conversation - use split screen
            return self._apply_split_screen(video_clip, framing_plan)
        elif framing_plan.layout_mode == "grid" and use_split_screen:
            # 3+ people - use grid layout
            return self._apply_grid_layout(video_clip, framing_plan)
        else:
            # Fallback to single framing
            return self._apply_single_framing(video_clip, framing_plan)

    def _apply_single_framing(
        self,
        video_clip: VideoFileClip,
        framing_plan: FramingPlan
    ) -> VideoClip:
        """
        Apply single-focus framing (following one person or action).

        Args:
            video_clip: Source video clip
            framing_plan: Framing plan

        Returns:
            Reframed video clip
        """
        def make_frame(t):
            # Get the original frame
            frame = video_clip.get_frame(t)

            # Ensure we have valid crop regions
            if not framing_plan.crop_regions:
                # Fallback: return center crop
                h, w = frame.shape[:2]
                crop_h = int(w * self.target_aspect)
                crop_w = w
                if crop_h > h:
                    crop_h = h
                    crop_w = int(h / self.target_aspect)
                y = (h - crop_h) // 2
                x = (w - crop_w) // 2
                cropped = frame[y:y + crop_h, x:x + crop_w]
            else:
                # Calculate exact frame index with decimal precision for interpolation
                exact_frame_idx = (t * framing_plan.fps) / self.frame_skip

                # Get the two adjacent analyzed frames
                idx_floor = int(exact_frame_idx)
                idx_ceil = idx_floor + 1

                # Interpolation factor (0.0 to 1.0)
                alpha = exact_frame_idx - idx_floor

                # Clamp indices to valid range
                idx_floor = max(0, min(idx_floor, len(framing_plan.crop_regions) - 1))
                idx_ceil = max(0, min(idx_ceil, len(framing_plan.crop_regions) - 1))

                # Get crop regions
                crop1 = framing_plan.crop_regions[idx_floor]
                crop2 = framing_plan.crop_regions[idx_ceil]

                # Linear interpolation between crop regions
                x = int(crop1.x * (1 - alpha) + crop2.x * alpha)
                y = int(crop1.y * (1 - alpha) + crop2.y * alpha)
                width = int(crop1.width * (1 - alpha) + crop2.width * alpha)
                height = int(crop1.height * (1 - alpha) + crop2.height * alpha)

                # Ensure crop stays within frame bounds
                h, w = frame.shape[:2]
                x = max(0, min(x, w - width))
                y = max(0, min(y, h - height))
                width = min(width, w - x)
                height = min(height, h - y)

                # Crop the frame
                cropped = frame[y:y + height, x:x + width]

            # Resize to target dimensions
            resized = cv2.resize(
                cropped,
                (self.target_width, self.target_height),
                interpolation=cv2.INTER_LINEAR
            )

            return resized

        # MoviePy 2.x compatible way to create VideoClip
        new_clip = VideoClip(duration=video_clip.duration)
        new_clip.size = (self.target_width, self.target_height)
        new_clip.frame_function = make_frame
        return new_clip

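    # Interpolation example for _apply_single_framing above (illustrative
    # numbers): with fps=30 and frame_skip=2, t=1.05 s gives
    # exact_frame_idx = (1.05 * 30) / 2 = 15.75, so the output frame uses a
    # 25%/75% blend of analyzed crop regions 15 and 16 (alpha = 0.75).
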
    def _apply_split_screen(
        self,
        video_clip: VideoFileClip,
        framing_plan: FramingPlan
    ) -> VideoClip:
        """
        Apply split screen for two people.

        Args:
            video_clip: Source video clip
            framing_plan: Framing plan

        Returns:
            Split screen video clip
        """
        def make_frame(t):
            frame = video_clip.get_frame(t)
            # Calculate exact frame index with decimal precision for smooth interpolation
            exact_frame_idx = (t * framing_plan.fps) / self.frame_skip
            frame_idx = int(exact_frame_idx)

            # Ensure we have valid contexts
            if not framing_plan.frame_contexts:
                # Fallback to simple center crop
                h, w = frame.shape[:2]
                crop_h = int(w * self.target_aspect)
                crop_w = w
                if crop_h > h:
                    crop_h = h
                    crop_w = int(h / self.target_aspect)
                y = (h - crop_h) // 2
                x = (w - crop_w) // 2
                cropped = frame[y:y + crop_h, x:x + crop_w]
                return cv2.resize(cropped, (self.target_width, self.target_height), interpolation=cv2.INTER_LINEAR)

            # Clamp index to valid range
            frame_idx = max(0, min(frame_idx, len(framing_plan.frame_contexts) - 1))
            context = framing_plan.frame_contexts[frame_idx]

            # Create output frame
            output = np.zeros((self.target_height, self.target_width, 3), dtype=np.uint8)

            if len(context.detected_faces) >= 2:
                # Split vertically 50/50 (two columns)
                half_width = self.target_width // 2

                # Select the 2 most relevant faces
                # Priority: ALWAYS show active speaker first + most confident other person
                if context.active_speakers and len(context.active_speakers) >= 1:
                    # Get the PRIMARY speaker (most confident among active speakers)
                    speaker_faces = [context.detected_faces[i] for i in context.active_speakers
                                     if i < len(context.detected_faces)]

                    primary_speaker = max(speaker_faces, key=lambda f: f.confidence)

                    # Get OTHER faces (not the primary speaker)
                    other_faces = [f for f in context.detected_faces if f != primary_speaker]

                    if len(speaker_faces) >= 2:
                        # Multiple speakers: show primary + second most confident speaker
                        other_speakers = [f for f in speaker_faces if f != primary_speaker]
                        secondary_person = max(other_speakers, key=lambda f: f.confidence)
                    elif other_faces:
                        # One speaker: show speaker + most confident other person
                        secondary_person = max(other_faces, key=lambda f: f.confidence)
                    else:
                        # Fallback: only one person detected
                        secondary_person = primary_speaker

                    selected_faces = [primary_speaker, secondary_person]
                else:
                    # No speakers: take 2 most confident faces
                    selected_faces = sorted(context.detected_faces, key=lambda f: f.confidence, reverse=True)[:2]

                # Sort selected faces by horizontal position for consistent left/right placement
                faces = sorted(selected_faces, key=lambda f: f.center_x)
                left_face = faces[0]
                right_face = faces[1]

                # Process each person's frame
                for idx, face in enumerate([left_face, right_face]):
                    # Calculate crop region focused on this person
                    # Each person gets half the width, full target aspect ratio (9:16)
                    # This ensures NO distortion when resizing

                    # For split screen: each side is half_width x full_height
                    # We need to maintain 9:16 aspect for each half
                    half_width = self.target_width // 2
                    half_aspect = self.target_height / half_width  # Aspect ratio for half

                    # Determine crop size based on face with padding
                    face_width = max(face.width, frame.shape[1] // 4)  # At least 1/4 of frame width
                    crop_width = int(face_width * 2.5)  # Add padding around face
                    crop_height = int(crop_width * half_aspect)  # Maintain correct aspect

                    # Ensure crop fits in frame, maintaining aspect ratio
                    max_crop_width = frame.shape[1] // 2  # Half the source width
                    max_crop_height = frame.shape[0]  # Full source height

                    # If crop is too wide, scale down proportionally
                    if crop_width > max_crop_width:
                        crop_width = max_crop_width
                        crop_height = int(crop_width * half_aspect)

                    # If crop is too tall, scale down proportionally
                    if crop_height > max_crop_height:
                        crop_height = max_crop_height
                        crop_width = int(crop_height / half_aspect)

                    # Center crop on face
                    x = max(0, face.center_x - crop_width // 2)
                    y = max(0, face.center_y - crop_height // 2)

                    # Clamp to frame boundaries
                    x = min(x, frame.shape[1] - crop_width)
                    y = min(y, frame.shape[0] - crop_height)

                    # Extract and resize crop
                    cropped = frame[y:y + crop_height, x:x + crop_width]
                    resized = cv2.resize(
                        cropped,
                        (half_width, self.target_height),
                        interpolation=cv2.INTER_LINEAR
                    )

                    # Place in output at appropriate horizontal position
                    x_offset = idx * half_width
                    output[:, x_offset:x_offset + half_width] = resized
            else:
                # Fall back to single framing
                if framing_plan.crop_regions:
                    crop_idx = max(0, min(frame_idx, len(framing_plan.crop_regions) - 1))
                    crop = framing_plan.crop_regions[crop_idx]
                    cropped = frame[crop.y:crop.y + crop.height, crop.x:crop.x + crop.width]
                else:
                    # Fallback to center crop if no crop regions available
                    h, w = frame.shape[:2]
                    crop_h = int(w * self.target_aspect)
                    crop_w = w
                    if crop_h > h:
                        crop_h = h
                        crop_w = int(h / self.target_aspect)
                    y = (h - crop_h) // 2
                    x = (w - crop_w) // 2
                    cropped = frame[y:y + crop_h, x:x + crop_w]
                output = cv2.resize(
                    cropped,
                    (self.target_width, self.target_height),
                    interpolation=cv2.INTER_LINEAR
                )

            return output

        # MoviePy 2.x compatible way to create VideoClip
        new_clip = VideoClip(duration=video_clip.duration)
        new_clip.size = (self.target_width, self.target_height)
        new_clip.frame_function = make_frame
        return new_clip

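    # Selection example for _apply_split_screen above (illustrative): given faces
    # A (confidence 0.9, speaking), B (0.95, silent) and C (0.6, silent), the
    # split screen pairs A (the only active speaker) with B (most confident of
    # the rest), then orders them left/right by center_x before cropping each half.
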
    def _apply_grid_layout(
        self,
        video_clip: VideoFileClip,
        framing_plan: FramingPlan
    ) -> VideoClip:
        """
        Apply grid layout for 3+ people.

        Args:
            video_clip: Source video clip
            framing_plan: Framing plan

        Returns:
            Grid layout video clip
        """
        def make_frame(t):
            frame = video_clip.get_frame(t)
            # Calculate exact frame index with decimal precision for smooth interpolation
            exact_frame_idx = (t * framing_plan.fps) / self.frame_skip
            frame_idx = int(exact_frame_idx)

            # Ensure we have valid contexts
            if not framing_plan.frame_contexts:
                # Fallback to simple center crop
                h, w = frame.shape[:2]
                crop_h = int(w * self.target_aspect)
                crop_w = w
                if crop_h > h:
                    crop_h = h
                    crop_w = int(h / self.target_aspect)
                y = (h - crop_h) // 2
                x = (w - crop_w) // 2
                cropped = frame[y:y + crop_h, x:x + crop_w]
                return cv2.resize(cropped, (self.target_width, self.target_height), interpolation=cv2.INTER_LINEAR)

            # Clamp index to valid range
            frame_idx = max(0, min(frame_idx, len(framing_plan.frame_contexts) - 1))
            context = framing_plan.frame_contexts[frame_idx]

            output = np.zeros((self.target_height, self.target_width, 3), dtype=np.uint8)

            num_faces = len(context.detected_faces)

            if num_faces >= 3:
                # Create 2x2 grid
                cell_width = self.target_width // 2
                cell_height = self.target_height // 2

                for idx, face in enumerate(context.detected_faces[:4]):
                    # Calculate grid position
                    row = idx // 2
                    col = idx % 2

                    # Each grid cell keeps its own aspect ratio (cell_height / cell_width)
                    cell_aspect = cell_height / cell_width

                    # Crop around face with correct aspect ratio
                    crop_width = frame.shape[1] // 2
                    crop_height = int(crop_width * cell_aspect)

                    # Ensure crop fits in frame, maintaining aspect
                    max_crop_width = frame.shape[1] // 2
                    max_crop_height = frame.shape[0] // 2

                    if crop_width > max_crop_width:
                        crop_width = max_crop_width
                        crop_height = int(crop_width * cell_aspect)

                    if crop_height > max_crop_height:
                        crop_height = max_crop_height
                        crop_width = int(crop_height / cell_aspect)

                    # Center crop on face
                    x = max(0, face.center_x - crop_width // 2)
                    y = max(0, face.center_y - crop_height // 2)

                    # Clamp to frame boundaries
                    x = min(x, frame.shape[1] - crop_width)
                    y = min(y, frame.shape[0] - crop_height)

                    cropped = frame[y:y + crop_height, x:x + crop_width]
                    resized = cv2.resize(
                        cropped,
                        (cell_width, cell_height),
                        interpolation=cv2.INTER_LINEAR
                    )

                    # Place in grid
                    y_offset = row * cell_height
                    x_offset = col * cell_width
                    output[y_offset:y_offset + cell_height, x_offset:x_offset + cell_width] = resized
            else:
                # Fall back to single framing
                if framing_plan.crop_regions:
                    crop_idx = max(0, min(frame_idx, len(framing_plan.crop_regions) - 1))
                    crop = framing_plan.crop_regions[crop_idx]
                    cropped = frame[crop.y:crop.y + crop.height, crop.x:crop.x + crop.width]
                else:
                    # Fallback to center crop if no crop regions available
                    h, w = frame.shape[:2]
                    crop_h = int(w * self.target_aspect)
                    crop_w = w
                    if crop_h > h:
                        crop_h = h
                        crop_w = int(h / self.target_aspect)
                    y = (h - crop_h) // 2
                    x = (w - crop_w) // 2
                    cropped = frame[y:y + crop_h, x:x + crop_w]
                output = cv2.resize(
                    cropped,
                    (self.target_width, self.target_height),
                    interpolation=cv2.INTER_LINEAR
                )

            return output

        # MoviePy 2.x compatible way to create VideoClip
        new_clip = VideoClip(duration=video_clip.duration)
        new_clip.size = (self.target_width, self.target_height)
        new_clip.frame_function = make_frame
        return new_clip


def extract_audio_samples(video_path: str, start_time: float, end_time: float) -> Optional[np.ndarray]:
    """
    Extract audio samples from video for speech detection.

    Args:
        video_path: Path to video file
        start_time: Start time in seconds
        end_time: End time in seconds

    Returns:
        Audio samples array or None if no audio
    """
    try:
        from moviepy.audio.io.AudioFileClip import AudioFileClip

        with AudioFileClip(video_path) as audio:
            segment = audio.subclipped(start_time, end_time)
            fps = getattr(segment, 'fps', 44100)
            samples = segment.to_soundarray(fps=fps)
            return samples
    except Exception as exc:
        logger.warning(f"Failed to extract audio: {exc}")
        return None
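

# Minimal usage sketch (illustrative only; the file paths and the 0-30 s window
# are assumptions, and the real pipeline is expected to wire these calls together
# elsewhere in video_render):
if __name__ == "__main__":
    source = "input_horizontal.mp4"  # hypothetical input path
    start, end = 0.0, 30.0

    audio = extract_audio_samples(source, start, end)
    framer = SmartFramer(target_width=1080, target_height=1920, frame_skip=2)
    plan = framer.create_framing_plan(source, start, end, audio_samples=audio)

    with VideoFileClip(source) as clip:
        segment = clip.subclipped(start, end)
        framed = framer.apply_framing(segment, plan, use_split_screen=True)
        # The reframed clip has no fps of its own, so pass the plan's fps explicitly.
        framed.write_videofile("output_vertical.mp4", fps=plan.fps)  # hypothetical output path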