#v2 - Start v2 testing
- Adds object tracking
- Facial detection
- Interactive captions
- More precise cuts
- Prompt refinement
video_render/smart_framing.py (new file, 687 lines)
@@ -0,0 +1,687 @@
"""
Smart framing module for intelligent video cropping and composition.

This module provides functionality to create 9:16 vertical videos with
intelligent framing that follows the action and speakers.
"""
from __future__ import annotations

import logging
from dataclasses import dataclass
from typing import List, Optional, Tuple

import cv2
import numpy as np
from moviepy.video.VideoClip import VideoClip
from moviepy.video.io.VideoFileClip import VideoFileClip
from scipy import signal

from video_render.context_detection import ContextAnalyzer, FrameContext, FaceDetection

logger = logging.getLogger(__name__)


@dataclass
class CropRegion:
    """Defines a crop region for a frame."""
    x: int
    y: int
    width: int
    height: int


@dataclass
class FramingPlan:
    """Complete framing plan for a video segment."""
    frame_contexts: List[FrameContext]
    crop_regions: List[CropRegion]
    layout_mode: str
    fps: float


class SmartFramer:
    """Creates intelligent 9:16 framing for horizontal videos."""

    def __init__(
        self,
        target_width: int = 1080,
        target_height: int = 1920,
        frame_skip: int = 2,
        smoothing_window: int = 15
    ):
        self.target_width = target_width
        self.target_height = target_height
        self.target_aspect = target_height / target_width

        # Performance parameters
        self.frame_skip = frame_skip  # Process every Nth frame (CPU optimization)

        # Smoothing parameters
        self.smoothing_window = smoothing_window
        self.max_velocity = 30  # pixels per frame (reduced for smoother transitions)

        logger.info(f"Smart framer initialized (target: {target_width}x{target_height}, frame_skip={frame_skip})")

    def create_framing_plan(
        self,
        video_path: str,
        start_time: float,
        end_time: float,
        audio_samples: Optional[np.ndarray] = None
    ) -> FramingPlan:
        """
        Analyze video and create a complete framing plan.

        Args:
            video_path: Path to video file
            start_time: Start time in seconds
            end_time: End time in seconds
            audio_samples: Optional audio samples for speech detection

        Returns:
            FramingPlan with all frame contexts and crop regions
        """
        analyzer = ContextAnalyzer()

        # Detect speaking periods from audio if available
        speaking_periods = None
        if audio_samples is not None:
            speaking_periods = analyzer.audio_detector.detect_speaking_periods(audio_samples)

        # Open video with error suppression for AV1 codec warnings
        import os
        os.environ['OPENCV_FFMPEG_CAPTURE_OPTIONS'] = 'loglevel;quiet'

        cap = cv2.VideoCapture(video_path)
        fps = cap.get(cv2.CAP_PROP_FPS)

        # Calculate frame range
        start_frame = int(start_time * fps)
        end_frame = int(end_time * fps)

        # Set to start frame
        cap.set(cv2.CAP_PROP_POS_FRAMES, start_frame)

        frame_contexts = []
        frame_number = start_frame
        processed_count = 0

        logger.info(f"Analyzing frames {start_frame} to {end_frame} (fps={fps}, skip={self.frame_skip})")

        while frame_number < end_frame:
            ret, frame = cap.read()
            if not ret:
                break

            # Only process every Nth frame for performance (CPU optimization)
            if processed_count % self.frame_skip == 0:
                timestamp = frame_number / fps
                context = analyzer.analyze_frame(frame, timestamp, frame_number, speaking_periods)
                frame_contexts.append(context)

            frame_number += 1
            processed_count += 1

        # Get video dimensions before releasing capture
        source_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        source_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

        cap.release()
        analyzer.close()

        # Determine overall layout mode (most common)
        layout_modes = [ctx.layout_mode for ctx in frame_contexts]
        if layout_modes:
            overall_layout = max(set(layout_modes), key=layout_modes.count)
        else:
            overall_layout = "single"

        # Calculate crop regions based on contexts
        crop_regions = self._calculate_crop_regions(
            frame_contexts,
            source_width,
            source_height
        )

        return FramingPlan(
            frame_contexts=frame_contexts,
            crop_regions=crop_regions,
            layout_mode=overall_layout,
            fps=fps
        )

    def _calculate_crop_regions(
        self,
        contexts: List[FrameContext],
        source_width: int,
        source_height: int
    ) -> List[CropRegion]:
        """
        Calculate smooth crop regions for each frame.

        Args:
            contexts: List of frame contexts
            source_width: Source video width
            source_height: Source video height

        Returns:
            List of crop regions
        """
        if not contexts:
            return []

        # Calculate ideal crop dimensions maintaining EXACT 9:16 aspect ratio
        source_aspect = source_width / source_height

        if source_aspect > self.target_aspect:
            # Source is wider - crop horizontally (use full height)
            crop_height = source_height
            crop_width = int(crop_height / self.target_aspect)

            # Ensure crop width fits within source
            if crop_width > source_width:
                crop_width = source_width
                crop_height = int(crop_width * self.target_aspect)
        else:
            # Source is taller - crop vertically (use full width)
            crop_width = source_width
            crop_height = int(crop_width * self.target_aspect)

            # Ensure crop height fits within source
            if crop_height > source_height:
                crop_height = source_height
                crop_width = int(crop_height / self.target_aspect)

        # Calculate center points for each frame
        # Since we now always focus on ONE person directly (not averaging),
        # we can use the focus point directly without complex validation
        center_xs = []
        center_ys = []

        for ctx in contexts:
            if ctx.primary_focus:
                # Primary focus is now always a single person's center, never averaged
                # This means it will never be on the table/empty space
                center_xs.append(ctx.primary_focus[0])
                center_ys.append(ctx.primary_focus[1])
            else:
                # Default to center only if no faces detected at all
                center_xs.append(source_width // 2)
                center_ys.append(source_height // 2)

        # Smooth the center points
        if len(center_xs) > self.smoothing_window:
            kernel_size = min(self.smoothing_window, len(center_xs))
            if kernel_size % 2 == 0:
                kernel_size -= 1

            center_xs = signal.medfilt(center_xs, kernel_size=kernel_size).tolist()
            center_ys = signal.medfilt(center_ys, kernel_size=kernel_size).tolist()

        # Limit velocity (prevent jarring movements)
        center_xs = self._limit_velocity(center_xs, self.max_velocity)
        center_ys = self._limit_velocity(center_ys, self.max_velocity)

        # Convert to crop regions
        crop_regions = []
        for center_x, center_y in zip(center_xs, center_ys):
            # Calculate top-left corner
            x = int(center_x - crop_width // 2)
            y = int(center_y - crop_height // 2)

            # Clamp to valid bounds
            x = max(0, min(x, source_width - crop_width))
            y = max(0, min(y, source_height - crop_height))

            crop_regions.append(CropRegion(
                x=x,
                y=y,
                width=crop_width,
                height=crop_height
            ))

        return crop_regions

    def _limit_velocity(self, positions: List[float], max_velocity: float) -> List[float]:
        """
        Limit the velocity of position changes.

        Args:
            positions: List of positions
            max_velocity: Maximum allowed change per frame

        Returns:
            Smoothed positions
        """
        if len(positions) <= 1:
            return positions

        limited = [positions[0]]

        for i in range(1, len(positions)):
            delta = positions[i] - limited[i - 1]
            if abs(delta) > max_velocity:
                delta = max_velocity if delta > 0 else -max_velocity

            limited.append(limited[i - 1] + delta)

        return limited

    def apply_framing(
        self,
        video_clip: VideoFileClip,
        framing_plan: FramingPlan,
        use_split_screen: bool = False
    ) -> VideoClip:
        """
        Apply smart framing to a video clip.

        Args:
            video_clip: Source video clip
            framing_plan: Framing plan to apply
            use_split_screen: Whether to use split screen for multiple people

        Returns:
            Reframed video clip
        """
        # Handle different layout modes
        if framing_plan.layout_mode in ["single", "single_speaker"]:
            # Single person or single speaker - use focused single framing
            return self._apply_single_framing(video_clip, framing_plan)
        elif framing_plan.layout_mode == "dual_split" and use_split_screen:
            # Two people in conversation - use split screen
            return self._apply_split_screen(video_clip, framing_plan)
        elif framing_plan.layout_mode == "grid" and use_split_screen:
            # 3+ people - use grid layout
            return self._apply_grid_layout(video_clip, framing_plan)
        else:
            # Fallback to single framing
            return self._apply_single_framing(video_clip, framing_plan)

    def _apply_single_framing(
        self,
        video_clip: VideoFileClip,
        framing_plan: FramingPlan
    ) -> VideoClip:
        """
        Apply single-focus framing (following one person or action).

        Args:
            video_clip: Source video clip
            framing_plan: Framing plan

        Returns:
            Reframed video clip
        """
        def make_frame(t):
            # Get the original frame
            frame = video_clip.get_frame(t)

            # Ensure we have valid crop regions
            if not framing_plan.crop_regions:
                # Fallback: return center crop
                h, w = frame.shape[:2]
                crop_h = int(w * self.target_aspect)
                crop_w = w
                if crop_h > h:
                    crop_h = h
                    crop_w = int(h / self.target_aspect)
                y = (h - crop_h) // 2
                x = (w - crop_w) // 2
                cropped = frame[y:y + crop_h, x:x + crop_w]
            else:
                # Calculate exact frame index with decimal precision for interpolation
                exact_frame_idx = (t * framing_plan.fps) / self.frame_skip

                # Get the two adjacent analyzed frames
                idx_floor = int(exact_frame_idx)
                idx_ceil = idx_floor + 1

                # Interpolation factor (0.0 to 1.0)
                alpha = exact_frame_idx - idx_floor

                # Clamp indices to valid range
                idx_floor = max(0, min(idx_floor, len(framing_plan.crop_regions) - 1))
                idx_ceil = max(0, min(idx_ceil, len(framing_plan.crop_regions) - 1))

                # Get crop regions
                crop1 = framing_plan.crop_regions[idx_floor]
                crop2 = framing_plan.crop_regions[idx_ceil]

                # Linear interpolation between crop regions
                x = int(crop1.x * (1 - alpha) + crop2.x * alpha)
                y = int(crop1.y * (1 - alpha) + crop2.y * alpha)
                width = int(crop1.width * (1 - alpha) + crop2.width * alpha)
                height = int(crop1.height * (1 - alpha) + crop2.height * alpha)

                # Ensure crop stays within frame bounds
                h, w = frame.shape[:2]
                x = max(0, min(x, w - width))
                y = max(0, min(y, h - height))
                width = min(width, w - x)
                height = min(height, h - y)

                # Crop the frame
                cropped = frame[y:y + height, x:x + width]

            # Resize to target dimensions
            resized = cv2.resize(
                cropped,
                (self.target_width, self.target_height),
                interpolation=cv2.INTER_LINEAR
            )

            return resized

        # MoviePy 2.x compatible way to create VideoClip
        new_clip = VideoClip(duration=video_clip.duration)
        new_clip.size = (self.target_width, self.target_height)
        new_clip.frame_function = make_frame
        return new_clip

    def _apply_split_screen(
        self,
        video_clip: VideoFileClip,
        framing_plan: FramingPlan
    ) -> VideoClip:
        """
        Apply split screen for two people.

        Args:
            video_clip: Source video clip
            framing_plan: Framing plan

        Returns:
            Split screen video clip
        """
        def make_frame(t):
            frame = video_clip.get_frame(t)
            # Calculate exact frame index with decimal precision for smooth interpolation
            exact_frame_idx = (t * framing_plan.fps) / self.frame_skip
            frame_idx = int(exact_frame_idx)

            # Ensure we have valid contexts
            if not framing_plan.frame_contexts:
                # Fallback to simple center crop
                h, w = frame.shape[:2]
                crop_h = int(w * self.target_aspect)
                crop_w = w
                if crop_h > h:
                    crop_h = h
                    crop_w = int(h / self.target_aspect)
                y = (h - crop_h) // 2
                x = (w - crop_w) // 2
                cropped = frame[y:y + crop_h, x:x + crop_w]
                return cv2.resize(cropped, (self.target_width, self.target_height), interpolation=cv2.INTER_LINEAR)

            # Clamp index to valid range
            frame_idx = max(0, min(frame_idx, len(framing_plan.frame_contexts) - 1))
            context = framing_plan.frame_contexts[frame_idx]

            # Create output frame
            output = np.zeros((self.target_height, self.target_width, 3), dtype=np.uint8)

            if len(context.detected_faces) >= 2:
                # Split vertically 50/50 (two columns)
                half_width = self.target_width // 2

                # Select the 2 most relevant faces
                # Priority: ALWAYS show active speaker first + most confident other person
                if context.active_speakers and len(context.active_speakers) >= 1:
                    # Get the PRIMARY speaker (most confident among active speakers)
                    speaker_faces = [context.detected_faces[i] for i in context.active_speakers
                                     if i < len(context.detected_faces)]

                    primary_speaker = max(speaker_faces, key=lambda f: f.confidence)

                    # Get OTHER faces (not the primary speaker)
                    other_faces = [f for f in context.detected_faces if f != primary_speaker]

                    if len(speaker_faces) >= 2:
                        # Multiple speakers: show primary + second most confident speaker
                        other_speakers = [f for f in speaker_faces if f != primary_speaker]
                        secondary_person = max(other_speakers, key=lambda f: f.confidence)
                    elif other_faces:
                        # One speaker: show speaker + most confident other person
                        secondary_person = max(other_faces, key=lambda f: f.confidence)
                    else:
                        # Fallback: only one person detected
                        secondary_person = primary_speaker

                    selected_faces = [primary_speaker, secondary_person]
                else:
                    # No speakers: take 2 most confident faces
                    selected_faces = sorted(context.detected_faces, key=lambda f: f.confidence, reverse=True)[:2]

                # Sort selected faces by horizontal position for consistent left/right placement
                faces = sorted(selected_faces, key=lambda f: f.center_x)
                left_face = faces[0]
                right_face = faces[1]

                # Process each person's frame
                for idx, face in enumerate([left_face, right_face]):
                    # Calculate crop region focused on this person
                    # Each person gets half the width, full target aspect ratio (9:16)
                    # This ensures NO distortion when resizing

                    # For split screen: each side is half_width x full_height
                    # We need to maintain 9:16 aspect for each half
                    half_width = self.target_width // 2
                    half_aspect = self.target_height / half_width  # Aspect ratio for half

                    # Determine crop size based on face with padding
                    face_width = max(face.width, frame.shape[1] // 4)  # At least 1/4 of frame width
                    crop_width = int(face_width * 2.5)  # Add padding around face
                    crop_height = int(crop_width * half_aspect)  # Maintain correct aspect

                    # Ensure crop fits in frame, maintaining aspect ratio
                    max_crop_width = frame.shape[1] // 2  # Half the source width
                    max_crop_height = frame.shape[0]  # Full source height

                    # If crop is too wide, scale down proportionally
                    if crop_width > max_crop_width:
                        crop_width = max_crop_width
                        crop_height = int(crop_width * half_aspect)

                    # If crop is too tall, scale down proportionally
                    if crop_height > max_crop_height:
                        crop_height = max_crop_height
                        crop_width = int(crop_height / half_aspect)

                    # Center crop on face
                    x = max(0, face.center_x - crop_width // 2)
                    y = max(0, face.center_y - crop_height // 2)

                    # Clamp to frame boundaries
                    x = min(x, frame.shape[1] - crop_width)
                    y = min(y, frame.shape[0] - crop_height)

                    # Extract and resize crop
                    cropped = frame[y:y + crop_height, x:x + crop_width]
                    resized = cv2.resize(
                        cropped,
                        (half_width, self.target_height),
                        interpolation=cv2.INTER_LINEAR
                    )

                    # Place in output at appropriate horizontal position
                    x_offset = idx * half_width
                    output[:, x_offset:x_offset + half_width] = resized
            else:
                # Fall back to single framing
                if framing_plan.crop_regions:
                    crop_idx = max(0, min(frame_idx, len(framing_plan.crop_regions) - 1))
                    crop = framing_plan.crop_regions[crop_idx]
                    cropped = frame[crop.y:crop.y + crop.height, crop.x:crop.x + crop.width]
                else:
                    # Fallback to center crop if no crop regions available
                    h, w = frame.shape[:2]
                    crop_h = int(w * self.target_aspect)
                    crop_w = w
                    if crop_h > h:
                        crop_h = h
                        crop_w = int(h / self.target_aspect)
                    y = (h - crop_h) // 2
                    x = (w - crop_w) // 2
                    cropped = frame[y:y + crop_h, x:x + crop_w]
                output = cv2.resize(
                    cropped,
                    (self.target_width, self.target_height),
                    interpolation=cv2.INTER_LINEAR
                )

            return output

        # MoviePy 2.x compatible way to create VideoClip
        new_clip = VideoClip(duration=video_clip.duration)
        new_clip.size = (self.target_width, self.target_height)
        new_clip.frame_function = make_frame
        return new_clip

    def _apply_grid_layout(
        self,
        video_clip: VideoFileClip,
        framing_plan: FramingPlan
    ) -> VideoClip:
        """
        Apply grid layout for 3+ people.

        Args:
            video_clip: Source video clip
            framing_plan: Framing plan

        Returns:
            Grid layout video clip
        """
        def make_frame(t):
            frame = video_clip.get_frame(t)
            # Calculate exact frame index with decimal precision for smooth interpolation
            exact_frame_idx = (t * framing_plan.fps) / self.frame_skip
            frame_idx = int(exact_frame_idx)

            # Ensure we have valid contexts
            if not framing_plan.frame_contexts:
                # Fallback to simple center crop
                h, w = frame.shape[:2]
                crop_h = int(w * self.target_aspect)
                crop_w = w
                if crop_h > h:
                    crop_h = h
                    crop_w = int(h / self.target_aspect)
                y = (h - crop_h) // 2
                x = (w - crop_w) // 2
                cropped = frame[y:y + crop_h, x:x + crop_w]
                return cv2.resize(cropped, (self.target_width, self.target_height), interpolation=cv2.INTER_LINEAR)

            # Clamp index to valid range
            frame_idx = max(0, min(frame_idx, len(framing_plan.frame_contexts) - 1))
            context = framing_plan.frame_contexts[frame_idx]

            output = np.zeros((self.target_height, self.target_width, 3), dtype=np.uint8)

            num_faces = len(context.detected_faces)

            if num_faces >= 3:
                # Create 2x2 grid
                cell_width = self.target_width // 2
                cell_height = self.target_height // 2

                for idx, face in enumerate(context.detected_faces[:4]):
                    # Calculate grid position
                    row = idx // 2
                    col = idx % 2

                    # Each grid cell keeps the cell aspect ratio (cell_height / cell_width)
                    cell_aspect = cell_height / cell_width

                    # Crop around face with correct aspect ratio
                    crop_width = frame.shape[1] // 2
                    crop_height = int(crop_width * cell_aspect)

                    # Ensure crop fits in frame, maintaining aspect
                    max_crop_width = frame.shape[1] // 2
                    max_crop_height = frame.shape[0] // 2

                    if crop_width > max_crop_width:
                        crop_width = max_crop_width
                        crop_height = int(crop_width * cell_aspect)

                    if crop_height > max_crop_height:
                        crop_height = max_crop_height
                        crop_width = int(crop_height / cell_aspect)

                    # Center crop on face
                    x = max(0, face.center_x - crop_width // 2)
                    y = max(0, face.center_y - crop_height // 2)

                    # Clamp to frame boundaries
                    x = min(x, frame.shape[1] - crop_width)
                    y = min(y, frame.shape[0] - crop_height)

                    cropped = frame[y:y + crop_height, x:x + crop_width]
                    resized = cv2.resize(
                        cropped,
                        (cell_width, cell_height),
                        interpolation=cv2.INTER_LINEAR
                    )

                    # Place in grid
                    y_offset = row * cell_height
                    x_offset = col * cell_width
                    output[y_offset:y_offset + cell_height, x_offset:x_offset + cell_width] = resized
            else:
                # Fall back to single framing
                if framing_plan.crop_regions:
                    crop_idx = max(0, min(frame_idx, len(framing_plan.crop_regions) - 1))
                    crop = framing_plan.crop_regions[crop_idx]
                    cropped = frame[crop.y:crop.y + crop.height, crop.x:crop.x + crop.width]
                else:
                    # Fallback to center crop if no crop regions available
                    h, w = frame.shape[:2]
                    crop_h = int(w * self.target_aspect)
                    crop_w = w
                    if crop_h > h:
                        crop_h = h
                        crop_w = int(h / self.target_aspect)
                    y = (h - crop_h) // 2
                    x = (w - crop_w) // 2
                    cropped = frame[y:y + crop_h, x:x + crop_w]
                output = cv2.resize(
                    cropped,
                    (self.target_width, self.target_height),
                    interpolation=cv2.INTER_LINEAR
                )

            return output

        # MoviePy 2.x compatible way to create VideoClip
        new_clip = VideoClip(duration=video_clip.duration)
        new_clip.size = (self.target_width, self.target_height)
        new_clip.frame_function = make_frame
        return new_clip


def extract_audio_samples(video_path: str, start_time: float, end_time: float) -> Optional[np.ndarray]:
    """
    Extract audio samples from video for speech detection.

    Args:
        video_path: Path to video file
        start_time: Start time in seconds
        end_time: End time in seconds

    Returns:
        Audio samples array or None if no audio
    """
    try:
        from moviepy.audio.io.AudioFileClip import AudioFileClip

        with AudioFileClip(video_path) as audio:
            segment = audio.subclipped(start_time, end_time)
            fps = getattr(segment, 'fps', 44100)
            samples = segment.to_soundarray(fps=fps)
            return samples
    except Exception as exc:
        logger.warning(f"Failed to extract audio: {exc}")
        return None
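
Below is a minimal usage sketch (not part of this commit) of how the new module is meant to be wired together; the input path, clip boundaries, output filename, and writer settings are illustrative assumptions.

```python
# Hypothetical usage sketch: reframe a 20-60 s window of a horizontal video
# into a 9:16 vertical clip with SmartFramer (paths and times are assumed).
from moviepy.video.io.VideoFileClip import VideoFileClip

from video_render.smart_framing import SmartFramer, extract_audio_samples

start, end = 20.0, 60.0  # assumed clip boundaries in seconds

framer = SmartFramer(target_width=1080, target_height=1920, frame_skip=2)

# Optional audio samples let the plan prioritize active speakers
audio = extract_audio_samples("input.mp4", start, end)
plan = framer.create_framing_plan("input.mp4", start, end, audio_samples=audio)

with VideoFileClip("input.mp4") as clip:
    segment = clip.subclipped(start, end)  # MoviePy 2.x subclip API
    vertical = framer.apply_framing(segment, plan, use_split_screen=True)
    vertical.write_videofile("clip_vertical.mp4", fps=plan.fps, codec="libx264")
```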