video-render/video_render/smart_framing.py
LeoMortari c5d3e83a5f #v2 - Start v2 tests
- Adds object tracking
- Facial detection
- Interactive captions
- More precise cuts
- Prompt refinement
2025-11-12 11:38:09 -03:00


"""
Smart framing module for intelligent video cropping and composition.
This module provides functionality to create 9:16 vertical videos with
intelligent framing that follows the action and speakers.
"""
from __future__ import annotations
import logging
from dataclasses import dataclass
from typing import List, Optional, Tuple
import cv2
import numpy as np
from moviepy.video.VideoClip import VideoClip
from moviepy.video.io.VideoFileClip import VideoFileClip
from scipy import signal
from video_render.context_detection import ContextAnalyzer, FrameContext, FaceDetection
logger = logging.getLogger(__name__)


@dataclass
class CropRegion:
    """Defines a crop region for a frame."""
    x: int
    y: int
    width: int
    height: int


@dataclass
class FramingPlan:
    """Complete framing plan for a video segment."""
    frame_contexts: List[FrameContext]
    crop_regions: List[CropRegion]
    layout_mode: str
    fps: float
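
# Note (illustrative): for a 1920x1080 source and the default 1080x1920 target, a plan
# typically holds entries like CropRegion(x=657, y=0, width=607, height=1080) - a window
# that is already ~9:16 (up to integer rounding) and only slides across the source.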


class SmartFramer:
    """Creates intelligent 9:16 framing for horizontal videos."""

    def __init__(
        self,
        target_width: int = 1080,
        target_height: int = 1920,
        frame_skip: int = 2,
        smoothing_window: int = 15
    ):
        self.target_width = target_width
        self.target_height = target_height
        self.target_aspect = target_height / target_width
        # Performance parameters
        self.frame_skip = frame_skip  # Process every Nth frame (CPU optimization)
        # Smoothing parameters
        self.smoothing_window = smoothing_window
        self.max_velocity = 30  # pixels per frame (reduced for smoother transitions)
        logger.info(f"Smart framer initialized (target: {target_width}x{target_height}, frame_skip={frame_skip})")

    def create_framing_plan(
        self,
        video_path: str,
        start_time: float,
        end_time: float,
        audio_samples: Optional[np.ndarray] = None
    ) -> FramingPlan:
        """
        Analyze video and create a complete framing plan.

        Args:
            video_path: Path to video file
            start_time: Start time in seconds
            end_time: End time in seconds
            audio_samples: Optional audio samples for speech detection

        Returns:
            FramingPlan with all frame contexts and crop regions
        """
        analyzer = ContextAnalyzer()
        # Detect speaking periods from audio if available
        speaking_periods = None
        if audio_samples is not None:
            speaking_periods = analyzer.audio_detector.detect_speaking_periods(audio_samples)
        # Open video with error suppression for AV1 codec warnings
        import os
        os.environ['OPENCV_FFMPEG_CAPTURE_OPTIONS'] = 'loglevel;quiet'
        cap = cv2.VideoCapture(video_path)
        fps = cap.get(cv2.CAP_PROP_FPS)
        # Calculate frame range
        start_frame = int(start_time * fps)
        end_frame = int(end_time * fps)
        # Set to start frame
        cap.set(cv2.CAP_PROP_POS_FRAMES, start_frame)
        frame_contexts = []
        frame_number = start_frame
        processed_count = 0
        logger.info(f"Analyzing frames {start_frame} to {end_frame} (fps={fps}, skip={self.frame_skip})")
        while frame_number < end_frame:
            ret, frame = cap.read()
            if not ret:
                break
            # Only process every Nth frame for performance (CPU optimization)
            if processed_count % self.frame_skip == 0:
                timestamp = frame_number / fps
                context = analyzer.analyze_frame(frame, timestamp, frame_number, speaking_periods)
                frame_contexts.append(context)
            frame_number += 1
            processed_count += 1
        # Get video dimensions before releasing capture
        source_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        source_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        cap.release()
        analyzer.close()
        # Determine overall layout mode (most common)
        layout_modes = [ctx.layout_mode for ctx in frame_contexts]
        if layout_modes:
            overall_layout = max(set(layout_modes), key=layout_modes.count)
        else:
            overall_layout = "single"
        # Calculate crop regions based on contexts
        crop_regions = self._calculate_crop_regions(
            frame_contexts,
            source_width,
            source_height
        )
        return FramingPlan(
            frame_contexts=frame_contexts,
            crop_regions=crop_regions,
            layout_mode=overall_layout,
            fps=fps
        )
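
    # Note (illustrative): for a 10 s segment at 30 fps with frame_skip=2,
    # create_framing_plan analyzes roughly 150 frames, so the resulting plan holds
    # about 150 frame contexts and 150 matching crop regions.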

    def _calculate_crop_regions(
        self,
        contexts: List[FrameContext],
        source_width: int,
        source_height: int
    ) -> List[CropRegion]:
        """
        Calculate smooth crop regions for each frame.

        Args:
            contexts: List of frame contexts
            source_width: Source video width
            source_height: Source video height

        Returns:
            List of crop regions
        """
        if not contexts:
            return []
        # Calculate ideal crop dimensions maintaining EXACT 9:16 aspect ratio.
        # Compare width/height ratios consistently (target_aspect is height/width,
        # so the target's width/height ratio is its inverse).
        source_aspect = source_width / source_height
        if source_aspect > (self.target_width / self.target_height):
            # Source is wider than 9:16 - crop horizontally (use full height)
            crop_height = source_height
            crop_width = int(crop_height / self.target_aspect)
            # Ensure crop width fits within source
            if crop_width > source_width:
                crop_width = source_width
                crop_height = int(crop_width * self.target_aspect)
        else:
            # Source is narrower than 9:16 - crop vertically (use full width)
            crop_width = source_width
            crop_height = int(crop_width * self.target_aspect)
            # Ensure crop height fits within source
            if crop_height > source_height:
                crop_height = source_height
                crop_width = int(crop_height / self.target_aspect)
        # Calculate center points for each frame
        # Since we now always focus on ONE person directly (not averaging),
        # we can use the focus point directly without complex validation
        center_xs = []
        center_ys = []
        for ctx in contexts:
            if ctx.primary_focus:
                # Primary focus is now always a single person's center, never averaged
                # This means it will never be on the table/empty space
                center_xs.append(ctx.primary_focus[0])
                center_ys.append(ctx.primary_focus[1])
            else:
                # Default to center only if no faces detected at all
                center_xs.append(source_width // 2)
                center_ys.append(source_height // 2)
        # Smooth the center points
        if len(center_xs) > self.smoothing_window:
            kernel_size = min(self.smoothing_window, len(center_xs))
            if kernel_size % 2 == 0:
                kernel_size -= 1
            center_xs = signal.medfilt(center_xs, kernel_size=kernel_size).tolist()
            center_ys = signal.medfilt(center_ys, kernel_size=kernel_size).tolist()
        # Limit velocity (prevent jarring movements)
        center_xs = self._limit_velocity(center_xs, self.max_velocity)
        center_ys = self._limit_velocity(center_ys, self.max_velocity)
        # Convert to crop regions
        crop_regions = []
        for center_x, center_y in zip(center_xs, center_ys):
            # Calculate top-left corner
            x = int(center_x - crop_width // 2)
            y = int(center_y - crop_height // 2)
            # Clamp to valid bounds
            x = max(0, min(x, source_width - crop_width))
            y = max(0, min(y, source_height - crop_height))
            crop_regions.append(CropRegion(
                x=x,
                y=y,
                width=crop_width,
                height=crop_height
            ))
        return crop_regions

    def _limit_velocity(self, positions: List[float], max_velocity: float) -> List[float]:
        """
        Limit the velocity of position changes.

        Args:
            positions: List of positions
            max_velocity: Maximum allowed change per frame

        Returns:
            Smoothed positions
        """
        if len(positions) <= 1:
            return positions
        limited = [positions[0]]
        for i in range(1, len(positions)):
            delta = positions[i] - limited[i - 1]
            if abs(delta) > max_velocity:
                delta = max_velocity if delta > 0 else -max_velocity
            limited.append(limited[i - 1] + delta)
        return limited
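
    # Note (illustrative): with max_velocity=30, _limit_velocity([100, 100, 200, 200], 30)
    # returns [100, 100, 130, 160], so the crop centre glides towards a new focus point
    # instead of snapping to it between analyzed frames.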

    def apply_framing(
        self,
        video_clip: VideoFileClip,
        framing_plan: FramingPlan,
        use_split_screen: bool = False
    ) -> VideoClip:
        """
        Apply smart framing to a video clip.

        Args:
            video_clip: Source video clip
            framing_plan: Framing plan to apply
            use_split_screen: Whether to use split screen for multiple people

        Returns:
            Reframed video clip
        """
        # Handle different layout modes
        if framing_plan.layout_mode in ["single", "single_speaker"]:
            # Single person or single speaker - use focused single framing
            return self._apply_single_framing(video_clip, framing_plan)
        elif framing_plan.layout_mode == "dual_split" and use_split_screen:
            # Two people in conversation - use split screen
            return self._apply_split_screen(video_clip, framing_plan)
        elif framing_plan.layout_mode == "grid" and use_split_screen:
            # 3+ people - use grid layout
            return self._apply_grid_layout(video_clip, framing_plan)
        else:
            # Fallback to single framing
            return self._apply_single_framing(video_clip, framing_plan)

    def _apply_single_framing(
        self,
        video_clip: VideoFileClip,
        framing_plan: FramingPlan
    ) -> VideoClip:
        """
        Apply single-focus framing (following one person or action).

        Args:
            video_clip: Source video clip
            framing_plan: Framing plan

        Returns:
            Reframed video clip
        """
        def make_frame(t):
            # Get the original frame
            frame = video_clip.get_frame(t)
            # Ensure we have valid crop regions
            if not framing_plan.crop_regions:
                # Fallback: return center crop
                h, w = frame.shape[:2]
                crop_h = int(w * self.target_aspect)
                crop_w = w
                if crop_h > h:
                    crop_h = h
                    crop_w = int(h / self.target_aspect)
                y = (h - crop_h) // 2
                x = (w - crop_w) // 2
                cropped = frame[y:y + crop_h, x:x + crop_w]
            else:
                # Calculate exact frame index with decimal precision for interpolation
                exact_frame_idx = (t * framing_plan.fps) / self.frame_skip
                # Get the two adjacent analyzed frames
                idx_floor = int(exact_frame_idx)
                idx_ceil = idx_floor + 1
                # Interpolation factor (0.0 to 1.0)
                alpha = exact_frame_idx - idx_floor
                # Clamp indices to valid range
                idx_floor = max(0, min(idx_floor, len(framing_plan.crop_regions) - 1))
                idx_ceil = max(0, min(idx_ceil, len(framing_plan.crop_regions) - 1))
                # Get crop regions
                crop1 = framing_plan.crop_regions[idx_floor]
                crop2 = framing_plan.crop_regions[idx_ceil]
                # Linear interpolation between crop regions
                x = int(crop1.x * (1 - alpha) + crop2.x * alpha)
                y = int(crop1.y * (1 - alpha) + crop2.y * alpha)
                width = int(crop1.width * (1 - alpha) + crop2.width * alpha)
                height = int(crop1.height * (1 - alpha) + crop2.height * alpha)
                # Ensure crop stays within frame bounds
                h, w = frame.shape[:2]
                x = max(0, min(x, w - width))
                y = max(0, min(y, h - height))
                width = min(width, w - x)
                height = min(height, h - y)
                # Crop the frame
                cropped = frame[y:y + height, x:x + width]
            # Resize to target dimensions
            resized = cv2.resize(
                cropped,
                (self.target_width, self.target_height),
                interpolation=cv2.INTER_LINEAR
            )
            return resized

        # MoviePy 2.x compatible way to create VideoClip
        new_clip = VideoClip(duration=video_clip.duration)
        new_clip.size = (self.target_width, self.target_height)
        new_clip.frame_function = make_frame
        return new_clip
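
    # Note (illustrative): in _apply_single_framing above, with fps=30 and frame_skip=2,
    # a request at t=1.05 s gives exact_frame_idx = (1.05 * 30) / 2 = 15.75, so the crop
    # is blended 25%/75% between analyzed frames 15 and 16 (alpha = 0.75).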

    def _apply_split_screen(
        self,
        video_clip: VideoFileClip,
        framing_plan: FramingPlan
    ) -> VideoClip:
        """
        Apply split screen for two people.

        Args:
            video_clip: Source video clip
            framing_plan: Framing plan

        Returns:
            Split screen video clip
        """
        def make_frame(t):
            frame = video_clip.get_frame(t)
            # Calculate exact frame index with decimal precision for smooth interpolation
            exact_frame_idx = (t * framing_plan.fps) / self.frame_skip
            frame_idx = int(exact_frame_idx)
            # Ensure we have valid contexts
            if not framing_plan.frame_contexts:
                # Fallback to simple center crop
                h, w = frame.shape[:2]
                crop_h = int(w * self.target_aspect)
                crop_w = w
                if crop_h > h:
                    crop_h = h
                    crop_w = int(h / self.target_aspect)
                y = (h - crop_h) // 2
                x = (w - crop_w) // 2
                cropped = frame[y:y + crop_h, x:x + crop_w]
                return cv2.resize(cropped, (self.target_width, self.target_height), interpolation=cv2.INTER_LINEAR)
            # Clamp index to valid range
            frame_idx = max(0, min(frame_idx, len(framing_plan.frame_contexts) - 1))
            context = framing_plan.frame_contexts[frame_idx]
            # Create output frame
            output = np.zeros((self.target_height, self.target_width, 3), dtype=np.uint8)
            if len(context.detected_faces) >= 2:
                # Split vertically 50/50 (two columns)
                half_width = self.target_width // 2
                # Select the 2 most relevant faces
                # Priority: ALWAYS show active speaker first + most confident other person
                if context.active_speakers and len(context.active_speakers) >= 1:
                    # Get the PRIMARY speaker (most confident among active speakers)
                    speaker_faces = [context.detected_faces[i] for i in context.active_speakers
                                     if i < len(context.detected_faces)]
                    primary_speaker = max(speaker_faces, key=lambda f: f.confidence)
                    # Get OTHER faces (not the primary speaker)
                    other_faces = [f for f in context.detected_faces if f != primary_speaker]
                    if len(speaker_faces) >= 2:
                        # Multiple speakers: show primary + second most confident speaker
                        other_speakers = [f for f in speaker_faces if f != primary_speaker]
                        secondary_person = max(other_speakers, key=lambda f: f.confidence)
                    elif other_faces:
                        # One speaker: show speaker + most confident other person
                        secondary_person = max(other_faces, key=lambda f: f.confidence)
                    else:
                        # Fallback: only one person detected
                        secondary_person = primary_speaker
                    selected_faces = [primary_speaker, secondary_person]
                else:
                    # No speakers: take 2 most confident faces
                    selected_faces = sorted(context.detected_faces, key=lambda f: f.confidence, reverse=True)[:2]
                # Sort selected faces by horizontal position for consistent left/right placement
                faces = sorted(selected_faces, key=lambda f: f.center_x)
                left_face = faces[0]
                right_face = faces[1]
                # Process each person's frame
                for idx, face in enumerate([left_face, right_face]):
                    # Calculate crop region focused on this person.
                    # Each person gets half the output width at full output height
                    # (half_width x target_height), and the source crop keeps that same
                    # aspect ratio, so resizing introduces NO distortion.
                    half_width = self.target_width // 2
                    half_aspect = self.target_height / half_width  # Aspect ratio (height/width) of each half
                    # Determine crop size based on face with padding
                    face_width = max(face.width, frame.shape[1] // 4)  # At least 1/4 of frame width
                    crop_width = int(face_width * 2.5)  # Add padding around face
                    crop_height = int(crop_width * half_aspect)  # Maintain the half's aspect
                    # Ensure crop fits in frame, maintaining aspect ratio
                    max_crop_width = frame.shape[1] // 2  # Half the source width
                    max_crop_height = frame.shape[0]  # Full source height
                    # If crop is too wide, scale down proportionally
                    if crop_width > max_crop_width:
                        crop_width = max_crop_width
                        crop_height = int(crop_width * half_aspect)
                    # If crop is too tall, scale down proportionally
                    if crop_height > max_crop_height:
                        crop_height = max_crop_height
                        crop_width = int(crop_height / half_aspect)
                    # Center crop on face
                    x = max(0, face.center_x - crop_width // 2)
                    y = max(0, face.center_y - crop_height // 2)
                    # Clamp to frame boundaries
                    x = min(x, frame.shape[1] - crop_width)
                    y = min(y, frame.shape[0] - crop_height)
                    # Extract and resize crop
                    cropped = frame[y:y + crop_height, x:x + crop_width]
                    resized = cv2.resize(
                        cropped,
                        (half_width, self.target_height),
                        interpolation=cv2.INTER_LINEAR
                    )
                    # Place in output at appropriate horizontal position
                    x_offset = idx * half_width
                    output[:, x_offset:x_offset + half_width] = resized
            else:
                # Fall back to single framing
                if framing_plan.crop_regions:
                    crop_idx = max(0, min(frame_idx, len(framing_plan.crop_regions) - 1))
                    crop = framing_plan.crop_regions[crop_idx]
                    cropped = frame[crop.y:crop.y + crop.height, crop.x:crop.x + crop.width]
                else:
                    # Fallback to center crop if no crop regions available
                    h, w = frame.shape[:2]
                    crop_h = int(w * self.target_aspect)
                    crop_w = w
                    if crop_h > h:
                        crop_h = h
                        crop_w = int(h / self.target_aspect)
                    y = (h - crop_h) // 2
                    x = (w - crop_w) // 2
                    cropped = frame[y:y + crop_h, x:x + crop_w]
                output = cv2.resize(
                    cropped,
                    (self.target_width, self.target_height),
                    interpolation=cv2.INTER_LINEAR
                )
            return output

        # MoviePy 2.x compatible way to create VideoClip
        new_clip = VideoClip(duration=video_clip.duration)
        new_clip.size = (self.target_width, self.target_height)
        new_clip.frame_function = make_frame
        return new_clip
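
    # Note (illustrative): with the default 1080x1920 output, each split-screen half is
    # 540x1920 (half_aspect ≈ 3.56); for a 1920x1080 source the per-person crop ends up
    # clamped to roughly 303x1080 around the face before being resized to 540x1920.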

    def _apply_grid_layout(
        self,
        video_clip: VideoFileClip,
        framing_plan: FramingPlan
    ) -> VideoClip:
        """
        Apply grid layout for 3+ people.

        Args:
            video_clip: Source video clip
            framing_plan: Framing plan

        Returns:
            Grid layout video clip
        """
        def make_frame(t):
            frame = video_clip.get_frame(t)
            # Calculate exact frame index with decimal precision for smooth interpolation
            exact_frame_idx = (t * framing_plan.fps) / self.frame_skip
            frame_idx = int(exact_frame_idx)
            # Ensure we have valid contexts
            if not framing_plan.frame_contexts:
                # Fallback to simple center crop
                h, w = frame.shape[:2]
                crop_h = int(w * self.target_aspect)
                crop_w = w
                if crop_h > h:
                    crop_h = h
                    crop_w = int(h / self.target_aspect)
                y = (h - crop_h) // 2
                x = (w - crop_w) // 2
                cropped = frame[y:y + crop_h, x:x + crop_w]
                return cv2.resize(cropped, (self.target_width, self.target_height), interpolation=cv2.INTER_LINEAR)
            # Clamp index to valid range
            frame_idx = max(0, min(frame_idx, len(framing_plan.frame_contexts) - 1))
            context = framing_plan.frame_contexts[frame_idx]
            output = np.zeros((self.target_height, self.target_width, 3), dtype=np.uint8)
            num_faces = len(context.detected_faces)
            if num_faces >= 3:
                # Create 2x2 grid
                cell_width = self.target_width // 2
                cell_height = self.target_height // 2
                for idx, face in enumerate(context.detected_faces[:4]):
                    # Calculate grid position
                    row = idx // 2
                    col = idx % 2
                    # Each grid cell keeps the output aspect ratio (with the 1080x1920
                    # defaults a cell is 540x960, i.e. 9:16 again, not square)
                    cell_aspect = cell_height / cell_width
                    # Crop around face with correct aspect ratio
                    crop_width = frame.shape[1] // 2
                    crop_height = int(crop_width * cell_aspect)
                    # Ensure crop fits in frame, maintaining aspect
                    max_crop_width = frame.shape[1] // 2
                    max_crop_height = frame.shape[0] // 2
                    if crop_width > max_crop_width:
                        crop_width = max_crop_width
                        crop_height = int(crop_width * cell_aspect)
                    if crop_height > max_crop_height:
                        crop_height = max_crop_height
                        crop_width = int(crop_height / cell_aspect)
                    # Center crop on face
                    x = max(0, face.center_x - crop_width // 2)
                    y = max(0, face.center_y - crop_height // 2)
                    # Clamp to frame boundaries
                    x = min(x, frame.shape[1] - crop_width)
                    y = min(y, frame.shape[0] - crop_height)
                    cropped = frame[y:y + crop_height, x:x + crop_width]
                    resized = cv2.resize(
                        cropped,
                        (cell_width, cell_height),
                        interpolation=cv2.INTER_LINEAR
                    )
                    # Place in grid
                    y_offset = row * cell_height
                    x_offset = col * cell_width
                    output[y_offset:y_offset + cell_height, x_offset:x_offset + cell_width] = resized
            else:
                # Fall back to single framing
                if framing_plan.crop_regions:
                    crop_idx = max(0, min(frame_idx, len(framing_plan.crop_regions) - 1))
                    crop = framing_plan.crop_regions[crop_idx]
                    cropped = frame[crop.y:crop.y + crop.height, crop.x:crop.x + crop.width]
                else:
                    # Fallback to center crop if no crop regions available
                    h, w = frame.shape[:2]
                    crop_h = int(w * self.target_aspect)
                    crop_w = w
                    if crop_h > h:
                        crop_h = h
                        crop_w = int(h / self.target_aspect)
                    y = (h - crop_h) // 2
                    x = (w - crop_w) // 2
                    cropped = frame[y:y + crop_h, x:x + crop_w]
                output = cv2.resize(
                    cropped,
                    (self.target_width, self.target_height),
                    interpolation=cv2.INTER_LINEAR
                )
            return output

        # MoviePy 2.x compatible way to create VideoClip
        new_clip = VideoClip(duration=video_clip.duration)
        new_clip.size = (self.target_width, self.target_height)
        new_clip.frame_function = make_frame
        return new_clip
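
    # Note (illustrative): with the default 1080x1920 output each grid cell is 540x960,
    # so for a 1920x1080 source each face crop is clamped to roughly 303x540 before
    # being resized into its cell; with only 3 faces the fourth cell stays black.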


def extract_audio_samples(video_path: str, start_time: float, end_time: float) -> Optional[np.ndarray]:
    """
    Extract audio samples from video for speech detection.

    Args:
        video_path: Path to video file
        start_time: Start time in seconds
        end_time: End time in seconds

    Returns:
        Audio samples array or None if no audio
    """
    try:
        from moviepy.audio.io.AudioFileClip import AudioFileClip
        with AudioFileClip(video_path) as audio:
            segment = audio.subclipped(start_time, end_time)
            fps = getattr(segment, 'fps', 44100)
            samples = segment.to_soundarray(fps=fps)
            return samples
    except Exception as exc:
        logger.warning(f"Failed to extract audio: {exc}")
        return None
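

if __name__ == "__main__":
    # Minimal usage sketch (not part of the original module): assumes a local
    # "input.mp4" exists and shows how the pieces above are typically wired together.
    source = VideoFileClip("input.mp4")
    samples = extract_audio_samples("input.mp4", start_time=0.0, end_time=10.0)
    framer = SmartFramer(target_width=1080, target_height=1920, frame_skip=2)
    plan = framer.create_framing_plan("input.mp4", start_time=0.0, end_time=10.0, audio_samples=samples)
    vertical = framer.apply_framing(source.subclipped(0.0, 10.0), plan, use_split_screen=True)
    vertical.write_videofile("vertical_preview.mp4", fps=plan.fps, codec="libx264")
    source.close()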