Makes several adjustments to improve tracking and video rendering

This commit is contained in:
LeoMortari
2025-12-18 02:26:25 -03:00
parent 78e35d65fd
commit 07d301f110
11 changed files with 984 additions and 316 deletions


@@ -46,21 +46,20 @@ class SmartFramer:
self,
target_width: int = 1080,
target_height: int = 1920,
frame_skip: int = 2,
smoothing_window: int = 15
frame_skip: int = 1,
smoothing_window: int = 30,
max_velocity: int = 20,
person_switch_cooldown: int = 999999
):
self.target_width = target_width
self.target_height = target_height
self.target_aspect = target_height / target_width
# Performance parameters
self.frame_skip = frame_skip # Process every Nth frame (CPU optimization)
# Smoothing parameters
self.frame_skip = frame_skip
self.smoothing_window = smoothing_window
self.max_velocity = 30 # pixels per frame (reduced for smoother transitions)
self.max_velocity = max_velocity
self.person_switch_cooldown = person_switch_cooldown
logger.info(f"Smart framer initialized (target: {target_width}x{target_height}, frame_skip={frame_skip})")
logger.info(f"Smart framer initialized (target: {target_width}x{target_height}, frame_skip={frame_skip}, smoothing={smoothing_window}, velocity={max_velocity}, cooldown={person_switch_cooldown})")
def create_framing_plan(
self,
@@ -81,25 +80,21 @@ class SmartFramer:
Returns:
FramingPlan with all frame contexts and crop regions
"""
analyzer = ContextAnalyzer()
analyzer = ContextAnalyzer(person_switch_cooldown=self.person_switch_cooldown)
# Detect speaking periods from audio if available
speaking_periods = None
if audio_samples is not None:
speaking_periods = analyzer.audio_detector.detect_speaking_periods(audio_samples)
# Open video with error suppression for AV1 codec warnings
import os
os.environ['OPENCV_FFMPEG_CAPTURE_OPTIONS'] = 'loglevel;quiet'
cap = cv2.VideoCapture(video_path)
fps = cap.get(cv2.CAP_PROP_FPS)
# Calculate frame range
start_frame = int(start_time * fps)
end_frame = int(end_time * fps)
# Set to start frame
cap.set(cv2.CAP_PROP_POS_FRAMES, start_frame)
frame_contexts = []
@@ -113,7 +108,6 @@ class SmartFramer:
if not ret:
break
# Only process every Nth frame for performance (CPU optimization)
if processed_count % self.frame_skip == 0:
timestamp = frame_number / fps
context = analyzer.analyze_frame(frame, timestamp, frame_number, speaking_periods)
@@ -122,35 +116,36 @@ class SmartFramer:
frame_number += 1
processed_count += 1
# Get video dimensions before releasing capture
source_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
source_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
cap.release()
analyzer.close()
# Determine overall layout mode (most common)
layout_modes = [ctx.layout_mode for ctx in frame_contexts]
if layout_modes:
overall_layout = max(set(layout_modes), key=layout_modes.count)
else:
overall_layout = "single"
# Calculate crop regions based on contexts
crop_regions = self._calculate_crop_regions(
frame_contexts,
source_width,
source_height
)
return FramingPlan(
framing_plan = FramingPlan(
frame_contexts=frame_contexts,
crop_regions=crop_regions,
layout_mode=overall_layout,
fps=fps
)
import gc
gc.collect()
return framing_plan
def _calculate_crop_regions(
self,
contexts: List[FrameContext],
@@ -171,66 +166,122 @@ class SmartFramer:
if not contexts:
return []
# Calculate ideal crop dimensions maintaining EXACT 9:16 aspect ratio
source_aspect = source_width / source_height
if source_aspect > self.target_aspect:
# Source is wider - crop horizontally (use full height)
crop_height = source_height
crop_width = int(crop_height / self.target_aspect)
# Ensure crop width fits within source
if crop_width > source_width:
crop_width = source_width
crop_height = int(crop_width * self.target_aspect)
else:
# Source is taller - crop vertically (use full width)
crop_width = source_width
crop_height = int(crop_width * self.target_aspect)
# Ensure crop height fits within source
if crop_height > source_height:
crop_height = source_height
crop_width = int(crop_height / self.target_aspect)
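# --- Editor's worked example (illustrative numbers, not in the commit) ---
# For a 1920x1080 source and a 1080x1920 target:
#   target_aspect = 1920 / 1080 ≈ 1.7778
#   source_aspect = 1920 / 1080 ≈ 1.7778  -> not greater, so the full-width branch runs
#   crop_width  = 1920
#   crop_height = int(1920 * 1.7778) = 3413  -> exceeds the 1080 px source height, clamp
#   crop_height = 1080
#   crop_width  = int(1080 / 1.7778) = 607   -> final crop window is 607x1080, i.e. 9:16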
# Calculate center points for each frame
# Since we now always focus on ONE person directly (not averaging),
# we can use the focus point directly without complex validation
center_xs = []
center_ys = []
safe_zone_margin_x = crop_width * 0.40
safe_zone_margin_y = crop_height * 0.40
for ctx in contexts:
if ctx.primary_focus:
# Primary focus is now always a single person's center, never averaged
# This means it will never be on the table/empty space
center_xs.append(ctx.primary_focus[0])
center_ys.append(ctx.primary_focus[1])
dead_zone_threshold = 100
if contexts and contexts[0].primary_focus:
current_crop_center_x = contexts[0].primary_focus[0]
current_crop_center_y = contexts[0].primary_focus[1]
else:
current_crop_center_x = source_width // 2
current_crop_center_y = source_height // 2
center_xs = [current_crop_center_x]
center_ys = [current_crop_center_y]
for ctx in contexts[1:]:
if ctx.primary_focus and ctx.selected_people and len(ctx.detected_faces) > 0:
primary_person_idx = ctx.selected_people[0] if ctx.selected_people else 0
if primary_person_idx < len(ctx.detected_faces):
face = ctx.detected_faces[primary_person_idx]
face_left = face.x
face_right = face.x + face.width
face_top = face.y
face_bottom = face.y + face.height
crop_left = current_crop_center_x - crop_width // 2
crop_right = current_crop_center_x + crop_width // 2
crop_top = current_crop_center_y - crop_height // 2
crop_bottom = current_crop_center_y + crop_height // 2
face_rel_left = face_left - crop_left
face_rel_right = face_right - crop_left
face_rel_top = face_top - crop_top
face_rel_bottom = face_bottom - crop_top
face_left_safe = face_rel_left >= safe_zone_margin_x
face_right_safe = face_rel_right <= (crop_width - safe_zone_margin_x)
face_top_safe = face_rel_top >= safe_zone_margin_y
face_bottom_safe = face_rel_bottom <= (crop_height - safe_zone_margin_y)
face_fully_visible = face_left_safe and face_right_safe and face_top_safe and face_bottom_safe
if face_fully_visible:
center_xs.append(current_crop_center_x)
center_ys.append(current_crop_center_y)
else:
shift_x = 0
shift_y = 0
if not face_left_safe:
shift_x = face_rel_left - safe_zone_margin_x
elif not face_right_safe:
shift_x = face_rel_right - (crop_width - safe_zone_margin_x)
if not face_top_safe:
shift_y = face_rel_top - safe_zone_margin_y
elif not face_bottom_safe:
shift_y = face_rel_bottom - (crop_height - safe_zone_margin_y)
if abs(shift_x) > dead_zone_threshold:
current_crop_center_x += shift_x
if abs(shift_y) > dead_zone_threshold:
current_crop_center_y += shift_y
center_xs.append(current_crop_center_x)
center_ys.append(current_crop_center_y)
else:
center_xs.append(current_crop_center_x)
center_ys.append(current_crop_center_y)
else:
# Default to center only if no faces detected at all
center_xs.append(source_width // 2)
center_ys.append(source_height // 2)
center_xs.append(current_crop_center_x)
center_ys.append(current_crop_center_y)
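# --- Editor's sketch (hypothetical helper, not part of this commit) ---
# Condenses the per-axis rule applied by the loop above: keep the crop still while the
# primary face stays inside the 40% safe zone, and only re-center when the required
# shift exceeds the dead-zone threshold (100 px in the code above).
def follow_axis(face_lo: float, face_hi: float, crop_center: float,
                crop_size: int, margin: float, dead_zone: float = 100) -> float:
    crop_lo = crop_center - crop_size // 2
    rel_lo = face_lo - crop_lo                  # face edges relative to the crop window
    rel_hi = face_hi - crop_lo
    shift = 0.0
    if rel_lo < margin:                         # face drifting past the leading margin
        shift = rel_lo - margin                 # negative: pull the crop back
    elif rel_hi > crop_size - margin:           # face drifting past the trailing margin
        shift = rel_hi - (crop_size - margin)   # positive: push the crop forward
    return crop_center + shift if abs(shift) > dead_zone else crop_center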
# Smooth the center points
if len(center_xs) > self.smoothing_window:
kernel_size = min(self.smoothing_window, len(center_xs))
if kernel_size % 2 == 0:
kernel_size -= 1
if len(center_xs) > 1:
alpha = 0.002
smoothed_xs = [center_xs[0]]
smoothed_ys = [center_ys[0]]
for i in range(1, len(center_xs)):
if center_xs[i] != center_xs[i-1] or center_ys[i] != center_ys[i-1]:
smoothed_xs.append(alpha * center_xs[i] + (1 - alpha) * smoothed_xs[i-1])
smoothed_ys.append(alpha * center_ys[i] + (1 - alpha) * smoothed_ys[i-1])
else:
smoothed_xs.append(smoothed_xs[i-1])
smoothed_ys.append(smoothed_ys[i-1])
center_xs = smoothed_xs
center_ys = smoothed_ys
center_xs = signal.medfilt(center_xs, kernel_size=kernel_size).tolist()
center_ys = signal.medfilt(center_ys, kernel_size=kernel_size).tolist()
center_xs = self._limit_velocity(center_xs, 2)
center_ys = self._limit_velocity(center_ys, 2)
# Limit velocity (prevent jarring movements)
center_xs = self._limit_velocity(center_xs, self.max_velocity)
center_ys = self._limit_velocity(center_ys, self.max_velocity)
center_xs = self._apply_dead_zone(center_xs, 5)
center_ys = self._apply_dead_zone(center_ys, 5)
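# --- Editor's note (illustrative arithmetic, not in the commit) ---
# With alpha = 0.002 the exponential smoothing above is extremely heavy: for a sudden
# 500 px jump of the target center, the smoothed value covers only
#   500 * (1 - (1 - 0.002)**30) ≈ 29 px after 30 frames (about 1 s at 30 fps),
# so the virtual camera drifts slowly toward a new position instead of cutting.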
# Convert to crop regions
crop_regions = []
for center_x, center_y in zip(center_xs, center_ys):
# Calculate top-left corner
x = int(center_x - crop_width // 2)
y = int(center_y - crop_height // 2)
# Clamp to valid bounds
x = max(0, min(x, source_width - crop_width))
y = max(0, min(y, source_height - crop_height))
@@ -241,8 +292,37 @@ class SmartFramer:
height=crop_height
))
center_xs.clear()
center_ys.clear()
return crop_regions
def _apply_dead_zone(self, positions: List[float], threshold: float) -> List[float]:
"""
Apply dead zone to eliminate micro-movements.
If change is smaller than threshold, keep previous position.
Args:
positions: List of positions
threshold: Minimum change needed to move (pixels)
Returns:
Positions with dead zone applied
"""
if len(positions) <= 1:
return positions
filtered = [positions[0]]
for i in range(1, len(positions)):
delta = abs(positions[i] - filtered[i - 1])
if delta < threshold:
filtered.append(filtered[i - 1])
else:
filtered.append(positions[i])
return filtered
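# --- Editor's example (illustrative) ---
#   _apply_dead_zone([100, 102, 104, 120], threshold=5) -> [100, 100, 100, 120]
# The 2-4 px jitter is frozen at the first value, while the 20 px move passes through,
# because each delta is measured against the last *filtered* position.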
def _limit_velocity(self, positions: List[float], max_velocity: float) -> List[float]:
"""
Limit the velocity of position changes.
@@ -271,33 +351,20 @@ class SmartFramer:
def apply_framing(
self,
video_clip: VideoFileClip,
framing_plan: FramingPlan,
use_split_screen: bool = False
framing_plan: FramingPlan
) -> VideoClip:
"""
Apply smart framing to a video clip.
Always uses single-person focus (no split screen).
Args:
video_clip: Source video clip
framing_plan: Framing plan to apply
use_split_screen: Whether to use split screen for multiple people
Returns:
Reframed video clip
"""
# Handle different layout modes
if framing_plan.layout_mode in ["single", "single_speaker"]:
# Single person or single speaker - use focused single framing
return self._apply_single_framing(video_clip, framing_plan)
elif framing_plan.layout_mode == "dual_split" and use_split_screen:
# Two people in conversation - use split screen
return self._apply_split_screen(video_clip, framing_plan)
elif framing_plan.layout_mode == "grid" and use_split_screen:
# 3+ people - use grid layout
return self._apply_grid_layout(video_clip, framing_plan)
else:
# Fallback to single framing
return self._apply_single_framing(video_clip, framing_plan)
return self._apply_single_framing(video_clip, framing_plan)
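# --- Editor's usage sketch (hypothetical variable names, not part of this commit) ---
# With the split-screen branches removed, apply_framing always delegates to
# _apply_single_framing, so callers no longer pass use_split_screen.
vertical_clip = framer.apply_framing(source_clip, framing_plan)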
def _apply_single_framing(
self,
@@ -315,12 +382,9 @@ class SmartFramer:
Reframed video clip
"""
def make_frame(t):
# Get the original frame
frame = video_clip.get_frame(t)
# Ensure we have valid crop regions
if not framing_plan.crop_regions:
# Fallback: return center crop
h, w = frame.shape[:2]
crop_h = int(w * self.target_aspect)
crop_w = w
@@ -331,41 +395,32 @@ class SmartFramer:
x = (w - crop_w) // 2
cropped = frame[y:y + crop_h, x:x + crop_w]
else:
# Calculate exact frame index with decimal precision for interpolation
exact_frame_idx = (t * framing_plan.fps) / self.frame_skip
# Get the two adjacent analyzed frames
idx_floor = int(exact_frame_idx)
idx_ceil = idx_floor + 1
# Interpolation factor (0.0 to 1.0)
alpha = exact_frame_idx - idx_floor
# Clamp indices to valid range
idx_floor = max(0, min(idx_floor, len(framing_plan.crop_regions) - 1))
idx_ceil = max(0, min(idx_ceil, len(framing_plan.crop_regions) - 1))
# Get crop regions
crop1 = framing_plan.crop_regions[idx_floor]
crop2 = framing_plan.crop_regions[idx_ceil]
# Linear interpolation between crop regions
x = int(crop1.x * (1 - alpha) + crop2.x * alpha)
y = int(crop1.y * (1 - alpha) + crop2.y * alpha)
width = int(crop1.width * (1 - alpha) + crop2.width * alpha)
height = int(crop1.height * (1 - alpha) + crop2.height * alpha)
# Ensure crop stays within frame bounds
h, w = frame.shape[:2]
x = max(0, min(x, w - width))
y = max(0, min(y, h - height))
width = min(width, w - x)
height = min(height, h - y)
# Crop the frame
cropped = frame[y:y + height, x:x + width]
# Resize to target dimensions
resized = cv2.resize(
cropped,
(self.target_width, self.target_height),
@@ -374,7 +429,6 @@ class SmartFramer:
return resized
# MoviePy 2.x compatible way to create VideoClip
new_clip = VideoClip(duration=video_clip.duration)
new_clip.size = (self.target_width, self.target_height)
new_clip.frame_function = make_frame
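# --- Editor's worked example (illustrative numbers, not in the commit) ---
# With fps = 30 and frame_skip = 2, a request at t = 0.25 s gives
#   exact_frame_idx = (0.25 * 30) / 2 = 3.75,
# so the crop is blended from analyzed frames 3 and 4 with alpha = 0.75:
#   x = int(crop_regions[3].x * 0.25 + crop_regions[4].x * 0.75)
# keeping the crop moving smoothly between sparsely analyzed frames.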
@@ -397,13 +451,10 @@ class SmartFramer:
"""
def make_frame(t):
frame = video_clip.get_frame(t)
# Calculate exact frame index with decimal precision for smooth interpolation
exact_frame_idx = (t * framing_plan.fps) / self.frame_skip
frame_idx = int(exact_frame_idx)
# Ensure we have valid contexts
if not framing_plan.frame_contexts:
# Fallback to simple center crop
h, w = frame.shape[:2]
crop_h = int(w * self.target_aspect)
crop_w = w
@@ -415,107 +466,81 @@ class SmartFramer:
cropped = frame[y:y + crop_h, x:x + crop_w]
return cv2.resize(cropped, (self.target_width, self.target_height), interpolation=cv2.INTER_LINEAR)
# Clamp index to valid range
frame_idx = max(0, min(frame_idx, len(framing_plan.frame_contexts) - 1))
context = framing_plan.frame_contexts[frame_idx]
# Create output frame
output = np.zeros((self.target_height, self.target_width, 3), dtype=np.uint8)
if len(context.detected_faces) >= 2:
# Split vertically 50/50 (two columns)
half_width = self.target_width // 2
if context.selected_people and len(context.selected_people) >= 2:
selected_faces = [context.detected_faces[i] for i in context.selected_people[:2]
if i < len(context.detected_faces)]
# Select the 2 most relevant faces
# Priority: ALWAYS show active speaker first + most confident other person
if context.active_speakers and len(context.active_speakers) >= 1:
# Get the PRIMARY speaker (most confident among active speakers)
speaker_faces = [context.detected_faces[i] for i in context.active_speakers
if i < len(context.detected_faces)]
if len(selected_faces) >= 2:
faces = sorted(selected_faces, key=lambda f: f.center_x)
left_face = faces[0]
right_face = faces[1]
primary_speaker = max(speaker_faces, key=lambda f: f.confidence)
for idx, face in enumerate([left_face, right_face]):
# Get OTHER faces (not the primary speaker)
other_faces = [f for f in context.detected_faces if f != primary_speaker]
half_width = self.target_width // 2
half_aspect = self.target_height / half_width # Aspect ratio for half
if len(speaker_faces) >= 2:
# Multiple speakers: show primary + second most confident speaker
other_speakers = [f for f in speaker_faces if f != primary_speaker]
secondary_person = max(other_speakers, key=lambda f: f.confidence)
elif other_faces:
# One speaker: show speaker + most confident other person
secondary_person = max(other_faces, key=lambda f: f.confidence)
else:
# Fallback: only one person detected
secondary_person = primary_speaker
face_width = max(face.width, frame.shape[1] // 4) # At least 1/4 of frame width
crop_width = int(face_width * 2.5) # Add padding around face
crop_height = int(crop_width * half_aspect) # Maintain correct aspect
selected_faces = [primary_speaker, secondary_person]
max_crop_width = frame.shape[1] // 2 # Half the source width
max_crop_height = frame.shape[0] # Full source height
if crop_width > max_crop_width:
crop_width = max_crop_width
crop_height = int(crop_width * half_aspect)
if crop_height > max_crop_height:
crop_height = max_crop_height
crop_width = int(crop_height / half_aspect)
x = max(0, face.center_x - crop_width // 2)
y = max(0, face.center_y - crop_height // 2)
x = min(x, frame.shape[1] - crop_width)
y = min(y, frame.shape[0] - crop_height)
cropped = frame[y:y + crop_height, x:x + crop_width]
resized = cv2.resize(
cropped,
(half_width, self.target_height),
interpolation=cv2.INTER_LINEAR
)
x_offset = idx * half_width
output[:, x_offset:x_offset + half_width] = resized
else:
# No speakers: take 2 most confident faces
selected_faces = sorted(context.detected_faces, key=lambda f: f.confidence, reverse=True)[:2]
# Sort selected faces by horizontal position for consistent left/right placement
faces = sorted(selected_faces, key=lambda f: f.center_x)
left_face = faces[0]
right_face = faces[1]
# Process each person's frame
for idx, face in enumerate([left_face, right_face]):
# Calculate crop region focused on this person
# Each person gets half the width, full target aspect ratio (9:16)
# This ensures NO distortion when resizing
# For split screen: each side is half_width x full_height
# We need to maintain 9:16 aspect for each half
half_width = self.target_width // 2
half_aspect = self.target_height / half_width # Aspect ratio for half
# Determine crop size based on face with padding
face_width = max(face.width, frame.shape[1] // 4) # At least 1/4 of frame width
crop_width = int(face_width * 2.5) # Add padding around face
crop_height = int(crop_width * half_aspect) # Maintain correct aspect
# Ensure crop fits in frame, maintaining aspect ratio
max_crop_width = frame.shape[1] // 2 # Half the source width
max_crop_height = frame.shape[0] # Full source height
# If crop is too wide, scale down proportionally
if crop_width > max_crop_width:
crop_width = max_crop_width
crop_height = int(crop_width * half_aspect)
# If crop is too tall, scale down proportionally
if crop_height > max_crop_height:
crop_height = max_crop_height
crop_width = int(crop_height / half_aspect)
# Center crop on face
x = max(0, face.center_x - crop_width // 2)
y = max(0, face.center_y - crop_height // 2)
# Clamp to frame boundaries
x = min(x, frame.shape[1] - crop_width)
y = min(y, frame.shape[0] - crop_height)
# Extract and resize crop
cropped = frame[y:y + crop_height, x:x + crop_width]
resized = cv2.resize(
if framing_plan.crop_regions:
crop_idx = max(0, min(frame_idx, len(framing_plan.crop_regions) - 1))
crop = framing_plan.crop_regions[crop_idx]
cropped = frame[crop.y:crop.y + crop.height, crop.x:crop.x + crop.width]
else:
h, w = frame.shape[:2]
crop_h = int(w * self.target_aspect)
crop_w = w
if crop_h > h:
crop_h = h
crop_w = int(h / self.target_aspect)
y = (h - crop_h) // 2
x = (w - crop_w) // 2
cropped = frame[y:y + crop_h, x:x + crop_w]
output = cv2.resize(
cropped,
(half_width, self.target_height),
(self.target_width, self.target_height),
interpolation=cv2.INTER_LINEAR
)
# Place in output at appropriate horizontal position
x_offset = idx * half_width
output[:, x_offset:x_offset + half_width] = resized
else:
# Fall back to single framing
if framing_plan.crop_regions:
crop_idx = max(0, min(frame_idx, len(framing_plan.crop_regions) - 1))
crop = framing_plan.crop_regions[crop_idx]
cropped = frame[crop.y:crop.y + crop.height, crop.x:crop.x + crop.width]
else:
# Fallback to center crop if no crop regions available
h, w = frame.shape[:2]
crop_h = int(w * self.target_aspect)
crop_w = w
@@ -533,7 +558,6 @@ class SmartFramer:
return output
# MoviePy 2.x compatible way to create VideoClip
new_clip = VideoClip(duration=video_clip.duration)
new_clip.size = (self.target_width, self.target_height)
new_clip.frame_function = make_frame
@@ -556,13 +580,10 @@ class SmartFramer:
"""
def make_frame(t):
frame = video_clip.get_frame(t)
# Calculate exact frame index with decimal precision for smooth interpolation
exact_frame_idx = (t * framing_plan.fps) / self.frame_skip
frame_idx = int(exact_frame_idx)
# Ensure we have valid contexts
if not framing_plan.frame_contexts:
# Fallback to simple center crop
h, w = frame.shape[:2]
crop_h = int(w * self.target_aspect)
crop_w = w
@@ -574,7 +595,6 @@ class SmartFramer:
cropped = frame[y:y + crop_h, x:x + crop_w]
return cv2.resize(cropped, (self.target_width, self.target_height), interpolation=cv2.INTER_LINEAR)
# Clamp index to valid range
frame_idx = max(0, min(frame_idx, len(framing_plan.frame_contexts) - 1))
context = framing_plan.frame_contexts[frame_idx]
@@ -583,23 +603,18 @@ class SmartFramer:
num_faces = len(context.detected_faces)
if num_faces >= 3:
# Create 2x2 grid
cell_width = self.target_width // 2
cell_height = self.target_height // 2
for idx, face in enumerate(context.detected_faces[:4]):
# Calculate grid position
row = idx // 2
col = idx % 2
# Each grid cell maintains aspect ratio (square in this case: cell_width = cell_height)
cell_aspect = cell_height / cell_width
# Crop around face with correct aspect ratio
crop_width = frame.shape[1] // 2
crop_height = int(crop_width * cell_aspect)
# Ensure crop fits in frame, maintaining aspect
max_crop_width = frame.shape[1] // 2
max_crop_height = frame.shape[0] // 2
@@ -611,11 +626,9 @@ class SmartFramer:
crop_height = max_crop_height
crop_width = int(crop_height / cell_aspect)
# Center crop on face
x = max(0, face.center_x - crop_width // 2)
y = max(0, face.center_y - crop_height // 2)
# Clamp to frame boundaries
x = min(x, frame.shape[1] - crop_width)
y = min(y, frame.shape[0] - crop_height)
@@ -626,18 +639,15 @@ class SmartFramer:
interpolation=cv2.INTER_LINEAR
)
# Place in grid
y_offset = row * cell_height
x_offset = col * cell_width
output[y_offset:y_offset + cell_height, x_offset:x_offset + cell_width] = resized
else:
# Fall back to single framing
if framing_plan.crop_regions:
crop_idx = max(0, min(frame_idx, len(framing_plan.crop_regions) - 1))
crop = framing_plan.crop_regions[crop_idx]
cropped = frame[crop.y:crop.y + crop.height, crop.x:crop.x + crop.width]
else:
# Fallback to center crop if no crop regions available
h, w = frame.shape[:2]
crop_h = int(w * self.target_aspect)
crop_w = w
@@ -655,7 +665,6 @@ class SmartFramer:
return output
# MoviePy 2.x compatible way to create VideoClip
new_clip = VideoClip(duration=video_clip.duration)
new_clip.size = (self.target_width, self.target_height)
new_clip.frame_function = make_frame