Makes several adjustments to improve tracking and video rendering

This commit is contained in:
LeoMortari
2025-12-18 02:26:25 -03:00
parent 78e35d65fd
commit 07d301f110
11 changed files with 984 additions and 316 deletions


@@ -7,7 +7,7 @@ and identify who is speaking in video content using MediaPipe and audio analysis
from __future__ import annotations
import logging
from dataclasses import dataclass
from dataclasses import dataclass, field
from typing import List, Optional, Tuple
import cv2
@@ -50,20 +50,22 @@ class FrameContext:
active_speakers: List[int] # indices of speaking faces
primary_focus: Optional[Tuple[int, int]] # (x, y) center point
layout_mode: str # "single", "dual_split", "grid"
selected_people: List[int] = field(default_factory=list) # indices of people selected for display (max 2)
class MediaPipeDetector:
"""Face and pose detection using MediaPipe."""
"""Face and pose detection using MediaPipe with OpenCV Haar Cascade fallback."""
def __init__(self, min_detection_confidence: float = 0.5, min_tracking_confidence: float = 0.5):
def __init__(self, min_detection_confidence: float = 0.3, min_tracking_confidence: float = 0.3):
self.min_detection_confidence = min_detection_confidence
self.min_tracking_confidence = min_tracking_confidence
self.mp_face_detection = mp.solutions.face_detection
self.mp_face_mesh = mp.solutions.face_mesh
# MediaPipe detectors with lower confidence for better cartoon detection
self.face_detection = self.mp_face_detection.FaceDetection(
min_detection_confidence=min_detection_confidence,
model_selection=1
model_selection=0 # 0 = short-range model; in practice detects varied/stylized faces (including cartoons) more reliably
)
self.face_mesh = self.mp_face_mesh.FaceMesh(
@@ -73,11 +75,17 @@ class MediaPipeDetector:
static_image_mode=False
)
logger.info("MediaPipe detector initialized")
# OpenCV Haar Cascade as fallback for cartoon/anime faces
self.haar_cascade = cv2.CascadeClassifier(cv2.data.haarcascades + 'haarcascade_frontalface_default.xml')
# Alternative cascade for profile/side faces
self.haar_cascade_profile = cv2.CascadeClassifier(cv2.data.haarcascades + 'haarcascade_profileface.xml')
logger.info(f"Hybrid detector initialized (MediaPipe confidence={min_detection_confidence}, OpenCV Haar Cascade enabled)")
def detect_faces(self, frame: np.ndarray) -> List[FaceDetection]:
"""
Detect faces in a frame.
Detect faces in a frame using hybrid approach (MediaPipe + OpenCV Haar Cascade).
Args:
frame: RGB image array
@@ -94,6 +102,7 @@ class MediaPipeDetector:
else:
frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
# Try MediaPipe first
results = self.face_detection.process(frame_rgb)
faces = []
@@ -126,8 +135,111 @@ class MediaPipeDetector:
center_y=center_y
))
# Fallback to OpenCV Haar Cascade if MediaPipe found nothing
if not faces:
faces = self._detect_faces_haar_cascade(frame, width, height)
return faces
def _detect_faces_haar_cascade(self, frame: np.ndarray, width: int, height: int) -> List[FaceDetection]:
"""
Detect faces using OpenCV Haar Cascade (works better with cartoons/anime).
Args:
frame: Image frame (BGR format)
width: Frame width
height: Frame height
Returns:
List of detected faces
"""
# Convert to grayscale for Haar Cascade
if len(frame.shape) == 3:
gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
else:
gray = frame
# Detect frontal faces with more sensitive parameters
frontal_faces = self.haar_cascade.detectMultiScale(
gray,
scaleFactor=1.05, # More sensitive to size variations
minNeighbors=3, # Lower threshold for detection (more permissive)
minSize=(30, 30), # Smaller minimum size
flags=cv2.CASCADE_SCALE_IMAGE
)
# Also try profile faces
profile_faces = self.haar_cascade_profile.detectMultiScale(
gray,
scaleFactor=1.1,
minNeighbors=3,
minSize=(30, 30),
flags=cv2.CASCADE_SCALE_IMAGE
)
# Combine frontal and profile detections
all_faces = []
for (x, y, w, h) in frontal_faces:
x = max(0, min(x, width - 1))
y = max(0, min(y, height - 1))
w = min(w, width - x)
h = min(h, height - y)
center_x = x + w // 2
center_y = y + h // 2
all_faces.append(FaceDetection(
x=x,
y=y,
width=w,
height=h,
confidence=0.7, # Haar Cascade doesn't provide confidence, use fixed value
center_x=center_x,
center_y=center_y
))
for (x, y, w, h) in profile_faces:
# Check if this face overlaps significantly with any frontal face
overlap = False
for existing_face in all_faces:
# Measure how much this profile box overlaps the existing detection (relative to the profile box's own area, not true IoU)
x1_overlap = max(x, existing_face.x)
y1_overlap = max(y, existing_face.y)
x2_overlap = min(x + w, existing_face.x + existing_face.width)
y2_overlap = min(y + h, existing_face.y + existing_face.height)
if x1_overlap < x2_overlap and y1_overlap < y2_overlap:
overlap_area = (x2_overlap - x1_overlap) * (y2_overlap - y1_overlap)
face_area = w * h
if overlap_area / face_area > 0.3: # 30% overlap threshold
overlap = True
break
if not overlap:
x = max(0, min(x, width - 1))
y = max(0, min(y, height - 1))
w = min(w, width - x)
h = min(h, height - y)
center_x = x + w // 2
center_y = y + h // 2
all_faces.append(FaceDetection(
x=x,
y=y,
width=w,
height=h,
confidence=0.6, # Slightly lower confidence for profile
center_x=center_x,
center_y=center_y
))
if all_faces:
logger.debug(f"Haar Cascade detected {len(all_faces)} faces (MediaPipe failed)")
return all_faces
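Note (illustrative usage sketch): exercising the hybrid path end to end. The module name context_analyzer and the file test_frame.jpg are assumptions; the frame is passed as read by cv2.imread (BGR) and converted internally per the branch above.
import cv2
from context_analyzer import MediaPipeDetector  # hypothetical module name

detector = MediaPipeDetector(min_detection_confidence=0.3)
frame = cv2.imread("test_frame.jpg")  # assumed local test image (BGR)
faces = detector.detect_faces(frame)  # MediaPipe first, Haar cascade fallback if it finds nothing
for f in faces:
    print(f"({f.center_x}, {f.center_y}) {f.width}x{f.height} conf={f.confidence:.2f}")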
def detect_face_landmarks(self, frame: np.ndarray) -> List[FaceDetection]:
"""
Detect faces with landmarks for lip sync detection.
@@ -203,8 +315,8 @@ class AudioActivityDetector:
def detect_speaking_periods(
self,
audio_samples: np.ndarray,
threshold: float = 0.02,
min_speech_duration: float = 0.1
threshold: float = 0.01, # Reduced from 0.02 for better speech detection
min_speech_duration: float = 0.05 # Reduced from 0.1 to catch shorter utterances
) -> List[Tuple[float, float]]:
"""
Detect periods of speech in audio.
@@ -250,6 +362,16 @@ class AudioActivityDetector:
if end_time - start_time >= min_speech_duration:
periods.append((start_time, end_time))
# Log detected speech periods for debugging
if periods:
total_speech_time = sum(end - start for start, end in periods)
logger.info(f"Audio speech detection: {len(periods)} periods found, "
f"total {total_speech_time:.1f}s of speech (threshold={threshold})")
else:
max_energy = max(energies) if energies else 0
logger.warning(f"No speech detected! Max energy={max_energy:.4f}, threshold={threshold} "
f"(try lowering threshold if speech should be present)")
return periods
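Note (illustrative): the detection above is simple energy thresholding. A standalone sketch of the same idea, using the 0.01 threshold from this change and an assumed 50 ms window size:
import numpy as np

def speech_flags(samples: np.ndarray, sample_rate: int,
                 threshold: float = 0.01, window_s: float = 0.05):
    # Mark each fixed-size window whose RMS energy exceeds the threshold.
    win = max(1, int(sample_rate * window_s))
    flags = []
    for start in range(0, len(samples) - win + 1, win):
        rms = float(np.sqrt(np.mean(samples[start:start + win] ** 2)))
        flags.append((start / sample_rate, rms > threshold))
    return flags  # [(window_start_time, is_speech), ...]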
def is_speaking_at_time(self, speaking_periods: List[Tuple[float, float]], time: float) -> bool:
@@ -263,12 +385,29 @@ class AudioActivityDetector:
class ContextAnalyzer:
"""Analyzes video context to determine focus and layout."""
def __init__(self):
def __init__(self, person_switch_cooldown: int = 30):
self.detector = MediaPipeDetector()
self.audio_detector = AudioActivityDetector()
self.previous_faces: List[FaceDetection] = []
logger.info("Context analyzer initialized")
# Person tracking state
self.current_selected_people: List[int] = [] # Indices of people currently on screen
self.last_switch_frame: int = -999 # Frame when we last switched people
self.person_switch_cooldown = person_switch_cooldown # Minimum frames before switching
# Stability tracking to prevent flip-flopping
self.desired_people_history: List[List[int]] = [] # Track recent desired selections
self.stability_threshold = 20 # Frames needed to confirm a switch (increased for more stability)
self.last_switched_people: List[int] = [] # People we just switched FROM
# Focus stability: track recent focus points for temporal smoothing
self.focus_history: List[Tuple[int, int]] = []
self.focus_history_size: int = 5 # Keep last 5 focus points for smoothing
# Debug logging
self.frame_log_interval = 30 # Log every N frames
logger.info(f"Context analyzer initialized (cooldown={person_switch_cooldown} frames, focus_smoothing={self.focus_history_size})")
def analyze_frame(
self,
@@ -296,33 +435,47 @@ class ContextAnalyzer:
# Determine who is speaking
active_speakers = []
has_audio_speech = speaking_periods and self.audio_detector.is_speaking_at_time(speaking_periods, timestamp)
for i, face in enumerate(faces):
is_speaking = False
if speaking_periods and self.audio_detector.is_speaking_at_time(speaking_periods, timestamp):
# Check audio-based speech detection
if has_audio_speech:
is_speaking = True
# Check lip movement (visual speech detection)
if face.landmarks and len(self.previous_faces) > i:
is_speaking = is_speaking or self._detect_lip_movement(face, self.previous_faces[i])
if is_speaking:
active_speakers.append(i)
num_faces = len(faces)
num_speakers = len(active_speakers)
# Debug: Log speech detection
if frame_number % 30 == 0: # Every second at 30fps
logger.info(f"Speech detection - Frame {frame_number}: audio_active={has_audio_speech}, "
f"speakers={active_speakers}, total_faces={len(faces)}")
if num_faces == 0:
layout_mode = "single"
elif num_faces == 1:
layout_mode = "single"
elif num_faces == 2:
layout_mode = "dual_split"
elif num_faces >= 3:
layout_mode = "dual_split"
else:
layout_mode = "single"
# Select THE person to focus on (always single person)
# Priority: 1) Who is speaking, 2) Who is most centered
selected_people = self._select_person_to_focus(
faces,
active_speakers,
frame_number,
frame.shape[1], # frame width for center calculation
frame.shape[0] # frame height for center calculation
)
primary_focus = self._calculate_focus_point(faces, active_speakers)
# Always use single-person layout (no split screen)
layout_mode = "single"
primary_focus = self._calculate_focus_point(faces, selected_people)
# Debug logging every N frames
if frame_number % self.frame_log_interval == 0:
focus_reason = "speaker" if active_speakers else "no_speech_detected"
logger.info(f"Frame {frame_number}: {len(faces)} faces, "
f"{len(active_speakers)} speakers, focus={selected_people}, reason={focus_reason}")
self.previous_faces = faces
@@ -332,7 +485,8 @@ class ContextAnalyzer:
detected_faces=faces,
active_speakers=active_speakers,
primary_focus=primary_focus,
layout_mode=layout_mode
layout_mode=layout_mode,
selected_people=selected_people
)
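Note (illustrative usage sketch): driving analyze_frame over a video. The keyword-argument names and module name are assumptions, since the full signature is cut off in this hunk, but the returned FrameContext fields match the dataclass above.
import cv2
from context_analyzer import ContextAnalyzer  # hypothetical module name

analyzer = ContextAnalyzer(person_switch_cooldown=30)
cap = cv2.VideoCapture("input.mp4")  # assumed local video
fps = cap.get(cv2.CAP_PROP_FPS) or 30.0
frame_number = 0
while True:
    ok, frame = cap.read()
    if not ok:
        break
    ctx = analyzer.analyze_frame(
        frame=frame,
        timestamp=frame_number / fps,
        frame_number=frame_number,
        speaking_periods=[],  # or the output of AudioActivityDetector
    )
    print(frame_number, ctx.selected_people, ctx.primary_focus, ctx.layout_mode)
    frame_number += 1
cap.release()
analyzer.close()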
def _detect_lip_movement(self, current_face: FaceDetection, previous_face: FaceDetection) -> bool:
@@ -363,36 +517,309 @@ class ContextAnalyzer:
threshold = 2.0
return abs(current_dist - previous_dist) > threshold
def _calculate_focus_point(
def _select_person_to_focus(
self,
faces: List[FaceDetection],
active_speakers: List[int]
) -> Optional[Tuple[int, int]]:
active_speakers: List[int],
frame_number: int,
frame_width: int,
frame_height: int
) -> List[int]:
"""
Calculate the primary focus point based on detected faces and speakers.
IMPORTANT: This focuses on ONE person to avoid focusing on empty space (e.g. the table between speakers).
When multiple people are present, we pick the most relevant person, not average positions.
Select THE single person to focus on.
Priority: 1) Who is speaking, 2) Who is most centered in frame
Args:
faces: List of detected faces
active_speakers: Indices of faces that are speaking
active_speakers: Indices of people currently speaking
frame_number: Current frame number
frame_width: Frame width for center calculation
frame_height: Frame height for center calculation
Returns:
List with single person index [idx], or empty list if no faces
"""
if not faces:
self.current_selected_people = []
return []
# If only 1 person, always focus on them
if len(faces) == 1:
self.current_selected_people = [0]
return [0]
# Check if we can switch people (cooldown period)
frames_since_last_switch = frame_number - self.last_switch_frame
can_switch = frames_since_last_switch >= self.person_switch_cooldown
# Calculate frame center for distance comparison
frame_center_x = frame_width / 2
frame_center_y = frame_height / 2
# ULTRA-STABLE MODE: Select ONE person at start, NEVER switch
# This completely eliminates switching-related instability
desired_person_idx = None
# If we already have someone selected, ALWAYS KEEP THEM (never switch)
if self.current_selected_people and len(self.current_selected_people) > 0:
current_idx = self.current_selected_people[0]
if current_idx < len(faces):
# Current person still detected - keep them
desired_person_idx = current_idx
else:
# Current person lost - try to find them again by position/size similarity
# This handles temporary detection failures
current_person_found = False
if self.previous_faces and current_idx < len(self.previous_faces):
prev_face = self.previous_faces[current_idx]
# Find most similar face by position and size
best_match_idx = None
best_match_score = float('inf')
for idx, face in enumerate(faces):
# Distance between centers
dx = face.center_x - prev_face.center_x
dy = face.center_y - prev_face.center_y
dist = np.sqrt(dx**2 + dy**2)
# Size similarity
size_diff = abs(face.width - prev_face.width) + abs(face.height - prev_face.height)
score = dist + size_diff * 0.5
if score < best_match_score:
best_match_score = score
best_match_idx = idx
if best_match_idx is not None and best_match_score < 1000:
desired_person_idx = best_match_idx
current_person_found = True
if not current_person_found:
# Really lost - select most confident
face_confidences = [(idx, face.confidence) for idx, face in enumerate(faces)]
face_confidences.sort(key=lambda x: x[1], reverse=True)
desired_person_idx = face_confidences[0][0]
logger.warning(f"Current person permanently lost - selecting new: {desired_person_idx}")
else:
# First frame - select most confident person ONCE
face_confidences = [(idx, face.confidence) for idx, face in enumerate(faces)]
face_confidences.sort(key=lambda x: x[1], reverse=True)
desired_person_idx = face_confidences[0][0]
logger.info(f"INITIAL SELECTION - Person {desired_person_idx} (will be tracked throughout entire video)")
# IGNORE SPEECH DETECTION - it was causing instability
# We now track ONE person from start to finish, regardless of who speaks
# OLD LOGIC (kept below under `if False:` - was causing issues):
# it switched based on "who is more centered", which caused constant switching
if False: # Disabled
# Calculate distance from center for each face
center_distances = []
for idx, face in enumerate(faces):
# Euclidean distance from frame center
dx = face.center_x - frame_center_x
dy = face.center_y - frame_center_y
distance = np.sqrt(dx**2 + dy**2)
center_distances.append((idx, distance, face.confidence))
# Sort by distance (closest first), then by confidence as tiebreaker
center_distances.sort(key=lambda x: (x[1], -x[2]))
most_centered_idx = center_distances[0][0]
most_centered_distance = center_distances[0][1]
# STICKY BEHAVIOR: If we already have someone selected, only switch if:
# - New person is SIGNIFICANTLY more centered (30% closer to center)
# - OR current person is now very far from center (>40% of frame width)
if self.current_selected_people and len(self.current_selected_people) > 0:
current_idx = self.current_selected_people[0]
if current_idx < len(faces):
current_face = faces[current_idx]
current_dx = current_face.center_x - frame_center_x
current_dy = current_face.center_y - frame_center_y
current_distance = np.sqrt(current_dx**2 + current_dy**2)
# Define "significantly better" threshold
max_acceptable_distance = frame_width * 0.4 # 40% of frame width
improvement_threshold = 0.7 # New person must be 30% closer (0.7 ratio)
# Keep current person if they're still reasonably centered
if current_distance < max_acceptable_distance:
# Current person is still acceptable - only switch if new is MUCH better
if most_centered_distance < current_distance * improvement_threshold:
desired_person_idx = most_centered_idx
logger.debug(f"Switching: new person MUCH more centered ({most_centered_distance:.0f} vs {current_distance:.0f})")
else:
desired_person_idx = current_idx # Keep current
logger.debug(f"Keeping current person: still reasonably centered ({current_distance:.0f} px from center)")
else:
# Current person is too far from center - switch
desired_person_idx = most_centered_idx
logger.debug(f"Current person too far from center ({current_distance:.0f} px), switching")
else:
# Current selection invalid
desired_person_idx = most_centered_idx
else:
# First time - select most centered
desired_person_idx = most_centered_idx
# Wrap in list for compatibility with existing code
desired_people = [desired_person_idx] if desired_person_idx is not None else []
# ULTRA-STABLE MODE: NO SWITCHING LOGIC AT ALL
# Simply set the person and never change
if not self.current_selected_people:
# First time only
self.current_selected_people = desired_people
self.last_switch_frame = frame_number
logger.info(f"Frame {frame_number}: LOCKED ON person {desired_people} - will never switch")
else:
# Already have someone - just update to desired (which is same person due to logic above)
self.current_selected_people = desired_people
return self.current_selected_people.copy()
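Note (illustrative): the re-identification fallback above scores candidates by center distance plus a weighted box-size difference. Pulled out as a helper it would look like this (names are assumptions, not project code):
import numpy as np

def reid_score(prev_face, candidate):
    # Lower is better; mirrors `dist + size_diff * 0.5` with the 1000 cutoff above.
    dist = float(np.hypot(candidate.center_x - prev_face.center_x,
                          candidate.center_y - prev_face.center_y))
    size_diff = abs(candidate.width - prev_face.width) + abs(candidate.height - prev_face.height)
    return dist + 0.5 * size_diff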
def _ensure_distinct_people(
self,
faces: List[FaceDetection],
people_indices: List[int]
) -> List[int]:
"""
Ensure selected people are distinct by checking minimum distance between them.
Prevents showing the same person twice due to duplicate detection.
Args:
faces: List of detected faces
people_indices: Indices of people to validate
Returns:
List of distinct people indices (max 2)
"""
if len(people_indices) <= 1:
return people_indices
distinct_people = []
for idx in people_indices:
if idx >= len(faces):
continue
current_face = faces[idx]
is_distinct = True
# Check if this person is too close to any already selected person
for selected_idx in distinct_people:
selected_face = faces[selected_idx]
# Calculate distance between face centers
dx = current_face.center_x - selected_face.center_x
dy = current_face.center_y - selected_face.center_y
distance = np.sqrt(dx**2 + dy**2)
# Also check overlap (intersection area relative to the smaller box, not true IoU)
x1_overlap = max(current_face.x, selected_face.x)
y1_overlap = max(current_face.y, selected_face.y)
x2_overlap = min(current_face.x + current_face.width, selected_face.x + selected_face.width)
y2_overlap = min(current_face.y + current_face.height, selected_face.y + selected_face.height)
overlap_area = 0
if x1_overlap < x2_overlap and y1_overlap < y2_overlap:
overlap_area = (x2_overlap - x1_overlap) * (y2_overlap - y1_overlap)
# Calculate areas
area1 = current_face.width * current_face.height
area2 = selected_face.width * selected_face.height
min_area = min(area1, area2)
# If faces are very close OR significantly overlapping, they're likely the same person
# Minimum distance: 1/4 of average face width
min_distance = (current_face.width + selected_face.width) / 8
overlap_threshold = 0.3 # 30% overlap
if distance < min_distance or (min_area > 0 and overlap_area / min_area > overlap_threshold):
is_distinct = False
logger.debug(f"Person {idx} too similar to person {selected_idx} (dist={distance:.1f}, overlap={overlap_area/min_area if min_area > 0 else 0:.2%})")
break
if is_distinct:
distinct_people.append(idx)
# Stop at 2 distinct people
if len(distinct_people) >= 2:
break
# If we couldn't find 2 distinct people, return at most 1
if len(distinct_people) < 2 and len(people_indices) >= 2:
logger.debug(f"Only {len(distinct_people)} distinct person(s) found from {len(people_indices)} detections")
return distinct_people
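Note (illustrative worked numbers for the thresholds above): two detections of roughly 200 px wide faces are merged when their centers are closer than (200 + 200) / 8 = 50 px, or when the intersection covers more than 30% of the smaller box. As a standalone predicate (FaceDetection-like inputs assumed):
def looks_like_same_person(a, b) -> bool:
    dx, dy = a.center_x - b.center_x, a.center_y - b.center_y
    dist = (dx * dx + dy * dy) ** 0.5
    min_distance = (a.width + b.width) / 8  # 1/4 of the average face width
    ix = max(0, min(a.x + a.width, b.x + b.width) - max(a.x, b.x))
    iy = max(0, min(a.y + a.height, b.y + b.height) - max(a.y, b.y))
    overlap = ix * iy
    min_area = min(a.width * a.height, b.width * b.height)
    return dist < min_distance or (min_area > 0 and overlap / min_area > 0.3)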
def _calculate_focus_point(
self,
faces: List[FaceDetection],
selected_people: List[int]
) -> Optional[Tuple[int, int]]:
"""
Calculate the primary focus point based on selected people with temporal smoothing.
Args:
faces: List of detected faces
selected_people: Indices of people selected for display
Returns:
(x, y) tuple of focus center, or None if no faces
"""
if not faces:
if not faces or not selected_people:
return None
if active_speakers:
speaker_faces = [faces[i] for i in active_speakers if i < len(faces)]
if speaker_faces:
primary_speaker = max(speaker_faces, key=lambda f: f.confidence)
return (primary_speaker.center_x, primary_speaker.center_y)
# Calculate raw focus point
raw_focus_x = 0
raw_focus_y = 0
most_confident = max(faces, key=lambda f: f.confidence)
return (most_confident.center_x, most_confident.center_y)
if len(selected_people) == 1:
# Single person - focus on them
if selected_people[0] < len(faces):
primary = faces[selected_people[0]]
raw_focus_x = primary.center_x
raw_focus_y = primary.center_y
else:
# Fallback
most_confident = max(faces, key=lambda f: f.confidence)
raw_focus_x = most_confident.center_x
raw_focus_y = most_confident.center_y
else:
# Multiple people - focus on the CENTER between them for stability
# This prevents jarring movements when switching focus between people
valid_people = [idx for idx in selected_people if idx < len(faces)]
if valid_people:
centers_x = [faces[idx].center_x for idx in valid_people]
centers_y = [faces[idx].center_y for idx in valid_people]
raw_focus_x = int(np.mean(centers_x))
raw_focus_y = int(np.mean(centers_y))
else:
# Fallback
most_confident = max(faces, key=lambda f: f.confidence)
raw_focus_x = most_confident.center_x
raw_focus_y = most_confident.center_y
# Apply temporal smoothing using focus history
self.focus_history.append((raw_focus_x, raw_focus_y))
if len(self.focus_history) > self.focus_history_size:
self.focus_history.pop(0)
# Calculate smoothed focus as weighted average (more weight to recent frames)
if len(self.focus_history) > 1:
# Exponential weights: recent frames have more influence
weights = [2 ** i for i in range(len(self.focus_history))]
total_weight = sum(weights)
smoothed_x = sum(x * w for (x, y), w in zip(self.focus_history, weights)) / total_weight
smoothed_y = sum(y * w for (x, y), w in zip(self.focus_history, weights)) / total_weight
return (int(smoothed_x), int(smoothed_y))
else:
return (raw_focus_x, raw_focus_y)
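Note (illustrative): the smoothing above as a standalone function plus a worked example. The history is stored oldest-first, so the 2**i weights favor the newest point.
def smooth_focus(history):
    # history: list of (x, y) focus points, oldest first
    if not history:
        return None
    weights = [2 ** i for i in range(len(history))]
    total = sum(weights)
    x = sum(px * w for (px, _), w in zip(history, weights)) / total
    y = sum(py * w for (_, py), w in zip(history, weights)) / total
    return (int(x), int(y))

# smooth_focus([(100, 0), (107, 0), (114, 0)]) == (110, 0): weights 1, 2, 4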
def close(self):
"""Release resources."""
self.detector.close()
# Clear tracking state to free memory
self.previous_faces.clear()
self.current_selected_people.clear()
self.focus_history.clear()