Adjust context, speech and focus handling; fix video jitter and other bugs

LeoMortari
2026-01-03 19:42:23 -03:00
parent c1914dad00
commit 3f7329869d
7 changed files with 932 additions and 455 deletions


@@ -41,6 +41,18 @@ class PersonTracking:
frame_number: int
@dataclass
class GroupBoundingBox:
"""Bounding box containing all tracked faces."""
x: int
y: int
width: int
height: int
center_x: int
center_y: int
face_count: int
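# Example instance (illustrative, assumed values): a group box spanning two
# faces from x=40 to x=570 and y=60 to y=220 would be stored as
#   GroupBoundingBox(x=40, y=60, width=530, height=160,
#                    center_x=305, center_y=140, face_count=2)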
@dataclass
class FrameContext:
"""Context information for a video frame."""
@@ -50,7 +62,8 @@ class FrameContext:
active_speakers: List[int] # indices of speaking faces
primary_focus: Optional[Tuple[int, int]] # (x, y) center point
layout_mode: str # "single", "dual_split", "grid"
selected_people: List[int] = field(default_factory=list) # indices of people selected for display (max 2)
selected_people: List[int] = field(default_factory=list) # indices of people selected for display
group_bounds: Optional[GroupBoundingBox] = None # bounding box for all detected faces
class MediaPipeDetector:
@@ -385,10 +398,11 @@ class AudioActivityDetector:
class ContextAnalyzer:
"""Analyzes video context to determine focus and layout."""
def __init__(self, person_switch_cooldown: int = 30):
def __init__(self, person_switch_cooldown: int = 30, min_face_confidence: float = 0.3):
self.detector = MediaPipeDetector()
self.audio_detector = AudioActivityDetector()
self.previous_faces: List[FaceDetection] = []
self.min_face_confidence = min_face_confidence
# Person tracking state
self.current_selected_people: List[int] = [] # Indices of people currently on screen
@@ -400,9 +414,9 @@ class ContextAnalyzer:
self.stability_threshold = 20 # Frames needed to confirm a switch (increased for more stability)
self.last_switched_people: List[int] = [] # People we just switched FROM
# Focus stability: track recent focus points for temporal smoothing
self.focus_history: List[Tuple[int, int]] = []
self.focus_history_size: int = 5 # Keep last 5 focus points for smoothing
self.focus_history_size: int = 20
self.focus_dead_zone: int = 60
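# Together these two settings drive the anti-jitter behaviour: focus moves
# smaller than the 60 px dead zone are ignored, and larger moves are
# median-smoothed over the last 20 recorded points in _calculate_focus_point.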
# Debug logging
self.frame_log_interval = 30 # Log every N frames
@@ -429,9 +443,11 @@ class ContextAnalyzer:
FrameContext with detection results
"""
faces = self.detector.detect_face_landmarks(frame)
faces = [face for face in faces if face.confidence >= self.min_face_confidence] if faces else []
if not faces:
faces = self.detector.detect_faces(frame)
faces = [face for face in faces if face.confidence >= self.min_face_confidence] if faces else []
# Determine who is speaking
active_speakers = []
@@ -440,13 +456,13 @@ class ContextAnalyzer:
for i, face in enumerate(faces):
is_speaking = False
# Check audio-based speech detection
if has_audio_speech:
is_speaking = True
# Check lip movement (visual speech detection)
# Prefer visual cues when multiple faces are present.
if face.landmarks and len(self.previous_faces) > i:
is_speaking = is_speaking or self._detect_lip_movement(face, self.previous_faces[i])
is_speaking = self._detect_lip_movement(face, self.previous_faces[i])
# Audio can confirm speech when there's only one face.
if has_audio_speech and len(faces) == 1:
is_speaking = True
if is_speaking:
active_speakers.append(i)
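# Net effect of the reordering above: with several faces on screen only lip
# movement marks a speaker, and the audio track alone is trusted only in the
# single-face case, so shared audio no longer tags every face as speaking.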
@@ -456,26 +472,41 @@ class ContextAnalyzer:
logger.info(f"Speech detection - Frame {frame_number}: audio_active={has_audio_speech}, "
f"speakers={active_speakers}, total_faces={len(faces)}")
# Select THE person to focus on (always single person)
# Priority: 1) Who is speaking, 2) Who is most centered
selected_people = self._select_person_to_focus(
faces,
active_speakers,
frame_number,
frame.shape[1], # frame width for center calculation
frame.shape[0] # frame height for center calculation
)
if active_speakers:
selected_people = active_speakers[:4]
if len(selected_people) == 1:
layout_mode = "single"
elif len(selected_people) == 2:
layout_mode = "dual_split"
else:
layout_mode = "grid"
else:
# Select THE person to focus on (always single person)
# Priority: 1) Who is speaking, 2) Who is most centered
selected_people = self._select_person_to_focus(
faces,
active_speakers,
frame_number,
frame.shape[1], # frame width for center calculation
frame.shape[0] # frame height for center calculation
)
layout_mode = "single"
# Always use single-person layout (no split screen)
layout_mode = "single"
# Calculate group bounding box for ALL detected faces (multi-person support)
group_bounds = self._calculate_group_bounding_box(faces)
primary_focus = self._calculate_focus_point(faces, selected_people)
# For multi-person mode, use group center as primary focus
if group_bounds and group_bounds.face_count > 1:
primary_focus = (group_bounds.center_x, group_bounds.center_y)
else:
primary_focus = self._calculate_focus_point(faces, selected_people)
# Debug logging every N frames
if frame_number % self.frame_log_interval == 0:
focus_reason = "speaker" if active_speakers else "no_speech_detected"
group_info = f", group={group_bounds.face_count} faces" if group_bounds else ""
logger.info(f"Frame {frame_number}: {len(faces)} faces, "
f"{len(active_speakers)} speakers, focus={selected_people}, reason={focus_reason}")
f"{len(active_speakers)} speakers, focus={selected_people}, reason={focus_reason}{group_info}")
self.previous_faces = faces
@@ -486,7 +517,8 @@ class ContextAnalyzer:
active_speakers=active_speakers,
primary_focus=primary_focus,
layout_mode=layout_mode,
selected_people=selected_people
selected_people=selected_people,
group_bounds=group_bounds
)
def _detect_lip_movement(self, current_face: FaceDetection, previous_face: FaceDetection) -> bool:
@@ -543,134 +575,68 @@ class ContextAnalyzer:
self.current_selected_people = []
return []
# If only 1 person, always focus on them
if len(faces) == 1:
self.current_selected_people = [0]
return [0]
# Check if we can switch people (cooldown period)
frames_since_last_switch = frame_number - self.last_switch_frame
can_switch = frames_since_last_switch >= self.person_switch_cooldown
# Calculate frame center for distance comparison
frame_center_x = frame_width / 2
frame_center_y = frame_height / 2
# ULTRA-STABLE MODE: Select ONE person at start, NEVER switch
# This completely eliminates switching-related instability
desired_person_idx = None
# If we already have someone selected, ALWAYS KEEP THEM (never switch)
if self.current_selected_people and len(self.current_selected_people) > 0:
current_idx = self.current_selected_people[0]
if current_idx < len(faces):
# Current person still detected - keep them
desired_person_idx = current_idx
if active_speakers:
if self.current_selected_people and self.current_selected_people[0] in active_speakers:
desired_person_idx = self.current_selected_people[0]
else:
# Current person lost - try to find them again by position/size similarity
# This handles temporary detection failures
current_person_found = False
if self.previous_faces and current_idx < len(self.previous_faces):
prev_face = self.previous_faces[current_idx]
# Find most similar face by position and size
best_match_idx = None
best_match_score = float('inf')
for idx, face in enumerate(faces):
# Distance between centers
dx = face.center_x - prev_face.center_x
dy = face.center_y - prev_face.center_y
dist = np.sqrt(dx**2 + dy**2)
# Size similarity
size_diff = abs(face.width - prev_face.width) + abs(face.height - prev_face.height)
score = dist + size_diff * 0.5
if score < best_match_score:
best_match_score = score
best_match_idx = idx
if best_match_idx is not None and best_match_score < 1000:
desired_person_idx = best_match_idx
current_person_found = True
if not current_person_found:
# Really lost - select most confident
face_confidences = [(idx, face.confidence) for idx, face in enumerate(faces)]
face_confidences.sort(key=lambda x: x[1], reverse=True)
desired_person_idx = face_confidences[0][0]
logger.warning(f"Current person permanently lost - selecting new: {desired_person_idx}")
if can_switch or not self.current_selected_people:
desired_person_idx = active_speakers[0]
if self.current_selected_people and desired_person_idx != self.current_selected_people[0]:
logger.info(f"Switching focus to speaker: {desired_person_idx}")
self.last_switch_frame = frame_number
else:
desired_person_idx = self.current_selected_people[0] if self.current_selected_people else active_speakers[0]
else:
# First frame - select most confident person ONCE
face_confidences = [(idx, face.confidence) for idx, face in enumerate(faces)]
face_confidences.sort(key=lambda x: x[1], reverse=True)
desired_person_idx = face_confidences[0][0]
logger.info(f"INITIAL SELECTION - Person {desired_person_idx} (will be tracked throughout entire video)")
# IGNORE SPEECH DETECTION - it was causing instability
# We now track ONE person from start to finish, regardless of who speaks
# OLD LOGIC (commented out - was causing issues):
# This logic would switch based on "who is more centered" which caused constant switching
if False: # Disabled
# Calculate distance from center for each face
center_distances = []
for idx, face in enumerate(faces):
# Euclidean distance from frame center
dx = face.center_x - frame_center_x
dy = face.center_y - frame_center_y
distance = np.sqrt(dx**2 + dy**2)
center_distances.append((idx, distance, face.confidence))
# Sort by distance (closest first), then by confidence as tiebreaker
center_distances.sort(key=lambda x: (x[1], -x[2]))
most_centered_idx = center_distances[0][0]
most_centered_distance = center_distances[0][1]
# STICKY BEHAVIOR: If we already have someone selected, only switch if:
# - New person is SIGNIFICANTLY more centered (30% closer to center)
# - OR current person is now very far from center (>40% of frame width)
if self.current_selected_people and len(self.current_selected_people) > 0:
current_idx = self.current_selected_people[0]
if current_idx < len(faces):
current_face = faces[current_idx]
current_dx = current_face.center_x - frame_center_x
current_dy = current_face.center_y - frame_center_y
current_distance = np.sqrt(current_dx**2 + current_dy**2)
# Define "significantly better" threshold
max_acceptable_distance = frame_width * 0.4 # 40% of frame width
improvement_threshold = 0.7 # New person must be 30% closer (0.7 ratio)
# Keep current person if they're still reasonably centered
if current_distance < max_acceptable_distance:
# Current person is still acceptable - only switch if new is MUCH better
if most_centered_distance < current_distance * improvement_threshold:
desired_person_idx = most_centered_idx
logger.debug(f"Switching: new person MUCH more centered ({most_centered_distance:.0f} vs {current_distance:.0f})")
else:
desired_person_idx = current_idx # Keep current
logger.debug(f"Keeping current person: still reasonably centered ({current_distance:.0f} px from center)")
else:
# Current person is too far from center - switch
desired_person_idx = most_centered_idx
logger.debug(f"Current person too far from center ({current_distance:.0f} px), switching")
desired_person_idx = current_idx
else:
# Current selection invalid
desired_person_idx = most_centered_idx
else:
# First time - select most centered
desired_person_idx = most_centered_idx
if self.previous_faces and current_idx < len(self.previous_faces):
prev_face = self.previous_faces[current_idx]
best_match_idx = None
best_match_score = float('inf')
for idx, face in enumerate(faces):
dx = face.center_x - prev_face.center_x
dy = face.center_y - prev_face.center_y
dist = np.sqrt(dx**2 + dy**2)
size_diff = abs(face.width - prev_face.width) + abs(face.height - prev_face.height)
score = dist + size_diff * 0.5
if score < best_match_score:
best_match_score = score
best_match_idx = idx
if best_match_idx is not None and best_match_score < 1000:
desired_person_idx = best_match_idx
else:
face_confidences = [(idx, face.confidence) for idx, face in enumerate(faces)]
face_confidences.sort(key=lambda x: x[1], reverse=True)
desired_person_idx = face_confidences[0][0]
else:
face_confidences = [(idx, face.confidence) for idx, face in enumerate(faces)]
face_confidences.sort(key=lambda x: x[1], reverse=True)
desired_person_idx = face_confidences[0][0]
else:
face_confidences = [(idx, face.confidence) for idx, face in enumerate(faces)]
face_confidences.sort(key=lambda x: x[1], reverse=True)
desired_person_idx = face_confidences[0][0]
# Wrap in list for compatibility with existing code
desired_people = [desired_person_idx] if desired_person_idx is not None else []
# ULTRA-STABLE MODE: NO SWITCHING LOGIC AT ALL
# Simply set the person and never change
if not self.current_selected_people:
# First time only
self.current_selected_people = desired_people
self.last_switch_frame = frame_number
logger.info(f"Frame {frame_number}: LOCKED ON person {desired_people} - will never switch")
logger.info(f"Frame {frame_number}: Locked on person {desired_people}")
else:
# Already have someone - just update to desired (which is same person due to logic above)
self.current_selected_people = desired_people
return self.current_selected_people.copy()
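# Illustrative numbers for the re-matching score (assumed, not measured): a
# face that moved 40 px with a 10 + 8 px box-size change scores
# 40 + 18 * 0.5 = 49, well under the 1000 threshold, so the same person is
# kept even when the detector briefly reorders or drops detections.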
@@ -798,24 +764,77 @@ class ContextAnalyzer:
raw_focus_x = most_confident.center_x
raw_focus_y = most_confident.center_y
# Apply temporal smoothing using focus history
if self.focus_history:
last_x, last_y = self.focus_history[-1]
dx = abs(raw_focus_x - last_x)
dy = abs(raw_focus_y - last_y)
if dx < self.focus_dead_zone and dy < self.focus_dead_zone:
return self.focus_history[-1]
self.focus_history.append((raw_focus_x, raw_focus_y))
if len(self.focus_history) > self.focus_history_size:
self.focus_history.pop(0)
# Calculate smoothed focus as weighted average (more weight to recent frames)
if len(self.focus_history) > 1:
# Exponential weights: recent frames have more influence
weights = [2 ** i for i in range(len(self.focus_history))]
total_weight = sum(weights)
smoothed_x = sum(x * w for (x, y), w in zip(self.focus_history, weights)) / total_weight
smoothed_y = sum(y * w for (x, y), w in zip(self.focus_history, weights)) / total_weight
return (int(smoothed_x), int(smoothed_y))
if len(self.focus_history) >= 5:
xs = [x for x, y in self.focus_history]
ys = [y for x, y in self.focus_history]
median_x = int(np.median(xs))
median_y = int(np.median(ys))
return (median_x, median_y)
else:
return (raw_focus_x, raw_focus_y)
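# Worked example of the smoothing (illustrative values): with the 60 px dead
# zone, a raw focus of (415, 300) after a last point of (400, 305) moves only
# 15 / 5 px, so the previous point is returned unchanged. Larger jumps enter
# the history, and once 5+ points exist the median is used, e.g.
#   np.median([398, 400, 402, 405, 700]) == 402.0
# so a single outlier detection barely shifts the focus, unlike the previous
# exponentially weighted mean.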
def _calculate_group_bounding_box(
self,
faces: List[FaceDetection],
padding_percent: float = 0.15,
max_faces: int = 6
) -> Optional[GroupBoundingBox]:
"""
Calculate bounding box containing all detected faces with padding.
Args:
faces: List of detected faces
padding_percent: Padding around group as percentage of bbox dimensions
max_faces: Maximum faces to include (use most confident if exceeded)
Returns:
GroupBoundingBox or None if no faces
"""
if not faces:
return None
# If too many faces, use most confident ones
if len(faces) > max_faces:
faces = sorted(faces, key=lambda f: f.confidence, reverse=True)[:max_faces]
# Calculate bounding box containing all faces
min_x = min(f.x for f in faces)
max_x = max(f.x + f.width for f in faces)
min_y = min(f.y for f in faces)
max_y = max(f.y + f.height for f in faces)
# Add padding
width = max_x - min_x
height = max_y - min_y
pad_x = int(width * padding_percent)
pad_y = int(height * padding_percent)
final_x = max(0, min_x - pad_x)
final_y = max(0, min_y - pad_y)
final_width = width + 2 * pad_x
final_height = height + 2 * pad_y
return GroupBoundingBox(
x=final_x,
y=final_y,
width=final_width,
height=final_height,
center_x=final_x + final_width // 2,
center_y=final_y + final_height // 2,
face_count=len(faces)
)
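# Illustrative usage (assumed FaceDetection keyword arguments; remaining
# fields elided):
#   >>> analyzer = ContextAnalyzer()
#   >>> faces = [FaceDetection(x=100, y=80, width=120, height=120, ...),
#   ...          FaceDetection(x=400, y=90, width=110, height=110, ...)]
#   >>> analyzer._calculate_group_bounding_box(faces)
#   GroupBoundingBox(x=39, y=62, width=532, height=156,
#                    center_x=305, center_y=140, face_count=2)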
def close(self):
"""Release resources."""
self.detector.close()