Adjust context, speech, and focus handling; fix video flickering and other bugs
@@ -41,6 +41,18 @@ class PersonTracking:
     frame_number: int
 
 
+@dataclass
+class GroupBoundingBox:
+    """Bounding box containing all tracked faces."""
+    x: int
+    y: int
+    width: int
+    height: int
+    center_x: int
+    center_y: int
+    face_count: int
+
+
 @dataclass
 class FrameContext:
     """Context information for a video frame."""
@@ -50,7 +62,8 @@ class FrameContext:
     active_speakers: List[int]  # indices of speaking faces
     primary_focus: Optional[Tuple[int, int]]  # (x, y) center point
     layout_mode: str  # "single", "dual_split", "grid"
-    selected_people: List[int] = field(default_factory=list)  # indices of people selected for display (max 2)
+    selected_people: List[int] = field(default_factory=list)  # indices of people selected for display
+    group_bounds: Optional[GroupBoundingBox] = None  # bounding box for all detected faces
 
 
 class MediaPipeDetector:
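Illustration (not part of the commit): roughly how the new GroupBoundingBox is meant to be consumed downstream; the numbers are invented.

    group = GroupBoundingBox(
        x=120, y=80, width=640, height=360,
        center_x=440, center_y=260, face_count=2,
    )
    # FrameContext.group_bounds carries this box; when more than one face is
    # visible, the analyzer centers the crop on the group instead of a single face.
    crop_center = (group.center_x, group.center_y) if group.face_count > 1 else None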
@@ -385,10 +398,11 @@ class AudioActivityDetector:
 class ContextAnalyzer:
     """Analyzes video context to determine focus and layout."""
 
-    def __init__(self, person_switch_cooldown: int = 30):
+    def __init__(self, person_switch_cooldown: int = 30, min_face_confidence: float = 0.3):
         self.detector = MediaPipeDetector()
         self.audio_detector = AudioActivityDetector()
         self.previous_faces: List[FaceDetection] = []
+        self.min_face_confidence = min_face_confidence
 
         # Person tracking state
         self.current_selected_people: List[int] = []  # Indices of people currently on screen
@@ -400,9 +414,9 @@ class ContextAnalyzer:
         self.stability_threshold = 20  # Frames needed to confirm a switch (increased for more stability)
         self.last_switched_people: List[int] = []  # People we just switched FROM
 
         # Focus stability: track recent focus points for temporal smoothing
         self.focus_history: List[Tuple[int, int]] = []
-        self.focus_history_size: int = 5  # Keep last 5 focus points for smoothing
+        self.focus_history_size: int = 20
+        self.focus_dead_zone: int = 60
 
         # Debug logging
         self.frame_log_interval = 30  # Log every N frames
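Illustration (not part of the commit): the new constructor argument lets callers tighten the confidence filter for noisy footage; the value here is made up.

    analyzer = ContextAnalyzer(person_switch_cooldown=30, min_face_confidence=0.5)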
@@ -429,9 +443,11 @@ class ContextAnalyzer:
             FrameContext with detection results
         """
         faces = self.detector.detect_face_landmarks(frame)
+        faces = [face for face in faces if face.confidence >= self.min_face_confidence] if faces else []
 
         if not faces:
             faces = self.detector.detect_faces(frame)
+            faces = [face for face in faces if face.confidence >= self.min_face_confidence] if faces else []
 
         # Determine who is speaking
         active_speakers = []
@@ -440,13 +456,13 @@ class ContextAnalyzer:
         for i, face in enumerate(faces):
             is_speaking = False
 
-            # Check audio-based speech detection
-            if has_audio_speech:
-                is_speaking = True
-
-            # Check lip movement (visual speech detection)
+            # Prefer visual cues when multiple faces are present.
             if face.landmarks and len(self.previous_faces) > i:
-                is_speaking = is_speaking or self._detect_lip_movement(face, self.previous_faces[i])
+                is_speaking = self._detect_lip_movement(face, self.previous_faces[i])
+
+            # Audio can confirm speech when there's only one face.
+            if has_audio_speech and len(faces) == 1:
+                is_speaking = True
 
             if is_speaking:
                 active_speakers.append(i)
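Illustration (not part of the commit): the revised per-face rule, restated with stand-in booleans instead of the real detector calls.

    def is_face_speaking(lip_moving: bool, audio_active: bool, visible_faces: int) -> bool:
        # Lip movement decides; audio only confirms when a single face is visible.
        speaking = lip_moving
        if audio_active and visible_faces == 1:
            speaking = True
        return speaking

    # With two faces on screen, audio alone no longer marks everyone as a speaker.
    assert is_face_speaking(lip_moving=False, audio_active=True, visible_faces=2) is False
    assert is_face_speaking(lip_moving=False, audio_active=True, visible_faces=1) is True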
@@ -456,26 +472,41 @@ class ContextAnalyzer:
             logger.info(f"Speech detection - Frame {frame_number}: audio_active={has_audio_speech}, "
                         f"speakers={active_speakers}, total_faces={len(faces)}")
 
-        # Select THE person to focus on (always single person)
-        # Priority: 1) Who is speaking, 2) Who is most centered
-        selected_people = self._select_person_to_focus(
-            faces,
-            active_speakers,
-            frame_number,
-            frame.shape[1],  # frame width for center calculation
-            frame.shape[0]  # frame height for center calculation
-        )
+        if active_speakers:
+            selected_people = active_speakers[:4]
+            if len(selected_people) == 1:
+                layout_mode = "single"
+            elif len(selected_people) == 2:
+                layout_mode = "dual_split"
+            else:
+                layout_mode = "grid"
+        else:
+            # Select THE person to focus on (always single person)
+            # Priority: 1) Who is speaking, 2) Who is most centered
+            selected_people = self._select_person_to_focus(
+                faces,
+                active_speakers,
+                frame_number,
+                frame.shape[1],  # frame width for center calculation
+                frame.shape[0]  # frame height for center calculation
+            )
+            layout_mode = "single"
 
-        # Always use single-person layout (no split screen)
-        layout_mode = "single"
+        # Calculate group bounding box for ALL detected faces (multi-person support)
+        group_bounds = self._calculate_group_bounding_box(faces)
 
-        primary_focus = self._calculate_focus_point(faces, selected_people)
+        # For multi-person mode, use group center as primary focus
+        if group_bounds and group_bounds.face_count > 1:
+            primary_focus = (group_bounds.center_x, group_bounds.center_y)
+        else:
+            primary_focus = self._calculate_focus_point(faces, selected_people)
 
         # Debug logging every N frames
         if frame_number % self.frame_log_interval == 0:
            focus_reason = "speaker" if active_speakers else "no_speech_detected"
+           group_info = f", group={group_bounds.face_count} faces" if group_bounds else ""
            logger.info(f"Frame {frame_number}: {len(faces)} faces, "
-                       f"{len(active_speakers)} speakers, focus={selected_people}, reason={focus_reason}")
+                       f"{len(active_speakers)} speakers, focus={selected_people}, reason={focus_reason}{group_info}")
 
         self.previous_faces = faces
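Illustration (not part of the commit): the speaker-count-to-layout mapping used in the branch above, as a standalone sketch; the no-speaker fallback (single layout via _select_person_to_focus) is not repeated here.

    def pick_layout(speaker_indices: list) -> tuple:
        # Keep at most four speakers; the layout follows how many remain.
        selected = speaker_indices[:4]
        if len(selected) == 1:
            return selected, "single"
        if len(selected) == 2:
            return selected, "dual_split"
        return selected, "grid"

    print(pick_layout([2]))        # ([2], 'single')
    print(pick_layout([0, 3, 1]))  # ([0, 3, 1], 'grid')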
@@ -486,7 +517,8 @@ class ContextAnalyzer:
             active_speakers=active_speakers,
             primary_focus=primary_focus,
             layout_mode=layout_mode,
-            selected_people=selected_people
+            selected_people=selected_people,
+            group_bounds=group_bounds
         )
 
     def _detect_lip_movement(self, current_face: FaceDetection, previous_face: FaceDetection) -> bool:
@@ -543,134 +575,68 @@ class ContextAnalyzer:
             self.current_selected_people = []
             return []
 
         # If only 1 person, always focus on them
         if len(faces) == 1:
             self.current_selected_people = [0]
             return [0]
 
         # Check if we can switch people (cooldown period)
         frames_since_last_switch = frame_number - self.last_switch_frame
         can_switch = frames_since_last_switch >= self.person_switch_cooldown
 
-        # Calculate frame center for distance comparison
-        frame_center_x = frame_width / 2
-        frame_center_y = frame_height / 2
-
-        # ULTRA-STABLE MODE: Select ONE person at start, NEVER switch
-        # This completely eliminates switching-related instability
         desired_person_idx = None
 
-        # If we already have someone selected, ALWAYS KEEP THEM (never switch)
-        if self.current_selected_people and len(self.current_selected_people) > 0:
-            current_idx = self.current_selected_people[0]
-            if current_idx < len(faces):
-                # Current person still detected - keep them
-                desired_person_idx = current_idx
+        if active_speakers:
+            if self.current_selected_people and self.current_selected_people[0] in active_speakers:
+                desired_person_idx = self.current_selected_people[0]
             else:
-                # Current person lost - try to find them again by position/size similarity
-                # This handles temporary detection failures
-                current_person_found = False
-                if self.previous_faces and current_idx < len(self.previous_faces):
-                    prev_face = self.previous_faces[current_idx]
-                    # Find most similar face by position and size
-                    best_match_idx = None
-                    best_match_score = float('inf')
-                    for idx, face in enumerate(faces):
-                        # Distance between centers
-                        dx = face.center_x - prev_face.center_x
-                        dy = face.center_y - prev_face.center_y
-                        dist = np.sqrt(dx**2 + dy**2)
-                        # Size similarity
-                        size_diff = abs(face.width - prev_face.width) + abs(face.height - prev_face.height)
-                        score = dist + size_diff * 0.5
-                        if score < best_match_score:
-                            best_match_score = score
-                            best_match_idx = idx
-
-                if best_match_idx is not None and best_match_score < 1000:
-                    desired_person_idx = best_match_idx
-                    current_person_found = True
-
-                if not current_person_found:
-                    # Really lost - select most confident
-                    face_confidences = [(idx, face.confidence) for idx, face in enumerate(faces)]
-                    face_confidences.sort(key=lambda x: x[1], reverse=True)
-                    desired_person_idx = face_confidences[0][0]
-                    logger.warning(f"Current person permanently lost - selecting new: {desired_person_idx}")
+                if can_switch or not self.current_selected_people:
+                    desired_person_idx = active_speakers[0]
+                    if self.current_selected_people and desired_person_idx != self.current_selected_people[0]:
+                        logger.info(f"Switching focus to speaker: {desired_person_idx}")
+                        self.last_switch_frame = frame_number
+                else:
+                    desired_person_idx = self.current_selected_people[0] if self.current_selected_people else active_speakers[0]
         else:
-            # First frame - select most confident person ONCE
-            face_confidences = [(idx, face.confidence) for idx, face in enumerate(faces)]
-            face_confidences.sort(key=lambda x: x[1], reverse=True)
-            desired_person_idx = face_confidences[0][0]
-            logger.info(f"INITIAL SELECTION - Person {desired_person_idx} (will be tracked throughout entire video)")
-
-        # IGNORE SPEECH DETECTION - it was causing instability
-        # We now track ONE person from start to finish, regardless of who speaks
-
-        # OLD LOGIC (commented out - was causing issues):
-        # This logic would switch based on "who is more centered" which caused constant switching
-        if False:  # Disabled
-            # Calculate distance from center for each face
-            center_distances = []
-            for idx, face in enumerate(faces):
-                # Euclidean distance from frame center
-                dx = face.center_x - frame_center_x
-                dy = face.center_y - frame_center_y
-                distance = np.sqrt(dx**2 + dy**2)
-                center_distances.append((idx, distance, face.confidence))
-
-            # Sort by distance (closest first), then by confidence as tiebreaker
-            center_distances.sort(key=lambda x: (x[1], -x[2]))
-            most_centered_idx = center_distances[0][0]
-            most_centered_distance = center_distances[0][1]
-
-            # STICKY BEHAVIOR: If we already have someone selected, only switch if:
-            # - New person is SIGNIFICANTLY more centered (30% closer to center)
-            # - OR current person is now very far from center (>40% of frame width)
             if self.current_selected_people and len(self.current_selected_people) > 0:
                 current_idx = self.current_selected_people[0]
                 if current_idx < len(faces):
-                    current_face = faces[current_idx]
-                    current_dx = current_face.center_x - frame_center_x
-                    current_dy = current_face.center_y - frame_center_y
-                    current_distance = np.sqrt(current_dx**2 + current_dy**2)
-
-                    # Define "significantly better" threshold
-                    max_acceptable_distance = frame_width * 0.4  # 40% of frame width
-                    improvement_threshold = 0.7  # New person must be 30% closer (0.7 ratio)
-
-                    # Keep current person if they're still reasonably centered
-                    if current_distance < max_acceptable_distance:
-                        # Current person is still acceptable - only switch if new is MUCH better
-                        if most_centered_distance < current_distance * improvement_threshold:
-                            desired_person_idx = most_centered_idx
-                            logger.debug(f"Switching: new person MUCH more centered ({most_centered_distance:.0f} vs {current_distance:.0f})")
-                        else:
-                            desired_person_idx = current_idx  # Keep current
-                            logger.debug(f"Keeping current person: still reasonably centered ({current_distance:.0f} px from center)")
-                    else:
-                        # Current person too far from center - switch
-                        desired_person_idx = most_centered_idx
-                        logger.debug(f"Current person too far from center ({current_distance:.0f} px), switching")
+                    desired_person_idx = current_idx
                 else:
-                    # Current selection invalid
-                    desired_person_idx = most_centered_idx
-            else:
-                # First time - select most centered
-                desired_person_idx = most_centered_idx
+                    if self.previous_faces and current_idx < len(self.previous_faces):
+                        prev_face = self.previous_faces[current_idx]
+                        best_match_idx = None
+                        best_match_score = float('inf')
+                        for idx, face in enumerate(faces):
+                            dx = face.center_x - prev_face.center_x
+                            dy = face.center_y - prev_face.center_y
+                            dist = np.sqrt(dx**2 + dy**2)
+                            size_diff = abs(face.width - prev_face.width) + abs(face.height - prev_face.height)
+                            score = dist + size_diff * 0.5
+                            if score < best_match_score:
+                                best_match_score = score
+                                best_match_idx = idx
+
+                        if best_match_idx is not None and best_match_score < 1000:
+                            desired_person_idx = best_match_idx
+                        else:
+                            face_confidences = [(idx, face.confidence) for idx, face in enumerate(faces)]
+                            face_confidences.sort(key=lambda x: x[1], reverse=True)
+                            desired_person_idx = face_confidences[0][0]
+                    else:
+                        face_confidences = [(idx, face.confidence) for idx, face in enumerate(faces)]
+                        face_confidences.sort(key=lambda x: x[1], reverse=True)
+                        desired_person_idx = face_confidences[0][0]
+            else:
+                face_confidences = [(idx, face.confidence) for idx, face in enumerate(faces)]
+                face_confidences.sort(key=lambda x: x[1], reverse=True)
+                desired_person_idx = face_confidences[0][0]
 
         # Wrap in list for compatibility with existing code
         desired_people = [desired_person_idx] if desired_person_idx is not None else []
 
-        # ULTRA-STABLE MODE: NO SWITCHING LOGIC AT ALL
-        # Simply set the person and never change
         if not self.current_selected_people:
             # First time only
             self.current_selected_people = desired_people
             self.last_switch_frame = frame_number
-            logger.info(f"Frame {frame_number}: LOCKED ON person {desired_people} - will never switch")
+            logger.info(f"Frame {frame_number}: Locked on person {desired_people}")
         else:
-            # Already have someone - just update to desired (which is same person due to logic above)
             self.current_selected_people = desired_people
 
         return self.current_selected_people.copy()
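Illustration (not part of the commit): the position/size re-identification score in isolation, with a namedtuple standing in for FaceDetection.

    from collections import namedtuple
    import numpy as np

    Box = namedtuple("Box", "center_x center_y width height")  # stand-in for FaceDetection

    def match_score(face, prev, size_weight=0.5):
        # Center distance plus a weighted size change, as in the loop above.
        dist = np.hypot(face.center_x - prev.center_x, face.center_y - prev.center_y)
        size_diff = abs(face.width - prev.width) + abs(face.height - prev.height)
        return dist + size_weight * size_diff

    prev = Box(640, 360, 200, 200)
    candidates = [Box(650, 365, 205, 198), Box(300, 200, 120, 120)]
    best = min(range(len(candidates)), key=lambda i: match_score(candidates[i], prev))
    print(best, round(match_score(candidates[best], prev), 1))  # prints: 0 14.7 (well under the 1000 cutoff)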
@@ -798,24 +764,77 @@ class ContextAnalyzer:
         raw_focus_x = most_confident.center_x
         raw_focus_y = most_confident.center_y
 
         # Apply temporal smoothing using focus history
+        if self.focus_history:
+            last_x, last_y = self.focus_history[-1]
+            dx = abs(raw_focus_x - last_x)
+            dy = abs(raw_focus_y - last_y)
+            if dx < self.focus_dead_zone and dy < self.focus_dead_zone:
+                return self.focus_history[-1]
+
         self.focus_history.append((raw_focus_x, raw_focus_y))
         if len(self.focus_history) > self.focus_history_size:
             self.focus_history.pop(0)
 
-        # Calculate smoothed focus as weighted average (more weight to recent frames)
-        if len(self.focus_history) > 1:
-            # Exponential weights: recent frames have more influence
-            weights = [2 ** i for i in range(len(self.focus_history))]
-            total_weight = sum(weights)
-
-            smoothed_x = sum(x * w for (x, y), w in zip(self.focus_history, weights)) / total_weight
-            smoothed_y = sum(y * w for (x, y), w in zip(self.focus_history, weights)) / total_weight
-
-            return (int(smoothed_x), int(smoothed_y))
+        if len(self.focus_history) >= 5:
+            xs = [x for x, y in self.focus_history]
+            ys = [y for x, y in self.focus_history]
+            median_x = int(np.median(xs))
+            median_y = int(np.median(ys))
+            return (median_x, median_y)
         else:
             return (raw_focus_x, raw_focus_y)
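Illustration (not part of the commit): the dead zone plus median window in one standalone function, showing why a single outlier no longer drags the focus point.

    import numpy as np

    def smooth_focus(history, raw, dead_zone=60, window=20):
        # Inside the dead zone: hold the previous focus to avoid micro-jitter.
        if history and abs(raw[0] - history[-1][0]) < dead_zone and abs(raw[1] - history[-1][1]) < dead_zone:
            return history[-1]
        history.append(raw)
        del history[:-window]          # keep only the most recent points
        if len(history) >= 5:
            xs, ys = zip(*history)
            return int(np.median(xs)), int(np.median(ys))
        return raw

    hist = [(400, 300), (405, 298), (398, 301), (402, 305), (399, 299)]
    print(smooth_focus(hist, (900, 310)))  # (401, 300): the outlier barely moves the median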
+
+    def _calculate_group_bounding_box(
+        self,
+        faces: List[FaceDetection],
+        padding_percent: float = 0.15,
+        max_faces: int = 6
+    ) -> Optional[GroupBoundingBox]:
+        """
+        Calculate bounding box containing all detected faces with padding.
+
+        Args:
+            faces: List of detected faces
+            padding_percent: Padding around group as percentage of bbox dimensions
+            max_faces: Maximum faces to include (use most confident if exceeded)
+
+        Returns:
+            GroupBoundingBox or None if no faces
+        """
+        if not faces:
+            return None
+
+        # If too many faces, use most confident ones
+        if len(faces) > max_faces:
+            faces = sorted(faces, key=lambda f: f.confidence, reverse=True)[:max_faces]
+
+        # Calculate bounding box containing all faces
+        min_x = min(f.x for f in faces)
+        max_x = max(f.x + f.width for f in faces)
+        min_y = min(f.y for f in faces)
+        max_y = max(f.y + f.height for f in faces)
+
+        # Add padding
+        width = max_x - min_x
+        height = max_y - min_y
+        pad_x = int(width * padding_percent)
+        pad_y = int(height * padding_percent)
+
+        final_x = max(0, min_x - pad_x)
+        final_y = max(0, min_y - pad_y)
+        final_width = width + 2 * pad_x
+        final_height = height + 2 * pad_y
+
+        return GroupBoundingBox(
+            x=final_x,
+            y=final_y,
+            width=final_width,
+            height=final_height,
+            center_x=final_x + final_width // 2,
+            center_y=final_y + final_height // 2,
+            face_count=len(faces)
+        )
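Illustration (not part of the commit): the same padding arithmetic rerun on two invented boxes, with plain tuples standing in for FaceDetection.

    boxes = [(100, 50, 200, 200), (500, 80, 180, 180)]      # (x, y, width, height)
    min_x = min(x for x, _, _, _ in boxes)
    max_x = max(x + w for x, _, w, _ in boxes)
    min_y = min(y for _, y, _, _ in boxes)
    max_y = max(y + h for _, y, _, h in boxes)
    width, height = max_x - min_x, max_y - min_y            # 580, 210
    pad_x, pad_y = int(width * 0.15), int(height * 0.15)    # 87, 31
    print(max(0, min_x - pad_x), max(0, min_y - pad_y),     # 13 19
          width + 2 * pad_x, height + 2 * pad_y)            # 754 272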
 
     def close(self):
         """Release resources."""
         self.detector.close()