Adjust context, speech and focus handling; fix video jitter and other bugs

LeoMortari
2026-01-03 19:42:23 -03:00
parent c1914dad00
commit 3f7329869d
7 changed files with 932 additions and 455 deletions


@@ -41,6 +41,18 @@ class PersonTracking:
frame_number: int
@dataclass
class GroupBoundingBox:
"""Bounding box containing all tracked faces."""
x: int
y: int
width: int
height: int
center_x: int
center_y: int
face_count: int
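# Example instance (illustrative, assumed values): a group box spanning two
# faces from x=40 to x=570 and y=60 to y=220 would be stored as
#   GroupBoundingBox(x=40, y=60, width=530, height=160,
#                    center_x=305, center_y=140, face_count=2)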
@dataclass
class FrameContext:
"""Context information for a video frame."""
@@ -50,7 +62,8 @@ class FrameContext:
active_speakers: List[int] # indices of speaking faces
primary_focus: Optional[Tuple[int, int]] # (x, y) center point
layout_mode: str # "single", "dual_split", "grid"
selected_people: List[int] = field(default_factory=list) # indices of people selected for display (max 2)
selected_people: List[int] = field(default_factory=list) # indices of people selected for display
group_bounds: Optional[GroupBoundingBox] = None # bounding box for all detected faces
class MediaPipeDetector:
@@ -385,10 +398,11 @@ class AudioActivityDetector:
class ContextAnalyzer:
"""Analyzes video context to determine focus and layout."""
def __init__(self, person_switch_cooldown: int = 30):
def __init__(self, person_switch_cooldown: int = 30, min_face_confidence: float = 0.3):
self.detector = MediaPipeDetector()
self.audio_detector = AudioActivityDetector()
self.previous_faces: List[FaceDetection] = []
self.min_face_confidence = min_face_confidence
# Person tracking state
self.current_selected_people: List[int] = [] # Indices of people currently on screen
@@ -400,9 +414,9 @@ class ContextAnalyzer:
self.stability_threshold = 20 # Frames needed to confirm a switch (increased for more stability)
self.last_switched_people: List[int] = [] # People we just switched FROM
# Focus stability: track recent focus points for temporal smoothing
self.focus_history: List[Tuple[int, int]] = []
self.focus_history_size: int = 5 # Keep last 5 focus points for smoothing
self.focus_history_size: int = 20
self.focus_dead_zone: int = 60
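# Together these two settings drive the anti-jitter behaviour: focus moves
# smaller than the 60 px dead zone are ignored, and larger moves are
# median-smoothed over the last 20 recorded points in _calculate_focus_point.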
# Debug logging
self.frame_log_interval = 30 # Log every N frames
@@ -429,9 +443,11 @@ class ContextAnalyzer:
FrameContext with detection results
"""
faces = self.detector.detect_face_landmarks(frame)
faces = [face for face in faces if face.confidence >= self.min_face_confidence] if faces else []
if not faces:
faces = self.detector.detect_faces(frame)
faces = [face for face in faces if face.confidence >= self.min_face_confidence] if faces else []
# Determine who is speaking
active_speakers = []
@@ -440,13 +456,13 @@ class ContextAnalyzer:
for i, face in enumerate(faces):
is_speaking = False
# Check audio-based speech detection
if has_audio_speech:
is_speaking = True
# Check lip movement (visual speech detection)
# Prefer visual cues when multiple faces are present.
if face.landmarks and len(self.previous_faces) > i:
is_speaking = is_speaking or self._detect_lip_movement(face, self.previous_faces[i])
is_speaking = self._detect_lip_movement(face, self.previous_faces[i])
# Audio can confirm speech when there's only one face.
if has_audio_speech and len(faces) == 1:
is_speaking = True
if is_speaking:
active_speakers.append(i)
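# Net effect of the reordering above: with several faces on screen only lip
# movement marks a speaker, and the audio track alone is trusted only in the
# single-face case, so shared audio no longer tags every face as speaking.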
@@ -456,26 +472,41 @@ class ContextAnalyzer:
logger.info(f"Speech detection - Frame {frame_number}: audio_active={has_audio_speech}, "
f"speakers={active_speakers}, total_faces={len(faces)}")
# Select THE person to focus on (always single person)
# Priority: 1) Who is speaking, 2) Who is most centered
selected_people = self._select_person_to_focus(
faces,
active_speakers,
frame_number,
frame.shape[1], # frame width for center calculation
frame.shape[0] # frame height for center calculation
)
if active_speakers:
selected_people = active_speakers[:4]
if len(selected_people) == 1:
layout_mode = "single"
elif len(selected_people) == 2:
layout_mode = "dual_split"
else:
layout_mode = "grid"
else:
# Select THE person to focus on (always single person)
# Priority: 1) Who is speaking, 2) Who is most centered
selected_people = self._select_person_to_focus(
faces,
active_speakers,
frame_number,
frame.shape[1], # frame width for center calculation
frame.shape[0] # frame height for center calculation
)
layout_mode = "single"
# Always use single-person layout (no split screen)
layout_mode = "single"
# Calculate group bounding box for ALL detected faces (multi-person support)
group_bounds = self._calculate_group_bounding_box(faces)
primary_focus = self._calculate_focus_point(faces, selected_people)
# For multi-person mode, use group center as primary focus
if group_bounds and group_bounds.face_count > 1:
primary_focus = (group_bounds.center_x, group_bounds.center_y)
else:
primary_focus = self._calculate_focus_point(faces, selected_people)
# Debug logging every N frames
if frame_number % self.frame_log_interval == 0:
focus_reason = "speaker" if active_speakers else "no_speech_detected"
group_info = f", group={group_bounds.face_count} faces" if group_bounds else ""
logger.info(f"Frame {frame_number}: {len(faces)} faces, "
f"{len(active_speakers)} speakers, focus={selected_people}, reason={focus_reason}")
f"{len(active_speakers)} speakers, focus={selected_people}, reason={focus_reason}{group_info}")
self.previous_faces = faces
@@ -486,7 +517,8 @@ class ContextAnalyzer:
active_speakers=active_speakers,
primary_focus=primary_focus,
layout_mode=layout_mode,
selected_people=selected_people
selected_people=selected_people,
group_bounds=group_bounds
)
def _detect_lip_movement(self, current_face: FaceDetection, previous_face: FaceDetection) -> bool:
@@ -543,134 +575,68 @@ class ContextAnalyzer:
self.current_selected_people = []
return []
# If only 1 person, always focus on them
if len(faces) == 1:
self.current_selected_people = [0]
return [0]
# Check if we can switch people (cooldown period)
frames_since_last_switch = frame_number - self.last_switch_frame
can_switch = frames_since_last_switch >= self.person_switch_cooldown
# Calculate frame center for distance comparison
frame_center_x = frame_width / 2
frame_center_y = frame_height / 2
# ULTRA-STABLE MODE: Select ONE person at start, NEVER switch
# This completely eliminates switching-related instability
desired_person_idx = None
# If we already have someone selected, ALWAYS KEEP THEM (never switch)
if self.current_selected_people and len(self.current_selected_people) > 0:
current_idx = self.current_selected_people[0]
if current_idx < len(faces):
# Current person still detected - keep them
desired_person_idx = current_idx
if active_speakers:
if self.current_selected_people and self.current_selected_people[0] in active_speakers:
desired_person_idx = self.current_selected_people[0]
else:
# Current person lost - try to find them again by position/size similarity
# This handles temporary detection failures
current_person_found = False
if self.previous_faces and current_idx < len(self.previous_faces):
prev_face = self.previous_faces[current_idx]
# Find most similar face by position and size
best_match_idx = None
best_match_score = float('inf')
for idx, face in enumerate(faces):
# Distance between centers
dx = face.center_x - prev_face.center_x
dy = face.center_y - prev_face.center_y
dist = np.sqrt(dx**2 + dy**2)
# Size similarity
size_diff = abs(face.width - prev_face.width) + abs(face.height - prev_face.height)
score = dist + size_diff * 0.5
if score < best_match_score:
best_match_score = score
best_match_idx = idx
if best_match_idx is not None and best_match_score < 1000:
desired_person_idx = best_match_idx
current_person_found = True
if not current_person_found:
# Really lost - select most confident
face_confidences = [(idx, face.confidence) for idx, face in enumerate(faces)]
face_confidences.sort(key=lambda x: x[1], reverse=True)
desired_person_idx = face_confidences[0][0]
logger.warning(f"Current person permanently lost - selecting new: {desired_person_idx}")
if can_switch or not self.current_selected_people:
desired_person_idx = active_speakers[0]
if self.current_selected_people and desired_person_idx != self.current_selected_people[0]:
logger.info(f"Switching focus to speaker: {desired_person_idx}")
self.last_switch_frame = frame_number
else:
desired_person_idx = self.current_selected_people[0] if self.current_selected_people else active_speakers[0]
else:
# First frame - select most confident person ONCE
face_confidences = [(idx, face.confidence) for idx, face in enumerate(faces)]
face_confidences.sort(key=lambda x: x[1], reverse=True)
desired_person_idx = face_confidences[0][0]
logger.info(f"INITIAL SELECTION - Person {desired_person_idx} (will be tracked throughout entire video)")
# IGNORE SPEECH DETECTION - it was causing instability
# We now track ONE person from start to finish, regardless of who speaks
# OLD LOGIC (commented out - was causing issues):
# This logic would switch based on "who is more centered" which caused constant switching
if False: # Disabled
# Calculate distance from center for each face
center_distances = []
for idx, face in enumerate(faces):
# Euclidean distance from frame center
dx = face.center_x - frame_center_x
dy = face.center_y - frame_center_y
distance = np.sqrt(dx**2 + dy**2)
center_distances.append((idx, distance, face.confidence))
# Sort by distance (closest first), then by confidence as tiebreaker
center_distances.sort(key=lambda x: (x[1], -x[2]))
most_centered_idx = center_distances[0][0]
most_centered_distance = center_distances[0][1]
# STICKY BEHAVIOR: If we already have someone selected, only switch if:
# - New person is SIGNIFICANTLY more centered (30% closer to center)
# - OR current person is now very far from center (>40% of frame width)
if self.current_selected_people and len(self.current_selected_people) > 0:
current_idx = self.current_selected_people[0]
if current_idx < len(faces):
current_face = faces[current_idx]
current_dx = current_face.center_x - frame_center_x
current_dy = current_face.center_y - frame_center_y
current_distance = np.sqrt(current_dx**2 + current_dy**2)
# Define "significantly better" threshold
max_acceptable_distance = frame_width * 0.4 # 40% of frame width
improvement_threshold = 0.7 # New person must be 30% closer (0.7 ratio)
# Keep current person if they're still reasonably centered
if current_distance < max_acceptable_distance:
# Current person is still acceptable - only switch if new is MUCH better
if most_centered_distance < current_distance * improvement_threshold:
desired_person_idx = most_centered_idx
logger.debug(f"Switching: new person MUCH more centered ({most_centered_distance:.0f} vs {current_distance:.0f})")
else:
desired_person_idx = current_idx # Keep current
logger.debug(f"Keeping current person: still reasonably centered ({current_distance:.0f} px from center)")
else:
# Current person is too far from center - switch
desired_person_idx = most_centered_idx
logger.debug(f"Current person too far from center ({current_distance:.0f} px), switching")
desired_person_idx = current_idx
else:
# Current selection invalid
desired_person_idx = most_centered_idx
else:
# First time - select most centered
desired_person_idx = most_centered_idx
if self.previous_faces and current_idx < len(self.previous_faces):
prev_face = self.previous_faces[current_idx]
best_match_idx = None
best_match_score = float('inf')
for idx, face in enumerate(faces):
dx = face.center_x - prev_face.center_x
dy = face.center_y - prev_face.center_y
dist = np.sqrt(dx**2 + dy**2)
size_diff = abs(face.width - prev_face.width) + abs(face.height - prev_face.height)
score = dist + size_diff * 0.5
if score < best_match_score:
best_match_score = score
best_match_idx = idx
if best_match_idx is not None and best_match_score < 1000:
desired_person_idx = best_match_idx
else:
face_confidences = [(idx, face.confidence) for idx, face in enumerate(faces)]
face_confidences.sort(key=lambda x: x[1], reverse=True)
desired_person_idx = face_confidences[0][0]
else:
face_confidences = [(idx, face.confidence) for idx, face in enumerate(faces)]
face_confidences.sort(key=lambda x: x[1], reverse=True)
desired_person_idx = face_confidences[0][0]
else:
face_confidences = [(idx, face.confidence) for idx, face in enumerate(faces)]
face_confidences.sort(key=lambda x: x[1], reverse=True)
desired_person_idx = face_confidences[0][0]
# Wrap in list for compatibility with existing code
desired_people = [desired_person_idx] if desired_person_idx is not None else []
# ULTRA-STABLE MODE: NO SWITCHING LOGIC AT ALL
# Simply set the person and never change
if not self.current_selected_people:
# First time only
self.current_selected_people = desired_people
self.last_switch_frame = frame_number
logger.info(f"Frame {frame_number}: LOCKED ON person {desired_people} - will never switch")
logger.info(f"Frame {frame_number}: Locked on person {desired_people}")
else:
# Already have someone - just update to desired (which is same person due to logic above)
self.current_selected_people = desired_people
return self.current_selected_people.copy()
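# Illustrative numbers for the re-matching score (assumed, not measured): a
# face that moved 40 px with a 10 + 8 px box-size change scores
# 40 + 18 * 0.5 = 49, well under the 1000 threshold, so the same person is
# kept even when the detector briefly reorders or drops detections.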
@@ -798,24 +764,77 @@ class ContextAnalyzer:
raw_focus_x = most_confident.center_x
raw_focus_y = most_confident.center_y
# Apply temporal smoothing using focus history
if self.focus_history:
last_x, last_y = self.focus_history[-1]
dx = abs(raw_focus_x - last_x)
dy = abs(raw_focus_y - last_y)
if dx < self.focus_dead_zone and dy < self.focus_dead_zone:
return self.focus_history[-1]
self.focus_history.append((raw_focus_x, raw_focus_y))
if len(self.focus_history) > self.focus_history_size:
self.focus_history.pop(0)
# Calculate smoothed focus as weighted average (more weight to recent frames)
if len(self.focus_history) > 1:
# Exponential weights: recent frames have more influence
weights = [2 ** i for i in range(len(self.focus_history))]
total_weight = sum(weights)
smoothed_x = sum(x * w for (x, y), w in zip(self.focus_history, weights)) / total_weight
smoothed_y = sum(y * w for (x, y), w in zip(self.focus_history, weights)) / total_weight
return (int(smoothed_x), int(smoothed_y))
if len(self.focus_history) >= 5:
xs = [x for x, y in self.focus_history]
ys = [y for x, y in self.focus_history]
median_x = int(np.median(xs))
median_y = int(np.median(ys))
return (median_x, median_y)
else:
return (raw_focus_x, raw_focus_y)
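# Worked example of the smoothing (illustrative values): with the 60 px dead
# zone, a raw focus of (415, 300) after a last point of (400, 305) moves only
# 15 / 5 px, so the previous point is returned unchanged. Larger jumps enter
# the history, and once 5+ points exist the median is used, e.g.
#   np.median([398, 400, 402, 405, 700]) == 402.0
# so a single outlier detection barely shifts the focus, unlike the previous
# exponentially weighted mean.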
def _calculate_group_bounding_box(
self,
faces: List[FaceDetection],
padding_percent: float = 0.15,
max_faces: int = 6
) -> Optional[GroupBoundingBox]:
"""
Calculate bounding box containing all detected faces with padding.
Args:
faces: List of detected faces
padding_percent: Padding around group as percentage of bbox dimensions
max_faces: Maximum faces to include (use most confident if exceeded)
Returns:
GroupBoundingBox or None if no faces
"""
if not faces:
return None
# If too many faces, use most confident ones
if len(faces) > max_faces:
faces = sorted(faces, key=lambda f: f.confidence, reverse=True)[:max_faces]
# Calculate bounding box containing all faces
min_x = min(f.x for f in faces)
max_x = max(f.x + f.width for f in faces)
min_y = min(f.y for f in faces)
max_y = max(f.y + f.height for f in faces)
# Add padding
width = max_x - min_x
height = max_y - min_y
pad_x = int(width * padding_percent)
pad_y = int(height * padding_percent)
final_x = max(0, min_x - pad_x)
final_y = max(0, min_y - pad_y)
final_width = width + 2 * pad_x
final_height = height + 2 * pad_y
return GroupBoundingBox(
x=final_x,
y=final_y,
width=final_width,
height=final_height,
center_x=final_x + final_width // 2,
center_y=final_y + final_height // 2,
face_count=len(faces)
)
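# Illustrative usage (assumed FaceDetection keyword arguments; remaining
# fields elided):
#   >>> analyzer = ContextAnalyzer()
#   >>> faces = [FaceDetection(x=100, y=80, width=120, height=120, ...),
#   ...          FaceDetection(x=400, y=90, width=110, height=110, ...)]
#   >>> analyzer._calculate_group_bounding_box(faces)
#   GroupBoundingBox(x=39, y=62, width=532, height=156,
#                    center_x=305, center_y=140, face_count=2)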
def close(self):
"""Release resources."""
self.detector.close()