Makes several adjustments to improve tracking and video rendering

This commit is contained in:
LeoMortari
2025-12-18 02:26:25 -03:00
parent 78e35d65fd
commit 07d301f110
11 changed files with 984 additions and 316 deletions


@@ -7,7 +7,7 @@ and identify who is speaking in video content using MediaPipe and audio analysis
from __future__ import annotations
import logging
from dataclasses import dataclass
from dataclasses import dataclass, field
from typing import List, Optional, Tuple
import cv2
@@ -50,20 +50,22 @@ class FrameContext:
active_speakers: List[int] # indices of speaking faces
primary_focus: Optional[Tuple[int, int]] # (x, y) center point
layout_mode: str # "single", "dual_split", "grid"
selected_people: List[int] = field(default_factory=list) # indices of people selected for display (max 2)
class MediaPipeDetector:
"""Face and pose detection using MediaPipe."""
"""Face and pose detection using MediaPipe with OpenCV Haar Cascade fallback."""
def __init__(self, min_detection_confidence: float = 0.5, min_tracking_confidence: float = 0.5):
def __init__(self, min_detection_confidence: float = 0.3, min_tracking_confidence: float = 0.3):
self.min_detection_confidence = min_detection_confidence
self.min_tracking_confidence = min_tracking_confidence
self.mp_face_detection = mp.solutions.face_detection
self.mp_face_mesh = mp.solutions.face_mesh
# MediaPipe detectors with lower confidence for better cartoon detection
self.face_detection = self.mp_face_detection.FaceDetection(
min_detection_confidence=min_detection_confidence,
model_selection=1
model_selection=0 # 0 = short-range model; in practice detects varied/stylized faces (including cartoons) more reliably
)
self.face_mesh = self.mp_face_mesh.FaceMesh(
@@ -73,11 +75,17 @@ class MediaPipeDetector:
static_image_mode=False
)
logger.info("MediaPipe detector initialized")
# OpenCV Haar Cascade as fallback for cartoon/anime faces
self.haar_cascade = cv2.CascadeClassifier(cv2.data.haarcascades + 'haarcascade_frontalface_default.xml')
# Alternative cascade for profile/side faces
self.haar_cascade_profile = cv2.CascadeClassifier(cv2.data.haarcascades + 'haarcascade_profileface.xml')
logger.info(f"Hybrid detector initialized (MediaPipe confidence={min_detection_confidence}, OpenCV Haar Cascade enabled)")
def detect_faces(self, frame: np.ndarray) -> List[FaceDetection]:
"""
Detect faces in a frame.
Detect faces in a frame using hybrid approach (MediaPipe + OpenCV Haar Cascade).
Args:
frame: RGB image array
@@ -94,6 +102,7 @@ class MediaPipeDetector:
else:
frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
# Try MediaPipe first
results = self.face_detection.process(frame_rgb)
faces = []
@@ -126,8 +135,111 @@ class MediaPipeDetector:
center_y=center_y
))
# Fallback to OpenCV Haar Cascade if MediaPipe found nothing
if not faces:
faces = self._detect_faces_haar_cascade(frame, width, height)
return faces
def _detect_faces_haar_cascade(self, frame: np.ndarray, width: int, height: int) -> List[FaceDetection]:
"""
Detect faces using OpenCV Haar Cascade (works better with cartoons/anime).
Args:
frame: Image frame (BGR format)
width: Frame width
height: Frame height
Returns:
List of detected faces
"""
# Convert to grayscale for Haar Cascade
if len(frame.shape) == 3:
gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
else:
gray = frame
# Detect frontal faces with more sensitive parameters
frontal_faces = self.haar_cascade.detectMultiScale(
gray,
scaleFactor=1.05, # More sensitive to size variations
minNeighbors=3, # Lower threshold for detection (more permissive)
minSize=(30, 30), # Smaller minimum size
flags=cv2.CASCADE_SCALE_IMAGE
)
# Also try profile faces
profile_faces = self.haar_cascade_profile.detectMultiScale(
gray,
scaleFactor=1.1,
minNeighbors=3,
minSize=(30, 30),
flags=cv2.CASCADE_SCALE_IMAGE
)
# Combine frontal and profile detections
all_faces = []
for (x, y, w, h) in frontal_faces:
x = max(0, min(x, width - 1))
y = max(0, min(y, height - 1))
w = min(w, width - x)
h = min(h, height - y)
center_x = x + w // 2
center_y = y + h // 2
all_faces.append(FaceDetection(
x=x,
y=y,
width=w,
height=h,
confidence=0.7, # Haar Cascade doesn't provide confidence, use fixed value
center_x=center_x,
center_y=center_y
))
for (x, y, w, h) in profile_faces:
# Check if this face overlaps significantly with any frontal face
overlap = False
for existing_face in all_faces:
# Measure how much this profile box overlaps the existing detection (relative to the profile box's own area, not true IoU)
x1_overlap = max(x, existing_face.x)
y1_overlap = max(y, existing_face.y)
x2_overlap = min(x + w, existing_face.x + existing_face.width)
y2_overlap = min(y + h, existing_face.y + existing_face.height)
if x1_overlap < x2_overlap and y1_overlap < y2_overlap:
overlap_area = (x2_overlap - x1_overlap) * (y2_overlap - y1_overlap)
face_area = w * h
if overlap_area / face_area > 0.3: # 30% overlap threshold
overlap = True
break
if not overlap:
x = max(0, min(x, width - 1))
y = max(0, min(y, height - 1))
w = min(w, width - x)
h = min(h, height - y)
center_x = x + w // 2
center_y = y + h // 2
all_faces.append(FaceDetection(
x=x,
y=y,
width=w,
height=h,
confidence=0.6, # Slightly lower confidence for profile
center_x=center_x,
center_y=center_y
))
if all_faces:
logger.debug(f"Haar Cascade detected {len(all_faces)} faces (MediaPipe failed)")
return all_faces
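Note (illustrative usage sketch): exercising the hybrid path end to end. The module name context_analyzer and the file test_frame.jpg are assumptions; the frame is passed as read by cv2.imread (BGR) and converted internally per the branch above.
import cv2
from context_analyzer import MediaPipeDetector  # hypothetical module name

detector = MediaPipeDetector(min_detection_confidence=0.3)
frame = cv2.imread("test_frame.jpg")  # assumed local test image (BGR)
faces = detector.detect_faces(frame)  # MediaPipe first, Haar cascade fallback if it finds nothing
for f in faces:
    print(f"({f.center_x}, {f.center_y}) {f.width}x{f.height} conf={f.confidence:.2f}")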
def detect_face_landmarks(self, frame: np.ndarray) -> List[FaceDetection]:
"""
Detect faces with landmarks for lip sync detection.
@@ -203,8 +315,8 @@ class AudioActivityDetector:
def detect_speaking_periods(
self,
audio_samples: np.ndarray,
threshold: float = 0.02,
min_speech_duration: float = 0.1
threshold: float = 0.01, # Reduced from 0.02 for better speech detection
min_speech_duration: float = 0.05 # Reduced from 0.1 to catch shorter utterances
) -> List[Tuple[float, float]]:
"""
Detect periods of speech in audio.
@@ -250,6 +362,16 @@ class AudioActivityDetector:
if end_time - start_time >= min_speech_duration:
periods.append((start_time, end_time))
# Log detected speech periods for debugging
if periods:
total_speech_time = sum(end - start for start, end in periods)
logger.info(f"Audio speech detection: {len(periods)} periods found, "
f"total {total_speech_time:.1f}s of speech (threshold={threshold})")
else:
max_energy = max(energies) if energies else 0
logger.warning(f"No speech detected! Max energy={max_energy:.4f}, threshold={threshold} "
f"(try lowering threshold if speech should be present)")
return periods
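Note (illustrative): the detection above is simple energy thresholding. A standalone sketch of the same idea, using the 0.01 threshold from this change and an assumed 50 ms window size:
import numpy as np

def speech_flags(samples: np.ndarray, sample_rate: int,
                 threshold: float = 0.01, window_s: float = 0.05):
    # Mark each fixed-size window whose RMS energy exceeds the threshold.
    win = max(1, int(sample_rate * window_s))
    flags = []
    for start in range(0, len(samples) - win + 1, win):
        rms = float(np.sqrt(np.mean(samples[start:start + win] ** 2)))
        flags.append((start / sample_rate, rms > threshold))
    return flags  # [(window_start_time, is_speech), ...]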
def is_speaking_at_time(self, speaking_periods: List[Tuple[float, float]], time: float) -> bool:
@@ -263,12 +385,29 @@ class AudioActivityDetector:
class ContextAnalyzer:
"""Analyzes video context to determine focus and layout."""
def __init__(self):
def __init__(self, person_switch_cooldown: int = 30):
self.detector = MediaPipeDetector()
self.audio_detector = AudioActivityDetector()
self.previous_faces: List[FaceDetection] = []
logger.info("Context analyzer initialized")
# Person tracking state
self.current_selected_people: List[int] = [] # Indices of people currently on screen
self.last_switch_frame: int = -999 # Frame when we last switched people
self.person_switch_cooldown = person_switch_cooldown # Minimum frames before switching
# Stability tracking to prevent flip-flopping
self.desired_people_history: List[List[int]] = [] # Track recent desired selections
self.stability_threshold = 20 # Frames needed to confirm a switch (increased for more stability)
self.last_switched_people: List[int] = [] # People we just switched FROM
# Focus stability: track recent focus points for temporal smoothing
self.focus_history: List[Tuple[int, int]] = []
self.focus_history_size: int = 5 # Keep last 5 focus points for smoothing
# Debug logging
self.frame_log_interval = 30 # Log every N frames
logger.info(f"Context analyzer initialized (cooldown={person_switch_cooldown} frames, focus_smoothing={self.focus_history_size})")
def analyze_frame(
self,
@@ -296,33 +435,47 @@ class ContextAnalyzer:
# Determine who is speaking
active_speakers = []
has_audio_speech = speaking_periods and self.audio_detector.is_speaking_at_time(speaking_periods, timestamp)
for i, face in enumerate(faces):
is_speaking = False
if speaking_periods and self.audio_detector.is_speaking_at_time(speaking_periods, timestamp):
# Check audio-based speech detection
if has_audio_speech:
is_speaking = True
# Check lip movement (visual speech detection)
if face.landmarks and len(self.previous_faces) > i:
is_speaking = is_speaking or self._detect_lip_movement(face, self.previous_faces[i])
if is_speaking:
active_speakers.append(i)
num_faces = len(faces)
num_speakers = len(active_speakers)
# Debug: Log speech detection
if frame_number % 30 == 0: # Every second at 30fps
logger.info(f"Speech detection - Frame {frame_number}: audio_active={has_audio_speech}, "
f"speakers={active_speakers}, total_faces={len(faces)}")
if num_faces == 0:
layout_mode = "single"
elif num_faces == 1:
layout_mode = "single"
elif num_faces == 2:
layout_mode = "dual_split"
elif num_faces >= 3:
layout_mode = "dual_split"
else:
layout_mode = "single"
# Select THE person to focus on (always single person)
# Priority: 1) Who is speaking, 2) Who is most centered
selected_people = self._select_person_to_focus(
faces,
active_speakers,
frame_number,
frame.shape[1], # frame width for center calculation
frame.shape[0] # frame height for center calculation
)
primary_focus = self._calculate_focus_point(faces, active_speakers)
# Always use single-person layout (no split screen)
layout_mode = "single"
primary_focus = self._calculate_focus_point(faces, selected_people)
# Debug logging every N frames
if frame_number % self.frame_log_interval == 0:
focus_reason = "speaker" if active_speakers else "no_speech_detected"
logger.info(f"Frame {frame_number}: {len(faces)} faces, "
f"{len(active_speakers)} speakers, focus={selected_people}, reason={focus_reason}")
self.previous_faces = faces
@@ -332,7 +485,8 @@ class ContextAnalyzer:
detected_faces=faces,
active_speakers=active_speakers,
primary_focus=primary_focus,
layout_mode=layout_mode
layout_mode=layout_mode,
selected_people=selected_people
)
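Note (illustrative usage sketch): driving analyze_frame over a video. The keyword-argument names and module name are assumptions, since the full signature is cut off in this hunk, but the returned FrameContext fields match the dataclass above.
import cv2
from context_analyzer import ContextAnalyzer  # hypothetical module name

analyzer = ContextAnalyzer(person_switch_cooldown=30)
cap = cv2.VideoCapture("input.mp4")  # assumed local video
fps = cap.get(cv2.CAP_PROP_FPS) or 30.0
frame_number = 0
while True:
    ok, frame = cap.read()
    if not ok:
        break
    ctx = analyzer.analyze_frame(
        frame=frame,
        timestamp=frame_number / fps,
        frame_number=frame_number,
        speaking_periods=[],  # or the output of AudioActivityDetector
    )
    print(frame_number, ctx.selected_people, ctx.primary_focus, ctx.layout_mode)
    frame_number += 1
cap.release()
analyzer.close()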
def _detect_lip_movement(self, current_face: FaceDetection, previous_face: FaceDetection) -> bool:
@@ -363,36 +517,309 @@ class ContextAnalyzer:
threshold = 2.0
return abs(current_dist - previous_dist) > threshold
def _calculate_focus_point(
def _select_person_to_focus(
self,
faces: List[FaceDetection],
active_speakers: List[int]
) -> Optional[Tuple[int, int]]:
active_speakers: List[int],
frame_number: int,
frame_width: int,
frame_height: int
) -> List[int]:
"""
Calculate the primary focus point based on detected faces and speakers.
IMPORTANT: This focuses on ONE person to avoid focusing on empty space (e.g. the table between speakers).
When multiple people are present, we pick the most relevant person, not average positions.
Select THE single person to focus on.
Priority: 1) Who is speaking, 2) Who is most centered in frame
Args:
faces: List of detected faces
active_speakers: Indices of faces that are speaking
active_speakers: Indices of people currently speaking
frame_number: Current frame number
frame_width: Frame width for center calculation
frame_height: Frame height for center calculation
Returns:
List with single person index [idx], or empty list if no faces
"""
if not faces:
self.current_selected_people = []
return []
# If only 1 person, always focus on them
if len(faces) == 1:
self.current_selected_people = [0]
return [0]
# Check if we can switch people (cooldown period)
frames_since_last_switch = frame_number - self.last_switch_frame
can_switch = frames_since_last_switch >= self.person_switch_cooldown
# Calculate frame center for distance comparison
frame_center_x = frame_width / 2
frame_center_y = frame_height / 2
# ULTRA-STABLE MODE: Select ONE person at start, NEVER switch
# This completely eliminates switching-related instability
desired_person_idx = None
# If we already have someone selected, ALWAYS KEEP THEM (never switch)
if self.current_selected_people and len(self.current_selected_people) > 0:
current_idx = self.current_selected_people[0]
if current_idx < len(faces):
# Current person still detected - keep them
desired_person_idx = current_idx
else:
# Current person lost - try to find them again by position/size similarity
# This handles temporary detection failures
current_person_found = False
if self.previous_faces and current_idx < len(self.previous_faces):
prev_face = self.previous_faces[current_idx]
# Find most similar face by position and size
best_match_idx = None
best_match_score = float('inf')
for idx, face in enumerate(faces):
# Distance between centers
dx = face.center_x - prev_face.center_x
dy = face.center_y - prev_face.center_y
dist = np.sqrt(dx**2 + dy**2)
# Size similarity
size_diff = abs(face.width - prev_face.width) + abs(face.height - prev_face.height)
score = dist + size_diff * 0.5
if score < best_match_score:
best_match_score = score
best_match_idx = idx
if best_match_idx is not None and best_match_score < 1000:
desired_person_idx = best_match_idx
current_person_found = True
if not current_person_found:
# Really lost - select most confident
face_confidences = [(idx, face.confidence) for idx, face in enumerate(faces)]
face_confidences.sort(key=lambda x: x[1], reverse=True)
desired_person_idx = face_confidences[0][0]
logger.warning(f"Current person permanently lost - selecting new: {desired_person_idx}")
else:
# First frame - select most confident person ONCE
face_confidences = [(idx, face.confidence) for idx, face in enumerate(faces)]
face_confidences.sort(key=lambda x: x[1], reverse=True)
desired_person_idx = face_confidences[0][0]
logger.info(f"INITIAL SELECTION - Person {desired_person_idx} (will be tracked throughout entire video)")
# IGNORE SPEECH DETECTION - it was causing instability
# We now track ONE person from start to finish, regardless of who speaks
# OLD LOGIC (kept below under `if False:` - was causing issues):
# it switched based on "who is more centered", which caused constant switching
if False: # Disabled
# Calculate distance from center for each face
center_distances = []
for idx, face in enumerate(faces):
# Euclidean distance from frame center
dx = face.center_x - frame_center_x
dy = face.center_y - frame_center_y
distance = np.sqrt(dx**2 + dy**2)
center_distances.append((idx, distance, face.confidence))
# Sort by distance (closest first), then by confidence as tiebreaker
center_distances.sort(key=lambda x: (x[1], -x[2]))
most_centered_idx = center_distances[0][0]
most_centered_distance = center_distances[0][1]
# STICKY BEHAVIOR: If we already have someone selected, only switch if:
# - New person is SIGNIFICANTLY more centered (30% closer to center)
# - OR current person is now very far from center (>40% of frame width)
if self.current_selected_people and len(self.current_selected_people) > 0:
current_idx = self.current_selected_people[0]
if current_idx < len(faces):
current_face = faces[current_idx]
current_dx = current_face.center_x - frame_center_x
current_dy = current_face.center_y - frame_center_y
current_distance = np.sqrt(current_dx**2 + current_dy**2)
# Define "significantly better" threshold
max_acceptable_distance = frame_width * 0.4 # 40% of frame width
improvement_threshold = 0.7 # New person must be 30% closer (0.7 ratio)
# Keep current person if they're still reasonably centered
if current_distance < max_acceptable_distance:
# Current person is still acceptable - only switch if new is MUCH better
if most_centered_distance < current_distance * improvement_threshold:
desired_person_idx = most_centered_idx
logger.debug(f"Switching: new person MUCH more centered ({most_centered_distance:.0f} vs {current_distance:.0f})")
else:
desired_person_idx = current_idx # Keep current
logger.debug(f"Keeping current person: still reasonably centered ({current_distance:.0f} px from center)")
else:
# Current person is too far from center - switch
desired_person_idx = most_centered_idx
logger.debug(f"Current person too far from center ({current_distance:.0f} px), switching")
else:
# Current selection invalid
desired_person_idx = most_centered_idx
else:
# First time - select most centered
desired_person_idx = most_centered_idx
# Wrap in list for compatibility with existing code
desired_people = [desired_person_idx] if desired_person_idx is not None else []
# ULTRA-STABLE MODE: NO SWITCHING LOGIC AT ALL
# Simply set the person and never change
if not self.current_selected_people:
# First time only
self.current_selected_people = desired_people
self.last_switch_frame = frame_number
logger.info(f"Frame {frame_number}: LOCKED ON person {desired_people} - will never switch")
else:
# Already have someone - just update to desired (which is same person due to logic above)
self.current_selected_people = desired_people
return self.current_selected_people.copy()
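Note (illustrative): the re-identification fallback above scores candidates by center distance plus a weighted box-size difference. Pulled out as a helper it would look like this (names are assumptions, not project code):
import numpy as np

def reid_score(prev_face, candidate):
    # Lower is better; mirrors `dist + size_diff * 0.5` with the 1000 cutoff above.
    dist = float(np.hypot(candidate.center_x - prev_face.center_x,
                          candidate.center_y - prev_face.center_y))
    size_diff = abs(candidate.width - prev_face.width) + abs(candidate.height - prev_face.height)
    return dist + 0.5 * size_diff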
def _ensure_distinct_people(
self,
faces: List[FaceDetection],
people_indices: List[int]
) -> List[int]:
"""
Ensure selected people are distinct by checking minimum distance between them.
Prevents showing the same person twice due to duplicate detection.
Args:
faces: List of detected faces
people_indices: Indices of people to validate
Returns:
List of distinct people indices (max 2)
"""
if len(people_indices) <= 1:
return people_indices
distinct_people = []
for idx in people_indices:
if idx >= len(faces):
continue
current_face = faces[idx]
is_distinct = True
# Check if this person is too close to any already selected person
for selected_idx in distinct_people:
selected_face = faces[selected_idx]
# Calculate distance between face centers
dx = current_face.center_x - selected_face.center_x
dy = current_face.center_y - selected_face.center_y
distance = np.sqrt(dx**2 + dy**2)
# Also check overlap (intersection area relative to the smaller box, not true IoU)
x1_overlap = max(current_face.x, selected_face.x)
y1_overlap = max(current_face.y, selected_face.y)
x2_overlap = min(current_face.x + current_face.width, selected_face.x + selected_face.width)
y2_overlap = min(current_face.y + current_face.height, selected_face.y + selected_face.height)
overlap_area = 0
if x1_overlap < x2_overlap and y1_overlap < y2_overlap:
overlap_area = (x2_overlap - x1_overlap) * (y2_overlap - y1_overlap)
# Calculate areas
area1 = current_face.width * current_face.height
area2 = selected_face.width * selected_face.height
min_area = min(area1, area2)
# If faces are very close OR significantly overlapping, they're likely the same person
# Minimum distance: 1/4 of average face width
min_distance = (current_face.width + selected_face.width) / 8
overlap_threshold = 0.3 # 30% overlap
if distance < min_distance or (min_area > 0 and overlap_area / min_area > overlap_threshold):
is_distinct = False
logger.debug(f"Person {idx} too similar to person {selected_idx} (dist={distance:.1f}, overlap={overlap_area/min_area if min_area > 0 else 0:.2%})")
break
if is_distinct:
distinct_people.append(idx)
# Stop at 2 distinct people
if len(distinct_people) >= 2:
break
# If we couldn't find 2 distinct people, return at most 1
if len(distinct_people) < 2 and len(people_indices) >= 2:
logger.debug(f"Only {len(distinct_people)} distinct person(s) found from {len(people_indices)} detections")
return distinct_people
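Note (illustrative worked numbers for the thresholds above): two detections of roughly 200 px wide faces are merged when their centers are closer than (200 + 200) / 8 = 50 px, or when the intersection covers more than 30% of the smaller box. As a standalone predicate (FaceDetection-like inputs assumed):
def looks_like_same_person(a, b) -> bool:
    dx, dy = a.center_x - b.center_x, a.center_y - b.center_y
    dist = (dx * dx + dy * dy) ** 0.5
    min_distance = (a.width + b.width) / 8  # 1/4 of the average face width
    ix = max(0, min(a.x + a.width, b.x + b.width) - max(a.x, b.x))
    iy = max(0, min(a.y + a.height, b.y + b.height) - max(a.y, b.y))
    overlap = ix * iy
    min_area = min(a.width * a.height, b.width * b.height)
    return dist < min_distance or (min_area > 0 and overlap / min_area > 0.3)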
def _calculate_focus_point(
self,
faces: List[FaceDetection],
selected_people: List[int]
) -> Optional[Tuple[int, int]]:
"""
Calculate the primary focus point based on selected people with temporal smoothing.
Args:
faces: List of detected faces
selected_people: Indices of people selected for display
Returns:
(x, y) tuple of focus center, or None if no faces
"""
if not faces:
if not faces or not selected_people:
return None
if active_speakers:
speaker_faces = [faces[i] for i in active_speakers if i < len(faces)]
if speaker_faces:
primary_speaker = max(speaker_faces, key=lambda f: f.confidence)
return (primary_speaker.center_x, primary_speaker.center_y)
# Calculate raw focus point
raw_focus_x = 0
raw_focus_y = 0
most_confident = max(faces, key=lambda f: f.confidence)
return (most_confident.center_x, most_confident.center_y)
if len(selected_people) == 1:
# Single person - focus on them
if selected_people[0] < len(faces):
primary = faces[selected_people[0]]
raw_focus_x = primary.center_x
raw_focus_y = primary.center_y
else:
# Fallback
most_confident = max(faces, key=lambda f: f.confidence)
raw_focus_x = most_confident.center_x
raw_focus_y = most_confident.center_y
else:
# Multiple people - focus on the CENTER between them for stability
# This prevents jarring movements when switching focus between people
valid_people = [idx for idx in selected_people if idx < len(faces)]
if valid_people:
centers_x = [faces[idx].center_x for idx in valid_people]
centers_y = [faces[idx].center_y for idx in valid_people]
raw_focus_x = int(np.mean(centers_x))
raw_focus_y = int(np.mean(centers_y))
else:
# Fallback
most_confident = max(faces, key=lambda f: f.confidence)
raw_focus_x = most_confident.center_x
raw_focus_y = most_confident.center_y
# Apply temporal smoothing using focus history
self.focus_history.append((raw_focus_x, raw_focus_y))
if len(self.focus_history) > self.focus_history_size:
self.focus_history.pop(0)
# Calculate smoothed focus as weighted average (more weight to recent frames)
if len(self.focus_history) > 1:
# Exponential weights: recent frames have more influence
weights = [2 ** i for i in range(len(self.focus_history))]
total_weight = sum(weights)
smoothed_x = sum(x * w for (x, y), w in zip(self.focus_history, weights)) / total_weight
smoothed_y = sum(y * w for (x, y), w in zip(self.focus_history, weights)) / total_weight
return (int(smoothed_x), int(smoothed_y))
else:
return (raw_focus_x, raw_focus_y)
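Note (illustrative): the smoothing above as a standalone function plus a worked example. The history is stored oldest-first, so the 2**i weights favor the newest point.
def smooth_focus(history):
    # history: list of (x, y) focus points, oldest first
    if not history:
        return None
    weights = [2 ** i for i in range(len(history))]
    total = sum(weights)
    x = sum(px * w for (px, _), w in zip(history, weights)) / total
    y = sum(py * w for (_, py), w in zip(history, weights)) / total
    return (int(x), int(y))

# smooth_focus([(100, 0), (107, 0), (114, 0)]) == (110, 0): weights 1, 2, 4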
def close(self):
"""Release resources."""
self.detector.close()
# Clear tracking state to free memory
self.previous_faces.clear()
self.current_selected_people.clear()
self.focus_history.clear()