# video-render/video_render/context_detection.py
"""
Context detection module for video analysis.
This module provides functionality to detect faces, track people, and identify who is
speaking in video content, using MediaPipe (with an OpenCV Haar Cascade fallback) and
audio energy analysis.
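
A minimal usage sketch (frame is a BGR image as read by OpenCV and audio_samples is a
float sample array; both are placeholders):

    analyzer = ContextAnalyzer()
    periods = analyzer.audio_detector.detect_speaking_periods(audio_samples)
    context = analyzer.analyze_frame(frame, timestamp=0.0,
                                     frame_number=0, speaking_periods=periods)
    analyzer.close()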
"""
from __future__ import annotations
import logging
from dataclasses import dataclass, field
from typing import List, Optional, Tuple
import cv2
import mediapipe as mp
import numpy as np
from scipy import signal
logger = logging.getLogger(__name__)
@dataclass
class FaceDetection:
"""Represents a detected face in a frame."""
x: int
y: int
width: int
height: int
confidence: float
center_x: int
center_y: int
landmarks: Optional[List[Tuple[int, int]]] = None
@dataclass
class PersonTracking:
"""Tracks a person across frames."""
person_id: int
face: FaceDetection
is_speaking: bool
speaking_confidence: float
frame_number: int
@dataclass
class GroupBoundingBox:
"""Bounding box containing all tracked faces."""
x: int
y: int
width: int
height: int
center_x: int
center_y: int
face_count: int
@dataclass
class FrameContext:
"""Context information for a video frame."""
frame_number: int
timestamp: float
detected_faces: List[FaceDetection]
active_speakers: List[int] # indices of speaking faces
primary_focus: Optional[Tuple[int, int]] # (x, y) center point
layout_mode: str # "single", "dual_split", "grid"
selected_people: List[int] = field(default_factory=list) # indices of people selected for display
group_bounds: Optional[GroupBoundingBox] = None # bounding box for all detected faces
class MediaPipeDetector:
"""Face and pose detection using MediaPipe with OpenCV Haar Cascade fallback."""
def __init__(self, min_detection_confidence: float = 0.3, min_tracking_confidence: float = 0.3):
self.min_detection_confidence = min_detection_confidence
self.min_tracking_confidence = min_tracking_confidence
self.mp_face_detection = mp.solutions.face_detection
self.mp_face_mesh = mp.solutions.face_mesh
# MediaPipe detectors with lower confidence for better cartoon detection
self.face_detection = self.mp_face_detection.FaceDetection(
min_detection_confidence=min_detection_confidence,
            model_selection=0  # Short-range model (0); handles varied faces, including cartoons, better here
)
self.face_mesh = self.mp_face_mesh.FaceMesh(
max_num_faces=5,
min_detection_confidence=min_detection_confidence,
min_tracking_confidence=min_tracking_confidence,
static_image_mode=False
)
# OpenCV Haar Cascade as fallback for cartoon/anime faces
self.haar_cascade = cv2.CascadeClassifier(cv2.data.haarcascades + 'haarcascade_frontalface_default.xml')
# Alternative cascade for profile/side faces
self.haar_cascade_profile = cv2.CascadeClassifier(cv2.data.haarcascades + 'haarcascade_profileface.xml')
logger.info(f"Hybrid detector initialized (MediaPipe confidence={min_detection_confidence}, OpenCV Haar Cascade enabled)")
def detect_faces(self, frame: np.ndarray) -> List[FaceDetection]:
"""
Detect faces in a frame using hybrid approach (MediaPipe + OpenCV Haar Cascade).
Args:
            frame: Image array in BGR, BGRA, or grayscale (as read by OpenCV); converted to RGB internally
Returns:
List of detected faces
"""
height, width = frame.shape[:2]
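        # Normalize the input to 3-channel RGB, which the MediaPipe detectors expect.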
if len(frame.shape) == 2:
frame_rgb = cv2.cvtColor(frame, cv2.COLOR_GRAY2RGB)
elif frame.shape[2] == 4:
frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGRA2RGB)
else:
frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
# Try MediaPipe first
results = self.face_detection.process(frame_rgb)
faces = []
if results.detections:
for detection in results.detections:
bbox = detection.location_data.relative_bounding_box
x = int(bbox.xmin * width)
y = int(bbox.ymin * height)
w = int(bbox.width * width)
h = int(bbox.height * height)
x = max(0, min(x, width - 1))
y = max(0, min(y, height - 1))
w = min(w, width - x)
h = min(h, height - y)
center_x = x + w // 2
center_y = y + h // 2
confidence = detection.score[0] if detection.score else 0.0
faces.append(FaceDetection(
x=x,
y=y,
width=w,
height=h,
confidence=confidence,
center_x=center_x,
center_y=center_y
))
# Fallback to OpenCV Haar Cascade if MediaPipe found nothing
if not faces:
faces = self._detect_faces_haar_cascade(frame, width, height)
return faces
def _detect_faces_haar_cascade(self, frame: np.ndarray, width: int, height: int) -> List[FaceDetection]:
"""
Detect faces using OpenCV Haar Cascade (works better with cartoons/anime).
Args:
frame: Image frame (BGR format)
width: Frame width
height: Frame height
Returns:
List of detected faces
"""
# Convert to grayscale for Haar Cascade
if len(frame.shape) == 3:
gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
else:
gray = frame
# Detect frontal faces with more sensitive parameters
frontal_faces = self.haar_cascade.detectMultiScale(
gray,
scaleFactor=1.05, # More sensitive to size variations
minNeighbors=3, # Lower threshold for detection (more permissive)
minSize=(30, 30), # Smaller minimum size
flags=cv2.CASCADE_SCALE_IMAGE
)
# Also try profile faces
profile_faces = self.haar_cascade_profile.detectMultiScale(
gray,
scaleFactor=1.1,
minNeighbors=3,
minSize=(30, 30),
flags=cv2.CASCADE_SCALE_IMAGE
)
# Combine frontal and profile detections
all_faces = []
for (x, y, w, h) in frontal_faces:
x = max(0, min(x, width - 1))
y = max(0, min(y, height - 1))
w = min(w, width - x)
h = min(h, height - y)
center_x = x + w // 2
center_y = y + h // 2
all_faces.append(FaceDetection(
x=x,
y=y,
width=w,
height=h,
confidence=0.7, # Haar Cascade doesn't provide confidence, use fixed value
center_x=center_x,
center_y=center_y
))
for (x, y, w, h) in profile_faces:
# Check if this face overlaps significantly with any frontal face
overlap = False
for existing_face in all_faces:
                # Measure overlap as the intersection area relative to this candidate face's area
x1_overlap = max(x, existing_face.x)
y1_overlap = max(y, existing_face.y)
x2_overlap = min(x + w, existing_face.x + existing_face.width)
y2_overlap = min(y + h, existing_face.y + existing_face.height)
if x1_overlap < x2_overlap and y1_overlap < y2_overlap:
overlap_area = (x2_overlap - x1_overlap) * (y2_overlap - y1_overlap)
face_area = w * h
if overlap_area / face_area > 0.3: # 30% overlap threshold
overlap = True
break
if not overlap:
x = max(0, min(x, width - 1))
y = max(0, min(y, height - 1))
w = min(w, width - x)
h = min(h, height - y)
center_x = x + w // 2
center_y = y + h // 2
all_faces.append(FaceDetection(
x=x,
y=y,
width=w,
height=h,
confidence=0.6, # Slightly lower confidence for profile
center_x=center_x,
center_y=center_y
))
if all_faces:
logger.debug(f"Haar Cascade detected {len(all_faces)} faces (MediaPipe failed)")
return all_faces
def detect_face_landmarks(self, frame: np.ndarray) -> List[FaceDetection]:
"""
Detect faces with landmarks for lip sync detection.
Args:
            frame: Image array in BGR, BGRA, or grayscale (as read by OpenCV); converted to RGB internally
Returns:
List of detected faces with landmark information
"""
height, width = frame.shape[:2]
if len(frame.shape) == 2:
frame_rgb = cv2.cvtColor(frame, cv2.COLOR_GRAY2RGB)
elif frame.shape[2] == 4:
frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGRA2RGB)
else:
frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
results = self.face_mesh.process(frame_rgb)
faces = []
if results.multi_face_landmarks:
for face_landmarks in results.multi_face_landmarks:
xs = [lm.x for lm in face_landmarks.landmark]
ys = [lm.y for lm in face_landmarks.landmark]
x_min, x_max = min(xs), max(xs)
y_min, y_max = min(ys), max(ys)
x = int(x_min * width)
y = int(y_min * height)
w = int((x_max - x_min) * width)
h = int((y_max - y_min) * height)
center_x = x + w // 2
center_y = y + h // 2
lip_landmarks = []
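                # MediaPipe Face Mesh lip landmarks: 13/14 are the inner upper/lower lip midpoints,
                # 78/308 the inner mouth corners.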
for idx in [13, 14, 78, 308]:
lm = face_landmarks.landmark[idx]
lip_landmarks.append((int(lm.x * width), int(lm.y * height)))
faces.append(FaceDetection(
x=x,
y=y,
width=w,
height=h,
confidence=1.0,
center_x=center_x,
center_y=center_y,
landmarks=lip_landmarks
))
return faces
def close(self):
"""Release MediaPipe resources."""
self.face_detection.close()
self.face_mesh.close()
class AudioActivityDetector:
"""Detects speech activity in audio."""
def __init__(self, sample_rate: int = 44100, frame_duration_ms: int = 30):
self.sample_rate = sample_rate
self.frame_duration_ms = frame_duration_ms
self.frame_size = int(sample_rate * frame_duration_ms / 1000)
logger.info(f"Audio activity detector initialized (sr={sample_rate}, frame={frame_duration_ms}ms)")
def detect_speaking_periods(
self,
audio_samples: np.ndarray,
threshold: float = 0.01, # Reduced from 0.02 for better speech detection
min_speech_duration: float = 0.05 # Reduced from 0.1 to catch shorter utterances
) -> List[Tuple[float, float]]:
"""
Detect periods of speech in audio.
Args:
audio_samples: Audio samples array
            threshold: RMS energy threshold for speech detection (defaults assume float samples roughly in [-1.0, 1.0])
min_speech_duration: Minimum duration of speech in seconds
Returns:
List of (start_time, end_time) tuples in seconds
"""
if audio_samples.ndim > 1:
audio_samples = audio_samples.mean(axis=1)
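        # Slice the signal into fixed-size windows and compute RMS energy for each.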
energies = []
for i in range(0, len(audio_samples), self.frame_size):
frame = audio_samples[i:i + self.frame_size]
if len(frame) > 0:
energy = np.sqrt(np.mean(frame ** 2))
energies.append(energy)
speaking_frames = [e > threshold for e in energies]
periods = []
start_frame = None
for i, is_speaking in enumerate(speaking_frames):
if is_speaking and start_frame is None:
start_frame = i
elif not is_speaking and start_frame is not None:
start_time = start_frame * self.frame_duration_ms / 1000
end_time = i * self.frame_duration_ms / 1000
if end_time - start_time >= min_speech_duration:
periods.append((start_time, end_time))
start_frame = None
if start_frame is not None:
start_time = start_frame * self.frame_duration_ms / 1000
end_time = len(speaking_frames) * self.frame_duration_ms / 1000
if end_time - start_time >= min_speech_duration:
periods.append((start_time, end_time))
# Log detected speech periods for debugging
if periods:
total_speech_time = sum(end - start for start, end in periods)
logger.info(f"Audio speech detection: {len(periods)} periods found, "
f"total {total_speech_time:.1f}s of speech (threshold={threshold})")
else:
max_energy = max(energies) if energies else 0
logger.warning(f"No speech detected! Max energy={max_energy:.4f}, threshold={threshold} "
f"(try lowering threshold if speech should be present)")
return periods
def is_speaking_at_time(self, speaking_periods: List[Tuple[float, float]], time: float) -> bool:
"""Check if there is speech activity at a given time."""
for start, end in speaking_periods:
if start <= time <= end:
return True
return False
class ContextAnalyzer:
"""Analyzes video context to determine focus and layout."""
def __init__(self, person_switch_cooldown: int = 30, min_face_confidence: float = 0.3):
self.detector = MediaPipeDetector()
self.audio_detector = AudioActivityDetector()
self.previous_faces: List[FaceDetection] = []
self.min_face_confidence = min_face_confidence
# Person tracking state
self.current_selected_people: List[int] = [] # Indices of people currently on screen
self.last_switch_frame: int = -999 # Frame when we last switched people
self.person_switch_cooldown = person_switch_cooldown # Minimum frames before switching
# Stability tracking to prevent flip-flopping
self.desired_people_history: List[List[int]] = [] # Track recent desired selections
self.stability_threshold = 20 # Frames needed to confirm a switch (increased for more stability)
self.last_switched_people: List[int] = [] # People we just switched FROM
self.focus_history: List[Tuple[int, int]] = []
self.focus_history_size: int = 20
self.focus_dead_zone: int = 60
# Debug logging
self.frame_log_interval = 30 # Log every N frames
logger.info(f"Context analyzer initialized (cooldown={person_switch_cooldown} frames, focus_smoothing={self.focus_history_size})")
def analyze_frame(
self,
frame: np.ndarray,
timestamp: float,
frame_number: int,
speaking_periods: Optional[List[Tuple[float, float]]] = None
) -> FrameContext:
"""
Analyze a single frame to extract context information.
Args:
frame: Video frame (BGR format from OpenCV)
timestamp: Frame timestamp in seconds
frame_number: Frame index
speaking_periods: List of (start, end) times where speech is detected
Returns:
FrameContext with detection results
"""
        # Prefer landmark-based detection (landmarks are needed for lip-movement checks);
        # fall back to plain face detection if the face mesh finds nothing.
        faces = self.detector.detect_face_landmarks(frame)
        faces = [face for face in faces if face.confidence >= self.min_face_confidence]
        if not faces:
            faces = self.detector.detect_faces(frame)
            faces = [face for face in faces if face.confidence >= self.min_face_confidence]
# Determine who is speaking
active_speakers = []
        has_audio_speech = bool(speaking_periods) and self.audio_detector.is_speaking_at_time(speaking_periods, timestamp)
for i, face in enumerate(faces):
is_speaking = False
# Prefer visual cues when multiple faces are present.
if face.landmarks and len(self.previous_faces) > i:
is_speaking = self._detect_lip_movement(face, self.previous_faces[i])
# Audio can confirm speech when there's only one face.
if has_audio_speech and len(faces) == 1:
is_speaking = True
if is_speaking:
active_speakers.append(i)
# Debug: Log speech detection
if frame_number % 30 == 0: # Every second at 30fps
logger.info(f"Speech detection - Frame {frame_number}: audio_active={has_audio_speech}, "
f"speakers={active_speakers}, total_faces={len(faces)}")
if active_speakers:
selected_people = active_speakers[:4]
if len(selected_people) == 1:
layout_mode = "single"
elif len(selected_people) == 2:
layout_mode = "dual_split"
else:
layout_mode = "grid"
else:
# Select THE person to focus on (always single person)
# Priority: 1) Who is speaking, 2) Who is most centered
selected_people = self._select_person_to_focus(
faces,
active_speakers,
frame_number,
frame.shape[1], # frame width for center calculation
frame.shape[0] # frame height for center calculation
)
layout_mode = "single"
# Calculate group bounding box for ALL detected faces (multi-person support)
group_bounds = self._calculate_group_bounding_box(faces)
# For multi-person mode, use group center as primary focus
if group_bounds and group_bounds.face_count > 1:
primary_focus = (group_bounds.center_x, group_bounds.center_y)
else:
primary_focus = self._calculate_focus_point(faces, selected_people)
# Debug logging every N frames
if frame_number % self.frame_log_interval == 0:
focus_reason = "speaker" if active_speakers else "no_speech_detected"
group_info = f", group={group_bounds.face_count} faces" if group_bounds else ""
logger.info(f"Frame {frame_number}: {len(faces)} faces, "
f"{len(active_speakers)} speakers, focus={selected_people}, reason={focus_reason}{group_info}")
self.previous_faces = faces
return FrameContext(
frame_number=frame_number,
timestamp=timestamp,
detected_faces=faces,
active_speakers=active_speakers,
primary_focus=primary_focus,
layout_mode=layout_mode,
selected_people=selected_people,
group_bounds=group_bounds
)
def _detect_lip_movement(self, current_face: FaceDetection, previous_face: FaceDetection) -> bool:
"""
Detect lip movement by comparing landmarks between frames.
Args:
current_face: Current frame face detection
previous_face: Previous frame face detection
Returns:
True if significant lip movement detected
"""
if not current_face.landmarks or not previous_face.landmarks:
return False
        def lip_distance(landmarks):
            # Landmarks arrive as [upper inner lip, lower inner lip, left corner, right corner];
            # use the distance between the inner-lip midpoint and the corner midpoint as a rough
            # proxy for how open the mouth is.
            if len(landmarks) < 4:
                return 0
            inner_lip = np.array(landmarks[0:2])
            corners = np.array(landmarks[2:4])
            return np.linalg.norm(inner_lip.mean(axis=0) - corners.mean(axis=0))
current_dist = lip_distance(current_face.landmarks)
previous_dist = lip_distance(previous_face.landmarks)
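        # Treat a frame-to-frame change of more than ~2 px in this distance as lip movement.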
threshold = 2.0
return abs(current_dist - previous_dist) > threshold
def _select_person_to_focus(
self,
faces: List[FaceDetection],
active_speakers: List[int],
frame_number: int,
frame_width: int,
frame_height: int
) -> List[int]:
"""
Select THE single person to focus on.
Priority: 1) Who is speaking, 2) Who is most centered in frame
Args:
faces: List of detected faces
active_speakers: Indices of people currently speaking
frame_number: Current frame number
frame_width: Frame width for center calculation
frame_height: Frame height for center calculation
Returns:
List with single person index [idx], or empty list if no faces
"""
if not faces:
self.current_selected_people = []
return []
if len(faces) == 1:
self.current_selected_people = [0]
return [0]
frames_since_last_switch = frame_number - self.last_switch_frame
can_switch = frames_since_last_switch >= self.person_switch_cooldown
desired_person_idx = None
if active_speakers:
if self.current_selected_people and self.current_selected_people[0] in active_speakers:
desired_person_idx = self.current_selected_people[0]
else:
if can_switch or not self.current_selected_people:
desired_person_idx = active_speakers[0]
if self.current_selected_people and desired_person_idx != self.current_selected_people[0]:
logger.info(f"Switching focus to speaker: {desired_person_idx}")
self.last_switch_frame = frame_number
else:
desired_person_idx = self.current_selected_people[0] if self.current_selected_people else active_speakers[0]
else:
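            # No one is speaking: keep following the currently locked person if they can still
            # be matched (by index, or by nearest position/size to their previous detection);
            # otherwise fall back to the most confident face.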
if self.current_selected_people and len(self.current_selected_people) > 0:
current_idx = self.current_selected_people[0]
if current_idx < len(faces):
desired_person_idx = current_idx
else:
if self.previous_faces and current_idx < len(self.previous_faces):
prev_face = self.previous_faces[current_idx]
best_match_idx = None
best_match_score = float('inf')
for idx, face in enumerate(faces):
dx = face.center_x - prev_face.center_x
dy = face.center_y - prev_face.center_y
dist = np.sqrt(dx**2 + dy**2)
size_diff = abs(face.width - prev_face.width) + abs(face.height - prev_face.height)
score = dist + size_diff * 0.5
if score < best_match_score:
best_match_score = score
best_match_idx = idx
if best_match_idx is not None and best_match_score < 1000:
desired_person_idx = best_match_idx
else:
face_confidences = [(idx, face.confidence) for idx, face in enumerate(faces)]
face_confidences.sort(key=lambda x: x[1], reverse=True)
desired_person_idx = face_confidences[0][0]
else:
face_confidences = [(idx, face.confidence) for idx, face in enumerate(faces)]
face_confidences.sort(key=lambda x: x[1], reverse=True)
desired_person_idx = face_confidences[0][0]
else:
face_confidences = [(idx, face.confidence) for idx, face in enumerate(faces)]
face_confidences.sort(key=lambda x: x[1], reverse=True)
desired_person_idx = face_confidences[0][0]
desired_people = [desired_person_idx] if desired_person_idx is not None else []
if not self.current_selected_people:
self.current_selected_people = desired_people
self.last_switch_frame = frame_number
logger.info(f"Frame {frame_number}: Locked on person {desired_people}")
else:
self.current_selected_people = desired_people
return self.current_selected_people.copy()
def _ensure_distinct_people(
self,
faces: List[FaceDetection],
people_indices: List[int]
) -> List[int]:
"""
Ensure selected people are distinct by checking minimum distance between them.
Prevents showing the same person twice due to duplicate detection.
Args:
faces: List of detected faces
people_indices: Indices of people to validate
Returns:
List of distinct people indices (max 2)
"""
if len(people_indices) <= 1:
return people_indices
distinct_people = []
for idx in people_indices:
if idx >= len(faces):
continue
current_face = faces[idx]
is_distinct = True
# Check if this person is too close to any already selected person
for selected_idx in distinct_people:
selected_face = faces[selected_idx]
# Calculate distance between face centers
dx = current_face.center_x - selected_face.center_x
dy = current_face.center_y - selected_face.center_y
distance = np.sqrt(dx**2 + dy**2)
                # Also measure overlap as the intersection area relative to the smaller face
x1_overlap = max(current_face.x, selected_face.x)
y1_overlap = max(current_face.y, selected_face.y)
x2_overlap = min(current_face.x + current_face.width, selected_face.x + selected_face.width)
y2_overlap = min(current_face.y + current_face.height, selected_face.y + selected_face.height)
overlap_area = 0
if x1_overlap < x2_overlap and y1_overlap < y2_overlap:
overlap_area = (x2_overlap - x1_overlap) * (y2_overlap - y1_overlap)
# Calculate areas
area1 = current_face.width * current_face.height
area2 = selected_face.width * selected_face.height
min_area = min(area1, area2)
# If faces are very close OR significantly overlapping, they're likely the same person
# Minimum distance: 1/4 of average face width
min_distance = (current_face.width + selected_face.width) / 8
overlap_threshold = 0.3 # 30% overlap
if distance < min_distance or (min_area > 0 and overlap_area / min_area > overlap_threshold):
is_distinct = False
logger.debug(f"Person {idx} too similar to person {selected_idx} (dist={distance:.1f}, overlap={overlap_area/min_area if min_area > 0 else 0:.2%})")
break
if is_distinct:
distinct_people.append(idx)
# Stop at 2 distinct people
if len(distinct_people) >= 2:
break
# If we couldn't find 2 distinct people, return at most 1
if len(distinct_people) < 2 and len(people_indices) >= 2:
logger.debug(f"Only {len(distinct_people)} distinct person(s) found from {len(people_indices)} detections")
return distinct_people
def _calculate_focus_point(
self,
faces: List[FaceDetection],
selected_people: List[int]
) -> Optional[Tuple[int, int]]:
"""
Calculate the primary focus point based on selected people with temporal smoothing.
Args:
faces: List of detected faces
selected_people: Indices of people selected for display
Returns:
(x, y) tuple of focus center, or None if no faces
"""
if not faces or not selected_people:
return None
# Calculate raw focus point
raw_focus_x = 0
raw_focus_y = 0
if len(selected_people) == 1:
# Single person - focus on them
if selected_people[0] < len(faces):
primary = faces[selected_people[0]]
raw_focus_x = primary.center_x
raw_focus_y = primary.center_y
else:
# Fallback
most_confident = max(faces, key=lambda f: f.confidence)
raw_focus_x = most_confident.center_x
raw_focus_y = most_confident.center_y
else:
# Multiple people - focus on the CENTER between them for stability
# This prevents jarring movements when switching focus between people
valid_people = [idx for idx in selected_people if idx < len(faces)]
if valid_people:
centers_x = [faces[idx].center_x for idx in valid_people]
centers_y = [faces[idx].center_y for idx in valid_people]
raw_focus_x = int(np.mean(centers_x))
raw_focus_y = int(np.mean(centers_y))
else:
# Fallback
most_confident = max(faces, key=lambda f: f.confidence)
raw_focus_x = most_confident.center_x
raw_focus_y = most_confident.center_y
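        # Dead-zone plus median smoothing: ignore small jitter around the last focus point,
        # and smooth larger moves over the recent history.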
if self.focus_history:
last_x, last_y = self.focus_history[-1]
dx = abs(raw_focus_x - last_x)
dy = abs(raw_focus_y - last_y)
if dx < self.focus_dead_zone and dy < self.focus_dead_zone:
return self.focus_history[-1]
self.focus_history.append((raw_focus_x, raw_focus_y))
if len(self.focus_history) > self.focus_history_size:
self.focus_history.pop(0)
if len(self.focus_history) >= 5:
xs = [x for x, y in self.focus_history]
ys = [y for x, y in self.focus_history]
median_x = int(np.median(xs))
median_y = int(np.median(ys))
return (median_x, median_y)
else:
return (raw_focus_x, raw_focus_y)
def _calculate_group_bounding_box(
self,
faces: List[FaceDetection],
padding_percent: float = 0.15,
max_faces: int = 6
) -> Optional[GroupBoundingBox]:
"""
Calculate bounding box containing all detected faces with padding.
Args:
faces: List of detected faces
padding_percent: Padding around group as percentage of bbox dimensions
max_faces: Maximum faces to include (use most confident if exceeded)
Returns:
GroupBoundingBox or None if no faces
"""
if not faces:
return None
# If too many faces, use most confident ones
if len(faces) > max_faces:
faces = sorted(faces, key=lambda f: f.confidence, reverse=True)[:max_faces]
# Calculate bounding box containing all faces
min_x = min(f.x for f in faces)
max_x = max(f.x + f.width for f in faces)
min_y = min(f.y for f in faces)
max_y = max(f.y + f.height for f in faces)
# Add padding
width = max_x - min_x
height = max_y - min_y
pad_x = int(width * padding_percent)
pad_y = int(height * padding_percent)
final_x = max(0, min_x - pad_x)
final_y = max(0, min_y - pad_y)
final_width = width + 2 * pad_x
final_height = height + 2 * pad_y
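        # Note: the padded box is clamped at the top/left edges only; it may extend past the
        # right/bottom of the frame, so callers may need to clip it.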
return GroupBoundingBox(
x=final_x,
y=final_y,
width=final_width,
height=final_height,
center_x=final_x + final_width // 2,
center_y=final_y + final_height // 2,
face_count=len(faces)
)
def close(self):
"""Release resources."""
self.detector.close()
# Clear tracking state to free memory
self.previous_faces.clear()
self.current_selected_people.clear()
self.focus_history.clear()