From 3f7329869d4675f6e1753f693b960d398ebc7bf3 Mon Sep 17 00:00:00 2001 From: LeoMortari Date: Sat, 3 Jan 2026 19:42:23 -0300 Subject: [PATCH] =?UTF-8?q?Ajusta=20contexto,=20falas=20e=20foco,=20tremul?= =?UTF-8?q?a=C3=A7=C3=A3o=20do=20video=20e=20demais=20bugs?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- docker-compose.yml | 2 +- prompts/generate.txt | 181 +++--- video_render/config.py | 15 +- video_render/context_detection.py | 301 +++++----- video_render/llm.py | 4 +- video_render/rendering.py | 7 +- video_render/smart_framing.py | 877 +++++++++++++++++++++++------- 7 files changed, 932 insertions(+), 455 deletions(-) diff --git a/docker-compose.yml b/docker-compose.yml index 200f4a0..4dba674 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -9,7 +9,7 @@ services: - RABBITMQ_PASS=${RABBITMQ_PASS} - OPENROUTER_API_URL=${OPENROUTER_API_URL:-https://openrouter.ai/api/v1/chat/completions} - OPENROUTER_API_KEY=${OPENROUTER_API_KEY} - - OPENROUTER_MODEL=${OPENROUTER_MODEL:-openai/gpt-oss-20b:free} + - OPENROUTER_MODEL=${OPENROUTER_MODEL:-mistralai/mistral-small-3.1-24b-instruct:free} - OPENROUTER_PROMPT_PATH=${OPENROUTER_PROMPT_PATH:-prompts/generate.txt} - FASTER_WHISPER_MODEL_SIZE=${FASTER_WHISPER_MODEL_SIZE:-medium} - SMART_FRAMING_SMOOTHING_WINDOW=${SMART_FRAMING_SMOOTHING_WINDOW:-30} diff --git a/prompts/generate.txt b/prompts/generate.txt index bd90862..ab7556f 100644 --- a/prompts/generate.txt +++ b/prompts/generate.txt @@ -1,118 +1,111 @@ -Você é especialista em viralidade de redes sociais (TikTok, Instagram Reels, YouTube Shorts). Sua missão: EXTRAIR O MÁXIMO de clips virais possíveis, priorizando QUANTIDADE + QUALIDADE. +# TAREFA: Extrair clips virais de uma transcrição de vídeo -🎯 OBJETIVO: Transformar cada vídeo em MÚLTIPLOS clips que podem viralizar +Você é um especialista em conteúdo viral para TikTok, Instagram Reels e YouTube Shorts. -PROCESSO DE ANÁLISE: -1. Mapear TODOS os potenciais trechos virais na transcrição -2. Avaliar cada trecho usando sistema de pontuação abaixo -3. Rankear do maior para menor score viral -4. Selecionar TODOS os trechos com score ≥ 60 (não seja conservador!) +## REGRA MAIS IMPORTANTE - DURAÇÃO DOS CLIPS -SISTEMA DE PONTUAÇÃO VIRAL (0-100 pontos): +**CADA CLIP DEVE TER ENTRE 60 E 120 SEGUNDOS DE DURAÇÃO.** -🪝 GANCHO INICIAL (0-30 pontos) - CRÍTICO PARA VIRALIZAÇÃO: -[30] Frase CHOCANTE, pergunta POLÊMICA ou promessa OUSADA nos primeiros 3 segundos -[25] Hook forte: "Você não vai acreditar...", "O segredo que ninguém conta...", "Isso mudou tudo..." 
-[20] Pergunta intrigante ou afirmação controversa -[15] História interessante mas gancho fraco -[10] Início genérico mas aceitável -[0] "Oi", "então", "bem", silêncio - DESCARTAR +- MÍNIMO ABSOLUTO: 60 segundos (end - start >= 60) +- MÁXIMO: 120 segundos (end - start <= 120) +- IDEAL: 60-90 segundos -🔥 GATILHO EMOCIONAL (0-25 pontos): -[25] Emoção EXTREMA: raiva, choque, riso intenso, WTF moment, revelação bombástica -[20] Emoção forte: surpresa, indignação, humor, curiosidade intensa -[15] Emoção moderada: interesse, leve humor, insight interessante -[10] Emoção fraca: informativo sem impacto -[0] Monótono, técnico, sem apelo emocional - EVITAR +**CLIPS COM MENOS DE 60 SEGUNDOS SERÃO REJEITADOS PELO SISTEMA.** -💎 VALOR/UTILIDADE (0-20 pontos): -[20] Segredo VALIOSO, insight transformador, informação EXCLUSIVA -[15] Ensina algo prático e IMEDIATAMENTE aplicável -[10] Opinião interessante ou perspectiva única -[5] Informação genérica ou conhecimento comum -[0] Nenhum valor prático, puro "enrolation" - DESCARTAR +Antes de incluir um clip, SEMPRE calcule: end - start >= 60 -📖 ESTRUTURA NARRATIVA (0-15 pontos): -[15] História COMPLETA com início, conflito/clímax e resolução satisfatória -[10] Segmento com começo e fim coerentes, faz sentido isolado -[5] Trecho com sentido mas cortado abruptamente -[0] Fragmento sem contexto - NÃO USAR +## QUANTIDADE DE CLIPS -⚡ RITMO E ENERGIA (0-10 pontos): -[10] DINÂMICO, sem pausas longas, alta energia, palavras impactantes -[7] Bom ritmo com pausas naturais curtas (< 2s) -[3] Ritmo lento mas aceitável -[0] Muitas pausas (> 3s), hesitações, monotonia - EVITAR +Baseado na duração total do vídeo: +- Até 10 min: 2-4 clips +- 10-20 min: 4-6 clips +- 20-30 min: 6-10 clips +- 30+ min: 8-15 clips -REGRAS DE QUANTIDADE (SER AGRESSIVO): -📊 Quantidade MÍNIMA por duração: -- 5-10 min: MÍNIMO 4-6 clips -- 10-15 min: MÍNIMO 6-8 clips -- 15-20 min: MÍNIMO 8-10 clips -- 20-30 min: MÍNIMO 10-15 clips -- 30+ min: MÍNIMO 15-20 clips +## CRITÉRIOS DE SELEÇÃO -🎯 REGRA DE OURO: 1 clip a cada 2-3 minutos de vídeo (NO MÍNIMO) -- Se encontrar momentos virais, SEMPRE selecione! -- Melhor ter 3 clips perfeitos que 10 clips bons +Um bom clip viral possui: -CRITÉRIOS DE SELEÇÃO: -- Score viral ≥ 60 pontos (idealmente ≥ 70) -- Duração ideal: 60-120s (formato ideal para Reels/Shorts) -- Duração mínima: 60s | Duração máxima: 120s -- Sem sobreposição temporal -- DEVE ter gancho forte nos primeiros 3 segundos -- Início e fim coerentes +1. GANCHO FORTE nos primeiros 3 segundos (pergunta, afirmação chocante, promessa) +2. EMOÇÃO (humor, surpresa, indignação, curiosidade) +3. VALOR (ensina algo, revela segredo, dá dica prática) +4. ESTRUTURA (início, meio e fim coerentes) +5. RITMO (sem pausas longas, dinâmico) -GANCHOS QUE FAZEM VIRALIZAR (use como filtro): -- "O que ninguém te conta sobre..." -- "O erro que 90% das pessoas cometem..." -- "Você não vai acreditar o que aconteceu..." 
-- Revelações chocantes ou contraintuitivas -- Antes vs Depois, transformações -- Segredos, bastidores, verdades ocultas -- Polêmicas, opiniões fortes, hot takes -- Histórias dramáticas com reviravolta -- Dicas práticas e acionáveis -- Momentos de humor genuíno +## O QUE EVITAR -❌ EVITE (mas não descarte se score alto): -- Introduções genéricos SEM gancho -- Trechos com pausas > 3s consecutivas -- Explicações técnicas SEM gancho emocional -- Segmentos sem conclusão clara -- Momentos de transição vazios +- Introduções genéricas ("oi pessoal", "então", "bem") +- Trechos com pausas longas (> 3 segundos de silêncio) +- Segmentos sem contexto ou conclusão +- Explicações técnicas monótonas -FORMATO JSON (retorne APENAS isto, SEM texto adicional): +## FORMATO DE RESPOSTA + +Retorne APENAS um JSON válido, sem texto antes ou depois: + +```json { "highlights": [ { - "start": , - "end": , - "summary": "Score: XX/100 | Gancho: [descreva] | Gatilho: [descreva]", + "start": 0.0, + "end": 75.0, + "summary": "Descrição do que acontece neste trecho" + }, + { + "start": 120.5, + "end": 195.0, + "summary": "Descrição do que acontece neste trecho" } ] } +``` -REGRAS TÉCNICAS: -- Float com ponto decimal (45.5 NÃO 45,5) -- Timestamps exatos dos segments fornecidos -- Ordem cronológica (start crescente) -- Summary conciso mas informativo (2-3 frases) +## REGRAS DO JSON -TAREFA PASSO A PASSO: -1. Leia transcrição completa -2. Identifique TODOS os momentos potencialmente virais -3. Avalie e pontue cada trecho (seja generoso!) -4. Rankear por score viral -5. Selecione TODOS com score ≥ 60 -6. Garanta mínimo de 1 clip a cada 5 minutos -7. Retorne JSON completo +- "start" e "end" são números decimais (float) em SEGUNDOS +- Use ponto como separador decimal (60.5, não 60,5) +- "summary" é uma descrição breve do conteúdo (1-2 frases) +- Clips em ordem cronológica (start crescente) +- Clips não podem se sobrepor -⚠️ IMPORTANTE: -- NÃO seja conservador! Se encontrou 10 momentos bons, retorne os 10! -- Pense em MAXIMIZAR alcance: mais clips = mais chances de viralizar -- Se vídeo tem conteúdo fraco, seja criterioso, mas SEMPRE retorne pelo menos 3-5 clips -- Priorize clips com GANCHOS FORTES - gancho fraco = baixo alcance +## CHECKLIST ANTES DE RESPONDER -🎯 MINDSET: Você é um criador de conteúdo viral. Seu objetivo é extrair MÁXIMO valor do vídeo original. +Para CADA clip, verifique: +- [ ] end - start >= 60 segundos? +- [ ] end - start <= 120 segundos? +- [ ] Tem gancho forte no início? +- [ ] Faz sentido isolado do resto do vídeo? +- [ ] JSON está válido? + +## EXEMPLO + +Se o vídeo tem 15 minutos e você encontrou 4 momentos virais: + +```json +{ + "highlights": [ + { + "start": 60.0, + "end": 120.0, + "summary": "Revelação sobre como economizar 50% nas compras" + }, + { + "start": 180.0, + "end": 255.0, + "summary": "História engraçada sobre cliente que tentou enganar a loja" + }, + { + "start": 400.0, + "end": 480.0, + "summary": "Dica prática de negociação com fornecedores" + }, + { + "start": 600.0, + "end": 690.0, + "summary": "Conclusão motivacional sobre empreendedorismo" + } + ] +} +``` + +Agora analise a transcrição fornecida e extraia os clips virais seguindo estas instruções. 
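The duration and formatting contract defined in the new prompt is enforced again at parse time. As a rough illustration of that contract (not code from this repository — `validate_highlights` is a hypothetical helper; the real checks live in `video_render/llm.py`, updated below to the same 60–120s bounds), the returned JSON can be filtered like this:

```python
import json

MIN_DURATION = 60.0   # prompt's hard minimum (also enforced in video_render/llm.py)
MAX_DURATION = 120.0  # prompt's hard maximum

def validate_highlights(raw_response: str) -> list[dict]:
    """Keep only clips that respect the 60-120s window, ordering and non-overlap rules."""
    highlights = json.loads(raw_response).get("highlights", [])
    valid, last_end = [], float("-inf")
    for clip in highlights:
        start, end = float(clip["start"]), float(clip["end"])
        duration = end - start
        if not (MIN_DURATION <= duration <= MAX_DURATION):
            continue  # outside the allowed clip length
        if start < last_end:
            continue  # overlaps or breaks chronological order
        valid.append({"start": start, "end": end, "summary": clip.get("summary", "")})
        last_end = end
    return valid
```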
diff --git a/video_render/config.py b/video_render/config.py index 0ca0c1e..6173f67 100644 --- a/video_render/config.py +++ b/video_render/config.py @@ -62,13 +62,16 @@ class RenderingSettings: subtitle_font_size: int = int(os.environ.get("RENDER_SUBTITLE_FONT_SIZE", 64)) caption_min_words: int = int(os.environ.get("CAPTION_MIN_WORDS", 2)) caption_max_words: int = int(os.environ.get("CAPTION_MAX_WORDS", 2)) - # Smart framing settings - CONTAINMENT TRACKING mode enable_smart_framing: bool = os.environ.get("ENABLE_SMART_FRAMING", "true").lower() in ("true", "1", "yes") - smart_framing_min_confidence: float = float(os.environ.get("SMART_FRAMING_MIN_CONFIDENCE", 0.3)) # Lowered for better cartoon detection - smart_framing_smoothing_window: int = int(os.environ.get("SMART_FRAMING_SMOOTHING_WINDOW", 30)) # Reduced - not needed with containment tracking - smart_framing_frame_skip: int = int(os.environ.get("SMART_FRAMING_FRAME_SKIP", 1)) # Process every frame for smooth 30 FPS tracking - smart_framing_max_velocity: int = int(os.environ.get("SMART_FRAMING_MAX_VELOCITY", 20)) # Moderate - only used during transitions - smart_framing_person_switch_cooldown: int = int(os.environ.get("SMART_FRAMING_PERSON_SWITCH_COOLDOWN", 999999)) # DISABLED - never switch people + smart_framing_min_confidence: float = float(os.environ.get("SMART_FRAMING_MIN_CONFIDENCE", 0.3)) + smart_framing_smoothing_window: int = int(os.environ.get("SMART_FRAMING_SMOOTHING_WINDOW", 30)) + smart_framing_frame_skip: int = int(os.environ.get("SMART_FRAMING_FRAME_SKIP", 1)) + smart_framing_max_velocity: int = int(os.environ.get("SMART_FRAMING_MAX_VELOCITY", 25)) + smart_framing_person_switch_cooldown: int = int(os.environ.get("SMART_FRAMING_PERSON_SWITCH_COOLDOWN", 30)) + smart_framing_response_time: float = float(os.environ.get("SMART_FRAMING_RESPONSE_TIME", 0.6)) + smart_framing_group_padding: float = float(os.environ.get("SMART_FRAMING_GROUP_PADDING", 0.15)) + smart_framing_max_zoom_out: float = float(os.environ.get("SMART_FRAMING_MAX_ZOOM_OUT", 2.0)) + smart_framing_dead_zone: int = int(os.environ.get("SMART_FRAMING_DEAD_ZONE", 60)) @dataclass(frozen=True) diff --git a/video_render/context_detection.py b/video_render/context_detection.py index ab5c203..3200a88 100644 --- a/video_render/context_detection.py +++ b/video_render/context_detection.py @@ -41,6 +41,18 @@ class PersonTracking: frame_number: int +@dataclass +class GroupBoundingBox: + """Bounding box containing all tracked faces.""" + x: int + y: int + width: int + height: int + center_x: int + center_y: int + face_count: int + + @dataclass class FrameContext: """Context information for a video frame.""" @@ -50,7 +62,8 @@ class FrameContext: active_speakers: List[int] # indices of speaking faces primary_focus: Optional[Tuple[int, int]] # (x, y) center point layout_mode: str # "single", "dual_split", "grid" - selected_people: List[int] = field(default_factory=list) # indices of people selected for display (max 2) + selected_people: List[int] = field(default_factory=list) # indices of people selected for display + group_bounds: Optional[GroupBoundingBox] = None # bounding box for all detected faces class MediaPipeDetector: @@ -385,10 +398,11 @@ class AudioActivityDetector: class ContextAnalyzer: """Analyzes video context to determine focus and layout.""" - def __init__(self, person_switch_cooldown: int = 30): + def __init__(self, person_switch_cooldown: int = 30, min_face_confidence: float = 0.3): self.detector = MediaPipeDetector() self.audio_detector = 
AudioActivityDetector() self.previous_faces: List[FaceDetection] = [] + self.min_face_confidence = min_face_confidence # Person tracking state self.current_selected_people: List[int] = [] # Indices of people currently on screen @@ -400,9 +414,9 @@ class ContextAnalyzer: self.stability_threshold = 20 # Frames needed to confirm a switch (increased for more stability) self.last_switched_people: List[int] = [] # People we just switched FROM - # Focus stability: track recent focus points for temporal smoothing self.focus_history: List[Tuple[int, int]] = [] - self.focus_history_size: int = 5 # Keep last 5 focus points for smoothing + self.focus_history_size: int = 20 + self.focus_dead_zone: int = 60 # Debug logging self.frame_log_interval = 30 # Log every N frames @@ -429,9 +443,11 @@ class ContextAnalyzer: FrameContext with detection results """ faces = self.detector.detect_face_landmarks(frame) + faces = [face for face in faces if face.confidence >= self.min_face_confidence] if faces else [] if not faces: faces = self.detector.detect_faces(frame) + faces = [face for face in faces if face.confidence >= self.min_face_confidence] if faces else [] # Determine who is speaking active_speakers = [] @@ -440,13 +456,13 @@ class ContextAnalyzer: for i, face in enumerate(faces): is_speaking = False - # Check audio-based speech detection - if has_audio_speech: - is_speaking = True - - # Check lip movement (visual speech detection) + # Prefer visual cues when multiple faces are present. if face.landmarks and len(self.previous_faces) > i: - is_speaking = is_speaking or self._detect_lip_movement(face, self.previous_faces[i]) + is_speaking = self._detect_lip_movement(face, self.previous_faces[i]) + + # Audio can confirm speech when there's only one face. + if has_audio_speech and len(faces) == 1: + is_speaking = True if is_speaking: active_speakers.append(i) @@ -456,26 +472,41 @@ class ContextAnalyzer: logger.info(f"Speech detection - Frame {frame_number}: audio_active={has_audio_speech}, " f"speakers={active_speakers}, total_faces={len(faces)}") - # Select THE person to focus on (always single person) - # Priority: 1) Who is speaking, 2) Who is most centered - selected_people = self._select_person_to_focus( - faces, - active_speakers, - frame_number, - frame.shape[1], # frame width for center calculation - frame.shape[0] # frame height for center calculation - ) + if active_speakers: + selected_people = active_speakers[:4] + if len(selected_people) == 1: + layout_mode = "single" + elif len(selected_people) == 2: + layout_mode = "dual_split" + else: + layout_mode = "grid" + else: + # Select THE person to focus on (always single person) + # Priority: 1) Who is speaking, 2) Who is most centered + selected_people = self._select_person_to_focus( + faces, + active_speakers, + frame_number, + frame.shape[1], # frame width for center calculation + frame.shape[0] # frame height for center calculation + ) + layout_mode = "single" - # Always use single-person layout (no split screen) - layout_mode = "single" + # Calculate group bounding box for ALL detected faces (multi-person support) + group_bounds = self._calculate_group_bounding_box(faces) - primary_focus = self._calculate_focus_point(faces, selected_people) + # For multi-person mode, use group center as primary focus + if group_bounds and group_bounds.face_count > 1: + primary_focus = (group_bounds.center_x, group_bounds.center_y) + else: + primary_focus = self._calculate_focus_point(faces, selected_people) # Debug logging every N frames if frame_number % 
self.frame_log_interval == 0: focus_reason = "speaker" if active_speakers else "no_speech_detected" + group_info = f", group={group_bounds.face_count} faces" if group_bounds else "" logger.info(f"Frame {frame_number}: {len(faces)} faces, " - f"{len(active_speakers)} speakers, focus={selected_people}, reason={focus_reason}") + f"{len(active_speakers)} speakers, focus={selected_people}, reason={focus_reason}{group_info}") self.previous_faces = faces @@ -486,7 +517,8 @@ class ContextAnalyzer: active_speakers=active_speakers, primary_focus=primary_focus, layout_mode=layout_mode, - selected_people=selected_people + selected_people=selected_people, + group_bounds=group_bounds ) def _detect_lip_movement(self, current_face: FaceDetection, previous_face: FaceDetection) -> bool: @@ -543,134 +575,68 @@ class ContextAnalyzer: self.current_selected_people = [] return [] - # If only 1 person, always focus on them if len(faces) == 1: self.current_selected_people = [0] return [0] - # Check if we can switch people (cooldown period) frames_since_last_switch = frame_number - self.last_switch_frame can_switch = frames_since_last_switch >= self.person_switch_cooldown - # Calculate frame center for distance comparison - frame_center_x = frame_width / 2 - frame_center_y = frame_height / 2 - - # ULTRA-STABLE MODE: Select ONE person at start, NEVER switch - # This completely eliminates switching-related instability desired_person_idx = None - # If we already have someone selected, ALWAYS KEEP THEM (never switch) - if self.current_selected_people and len(self.current_selected_people) > 0: - current_idx = self.current_selected_people[0] - if current_idx < len(faces): - # Current person still detected - keep them - desired_person_idx = current_idx + if active_speakers: + if self.current_selected_people and self.current_selected_people[0] in active_speakers: + desired_person_idx = self.current_selected_people[0] else: - # Current person lost - try to find them again by position/size similarity - # This handles temporary detection failures - current_person_found = False - if self.previous_faces and current_idx < len(self.previous_faces): - prev_face = self.previous_faces[current_idx] - # Find most similar face by position and size - best_match_idx = None - best_match_score = float('inf') - for idx, face in enumerate(faces): - # Distance between centers - dx = face.center_x - prev_face.center_x - dy = face.center_y - prev_face.center_y - dist = np.sqrt(dx**2 + dy**2) - # Size similarity - size_diff = abs(face.width - prev_face.width) + abs(face.height - prev_face.height) - score = dist + size_diff * 0.5 - if score < best_match_score: - best_match_score = score - best_match_idx = idx - - if best_match_idx is not None and best_match_score < 1000: - desired_person_idx = best_match_idx - current_person_found = True - - if not current_person_found: - # Really lost - select most confident - face_confidences = [(idx, face.confidence) for idx, face in enumerate(faces)] - face_confidences.sort(key=lambda x: x[1], reverse=True) - desired_person_idx = face_confidences[0][0] - logger.warning(f"Current person permanently lost - selecting new: {desired_person_idx}") + if can_switch or not self.current_selected_people: + desired_person_idx = active_speakers[0] + if self.current_selected_people and desired_person_idx != self.current_selected_people[0]: + logger.info(f"Switching focus to speaker: {desired_person_idx}") + self.last_switch_frame = frame_number + else: + desired_person_idx = self.current_selected_people[0] if 
self.current_selected_people else active_speakers[0] else: - # First frame - select most confident person ONCE - face_confidences = [(idx, face.confidence) for idx, face in enumerate(faces)] - face_confidences.sort(key=lambda x: x[1], reverse=True) - desired_person_idx = face_confidences[0][0] - logger.info(f"INITIAL SELECTION - Person {desired_person_idx} (will be tracked throughout entire video)") - - # IGNORE SPEECH DETECTION - it was causing instability - # We now track ONE person from start to finish, regardless of who speaks - - # OLD LOGIC (commented out - was causing issues): - # This logic would switch based on "who is more centered" which caused constant switching - if False: # Disabled - # Calculate distance from center for each face - center_distances = [] - for idx, face in enumerate(faces): - # Euclidean distance from frame center - dx = face.center_x - frame_center_x - dy = face.center_y - frame_center_y - distance = np.sqrt(dx**2 + dy**2) - center_distances.append((idx, distance, face.confidence)) - - # Sort by distance (closest first), then by confidence as tiebreaker - center_distances.sort(key=lambda x: (x[1], -x[2])) - most_centered_idx = center_distances[0][0] - most_centered_distance = center_distances[0][1] - - # STICKY BEHAVIOR: If we already have someone selected, only switch if: - # - New person is SIGNIFICANTLY more centered (30% closer to center) - # - OR current person is now very far from center (>40% of frame width) if self.current_selected_people and len(self.current_selected_people) > 0: current_idx = self.current_selected_people[0] if current_idx < len(faces): - current_face = faces[current_idx] - current_dx = current_face.center_x - frame_center_x - current_dy = current_face.center_y - frame_center_y - current_distance = np.sqrt(current_dx**2 + current_dy**2) - - # Define "significantly better" threshold - max_acceptable_distance = frame_width * 0.4 # 40% of frame width - improvement_threshold = 0.7 # New person must be 30% closer (0.7 ratio) - - # Keep current person if they're still reasonably centered - if current_distance < max_acceptable_distance: - # Current person is still acceptable - only switch if new is MUCH better - if most_centered_distance < current_distance * improvement_threshold: - desired_person_idx = most_centered_idx - logger.debug(f"Switching: new person MUCH more centered ({most_centered_distance:.0f} vs {current_distance:.0f})") - else: - desired_person_idx = current_idx # Keep current - logger.debug(f"Keeping current person: still reasonably centered ({current_distance:.0f} px from center)") - else: - # Current person is too far from center - switch - desired_person_idx = most_centered_idx - logger.debug(f"Current person too far from center ({current_distance:.0f} px), switching") + desired_person_idx = current_idx else: - # Current selection invalid - desired_person_idx = most_centered_idx - else: - # First time - select most centered - desired_person_idx = most_centered_idx + if self.previous_faces and current_idx < len(self.previous_faces): + prev_face = self.previous_faces[current_idx] + best_match_idx = None + best_match_score = float('inf') + for idx, face in enumerate(faces): + dx = face.center_x - prev_face.center_x + dy = face.center_y - prev_face.center_y + dist = np.sqrt(dx**2 + dy**2) + size_diff = abs(face.width - prev_face.width) + abs(face.height - prev_face.height) + score = dist + size_diff * 0.5 + if score < best_match_score: + best_match_score = score + best_match_idx = idx + + if best_match_idx is not None and 
best_match_score < 1000: + desired_person_idx = best_match_idx + else: + face_confidences = [(idx, face.confidence) for idx, face in enumerate(faces)] + face_confidences.sort(key=lambda x: x[1], reverse=True) + desired_person_idx = face_confidences[0][0] + else: + face_confidences = [(idx, face.confidence) for idx, face in enumerate(faces)] + face_confidences.sort(key=lambda x: x[1], reverse=True) + desired_person_idx = face_confidences[0][0] + else: + face_confidences = [(idx, face.confidence) for idx, face in enumerate(faces)] + face_confidences.sort(key=lambda x: x[1], reverse=True) + desired_person_idx = face_confidences[0][0] - # Wrap in list for compatibility with existing code desired_people = [desired_person_idx] if desired_person_idx is not None else [] - # ULTRA-STABLE MODE: NO SWITCHING LOGIC AT ALL - # Simply set the person and never change if not self.current_selected_people: - # First time only self.current_selected_people = desired_people self.last_switch_frame = frame_number - logger.info(f"Frame {frame_number}: LOCKED ON person {desired_people} - will never switch") + logger.info(f"Frame {frame_number}: Locked on person {desired_people}") else: - # Already have someone - just update to desired (which is same person due to logic above) self.current_selected_people = desired_people return self.current_selected_people.copy() @@ -798,24 +764,77 @@ class ContextAnalyzer: raw_focus_x = most_confident.center_x raw_focus_y = most_confident.center_y - # Apply temporal smoothing using focus history + if self.focus_history: + last_x, last_y = self.focus_history[-1] + dx = abs(raw_focus_x - last_x) + dy = abs(raw_focus_y - last_y) + if dx < self.focus_dead_zone and dy < self.focus_dead_zone: + return self.focus_history[-1] + self.focus_history.append((raw_focus_x, raw_focus_y)) if len(self.focus_history) > self.focus_history_size: self.focus_history.pop(0) - # Calculate smoothed focus as weighted average (more weight to recent frames) - if len(self.focus_history) > 1: - # Exponential weights: recent frames have more influence - weights = [2 ** i for i in range(len(self.focus_history))] - total_weight = sum(weights) - - smoothed_x = sum(x * w for (x, y), w in zip(self.focus_history, weights)) / total_weight - smoothed_y = sum(y * w for (x, y), w in zip(self.focus_history, weights)) / total_weight - - return (int(smoothed_x), int(smoothed_y)) + if len(self.focus_history) >= 5: + xs = [x for x, y in self.focus_history] + ys = [y for x, y in self.focus_history] + median_x = int(np.median(xs)) + median_y = int(np.median(ys)) + return (median_x, median_y) else: return (raw_focus_x, raw_focus_y) + def _calculate_group_bounding_box( + self, + faces: List[FaceDetection], + padding_percent: float = 0.15, + max_faces: int = 6 + ) -> Optional[GroupBoundingBox]: + """ + Calculate bounding box containing all detected faces with padding. 
+ + Args: + faces: List of detected faces + padding_percent: Padding around group as percentage of bbox dimensions + max_faces: Maximum faces to include (use most confident if exceeded) + + Returns: + GroupBoundingBox or None if no faces + """ + if not faces: + return None + + # If too many faces, use most confident ones + if len(faces) > max_faces: + faces = sorted(faces, key=lambda f: f.confidence, reverse=True)[:max_faces] + + # Calculate bounding box containing all faces + min_x = min(f.x for f in faces) + max_x = max(f.x + f.width for f in faces) + min_y = min(f.y for f in faces) + max_y = max(f.y + f.height for f in faces) + + # Add padding + width = max_x - min_x + height = max_y - min_y + pad_x = int(width * padding_percent) + pad_y = int(height * padding_percent) + + final_x = max(0, min_x - pad_x) + final_y = max(0, min_y - pad_y) + final_width = width + 2 * pad_x + final_height = height + 2 * pad_y + + return GroupBoundingBox( + x=final_x, + y=final_y, + width=final_width, + height=final_height, + center_x=final_x + final_width // 2, + center_y=final_y + final_height // 2, + face_count=len(faces) + ) + def close(self): """Release resources.""" self.detector.close() diff --git a/video_render/llm.py b/video_render/llm.py index 76be59d..f872089 100644 --- a/video_render/llm.py +++ b/video_render/llm.py @@ -137,11 +137,11 @@ class OpenRouterCopywriter: continue duration = end - start - if duration < 45: - logger.warning(f"Highlight ignorado: muito curto ({duration}s, minimo 45s)") + if duration < 60: + logger.warning(f"Highlight ignorado: muito curto ({duration}s, minimo 60s)") continue - if duration > 90: - logger.warning(f"Highlight ignorado: muito longo ({duration}s, maximo 90s)") + if duration > 120: + logger.warning(f"Highlight ignorado: muito longo ({duration}s, maximo 120s)") continue diff --git a/video_render/rendering.py b/video_render/rendering.py index b2ce7f5..2fadb9c 100644 --- a/video_render/rendering.py +++ b/video_render/rendering.py @@ -347,7 +347,12 @@ class VideoRenderer: frame_skip=settings.rendering.smart_framing_frame_skip, smoothing_window=settings.rendering.smart_framing_smoothing_window, max_velocity=settings.rendering.smart_framing_max_velocity, - person_switch_cooldown=settings.rendering.smart_framing_person_switch_cooldown + person_switch_cooldown=settings.rendering.smart_framing_person_switch_cooldown, + response_time=settings.rendering.smart_framing_response_time, + group_padding=settings.rendering.smart_framing_group_padding, + max_zoom_out=settings.rendering.smart_framing_max_zoom_out, + dead_zone=settings.rendering.smart_framing_dead_zone, + min_face_confidence=settings.rendering.smart_framing_min_confidence ) def render( diff --git a/video_render/smart_framing.py b/video_render/smart_framing.py index 8b5f52a..e0bb4c9 100644 --- a/video_render/smart_framing.py +++ b/video_render/smart_framing.py @@ -16,7 +16,7 @@ from moviepy.video.VideoClip import VideoClip from moviepy.video.io.VideoFileClip import VideoFileClip from scipy import signal -from video_render.context_detection import ContextAnalyzer, FrameContext, FaceDetection +from video_render.context_detection import ContextAnalyzer, FrameContext, FaceDetection, GroupBoundingBox logger = logging.getLogger(__name__) @@ -40,7 +40,7 @@ class FramingPlan: class SmartFramer: - """Creates intelligent 9:16 framing for horizontal videos.""" + """Creates intelligent 9:16 framing for horizontal videos with multi-person support.""" def __init__( self, @@ -48,8 +48,13 @@ class SmartFramer: target_height: int = 1920, frame_skip: int = 1, smoothing_window: int = 30, - max_velocity: int = 20, - person_switch_cooldown: int = 999999 + max_velocity: int = 25, + 
person_switch_cooldown: int = 30, + response_time: float = 0.6, + group_padding: float = 0.15, + max_zoom_out: float = 2.0, + dead_zone: int = 100, + min_face_confidence: float = 0.3 ): self.target_width = target_width self.target_height = target_height @@ -58,8 +63,15 @@ class SmartFramer: self.smoothing_window = smoothing_window self.max_velocity = max_velocity self.person_switch_cooldown = person_switch_cooldown + self.response_time = response_time + self.group_padding = group_padding + self.max_zoom_out = max_zoom_out + self.dead_zone = dead_zone + self.min_face_confidence = min_face_confidence + self.position_history_size = 45 + self.hysteresis_frames = 8 - logger.info(f"Smart framer initialized (target: {target_width}x{target_height}, frame_skip={frame_skip}, smoothing={smoothing_window}, velocity={max_velocity}, cooldown={person_switch_cooldown})") + logger.info(f"Smart framer initialized (target: {target_width}x{target_height}, response_time={response_time}s, max_velocity={max_velocity}, dead_zone={dead_zone})") def create_framing_plan( self, @@ -80,7 +92,10 @@ class SmartFramer: Returns: FramingPlan with all frame contexts and crop regions """ - analyzer = ContextAnalyzer(person_switch_cooldown=self.person_switch_cooldown) + analyzer = ContextAnalyzer( + person_switch_cooldown=self.person_switch_cooldown, + min_face_confidence=self.min_face_confidence + ) speaking_periods = None if audio_samples is not None: @@ -131,7 +146,8 @@ class SmartFramer: crop_regions = self._calculate_crop_regions( frame_contexts, source_width, - source_height + source_height, + fps=fps ) framing_plan = FramingPlan( @@ -146,14 +162,241 @@ class SmartFramer: return framing_plan + def _segment_by_face_detection( + self, + has_face_flags: List[bool], + min_segment_frames: int = 10 + ) -> List[Tuple[int, int, bool]]: + """ + Segment the video into continuous regions with/without face. + Returns list of (start_idx, end_idx, has_face) tuples. + Small segments are merged with neighbors. + """ + if not has_face_flags: + return [] + + segments = [] + start_idx = 0 + current_state = has_face_flags[0] + + for i in range(1, len(has_face_flags)): + if has_face_flags[i] != current_state: + segments.append((start_idx, i - 1, current_state)) + start_idx = i + current_state = has_face_flags[i] + + segments.append((start_idx, len(has_face_flags) - 1, current_state)) + + merged = [] + for seg in segments: + start, end, has_face = seg + length = end - start + 1 + + if length < min_segment_frames and merged: + prev_start, prev_end, prev_has_face = merged[-1] + merged[-1] = (prev_start, end, prev_has_face) + else: + merged.append(seg) + + return merged + + def _interpolate_smooth( + self, + positions: List[float], + segments: List[Tuple[int, int, bool]], + transition_frames: int = 15 + ) -> List[float]: + """ + Create smooth transitions between segments using cosine interpolation. + Within each segment, position is constant. Between segments, smooth transition. 
+ """ + if not positions or not segments: + return positions + + result = list(positions) + + segment_values = [] + for start, end, has_face in segments: + seg_positions = positions[start:end + 1] + if seg_positions: + segment_values.append(float(np.median(seg_positions))) + else: + segment_values.append(positions[start] if start < len(positions) else 0.0) + + for i, (start, end, has_face) in enumerate(segments): + value = segment_values[i] + for j in range(start, end + 1): + result[j] = value + + for i in range(len(segments) - 1): + seg1_start, seg1_end, _ = segments[i] + seg2_start, seg2_end, _ = segments[i + 1] + val1 = segment_values[i] + val2 = segment_values[i + 1] + + if abs(val2 - val1) < self.dead_zone * 0.5: + continue + + trans_start = max(seg1_end - transition_frames // 2, seg1_start) + trans_end = min(seg2_start + transition_frames // 2, seg2_end) + trans_length = trans_end - trans_start + 1 + + if trans_length < 2: + continue + + for j in range(trans_length): + t = j / (trans_length - 1) + smooth_t = 0.5 - 0.5 * np.cos(t * np.pi) + idx = trans_start + j + if 0 <= idx < len(result): + result[idx] = val1 + (val2 - val1) * smooth_t + + return result + + def _apply_savgol_filter( + self, + positions: List[float], + window_length: int = 61, + polyorder: int = 2 + ) -> List[float]: + """ + Apply Savitzky-Golay filter for ultra-smooth position tracking. + This is a signal processing filter that preserves trends while removing noise. + """ + if len(positions) < window_length: + window_length = len(positions) if len(positions) % 2 == 1 else len(positions) - 1 + if window_length < 3: + return positions + + if window_length % 2 == 0: + window_length -= 1 + + if window_length <= polyorder: + polyorder = max(1, window_length - 1) + + try: + smoothed = signal.savgol_filter(positions, window_length, polyorder, mode='nearest') + return smoothed.tolist() + except Exception as e: + logger.warning(f"Savgol filter failed: {e}, returning original positions") + return positions + + def _apply_median_filter(self, positions: List[float], window_size: int = 5) -> List[float]: + """ + Apply median filter to remove detection noise. + + Median filter is ideal for removing outliers while preserving + edges (real movements). Window size of 5 means each position + is replaced by the median of itself and 2 neighbors on each side. + + Args: + positions: Raw positions from detection + window_size: Window size (must be odd), default 5 + + Returns: + Filtered positions with noise removed + """ + if len(positions) < window_size: + return positions + + from scipy.signal import medfilt + + if window_size % 2 == 0: + window_size += 1 + + filtered = medfilt(positions, kernel_size=window_size) + + return filtered.tolist() + + def _is_detection_stable(self, has_face_flags: List[bool], window_size: int = 30) -> bool: + """ + Check if face detection is stable enough to use smart framing. + If detection is too unstable (frequent changes), it's better to use static center crop. 
+ + Args: + has_face_flags: Boolean flags indicating if face was detected per frame + window_size: Number of frames to analyze for stability + + Returns: + True if detection is stable, False if too unstable + """ + if len(has_face_flags) < window_size: + window_size = len(has_face_flags) + + if window_size == 0: + return False + + changes = 0 + for i in range(1, len(has_face_flags)): + if has_face_flags[i] != has_face_flags[i-1]: + changes += 1 + + change_rate = changes / len(has_face_flags) + + return change_rate < 0.3 + + def _stabilize_no_face_sequences( + self, + positions: List[float], + has_face_flags: List[bool], + source_center: float = None + ) -> List[float]: + """ + Stabilize positions during sequences without face detection. + Uses median of all valid positions for maximum stability. + """ + if len(positions) != len(has_face_flags): + return positions + + fallback = source_center if source_center else (positions[0] if positions else 0.0) + + face_ratio = sum(has_face_flags) / len(has_face_flags) if has_face_flags else 0 + if face_ratio < 0.15: + return [fallback] * len(positions) + + changes = sum(1 for i in range(1, len(has_face_flags)) if has_face_flags[i] != has_face_flags[i-1]) + instability_ratio = changes / len(has_face_flags) if has_face_flags else 0 + if instability_ratio > 0.25: + valid_positions = [positions[i] for i, has_face in enumerate(has_face_flags) if has_face] + if valid_positions: + return [float(np.median(valid_positions))] * len(positions) + return [fallback] * len(positions) + + valid_positions = [positions[i] for i, has_face in enumerate(has_face_flags) if has_face] + if not valid_positions: + return [fallback] * len(positions) + + global_median = float(np.median(valid_positions)) + stabilized = list(positions) + i = 0 + + while i < len(has_face_flags): + if not has_face_flags[i]: + start_idx = i + recent_valid = [] + for j in range(max(0, start_idx - self.position_history_size), start_idx): + if has_face_flags[j]: + recent_valid.append(positions[j]) + + lock_value = float(np.median(recent_valid)) if len(recent_valid) >= 5 else global_median + + while i < len(has_face_flags) and not has_face_flags[i]: + stabilized[i] = lock_value + i += 1 + else: + i += 1 + + return stabilized + def _calculate_crop_regions( self, contexts: List[FrameContext], source_width: int, - source_height: int + source_height: int, + fps: Optional[float] = None ) -> List[CropRegion]: """ - Calculate smooth crop regions for each frame. + Calculate smooth crop regions for each frame with multi-person support. 
Args: contexts: List of frame contexts @@ -168,120 +411,174 @@ class SmartFramer: source_aspect = source_width / source_height + # Calculate base crop dimensions for 9:16 if source_aspect > self.target_aspect: - crop_height = source_height - crop_width = int(crop_height / self.target_aspect) + base_crop_height = source_height + base_crop_width = int(base_crop_height / self.target_aspect) - if crop_width > source_width: - crop_width = source_width - crop_height = int(crop_width * self.target_aspect) + if base_crop_width > source_width: + base_crop_width = source_width + base_crop_height = int(base_crop_width * self.target_aspect) else: - crop_width = source_width - crop_height = int(crop_width * self.target_aspect) + base_crop_width = source_width + base_crop_height = int(base_crop_width * self.target_aspect) - if crop_height > source_height: - crop_height = source_height - crop_width = int(crop_height / self.target_aspect) + if base_crop_height > source_height: + base_crop_height = source_height + base_crop_width = int(base_crop_height / self.target_aspect) - safe_zone_margin_x = crop_width * 0.40 - safe_zone_margin_y = crop_height * 0.40 + center_xs = [] + center_ys = [] + zoom_factors = [] + has_face_flags = [] - dead_zone_threshold = 100 + static_center_x = float(source_width // 2) + static_center_y = float(source_height // 2) - if contexts and contexts[0].primary_focus: - current_crop_center_x = contexts[0].primary_focus[0] - current_crop_center_y = contexts[0].primary_focus[1] - else: - current_crop_center_x = source_width // 2 - current_crop_center_y = source_height // 2 + last_valid_x = static_center_x + last_valid_y = static_center_y + last_valid_zoom = 1.0 - center_xs = [current_crop_center_x] - center_ys = [current_crop_center_y] + for ctx in contexts: + selected_face = None + if ctx.selected_people: + idx = ctx.selected_people[0] + if 0 <= idx < len(ctx.detected_faces): + selected_face = ctx.detected_faces[idx] - for ctx in contexts[1:]: - if ctx.primary_focus and ctx.selected_people and len(ctx.detected_faces) > 0: - primary_person_idx = ctx.selected_people[0] if ctx.selected_people else 0 - if primary_person_idx < len(ctx.detected_faces): - face = ctx.detected_faces[primary_person_idx] + if selected_face: + center_x = float(selected_face.center_x) + center_y = float(selected_face.center_y) + center_xs.append(center_x) + center_ys.append(center_y) - face_left = face.x - face_right = face.x + face.width - face_top = face.y - face_bottom = face.y + face.height + required_width = selected_face.width * (1 + self.group_padding * 2) + required_height = selected_face.height * (1 + self.group_padding * 3) - crop_left = current_crop_center_x - crop_width // 2 - crop_right = current_crop_center_x + crop_width // 2 - crop_top = current_crop_center_y - crop_height // 2 - crop_bottom = current_crop_center_y + crop_height // 2 + zoom_w = required_width / base_crop_width + zoom_h = required_height / base_crop_height + zoom = max(zoom_w, zoom_h, 1.0) + zoom = min(zoom, self.max_zoom_out) + zoom_factors.append(zoom) - face_rel_left = face_left - crop_left - face_rel_right = face_right - crop_left - face_rel_top = face_top - crop_top - face_rel_bottom = face_bottom - crop_top + last_valid_x = center_x + last_valid_y = center_y + last_valid_zoom = zoom + has_face_flags.append(True) + elif ctx.group_bounds and ctx.group_bounds.face_count > 0: + group = ctx.group_bounds + center_x = float(group.center_x) + center_y = float(group.center_y) + center_xs.append(center_x) + center_ys.append(center_y) - 
face_left_safe = face_rel_left >= safe_zone_margin_x - face_right_safe = face_rel_right <= (crop_width - safe_zone_margin_x) - face_top_safe = face_rel_top >= safe_zone_margin_y - face_bottom_safe = face_rel_bottom <= (crop_height - safe_zone_margin_y) + required_width = group.width * (1 + self.group_padding * 2) + required_height = group.height * (1 + self.group_padding * 3) - face_fully_visible = face_left_safe and face_right_safe and face_top_safe and face_bottom_safe + zoom_w = required_width / base_crop_width + zoom_h = required_height / base_crop_height + zoom = max(zoom_w, zoom_h, 1.0) + zoom = min(zoom, self.max_zoom_out) + zoom_factors.append(zoom) - if face_fully_visible: - center_xs.append(current_crop_center_x) - center_ys.append(current_crop_center_y) - else: - shift_x = 0 - shift_y = 0 + last_valid_x = center_x + last_valid_y = center_y + last_valid_zoom = zoom + has_face_flags.append(True) + elif ctx.primary_focus and len(ctx.detected_faces) > 0: + center_x = float(ctx.primary_focus[0]) + center_y = float(ctx.primary_focus[1]) + center_xs.append(center_x) + center_ys.append(center_y) + zoom_factors.append(1.0) - if not face_left_safe: - shift_x = face_rel_left - safe_zone_margin_x - elif not face_right_safe: - shift_x = face_rel_right - (crop_width - safe_zone_margin_x) - - if not face_top_safe: - shift_y = face_rel_top - safe_zone_margin_y - elif not face_bottom_safe: - shift_y = face_rel_bottom - (crop_height - safe_zone_margin_y) - - if abs(shift_x) > dead_zone_threshold: - current_crop_center_x += shift_x - if abs(shift_y) > dead_zone_threshold: - current_crop_center_y += shift_y - - center_xs.append(current_crop_center_x) - center_ys.append(current_crop_center_y) - else: - center_xs.append(current_crop_center_x) - center_ys.append(current_crop_center_y) + last_valid_x = center_x + last_valid_y = center_y + last_valid_zoom = 1.0 + has_face_flags.append(True) else: - center_xs.append(current_crop_center_x) - center_ys.append(current_crop_center_y) + center_xs.append(last_valid_x) + center_ys.append(last_valid_y) + zoom_factors.append(last_valid_zoom) + has_face_flags.append(False) - if len(center_xs) > 1: - alpha = 0.002 - smoothed_xs = [center_xs[0]] - smoothed_ys = [center_ys[0]] - for i in range(1, len(center_xs)): - if center_xs[i] != center_xs[i-1] or center_ys[i] != center_ys[i-1]: - smoothed_xs.append(alpha * center_xs[i] + (1 - alpha) * smoothed_xs[i-1]) - smoothed_ys.append(alpha * center_ys[i] + (1 - alpha) * smoothed_ys[i-1]) - else: - smoothed_xs.append(smoothed_xs[i-1]) - smoothed_ys.append(smoothed_ys[i-1]) - center_xs = smoothed_xs - center_ys = smoothed_ys + center_x_video = float(source_width // 2) + center_y_video = float(source_height // 2) - center_xs = self._limit_velocity(center_xs, 2) - center_ys = self._limit_velocity(center_ys, 2) + if not self._is_detection_stable(has_face_flags): + final_xs = [center_x_video] * len(center_xs) + final_ys = [center_y_video] * len(center_ys) + final_zooms = [1.0] * len(zoom_factors) + else: + center_xs = self._stabilize_no_face_sequences( + center_xs, + has_face_flags, + source_center=center_x_video + ) + center_ys = self._stabilize_no_face_sequences( + center_ys, + has_face_flags, + source_center=center_y_video + ) + zoom_factors = self._stabilize_no_face_sequences( + zoom_factors, + has_face_flags, + source_center=1.0 + ) - center_xs = self._apply_dead_zone(center_xs, 5) - center_ys = self._apply_dead_zone(center_ys, 5) + face_count = sum(has_face_flags) + if face_count < len(has_face_flags) * 0.3: + final_xs = 
[center_x_video] * len(center_xs) + final_ys = [center_y_video] * len(center_ys) + final_zooms = [1.0] * len(zoom_factors) + else: + valid_xs = [center_xs[i] for i, has_face in enumerate(has_face_flags) if has_face] + valid_ys = [center_ys[i] for i, has_face in enumerate(has_face_flags) if has_face] + valid_zooms = [zoom_factors[i] for i, has_face in enumerate(has_face_flags) if has_face] + target_x = float(np.median(valid_xs)) if valid_xs else center_x_video + target_y = float(np.median(valid_ys)) if valid_ys else center_y_video + target_zoom = float(np.median(valid_zooms)) if valid_zooms else 1.0 + + for i in range(len(center_xs)): + if not has_face_flags[i]: + center_xs[i] = target_x + center_ys[i] = target_y + zoom_factors[i] = target_zoom + + final_xs = self._apply_savgol_filter(center_xs, window_length=61, polyorder=2) + final_ys = self._apply_savgol_filter(center_ys, window_length=61, polyorder=2) + final_zooms = self._apply_savgol_filter(zoom_factors, window_length=61, polyorder=2) + + if fps and self.response_time > 0: + dt = self.frame_skip / fps + alpha = 1 - np.exp(-dt / self.response_time) + final_xs = self._apply_exponential_smoothing(final_xs, alpha) + final_ys = self._apply_exponential_smoothing(final_ys, alpha) + final_zooms = self._apply_exponential_smoothing(final_zooms, alpha) + + # Generate crop regions crop_regions = [] - for center_x, center_y in zip(center_xs, center_ys): - x = int(center_x - crop_width // 2) - y = int(center_y - crop_height // 2) + for cx, cy, zoom in zip(final_xs, final_ys, final_zooms): + # Calculate actual crop size with zoom + crop_width = int(base_crop_width * zoom) + crop_height = int(base_crop_height * zoom) + # Clamp to source dimensions + crop_width = min(crop_width, source_width) + crop_height = min(crop_height, source_height) + + # Maintain aspect ratio after clamping + if crop_width / crop_height > base_crop_width / base_crop_height: + crop_width = int(crop_height * base_crop_width / base_crop_height) + else: + crop_height = int(crop_width * base_crop_height / base_crop_width) + + # Calculate top-left corner + x = int(cx - crop_width // 2) + y = int(cy - crop_height // 2) + + # Keep within bounds x = max(0, min(x, source_width - crop_width)) y = max(0, min(y, source_height - crop_height)) @@ -292,11 +589,26 @@ class SmartFramer: height=crop_height )) + # Clear temporary lists center_xs.clear() center_ys.clear() + zoom_factors.clear() return crop_regions + def _apply_exponential_smoothing(self, positions: List[float], alpha: float) -> List[float]: + """ + Smooth positions with exponential moving average. + """ + if not positions: + return positions + + alpha = max(0.0, min(alpha, 1.0)) + smoothed = [positions[0]] + for i in range(1, len(positions)): + prev = smoothed[-1] + smoothed.append(prev + alpha * (positions[i] - prev)) + return smoothed def _apply_dead_zone(self, positions: List[float], threshold: float) -> List[float]: """ Apply dead zone to eliminate micro-movements. @@ -355,7 +667,13 @@ class SmartFramer: ) -> VideoClip: """ Apply smart framing to a video clip. - Always uses single-person focus (no split screen). + Automatically selects layout based on number of people detected. 
+ + Layouts: + - 1 person: Single framing (follow person) + - 2 people: Vertical split screen (side by side) + - 3 people: 1 on top, 2 on bottom + - 4 people: 2x2 grid Args: video_clip: Source video clip @@ -364,7 +682,42 @@ class SmartFramer: Returns: Reframed video clip """ - return self._apply_single_framing(video_clip, framing_plan) + # Determine predominant number of faces across all frames + if not framing_plan.frame_contexts: + return self._apply_single_framing(video_clip, framing_plan) + + face_counts = [] + for ctx in framing_plan.frame_contexts: + if ctx.active_speakers: + face_counts.append(len(ctx.active_speakers)) + elif ctx.group_bounds: + face_counts.append(ctx.group_bounds.face_count) + else: + face_counts.append(len(ctx.detected_faces)) + + # Use mode (most common) face count, minimum 1 + if face_counts: + from collections import Counter + count_freq = Counter(face_counts) + # Get the most common count, but ignore 0 + non_zero_counts = {k: v for k, v in count_freq.items() if k > 0} + if non_zero_counts: + predominant_faces = max(non_zero_counts, key=non_zero_counts.get) + else: + predominant_faces = 1 + else: + predominant_faces = 1 + + logger.info(f"Layout selection: predominant_faces={predominant_faces}") + + if predominant_faces == 1: + return self._apply_single_framing(video_clip, framing_plan) + elif predominant_faces == 2: + return self._apply_split_screen(video_clip, framing_plan) + elif predominant_faces == 3: + return self._apply_three_person_layout(video_clip, framing_plan) + else: # 4 or more + return self._apply_grid_layout(video_clip, framing_plan) def _apply_single_framing( self, @@ -396,22 +749,23 @@ class SmartFramer: cropped = frame[y:y + crop_h, x:x + crop_w] else: exact_frame_idx = (t * framing_plan.fps) / self.frame_skip + last_idx = len(framing_plan.crop_regions) - 1 + if last_idx <= 0: + crop = framing_plan.crop_regions[0] + x, y, width, height = crop.x, crop.y, crop.width, crop.height + else: + exact_frame_idx = max(0.0, min(exact_frame_idx, float(last_idx))) + low_idx = int(np.floor(exact_frame_idx)) + high_idx = min(low_idx + 1, last_idx) + alpha = exact_frame_idx - low_idx - idx_floor = int(exact_frame_idx) - idx_ceil = idx_floor + 1 + crop_a = framing_plan.crop_regions[low_idx] + crop_b = framing_plan.crop_regions[high_idx] - alpha = exact_frame_idx - idx_floor - - idx_floor = max(0, min(idx_floor, len(framing_plan.crop_regions) - 1)) - idx_ceil = max(0, min(idx_ceil, len(framing_plan.crop_regions) - 1)) - - crop1 = framing_plan.crop_regions[idx_floor] - crop2 = framing_plan.crop_regions[idx_ceil] - - x = int(crop1.x * (1 - alpha) + crop2.x * alpha) - y = int(crop1.y * (1 - alpha) + crop2.y * alpha) - width = int(crop1.width * (1 - alpha) + crop2.width * alpha) - height = int(crop1.height * (1 - alpha) + crop2.height * alpha) + x = int(round(crop_a.x + (crop_b.x - crop_a.x) * alpha)) + y = int(round(crop_a.y + (crop_b.y - crop_a.y) * alpha)) + width = int(round(crop_a.width + (crop_b.width - crop_a.width) * alpha)) + height = int(round(crop_a.height + (crop_b.height - crop_a.height) * alpha)) h, w = frame.shape[:2] x = max(0, min(x, w - width)) @@ -440,7 +794,7 @@ class SmartFramer: framing_plan: FramingPlan ) -> VideoClip: """ - Apply split screen for two people. + Apply split screen for two people (side by side vertical split). 
Args: video_clip: Source video clip @@ -471,71 +825,53 @@ class SmartFramer: output = np.zeros((self.target_height, self.target_width, 3), dtype=np.uint8) - if context.selected_people and len(context.selected_people) >= 2: - selected_faces = [context.detected_faces[i] for i in context.selected_people[:2] - if i < len(context.detected_faces)] - - if len(selected_faces) >= 2: - faces = sorted(selected_faces, key=lambda f: f.center_x) - left_face = faces[0] - right_face = faces[1] - - for idx, face in enumerate([left_face, right_face]): - - half_width = self.target_width // 2 - half_aspect = self.target_height / half_width # Aspect ratio for half - - face_width = max(face.width, frame.shape[1] // 4) # At least 1/4 of frame width - crop_width = int(face_width * 2.5) # Add padding around face - crop_height = int(crop_width * half_aspect) # Maintain correct aspect - - max_crop_width = frame.shape[1] // 2 # Half the source width - max_crop_height = frame.shape[0] # Full source height - - if crop_width > max_crop_width: - crop_width = max_crop_width - crop_height = int(crop_width * half_aspect) - - if crop_height > max_crop_height: - crop_height = max_crop_height - crop_width = int(crop_height / half_aspect) - - x = max(0, face.center_x - crop_width // 2) - y = max(0, face.center_y - crop_height // 2) - - x = min(x, frame.shape[1] - crop_width) - y = min(y, frame.shape[0] - crop_height) - - cropped = frame[y:y + crop_height, x:x + crop_width] - resized = cv2.resize( - cropped, - (half_width, self.target_height), - interpolation=cv2.INTER_LINEAR - ) - - x_offset = idx * half_width - output[:, x_offset:x_offset + half_width] = resized - else: - if framing_plan.crop_regions: - crop_idx = max(0, min(frame_idx, len(framing_plan.crop_regions) - 1)) - crop = framing_plan.crop_regions[crop_idx] - cropped = frame[crop.y:crop.y + crop.height, crop.x:crop.x + crop.width] - else: - h, w = frame.shape[:2] - crop_h = int(w * self.target_aspect) - crop_w = w - if crop_h > h: - crop_h = h - crop_w = int(h / self.target_aspect) - y = (h - crop_h) // 2 - x = (w - crop_w) // 2 - cropped = frame[y:y + crop_h, x:x + crop_w] - output = cv2.resize( - cropped, - (self.target_width, self.target_height), - interpolation=cv2.INTER_LINEAR - ) + if context.active_speakers: + faces = [ + context.detected_faces[idx] + for idx in context.active_speakers + if 0 <= idx < len(context.detected_faces) + ][:2] else: + # Use top faces by confidence for stability + faces = sorted(context.detected_faces, key=lambda f: f.confidence, reverse=True)[:2] + + if len(faces) >= 2: + # Sort by X position (left to right) + faces_sorted = sorted(faces, key=lambda f: f.center_x) + left_face = faces_sorted[0] + right_face = faces_sorted[1] + + half_width = self.target_width // 2 + half_aspect = self.target_height / half_width + + for idx, face in enumerate([left_face, right_face]): + # Calculate crop region around face + crop_width = int(face.width * 3) # 3x face width for good framing + crop_height = int(crop_width * half_aspect) + + # Clamp to reasonable limits + crop_width = max(crop_width, frame.shape[1] // 4) + crop_width = min(crop_width, frame.shape[1]) + crop_height = min(crop_height, frame.shape[0]) + + # Ensure proper aspect ratio + if crop_height / crop_width > half_aspect: + crop_height = int(crop_width * half_aspect) + else: + crop_width = int(crop_height / half_aspect) + + # Center crop on face + x = max(0, min(face.center_x - crop_width // 2, frame.shape[1] - crop_width)) + y = max(0, min(face.center_y - crop_height // 2, 
frame.shape[0] - crop_height)) + + # Extract and resize + cropped = frame[y:y + crop_height, x:x + crop_width] + resized = cv2.resize(cropped, (half_width, self.target_height), interpolation=cv2.INTER_LINEAR) + + x_offset = idx * half_width + output[:, x_offset:x_offset + half_width] = resized + else: + # Fallback to single framing if framing_plan.crop_regions: crop_idx = max(0, min(frame_idx, len(framing_plan.crop_regions) - 1)) crop = framing_plan.crop_regions[crop_idx] @@ -550,11 +886,122 @@ class SmartFramer: y = (h - crop_h) // 2 x = (w - crop_w) // 2 cropped = frame[y:y + crop_h, x:x + crop_w] - output = cv2.resize( - cropped, - (self.target_width, self.target_height), - interpolation=cv2.INTER_LINEAR - ) + output = cv2.resize(cropped, (self.target_width, self.target_height), interpolation=cv2.INTER_LINEAR) + + return output + + new_clip = VideoClip(duration=video_clip.duration) + new_clip.size = (self.target_width, self.target_height) + new_clip.frame_function = make_frame + return new_clip + + def _apply_three_person_layout( + self, + video_clip: VideoFileClip, + framing_plan: FramingPlan + ) -> VideoClip: + """ + Apply layout for 3 people: 1 on top (full width), 2 on bottom (side by side). + + Args: + video_clip: Source video clip + framing_plan: Framing plan + + Returns: + Three-person layout video clip + """ + def make_frame(t): + frame = video_clip.get_frame(t) + exact_frame_idx = (t * framing_plan.fps) / self.frame_skip + frame_idx = int(exact_frame_idx) + + if not framing_plan.frame_contexts: + h, w = frame.shape[:2] + crop_h = int(w * self.target_aspect) + crop_w = w + if crop_h > h: + crop_h = h + crop_w = int(h / self.target_aspect) + y = (h - crop_h) // 2 + x = (w - crop_w) // 2 + cropped = frame[y:y + crop_h, x:x + crop_w] + return cv2.resize(cropped, (self.target_width, self.target_height), interpolation=cv2.INTER_LINEAR) + + frame_idx = max(0, min(frame_idx, len(framing_plan.frame_contexts) - 1)) + context = framing_plan.frame_contexts[frame_idx] + + output = np.zeros((self.target_height, self.target_width, 3), dtype=np.uint8) + + if context.active_speakers: + faces = [ + context.detected_faces[idx] + for idx in context.active_speakers + if 0 <= idx < len(context.detected_faces) + ][:3] + else: + faces = sorted(context.detected_faces, key=lambda f: f.confidence, reverse=True)[:3] # Max 3 faces + num_faces = len(faces) + + if num_faces >= 3: + # Sort faces by Y position (top to bottom), then X for bottom row + faces_sorted = sorted(faces, key=lambda f: f.center_y) + top_face = faces_sorted[0] # Topmost face + bottom_faces = sorted(faces_sorted[1:], key=lambda f: f.center_x) # Sort bottom by X + + # Top section: full width, half height + top_height = self.target_height // 2 + top_width = self.target_width + top_aspect = top_height / top_width + + # Crop around top face + crop_w = int(top_face.width * 3) # 3x face width for context + crop_h = int(crop_w * top_aspect) + crop_w = min(crop_w, frame.shape[1]) + crop_h = min(crop_h, frame.shape[0]) + + x = max(0, min(top_face.center_x - crop_w // 2, frame.shape[1] - crop_w)) + y = max(0, min(top_face.center_y - crop_h // 2, frame.shape[0] - crop_h)) + + cropped_top = frame[y:y + crop_h, x:x + crop_w] + resized_top = cv2.resize(cropped_top, (top_width, top_height), interpolation=cv2.INTER_LINEAR) + output[0:top_height, :] = resized_top + + # Bottom section: two halves + bottom_height = self.target_height - top_height + half_width = self.target_width // 2 + bottom_aspect = bottom_height / half_width + + for idx, face in 
enumerate(bottom_faces[:2]): + crop_w = int(face.width * 3) + crop_h = int(crop_w * bottom_aspect) + crop_w = min(crop_w, frame.shape[1] // 2) + crop_h = min(crop_h, frame.shape[0]) + + x = max(0, min(face.center_x - crop_w // 2, frame.shape[1] - crop_w)) + y = max(0, min(face.center_y - crop_h // 2, frame.shape[0] - crop_h)) + + cropped = frame[y:y + crop_h, x:x + crop_w] + resized = cv2.resize(cropped, (half_width, bottom_height), interpolation=cv2.INTER_LINEAR) + + x_offset = idx * half_width + output[top_height:, x_offset:x_offset + half_width] = resized + else: + # Fallback to single framing + if framing_plan.crop_regions: + crop_idx = max(0, min(frame_idx, len(framing_plan.crop_regions) - 1)) + crop = framing_plan.crop_regions[crop_idx] + cropped = frame[crop.y:crop.y + crop.height, crop.x:crop.x + crop.width] + else: + h, w = frame.shape[:2] + crop_h = int(w * self.target_aspect) + crop_w = w + if crop_h > h: + crop_h = h + crop_w = int(h / self.target_aspect) + y = (h - crop_h) // 2 + x = (w - crop_w) // 2 + cropped = frame[y:y + crop_h, x:x + crop_w] + output = cv2.resize(cropped, (self.target_width, self.target_height), interpolation=cv2.INTER_LINEAR) return output @@ -569,7 +1016,8 @@ class SmartFramer: framing_plan: FramingPlan ) -> VideoClip: """ - Apply grid layout for 3+ people. + Apply grid layout for 4 people (2x2 grid). + Layout: top-left, top-right, bottom-left, bottom-right Args: video_clip: Source video clip @@ -600,44 +1048,53 @@ class SmartFramer: output = np.zeros((self.target_height, self.target_width, 3), dtype=np.uint8) - num_faces = len(context.detected_faces) + if context.active_speakers: + faces = [ + context.detected_faces[idx] + for idx in context.active_speakers + if 0 <= idx < len(context.detected_faces) + ][:4] + else: + faces = sorted(context.detected_faces, key=lambda f: f.confidence, reverse=True)[:4] # Max 4 faces + num_faces = len(faces) - if num_faces >= 3: + if num_faces >= 4: cell_width = self.target_width // 2 cell_height = self.target_height // 2 + cell_aspect = cell_height / cell_width - for idx, face in enumerate(context.detected_faces[:4]): + # Sort faces into grid positions by their actual position + # First sort by Y (top row vs bottom row), then by X within each row + sorted_by_y = sorted(faces, key=lambda f: f.center_y) + top_row = sorted(sorted_by_y[:2], key=lambda f: f.center_x) + bottom_row = sorted(sorted_by_y[2:], key=lambda f: f.center_x) + grid_faces = top_row + bottom_row + + for idx, face in enumerate(grid_faces): row = idx // 2 col = idx % 2 - cell_aspect = cell_height / cell_width - - crop_width = frame.shape[1] // 2 + # Calculate crop region centered on face + crop_width = int(face.width * 3) # 3x face width crop_height = int(crop_width * cell_aspect) - max_crop_width = frame.shape[1] // 2 - max_crop_height = frame.shape[0] // 2 + # Clamp to reasonable limits + crop_width = max(crop_width, frame.shape[1] // 4) + crop_width = min(crop_width, frame.shape[1]) + crop_height = min(crop_height, frame.shape[0]) - if crop_width > max_crop_width: - crop_width = max_crop_width + # Ensure proper aspect ratio + if crop_height / crop_width > cell_aspect: crop_height = int(crop_width * cell_aspect) - - if crop_height > max_crop_height: - crop_height = max_crop_height + else: crop_width = int(crop_height / cell_aspect) - x = max(0, face.center_x - crop_width // 2) - y = max(0, face.center_y - crop_height // 2) - - x = min(x, frame.shape[1] - crop_width) - y = min(y, frame.shape[0] - crop_height) + # Center crop on face + x = max(0, 
min(face.center_x - crop_width // 2, frame.shape[1] - crop_width)) + y = max(0, min(face.center_y - crop_height // 2, frame.shape[0] - crop_height)) cropped = frame[y:y + crop_height, x:x + crop_width] - resized = cv2.resize( - cropped, - (cell_width, cell_height), - interpolation=cv2.INTER_LINEAR - ) + resized = cv2.resize(cropped, (cell_width, cell_height), interpolation=cv2.INTER_LINEAR) y_offset = row * cell_height x_offset = col * cell_width
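For context on the anti-jitter pipeline introduced in `_calculate_crop_regions`: crop centers are first passed through a Savitzky-Golay filter and then through an exponential moving average whose alpha is derived from the configured response time (`alpha = 1 - exp(-dt / response_time)`, with `dt = frame_skip / fps`). A self-contained sketch of that chain, assuming `frame_skip = 1` and the defaults from this patch (the `smooth_centers` helper is illustrative only, not repository code):

```python
import numpy as np
from scipy.signal import savgol_filter

def smooth_centers(xs, fps=30.0, response_time=0.6, window_length=61, polyorder=2):
    """Savitzky-Golay pass to strip detection noise, then a response-time based EMA."""
    xs = np.asarray(xs, dtype=float)
    # Savgol needs an odd window no longer than the signal itself
    wl = min(window_length, len(xs) if len(xs) % 2 == 1 else len(xs) - 1)
    if wl > polyorder:
        xs = savgol_filter(xs, wl, polyorder, mode="nearest")
    alpha = 1.0 - np.exp(-(1.0 / fps) / response_time)  # dt = frame_skip / fps, frame_skip = 1
    smoothed = [float(xs[0])]
    for value in xs[1:]:
        smoothed.append(smoothed[-1] + alpha * (float(value) - smoothed[-1]))
    return smoothed

# Example: a noisy horizontal face-center track at 30 FPS
noisy_track = (960 + 40 * np.random.randn(300)).tolist()
stable_track = smooth_centers(noisy_track)
```

With the patch defaults (`response_time=0.6` at 30 FPS) the EMA alpha works out to roughly 0.05, which is what keeps the crop window from chasing per-frame detection jitter.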