From 07d301f110240c51c0a1758590a85956fcde82b4 Mon Sep 17 00:00:00 2001 From: LeoMortari Date: Thu, 18 Dec 2025 02:26:25 -0300 Subject: [PATCH] Realiza varios ajustes para melhorar o tracking e o render de video --- docker-compose.yml | 10 +- prompts/generate.txt | 165 ++++++---- video_render/config.py | 18 +- video_render/context_detection.py | 509 +++++++++++++++++++++++++++--- video_render/llm.py | 4 +- video_render/media.py | 10 +- video_render/pipeline.py | 3 + video_render/rendering.py | 14 +- video_render/smart_framing.py | 371 +++++++++++----------- video_render/transcription.py | 136 +++++++- video_render/utils.py | 60 +++- 11 files changed, 984 insertions(+), 316 deletions(-) diff --git a/docker-compose.yml b/docker-compose.yml index 628ee37..200f4a0 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -1,7 +1,10 @@ services: video-render: restart: unless-stopped - build: . + build: + context: . + no_cache: true + dockerfile: dockerfile environment: - RABBITMQ_PASS=${RABBITMQ_PASS} - OPENROUTER_API_URL=${OPENROUTER_API_URL:-https://openrouter.ai/api/v1/chat/completions} @@ -9,12 +12,17 @@ services: - OPENROUTER_MODEL=${OPENROUTER_MODEL:-openai/gpt-oss-20b:free} - OPENROUTER_PROMPT_PATH=${OPENROUTER_PROMPT_PATH:-prompts/generate.txt} - FASTER_WHISPER_MODEL_SIZE=${FASTER_WHISPER_MODEL_SIZE:-medium} + - SMART_FRAMING_SMOOTHING_WINDOW=${SMART_FRAMING_SMOOTHING_WINDOW:-30} + - SMART_FRAMING_MAX_VELOCITY=${SMART_FRAMING_MAX_VELOCITY:-40} + - SMART_FRAMING_FRAME_SKIP=${SMART_FRAMING_FRAME_SKIP:-2} + - SMART_FRAMING_PERSON_SWITCH_COOLDOWN=${SMART_FRAMING_PERSON_SWITCH_COOLDOWN:-60} volumes: - "/root/videos:/app/videos" - "/root/outputs:/app/outputs" - "/root/prompts:/app/prompts" # - "./videos:/app/videos" # - "./outputs:/app/outputs" + # - "./prompts:/app/prompts" command: "python -u main.py" networks: - dokploy-network diff --git a/prompts/generate.txt b/prompts/generate.txt index 8638af2..bd90862 100644 --- a/prompts/generate.txt +++ b/prompts/generate.txt @@ -1,85 +1,118 @@ -Voce e especialista em viralidade de redes sociais (TikTok, Instagram Reels, YouTube Shorts). Analise a transcricao e selecione trechos com MAXIMO potencial viral, priorizando qualidade sobre quantidade. +Você é especialista em viralidade de redes sociais (TikTok, Instagram Reels, YouTube Shorts). Sua missão: EXTRAIR O MÁXIMO de clips virais possíveis, priorizando QUANTIDADE + QUALIDADE. -PROCESSO DE ANALISE: -1. Mapear potenciais trechos na transcricao -2. Avaliar cada trecho usando sistema de pontuacao abaixo +🎯 OBJETIVO: Transformar cada vídeo em MÚLTIPLOS clips que podem viralizar + +PROCESSO DE ANÁLISE: +1. Mapear TODOS os potenciais trechos virais na transcrição +2. Avaliar cada trecho usando sistema de pontuação abaixo 3. Rankear do maior para menor score viral -4. Selecionar apenas os top-ranked baseado na duracao do video +4. Selecionar TODOS os trechos com score ≥ 60 (não seja conservador!) -SISTEMA DE PONTUACAO VIRAL (0-100 pontos): +SISTEMA DE PONTUAÇÃO VIRAL (0-100 pontos): -HOOK/ABERTURA (0-25 pontos): -[25] Frase choqueante, pergunta polemica ou promessa ousada -[20] Historia intrigante ou situacao inusitada -[15] Afirmacao interessante mas previsivel -[10] Introducao generica mas aceitavel -[0] "Oi", "entao", silencio ou conteudo fraco +🪝 GANCHO INICIAL (0-30 pontos) - CRÍTICO PARA VIRALIZAÇÃO: +[30] Frase CHOCANTE, pergunta POLÊMICA ou promessa OUSADA nos primeiros 3 segundos +[25] Hook forte: "Você não vai acreditar...", "O segredo que ninguém conta...", "Isso mudou tudo..." 
+[20] Pergunta intrigante ou afirmação controversa +[15] História interessante mas gancho fraco +[10] Início genérico mas aceitável +[0] "Oi", "então", "bem", silêncio - DESCARTAR -GATILHO EMOCIONAL (0-25 pontos): -[25] Emocao extrema: raiva, choque, riso intenso, inspiracao profunda -[20] Emocao forte: surpresa, indignacao, humor, curiosidade intensa -[15] Emocao moderada: interesse, leve humor, curiosidade -[10] Emocao fraca: informativo sem impacto emocional -[0] Monotono, tecnico, sem apelo emocional +🔥 GATILHO EMOCIONAL (0-25 pontos): +[25] Emoção EXTREMA: raiva, choque, riso intenso, WTF moment, revelação bombástica +[20] Emoção forte: surpresa, indignação, humor, curiosidade intensa +[15] Emoção moderada: interesse, leve humor, insight interessante +[10] Emoção fraca: informativo sem impacto +[0] Monótono, técnico, sem apelo emocional - EVITAR -VALOR/UTILIDADE (0-20 pontos): -[20] Segredo valioso, insight transformador ou informacao exclusiva -[15] Ensina algo pratico e imediatamente aplicavel -[10] Opiniao interessante ou perspectiva util -[5] Informacao generica ou conhecimento comum -[0] Nenhum valor pratico, puro enrolation +💎 VALOR/UTILIDADE (0-20 pontos): +[20] Segredo VALIOSO, insight transformador, informação EXCLUSIVA +[15] Ensina algo prático e IMEDIATAMENTE aplicável +[10] Opinião interessante ou perspectiva única +[5] Informação genérica ou conhecimento comum +[0] Nenhum valor prático, puro "enrolation" - DESCARTAR -ESTRUTURA NARRATIVA (0-15 pontos): -[15] Historia completa com inicio, conflito/climax e resolucao -[10] Segmento com comeco e fim coerentes +📖 ESTRUTURA NARRATIVA (0-15 pontos): +[15] História COMPLETA com início, conflito/clímax e resolução satisfatória +[10] Segmento com começo e fim coerentes, faz sentido isolado [5] Trecho com sentido mas cortado abruptamente -[0] Fragmento sem contexto ou conclusao +[0] Fragmento sem contexto - NÃO USAR -RITMO E ENERGIA (0-15 pontos): -[15] Dinamico, sem pausas, alta energia, palavras impactantes -[10] Bom ritmo com pausas naturais curtas -[5] Ritmo lento mas aceitavel -[0] Muitas pausas, hesitacoes, monotonia, silencio +⚡ RITMO E ENERGIA (0-10 pontos): +[10] DINÂMICO, sem pausas longas, alta energia, palavras impactantes +[7] Bom ritmo com pausas naturais curtas (< 2s) +[3] Ritmo lento mas aceitável +[0] Muitas pausas (> 3s), hesitações, monotonia - EVITAR -REGRAS DE QUANTIDADE: -5-10 min: 3 clipes (minimo 1 se score alto) -10-20 min: 4 clipes -20-30 min: 5 clipes -30+ min: 6 clipes (maximo absoluto) +REGRAS DE QUANTIDADE (SER AGRESSIVO): +📊 Quantidade MÍNIMA por duração: +- 5-10 min: MÍNIMO 4-6 clips +- 10-15 min: MÍNIMO 6-8 clips +- 15-20 min: MÍNIMO 8-10 clips +- 20-30 min: MÍNIMO 10-15 clips +- 30+ min: MÍNIMO 15-20 clips -IMPORTANTE: Priorize qualidade. Melhor 3 clipes score 80+ que 6 clipes score 50. Se poucos momentos virais, retorne apenas os melhores (minimo 1). +🎯 REGRA DE OURO: 1 clip a cada 2-3 minutos de vídeo (NO MÍNIMO) +- Se encontrar momentos virais, SEMPRE selecione! 
+- Melhor ter 3 clips perfeitos que 10 clips bons -CRITERIOS DE SELECAO: -- Score viral maior ou igual 60 pontos (idealmente maior ou igual 70) -- Duracao ideal: 60-90s -- Duracao minima: 60s | Duracao maxima: 120s -- Sem sobreposicao (end de um menor que start do proximo) -- Inicio e fim coerentes +CRITÉRIOS DE SELEÇÃO: +- Score viral ≥ 60 pontos (idealmente ≥ 70) +- Duração ideal: 60-120s (formato ideal para Reels/Shorts) +- Duração mínima: 60s | Duração máxima: 120s +- Sem sobreposição temporal +- DEVE ter gancho forte nos primeiros 3 segundos +- Início e fim coerentes -EVITE: -- Introducoes genericas -- Trechos com silencio/pausas maiores que 3s -- Explicacoes tecnicas sem gancho emocional -- Segmentos sem conclusao -- Momentos de transicao +GANCHOS QUE FAZEM VIRALIZAR (use como filtro): +- "O que ninguém te conta sobre..." +- "O erro que 90% das pessoas cometem..." +- "Você não vai acreditar o que aconteceu..." +- Revelações chocantes ou contraintuitivas +- Antes vs Depois, transformações +- Segredos, bastidores, verdades ocultas +- Polêmicas, opiniões fortes, hot takes +- Histórias dramáticas com reviravolta +- Dicas práticas e acionáveis +- Momentos de humor genuíno -FORMATO JSON (retorne APENAS isto): -{"highlights":[{"start":,"end":,"summary":"Score estimado e gatilhos principais"}]} +❌ EVITE (mas não descarte se score alto): +- Introduções genéricos SEM gancho +- Trechos com pausas > 3s consecutivas +- Explicações técnicas SEM gancho emocional +- Segmentos sem conclusão clara +- Momentos de transição vazios -REGRAS TECNICAS: -- Float com ponto decimal (45.5 NAO 45,5) +FORMATO JSON (retorne APENAS isto, SEM texto adicional): +{ + "highlights": [ + { + "start": , + "end": , + "summary": "Score: XX/100 | Gancho: [descreva] | Gatilho: [descreva]", + } + ] +} + +REGRAS TÉCNICAS: +- Float com ponto decimal (45.5 NÃO 45,5) - Timestamps exatos dos segments fornecidos -- Ordem cronologica (start crescente) -- Minimo 1, maximo 6 highlights -- Summary conciso (1-2 frases) +- Ordem cronológica (start crescente) +- Summary conciso mas informativo (2-3 frases) -TAREFA: -1. Leia transcricao e timestamps -2. Avalie e pontue trechos mentalmente -3. Rankear por score viral -4. Selecione top-ranked baseado na duracao -5. Retorne JSON -6. Se video fraco, retorne pelo menos 1 highlight +TAREFA PASSO A PASSO: +1. Leia transcrição completa +2. Identifique TODOS os momentos potencialmente virais +3. Avalie e pontue cada trecho (seja generoso!) +4. Rankear por score viral +5. Selecione TODOS com score ≥ 60 +6. Garanta mínimo de 1 clip a cada 5 minutos +7. Retorne JSON completo -Objetivo: MAXIMIZAR chance de viralizar. Seja criterioso, apenas melhores trechos. +⚠️ IMPORTANTE: +- NÃO seja conservador! Se encontrou 10 momentos bons, retorne os 10! +- Pense em MAXIMIZAR alcance: mais clips = mais chances de viralizar +- Se vídeo tem conteúdo fraco, seja criterioso, mas SEMPRE retorne pelo menos 3-5 clips +- Priorize clips com GANCHOS FORTES - gancho fraco = baixo alcance +🎯 MINDSET: Você é um criador de conteúdo viral. Seu objetivo é extrair MÁXIMO valor do vídeo original. 
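Note on the JSON contract in the prompt above: the model is asked to return a bare {"highlights": [...]} object with float timestamps (decimal point, not comma), chronological order, no temporal overlap, and clips of roughly 60-120s, while video_render/llm.py (changed further down in this patch) additionally discards highlights shorter than 45s or, after this change, longer than 90s. The following is a minimal, illustrative sketch of a standalone validator for that contract, assuming hypothetical names (validate_highlights, MIN_DURATION_S, MAX_DURATION_S); it is not the repository's actual parser, which lives in video_render/llm.py.

import json
from typing import Any, Dict, List

# Illustrative sketch only: enforces the JSON contract the prompt requests.
# The real parsing/filtering is done by OpenRouterCopywriter in video_render/llm.py.

MIN_DURATION_S = 60.0   # prompt asks for at least 60s per clip
MAX_DURATION_S = 120.0  # prompt allows up to 120s (llm.py in this patch trims to 90s)

def validate_highlights(raw: str) -> List[Dict[str, Any]]:
    """Parse model output and keep only well-formed, non-overlapping highlights."""
    data = json.loads(raw)
    valid: List[Dict[str, Any]] = []
    previous_end = float("-inf")

    for item in data.get("highlights", []):
        try:
            start = float(item["start"])
            end = float(item["end"])
        except (KeyError, TypeError, ValueError):
            continue  # malformed or missing timestamps

        duration = end - start
        if not (MIN_DURATION_S <= duration <= MAX_DURATION_S):
            continue  # outside allowed clip length
        if start < previous_end:
            continue  # overlaps previous clip or breaks chronological order
        if not str(item.get("summary", "")).strip():
            continue  # summary is required by the prompt

        valid.append({"start": start, "end": end, "summary": item["summary"]})
        previous_end = end

    return valid

if __name__ == "__main__":
    sample = '{"highlights": [{"start": 12.5, "end": 95.0, "summary": "Score: 82/100 | Gancho: revelacao | Gatilho: curiosidade"}]}'
    print(validate_highlights(sample))

Keeping this validation outside the prompt (rather than trusting the model to self-enforce it) is the safer design: the prompt can stay aggressive about quantity while downstream code guarantees duration, ordering, and overlap constraints.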
diff --git a/video_render/config.py b/video_render/config.py index 547d10b..0ca0c1e 100644 --- a/video_render/config.py +++ b/video_render/config.py @@ -13,10 +13,10 @@ TEMP_ROOT = BASE_DIR / "temp" @dataclass(frozen=True) class RabbitMQSettings: - # host: str = os.environ.get("RABBITMQ_HOST", "154.12.229.181") - # port: int = int(os.environ.get("RABBITMQ_PORT", 32790)) - host: str = os.environ.get("RABBITMQ_HOST", "rabbitmq") - port: int = int(os.environ.get("RABBITMQ_PORT", 5672)) + host: str = os.environ.get("RABBITMQ_HOST", "154.12.229.181") + port: int = int(os.environ.get("RABBITMQ_PORT", 32790)) + # host: str = os.environ.get("RABBITMQ_HOST", "rabbitmq") + # port: int = int(os.environ.get("RABBITMQ_PORT", 5672)) user: str = os.environ.get("RABBITMQ_USER", "admin") password: str = os.environ.get("RABBITMQ_PASS") consume_queue: str = os.environ.get("RABBITMQ_QUEUE", "to-render") @@ -62,11 +62,13 @@ class RenderingSettings: subtitle_font_size: int = int(os.environ.get("RENDER_SUBTITLE_FONT_SIZE", 64)) caption_min_words: int = int(os.environ.get("CAPTION_MIN_WORDS", 2)) caption_max_words: int = int(os.environ.get("CAPTION_MAX_WORDS", 2)) - # Smart framing settings + # Smart framing settings - CONTAINMENT TRACKING mode enable_smart_framing: bool = os.environ.get("ENABLE_SMART_FRAMING", "true").lower() in ("true", "1", "yes") - smart_framing_min_confidence: float = float(os.environ.get("SMART_FRAMING_MIN_CONFIDENCE", 0.5)) - smart_framing_smoothing_window: int = int(os.environ.get("SMART_FRAMING_SMOOTHING_WINDOW", 20)) - smart_framing_frame_skip: int = int(os.environ.get("SMART_FRAMING_FRAME_SKIP", 2)) # Process every Nth frame (CPU optimization) + smart_framing_min_confidence: float = float(os.environ.get("SMART_FRAMING_MIN_CONFIDENCE", 0.3)) # Lowered for better cartoon detection + smart_framing_smoothing_window: int = int(os.environ.get("SMART_FRAMING_SMOOTHING_WINDOW", 30)) # Reduced - not needed with containment tracking + smart_framing_frame_skip: int = int(os.environ.get("SMART_FRAMING_FRAME_SKIP", 1)) # Process every frame for smooth 30 FPS tracking + smart_framing_max_velocity: int = int(os.environ.get("SMART_FRAMING_MAX_VELOCITY", 20)) # Moderate - only used during transitions + smart_framing_person_switch_cooldown: int = int(os.environ.get("SMART_FRAMING_PERSON_SWITCH_COOLDOWN", 999999)) # DISABLED - never switch people @dataclass(frozen=True) diff --git a/video_render/context_detection.py b/video_render/context_detection.py index e342b4c..ab5c203 100644 --- a/video_render/context_detection.py +++ b/video_render/context_detection.py @@ -7,7 +7,7 @@ and identify who is speaking in video content using MediaPipe and audio analysis from __future__ import annotations import logging -from dataclasses import dataclass +from dataclasses import dataclass, field from typing import List, Optional, Tuple import cv2 @@ -50,20 +50,22 @@ class FrameContext: active_speakers: List[int] # indices of speaking faces primary_focus: Optional[Tuple[int, int]] # (x, y) center point layout_mode: str # "single", "dual_split", "grid" + selected_people: List[int] = field(default_factory=list) # indices of people selected for display (max 2) class MediaPipeDetector: - """Face and pose detection using MediaPipe.""" + """Face and pose detection using MediaPipe with OpenCV Haar Cascade fallback.""" - def __init__(self, min_detection_confidence: float = 0.5, min_tracking_confidence: float = 0.5): + def __init__(self, min_detection_confidence: float = 0.3, min_tracking_confidence: float = 0.3): 
self.min_detection_confidence = min_detection_confidence self.min_tracking_confidence = min_tracking_confidence self.mp_face_detection = mp.solutions.face_detection self.mp_face_mesh = mp.solutions.face_mesh + # MediaPipe detectors with lower confidence for better cartoon detection self.face_detection = self.mp_face_detection.FaceDetection( min_detection_confidence=min_detection_confidence, - model_selection=1 + model_selection=0 # Changed to 0 for better detection of varied faces (including cartoons) ) self.face_mesh = self.mp_face_mesh.FaceMesh( @@ -73,11 +75,17 @@ class MediaPipeDetector: static_image_mode=False ) - logger.info("MediaPipe detector initialized") + # OpenCV Haar Cascade as fallback for cartoon/anime faces + self.haar_cascade = cv2.CascadeClassifier(cv2.data.haarcascades + 'haarcascade_frontalface_default.xml') + + # Alternative cascade for profile/side faces + self.haar_cascade_profile = cv2.CascadeClassifier(cv2.data.haarcascades + 'haarcascade_profileface.xml') + + logger.info(f"Hybrid detector initialized (MediaPipe confidence={min_detection_confidence}, OpenCV Haar Cascade enabled)") def detect_faces(self, frame: np.ndarray) -> List[FaceDetection]: """ - Detect faces in a frame. + Detect faces in a frame using hybrid approach (MediaPipe + OpenCV Haar Cascade). Args: frame: RGB image array @@ -94,6 +102,7 @@ class MediaPipeDetector: else: frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) + # Try MediaPipe first results = self.face_detection.process(frame_rgb) faces = [] @@ -126,8 +135,111 @@ class MediaPipeDetector: center_y=center_y )) + # Fallback to OpenCV Haar Cascade if MediaPipe found nothing + if not faces: + faces = self._detect_faces_haar_cascade(frame, width, height) + return faces + def _detect_faces_haar_cascade(self, frame: np.ndarray, width: int, height: int) -> List[FaceDetection]: + """ + Detect faces using OpenCV Haar Cascade (works better with cartoons/anime). 
+ + Args: + frame: Image frame (BGR format) + width: Frame width + height: Frame height + + Returns: + List of detected faces + """ + # Convert to grayscale for Haar Cascade + if len(frame.shape) == 3: + gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY) + else: + gray = frame + + # Detect frontal faces with more sensitive parameters + frontal_faces = self.haar_cascade.detectMultiScale( + gray, + scaleFactor=1.05, # More sensitive to size variations + minNeighbors=3, # Lower threshold for detection (more permissive) + minSize=(30, 30), # Smaller minimum size + flags=cv2.CASCADE_SCALE_IMAGE + ) + + # Also try profile faces + profile_faces = self.haar_cascade_profile.detectMultiScale( + gray, + scaleFactor=1.1, + minNeighbors=3, + minSize=(30, 30), + flags=cv2.CASCADE_SCALE_IMAGE + ) + + # Combine frontal and profile detections + all_faces = [] + + for (x, y, w, h) in frontal_faces: + x = max(0, min(x, width - 1)) + y = max(0, min(y, height - 1)) + w = min(w, width - x) + h = min(h, height - y) + + center_x = x + w // 2 + center_y = y + h // 2 + + all_faces.append(FaceDetection( + x=x, + y=y, + width=w, + height=h, + confidence=0.7, # Haar Cascade doesn't provide confidence, use fixed value + center_x=center_x, + center_y=center_y + )) + + for (x, y, w, h) in profile_faces: + # Check if this face overlaps significantly with any frontal face + overlap = False + for existing_face in all_faces: + # Calculate IoU (Intersection over Union) + x1_overlap = max(x, existing_face.x) + y1_overlap = max(y, existing_face.y) + x2_overlap = min(x + w, existing_face.x + existing_face.width) + y2_overlap = min(y + h, existing_face.y + existing_face.height) + + if x1_overlap < x2_overlap and y1_overlap < y2_overlap: + overlap_area = (x2_overlap - x1_overlap) * (y2_overlap - y1_overlap) + face_area = w * h + if overlap_area / face_area > 0.3: # 30% overlap threshold + overlap = True + break + + if not overlap: + x = max(0, min(x, width - 1)) + y = max(0, min(y, height - 1)) + w = min(w, width - x) + h = min(h, height - y) + + center_x = x + w // 2 + center_y = y + h // 2 + + all_faces.append(FaceDetection( + x=x, + y=y, + width=w, + height=h, + confidence=0.6, # Slightly lower confidence for profile + center_x=center_x, + center_y=center_y + )) + + if all_faces: + logger.debug(f"Haar Cascade detected {len(all_faces)} faces (MediaPipe failed)") + + return all_faces + def detect_face_landmarks(self, frame: np.ndarray) -> List[FaceDetection]: """ Detect faces with landmarks for lip sync detection. @@ -203,8 +315,8 @@ class AudioActivityDetector: def detect_speaking_periods( self, audio_samples: np.ndarray, - threshold: float = 0.02, - min_speech_duration: float = 0.1 + threshold: float = 0.01, # Reduced from 0.02 for better speech detection + min_speech_duration: float = 0.05 # Reduced from 0.1 to catch shorter utterances ) -> List[Tuple[float, float]]: """ Detect periods of speech in audio. @@ -250,6 +362,16 @@ class AudioActivityDetector: if end_time - start_time >= min_speech_duration: periods.append((start_time, end_time)) + # Log detected speech periods for debugging + if periods: + total_speech_time = sum(end - start for start, end in periods) + logger.info(f"Audio speech detection: {len(periods)} periods found, " + f"total {total_speech_time:.1f}s of speech (threshold={threshold})") + else: + max_energy = max(energies) if energies else 0 + logger.warning(f"No speech detected! 
Max energy={max_energy:.4f}, threshold={threshold} " + f"(try lowering threshold if speech should be present)") + return periods def is_speaking_at_time(self, speaking_periods: List[Tuple[float, float]], time: float) -> bool: @@ -263,12 +385,29 @@ class AudioActivityDetector: class ContextAnalyzer: """Analyzes video context to determine focus and layout.""" - def __init__(self): + def __init__(self, person_switch_cooldown: int = 30): self.detector = MediaPipeDetector() self.audio_detector = AudioActivityDetector() self.previous_faces: List[FaceDetection] = [] - logger.info("Context analyzer initialized") + # Person tracking state + self.current_selected_people: List[int] = [] # Indices of people currently on screen + self.last_switch_frame: int = -999 # Frame when we last switched people + self.person_switch_cooldown = person_switch_cooldown # Minimum frames before switching + + # Stability tracking to prevent flip-flopping + self.desired_people_history: List[List[int]] = [] # Track recent desired selections + self.stability_threshold = 20 # Frames needed to confirm a switch (increased for more stability) + self.last_switched_people: List[int] = [] # People we just switched FROM + + # Focus stability: track recent focus points for temporal smoothing + self.focus_history: List[Tuple[int, int]] = [] + self.focus_history_size: int = 5 # Keep last 5 focus points for smoothing + + # Debug logging + self.frame_log_interval = 30 # Log every N frames + + logger.info(f"Context analyzer initialized (cooldown={person_switch_cooldown} frames, focus_smoothing={self.focus_history_size})") def analyze_frame( self, @@ -296,33 +435,47 @@ class ContextAnalyzer: # Determine who is speaking active_speakers = [] + has_audio_speech = speaking_periods and self.audio_detector.is_speaking_at_time(speaking_periods, timestamp) + for i, face in enumerate(faces): is_speaking = False - if speaking_periods and self.audio_detector.is_speaking_at_time(speaking_periods, timestamp): + # Check audio-based speech detection + if has_audio_speech: is_speaking = True + # Check lip movement (visual speech detection) if face.landmarks and len(self.previous_faces) > i: is_speaking = is_speaking or self._detect_lip_movement(face, self.previous_faces[i]) if is_speaking: active_speakers.append(i) - num_faces = len(faces) - num_speakers = len(active_speakers) + # Debug: Log speech detection + if frame_number % 30 == 0: # Every second at 30fps + logger.info(f"Speech detection - Frame {frame_number}: audio_active={has_audio_speech}, " + f"speakers={active_speakers}, total_faces={len(faces)}") - if num_faces == 0: - layout_mode = "single" - elif num_faces == 1: - layout_mode = "single" - elif num_faces == 2: - layout_mode = "dual_split" - elif num_faces >= 3: - layout_mode = "dual_split" - else: - layout_mode = "single" + # Select THE person to focus on (always single person) + # Priority: 1) Who is speaking, 2) Who is most centered + selected_people = self._select_person_to_focus( + faces, + active_speakers, + frame_number, + frame.shape[1], # frame width for center calculation + frame.shape[0] # frame height for center calculation + ) - primary_focus = self._calculate_focus_point(faces, active_speakers) + # Always use single-person layout (no split screen) + layout_mode = "single" + + primary_focus = self._calculate_focus_point(faces, selected_people) + + # Debug logging every N frames + if frame_number % self.frame_log_interval == 0: + focus_reason = "speaker" if active_speakers else "no_speech_detected" + logger.info(f"Frame 
{frame_number}: {len(faces)} faces, " + f"{len(active_speakers)} speakers, focus={selected_people}, reason={focus_reason}") self.previous_faces = faces @@ -332,7 +485,8 @@ class ContextAnalyzer: detected_faces=faces, active_speakers=active_speakers, primary_focus=primary_focus, - layout_mode=layout_mode + layout_mode=layout_mode, + selected_people=selected_people ) def _detect_lip_movement(self, current_face: FaceDetection, previous_face: FaceDetection) -> bool: @@ -363,36 +517,309 @@ class ContextAnalyzer: threshold = 2.0 return abs(current_dist - previous_dist) > threshold - def _calculate_focus_point( + def _select_person_to_focus( self, faces: List[FaceDetection], - active_speakers: List[int] - ) -> Optional[Tuple[int, int]]: + active_speakers: List[int], + frame_number: int, + frame_width: int, + frame_height: int + ) -> List[int]: """ - Calculate the primary focus point based on detected faces and speakers. - - IMPORTANT: This focuses on ONE person to avoid focusing on empty space (table). - When multiple people are present, we pick the most relevant person, not average positions. + Select THE single person to focus on. + Priority: 1) Who is speaking, 2) Who is most centered in frame Args: faces: List of detected faces - active_speakers: Indices of faces that are speaking + active_speakers: Indices of people currently speaking + frame_number: Current frame number + frame_width: Frame width for center calculation + frame_height: Frame height for center calculation + + Returns: + List with single person index [idx], or empty list if no faces + """ + if not faces: + self.current_selected_people = [] + return [] + + # If only 1 person, always focus on them + if len(faces) == 1: + self.current_selected_people = [0] + return [0] + + # Check if we can switch people (cooldown period) + frames_since_last_switch = frame_number - self.last_switch_frame + can_switch = frames_since_last_switch >= self.person_switch_cooldown + + # Calculate frame center for distance comparison + frame_center_x = frame_width / 2 + frame_center_y = frame_height / 2 + + # ULTRA-STABLE MODE: Select ONE person at start, NEVER switch + # This completely eliminates switching-related instability + desired_person_idx = None + + # If we already have someone selected, ALWAYS KEEP THEM (never switch) + if self.current_selected_people and len(self.current_selected_people) > 0: + current_idx = self.current_selected_people[0] + if current_idx < len(faces): + # Current person still detected - keep them + desired_person_idx = current_idx + else: + # Current person lost - try to find them again by position/size similarity + # This handles temporary detection failures + current_person_found = False + if self.previous_faces and current_idx < len(self.previous_faces): + prev_face = self.previous_faces[current_idx] + # Find most similar face by position and size + best_match_idx = None + best_match_score = float('inf') + for idx, face in enumerate(faces): + # Distance between centers + dx = face.center_x - prev_face.center_x + dy = face.center_y - prev_face.center_y + dist = np.sqrt(dx**2 + dy**2) + # Size similarity + size_diff = abs(face.width - prev_face.width) + abs(face.height - prev_face.height) + score = dist + size_diff * 0.5 + if score < best_match_score: + best_match_score = score + best_match_idx = idx + + if best_match_idx is not None and best_match_score < 1000: + desired_person_idx = best_match_idx + current_person_found = True + + if not current_person_found: + # Really lost - select most confident + face_confidences = 
[(idx, face.confidence) for idx, face in enumerate(faces)] + face_confidences.sort(key=lambda x: x[1], reverse=True) + desired_person_idx = face_confidences[0][0] + logger.warning(f"Current person permanently lost - selecting new: {desired_person_idx}") + else: + # First frame - select most confident person ONCE + face_confidences = [(idx, face.confidence) for idx, face in enumerate(faces)] + face_confidences.sort(key=lambda x: x[1], reverse=True) + desired_person_idx = face_confidences[0][0] + logger.info(f"INITIAL SELECTION - Person {desired_person_idx} (will be tracked throughout entire video)") + + # IGNORE SPEECH DETECTION - it was causing instability + # We now track ONE person from start to finish, regardless of who speaks + + # OLD LOGIC (commented out - was causing issues): + # This logic would switch based on "who is more centered" which caused constant switching + if False: # Disabled + # Calculate distance from center for each face + center_distances = [] + for idx, face in enumerate(faces): + # Euclidean distance from frame center + dx = face.center_x - frame_center_x + dy = face.center_y - frame_center_y + distance = np.sqrt(dx**2 + dy**2) + center_distances.append((idx, distance, face.confidence)) + + # Sort by distance (closest first), then by confidence as tiebreaker + center_distances.sort(key=lambda x: (x[1], -x[2])) + most_centered_idx = center_distances[0][0] + most_centered_distance = center_distances[0][1] + + # STICKY BEHAVIOR: If we already have someone selected, only switch if: + # - New person is SIGNIFICANTLY more centered (30% closer to center) + # - OR current person is now very far from center (>40% of frame width) + if self.current_selected_people and len(self.current_selected_people) > 0: + current_idx = self.current_selected_people[0] + if current_idx < len(faces): + current_face = faces[current_idx] + current_dx = current_face.center_x - frame_center_x + current_dy = current_face.center_y - frame_center_y + current_distance = np.sqrt(current_dx**2 + current_dy**2) + + # Define "significantly better" threshold + max_acceptable_distance = frame_width * 0.4 # 40% of frame width + improvement_threshold = 0.7 # New person must be 30% closer (0.7 ratio) + + # Keep current person if they're still reasonably centered + if current_distance < max_acceptable_distance: + # Current person is still acceptable - only switch if new is MUCH better + if most_centered_distance < current_distance * improvement_threshold: + desired_person_idx = most_centered_idx + logger.debug(f"Switching: new person MUCH more centered ({most_centered_distance:.0f} vs {current_distance:.0f})") + else: + desired_person_idx = current_idx # Keep current + logger.debug(f"Keeping current person: still reasonably centered ({current_distance:.0f} px from center)") + else: + # Current person is too far from center - switch + desired_person_idx = most_centered_idx + logger.debug(f"Current person too far from center ({current_distance:.0f} px), switching") + else: + # Current selection invalid + desired_person_idx = most_centered_idx + else: + # First time - select most centered + desired_person_idx = most_centered_idx + + # Wrap in list for compatibility with existing code + desired_people = [desired_person_idx] if desired_person_idx is not None else [] + + # ULTRA-STABLE MODE: NO SWITCHING LOGIC AT ALL + # Simply set the person and never change + if not self.current_selected_people: + # First time only + self.current_selected_people = desired_people + self.last_switch_frame = frame_number + 
logger.info(f"Frame {frame_number}: LOCKED ON person {desired_people} - will never switch") + else: + # Already have someone - just update to desired (which is same person due to logic above) + self.current_selected_people = desired_people + + return self.current_selected_people.copy() + + def _ensure_distinct_people( + self, + faces: List[FaceDetection], + people_indices: List[int] + ) -> List[int]: + """ + Ensure selected people are distinct by checking minimum distance between them. + Prevents showing the same person twice due to duplicate detection. + + Args: + faces: List of detected faces + people_indices: Indices of people to validate + + Returns: + List of distinct people indices (max 2) + """ + if len(people_indices) <= 1: + return people_indices + + distinct_people = [] + + for idx in people_indices: + if idx >= len(faces): + continue + + current_face = faces[idx] + is_distinct = True + + # Check if this person is too close to any already selected person + for selected_idx in distinct_people: + selected_face = faces[selected_idx] + + # Calculate distance between face centers + dx = current_face.center_x - selected_face.center_x + dy = current_face.center_y - selected_face.center_y + distance = np.sqrt(dx**2 + dy**2) + + # Also check overlap via IoU (Intersection over Union) + x1_overlap = max(current_face.x, selected_face.x) + y1_overlap = max(current_face.y, selected_face.y) + x2_overlap = min(current_face.x + current_face.width, selected_face.x + selected_face.width) + y2_overlap = min(current_face.y + current_face.height, selected_face.y + selected_face.height) + + overlap_area = 0 + if x1_overlap < x2_overlap and y1_overlap < y2_overlap: + overlap_area = (x2_overlap - x1_overlap) * (y2_overlap - y1_overlap) + + # Calculate areas + area1 = current_face.width * current_face.height + area2 = selected_face.width * selected_face.height + min_area = min(area1, area2) + + # If faces are very close OR significantly overlapping, they're likely the same person + # Minimum distance: 1/4 of average face width + min_distance = (current_face.width + selected_face.width) / 8 + overlap_threshold = 0.3 # 30% overlap + + if distance < min_distance or (min_area > 0 and overlap_area / min_area > overlap_threshold): + is_distinct = False + logger.debug(f"Person {idx} too similar to person {selected_idx} (dist={distance:.1f}, overlap={overlap_area/min_area if min_area > 0 else 0:.2%})") + break + + if is_distinct: + distinct_people.append(idx) + + # Stop at 2 distinct people + if len(distinct_people) >= 2: + break + + # If we couldn't find 2 distinct people, return at most 1 + if len(distinct_people) < 2 and len(people_indices) >= 2: + logger.debug(f"Only {len(distinct_people)} distinct person(s) found from {len(people_indices)} detections") + + return distinct_people + + def _calculate_focus_point( + self, + faces: List[FaceDetection], + selected_people: List[int] + ) -> Optional[Tuple[int, int]]: + """ + Calculate the primary focus point based on selected people with temporal smoothing. 
+ + Args: + faces: List of detected faces + selected_people: Indices of people selected for display Returns: (x, y) tuple of focus center, or None if no faces """ - if not faces: + if not faces or not selected_people: return None - if active_speakers: - speaker_faces = [faces[i] for i in active_speakers if i < len(faces)] - if speaker_faces: - primary_speaker = max(speaker_faces, key=lambda f: f.confidence) - return (primary_speaker.center_x, primary_speaker.center_y) + # Calculate raw focus point + raw_focus_x = 0 + raw_focus_y = 0 - most_confident = max(faces, key=lambda f: f.confidence) - return (most_confident.center_x, most_confident.center_y) + if len(selected_people) == 1: + # Single person - focus on them + if selected_people[0] < len(faces): + primary = faces[selected_people[0]] + raw_focus_x = primary.center_x + raw_focus_y = primary.center_y + else: + # Fallback + most_confident = max(faces, key=lambda f: f.confidence) + raw_focus_x = most_confident.center_x + raw_focus_y = most_confident.center_y + else: + # Multiple people - focus on the CENTER between them for stability + # This prevents jarring movements when switching focus between people + valid_people = [idx for idx in selected_people if idx < len(faces)] + if valid_people: + centers_x = [faces[idx].center_x for idx in valid_people] + centers_y = [faces[idx].center_y for idx in valid_people] + raw_focus_x = int(np.mean(centers_x)) + raw_focus_y = int(np.mean(centers_y)) + else: + # Fallback + most_confident = max(faces, key=lambda f: f.confidence) + raw_focus_x = most_confident.center_x + raw_focus_y = most_confident.center_y + + # Apply temporal smoothing using focus history + self.focus_history.append((raw_focus_x, raw_focus_y)) + if len(self.focus_history) > self.focus_history_size: + self.focus_history.pop(0) + + # Calculate smoothed focus as weighted average (more weight to recent frames) + if len(self.focus_history) > 1: + # Exponential weights: recent frames have more influence + weights = [2 ** i for i in range(len(self.focus_history))] + total_weight = sum(weights) + + smoothed_x = sum(x * w for (x, y), w in zip(self.focus_history, weights)) / total_weight + smoothed_y = sum(y * w for (x, y), w in zip(self.focus_history, weights)) / total_weight + + return (int(smoothed_x), int(smoothed_y)) + else: + return (raw_focus_x, raw_focus_y) def close(self): """Release resources.""" self.detector.close() + # Clear tracking state to free memory + self.previous_faces.clear() + self.current_selected_people.clear() + self.focus_history.clear() diff --git a/video_render/llm.py b/video_render/llm.py index 1f2d798..76be59d 100644 --- a/video_render/llm.py +++ b/video_render/llm.py @@ -141,8 +141,8 @@ class OpenRouterCopywriter: logger.warning(f"Highlight ignorado: muito curto ({duration}s, minimo 45s)") continue - if duration > 120: - logger.warning(f"Highlight ignorado: muito longo ({duration}s, maximo 120s)") + if duration > 90: + logger.warning(f"Highlight ignorado: muito longo ({duration}s, maximo 90s)") continue if not summary: diff --git a/video_render/media.py b/video_render/media.py index d99a71d..532de85 100644 --- a/video_render/media.py +++ b/video_render/media.py @@ -50,7 +50,10 @@ class MediaPreparer: existing_children = list(workspace_dir.iterdir()) if existing_children: logger.info("Limpando workspace existente para %s", sanitized_name) - remove_paths(existing_children) + try: + remove_paths(existing_children) + except Exception as e: + logger.warning(f"Não foi possível limpar workspace (não crítico): {e}") if 
temp_transcription_json and temp_transcription_json.exists(): shutil.move(str(temp_transcription_json), str(transcription_json)) @@ -66,7 +69,10 @@ class MediaPreparer: output_dir = ensure_workspace(self.settings.outputs_dir, sanitized_name) existing_outputs = list(output_dir.iterdir()) if existing_outputs: - remove_paths(existing_outputs) + try: + remove_paths(existing_outputs) + except Exception as e: + logger.warning(f"Não foi possível limpar outputs antigos (não crítico): {e}") audio_path = workspace_dir / "audio.wav" extract_audio_to_wav(working_video_path, audio_path) diff --git a/video_render/pipeline.py b/video_render/pipeline.py index 0357788..39557d8 100644 --- a/video_render/pipeline.py +++ b/video_render/pipeline.py @@ -107,6 +107,9 @@ class VideoPipeline: TranscriptionService.persist(transcription, context.workspace.workspace_dir) context.transcription = transcription + # Unload Whisper model immediately after transcription to free memory (1-3GB) + self.transcriber.unload_model() + def _determine_highlights(self, context: PipelineContext) -> None: if not context.transcription: raise RuntimeError("Transcricao nao disponivel") diff --git a/video_render/rendering.py b/video_render/rendering.py index ae69813..b2ce7f5 100644 --- a/video_render/rendering.py +++ b/video_render/rendering.py @@ -345,7 +345,9 @@ class VideoRenderer: target_width=settings.rendering.frame_width, target_height=settings.rendering.frame_height, frame_skip=settings.rendering.smart_framing_frame_skip, - smoothing_window=settings.rendering.smart_framing_smoothing_window + smoothing_window=settings.rendering.smart_framing_smoothing_window, + max_velocity=settings.rendering.smart_framing_max_velocity, + person_switch_cooldown=settings.rendering.smart_framing_person_switch_cooldown ) def render( @@ -436,12 +438,10 @@ class VideoRenderer: audio_samples=audio_samples ) - # Apply smart framing based on detected layout - use_split_screen = framing_plan.layout_mode in ["dual_split", "grid"] + # Apply smart framing (always single-person focus) video_clip = self.smart_framer.apply_framing( video_clip=subclip, - framing_plan=framing_plan, - use_split_screen=use_split_screen + framing_plan=framing_plan ) logger.info(f"Smart framing applied: layout={framing_plan.layout_mode}, " @@ -602,6 +602,10 @@ class VideoRenderer: if audio_clip is not None and audio_needs_close: audio_clip.close() + # Force garbage collection to free memory after rendering + import gc + gc.collect() + return str(output_path) def _materialize_audio( diff --git a/video_render/smart_framing.py b/video_render/smart_framing.py index 76087ba..8b5f52a 100644 --- a/video_render/smart_framing.py +++ b/video_render/smart_framing.py @@ -46,21 +46,20 @@ class SmartFramer: self, target_width: int = 1080, target_height: int = 1920, - frame_skip: int = 2, - smoothing_window: int = 15 + frame_skip: int = 1, + smoothing_window: int = 30, + max_velocity: int = 20, + person_switch_cooldown: int = 999999 ): self.target_width = target_width self.target_height = target_height self.target_aspect = target_height / target_width - - # Performance parameters - self.frame_skip = frame_skip # Process every Nth frame (CPU optimization) - - # Smoothing parameters + self.frame_skip = frame_skip self.smoothing_window = smoothing_window - self.max_velocity = 30 # pixels per frame (reduced for smoother transitions) + self.max_velocity = max_velocity + self.person_switch_cooldown = person_switch_cooldown - logger.info(f"Smart framer initialized (target: {target_width}x{target_height}, 
frame_skip={frame_skip})") + logger.info(f"Smart framer initialized (target: {target_width}x{target_height}, frame_skip={frame_skip}, smoothing={smoothing_window}, velocity={max_velocity}, cooldown={person_switch_cooldown})") def create_framing_plan( self, @@ -81,25 +80,21 @@ class SmartFramer: Returns: FramingPlan with all frame contexts and crop regions """ - analyzer = ContextAnalyzer() + analyzer = ContextAnalyzer(person_switch_cooldown=self.person_switch_cooldown) - # Detect speaking periods from audio if available speaking_periods = None if audio_samples is not None: speaking_periods = analyzer.audio_detector.detect_speaking_periods(audio_samples) - # Open video with error suppression for AV1 codec warnings import os os.environ['OPENCV_FFMPEG_CAPTURE_OPTIONS'] = 'loglevel;quiet' cap = cv2.VideoCapture(video_path) fps = cap.get(cv2.CAP_PROP_FPS) - # Calculate frame range start_frame = int(start_time * fps) end_frame = int(end_time * fps) - # Set to start frame cap.set(cv2.CAP_PROP_POS_FRAMES, start_frame) frame_contexts = [] @@ -113,7 +108,6 @@ class SmartFramer: if not ret: break - # Only process every Nth frame for performance (CPU optimization) if processed_count % self.frame_skip == 0: timestamp = frame_number / fps context = analyzer.analyze_frame(frame, timestamp, frame_number, speaking_periods) @@ -122,35 +116,36 @@ class SmartFramer: frame_number += 1 processed_count += 1 - # Get video dimensions before releasing capture source_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)) source_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)) cap.release() analyzer.close() - # Determine overall layout mode (most common) layout_modes = [ctx.layout_mode for ctx in frame_contexts] if layout_modes: overall_layout = max(set(layout_modes), key=layout_modes.count) else: overall_layout = "single" - # Calculate crop regions based on contexts - crop_regions = self._calculate_crop_regions( frame_contexts, source_width, source_height ) - return FramingPlan( + framing_plan = FramingPlan( frame_contexts=frame_contexts, crop_regions=crop_regions, layout_mode=overall_layout, fps=fps ) + import gc + gc.collect() + + return framing_plan + def _calculate_crop_regions( self, contexts: List[FrameContext], @@ -171,66 +166,122 @@ class SmartFramer: if not contexts: return [] - # Calculate ideal crop dimensions maintaining EXACT 9:16 aspect ratio source_aspect = source_width / source_height if source_aspect > self.target_aspect: - # Source is wider - crop horizontally (use full height) crop_height = source_height crop_width = int(crop_height / self.target_aspect) - # Ensure crop width fits within source if crop_width > source_width: crop_width = source_width crop_height = int(crop_width * self.target_aspect) else: - # Source is taller - crop vertically (use full width) crop_width = source_width crop_height = int(crop_width * self.target_aspect) - # Ensure crop height fits within source if crop_height > source_height: crop_height = source_height crop_width = int(crop_height / self.target_aspect) - # Calculate center points for each frame - # Since we now always focus on ONE person directly (not averaging), - # we can use the focus point directly without complex validation - center_xs = [] - center_ys = [] + safe_zone_margin_x = crop_width * 0.40 + safe_zone_margin_y = crop_height * 0.40 - for ctx in contexts: - if ctx.primary_focus: - # Primary focus is now always a single person's center, never averaged - # This means it will never be on the table/empty space - center_xs.append(ctx.primary_focus[0]) - 
center_ys.append(ctx.primary_focus[1]) + dead_zone_threshold = 100 + + if contexts and contexts[0].primary_focus: + current_crop_center_x = contexts[0].primary_focus[0] + current_crop_center_y = contexts[0].primary_focus[1] + else: + current_crop_center_x = source_width // 2 + current_crop_center_y = source_height // 2 + + center_xs = [current_crop_center_x] + center_ys = [current_crop_center_y] + + for ctx in contexts[1:]: + if ctx.primary_focus and ctx.selected_people and len(ctx.detected_faces) > 0: + primary_person_idx = ctx.selected_people[0] if ctx.selected_people else 0 + if primary_person_idx < len(ctx.detected_faces): + face = ctx.detected_faces[primary_person_idx] + + face_left = face.x + face_right = face.x + face.width + face_top = face.y + face_bottom = face.y + face.height + + crop_left = current_crop_center_x - crop_width // 2 + crop_right = current_crop_center_x + crop_width // 2 + crop_top = current_crop_center_y - crop_height // 2 + crop_bottom = current_crop_center_y + crop_height // 2 + + face_rel_left = face_left - crop_left + face_rel_right = face_right - crop_left + face_rel_top = face_top - crop_top + face_rel_bottom = face_bottom - crop_top + + face_left_safe = face_rel_left >= safe_zone_margin_x + face_right_safe = face_rel_right <= (crop_width - safe_zone_margin_x) + face_top_safe = face_rel_top >= safe_zone_margin_y + face_bottom_safe = face_rel_bottom <= (crop_height - safe_zone_margin_y) + + face_fully_visible = face_left_safe and face_right_safe and face_top_safe and face_bottom_safe + + if face_fully_visible: + center_xs.append(current_crop_center_x) + center_ys.append(current_crop_center_y) + else: + shift_x = 0 + shift_y = 0 + + if not face_left_safe: + shift_x = face_rel_left - safe_zone_margin_x + elif not face_right_safe: + shift_x = face_rel_right - (crop_width - safe_zone_margin_x) + + if not face_top_safe: + shift_y = face_rel_top - safe_zone_margin_y + elif not face_bottom_safe: + shift_y = face_rel_bottom - (crop_height - safe_zone_margin_y) + + if abs(shift_x) > dead_zone_threshold: + current_crop_center_x += shift_x + if abs(shift_y) > dead_zone_threshold: + current_crop_center_y += shift_y + + center_xs.append(current_crop_center_x) + center_ys.append(current_crop_center_y) + else: + center_xs.append(current_crop_center_x) + center_ys.append(current_crop_center_y) else: - # Default to center only if no faces detected at all - center_xs.append(source_width // 2) - center_ys.append(source_height // 2) + center_xs.append(current_crop_center_x) + center_ys.append(current_crop_center_y) - # Smooth the center points - if len(center_xs) > self.smoothing_window: - kernel_size = min(self.smoothing_window, len(center_xs)) - if kernel_size % 2 == 0: - kernel_size -= 1 + if len(center_xs) > 1: + alpha = 0.002 + smoothed_xs = [center_xs[0]] + smoothed_ys = [center_ys[0]] + for i in range(1, len(center_xs)): + if center_xs[i] != center_xs[i-1] or center_ys[i] != center_ys[i-1]: + smoothed_xs.append(alpha * center_xs[i] + (1 - alpha) * smoothed_xs[i-1]) + smoothed_ys.append(alpha * center_ys[i] + (1 - alpha) * smoothed_ys[i-1]) + else: + smoothed_xs.append(smoothed_xs[i-1]) + smoothed_ys.append(smoothed_ys[i-1]) + center_xs = smoothed_xs + center_ys = smoothed_ys - center_xs = signal.medfilt(center_xs, kernel_size=kernel_size).tolist() - center_ys = signal.medfilt(center_ys, kernel_size=kernel_size).tolist() + center_xs = self._limit_velocity(center_xs, 2) + center_ys = self._limit_velocity(center_ys, 2) - # Limit velocity (prevent jarring movements) - 
center_xs = self._limit_velocity(center_xs, self.max_velocity) - center_ys = self._limit_velocity(center_ys, self.max_velocity) + center_xs = self._apply_dead_zone(center_xs, 5) + center_ys = self._apply_dead_zone(center_ys, 5) - # Convert to crop regions crop_regions = [] for center_x, center_y in zip(center_xs, center_ys): - # Calculate top-left corner x = int(center_x - crop_width // 2) y = int(center_y - crop_height // 2) - # Clamp to valid bounds x = max(0, min(x, source_width - crop_width)) y = max(0, min(y, source_height - crop_height)) @@ -241,8 +292,37 @@ class SmartFramer: height=crop_height )) + center_xs.clear() + center_ys.clear() + return crop_regions + def _apply_dead_zone(self, positions: List[float], threshold: float) -> List[float]: + """ + Apply dead zone to eliminate micro-movements. + If change is smaller than threshold, keep previous position. + + Args: + positions: List of positions + threshold: Minimum change needed to move (pixels) + + Returns: + Positions with dead zone applied + """ + if len(positions) <= 1: + return positions + + filtered = [positions[0]] + + for i in range(1, len(positions)): + delta = abs(positions[i] - filtered[i - 1]) + if delta < threshold: + filtered.append(filtered[i - 1]) + else: + filtered.append(positions[i]) + + return filtered + def _limit_velocity(self, positions: List[float], max_velocity: float) -> List[float]: """ Limit the velocity of position changes. @@ -271,33 +351,20 @@ class SmartFramer: def apply_framing( self, video_clip: VideoFileClip, - framing_plan: FramingPlan, - use_split_screen: bool = False + framing_plan: FramingPlan ) -> VideoClip: """ Apply smart framing to a video clip. + Always uses single-person focus (no split screen). Args: video_clip: Source video clip framing_plan: Framing plan to apply - use_split_screen: Whether to use split screen for multiple people Returns: Reframed video clip """ - # Handle different layout modes - if framing_plan.layout_mode in ["single", "single_speaker"]: - # Single person or single speaker - use focused single framing - return self._apply_single_framing(video_clip, framing_plan) - elif framing_plan.layout_mode == "dual_split" and use_split_screen: - # Two people in conversation - use split screen - return self._apply_split_screen(video_clip, framing_plan) - elif framing_plan.layout_mode == "grid" and use_split_screen: - # 3+ people - use grid layout - return self._apply_grid_layout(video_clip, framing_plan) - else: - # Fallback to single framing - return self._apply_single_framing(video_clip, framing_plan) + return self._apply_single_framing(video_clip, framing_plan) def _apply_single_framing( self, @@ -315,12 +382,9 @@ class SmartFramer: Reframed video clip """ def make_frame(t): - # Get the original frame frame = video_clip.get_frame(t) - # Ensure we have valid crop regions if not framing_plan.crop_regions: - # Fallback: return center crop h, w = frame.shape[:2] crop_h = int(w * self.target_aspect) crop_w = w @@ -331,41 +395,32 @@ class SmartFramer: x = (w - crop_w) // 2 cropped = frame[y:y + crop_h, x:x + crop_w] else: - # Calculate exact frame index with decimal precision for interpolation exact_frame_idx = (t * framing_plan.fps) / self.frame_skip - # Get the two adjacent analyzed frames idx_floor = int(exact_frame_idx) idx_ceil = idx_floor + 1 - # Interpolation factor (0.0 to 1.0) alpha = exact_frame_idx - idx_floor - # Clamp indices to valid range idx_floor = max(0, min(idx_floor, len(framing_plan.crop_regions) - 1)) idx_ceil = max(0, min(idx_ceil, 
len(framing_plan.crop_regions) - 1)) - # Get crop regions crop1 = framing_plan.crop_regions[idx_floor] crop2 = framing_plan.crop_regions[idx_ceil] - # Linear interpolation between crop regions x = int(crop1.x * (1 - alpha) + crop2.x * alpha) y = int(crop1.y * (1 - alpha) + crop2.y * alpha) width = int(crop1.width * (1 - alpha) + crop2.width * alpha) height = int(crop1.height * (1 - alpha) + crop2.height * alpha) - # Ensure crop stays within frame bounds h, w = frame.shape[:2] x = max(0, min(x, w - width)) y = max(0, min(y, h - height)) width = min(width, w - x) height = min(height, h - y) - # Crop the frame cropped = frame[y:y + height, x:x + width] - # Resize to target dimensions resized = cv2.resize( cropped, (self.target_width, self.target_height), @@ -374,7 +429,6 @@ class SmartFramer: return resized - # MoviePy 2.x compatible way to create VideoClip new_clip = VideoClip(duration=video_clip.duration) new_clip.size = (self.target_width, self.target_height) new_clip.frame_function = make_frame @@ -397,13 +451,10 @@ class SmartFramer: """ def make_frame(t): frame = video_clip.get_frame(t) - # Calculate exact frame index with decimal precision for smooth interpolation exact_frame_idx = (t * framing_plan.fps) / self.frame_skip frame_idx = int(exact_frame_idx) - # Ensure we have valid contexts if not framing_plan.frame_contexts: - # Fallback to simple center crop h, w = frame.shape[:2] crop_h = int(w * self.target_aspect) crop_w = w @@ -415,107 +466,81 @@ class SmartFramer: cropped = frame[y:y + crop_h, x:x + crop_w] return cv2.resize(cropped, (self.target_width, self.target_height), interpolation=cv2.INTER_LINEAR) - # Clamp index to valid range frame_idx = max(0, min(frame_idx, len(framing_plan.frame_contexts) - 1)) context = framing_plan.frame_contexts[frame_idx] - # Create output frame output = np.zeros((self.target_height, self.target_width, 3), dtype=np.uint8) - if len(context.detected_faces) >= 2: - # Split vertically 50/50 (two columns) - half_width = self.target_width // 2 + if context.selected_people and len(context.selected_people) >= 2: + selected_faces = [context.detected_faces[i] for i in context.selected_people[:2] + if i < len(context.detected_faces)] - # Select the 2 most relevant faces - # Priority: ALWAYS show active speaker first + most confident other person - if context.active_speakers and len(context.active_speakers) >= 1: - # Get the PRIMARY speaker (most confident among active speakers) - speaker_faces = [context.detected_faces[i] for i in context.active_speakers - if i < len(context.detected_faces)] + if len(selected_faces) >= 2: + faces = sorted(selected_faces, key=lambda f: f.center_x) + left_face = faces[0] + right_face = faces[1] - primary_speaker = max(speaker_faces, key=lambda f: f.confidence) + for idx, face in enumerate([left_face, right_face]): - # Get OTHER faces (not the primary speaker) - other_faces = [f for f in context.detected_faces if f != primary_speaker] + half_width = self.target_width // 2 + half_aspect = self.target_height / half_width # Aspect ratio for half - if len(speaker_faces) >= 2: - # Multiple speakers: show primary + second most confident speaker - other_speakers = [f for f in speaker_faces if f != primary_speaker] - secondary_person = max(other_speakers, key=lambda f: f.confidence) - elif other_faces: - # One speaker: show speaker + most confident other person - secondary_person = max(other_faces, key=lambda f: f.confidence) - else: - # Fallback: only one person detected - secondary_person = primary_speaker + face_width = 
max(face.width, frame.shape[1] // 4)  # At least 1/4 of frame width
+                    crop_width = int(face_width * 2.5)  # Add padding around face
+                    crop_height = int(crop_width * half_aspect)  # Maintain correct aspect
-                selected_faces = [primary_speaker, secondary_person]
+                    max_crop_width = frame.shape[1] // 2  # Half the source width
+                    max_crop_height = frame.shape[0]  # Full source height
+
+                    if crop_width > max_crop_width:
+                        crop_width = max_crop_width
+                        crop_height = int(crop_width * half_aspect)
+
+                    if crop_height > max_crop_height:
+                        crop_height = max_crop_height
+                        crop_width = int(crop_height / half_aspect)
+
+                    x = max(0, face.center_x - crop_width // 2)
+                    y = max(0, face.center_y - crop_height // 2)
+
+                    x = min(x, frame.shape[1] - crop_width)
+                    y = min(y, frame.shape[0] - crop_height)
+
+                    cropped = frame[y:y + crop_height, x:x + crop_width]
+                    resized = cv2.resize(
+                        cropped,
+                        (half_width, self.target_height),
+                        interpolation=cv2.INTER_LINEAR
+                    )
+
+                    x_offset = idx * half_width
+                    output[:, x_offset:x_offset + half_width] = resized
             else:
-                # No speakers: take 2 most confident faces
-                selected_faces = sorted(context.detected_faces, key=lambda f: f.confidence, reverse=True)[:2]
-
-            # Sort selected faces by horizontal position for consistent left/right placement
-            faces = sorted(selected_faces, key=lambda f: f.center_x)
-            left_face = faces[0]
-            right_face = faces[1]
-
-            # Process each person's frame
-            for idx, face in enumerate([left_face, right_face]):
-                # Calculate crop region focused on this person
-                # Each person gets half the width, full target aspect ratio (9:16)
-                # This ensures NO distortion when resizing
-
-                # For split screen: each side is half_width x full_height
-                # We need to maintain 9:16 aspect for each half
-                half_width = self.target_width // 2
-                half_aspect = self.target_height / half_width  # Aspect ratio for half
-
-                # Determine crop size based on face with padding
-                face_width = max(face.width, frame.shape[1] // 4)  # At least 1/4 of frame width
-                crop_width = int(face_width * 2.5)  # Add padding around face
-                crop_height = int(crop_width * half_aspect)  # Maintain correct aspect
-
-                # Ensure crop fits in frame, maintaining aspect ratio
-                max_crop_width = frame.shape[1] // 2  # Half the source width
-                max_crop_height = frame.shape[0]  # Full source height
-
-                # If crop is too wide, scale down proportionally
-                if crop_width > max_crop_width:
-                    crop_width = max_crop_width
-                    crop_height = int(crop_width * half_aspect)
-
-                # If crop is too tall, scale down proportionally
-                if crop_height > max_crop_height:
-                    crop_height = max_crop_height
-                    crop_width = int(crop_height / half_aspect)
-
-                # Center crop on face
-                x = max(0, face.center_x - crop_width // 2)
-                y = max(0, face.center_y - crop_height // 2)
-
-                # Clamp to frame boundaries
-                x = min(x, frame.shape[1] - crop_width)
-                y = min(y, frame.shape[0] - crop_height)
-
-                # Extract and resize crop
-                cropped = frame[y:y + crop_height, x:x + crop_width]
-                resized = cv2.resize(
+                if framing_plan.crop_regions:
+                    crop_idx = max(0, min(frame_idx, len(framing_plan.crop_regions) - 1))
+                    crop = framing_plan.crop_regions[crop_idx]
+                    cropped = frame[crop.y:crop.y + crop.height, crop.x:crop.x + crop.width]
+                else:
+                    h, w = frame.shape[:2]
+                    crop_h = int(w * self.target_aspect)
+                    crop_w = w
+                    if crop_h > h:
+                        crop_h = h
+                        crop_w = int(h / self.target_aspect)
+                    y = (h - crop_h) // 2
+                    x = (w - crop_w) // 2
+                    cropped = frame[y:y + crop_h, x:x + crop_w]
+                output = cv2.resize(
                     cropped,
-                    (half_width, self.target_height),
+                    (self.target_width, self.target_height),
                     interpolation=cv2.INTER_LINEAR
                 )
-
-                # Place in output at appropriate horizontal position
-                x_offset = idx * half_width
-                output[:, x_offset:x_offset + half_width] = resized
         else:
-            # Fall back to single framing
             if framing_plan.crop_regions:
                 crop_idx = max(0, min(frame_idx, len(framing_plan.crop_regions) - 1))
                 crop = framing_plan.crop_regions[crop_idx]
                 cropped = frame[crop.y:crop.y + crop.height, crop.x:crop.x + crop.width]
             else:
-                # Fallback to center crop if no crop regions available
                 h, w = frame.shape[:2]
                 crop_h = int(w * self.target_aspect)
                 crop_w = w
@@ -533,7 +558,6 @@ class SmartFramer:

             return output

-        # MoviePy 2.x compatible way to create VideoClip
         new_clip = VideoClip(duration=video_clip.duration)
         new_clip.size = (self.target_width, self.target_height)
         new_clip.frame_function = make_frame
@@ -556,13 +580,10 @@ class SmartFramer:
         """
         def make_frame(t):
             frame = video_clip.get_frame(t)
-            # Calculate exact frame index with decimal precision for smooth interpolation
             exact_frame_idx = (t * framing_plan.fps) / self.frame_skip
             frame_idx = int(exact_frame_idx)

-            # Ensure we have valid contexts
             if not framing_plan.frame_contexts:
-                # Fallback to simple center crop
                 h, w = frame.shape[:2]
                 crop_h = int(w * self.target_aspect)
                 crop_w = w
@@ -574,7 +595,6 @@ class SmartFramer:
                 cropped = frame[y:y + crop_h, x:x + crop_w]
                 return cv2.resize(cropped, (self.target_width, self.target_height), interpolation=cv2.INTER_LINEAR)

-            # Clamp index to valid range
             frame_idx = max(0, min(frame_idx, len(framing_plan.frame_contexts) - 1))
             context = framing_plan.frame_contexts[frame_idx]
@@ -583,23 +603,18 @@ class SmartFramer:
             num_faces = len(context.detected_faces)

             if num_faces >= 3:
-                # Create 2x2 grid
                 cell_width = self.target_width // 2
                 cell_height = self.target_height // 2

                 for idx, face in enumerate(context.detected_faces[:4]):
-                    # Calculate grid position
                     row = idx // 2
                     col = idx % 2

-                    # Each grid cell maintains aspect ratio (square in this case: cell_width = cell_height)
                     cell_aspect = cell_height / cell_width

-                    # Crop around face with correct aspect ratio
                     crop_width = frame.shape[1] // 2
                     crop_height = int(crop_width * cell_aspect)

-                    # Ensure crop fits in frame, maintaining aspect
                     max_crop_width = frame.shape[1] // 2
                     max_crop_height = frame.shape[0] // 2
@@ -611,11 +626,9 @@ class SmartFramer:
                         crop_height = max_crop_height
                         crop_width = int(crop_height / cell_aspect)

-                    # Center crop on face
                     x = max(0, face.center_x - crop_width // 2)
                     y = max(0, face.center_y - crop_height // 2)

-                    # Clamp to frame boundaries
                     x = min(x, frame.shape[1] - crop_width)
                     y = min(y, frame.shape[0] - crop_height)
@@ -626,18 +639,15 @@ class SmartFramer:
                         interpolation=cv2.INTER_LINEAR
                     )

-                    # Place in grid
                     y_offset = row * cell_height
                     x_offset = col * cell_width
                     output[y_offset:y_offset + cell_height, x_offset:x_offset + cell_width] = resized
             else:
-                # Fall back to single framing
                 if framing_plan.crop_regions:
                     crop_idx = max(0, min(frame_idx, len(framing_plan.crop_regions) - 1))
                     crop = framing_plan.crop_regions[crop_idx]
                     cropped = frame[crop.y:crop.y + crop.height, crop.x:crop.x + crop.width]
                 else:
-                    # Fallback to center crop if no crop regions available
                     h, w = frame.shape[:2]
                     crop_h = int(w * self.target_aspect)
                     crop_w = w
@@ -655,7 +665,6 @@ class SmartFramer:

             return output

-        # MoviePy 2.x compatible way to create VideoClip
         new_clip = VideoClip(duration=video_clip.duration)
         new_clip.size = (self.target_width, self.target_height)
         new_clip.frame_function = make_frame
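Reviewer note on the split-screen branch above: each half of the output keeps the portrait aspect before resizing, so faces are not stretched. A minimal standalone sketch of that arithmetic, with assumed numbers (1080x1920 target, 1920x1080 source, and a hypothetical 480 px detected face width; not part of the patch):

    # Illustrative only; mirrors the clamping used in the split-screen branch.
    target_width, target_height = 1080, 1920        # assumed 9:16 output
    frame_w, frame_h = 1920, 1080                    # assumed source frame
    half_width = target_width // 2                   # 540 px per person
    half_aspect = target_height / half_width         # ~3.56

    face_width = max(480, frame_w // 4)              # hypothetical face width
    crop_width = int(face_width * 2.5)               # 1200
    crop_height = int(crop_width * half_aspect)      # 4266, taller than the source

    crop_width = min(crop_width, frame_w // 2)       # 960
    crop_height = int(crop_width * half_aspect)      # 3413, still too tall
    if crop_height > frame_h:
        crop_height = frame_h                        # 1080
        crop_width = int(crop_height / half_aspect)  # 303

    print(crop_width, crop_height)                   # 303 1080 -> resized to 540x1920 per half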
diff --git a/video_render/transcription.py b/video_render/transcription.py
index 5e748bf..da4ab9c 100644
--- a/video_render/transcription.py
+++ b/video_render/transcription.py
@@ -6,6 +6,7 @@ from dataclasses import dataclass
 from pathlib import Path
 from typing import List, Optional

+import numpy as np
 from faster_whisper import WhisperModel

 from video_render.config import Settings
@@ -56,6 +57,17 @@ class TranscriptionService:
         )
         return self._model

+    def unload_model(self) -> None:
+        """Unload the Whisper model to free memory (reduces RAM usage by 1-3GB)."""
+        if self._model is not None:
+            logger.info("Descarregando modelo Whisper para liberar memória...")
+            del self._model
+            self._model = None
+            # Force garbage collection to immediately free GPU/CPU memory
+            import gc
+            gc.collect()
+            logger.info("Modelo Whisper descarregado com sucesso")
+
     def transcribe(self, audio_path: Path, output_dir: Optional[Path] = None) -> TranscriptionResult:
         if output_dir is not None:
             existing_transcription = self.load(output_dir)
@@ -63,7 +75,34 @@ class TranscriptionService:
                 logger.info("Transcrição já existe em %s, reutilizando...", output_dir)
                 return existing_transcription

-        logger.info("Iniciando transcrição do áudio com FasterWhisper...")
+        # Get audio duration to decide if we need chunked processing
+        audio_duration = self._get_audio_duration(audio_path)
+        chunk_duration_minutes = 30  # Process in 30-minute chunks for long videos
+        chunk_duration_seconds = chunk_duration_minutes * 60
+
+        # For videos longer than 30 minutes, use chunked processing to avoid OOM
+        if audio_duration > chunk_duration_seconds:
+            logger.info(
+                f"Áudio longo detectado ({audio_duration/60:.1f} min). "
+                f"Processando em chunks de {chunk_duration_minutes} min para evitar erro de memória..."
+            )
+            return self._transcribe_chunked(audio_path, chunk_duration_seconds)
+        else:
+            logger.info(f"Iniciando transcrição do áudio ({audio_duration/60:.1f} min) com FasterWhisper...")
+            return self._transcribe_full(audio_path)
+
+    def _get_audio_duration(self, audio_path: Path) -> float:
+        """Get audio duration in seconds."""
+        try:
+            from moviepy.audio.io.AudioFileClip import AudioFileClip
+            with AudioFileClip(str(audio_path)) as audio:
+                return audio.duration or 0.0
+        except Exception as e:
+            logger.warning(f"Falha ao obter duração do áudio, assumindo curto: {e}")
+            return 0.0  # Assume short if we can't determine
+
+    def _transcribe_full(self, audio_path: Path) -> TranscriptionResult:
+        """Transcribe entire audio at once (for shorter videos)."""
         model = self._load_model()
         segments, _ = model.transcribe(
             str(audio_path),
@@ -97,6 +136,101 @@ class TranscriptionService:
             full_text=" ".join(full_text_parts).strip(),
         )

+    def _transcribe_chunked(self, audio_path: Path, chunk_duration: float) -> TranscriptionResult:
+        """Transcribe audio in chunks to avoid OOM on long videos."""
+        import subprocess
+        from moviepy.audio.io.AudioFileClip import AudioFileClip
+
+        model = self._load_model()
+        all_segments: List[TranscriptSegment] = []
+        full_text_parts: List[str] = []
+        segment_id_counter = 0
+
+        # Get total duration
+        total_duration = self._get_audio_duration(audio_path)
+        num_chunks = int(np.ceil(total_duration / chunk_duration))
+
+        logger.info(f"Processando áudio em {num_chunks} chunks...")
+
+        for chunk_idx in range(num_chunks):
+            start_time = chunk_idx * chunk_duration
+            end_time = min((chunk_idx + 1) * chunk_duration, total_duration)
+
+            logger.info(
+                f"Processando chunk {chunk_idx + 1}/{num_chunks} "
+                f"({start_time/60:.1f}min - {end_time/60:.1f}min)..."
+            )
+
+            # Extract chunk using ffmpeg directly (more reliable than moviepy subclip)
+            temp_chunk_path = audio_path.parent / f"temp_chunk_{chunk_idx}.wav"
+            try:
+                # Use ffmpeg to extract the chunk
+                chunk_duration_actual = end_time - start_time
+                ffmpeg_cmd = [
+                    'ffmpeg',
+                    '-y',  # Overwrite output file
+                    '-ss', str(start_time),  # Start time
+                    '-i', str(audio_path),  # Input file
+                    '-t', str(chunk_duration_actual),  # Duration
+                    '-acodec', 'pcm_s16le',  # Audio codec
+                    '-ar', '44100',  # Sample rate
+                    '-ac', '2',  # Stereo
+                    '-loglevel', 'error',  # Only show errors
+                    str(temp_chunk_path)
+                ]
+
+                subprocess.run(ffmpeg_cmd, check=True, capture_output=True)
+
+                # Transcribe chunk
+                segments, _ = model.transcribe(
+                    str(temp_chunk_path),
+                    beam_size=5,
+                    word_timestamps=True,
+                )
+
+                # Process segments with time offset
+                for segment in segments:
+                    words = [
+                        WordTiming(
+                            start=w.start + start_time,
+                            end=w.end + start_time,
+                            word=w.word.strip()
+                        )
+                        for w in segment.words or []
+                        if w.word.strip()
+                    ]
+                    text = segment.text.strip()
+                    full_text_parts.append(text)
+                    all_segments.append(
+                        TranscriptSegment(
+                            id=segment_id_counter,
+                            start=segment.start + start_time,
+                            end=segment.end + start_time,
+                            text=text,
+                            words=words,
+                        )
+                    )
+                    segment_id_counter += 1
+
+                # Force garbage collection after each chunk
+                import gc
+                gc.collect()
+
+            except subprocess.CalledProcessError as e:
+                logger.error(f"Erro ao extrair chunk {chunk_idx}: {e.stderr.decode() if e.stderr else str(e)}")
+                raise
+            finally:
+                # Clean up temp chunk
+                if temp_chunk_path.exists():
+                    temp_chunk_path.unlink()
+
+        logger.info(f"Transcrição em chunks concluída: {len(all_segments)} segmentos processados")
+
+        return TranscriptionResult(
+            segments=all_segments,
+            full_text=" ".join(full_text_parts).strip(),
+        )
+
     @staticmethod
     def persist(result: TranscriptionResult, destination: Path) -> None:
         json_path = destination / "transcription.json"
diff --git a/video_render/utils.py b/video_render/utils.py
index 8d8a4fd..04fbba6 100644
--- a/video_render/utils.py
+++ b/video_render/utils.py
@@ -23,16 +23,58 @@ def ensure_workspace(root: Path, folder_name: str) -> Path:

 def remove_paths(paths: Iterable[Path]) -> None:
+    import logging
+    import time
+
+    logger = logging.getLogger(__name__)
+
     for path in paths:
         if not path.exists():
             continue
-        if path.is_file() or path.is_symlink():
-            path.unlink(missing_ok=True)
-        else:
-            for child in sorted(path.rglob("*"), reverse=True):
-                if child.is_file() or child.is_symlink():
-                    child.unlink(missing_ok=True)
-                elif child.is_dir():
-                    child.rmdir()
-            path.rmdir()
+
+        # Try to remove with retries and better error handling
+        max_retries = 3
+        for attempt in range(max_retries):
+            try:
+                if path.is_file() or path.is_symlink():
+                    path.unlink(missing_ok=True)
+                else:
+                    for child in sorted(path.rglob("*"), reverse=True):
+                        if child.is_file() or child.is_symlink():
+                            try:
+                                child.unlink(missing_ok=True)
+                            except PermissionError:
+                                logger.warning(f"Não foi possível deletar {child}: sem permissão")
+                                # Try to change permissions and retry
+                                try:
+                                    child.chmod(0o777)
+                                    child.unlink(missing_ok=True)
+                                except Exception as e:
+                                    logger.warning(f"Falha ao forçar deleção de {child}: {e}")
+                        elif child.is_dir():
+                            try:
+                                child.rmdir()
+                            except (PermissionError, OSError) as e:
+                                logger.warning(f"Não foi possível remover diretório {child}: {e}")
+
+                    try:
+                        path.rmdir()
+                    except (PermissionError, OSError) as e:
+                        logger.warning(f"Não foi possível remover diretório {path}: {e}")
+                break  # Success, exit retry loop
+
+            except PermissionError as e:
+                if attempt < max_retries - 1:
+                    logger.warning(f"Tentativa {attempt + 1}/{max_retries} falhou ao deletar {path}: {e}. Tentando novamente...")
+                    time.sleep(0.5)  # Wait a bit before retry
+                    # Try to change permissions
+                    try:
+                        path.chmod(0o777)
+                    except Exception:
+                        pass
+                else:
+                    logger.error(f"Não foi possível deletar {path} após {max_retries} tentativas: {e}")
+            except Exception as e:
+                logger.error(f"Erro inesperado ao deletar {path}: {e}")
+                break  # Don't retry on unexpected errors
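Review suggestion only, not part of the patch: the retry-and-chmod pattern in remove_paths could arguably be condensed with shutil.rmtree's error hook. A hedged sketch of that alternative (the names remove_tree and _force_writable_and_retry are hypothetical; note that onerror is deprecated in favour of onexc from Python 3.12):

    import os
    import shutil
    from pathlib import Path

    def _force_writable_and_retry(func, path, exc_info):
        """shutil.rmtree onerror hook: make the entry writable and try once more."""
        os.chmod(path, 0o777)
        func(path)

    def remove_tree(path: Path) -> None:
        # Files and symlinks are unlinked directly; directories are removed
        # recursively, retrying once per entry after loosening permissions.
        if path.is_file() or path.is_symlink():
            path.unlink(missing_ok=True)
        elif path.is_dir():
            shutil.rmtree(path, onerror=_force_writable_and_retry)

Whether this matches the exact logging behaviour of the patched remove_paths would need to be confirmed by the author.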