From 07d301f110240c51c0a1758590a85956fcde82b4 Mon Sep 17 00:00:00 2001 From: LeoMortari Date: Thu, 18 Dec 2025 02:26:25 -0300 Subject: [PATCH] Realiza varios ajustes para melhorar o tracking e o render de video --- docker-compose.yml | 10 +- prompts/generate.txt | 165 ++++++---- video_render/config.py | 18 +- video_render/context_detection.py | 509 +++++++++++++++++++++++++++--- video_render/llm.py | 4 +- video_render/media.py | 10 +- video_render/pipeline.py | 3 + video_render/rendering.py | 14 +- video_render/smart_framing.py | 371 +++++++++++----------- video_render/transcription.py | 136 +++++++- video_render/utils.py | 60 +++- 11 files changed, 984 insertions(+), 316 deletions(-) diff --git a/docker-compose.yml b/docker-compose.yml index 628ee37..200f4a0 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -1,7 +1,10 @@ services: video-render: restart: unless-stopped - build: . + build: + context: . + no_cache: true + dockerfile: dockerfile environment: - RABBITMQ_PASS=${RABBITMQ_PASS} - OPENROUTER_API_URL=${OPENROUTER_API_URL:-https://openrouter.ai/api/v1/chat/completions} @@ -9,12 +12,17 @@ services: - OPENROUTER_MODEL=${OPENROUTER_MODEL:-openai/gpt-oss-20b:free} - OPENROUTER_PROMPT_PATH=${OPENROUTER_PROMPT_PATH:-prompts/generate.txt} - FASTER_WHISPER_MODEL_SIZE=${FASTER_WHISPER_MODEL_SIZE:-medium} + - SMART_FRAMING_SMOOTHING_WINDOW=${SMART_FRAMING_SMOOTHING_WINDOW:-30} + - SMART_FRAMING_MAX_VELOCITY=${SMART_FRAMING_MAX_VELOCITY:-40} + - SMART_FRAMING_FRAME_SKIP=${SMART_FRAMING_FRAME_SKIP:-2} + - SMART_FRAMING_PERSON_SWITCH_COOLDOWN=${SMART_FRAMING_PERSON_SWITCH_COOLDOWN:-60} volumes: - "/root/videos:/app/videos" - "/root/outputs:/app/outputs" - "/root/prompts:/app/prompts" # - "./videos:/app/videos" # - "./outputs:/app/outputs" + # - "./prompts:/app/prompts" command: "python -u main.py" networks: - dokploy-network diff --git a/prompts/generate.txt b/prompts/generate.txt index 8638af2..bd90862 100644 --- a/prompts/generate.txt +++ b/prompts/generate.txt @@ -1,85 +1,118 @@ -Voce e especialista em viralidade de redes sociais (TikTok, Instagram Reels, YouTube Shorts). Analise a transcricao e selecione trechos com MAXIMO potencial viral, priorizando qualidade sobre quantidade. +Você é especialista em viralidade de redes sociais (TikTok, Instagram Reels, YouTube Shorts). Sua missão: EXTRAIR O MÁXIMO de clips virais possíveis, priorizando QUANTIDADE + QUALIDADE. -PROCESSO DE ANALISE: -1. Mapear potenciais trechos na transcricao -2. Avaliar cada trecho usando sistema de pontuacao abaixo +🎯 OBJETIVO: Transformar cada vídeo em MÚLTIPLOS clips que podem viralizar + +PROCESSO DE ANÁLISE: +1. Mapear TODOS os potenciais trechos virais na transcrição +2. Avaliar cada trecho usando sistema de pontuação abaixo 3. Rankear do maior para menor score viral -4. Selecionar apenas os top-ranked baseado na duracao do video +4. Selecionar TODOS os trechos com score ≥ 60 (não seja conservador!) -SISTEMA DE PONTUACAO VIRAL (0-100 pontos): +SISTEMA DE PONTUAÇÃO VIRAL (0-100 pontos): -HOOK/ABERTURA (0-25 pontos): -[25] Frase choqueante, pergunta polemica ou promessa ousada -[20] Historia intrigante ou situacao inusitada -[15] Afirmacao interessante mas previsivel -[10] Introducao generica mas aceitavel -[0] "Oi", "entao", silencio ou conteudo fraco +🪝 GANCHO INICIAL (0-30 pontos) - CRÍTICO PARA VIRALIZAÇÃO: +[30] Frase CHOCANTE, pergunta POLÊMICA ou promessa OUSADA nos primeiros 3 segundos +[25] Hook forte: "Você não vai acreditar...", "O segredo que ninguém conta...", "Isso mudou tudo..." 
+[20] Pergunta intrigante ou afirmação controversa +[15] História interessante mas gancho fraco +[10] Início genérico mas aceitável +[0] "Oi", "então", "bem", silêncio - DESCARTAR -GATILHO EMOCIONAL (0-25 pontos): -[25] Emocao extrema: raiva, choque, riso intenso, inspiracao profunda -[20] Emocao forte: surpresa, indignacao, humor, curiosidade intensa -[15] Emocao moderada: interesse, leve humor, curiosidade -[10] Emocao fraca: informativo sem impacto emocional -[0] Monotono, tecnico, sem apelo emocional +🔥 GATILHO EMOCIONAL (0-25 pontos): +[25] Emoção EXTREMA: raiva, choque, riso intenso, WTF moment, revelação bombástica +[20] Emoção forte: surpresa, indignação, humor, curiosidade intensa +[15] Emoção moderada: interesse, leve humor, insight interessante +[10] Emoção fraca: informativo sem impacto +[0] Monótono, técnico, sem apelo emocional - EVITAR -VALOR/UTILIDADE (0-20 pontos): -[20] Segredo valioso, insight transformador ou informacao exclusiva -[15] Ensina algo pratico e imediatamente aplicavel -[10] Opiniao interessante ou perspectiva util -[5] Informacao generica ou conhecimento comum -[0] Nenhum valor pratico, puro enrolation +💎 VALOR/UTILIDADE (0-20 pontos): +[20] Segredo VALIOSO, insight transformador, informação EXCLUSIVA +[15] Ensina algo prático e IMEDIATAMENTE aplicável +[10] Opinião interessante ou perspectiva única +[5] Informação genérica ou conhecimento comum +[0] Nenhum valor prático, puro "enrolation" - DESCARTAR -ESTRUTURA NARRATIVA (0-15 pontos): -[15] Historia completa com inicio, conflito/climax e resolucao -[10] Segmento com comeco e fim coerentes +📖 ESTRUTURA NARRATIVA (0-15 pontos): +[15] História COMPLETA com início, conflito/clímax e resolução satisfatória +[10] Segmento com começo e fim coerentes, faz sentido isolado [5] Trecho com sentido mas cortado abruptamente -[0] Fragmento sem contexto ou conclusao +[0] Fragmento sem contexto - NÃO USAR -RITMO E ENERGIA (0-15 pontos): -[15] Dinamico, sem pausas, alta energia, palavras impactantes -[10] Bom ritmo com pausas naturais curtas -[5] Ritmo lento mas aceitavel -[0] Muitas pausas, hesitacoes, monotonia, silencio +⚡ RITMO E ENERGIA (0-10 pontos): +[10] DINÂMICO, sem pausas longas, alta energia, palavras impactantes +[7] Bom ritmo com pausas naturais curtas (< 2s) +[3] Ritmo lento mas aceitável +[0] Muitas pausas (> 3s), hesitações, monotonia - EVITAR -REGRAS DE QUANTIDADE: -5-10 min: 3 clipes (minimo 1 se score alto) -10-20 min: 4 clipes -20-30 min: 5 clipes -30+ min: 6 clipes (maximo absoluto) +REGRAS DE QUANTIDADE (SER AGRESSIVO): +📊 Quantidade MÍNIMA por duração: +- 5-10 min: MÍNIMO 4-6 clips +- 10-15 min: MÍNIMO 6-8 clips +- 15-20 min: MÍNIMO 8-10 clips +- 20-30 min: MÍNIMO 10-15 clips +- 30+ min: MÍNIMO 15-20 clips -IMPORTANTE: Priorize qualidade. Melhor 3 clipes score 80+ que 6 clipes score 50. Se poucos momentos virais, retorne apenas os melhores (minimo 1). +🎯 REGRA DE OURO: 1 clip a cada 2-3 minutos de vídeo (NO MÍNIMO) +- Se encontrar momentos virais, SEMPRE selecione! 
+- Melhor ter 3 clips perfeitos que 10 clips bons -CRITERIOS DE SELECAO: -- Score viral maior ou igual 60 pontos (idealmente maior ou igual 70) -- Duracao ideal: 60-90s -- Duracao minima: 60s | Duracao maxima: 120s -- Sem sobreposicao (end de um menor que start do proximo) -- Inicio e fim coerentes +CRITÉRIOS DE SELEÇÃO: +- Score viral ≥ 60 pontos (idealmente ≥ 70) +- Duração ideal: 60-120s (formato ideal para Reels/Shorts) +- Duração mínima: 60s | Duração máxima: 120s +- Sem sobreposição temporal +- DEVE ter gancho forte nos primeiros 3 segundos +- Início e fim coerentes -EVITE: -- Introducoes genericas -- Trechos com silencio/pausas maiores que 3s -- Explicacoes tecnicas sem gancho emocional -- Segmentos sem conclusao -- Momentos de transicao +GANCHOS QUE FAZEM VIRALIZAR (use como filtro): +- "O que ninguém te conta sobre..." +- "O erro que 90% das pessoas cometem..." +- "Você não vai acreditar o que aconteceu..." +- Revelações chocantes ou contraintuitivas +- Antes vs Depois, transformações +- Segredos, bastidores, verdades ocultas +- Polêmicas, opiniões fortes, hot takes +- Histórias dramáticas com reviravolta +- Dicas práticas e acionáveis +- Momentos de humor genuíno -FORMATO JSON (retorne APENAS isto): -{"highlights":[{"start":,"end":,"summary":"Score estimado e gatilhos principais"}]} +❌ EVITE (mas não descarte se score alto): +- Introduções genéricos SEM gancho +- Trechos com pausas > 3s consecutivas +- Explicações técnicas SEM gancho emocional +- Segmentos sem conclusão clara +- Momentos de transição vazios -REGRAS TECNICAS: -- Float com ponto decimal (45.5 NAO 45,5) +FORMATO JSON (retorne APENAS isto, SEM texto adicional): +{ + "highlights": [ + { + "start": , + "end": , + "summary": "Score: XX/100 | Gancho: [descreva] | Gatilho: [descreva]", + } + ] +} + +REGRAS TÉCNICAS: +- Float com ponto decimal (45.5 NÃO 45,5) - Timestamps exatos dos segments fornecidos -- Ordem cronologica (start crescente) -- Minimo 1, maximo 6 highlights -- Summary conciso (1-2 frases) +- Ordem cronológica (start crescente) +- Summary conciso mas informativo (2-3 frases) -TAREFA: -1. Leia transcricao e timestamps -2. Avalie e pontue trechos mentalmente -3. Rankear por score viral -4. Selecione top-ranked baseado na duracao -5. Retorne JSON -6. Se video fraco, retorne pelo menos 1 highlight +TAREFA PASSO A PASSO: +1. Leia transcrição completa +2. Identifique TODOS os momentos potencialmente virais +3. Avalie e pontue cada trecho (seja generoso!) +4. Rankear por score viral +5. Selecione TODOS com score ≥ 60 +6. Garanta mínimo de 1 clip a cada 5 minutos +7. Retorne JSON completo -Objetivo: MAXIMIZAR chance de viralizar. Seja criterioso, apenas melhores trechos. +⚠️ IMPORTANTE: +- NÃO seja conservador! Se encontrou 10 momentos bons, retorne os 10! +- Pense em MAXIMIZAR alcance: mais clips = mais chances de viralizar +- Se vídeo tem conteúdo fraco, seja criterioso, mas SEMPRE retorne pelo menos 3-5 clips +- Priorize clips com GANCHOS FORTES - gancho fraco = baixo alcance +🎯 MINDSET: Você é um criador de conteúdo viral. Seu objetivo é extrair MÁXIMO valor do vídeo original. 
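Note on the JSON contract in the prompt above: the model is asked to return a bare {"highlights": [...]} object with float timestamps (decimal point, not comma), chronological order, no temporal overlap, and clips of roughly 60-120s, while video_render/llm.py (changed further down in this patch) additionally discards highlights shorter than 45s or, after this change, longer than 90s. The following is a minimal, illustrative sketch of a standalone validator for that contract, assuming hypothetical names (validate_highlights, MIN_DURATION_S, MAX_DURATION_S); it is not the repository's actual parser, which lives in video_render/llm.py.

import json
from typing import Any, Dict, List

# Illustrative sketch only: enforces the JSON contract the prompt requests.
# The real parsing/filtering is done by OpenRouterCopywriter in video_render/llm.py.

MIN_DURATION_S = 60.0   # prompt asks for at least 60s per clip
MAX_DURATION_S = 120.0  # prompt allows up to 120s (llm.py in this patch trims to 90s)

def validate_highlights(raw: str) -> List[Dict[str, Any]]:
    """Parse model output and keep only well-formed, non-overlapping highlights."""
    data = json.loads(raw)
    valid: List[Dict[str, Any]] = []
    previous_end = float("-inf")

    for item in data.get("highlights", []):
        try:
            start = float(item["start"])
            end = float(item["end"])
        except (KeyError, TypeError, ValueError):
            continue  # malformed or missing timestamps

        duration = end - start
        if not (MIN_DURATION_S <= duration <= MAX_DURATION_S):
            continue  # outside allowed clip length
        if start < previous_end:
            continue  # overlaps previous clip or breaks chronological order
        if not str(item.get("summary", "")).strip():
            continue  # summary is required by the prompt

        valid.append({"start": start, "end": end, "summary": item["summary"]})
        previous_end = end

    return valid

if __name__ == "__main__":
    sample = '{"highlights": [{"start": 12.5, "end": 95.0, "summary": "Score: 82/100 | Gancho: revelacao | Gatilho: curiosidade"}]}'
    print(validate_highlights(sample))

Keeping this validation outside the prompt (rather than trusting the model to self-enforce it) is the safer design: the prompt can stay aggressive about quantity while downstream code guarantees duration, ordering, and overlap constraints.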
diff --git a/video_render/config.py b/video_render/config.py index 547d10b..0ca0c1e 100644 --- a/video_render/config.py +++ b/video_render/config.py @@ -13,10 +13,10 @@ TEMP_ROOT = BASE_DIR / "temp" @dataclass(frozen=True) class RabbitMQSettings: - # host: str = os.environ.get("RABBITMQ_HOST", "154.12.229.181") - # port: int = int(os.environ.get("RABBITMQ_PORT", 32790)) - host: str = os.environ.get("RABBITMQ_HOST", "rabbitmq") - port: int = int(os.environ.get("RABBITMQ_PORT", 5672)) + host: str = os.environ.get("RABBITMQ_HOST", "154.12.229.181") + port: int = int(os.environ.get("RABBITMQ_PORT", 32790)) + # host: str = os.environ.get("RABBITMQ_HOST", "rabbitmq") + # port: int = int(os.environ.get("RABBITMQ_PORT", 5672)) user: str = os.environ.get("RABBITMQ_USER", "admin") password: str = os.environ.get("RABBITMQ_PASS") consume_queue: str = os.environ.get("RABBITMQ_QUEUE", "to-render") @@ -62,11 +62,13 @@ class RenderingSettings: subtitle_font_size: int = int(os.environ.get("RENDER_SUBTITLE_FONT_SIZE", 64)) caption_min_words: int = int(os.environ.get("CAPTION_MIN_WORDS", 2)) caption_max_words: int = int(os.environ.get("CAPTION_MAX_WORDS", 2)) - # Smart framing settings + # Smart framing settings - CONTAINMENT TRACKING mode enable_smart_framing: bool = os.environ.get("ENABLE_SMART_FRAMING", "true").lower() in ("true", "1", "yes") - smart_framing_min_confidence: float = float(os.environ.get("SMART_FRAMING_MIN_CONFIDENCE", 0.5)) - smart_framing_smoothing_window: int = int(os.environ.get("SMART_FRAMING_SMOOTHING_WINDOW", 20)) - smart_framing_frame_skip: int = int(os.environ.get("SMART_FRAMING_FRAME_SKIP", 2)) # Process every Nth frame (CPU optimization) + smart_framing_min_confidence: float = float(os.environ.get("SMART_FRAMING_MIN_CONFIDENCE", 0.3)) # Lowered for better cartoon detection + smart_framing_smoothing_window: int = int(os.environ.get("SMART_FRAMING_SMOOTHING_WINDOW", 30)) # Reduced - not needed with containment tracking + smart_framing_frame_skip: int = int(os.environ.get("SMART_FRAMING_FRAME_SKIP", 1)) # Process every frame for smooth 30 FPS tracking + smart_framing_max_velocity: int = int(os.environ.get("SMART_FRAMING_MAX_VELOCITY", 20)) # Moderate - only used during transitions + smart_framing_person_switch_cooldown: int = int(os.environ.get("SMART_FRAMING_PERSON_SWITCH_COOLDOWN", 999999)) # DISABLED - never switch people @dataclass(frozen=True) diff --git a/video_render/context_detection.py b/video_render/context_detection.py index e342b4c..ab5c203 100644 --- a/video_render/context_detection.py +++ b/video_render/context_detection.py @@ -7,7 +7,7 @@ and identify who is speaking in video content using MediaPipe and audio analysis from __future__ import annotations import logging -from dataclasses import dataclass +from dataclasses import dataclass, field from typing import List, Optional, Tuple import cv2 @@ -50,20 +50,22 @@ class FrameContext: active_speakers: List[int] # indices of speaking faces primary_focus: Optional[Tuple[int, int]] # (x, y) center point layout_mode: str # "single", "dual_split", "grid" + selected_people: List[int] = field(default_factory=list) # indices of people selected for display (max 2) class MediaPipeDetector: - """Face and pose detection using MediaPipe.""" + """Face and pose detection using MediaPipe with OpenCV Haar Cascade fallback.""" - def __init__(self, min_detection_confidence: float = 0.5, min_tracking_confidence: float = 0.5): + def __init__(self, min_detection_confidence: float = 0.3, min_tracking_confidence: float = 0.3): 
self.min_detection_confidence = min_detection_confidence self.min_tracking_confidence = min_tracking_confidence self.mp_face_detection = mp.solutions.face_detection self.mp_face_mesh = mp.solutions.face_mesh + # MediaPipe detectors with lower confidence for better cartoon detection self.face_detection = self.mp_face_detection.FaceDetection( min_detection_confidence=min_detection_confidence, - model_selection=1 + model_selection=0 # Changed to 0 for better detection of varied faces (including cartoons) ) self.face_mesh = self.mp_face_mesh.FaceMesh( @@ -73,11 +75,17 @@ class MediaPipeDetector: static_image_mode=False ) - logger.info("MediaPipe detector initialized") + # OpenCV Haar Cascade as fallback for cartoon/anime faces + self.haar_cascade = cv2.CascadeClassifier(cv2.data.haarcascades + 'haarcascade_frontalface_default.xml') + + # Alternative cascade for profile/side faces + self.haar_cascade_profile = cv2.CascadeClassifier(cv2.data.haarcascades + 'haarcascade_profileface.xml') + + logger.info(f"Hybrid detector initialized (MediaPipe confidence={min_detection_confidence}, OpenCV Haar Cascade enabled)") def detect_faces(self, frame: np.ndarray) -> List[FaceDetection]: """ - Detect faces in a frame. + Detect faces in a frame using hybrid approach (MediaPipe + OpenCV Haar Cascade). Args: frame: RGB image array @@ -94,6 +102,7 @@ class MediaPipeDetector: else: frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) + # Try MediaPipe first results = self.face_detection.process(frame_rgb) faces = [] @@ -126,8 +135,111 @@ class MediaPipeDetector: center_y=center_y )) + # Fallback to OpenCV Haar Cascade if MediaPipe found nothing + if not faces: + faces = self._detect_faces_haar_cascade(frame, width, height) + return faces + def _detect_faces_haar_cascade(self, frame: np.ndarray, width: int, height: int) -> List[FaceDetection]: + """ + Detect faces using OpenCV Haar Cascade (works better with cartoons/anime). 
+ + Args: + frame: Image frame (BGR format) + width: Frame width + height: Frame height + + Returns: + List of detected faces + """ + # Convert to grayscale for Haar Cascade + if len(frame.shape) == 3: + gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY) + else: + gray = frame + + # Detect frontal faces with more sensitive parameters + frontal_faces = self.haar_cascade.detectMultiScale( + gray, + scaleFactor=1.05, # More sensitive to size variations + minNeighbors=3, # Lower threshold for detection (more permissive) + minSize=(30, 30), # Smaller minimum size + flags=cv2.CASCADE_SCALE_IMAGE + ) + + # Also try profile faces + profile_faces = self.haar_cascade_profile.detectMultiScale( + gray, + scaleFactor=1.1, + minNeighbors=3, + minSize=(30, 30), + flags=cv2.CASCADE_SCALE_IMAGE + ) + + # Combine frontal and profile detections + all_faces = [] + + for (x, y, w, h) in frontal_faces: + x = max(0, min(x, width - 1)) + y = max(0, min(y, height - 1)) + w = min(w, width - x) + h = min(h, height - y) + + center_x = x + w // 2 + center_y = y + h // 2 + + all_faces.append(FaceDetection( + x=x, + y=y, + width=w, + height=h, + confidence=0.7, # Haar Cascade doesn't provide confidence, use fixed value + center_x=center_x, + center_y=center_y + )) + + for (x, y, w, h) in profile_faces: + # Check if this face overlaps significantly with any frontal face + overlap = False + for existing_face in all_faces: + # Calculate IoU (Intersection over Union) + x1_overlap = max(x, existing_face.x) + y1_overlap = max(y, existing_face.y) + x2_overlap = min(x + w, existing_face.x + existing_face.width) + y2_overlap = min(y + h, existing_face.y + existing_face.height) + + if x1_overlap < x2_overlap and y1_overlap < y2_overlap: + overlap_area = (x2_overlap - x1_overlap) * (y2_overlap - y1_overlap) + face_area = w * h + if overlap_area / face_area > 0.3: # 30% overlap threshold + overlap = True + break + + if not overlap: + x = max(0, min(x, width - 1)) + y = max(0, min(y, height - 1)) + w = min(w, width - x) + h = min(h, height - y) + + center_x = x + w // 2 + center_y = y + h // 2 + + all_faces.append(FaceDetection( + x=x, + y=y, + width=w, + height=h, + confidence=0.6, # Slightly lower confidence for profile + center_x=center_x, + center_y=center_y + )) + + if all_faces: + logger.debug(f"Haar Cascade detected {len(all_faces)} faces (MediaPipe failed)") + + return all_faces + def detect_face_landmarks(self, frame: np.ndarray) -> List[FaceDetection]: """ Detect faces with landmarks for lip sync detection. @@ -203,8 +315,8 @@ class AudioActivityDetector: def detect_speaking_periods( self, audio_samples: np.ndarray, - threshold: float = 0.02, - min_speech_duration: float = 0.1 + threshold: float = 0.01, # Reduced from 0.02 for better speech detection + min_speech_duration: float = 0.05 # Reduced from 0.1 to catch shorter utterances ) -> List[Tuple[float, float]]: """ Detect periods of speech in audio. @@ -250,6 +362,16 @@ class AudioActivityDetector: if end_time - start_time >= min_speech_duration: periods.append((start_time, end_time)) + # Log detected speech periods for debugging + if periods: + total_speech_time = sum(end - start for start, end in periods) + logger.info(f"Audio speech detection: {len(periods)} periods found, " + f"total {total_speech_time:.1f}s of speech (threshold={threshold})") + else: + max_energy = max(energies) if energies else 0 + logger.warning(f"No speech detected! 
Max energy={max_energy:.4f}, threshold={threshold} " + f"(try lowering threshold if speech should be present)") + return periods def is_speaking_at_time(self, speaking_periods: List[Tuple[float, float]], time: float) -> bool: @@ -263,12 +385,29 @@ class AudioActivityDetector: class ContextAnalyzer: """Analyzes video context to determine focus and layout.""" - def __init__(self): + def __init__(self, person_switch_cooldown: int = 30): self.detector = MediaPipeDetector() self.audio_detector = AudioActivityDetector() self.previous_faces: List[FaceDetection] = [] - logger.info("Context analyzer initialized") + # Person tracking state + self.current_selected_people: List[int] = [] # Indices of people currently on screen + self.last_switch_frame: int = -999 # Frame when we last switched people + self.person_switch_cooldown = person_switch_cooldown # Minimum frames before switching + + # Stability tracking to prevent flip-flopping + self.desired_people_history: List[List[int]] = [] # Track recent desired selections + self.stability_threshold = 20 # Frames needed to confirm a switch (increased for more stability) + self.last_switched_people: List[int] = [] # People we just switched FROM + + # Focus stability: track recent focus points for temporal smoothing + self.focus_history: List[Tuple[int, int]] = [] + self.focus_history_size: int = 5 # Keep last 5 focus points for smoothing + + # Debug logging + self.frame_log_interval = 30 # Log every N frames + + logger.info(f"Context analyzer initialized (cooldown={person_switch_cooldown} frames, focus_smoothing={self.focus_history_size})") def analyze_frame( self, @@ -296,33 +435,47 @@ class ContextAnalyzer: # Determine who is speaking active_speakers = [] + has_audio_speech = speaking_periods and self.audio_detector.is_speaking_at_time(speaking_periods, timestamp) + for i, face in enumerate(faces): is_speaking = False - if speaking_periods and self.audio_detector.is_speaking_at_time(speaking_periods, timestamp): + # Check audio-based speech detection + if has_audio_speech: is_speaking = True + # Check lip movement (visual speech detection) if face.landmarks and len(self.previous_faces) > i: is_speaking = is_speaking or self._detect_lip_movement(face, self.previous_faces[i]) if is_speaking: active_speakers.append(i) - num_faces = len(faces) - num_speakers = len(active_speakers) + # Debug: Log speech detection + if frame_number % 30 == 0: # Every second at 30fps + logger.info(f"Speech detection - Frame {frame_number}: audio_active={has_audio_speech}, " + f"speakers={active_speakers}, total_faces={len(faces)}") - if num_faces == 0: - layout_mode = "single" - elif num_faces == 1: - layout_mode = "single" - elif num_faces == 2: - layout_mode = "dual_split" - elif num_faces >= 3: - layout_mode = "dual_split" - else: - layout_mode = "single" + # Select THE person to focus on (always single person) + # Priority: 1) Who is speaking, 2) Who is most centered + selected_people = self._select_person_to_focus( + faces, + active_speakers, + frame_number, + frame.shape[1], # frame width for center calculation + frame.shape[0] # frame height for center calculation + ) - primary_focus = self._calculate_focus_point(faces, active_speakers) + # Always use single-person layout (no split screen) + layout_mode = "single" + + primary_focus = self._calculate_focus_point(faces, selected_people) + + # Debug logging every N frames + if frame_number % self.frame_log_interval == 0: + focus_reason = "speaker" if active_speakers else "no_speech_detected" + logger.info(f"Frame 
{frame_number}: {len(faces)} faces, " + f"{len(active_speakers)} speakers, focus={selected_people}, reason={focus_reason}") self.previous_faces = faces @@ -332,7 +485,8 @@ class ContextAnalyzer: detected_faces=faces, active_speakers=active_speakers, primary_focus=primary_focus, - layout_mode=layout_mode + layout_mode=layout_mode, + selected_people=selected_people ) def _detect_lip_movement(self, current_face: FaceDetection, previous_face: FaceDetection) -> bool: @@ -363,36 +517,309 @@ class ContextAnalyzer: threshold = 2.0 return abs(current_dist - previous_dist) > threshold - def _calculate_focus_point( + def _select_person_to_focus( self, faces: List[FaceDetection], - active_speakers: List[int] - ) -> Optional[Tuple[int, int]]: + active_speakers: List[int], + frame_number: int, + frame_width: int, + frame_height: int + ) -> List[int]: """ - Calculate the primary focus point based on detected faces and speakers. - - IMPORTANT: This focuses on ONE person to avoid focusing on empty space (table). - When multiple people are present, we pick the most relevant person, not average positions. + Select THE single person to focus on. + Priority: 1) Who is speaking, 2) Who is most centered in frame Args: faces: List of detected faces - active_speakers: Indices of faces that are speaking + active_speakers: Indices of people currently speaking + frame_number: Current frame number + frame_width: Frame width for center calculation + frame_height: Frame height for center calculation + + Returns: + List with single person index [idx], or empty list if no faces + """ + if not faces: + self.current_selected_people = [] + return [] + + # If only 1 person, always focus on them + if len(faces) == 1: + self.current_selected_people = [0] + return [0] + + # Check if we can switch people (cooldown period) + frames_since_last_switch = frame_number - self.last_switch_frame + can_switch = frames_since_last_switch >= self.person_switch_cooldown + + # Calculate frame center for distance comparison + frame_center_x = frame_width / 2 + frame_center_y = frame_height / 2 + + # ULTRA-STABLE MODE: Select ONE person at start, NEVER switch + # This completely eliminates switching-related instability + desired_person_idx = None + + # If we already have someone selected, ALWAYS KEEP THEM (never switch) + if self.current_selected_people and len(self.current_selected_people) > 0: + current_idx = self.current_selected_people[0] + if current_idx < len(faces): + # Current person still detected - keep them + desired_person_idx = current_idx + else: + # Current person lost - try to find them again by position/size similarity + # This handles temporary detection failures + current_person_found = False + if self.previous_faces and current_idx < len(self.previous_faces): + prev_face = self.previous_faces[current_idx] + # Find most similar face by position and size + best_match_idx = None + best_match_score = float('inf') + for idx, face in enumerate(faces): + # Distance between centers + dx = face.center_x - prev_face.center_x + dy = face.center_y - prev_face.center_y + dist = np.sqrt(dx**2 + dy**2) + # Size similarity + size_diff = abs(face.width - prev_face.width) + abs(face.height - prev_face.height) + score = dist + size_diff * 0.5 + if score < best_match_score: + best_match_score = score + best_match_idx = idx + + if best_match_idx is not None and best_match_score < 1000: + desired_person_idx = best_match_idx + current_person_found = True + + if not current_person_found: + # Really lost - select most confident + face_confidences = 
[(idx, face.confidence) for idx, face in enumerate(faces)] + face_confidences.sort(key=lambda x: x[1], reverse=True) + desired_person_idx = face_confidences[0][0] + logger.warning(f"Current person permanently lost - selecting new: {desired_person_idx}") + else: + # First frame - select most confident person ONCE + face_confidences = [(idx, face.confidence) for idx, face in enumerate(faces)] + face_confidences.sort(key=lambda x: x[1], reverse=True) + desired_person_idx = face_confidences[0][0] + logger.info(f"INITIAL SELECTION - Person {desired_person_idx} (will be tracked throughout entire video)") + + # IGNORE SPEECH DETECTION - it was causing instability + # We now track ONE person from start to finish, regardless of who speaks + + # OLD LOGIC (commented out - was causing issues): + # This logic would switch based on "who is more centered" which caused constant switching + if False: # Disabled + # Calculate distance from center for each face + center_distances = [] + for idx, face in enumerate(faces): + # Euclidean distance from frame center + dx = face.center_x - frame_center_x + dy = face.center_y - frame_center_y + distance = np.sqrt(dx**2 + dy**2) + center_distances.append((idx, distance, face.confidence)) + + # Sort by distance (closest first), then by confidence as tiebreaker + center_distances.sort(key=lambda x: (x[1], -x[2])) + most_centered_idx = center_distances[0][0] + most_centered_distance = center_distances[0][1] + + # STICKY BEHAVIOR: If we already have someone selected, only switch if: + # - New person is SIGNIFICANTLY more centered (30% closer to center) + # - OR current person is now very far from center (>40% of frame width) + if self.current_selected_people and len(self.current_selected_people) > 0: + current_idx = self.current_selected_people[0] + if current_idx < len(faces): + current_face = faces[current_idx] + current_dx = current_face.center_x - frame_center_x + current_dy = current_face.center_y - frame_center_y + current_distance = np.sqrt(current_dx**2 + current_dy**2) + + # Define "significantly better" threshold + max_acceptable_distance = frame_width * 0.4 # 40% of frame width + improvement_threshold = 0.7 # New person must be 30% closer (0.7 ratio) + + # Keep current person if they're still reasonably centered + if current_distance < max_acceptable_distance: + # Current person is still acceptable - only switch if new is MUCH better + if most_centered_distance < current_distance * improvement_threshold: + desired_person_idx = most_centered_idx + logger.debug(f"Switching: new person MUCH more centered ({most_centered_distance:.0f} vs {current_distance:.0f})") + else: + desired_person_idx = current_idx # Keep current + logger.debug(f"Keeping current person: still reasonably centered ({current_distance:.0f} px from center)") + else: + # Current person is too far from center - switch + desired_person_idx = most_centered_idx + logger.debug(f"Current person too far from center ({current_distance:.0f} px), switching") + else: + # Current selection invalid + desired_person_idx = most_centered_idx + else: + # First time - select most centered + desired_person_idx = most_centered_idx + + # Wrap in list for compatibility with existing code + desired_people = [desired_person_idx] if desired_person_idx is not None else [] + + # ULTRA-STABLE MODE: NO SWITCHING LOGIC AT ALL + # Simply set the person and never change + if not self.current_selected_people: + # First time only + self.current_selected_people = desired_people + self.last_switch_frame = frame_number + 
logger.info(f"Frame {frame_number}: LOCKED ON person {desired_people} - will never switch") + else: + # Already have someone - just update to desired (which is same person due to logic above) + self.current_selected_people = desired_people + + return self.current_selected_people.copy() + + def _ensure_distinct_people( + self, + faces: List[FaceDetection], + people_indices: List[int] + ) -> List[int]: + """ + Ensure selected people are distinct by checking minimum distance between them. + Prevents showing the same person twice due to duplicate detection. + + Args: + faces: List of detected faces + people_indices: Indices of people to validate + + Returns: + List of distinct people indices (max 2) + """ + if len(people_indices) <= 1: + return people_indices + + distinct_people = [] + + for idx in people_indices: + if idx >= len(faces): + continue + + current_face = faces[idx] + is_distinct = True + + # Check if this person is too close to any already selected person + for selected_idx in distinct_people: + selected_face = faces[selected_idx] + + # Calculate distance between face centers + dx = current_face.center_x - selected_face.center_x + dy = current_face.center_y - selected_face.center_y + distance = np.sqrt(dx**2 + dy**2) + + # Also check overlap via IoU (Intersection over Union) + x1_overlap = max(current_face.x, selected_face.x) + y1_overlap = max(current_face.y, selected_face.y) + x2_overlap = min(current_face.x + current_face.width, selected_face.x + selected_face.width) + y2_overlap = min(current_face.y + current_face.height, selected_face.y + selected_face.height) + + overlap_area = 0 + if x1_overlap < x2_overlap and y1_overlap < y2_overlap: + overlap_area = (x2_overlap - x1_overlap) * (y2_overlap - y1_overlap) + + # Calculate areas + area1 = current_face.width * current_face.height + area2 = selected_face.width * selected_face.height + min_area = min(area1, area2) + + # If faces are very close OR significantly overlapping, they're likely the same person + # Minimum distance: 1/4 of average face width + min_distance = (current_face.width + selected_face.width) / 8 + overlap_threshold = 0.3 # 30% overlap + + if distance < min_distance or (min_area > 0 and overlap_area / min_area > overlap_threshold): + is_distinct = False + logger.debug(f"Person {idx} too similar to person {selected_idx} (dist={distance:.1f}, overlap={overlap_area/min_area if min_area > 0 else 0:.2%})") + break + + if is_distinct: + distinct_people.append(idx) + + # Stop at 2 distinct people + if len(distinct_people) >= 2: + break + + # If we couldn't find 2 distinct people, return at most 1 + if len(distinct_people) < 2 and len(people_indices) >= 2: + logger.debug(f"Only {len(distinct_people)} distinct person(s) found from {len(people_indices)} detections") + + return distinct_people + + def _calculate_focus_point( + self, + faces: List[FaceDetection], + selected_people: List[int] + ) -> Optional[Tuple[int, int]]: + """ + Calculate the primary focus point based on selected people with temporal smoothing. 
+ + Args: + faces: List of detected faces + selected_people: Indices of people selected for display Returns: (x, y) tuple of focus center, or None if no faces """ - if not faces: + if not faces or not selected_people: return None - if active_speakers: - speaker_faces = [faces[i] for i in active_speakers if i < len(faces)] - if speaker_faces: - primary_speaker = max(speaker_faces, key=lambda f: f.confidence) - return (primary_speaker.center_x, primary_speaker.center_y) + # Calculate raw focus point + raw_focus_x = 0 + raw_focus_y = 0 - most_confident = max(faces, key=lambda f: f.confidence) - return (most_confident.center_x, most_confident.center_y) + if len(selected_people) == 1: + # Single person - focus on them + if selected_people[0] < len(faces): + primary = faces[selected_people[0]] + raw_focus_x = primary.center_x + raw_focus_y = primary.center_y + else: + # Fallback + most_confident = max(faces, key=lambda f: f.confidence) + raw_focus_x = most_confident.center_x + raw_focus_y = most_confident.center_y + else: + # Multiple people - focus on the CENTER between them for stability + # This prevents jarring movements when switching focus between people + valid_people = [idx for idx in selected_people if idx < len(faces)] + if valid_people: + centers_x = [faces[idx].center_x for idx in valid_people] + centers_y = [faces[idx].center_y for idx in valid_people] + raw_focus_x = int(np.mean(centers_x)) + raw_focus_y = int(np.mean(centers_y)) + else: + # Fallback + most_confident = max(faces, key=lambda f: f.confidence) + raw_focus_x = most_confident.center_x + raw_focus_y = most_confident.center_y + + # Apply temporal smoothing using focus history + self.focus_history.append((raw_focus_x, raw_focus_y)) + if len(self.focus_history) > self.focus_history_size: + self.focus_history.pop(0) + + # Calculate smoothed focus as weighted average (more weight to recent frames) + if len(self.focus_history) > 1: + # Exponential weights: recent frames have more influence + weights = [2 ** i for i in range(len(self.focus_history))] + total_weight = sum(weights) + + smoothed_x = sum(x * w for (x, y), w in zip(self.focus_history, weights)) / total_weight + smoothed_y = sum(y * w for (x, y), w in zip(self.focus_history, weights)) / total_weight + + return (int(smoothed_x), int(smoothed_y)) + else: + return (raw_focus_x, raw_focus_y) def close(self): """Release resources.""" self.detector.close() + # Clear tracking state to free memory + self.previous_faces.clear() + self.current_selected_people.clear() + self.focus_history.clear() diff --git a/video_render/llm.py b/video_render/llm.py index 1f2d798..76be59d 100644 --- a/video_render/llm.py +++ b/video_render/llm.py @@ -141,8 +141,8 @@ class OpenRouterCopywriter: logger.warning(f"Highlight ignorado: muito curto ({duration}s, minimo 45s)") continue - if duration > 120: - logger.warning(f"Highlight ignorado: muito longo ({duration}s, maximo 120s)") + if duration > 90: + logger.warning(f"Highlight ignorado: muito longo ({duration}s, maximo 90s)") continue if not summary: diff --git a/video_render/media.py b/video_render/media.py index d99a71d..532de85 100644 --- a/video_render/media.py +++ b/video_render/media.py @@ -50,7 +50,10 @@ class MediaPreparer: existing_children = list(workspace_dir.iterdir()) if existing_children: logger.info("Limpando workspace existente para %s", sanitized_name) - remove_paths(existing_children) + try: + remove_paths(existing_children) + except Exception as e: + logger.warning(f"Não foi possível limpar workspace (não crítico): {e}") if 
temp_transcription_json and temp_transcription_json.exists(): shutil.move(str(temp_transcription_json), str(transcription_json)) @@ -66,7 +69,10 @@ class MediaPreparer: output_dir = ensure_workspace(self.settings.outputs_dir, sanitized_name) existing_outputs = list(output_dir.iterdir()) if existing_outputs: - remove_paths(existing_outputs) + try: + remove_paths(existing_outputs) + except Exception as e: + logger.warning(f"Não foi possível limpar outputs antigos (não crítico): {e}") audio_path = workspace_dir / "audio.wav" extract_audio_to_wav(working_video_path, audio_path) diff --git a/video_render/pipeline.py b/video_render/pipeline.py index 0357788..39557d8 100644 --- a/video_render/pipeline.py +++ b/video_render/pipeline.py @@ -107,6 +107,9 @@ class VideoPipeline: TranscriptionService.persist(transcription, context.workspace.workspace_dir) context.transcription = transcription + # Unload Whisper model immediately after transcription to free memory (1-3GB) + self.transcriber.unload_model() + def _determine_highlights(self, context: PipelineContext) -> None: if not context.transcription: raise RuntimeError("Transcricao nao disponivel") diff --git a/video_render/rendering.py b/video_render/rendering.py index ae69813..b2ce7f5 100644 --- a/video_render/rendering.py +++ b/video_render/rendering.py @@ -345,7 +345,9 @@ class VideoRenderer: target_width=settings.rendering.frame_width, target_height=settings.rendering.frame_height, frame_skip=settings.rendering.smart_framing_frame_skip, - smoothing_window=settings.rendering.smart_framing_smoothing_window + smoothing_window=settings.rendering.smart_framing_smoothing_window, + max_velocity=settings.rendering.smart_framing_max_velocity, + person_switch_cooldown=settings.rendering.smart_framing_person_switch_cooldown ) def render( @@ -436,12 +438,10 @@ class VideoRenderer: audio_samples=audio_samples ) - # Apply smart framing based on detected layout - use_split_screen = framing_plan.layout_mode in ["dual_split", "grid"] + # Apply smart framing (always single-person focus) video_clip = self.smart_framer.apply_framing( video_clip=subclip, - framing_plan=framing_plan, - use_split_screen=use_split_screen + framing_plan=framing_plan ) logger.info(f"Smart framing applied: layout={framing_plan.layout_mode}, " @@ -602,6 +602,10 @@ class VideoRenderer: if audio_clip is not None and audio_needs_close: audio_clip.close() + # Force garbage collection to free memory after rendering + import gc + gc.collect() + return str(output_path) def _materialize_audio( diff --git a/video_render/smart_framing.py b/video_render/smart_framing.py index 76087ba..8b5f52a 100644 --- a/video_render/smart_framing.py +++ b/video_render/smart_framing.py @@ -46,21 +46,20 @@ class SmartFramer: self, target_width: int = 1080, target_height: int = 1920, - frame_skip: int = 2, - smoothing_window: int = 15 + frame_skip: int = 1, + smoothing_window: int = 30, + max_velocity: int = 20, + person_switch_cooldown: int = 999999 ): self.target_width = target_width self.target_height = target_height self.target_aspect = target_height / target_width - - # Performance parameters - self.frame_skip = frame_skip # Process every Nth frame (CPU optimization) - - # Smoothing parameters + self.frame_skip = frame_skip self.smoothing_window = smoothing_window - self.max_velocity = 30 # pixels per frame (reduced for smoother transitions) + self.max_velocity = max_velocity + self.person_switch_cooldown = person_switch_cooldown - logger.info(f"Smart framer initialized (target: {target_width}x{target_height}, 
frame_skip={frame_skip})") + logger.info(f"Smart framer initialized (target: {target_width}x{target_height}, frame_skip={frame_skip}, smoothing={smoothing_window}, velocity={max_velocity}, cooldown={person_switch_cooldown})") def create_framing_plan( self, @@ -81,25 +80,21 @@ class SmartFramer: Returns: FramingPlan with all frame contexts and crop regions """ - analyzer = ContextAnalyzer() + analyzer = ContextAnalyzer(person_switch_cooldown=self.person_switch_cooldown) - # Detect speaking periods from audio if available speaking_periods = None if audio_samples is not None: speaking_periods = analyzer.audio_detector.detect_speaking_periods(audio_samples) - # Open video with error suppression for AV1 codec warnings import os os.environ['OPENCV_FFMPEG_CAPTURE_OPTIONS'] = 'loglevel;quiet' cap = cv2.VideoCapture(video_path) fps = cap.get(cv2.CAP_PROP_FPS) - # Calculate frame range start_frame = int(start_time * fps) end_frame = int(end_time * fps) - # Set to start frame cap.set(cv2.CAP_PROP_POS_FRAMES, start_frame) frame_contexts = [] @@ -113,7 +108,6 @@ class SmartFramer: if not ret: break - # Only process every Nth frame for performance (CPU optimization) if processed_count % self.frame_skip == 0: timestamp = frame_number / fps context = analyzer.analyze_frame(frame, timestamp, frame_number, speaking_periods) @@ -122,35 +116,36 @@ class SmartFramer: frame_number += 1 processed_count += 1 - # Get video dimensions before releasing capture source_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)) source_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)) cap.release() analyzer.close() - # Determine overall layout mode (most common) layout_modes = [ctx.layout_mode for ctx in frame_contexts] if layout_modes: overall_layout = max(set(layout_modes), key=layout_modes.count) else: overall_layout = "single" - # Calculate crop regions based on contexts - crop_regions = self._calculate_crop_regions( frame_contexts, source_width, source_height ) - return FramingPlan( + framing_plan = FramingPlan( frame_contexts=frame_contexts, crop_regions=crop_regions, layout_mode=overall_layout, fps=fps ) + import gc + gc.collect() + + return framing_plan + def _calculate_crop_regions( self, contexts: List[FrameContext], @@ -171,66 +166,122 @@ class SmartFramer: if not contexts: return [] - # Calculate ideal crop dimensions maintaining EXACT 9:16 aspect ratio source_aspect = source_width / source_height if source_aspect > self.target_aspect: - # Source is wider - crop horizontally (use full height) crop_height = source_height crop_width = int(crop_height / self.target_aspect) - # Ensure crop width fits within source if crop_width > source_width: crop_width = source_width crop_height = int(crop_width * self.target_aspect) else: - # Source is taller - crop vertically (use full width) crop_width = source_width crop_height = int(crop_width * self.target_aspect) - # Ensure crop height fits within source if crop_height > source_height: crop_height = source_height crop_width = int(crop_height / self.target_aspect) - # Calculate center points for each frame - # Since we now always focus on ONE person directly (not averaging), - # we can use the focus point directly without complex validation - center_xs = [] - center_ys = [] + safe_zone_margin_x = crop_width * 0.40 + safe_zone_margin_y = crop_height * 0.40 - for ctx in contexts: - if ctx.primary_focus: - # Primary focus is now always a single person's center, never averaged - # This means it will never be on the table/empty space - center_xs.append(ctx.primary_focus[0]) - 
center_ys.append(ctx.primary_focus[1]) + dead_zone_threshold = 100 + + if contexts and contexts[0].primary_focus: + current_crop_center_x = contexts[0].primary_focus[0] + current_crop_center_y = contexts[0].primary_focus[1] + else: + current_crop_center_x = source_width // 2 + current_crop_center_y = source_height // 2 + + center_xs = [current_crop_center_x] + center_ys = [current_crop_center_y] + + for ctx in contexts[1:]: + if ctx.primary_focus and ctx.selected_people and len(ctx.detected_faces) > 0: + primary_person_idx = ctx.selected_people[0] if ctx.selected_people else 0 + if primary_person_idx < len(ctx.detected_faces): + face = ctx.detected_faces[primary_person_idx] + + face_left = face.x + face_right = face.x + face.width + face_top = face.y + face_bottom = face.y + face.height + + crop_left = current_crop_center_x - crop_width // 2 + crop_right = current_crop_center_x + crop_width // 2 + crop_top = current_crop_center_y - crop_height // 2 + crop_bottom = current_crop_center_y + crop_height // 2 + + face_rel_left = face_left - crop_left + face_rel_right = face_right - crop_left + face_rel_top = face_top - crop_top + face_rel_bottom = face_bottom - crop_top + + face_left_safe = face_rel_left >= safe_zone_margin_x + face_right_safe = face_rel_right <= (crop_width - safe_zone_margin_x) + face_top_safe = face_rel_top >= safe_zone_margin_y + face_bottom_safe = face_rel_bottom <= (crop_height - safe_zone_margin_y) + + face_fully_visible = face_left_safe and face_right_safe and face_top_safe and face_bottom_safe + + if face_fully_visible: + center_xs.append(current_crop_center_x) + center_ys.append(current_crop_center_y) + else: + shift_x = 0 + shift_y = 0 + + if not face_left_safe: + shift_x = face_rel_left - safe_zone_margin_x + elif not face_right_safe: + shift_x = face_rel_right - (crop_width - safe_zone_margin_x) + + if not face_top_safe: + shift_y = face_rel_top - safe_zone_margin_y + elif not face_bottom_safe: + shift_y = face_rel_bottom - (crop_height - safe_zone_margin_y) + + if abs(shift_x) > dead_zone_threshold: + current_crop_center_x += shift_x + if abs(shift_y) > dead_zone_threshold: + current_crop_center_y += shift_y + + center_xs.append(current_crop_center_x) + center_ys.append(current_crop_center_y) + else: + center_xs.append(current_crop_center_x) + center_ys.append(current_crop_center_y) else: - # Default to center only if no faces detected at all - center_xs.append(source_width // 2) - center_ys.append(source_height // 2) + center_xs.append(current_crop_center_x) + center_ys.append(current_crop_center_y) - # Smooth the center points - if len(center_xs) > self.smoothing_window: - kernel_size = min(self.smoothing_window, len(center_xs)) - if kernel_size % 2 == 0: - kernel_size -= 1 + if len(center_xs) > 1: + alpha = 0.002 + smoothed_xs = [center_xs[0]] + smoothed_ys = [center_ys[0]] + for i in range(1, len(center_xs)): + if center_xs[i] != center_xs[i-1] or center_ys[i] != center_ys[i-1]: + smoothed_xs.append(alpha * center_xs[i] + (1 - alpha) * smoothed_xs[i-1]) + smoothed_ys.append(alpha * center_ys[i] + (1 - alpha) * smoothed_ys[i-1]) + else: + smoothed_xs.append(smoothed_xs[i-1]) + smoothed_ys.append(smoothed_ys[i-1]) + center_xs = smoothed_xs + center_ys = smoothed_ys - center_xs = signal.medfilt(center_xs, kernel_size=kernel_size).tolist() - center_ys = signal.medfilt(center_ys, kernel_size=kernel_size).tolist() + center_xs = self._limit_velocity(center_xs, 2) + center_ys = self._limit_velocity(center_ys, 2) - # Limit velocity (prevent jarring movements) - 
center_xs = self._limit_velocity(center_xs, self.max_velocity) - center_ys = self._limit_velocity(center_ys, self.max_velocity) + center_xs = self._apply_dead_zone(center_xs, 5) + center_ys = self._apply_dead_zone(center_ys, 5) - # Convert to crop regions crop_regions = [] for center_x, center_y in zip(center_xs, center_ys): - # Calculate top-left corner x = int(center_x - crop_width // 2) y = int(center_y - crop_height // 2) - # Clamp to valid bounds x = max(0, min(x, source_width - crop_width)) y = max(0, min(y, source_height - crop_height)) @@ -241,8 +292,37 @@ class SmartFramer: height=crop_height )) + center_xs.clear() + center_ys.clear() + return crop_regions + def _apply_dead_zone(self, positions: List[float], threshold: float) -> List[float]: + """ + Apply dead zone to eliminate micro-movements. + If change is smaller than threshold, keep previous position. + + Args: + positions: List of positions + threshold: Minimum change needed to move (pixels) + + Returns: + Positions with dead zone applied + """ + if len(positions) <= 1: + return positions + + filtered = [positions[0]] + + for i in range(1, len(positions)): + delta = abs(positions[i] - filtered[i - 1]) + if delta < threshold: + filtered.append(filtered[i - 1]) + else: + filtered.append(positions[i]) + + return filtered + def _limit_velocity(self, positions: List[float], max_velocity: float) -> List[float]: """ Limit the velocity of position changes. @@ -271,33 +351,20 @@ class SmartFramer: def apply_framing( self, video_clip: VideoFileClip, - framing_plan: FramingPlan, - use_split_screen: bool = False + framing_plan: FramingPlan ) -> VideoClip: """ Apply smart framing to a video clip. + Always uses single-person focus (no split screen). Args: video_clip: Source video clip framing_plan: Framing plan to apply - use_split_screen: Whether to use split screen for multiple people Returns: Reframed video clip """ - # Handle different layout modes - if framing_plan.layout_mode in ["single", "single_speaker"]: - # Single person or single speaker - use focused single framing - return self._apply_single_framing(video_clip, framing_plan) - elif framing_plan.layout_mode == "dual_split" and use_split_screen: - # Two people in conversation - use split screen - return self._apply_split_screen(video_clip, framing_plan) - elif framing_plan.layout_mode == "grid" and use_split_screen: - # 3+ people - use grid layout - return self._apply_grid_layout(video_clip, framing_plan) - else: - # Fallback to single framing - return self._apply_single_framing(video_clip, framing_plan) + return self._apply_single_framing(video_clip, framing_plan) def _apply_single_framing( self, @@ -315,12 +382,9 @@ class SmartFramer: Reframed video clip """ def make_frame(t): - # Get the original frame frame = video_clip.get_frame(t) - # Ensure we have valid crop regions if not framing_plan.crop_regions: - # Fallback: return center crop h, w = frame.shape[:2] crop_h = int(w * self.target_aspect) crop_w = w @@ -331,41 +395,32 @@ class SmartFramer: x = (w - crop_w) // 2 cropped = frame[y:y + crop_h, x:x + crop_w] else: - # Calculate exact frame index with decimal precision for interpolation exact_frame_idx = (t * framing_plan.fps) / self.frame_skip - # Get the two adjacent analyzed frames idx_floor = int(exact_frame_idx) idx_ceil = idx_floor + 1 - # Interpolation factor (0.0 to 1.0) alpha = exact_frame_idx - idx_floor - # Clamp indices to valid range idx_floor = max(0, min(idx_floor, len(framing_plan.crop_regions) - 1)) idx_ceil = max(0, min(idx_ceil, 
len(framing_plan.crop_regions) - 1)) - # Get crop regions crop1 = framing_plan.crop_regions[idx_floor] crop2 = framing_plan.crop_regions[idx_ceil] - # Linear interpolation between crop regions x = int(crop1.x * (1 - alpha) + crop2.x * alpha) y = int(crop1.y * (1 - alpha) + crop2.y * alpha) width = int(crop1.width * (1 - alpha) + crop2.width * alpha) height = int(crop1.height * (1 - alpha) + crop2.height * alpha) - # Ensure crop stays within frame bounds h, w = frame.shape[:2] x = max(0, min(x, w - width)) y = max(0, min(y, h - height)) width = min(width, w - x) height = min(height, h - y) - # Crop the frame cropped = frame[y:y + height, x:x + width] - # Resize to target dimensions resized = cv2.resize( cropped, (self.target_width, self.target_height), @@ -374,7 +429,6 @@ class SmartFramer: return resized - # MoviePy 2.x compatible way to create VideoClip new_clip = VideoClip(duration=video_clip.duration) new_clip.size = (self.target_width, self.target_height) new_clip.frame_function = make_frame @@ -397,13 +451,10 @@ class SmartFramer: """ def make_frame(t): frame = video_clip.get_frame(t) - # Calculate exact frame index with decimal precision for smooth interpolation exact_frame_idx = (t * framing_plan.fps) / self.frame_skip frame_idx = int(exact_frame_idx) - # Ensure we have valid contexts if not framing_plan.frame_contexts: - # Fallback to simple center crop h, w = frame.shape[:2] crop_h = int(w * self.target_aspect) crop_w = w @@ -415,107 +466,81 @@ class SmartFramer: cropped = frame[y:y + crop_h, x:x + crop_w] return cv2.resize(cropped, (self.target_width, self.target_height), interpolation=cv2.INTER_LINEAR) - # Clamp index to valid range frame_idx = max(0, min(frame_idx, len(framing_plan.frame_contexts) - 1)) context = framing_plan.frame_contexts[frame_idx] - # Create output frame output = np.zeros((self.target_height, self.target_width, 3), dtype=np.uint8) - if len(context.detected_faces) >= 2: - # Split vertically 50/50 (two columns) - half_width = self.target_width // 2 + if context.selected_people and len(context.selected_people) >= 2: + selected_faces = [context.detected_faces[i] for i in context.selected_people[:2] + if i < len(context.detected_faces)] - # Select the 2 most relevant faces - # Priority: ALWAYS show active speaker first + most confident other person - if context.active_speakers and len(context.active_speakers) >= 1: - # Get the PRIMARY speaker (most confident among active speakers) - speaker_faces = [context.detected_faces[i] for i in context.active_speakers - if i < len(context.detected_faces)] + if len(selected_faces) >= 2: + faces = sorted(selected_faces, key=lambda f: f.center_x) + left_face = faces[0] + right_face = faces[1] - primary_speaker = max(speaker_faces, key=lambda f: f.confidence) + for idx, face in enumerate([left_face, right_face]): - # Get OTHER faces (not the primary speaker) - other_faces = [f for f in context.detected_faces if f != primary_speaker] + half_width = self.target_width // 2 + half_aspect = self.target_height / half_width # Aspect ratio for half - if len(speaker_faces) >= 2: - # Multiple speakers: show primary + second most confident speaker - other_speakers = [f for f in speaker_faces if f != primary_speaker] - secondary_person = max(other_speakers, key=lambda f: f.confidence) - elif other_faces: - # One speaker: show speaker + most confident other person - secondary_person = max(other_faces, key=lambda f: f.confidence) - else: - # Fallback: only one person detected - secondary_person = primary_speaker + face_width = 
max(face.width, frame.shape[1] // 4)  # At least 1/4 of frame width
+                    crop_width = int(face_width * 2.5)  # Add padding around face
+                    crop_height = int(crop_width * half_aspect)  # Maintain correct aspect
-                selected_faces = [primary_speaker, secondary_person]
+                    max_crop_width = frame.shape[1] // 2  # Half the source width
+                    max_crop_height = frame.shape[0]  # Full source height
+
+                    if crop_width > max_crop_width:
+                        crop_width = max_crop_width
+                        crop_height = int(crop_width * half_aspect)
+
+                    if crop_height > max_crop_height:
+                        crop_height = max_crop_height
+                        crop_width = int(crop_height / half_aspect)
+
+                    x = max(0, face.center_x - crop_width // 2)
+                    y = max(0, face.center_y - crop_height // 2)
+
+                    x = min(x, frame.shape[1] - crop_width)
+                    y = min(y, frame.shape[0] - crop_height)
+
+                    cropped = frame[y:y + crop_height, x:x + crop_width]
+                    resized = cv2.resize(
+                        cropped,
+                        (half_width, self.target_height),
+                        interpolation=cv2.INTER_LINEAR
+                    )
+
+                    x_offset = idx * half_width
+                    output[:, x_offset:x_offset + half_width] = resized
             else:
-                # No speakers: take 2 most confident faces
-                selected_faces = sorted(context.detected_faces, key=lambda f: f.confidence, reverse=True)[:2]
-
-            # Sort selected faces by horizontal position for consistent left/right placement
-            faces = sorted(selected_faces, key=lambda f: f.center_x)
-            left_face = faces[0]
-            right_face = faces[1]
-
-            # Process each person's frame
-            for idx, face in enumerate([left_face, right_face]):
-                # Calculate crop region focused on this person
-                # Each person gets half the width, full target aspect ratio (9:16)
-                # This ensures NO distortion when resizing
-
-                # For split screen: each side is half_width x full_height
-                # We need to maintain 9:16 aspect for each half
-                half_width = self.target_width // 2
-                half_aspect = self.target_height / half_width  # Aspect ratio for half
-
-                # Determine crop size based on face with padding
-                face_width = max(face.width, frame.shape[1] // 4)  # At least 1/4 of frame width
-                crop_width = int(face_width * 2.5)  # Add padding around face
-                crop_height = int(crop_width * half_aspect)  # Maintain correct aspect
-
-                # Ensure crop fits in frame, maintaining aspect ratio
-                max_crop_width = frame.shape[1] // 2  # Half the source width
-                max_crop_height = frame.shape[0]  # Full source height
-
-                # If crop is too wide, scale down proportionally
-                if crop_width > max_crop_width:
-                    crop_width = max_crop_width
-                    crop_height = int(crop_width * half_aspect)
-
-                # If crop is too tall, scale down proportionally
-                if crop_height > max_crop_height:
-                    crop_height = max_crop_height
-                    crop_width = int(crop_height / half_aspect)
-
-                # Center crop on face
-                x = max(0, face.center_x - crop_width // 2)
-                y = max(0, face.center_y - crop_height // 2)
-
-                # Clamp to frame boundaries
-                x = min(x, frame.shape[1] - crop_width)
-                y = min(y, frame.shape[0] - crop_height)
-
-                # Extract and resize crop
-                cropped = frame[y:y + crop_height, x:x + crop_width]
-                resized = cv2.resize(
+                if framing_plan.crop_regions:
+                    crop_idx = max(0, min(frame_idx, len(framing_plan.crop_regions) - 1))
+                    crop = framing_plan.crop_regions[crop_idx]
+                    cropped = frame[crop.y:crop.y + crop.height, crop.x:crop.x + crop.width]
+                else:
+                    h, w = frame.shape[:2]
+                    crop_h = int(w * self.target_aspect)
+                    crop_w = w
+                    if crop_h > h:
+                        crop_h = h
+                        crop_w = int(h / self.target_aspect)
+                    y = (h - crop_h) // 2
+                    x = (w - crop_w) // 2
+                    cropped = frame[y:y + crop_h, x:x + crop_w]
+                output = cv2.resize(
                     cropped,
-                    (half_width, self.target_height),
+                    (self.target_width, self.target_height),
                     interpolation=cv2.INTER_LINEAR
                 )
-
-                # Place in output at appropriate horizontal position
-                x_offset = idx * half_width
-                output[:, x_offset:x_offset + half_width] = resized
         else:
-            # Fall back to single framing
             if framing_plan.crop_regions:
                 crop_idx = max(0, min(frame_idx, len(framing_plan.crop_regions) - 1))
                 crop = framing_plan.crop_regions[crop_idx]
                 cropped = frame[crop.y:crop.y + crop.height, crop.x:crop.x + crop.width]
             else:
-                # Fallback to center crop if no crop regions available
                 h, w = frame.shape[:2]
                 crop_h = int(w * self.target_aspect)
                 crop_w = w
@@ -533,7 +558,6 @@ class SmartFramer:

             return output

-        # MoviePy 2.x compatible way to create VideoClip
         new_clip = VideoClip(duration=video_clip.duration)
         new_clip.size = (self.target_width, self.target_height)
         new_clip.frame_function = make_frame
@@ -556,13 +580,10 @@ class SmartFramer:
         """
         def make_frame(t):
             frame = video_clip.get_frame(t)
-            # Calculate exact frame index with decimal precision for smooth interpolation
             exact_frame_idx = (t * framing_plan.fps) / self.frame_skip
             frame_idx = int(exact_frame_idx)

-            # Ensure we have valid contexts
             if not framing_plan.frame_contexts:
-                # Fallback to simple center crop
                 h, w = frame.shape[:2]
                 crop_h = int(w * self.target_aspect)
                 crop_w = w
@@ -574,7 +595,6 @@ class SmartFramer:
                 cropped = frame[y:y + crop_h, x:x + crop_w]
                 return cv2.resize(cropped, (self.target_width, self.target_height), interpolation=cv2.INTER_LINEAR)

-            # Clamp index to valid range
             frame_idx = max(0, min(frame_idx, len(framing_plan.frame_contexts) - 1))
             context = framing_plan.frame_contexts[frame_idx]
@@ -583,23 +603,18 @@ class SmartFramer:
             num_faces = len(context.detected_faces)

             if num_faces >= 3:
-                # Create 2x2 grid
                 cell_width = self.target_width // 2
                 cell_height = self.target_height // 2

                 for idx, face in enumerate(context.detected_faces[:4]):
-                    # Calculate grid position
                     row = idx // 2
                     col = idx % 2

-                    # Each grid cell maintains aspect ratio (square in this case: cell_width = cell_height)
                     cell_aspect = cell_height / cell_width

-                    # Crop around face with correct aspect ratio
                     crop_width = frame.shape[1] // 2
                     crop_height = int(crop_width * cell_aspect)

-                    # Ensure crop fits in frame, maintaining aspect
                     max_crop_width = frame.shape[1] // 2
                     max_crop_height = frame.shape[0] // 2
@@ -611,11 +626,9 @@ class SmartFramer:
                         crop_height = max_crop_height
                         crop_width = int(crop_height / cell_aspect)

-                    # Center crop on face
                     x = max(0, face.center_x - crop_width // 2)
                     y = max(0, face.center_y - crop_height // 2)

-                    # Clamp to frame boundaries
                     x = min(x, frame.shape[1] - crop_width)
                     y = min(y, frame.shape[0] - crop_height)
@@ -626,18 +639,15 @@ class SmartFramer:
                         interpolation=cv2.INTER_LINEAR
                     )

-                    # Place in grid
                     y_offset = row * cell_height
                     x_offset = col * cell_width
                     output[y_offset:y_offset + cell_height, x_offset:x_offset + cell_width] = resized
             else:
-                # Fall back to single framing
                 if framing_plan.crop_regions:
                     crop_idx = max(0, min(frame_idx, len(framing_plan.crop_regions) - 1))
                     crop = framing_plan.crop_regions[crop_idx]
                     cropped = frame[crop.y:crop.y + crop.height, crop.x:crop.x + crop.width]
                 else:
-                    # Fallback to center crop if no crop regions available
                     h, w = frame.shape[:2]
                     crop_h = int(w * self.target_aspect)
                     crop_w = w
@@ -655,7 +665,6 @@ class SmartFramer:

             return output

-        # MoviePy 2.x compatible way to create VideoClip
         new_clip = VideoClip(duration=video_clip.duration)
         new_clip.size = (self.target_width, self.target_height)
         new_clip.frame_function = make_frame
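Reviewer note on the split-screen branch above: each half of the output keeps the portrait aspect before resizing, so faces are not stretched. A minimal standalone sketch of that arithmetic, with assumed numbers (1080x1920 target, 1920x1080 source, and a hypothetical 480 px detected face width; not part of the patch):

    # Illustrative only; mirrors the clamping used in the split-screen branch.
    target_width, target_height = 1080, 1920        # assumed 9:16 output
    frame_w, frame_h = 1920, 1080                    # assumed source frame
    half_width = target_width // 2                   # 540 px per person
    half_aspect = target_height / half_width         # ~3.56

    face_width = max(480, frame_w // 4)              # hypothetical face width
    crop_width = int(face_width * 2.5)               # 1200
    crop_height = int(crop_width * half_aspect)      # 4266, taller than the source

    crop_width = min(crop_width, frame_w // 2)       # 960
    crop_height = int(crop_width * half_aspect)      # 3413, still too tall
    if crop_height > frame_h:
        crop_height = frame_h                        # 1080
        crop_width = int(crop_height / half_aspect)  # 303

    print(crop_width, crop_height)                   # 303 1080 -> resized to 540x1920 per half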
diff --git a/video_render/transcription.py b/video_render/transcription.py
index 5e748bf..da4ab9c 100644
--- a/video_render/transcription.py
+++ b/video_render/transcription.py
@@ -6,6 +6,7 @@ from dataclasses import dataclass
 from pathlib import Path
 from typing import List, Optional

+import numpy as np
 from faster_whisper import WhisperModel

 from video_render.config import Settings
@@ -56,6 +57,17 @@ class TranscriptionService:
         )
         return self._model

+    def unload_model(self) -> None:
+        """Unload the Whisper model to free memory (reduces RAM usage by 1-3GB)."""
+        if self._model is not None:
+            logger.info("Descarregando modelo Whisper para liberar memória...")
+            del self._model
+            self._model = None
+            # Force garbage collection to immediately free GPU/CPU memory
+            import gc
+            gc.collect()
+            logger.info("Modelo Whisper descarregado com sucesso")
+
     def transcribe(self, audio_path: Path, output_dir: Optional[Path] = None) -> TranscriptionResult:
         if output_dir is not None:
             existing_transcription = self.load(output_dir)
@@ -63,7 +75,34 @@ class TranscriptionService:
                 logger.info("Transcrição já existe em %s, reutilizando...", output_dir)
                 return existing_transcription

-        logger.info("Iniciando transcrição do áudio com FasterWhisper...")
+        # Get audio duration to decide if we need chunked processing
+        audio_duration = self._get_audio_duration(audio_path)
+        chunk_duration_minutes = 30  # Process in 30-minute chunks for long videos
+        chunk_duration_seconds = chunk_duration_minutes * 60
+
+        # For videos longer than 30 minutes, use chunked processing to avoid OOM
+        if audio_duration > chunk_duration_seconds:
+            logger.info(
+                f"Áudio longo detectado ({audio_duration/60:.1f} min). "
+                f"Processando em chunks de {chunk_duration_minutes} min para evitar erro de memória..."
+            )
+            return self._transcribe_chunked(audio_path, chunk_duration_seconds)
+        else:
+            logger.info(f"Iniciando transcrição do áudio ({audio_duration/60:.1f} min) com FasterWhisper...")
+            return self._transcribe_full(audio_path)
+
+    def _get_audio_duration(self, audio_path: Path) -> float:
+        """Get audio duration in seconds."""
+        try:
+            from moviepy.audio.io.AudioFileClip import AudioFileClip
+            with AudioFileClip(str(audio_path)) as audio:
+                return audio.duration or 0.0
+        except Exception as e:
+            logger.warning(f"Falha ao obter duração do áudio, assumindo curto: {e}")
+            return 0.0  # Assume short if we can't determine
+
+    def _transcribe_full(self, audio_path: Path) -> TranscriptionResult:
+        """Transcribe entire audio at once (for shorter videos)."""
         model = self._load_model()
         segments, _ = model.transcribe(
             str(audio_path),
@@ -97,6 +136,101 @@ class TranscriptionService:
             full_text=" ".join(full_text_parts).strip(),
         )

+    def _transcribe_chunked(self, audio_path: Path, chunk_duration: float) -> TranscriptionResult:
+        """Transcribe audio in chunks to avoid OOM on long videos."""
+        import subprocess
+        from moviepy.audio.io.AudioFileClip import AudioFileClip
+
+        model = self._load_model()
+        all_segments: List[TranscriptSegment] = []
+        full_text_parts: List[str] = []
+        segment_id_counter = 0
+
+        # Get total duration
+        total_duration = self._get_audio_duration(audio_path)
+        num_chunks = int(np.ceil(total_duration / chunk_duration))
+
+        logger.info(f"Processando áudio em {num_chunks} chunks...")
+
+        for chunk_idx in range(num_chunks):
+            start_time = chunk_idx * chunk_duration
+            end_time = min((chunk_idx + 1) * chunk_duration, total_duration)
+
+            logger.info(
+                f"Processando chunk {chunk_idx + 1}/{num_chunks} "
+                f"({start_time/60:.1f}min - {end_time/60:.1f}min)..."
+            )
+
+            # Extract chunk using ffmpeg directly (more reliable than moviepy subclip)
+            temp_chunk_path = audio_path.parent / f"temp_chunk_{chunk_idx}.wav"
+            try:
+                # Use ffmpeg to extract the chunk
+                chunk_duration_actual = end_time - start_time
+                ffmpeg_cmd = [
+                    'ffmpeg',
+                    '-y',  # Overwrite output file
+                    '-ss', str(start_time),  # Start time
+                    '-i', str(audio_path),  # Input file
+                    '-t', str(chunk_duration_actual),  # Duration
+                    '-acodec', 'pcm_s16le',  # Audio codec
+                    '-ar', '44100',  # Sample rate
+                    '-ac', '2',  # Stereo
+                    '-loglevel', 'error',  # Only show errors
+                    str(temp_chunk_path)
+                ]
+
+                subprocess.run(ffmpeg_cmd, check=True, capture_output=True)
+
+                # Transcribe chunk
+                segments, _ = model.transcribe(
+                    str(temp_chunk_path),
+                    beam_size=5,
+                    word_timestamps=True,
+                )
+
+                # Process segments with time offset
+                for segment in segments:
+                    words = [
+                        WordTiming(
+                            start=w.start + start_time,
+                            end=w.end + start_time,
+                            word=w.word.strip()
+                        )
+                        for w in segment.words or []
+                        if w.word.strip()
+                    ]
+                    text = segment.text.strip()
+                    full_text_parts.append(text)
+                    all_segments.append(
+                        TranscriptSegment(
+                            id=segment_id_counter,
+                            start=segment.start + start_time,
+                            end=segment.end + start_time,
+                            text=text,
+                            words=words,
+                        )
+                    )
+                    segment_id_counter += 1
+
+                # Force garbage collection after each chunk
+                import gc
+                gc.collect()
+
+            except subprocess.CalledProcessError as e:
+                logger.error(f"Erro ao extrair chunk {chunk_idx}: {e.stderr.decode() if e.stderr else str(e)}")
+                raise
+            finally:
+                # Clean up temp chunk
+                if temp_chunk_path.exists():
+                    temp_chunk_path.unlink()
+
+        logger.info(f"Transcrição em chunks concluída: {len(all_segments)} segmentos processados")
+
+        return TranscriptionResult(
+            segments=all_segments,
+            full_text=" ".join(full_text_parts).strip(),
+        )
+
     @staticmethod
     def persist(result: TranscriptionResult, destination: Path) -> None:
         json_path = destination / "transcription.json"
diff --git a/video_render/utils.py b/video_render/utils.py
index 8d8a4fd..04fbba6 100644
--- a/video_render/utils.py
+++ b/video_render/utils.py
@@ -23,16 +23,58 @@ def ensure_workspace(root: Path, folder_name: str) -> Path:

 def remove_paths(paths: Iterable[Path]) -> None:
+    import logging
+    import time
+
+    logger = logging.getLogger(__name__)
+
     for path in paths:
         if not path.exists():
             continue
-        if path.is_file() or path.is_symlink():
-            path.unlink(missing_ok=True)
-        else:
-            for child in sorted(path.rglob("*"), reverse=True):
-                if child.is_file() or child.is_symlink():
-                    child.unlink(missing_ok=True)
-                elif child.is_dir():
-                    child.rmdir()
-            path.rmdir()
+
+        # Try to remove with retries and better error handling
+        max_retries = 3
+        for attempt in range(max_retries):
+            try:
+                if path.is_file() or path.is_symlink():
+                    path.unlink(missing_ok=True)
+                else:
+                    for child in sorted(path.rglob("*"), reverse=True):
+                        if child.is_file() or child.is_symlink():
+                            try:
+                                child.unlink(missing_ok=True)
+                            except PermissionError:
+                                logger.warning(f"Não foi possível deletar {child}: sem permissão")
+                                # Try to change permissions and retry
+                                try:
+                                    child.chmod(0o777)
+                                    child.unlink(missing_ok=True)
+                                except Exception as e:
+                                    logger.warning(f"Falha ao forçar deleção de {child}: {e}")
+                        elif child.is_dir():
+                            try:
+                                child.rmdir()
+                            except (PermissionError, OSError) as e:
+                                logger.warning(f"Não foi possível remover diretório {child}: {e}")
+
+                    try:
+                        path.rmdir()
+                    except (PermissionError, OSError) as e:
+                        logger.warning(f"Não foi possível remover diretório {path}: {e}")
+                break  # Success, exit retry loop
+
+            except PermissionError as e:
+                if attempt < max_retries - 1:
+                    logger.warning(f"Tentativa {attempt + 1}/{max_retries} falhou ao deletar {path}: {e}. Tentando novamente...")
+                    time.sleep(0.5)  # Wait a bit before retry
+                    # Try to change permissions
+                    try:
+                        path.chmod(0o777)
+                    except Exception:
+                        pass
+                else:
+                    logger.error(f"Não foi possível deletar {path} após {max_retries} tentativas: {e}")
+            except Exception as e:
+                logger.error(f"Erro inesperado ao deletar {path}: {e}")
+                break  # Don't retry on unexpected errors
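Review suggestion only, not part of the patch: the retry-and-chmod pattern in remove_paths could arguably be condensed with shutil.rmtree's error hook. A hedged sketch of that alternative (the names remove_tree and _force_writable_and_retry are hypothetical; note that onerror is deprecated in favour of onexc from Python 3.12):

    import os
    import shutil
    from pathlib import Path

    def _force_writable_and_retry(func, path, exc_info):
        """shutil.rmtree onerror hook: make the entry writable and try once more."""
        os.chmod(path, 0o777)
        func(path)

    def remove_tree(path: Path) -> None:
        # Files and symlinks are unlinked directly; directories are removed
        # recursively, retrying once per entry after loosening permissions.
        if path.is_file() or path.is_symlink():
            path.unlink(missing_ok=True)
        elif path.is_dir():
            shutil.rmtree(path, onerror=_force_writable_and_retry)

Whether this matches the exact logging behaviour of the patched remove_paths would need to be confirmed by the author.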