From 3f7329869d4675f6e1753f693b960d398ebc7bf3 Mon Sep 17 00:00:00 2001 From: LeoMortari Date: Sat, 3 Jan 2026 19:42:23 -0300 Subject: [PATCH] =?UTF-8?q?Ajusta=20contexto,=20falas=20e=20foco,=20tremul?= =?UTF-8?q?a=C3=A7=C3=A3o=20do=20video=20e=20demais=20bugs?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- docker-compose.yml | 2 +- prompts/generate.txt | 181 +++--- video_render/config.py | 15 +- video_render/context_detection.py | 301 +++++----- video_render/llm.py | 4 +- video_render/rendering.py | 7 +- video_render/smart_framing.py | 877 +++++++++++++++++++++++------- 7 files changed, 932 insertions(+), 455 deletions(-) diff --git a/docker-compose.yml b/docker-compose.yml index 200f4a0..4dba674 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -9,7 +9,7 @@ services: - RABBITMQ_PASS=${RABBITMQ_PASS} - OPENROUTER_API_URL=${OPENROUTER_API_URL:-https://openrouter.ai/api/v1/chat/completions} - OPENROUTER_API_KEY=${OPENROUTER_API_KEY} - - OPENROUTER_MODEL=${OPENROUTER_MODEL:-openai/gpt-oss-20b:free} + - OPENROUTER_MODEL=${OPENROUTER_MODEL:-mistralai/mistral-small-3.1-24b-instruct:free} - OPENROUTER_PROMPT_PATH=${OPENROUTER_PROMPT_PATH:-prompts/generate.txt} - FASTER_WHISPER_MODEL_SIZE=${FASTER_WHISPER_MODEL_SIZE:-medium} - SMART_FRAMING_SMOOTHING_WINDOW=${SMART_FRAMING_SMOOTHING_WINDOW:-30} diff --git a/prompts/generate.txt b/prompts/generate.txt index bd90862..ab7556f 100644 --- a/prompts/generate.txt +++ b/prompts/generate.txt @@ -1,118 +1,111 @@ -Você é especialista em viralidade de redes sociais (TikTok, Instagram Reels, YouTube Shorts). Sua missão: EXTRAIR O MÁXIMO de clips virais possíveis, priorizando QUANTIDADE + QUALIDADE. +# TAREFA: Extrair clips virais de uma transcrição de vídeo -🎯 OBJETIVO: Transformar cada vídeo em MÚLTIPLOS clips que podem viralizar +Você é um especialista em conteúdo viral para TikTok, Instagram Reels e YouTube Shorts. -PROCESSO DE ANÁLISE: -1. Mapear TODOS os potenciais trechos virais na transcrição -2. Avaliar cada trecho usando sistema de pontuação abaixo -3. Rankear do maior para menor score viral -4. Selecionar TODOS os trechos com score ≥ 60 (não seja conservador!) +## REGRA MAIS IMPORTANTE - DURAÇÃO DOS CLIPS -SISTEMA DE PONTUAÇÃO VIRAL (0-100 pontos): +**CADA CLIP DEVE TER ENTRE 60 E 120 SEGUNDOS DE DURAÇÃO.** -🪝 GANCHO INICIAL (0-30 pontos) - CRÍTICO PARA VIRALIZAÇÃO: -[30] Frase CHOCANTE, pergunta POLÊMICA ou promessa OUSADA nos primeiros 3 segundos -[25] Hook forte: "Você não vai acreditar...", "O segredo que ninguém conta...", "Isso mudou tudo..." 
-[20] Pergunta intrigante ou afirmação controversa -[15] História interessante mas gancho fraco -[10] Início genérico mas aceitável -[0] "Oi", "então", "bem", silêncio - DESCARTAR +- MÍNIMO ABSOLUTO: 60 segundos (end - start >= 60) +- MÁXIMO: 120 segundos (end - start <= 120) +- IDEAL: 60-90 segundos -🔥 GATILHO EMOCIONAL (0-25 pontos): -[25] Emoção EXTREMA: raiva, choque, riso intenso, WTF moment, revelação bombástica -[20] Emoção forte: surpresa, indignação, humor, curiosidade intensa -[15] Emoção moderada: interesse, leve humor, insight interessante -[10] Emoção fraca: informativo sem impacto -[0] Monótono, técnico, sem apelo emocional - EVITAR +**CLIPS COM MENOS DE 60 SEGUNDOS SERÃO REJEITADOS PELO SISTEMA.** -💎 VALOR/UTILIDADE (0-20 pontos): -[20] Segredo VALIOSO, insight transformador, informação EXCLUSIVA -[15] Ensina algo prático e IMEDIATAMENTE aplicável -[10] Opinião interessante ou perspectiva única -[5] Informação genérica ou conhecimento comum -[0] Nenhum valor prático, puro "enrolation" - DESCARTAR +Antes de incluir um clip, SEMPRE calcule: end - start >= 60 -📖 ESTRUTURA NARRATIVA (0-15 pontos): -[15] História COMPLETA com início, conflito/clímax e resolução satisfatória -[10] Segmento com começo e fim coerentes, faz sentido isolado -[5] Trecho com sentido mas cortado abruptamente -[0] Fragmento sem contexto - NÃO USAR +## QUANTIDADE DE CLIPS -⚡ RITMO E ENERGIA (0-10 pontos): -[10] DINÂMICO, sem pausas longas, alta energia, palavras impactantes -[7] Bom ritmo com pausas naturais curtas (< 2s) -[3] Ritmo lento mas aceitável -[0] Muitas pausas (> 3s), hesitações, monotonia - EVITAR +Baseado na duração total do vídeo: +- Até 10 min: 2-4 clips +- 10-20 min: 4-6 clips +- 20-30 min: 6-10 clips +- 30+ min: 8-15 clips -REGRAS DE QUANTIDADE (SER AGRESSIVO): -📊 Quantidade MÍNIMA por duração: -- 5-10 min: MÍNIMO 4-6 clips -- 10-15 min: MÍNIMO 6-8 clips -- 15-20 min: MÍNIMO 8-10 clips -- 20-30 min: MÍNIMO 10-15 clips -- 30+ min: MÍNIMO 15-20 clips +## CRITÉRIOS DE SELEÇÃO -🎯 REGRA DE OURO: 1 clip a cada 2-3 minutos de vídeo (NO MÍNIMO) -- Se encontrar momentos virais, SEMPRE selecione! -- Melhor ter 3 clips perfeitos que 10 clips bons +Um bom clip viral possui: -CRITÉRIOS DE SELEÇÃO: -- Score viral ≥ 60 pontos (idealmente ≥ 70) -- Duração ideal: 60-120s (formato ideal para Reels/Shorts) -- Duração mínima: 60s | Duração máxima: 120s -- Sem sobreposição temporal -- DEVE ter gancho forte nos primeiros 3 segundos -- Início e fim coerentes +1. GANCHO FORTE nos primeiros 3 segundos (pergunta, afirmação chocante, promessa) +2. EMOÇÃO (humor, surpresa, indignação, curiosidade) +3. VALOR (ensina algo, revela segredo, dá dica prática) +4. ESTRUTURA (início, meio e fim coerentes) +5. RITMO (sem pausas longas, dinâmico) -GANCHOS QUE FAZEM VIRALIZAR (use como filtro): -- "O que ninguém te conta sobre..." -- "O erro que 90% das pessoas cometem..." -- "Você não vai acreditar o que aconteceu..." 
-- Revelações chocantes ou contraintuitivas -- Antes vs Depois, transformações -- Segredos, bastidores, verdades ocultas -- Polêmicas, opiniões fortes, hot takes -- Histórias dramáticas com reviravolta -- Dicas práticas e acionáveis -- Momentos de humor genuíno +## O QUE EVITAR -❌ EVITE (mas não descarte se score alto): -- Introduções genéricos SEM gancho -- Trechos com pausas > 3s consecutivas -- Explicações técnicas SEM gancho emocional -- Segmentos sem conclusão clara -- Momentos de transição vazios +- Introduções genéricas ("oi pessoal", "então", "bem") +- Trechos com pausas longas (> 3 segundos de silêncio) +- Segmentos sem contexto ou conclusão +- Explicações técnicas monótonas -FORMATO JSON (retorne APENAS isto, SEM texto adicional): +## FORMATO DE RESPOSTA + +Retorne APENAS um JSON válido, sem texto antes ou depois: + +```json { "highlights": [ { - "start": , - "end": , - "summary": "Score: XX/100 | Gancho: [descreva] | Gatilho: [descreva]", + "start": 0.0, + "end": 75.0, + "summary": "Descrição do que acontece neste trecho" + }, + { + "start": 120.5, + "end": 195.0, + "summary": "Descrição do que acontece neste trecho" } ] } +``` -REGRAS TÉCNICAS: -- Float com ponto decimal (45.5 NÃO 45,5) -- Timestamps exatos dos segments fornecidos -- Ordem cronológica (start crescente) -- Summary conciso mas informativo (2-3 frases) +## REGRAS DO JSON -TAREFA PASSO A PASSO: -1. Leia transcrição completa -2. Identifique TODOS os momentos potencialmente virais -3. Avalie e pontue cada trecho (seja generoso!) -4. Rankear por score viral -5. Selecione TODOS com score ≥ 60 -6. Garanta mínimo de 1 clip a cada 5 minutos -7. Retorne JSON completo +- "start" e "end" são números decimais (float) em SEGUNDOS +- Use ponto como separador decimal (60.5, não 60,5) +- "summary" é uma descrição breve do conteúdo (1-2 frases) +- Clips em ordem cronológica (start crescente) +- Clips não podem se sobrepor -⚠️ IMPORTANTE: -- NÃO seja conservador! Se encontrou 10 momentos bons, retorne os 10! -- Pense em MAXIMIZAR alcance: mais clips = mais chances de viralizar -- Se vídeo tem conteúdo fraco, seja criterioso, mas SEMPRE retorne pelo menos 3-5 clips -- Priorize clips com GANCHOS FORTES - gancho fraco = baixo alcance +## CHECKLIST ANTES DE RESPONDER -🎯 MINDSET: Você é um criador de conteúdo viral. Seu objetivo é extrair MÁXIMO valor do vídeo original. +Para CADA clip, verifique: +- [ ] end - start >= 60 segundos? +- [ ] end - start <= 120 segundos? +- [ ] Tem gancho forte no início? +- [ ] Faz sentido isolado do resto do vídeo? +- [ ] JSON está válido? + +## EXEMPLO + +Se o vídeo tem 15 minutos e você encontrou 4 momentos virais: + +```json +{ + "highlights": [ + { + "start": 60.0, + "end": 120.0, + "summary": "Revelação sobre como economizar 50% nas compras" + }, + { + "start": 180.0, + "end": 255.0, + "summary": "História engraçada sobre cliente que tentou enganar a loja" + }, + { + "start": 400.0, + "end": 480.0, + "summary": "Dica prática de negociação com fornecedores" + }, + { + "start": 600.0, + "end": 690.0, + "summary": "Conclusão motivacional sobre empreendedorismo" + } + ] +} +``` + +Agora analise a transcrição fornecida e extraia os clips virais seguindo estas instruções. 
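The duration and formatting contract defined in the new prompt is enforced again at parse time. As a rough illustration of that contract (not code from this repository — `validate_highlights` is a hypothetical helper; the real checks live in `video_render/llm.py`, updated below to the same 60–120s bounds), the returned JSON can be filtered like this:

```python
import json

MIN_DURATION = 60.0   # prompt's hard minimum (also enforced in video_render/llm.py)
MAX_DURATION = 120.0  # prompt's hard maximum

def validate_highlights(raw_response: str) -> list[dict]:
    """Keep only clips that respect the 60-120s window, ordering and non-overlap rules."""
    highlights = json.loads(raw_response).get("highlights", [])
    valid, last_end = [], float("-inf")
    for clip in highlights:
        start, end = float(clip["start"]), float(clip["end"])
        duration = end - start
        if not (MIN_DURATION <= duration <= MAX_DURATION):
            continue  # outside the allowed clip length
        if start < last_end:
            continue  # overlaps or breaks chronological order
        valid.append({"start": start, "end": end, "summary": clip.get("summary", "")})
        last_end = end
    return valid
```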
diff --git a/video_render/config.py b/video_render/config.py index 0ca0c1e..6173f67 100644 --- a/video_render/config.py +++ b/video_render/config.py @@ -62,13 +62,16 @@ class RenderingSettings: subtitle_font_size: int = int(os.environ.get("RENDER_SUBTITLE_FONT_SIZE", 64)) caption_min_words: int = int(os.environ.get("CAPTION_MIN_WORDS", 2)) caption_max_words: int = int(os.environ.get("CAPTION_MAX_WORDS", 2)) - # Smart framing settings - CONTAINMENT TRACKING mode enable_smart_framing: bool = os.environ.get("ENABLE_SMART_FRAMING", "true").lower() in ("true", "1", "yes") - smart_framing_min_confidence: float = float(os.environ.get("SMART_FRAMING_MIN_CONFIDENCE", 0.3)) # Lowered for better cartoon detection - smart_framing_smoothing_window: int = int(os.environ.get("SMART_FRAMING_SMOOTHING_WINDOW", 30)) # Reduced - not needed with containment tracking - smart_framing_frame_skip: int = int(os.environ.get("SMART_FRAMING_FRAME_SKIP", 1)) # Process every frame for smooth 30 FPS tracking - smart_framing_max_velocity: int = int(os.environ.get("SMART_FRAMING_MAX_VELOCITY", 20)) # Moderate - only used during transitions - smart_framing_person_switch_cooldown: int = int(os.environ.get("SMART_FRAMING_PERSON_SWITCH_COOLDOWN", 999999)) # DISABLED - never switch people + smart_framing_min_confidence: float = float(os.environ.get("SMART_FRAMING_MIN_CONFIDENCE", 0.3)) + smart_framing_smoothing_window: int = int(os.environ.get("SMART_FRAMING_SMOOTHING_WINDOW", 30)) + smart_framing_frame_skip: int = int(os.environ.get("SMART_FRAMING_FRAME_SKIP", 1)) + smart_framing_max_velocity: int = int(os.environ.get("SMART_FRAMING_MAX_VELOCITY", 25)) + smart_framing_person_switch_cooldown: int = int(os.environ.get("SMART_FRAMING_PERSON_SWITCH_COOLDOWN", 30)) + smart_framing_response_time: float = float(os.environ.get("SMART_FRAMING_RESPONSE_TIME", 0.6)) + smart_framing_group_padding: float = float(os.environ.get("SMART_FRAMING_GROUP_PADDING", 0.15)) + smart_framing_max_zoom_out: float = float(os.environ.get("SMART_FRAMING_MAX_ZOOM_OUT", 2.0)) + smart_framing_dead_zone: int = int(os.environ.get("SMART_FRAMING_DEAD_ZONE", 60)) @dataclass(frozen=True) diff --git a/video_render/context_detection.py b/video_render/context_detection.py index ab5c203..3200a88 100644 --- a/video_render/context_detection.py +++ b/video_render/context_detection.py @@ -41,6 +41,18 @@ class PersonTracking: frame_number: int +@dataclass +class GroupBoundingBox: + """Bounding box containing all tracked faces.""" + x: int + y: int + width: int + height: int + center_x: int + center_y: int + face_count: int + + @dataclass class FrameContext: """Context information for a video frame.""" @@ -50,7 +62,8 @@ class FrameContext: active_speakers: List[int] # indices of speaking faces primary_focus: Optional[Tuple[int, int]] # (x, y) center point layout_mode: str # "single", "dual_split", "grid" - selected_people: List[int] = field(default_factory=list) # indices of people selected for display (max 2) + selected_people: List[int] = field(default_factory=list) # indices of people selected for display + group_bounds: Optional[GroupBoundingBox] = None # bounding box for all detected faces class MediaPipeDetector: @@ -385,10 +398,11 @@ class AudioActivityDetector: class ContextAnalyzer: """Analyzes video context to determine focus and layout.""" - def __init__(self, person_switch_cooldown: int = 30): + def __init__(self, person_switch_cooldown: int = 30, min_face_confidence: float = 0.3): self.detector = MediaPipeDetector() self.audio_detector = 
AudioActivityDetector() self.previous_faces: List[FaceDetection] = [] + self.min_face_confidence = min_face_confidence # Person tracking state self.current_selected_people: List[int] = [] # Indices of people currently on screen @@ -400,9 +414,9 @@ class ContextAnalyzer: self.stability_threshold = 20 # Frames needed to confirm a switch (increased for more stability) self.last_switched_people: List[int] = [] # People we just switched FROM - # Focus stability: track recent focus points for temporal smoothing self.focus_history: List[Tuple[int, int]] = [] - self.focus_history_size: int = 5 # Keep last 5 focus points for smoothing + self.focus_history_size: int = 20 + self.focus_dead_zone: int = 60 # Debug logging self.frame_log_interval = 30 # Log every N frames @@ -429,9 +443,11 @@ class ContextAnalyzer: FrameContext with detection results """ faces = self.detector.detect_face_landmarks(frame) + faces = [face for face in faces if face.confidence >= self.min_face_confidence] if faces else [] if not faces: faces = self.detector.detect_faces(frame) + faces = [face for face in faces if face.confidence >= self.min_face_confidence] if faces else [] # Determine who is speaking active_speakers = [] @@ -440,13 +456,13 @@ class ContextAnalyzer: for i, face in enumerate(faces): is_speaking = False - # Check audio-based speech detection - if has_audio_speech: - is_speaking = True - - # Check lip movement (visual speech detection) + # Prefer visual cues when multiple faces are present. if face.landmarks and len(self.previous_faces) > i: - is_speaking = is_speaking or self._detect_lip_movement(face, self.previous_faces[i]) + is_speaking = self._detect_lip_movement(face, self.previous_faces[i]) + + # Audio can confirm speech when there's only one face. + if has_audio_speech and len(faces) == 1: + is_speaking = True if is_speaking: active_speakers.append(i) @@ -456,26 +472,41 @@ class ContextAnalyzer: logger.info(f"Speech detection - Frame {frame_number}: audio_active={has_audio_speech}, " f"speakers={active_speakers}, total_faces={len(faces)}") - # Select THE person to focus on (always single person) - # Priority: 1) Who is speaking, 2) Who is most centered - selected_people = self._select_person_to_focus( - faces, - active_speakers, - frame_number, - frame.shape[1], # frame width for center calculation - frame.shape[0] # frame height for center calculation - ) + if active_speakers: + selected_people = active_speakers[:4] + if len(selected_people) == 1: + layout_mode = "single" + elif len(selected_people) == 2: + layout_mode = "dual_split" + else: + layout_mode = "grid" + else: + # Select THE person to focus on (always single person) + # Priority: 1) Who is speaking, 2) Who is most centered + selected_people = self._select_person_to_focus( + faces, + active_speakers, + frame_number, + frame.shape[1], # frame width for center calculation + frame.shape[0] # frame height for center calculation + ) + layout_mode = "single" - # Always use single-person layout (no split screen) - layout_mode = "single" + # Calculate group bounding box for ALL detected faces (multi-person support) + group_bounds = self._calculate_group_bounding_box(faces) - primary_focus = self._calculate_focus_point(faces, selected_people) + # For multi-person mode, use group center as primary focus + if group_bounds and group_bounds.face_count > 1: + primary_focus = (group_bounds.center_x, group_bounds.center_y) + else: + primary_focus = self._calculate_focus_point(faces, selected_people) # Debug logging every N frames if frame_number % 
self.frame_log_interval == 0: focus_reason = "speaker" if active_speakers else "no_speech_detected" + group_info = f", group={group_bounds.face_count} faces" if group_bounds else "" logger.info(f"Frame {frame_number}: {len(faces)} faces, " - f"{len(active_speakers)} speakers, focus={selected_people}, reason={focus_reason}") + f"{len(active_speakers)} speakers, focus={selected_people}, reason={focus_reason}{group_info}") self.previous_faces = faces @@ -486,7 +517,8 @@ class ContextAnalyzer: active_speakers=active_speakers, primary_focus=primary_focus, layout_mode=layout_mode, - selected_people=selected_people + selected_people=selected_people, + group_bounds=group_bounds ) def _detect_lip_movement(self, current_face: FaceDetection, previous_face: FaceDetection) -> bool: @@ -543,134 +575,68 @@ class ContextAnalyzer: self.current_selected_people = [] return [] - # If only 1 person, always focus on them if len(faces) == 1: self.current_selected_people = [0] return [0] - # Check if we can switch people (cooldown period) frames_since_last_switch = frame_number - self.last_switch_frame can_switch = frames_since_last_switch >= self.person_switch_cooldown - # Calculate frame center for distance comparison - frame_center_x = frame_width / 2 - frame_center_y = frame_height / 2 - - # ULTRA-STABLE MODE: Select ONE person at start, NEVER switch - # This completely eliminates switching-related instability desired_person_idx = None - # If we already have someone selected, ALWAYS KEEP THEM (never switch) - if self.current_selected_people and len(self.current_selected_people) > 0: - current_idx = self.current_selected_people[0] - if current_idx < len(faces): - # Current person still detected - keep them - desired_person_idx = current_idx + if active_speakers: + if self.current_selected_people and self.current_selected_people[0] in active_speakers: + desired_person_idx = self.current_selected_people[0] else: - # Current person lost - try to find them again by position/size similarity - # This handles temporary detection failures - current_person_found = False - if self.previous_faces and current_idx < len(self.previous_faces): - prev_face = self.previous_faces[current_idx] - # Find most similar face by position and size - best_match_idx = None - best_match_score = float('inf') - for idx, face in enumerate(faces): - # Distance between centers - dx = face.center_x - prev_face.center_x - dy = face.center_y - prev_face.center_y - dist = np.sqrt(dx**2 + dy**2) - # Size similarity - size_diff = abs(face.width - prev_face.width) + abs(face.height - prev_face.height) - score = dist + size_diff * 0.5 - if score < best_match_score: - best_match_score = score - best_match_idx = idx - - if best_match_idx is not None and best_match_score < 1000: - desired_person_idx = best_match_idx - current_person_found = True - - if not current_person_found: - # Really lost - select most confident - face_confidences = [(idx, face.confidence) for idx, face in enumerate(faces)] - face_confidences.sort(key=lambda x: x[1], reverse=True) - desired_person_idx = face_confidences[0][0] - logger.warning(f"Current person permanently lost - selecting new: {desired_person_idx}") + if can_switch or not self.current_selected_people: + desired_person_idx = active_speakers[0] + if self.current_selected_people and desired_person_idx != self.current_selected_people[0]: + logger.info(f"Switching focus to speaker: {desired_person_idx}") + self.last_switch_frame = frame_number + else: + desired_person_idx = self.current_selected_people[0] if 
self.current_selected_people else active_speakers[0] else: - # First frame - select most confident person ONCE - face_confidences = [(idx, face.confidence) for idx, face in enumerate(faces)] - face_confidences.sort(key=lambda x: x[1], reverse=True) - desired_person_idx = face_confidences[0][0] - logger.info(f"INITIAL SELECTION - Person {desired_person_idx} (will be tracked throughout entire video)") - - # IGNORE SPEECH DETECTION - it was causing instability - # We now track ONE person from start to finish, regardless of who speaks - - # OLD LOGIC (commented out - was causing issues): - # This logic would switch based on "who is more centered" which caused constant switching - if False: # Disabled - # Calculate distance from center for each face - center_distances = [] - for idx, face in enumerate(faces): - # Euclidean distance from frame center - dx = face.center_x - frame_center_x - dy = face.center_y - frame_center_y - distance = np.sqrt(dx**2 + dy**2) - center_distances.append((idx, distance, face.confidence)) - - # Sort by distance (closest first), then by confidence as tiebreaker - center_distances.sort(key=lambda x: (x[1], -x[2])) - most_centered_idx = center_distances[0][0] - most_centered_distance = center_distances[0][1] - - # STICKY BEHAVIOR: If we already have someone selected, only switch if: - # - New person is SIGNIFICANTLY more centered (30% closer to center) - # - OR current person is now very far from center (>40% of frame width) if self.current_selected_people and len(self.current_selected_people) > 0: current_idx = self.current_selected_people[0] if current_idx < len(faces): - current_face = faces[current_idx] - current_dx = current_face.center_x - frame_center_x - current_dy = current_face.center_y - frame_center_y - current_distance = np.sqrt(current_dx**2 + current_dy**2) - - # Define "significantly better" threshold - max_acceptable_distance = frame_width * 0.4 # 40% of frame width - improvement_threshold = 0.7 # New person must be 30% closer (0.7 ratio) - - # Keep current person if they're still reasonably centered - if current_distance < max_acceptable_distance: - # Current person is still acceptable - only switch if new is MUCH better - if most_centered_distance < current_distance * improvement_threshold: - desired_person_idx = most_centered_idx - logger.debug(f"Switching: new person MUCH more centered ({most_centered_distance:.0f} vs {current_distance:.0f})") - else: - desired_person_idx = current_idx # Keep current - logger.debug(f"Keeping current person: still reasonably centered ({current_distance:.0f} px from center)") - else: - # Current person is too far from center - switch - desired_person_idx = most_centered_idx - logger.debug(f"Current person too far from center ({current_distance:.0f} px), switching") + desired_person_idx = current_idx else: - # Current selection invalid - desired_person_idx = most_centered_idx - else: - # First time - select most centered - desired_person_idx = most_centered_idx + if self.previous_faces and current_idx < len(self.previous_faces): + prev_face = self.previous_faces[current_idx] + best_match_idx = None + best_match_score = float('inf') + for idx, face in enumerate(faces): + dx = face.center_x - prev_face.center_x + dy = face.center_y - prev_face.center_y + dist = np.sqrt(dx**2 + dy**2) + size_diff = abs(face.width - prev_face.width) + abs(face.height - prev_face.height) + score = dist + size_diff * 0.5 + if score < best_match_score: + best_match_score = score + best_match_idx = idx + + if best_match_idx is not None and 
best_match_score < 1000: + desired_person_idx = best_match_idx + else: + face_confidences = [(idx, face.confidence) for idx, face in enumerate(faces)] + face_confidences.sort(key=lambda x: x[1], reverse=True) + desired_person_idx = face_confidences[0][0] + else: + face_confidences = [(idx, face.confidence) for idx, face in enumerate(faces)] + face_confidences.sort(key=lambda x: x[1], reverse=True) + desired_person_idx = face_confidences[0][0] + else: + face_confidences = [(idx, face.confidence) for idx, face in enumerate(faces)] + face_confidences.sort(key=lambda x: x[1], reverse=True) + desired_person_idx = face_confidences[0][0] - # Wrap in list for compatibility with existing code desired_people = [desired_person_idx] if desired_person_idx is not None else [] - # ULTRA-STABLE MODE: NO SWITCHING LOGIC AT ALL - # Simply set the person and never change if not self.current_selected_people: - # First time only self.current_selected_people = desired_people self.last_switch_frame = frame_number - logger.info(f"Frame {frame_number}: LOCKED ON person {desired_people} - will never switch") + logger.info(f"Frame {frame_number}: Locked on person {desired_people}") else: - # Already have someone - just update to desired (which is same person due to logic above) self.current_selected_people = desired_people return self.current_selected_people.copy() @@ -798,24 +764,77 @@ class ContextAnalyzer: raw_focus_x = most_confident.center_x raw_focus_y = most_confident.center_y - # Apply temporal smoothing using focus history + if self.focus_history: + last_x, last_y = self.focus_history[-1] + dx = abs(raw_focus_x - last_x) + dy = abs(raw_focus_y - last_y) + if dx < self.focus_dead_zone and dy < self.focus_dead_zone: + return self.focus_history[-1] + self.focus_history.append((raw_focus_x, raw_focus_y)) if len(self.focus_history) > self.focus_history_size: self.focus_history.pop(0) - # Calculate smoothed focus as weighted average (more weight to recent frames) - if len(self.focus_history) > 1: - # Exponential weights: recent frames have more influence - weights = [2 ** i for i in range(len(self.focus_history))] - total_weight = sum(weights) - - smoothed_x = sum(x * w for (x, y), w in zip(self.focus_history, weights)) / total_weight - smoothed_y = sum(y * w for (x, y), w in zip(self.focus_history, weights)) / total_weight - - return (int(smoothed_x), int(smoothed_y)) + if len(self.focus_history) >= 5: + xs = [x for x, y in self.focus_history] + ys = [y for x, y in self.focus_history] + median_x = int(np.median(xs)) + median_y = int(np.median(ys)) + return (median_x, median_y) else: return (raw_focus_x, raw_focus_y) + def _calculate_group_bounding_box( + self, + faces: List[FaceDetection], + padding_percent: float = 0.15, + max_faces: int = 6 + ) -> Optional[GroupBoundingBox]: + """ + Calculate bounding box containing all detected faces with padding. 
+ + Args: + faces: List of detected faces + padding_percent: Padding around group as percentage of bbox dimensions + max_faces: Maximum faces to include (use most confident if exceeded) + + Returns: + GroupBoundingBox or None if no faces + """ + if not faces: + return None + + # If too many faces, use most confident ones + if len(faces) > max_faces: + faces = sorted(faces, key=lambda f: f.confidence, reverse=True)[:max_faces] + + # Calculate bounding box containing all faces + min_x = min(f.x for f in faces) + max_x = max(f.x + f.width for f in faces) + min_y = min(f.y for f in faces) + max_y = max(f.y + f.height for f in faces) + + # Add padding + width = max_x - min_x + height = max_y - min_y + pad_x = int(width * padding_percent) + pad_y = int(height * padding_percent) + + final_x = max(0, min_x - pad_x) + final_y = max(0, min_y - pad_y) + final_width = width + 2 * pad_x + final_height = height + 2 * pad_y + + return GroupBoundingBox( + x=final_x, + y=final_y, + width=final_width, + height=final_height, + center_x=final_x + final_width // 2, + center_y=final_y + final_height // 2, + face_count=len(faces) + ) + def close(self): """Release resources.""" self.detector.close() diff --git a/video_render/llm.py b/video_render/llm.py index 76be59d..f872089 100644 --- a/video_render/llm.py +++ b/video_render/llm.py @@ -137,11 +137,11 @@ class OpenRouterCopywriter: continue duration = end - start - if duration < 45: - logger.warning(f"Highlight ignorado: muito curto ({duration}s, minimo 45s)") + if duration < 60: + logger.warning(f"Highlight ignorado: muito curto ({duration}s, minimo 60s)") continue - if duration > 90: - logger.warning(f"Highlight ignorado: muito longo ({duration}s, maximo 90s)") + if duration > 120: + logger.warning(f"Highlight ignorado: muito longo ({duration}s, maximo 120s)") continue diff --git a/video_render/rendering.py b/video_render/rendering.py index b2ce7f5..2fadb9c 100644 --- a/video_render/rendering.py +++ b/video_render/rendering.py @@ -347,7 +347,12 @@ class VideoRenderer: frame_skip=settings.rendering.smart_framing_frame_skip, smoothing_window=settings.rendering.smart_framing_smoothing_window, max_velocity=settings.rendering.smart_framing_max_velocity, - person_switch_cooldown=settings.rendering.smart_framing_person_switch_cooldown + person_switch_cooldown=settings.rendering.smart_framing_person_switch_cooldown, + response_time=settings.rendering.smart_framing_response_time, + group_padding=settings.rendering.smart_framing_group_padding, + max_zoom_out=settings.rendering.smart_framing_max_zoom_out, + dead_zone=settings.rendering.smart_framing_dead_zone, + min_face_confidence=settings.rendering.smart_framing_min_confidence ) def render( diff --git a/video_render/smart_framing.py b/video_render/smart_framing.py index 8b5f52a..e0bb4c9 100644 --- a/video_render/smart_framing.py +++ b/video_render/smart_framing.py @@ -16,7 +16,7 @@ from moviepy.video.VideoClip import VideoClip from moviepy.video.io.VideoFileClip import VideoFileClip from scipy import signal -from video_render.context_detection import ContextAnalyzer, FrameContext, FaceDetection +from video_render.context_detection import ContextAnalyzer, FrameContext, FaceDetection, GroupBoundingBox logger = logging.getLogger(__name__) @@ -40,7 +40,7 @@ class FramingPlan: class SmartFramer: - """Creates intelligent 9:16 framing for horizontal videos.""" + """Creates intelligent 9:16 framing for horizontal videos with multi-person support.""" def __init__( self, @@ -48,8 +48,13 @@ class SmartFramer: target_height: int = 1920, frame_skip: int = 1, smoothing_window: int = 30, - max_velocity: int = 20, - person_switch_cooldown: int = 999999 + max_velocity: int = 25, + 
person_switch_cooldown: int = 30, + response_time: float = 0.6, + group_padding: float = 0.15, + max_zoom_out: float = 2.0, + dead_zone: int = 100, + min_face_confidence: float = 0.3 ): self.target_width = target_width self.target_height = target_height @@ -58,8 +63,15 @@ class SmartFramer: self.smoothing_window = smoothing_window self.max_velocity = max_velocity self.person_switch_cooldown = person_switch_cooldown + self.response_time = response_time + self.group_padding = group_padding + self.max_zoom_out = max_zoom_out + self.dead_zone = dead_zone + self.min_face_confidence = min_face_confidence + self.position_history_size = 45 + self.hysteresis_frames = 8 - logger.info(f"Smart framer initialized (target: {target_width}x{target_height}, frame_skip={frame_skip}, smoothing={smoothing_window}, velocity={max_velocity}, cooldown={person_switch_cooldown})") + logger.info(f"Smart framer initialized (target: {target_width}x{target_height}, response_time={response_time}s, max_velocity={max_velocity}, dead_zone={dead_zone})") def create_framing_plan( self, @@ -80,7 +92,10 @@ class SmartFramer: Returns: FramingPlan with all frame contexts and crop regions """ - analyzer = ContextAnalyzer(person_switch_cooldown=self.person_switch_cooldown) + analyzer = ContextAnalyzer( + person_switch_cooldown=self.person_switch_cooldown, + min_face_confidence=self.min_face_confidence + ) speaking_periods = None if audio_samples is not None: @@ -131,7 +146,8 @@ class SmartFramer: crop_regions = self._calculate_crop_regions( frame_contexts, source_width, - source_height + source_height, + fps=fps ) framing_plan = FramingPlan( @@ -146,14 +162,241 @@ class SmartFramer: return framing_plan + def _segment_by_face_detection( + self, + has_face_flags: List[bool], + min_segment_frames: int = 10 + ) -> List[Tuple[int, int, bool]]: + """ + Segment the video into continuous regions with/without face. + Returns list of (start_idx, end_idx, has_face) tuples. + Small segments are merged with neighbors. + """ + if not has_face_flags: + return [] + + segments = [] + start_idx = 0 + current_state = has_face_flags[0] + + for i in range(1, len(has_face_flags)): + if has_face_flags[i] != current_state: + segments.append((start_idx, i - 1, current_state)) + start_idx = i + current_state = has_face_flags[i] + + segments.append((start_idx, len(has_face_flags) - 1, current_state)) + + merged = [] + for seg in segments: + start, end, has_face = seg + length = end - start + 1 + + if length < min_segment_frames and merged: + prev_start, prev_end, prev_has_face = merged[-1] + merged[-1] = (prev_start, end, prev_has_face) + else: + merged.append(seg) + + return merged + + def _interpolate_smooth( + self, + positions: List[float], + segments: List[Tuple[int, int, bool]], + transition_frames: int = 15 + ) -> List[float]: + """ + Create smooth transitions between segments using cosine interpolation. + Within each segment, position is constant. Between segments, smooth transition. 
+ """ + if not positions or not segments: + return positions + + result = list(positions) + + segment_values = [] + for start, end, has_face in segments: + seg_positions = positions[start:end + 1] + if seg_positions: + segment_values.append(float(np.median(seg_positions))) + else: + segment_values.append(positions[start] if start < len(positions) else 0.0) + + for i, (start, end, has_face) in enumerate(segments): + value = segment_values[i] + for j in range(start, end + 1): + result[j] = value + + for i in range(len(segments) - 1): + seg1_start, seg1_end, _ = segments[i] + seg2_start, seg2_end, _ = segments[i + 1] + val1 = segment_values[i] + val2 = segment_values[i + 1] + + if abs(val2 - val1) < self.dead_zone * 0.5: + continue + + trans_start = max(seg1_end - transition_frames // 2, seg1_start) + trans_end = min(seg2_start + transition_frames // 2, seg2_end) + trans_length = trans_end - trans_start + 1 + + if trans_length < 2: + continue + + for j in range(trans_length): + t = j / (trans_length - 1) + smooth_t = 0.5 - 0.5 * np.cos(t * np.pi) + idx = trans_start + j + if 0 <= idx < len(result): + result[idx] = val1 + (val2 - val1) * smooth_t + + return result + + def _apply_savgol_filter( + self, + positions: List[float], + window_length: int = 61, + polyorder: int = 2 + ) -> List[float]: + """ + Apply Savitzky-Golay filter for ultra-smooth position tracking. + This is a signal processing filter that preserves trends while removing noise. + """ + if len(positions) < window_length: + window_length = len(positions) if len(positions) % 2 == 1 else len(positions) - 1 + if window_length < 3: + return positions + + if window_length % 2 == 0: + window_length -= 1 + + if window_length <= polyorder: + polyorder = max(1, window_length - 1) + + try: + smoothed = signal.savgol_filter(positions, window_length, polyorder, mode='nearest') + return smoothed.tolist() + except Exception as e: + logger.warning(f"Savgol filter failed: {e}, returning original positions") + return positions + + def _apply_median_filter(self, positions: List[float], window_size: int = 5) -> List[float]: + """ + Apply median filter to remove detection noise. + + Median filter is ideal for removing outliers while preserving + edges (real movements). Window size of 5 means each position + is replaced by the median of itself and 2 neighbors on each side. + + Args: + positions: Raw positions from detection + window_size: Window size (must be odd), default 5 + + Returns: + Filtered positions with noise removed + """ + if len(positions) < window_size: + return positions + + from scipy.signal import medfilt + + if window_size % 2 == 0: + window_size += 1 + + filtered = medfilt(positions, kernel_size=window_size) + + return filtered.tolist() + + def _is_detection_stable(self, has_face_flags: List[bool], window_size: int = 30) -> bool: + """ + Check if face detection is stable enough to use smart framing. + If detection is too unstable (frequent changes), it's better to use static center crop. 
+ + Args: + has_face_flags: Boolean flags indicating if face was detected per frame + window_size: Number of frames to analyze for stability + + Returns: + True if detection is stable, False if too unstable + """ + if len(has_face_flags) < window_size: + window_size = len(has_face_flags) + + if window_size == 0: + return False + + changes = 0 + for i in range(1, len(has_face_flags)): + if has_face_flags[i] != has_face_flags[i-1]: + changes += 1 + + change_rate = changes / len(has_face_flags) + + return change_rate < 0.3 + + def _stabilize_no_face_sequences( + self, + positions: List[float], + has_face_flags: List[bool], + source_center: float = None + ) -> List[float]: + """ + Stabilize positions during sequences without face detection. + Uses median of all valid positions for maximum stability. + """ + if len(positions) != len(has_face_flags): + return positions + + fallback = source_center if source_center else (positions[0] if positions else 0.0) + + face_ratio = sum(has_face_flags) / len(has_face_flags) if has_face_flags else 0 + if face_ratio < 0.15: + return [fallback] * len(positions) + + changes = sum(1 for i in range(1, len(has_face_flags)) if has_face_flags[i] != has_face_flags[i-1]) + instability_ratio = changes / len(has_face_flags) if has_face_flags else 0 + if instability_ratio > 0.25: + valid_positions = [positions[i] for i, has_face in enumerate(has_face_flags) if has_face] + if valid_positions: + return [float(np.median(valid_positions))] * len(positions) + return [fallback] * len(positions) + + valid_positions = [positions[i] for i, has_face in enumerate(has_face_flags) if has_face] + if not valid_positions: + return [fallback] * len(positions) + + global_median = float(np.median(valid_positions)) + stabilized = list(positions) + i = 0 + + while i < len(has_face_flags): + if not has_face_flags[i]: + start_idx = i + recent_valid = [] + for j in range(max(0, start_idx - self.position_history_size), start_idx): + if has_face_flags[j]: + recent_valid.append(positions[j]) + + lock_value = float(np.median(recent_valid)) if len(recent_valid) >= 5 else global_median + + while i < len(has_face_flags) and not has_face_flags[i]: + stabilized[i] = lock_value + i += 1 + else: + i += 1 + + return stabilized + def _calculate_crop_regions( self, contexts: List[FrameContext], source_width: int, - source_height: int + source_height: int, + fps: Optional[float] = None ) -> List[CropRegion]: """ - Calculate smooth crop regions for each frame. + Calculate smooth crop regions for each frame with multi-person support. 
Args: contexts: List of frame contexts @@ -168,120 +411,174 @@ class SmartFramer: source_aspect = source_width / source_height + # Calculate base crop dimensions for 9:16 if source_aspect > self.target_aspect: - crop_height = source_height - crop_width = int(crop_height / self.target_aspect) + base_crop_height = source_height + base_crop_width = int(base_crop_height / self.target_aspect) - if crop_width > source_width: - crop_width = source_width - crop_height = int(crop_width * self.target_aspect) + if base_crop_width > source_width: + base_crop_width = source_width + base_crop_height = int(base_crop_width * self.target_aspect) else: - crop_width = source_width - crop_height = int(crop_width * self.target_aspect) + base_crop_width = source_width + base_crop_height = int(base_crop_width * self.target_aspect) - if crop_height > source_height: - crop_height = source_height - crop_width = int(crop_height / self.target_aspect) + if base_crop_height > source_height: + base_crop_height = source_height + base_crop_width = int(base_crop_height / self.target_aspect) - safe_zone_margin_x = crop_width * 0.40 - safe_zone_margin_y = crop_height * 0.40 + center_xs = [] + center_ys = [] + zoom_factors = [] + has_face_flags = [] - dead_zone_threshold = 100 + static_center_x = float(source_width // 2) + static_center_y = float(source_height // 2) - if contexts and contexts[0].primary_focus: - current_crop_center_x = contexts[0].primary_focus[0] - current_crop_center_y = contexts[0].primary_focus[1] - else: - current_crop_center_x = source_width // 2 - current_crop_center_y = source_height // 2 + last_valid_x = static_center_x + last_valid_y = static_center_y + last_valid_zoom = 1.0 - center_xs = [current_crop_center_x] - center_ys = [current_crop_center_y] + for ctx in contexts: + selected_face = None + if ctx.selected_people: + idx = ctx.selected_people[0] + if 0 <= idx < len(ctx.detected_faces): + selected_face = ctx.detected_faces[idx] - for ctx in contexts[1:]: - if ctx.primary_focus and ctx.selected_people and len(ctx.detected_faces) > 0: - primary_person_idx = ctx.selected_people[0] if ctx.selected_people else 0 - if primary_person_idx < len(ctx.detected_faces): - face = ctx.detected_faces[primary_person_idx] + if selected_face: + center_x = float(selected_face.center_x) + center_y = float(selected_face.center_y) + center_xs.append(center_x) + center_ys.append(center_y) - face_left = face.x - face_right = face.x + face.width - face_top = face.y - face_bottom = face.y + face.height + required_width = selected_face.width * (1 + self.group_padding * 2) + required_height = selected_face.height * (1 + self.group_padding * 3) - crop_left = current_crop_center_x - crop_width // 2 - crop_right = current_crop_center_x + crop_width // 2 - crop_top = current_crop_center_y - crop_height // 2 - crop_bottom = current_crop_center_y + crop_height // 2 + zoom_w = required_width / base_crop_width + zoom_h = required_height / base_crop_height + zoom = max(zoom_w, zoom_h, 1.0) + zoom = min(zoom, self.max_zoom_out) + zoom_factors.append(zoom) - face_rel_left = face_left - crop_left - face_rel_right = face_right - crop_left - face_rel_top = face_top - crop_top - face_rel_bottom = face_bottom - crop_top + last_valid_x = center_x + last_valid_y = center_y + last_valid_zoom = zoom + has_face_flags.append(True) + elif ctx.group_bounds and ctx.group_bounds.face_count > 0: + group = ctx.group_bounds + center_x = float(group.center_x) + center_y = float(group.center_y) + center_xs.append(center_x) + center_ys.append(center_y) - 
face_left_safe = face_rel_left >= safe_zone_margin_x - face_right_safe = face_rel_right <= (crop_width - safe_zone_margin_x) - face_top_safe = face_rel_top >= safe_zone_margin_y - face_bottom_safe = face_rel_bottom <= (crop_height - safe_zone_margin_y) + required_width = group.width * (1 + self.group_padding * 2) + required_height = group.height * (1 + self.group_padding * 3) - face_fully_visible = face_left_safe and face_right_safe and face_top_safe and face_bottom_safe + zoom_w = required_width / base_crop_width + zoom_h = required_height / base_crop_height + zoom = max(zoom_w, zoom_h, 1.0) + zoom = min(zoom, self.max_zoom_out) + zoom_factors.append(zoom) - if face_fully_visible: - center_xs.append(current_crop_center_x) - center_ys.append(current_crop_center_y) - else: - shift_x = 0 - shift_y = 0 + last_valid_x = center_x + last_valid_y = center_y + last_valid_zoom = zoom + has_face_flags.append(True) + elif ctx.primary_focus and len(ctx.detected_faces) > 0: + center_x = float(ctx.primary_focus[0]) + center_y = float(ctx.primary_focus[1]) + center_xs.append(center_x) + center_ys.append(center_y) + zoom_factors.append(1.0) - if not face_left_safe: - shift_x = face_rel_left - safe_zone_margin_x - elif not face_right_safe: - shift_x = face_rel_right - (crop_width - safe_zone_margin_x) - - if not face_top_safe: - shift_y = face_rel_top - safe_zone_margin_y - elif not face_bottom_safe: - shift_y = face_rel_bottom - (crop_height - safe_zone_margin_y) - - if abs(shift_x) > dead_zone_threshold: - current_crop_center_x += shift_x - if abs(shift_y) > dead_zone_threshold: - current_crop_center_y += shift_y - - center_xs.append(current_crop_center_x) - center_ys.append(current_crop_center_y) - else: - center_xs.append(current_crop_center_x) - center_ys.append(current_crop_center_y) + last_valid_x = center_x + last_valid_y = center_y + last_valid_zoom = 1.0 + has_face_flags.append(True) else: - center_xs.append(current_crop_center_x) - center_ys.append(current_crop_center_y) + center_xs.append(last_valid_x) + center_ys.append(last_valid_y) + zoom_factors.append(last_valid_zoom) + has_face_flags.append(False) - if len(center_xs) > 1: - alpha = 0.002 - smoothed_xs = [center_xs[0]] - smoothed_ys = [center_ys[0]] - for i in range(1, len(center_xs)): - if center_xs[i] != center_xs[i-1] or center_ys[i] != center_ys[i-1]: - smoothed_xs.append(alpha * center_xs[i] + (1 - alpha) * smoothed_xs[i-1]) - smoothed_ys.append(alpha * center_ys[i] + (1 - alpha) * smoothed_ys[i-1]) - else: - smoothed_xs.append(smoothed_xs[i-1]) - smoothed_ys.append(smoothed_ys[i-1]) - center_xs = smoothed_xs - center_ys = smoothed_ys + center_x_video = float(source_width // 2) + center_y_video = float(source_height // 2) - center_xs = self._limit_velocity(center_xs, 2) - center_ys = self._limit_velocity(center_ys, 2) + if not self._is_detection_stable(has_face_flags): + final_xs = [center_x_video] * len(center_xs) + final_ys = [center_y_video] * len(center_ys) + final_zooms = [1.0] * len(zoom_factors) + else: + center_xs = self._stabilize_no_face_sequences( + center_xs, + has_face_flags, + source_center=center_x_video + ) + center_ys = self._stabilize_no_face_sequences( + center_ys, + has_face_flags, + source_center=center_y_video + ) + zoom_factors = self._stabilize_no_face_sequences( + zoom_factors, + has_face_flags, + source_center=1.0 + ) - center_xs = self._apply_dead_zone(center_xs, 5) - center_ys = self._apply_dead_zone(center_ys, 5) + face_count = sum(has_face_flags) + if face_count < len(has_face_flags) * 0.3: + final_xs = 
[center_x_video] * len(center_xs) + final_ys = [center_y_video] * len(center_ys) + final_zooms = [1.0] * len(zoom_factors) + else: + valid_xs = [center_xs[i] for i, has_face in enumerate(has_face_flags) if has_face] + valid_ys = [center_ys[i] for i, has_face in enumerate(has_face_flags) if has_face] + valid_zooms = [zoom_factors[i] for i, has_face in enumerate(has_face_flags) if has_face] + target_x = float(np.median(valid_xs)) if valid_xs else center_x_video + target_y = float(np.median(valid_ys)) if valid_ys else center_y_video + target_zoom = float(np.median(valid_zooms)) if valid_zooms else 1.0 + + for i in range(len(center_xs)): + if not has_face_flags[i]: + center_xs[i] = target_x + center_ys[i] = target_y + zoom_factors[i] = target_zoom + + final_xs = self._apply_savgol_filter(center_xs, window_length=61, polyorder=2) + final_ys = self._apply_savgol_filter(center_ys, window_length=61, polyorder=2) + final_zooms = self._apply_savgol_filter(zoom_factors, window_length=61, polyorder=2) + + if fps and self.response_time > 0: + dt = self.frame_skip / fps + alpha = 1 - np.exp(-dt / self.response_time) + final_xs = self._apply_exponential_smoothing(final_xs, alpha) + final_ys = self._apply_exponential_smoothing(final_ys, alpha) + final_zooms = self._apply_exponential_smoothing(final_zooms, alpha) + + # Generate crop regions crop_regions = [] - for center_x, center_y in zip(center_xs, center_ys): - x = int(center_x - crop_width // 2) - y = int(center_y - crop_height // 2) + for cx, cy, zoom in zip(final_xs, final_ys, final_zooms): + # Calculate actual crop size with zoom + crop_width = int(base_crop_width * zoom) + crop_height = int(base_crop_height * zoom) + # Clamp to source dimensions + crop_width = min(crop_width, source_width) + crop_height = min(crop_height, source_height) + + # Maintain aspect ratio after clamping + if crop_width / crop_height > base_crop_width / base_crop_height: + crop_width = int(crop_height * base_crop_width / base_crop_height) + else: + crop_height = int(crop_width * base_crop_height / base_crop_width) + + # Calculate top-left corner + x = int(cx - crop_width // 2) + y = int(cy - crop_height // 2) + + # Keep within bounds x = max(0, min(x, source_width - crop_width)) y = max(0, min(y, source_height - crop_height)) @@ -292,11 +589,26 @@ class SmartFramer: height=crop_height )) + # Clear temporary lists center_xs.clear() center_ys.clear() + zoom_factors.clear() return crop_regions + def _apply_exponential_smoothing(self, positions: List[float], alpha: float) -> List[float]: + """ + Smooth positions with exponential moving average. + """ + if not positions: + return positions + + alpha = max(0.0, min(alpha, 1.0)) + smoothed = [positions[0]] + for i in range(1, len(positions)): + prev = smoothed[-1] + smoothed.append(prev + alpha * (positions[i] - prev)) + return smoothed def _apply_dead_zone(self, positions: List[float], threshold: float) -> List[float]: """ Apply dead zone to eliminate micro-movements. @@ -355,7 +667,13 @@ class SmartFramer: ) -> VideoClip: """ Apply smart framing to a video clip. - Always uses single-person focus (no split screen). + Automatically selects layout based on number of people detected. 
+ + Layouts: + - 1 person: Single framing (follow person) + - 2 people: Vertical split screen (side by side) + - 3 people: 1 on top, 2 on bottom + - 4 people: 2x2 grid Args: video_clip: Source video clip @@ -364,7 +682,42 @@ class SmartFramer: Returns: Reframed video clip """ - return self._apply_single_framing(video_clip, framing_plan) + # Determine predominant number of faces across all frames + if not framing_plan.frame_contexts: + return self._apply_single_framing(video_clip, framing_plan) + + face_counts = [] + for ctx in framing_plan.frame_contexts: + if ctx.active_speakers: + face_counts.append(len(ctx.active_speakers)) + elif ctx.group_bounds: + face_counts.append(ctx.group_bounds.face_count) + else: + face_counts.append(len(ctx.detected_faces)) + + # Use mode (most common) face count, minimum 1 + if face_counts: + from collections import Counter + count_freq = Counter(face_counts) + # Get the most common count, but ignore 0 + non_zero_counts = {k: v for k, v in count_freq.items() if k > 0} + if non_zero_counts: + predominant_faces = max(non_zero_counts, key=non_zero_counts.get) + else: + predominant_faces = 1 + else: + predominant_faces = 1 + + logger.info(f"Layout selection: predominant_faces={predominant_faces}") + + if predominant_faces == 1: + return self._apply_single_framing(video_clip, framing_plan) + elif predominant_faces == 2: + return self._apply_split_screen(video_clip, framing_plan) + elif predominant_faces == 3: + return self._apply_three_person_layout(video_clip, framing_plan) + else: # 4 or more + return self._apply_grid_layout(video_clip, framing_plan) def _apply_single_framing( self, @@ -396,22 +749,23 @@ class SmartFramer: cropped = frame[y:y + crop_h, x:x + crop_w] else: exact_frame_idx = (t * framing_plan.fps) / self.frame_skip + last_idx = len(framing_plan.crop_regions) - 1 + if last_idx <= 0: + crop = framing_plan.crop_regions[0] + x, y, width, height = crop.x, crop.y, crop.width, crop.height + else: + exact_frame_idx = max(0.0, min(exact_frame_idx, float(last_idx))) + low_idx = int(np.floor(exact_frame_idx)) + high_idx = min(low_idx + 1, last_idx) + alpha = exact_frame_idx - low_idx - idx_floor = int(exact_frame_idx) - idx_ceil = idx_floor + 1 + crop_a = framing_plan.crop_regions[low_idx] + crop_b = framing_plan.crop_regions[high_idx] - alpha = exact_frame_idx - idx_floor - - idx_floor = max(0, min(idx_floor, len(framing_plan.crop_regions) - 1)) - idx_ceil = max(0, min(idx_ceil, len(framing_plan.crop_regions) - 1)) - - crop1 = framing_plan.crop_regions[idx_floor] - crop2 = framing_plan.crop_regions[idx_ceil] - - x = int(crop1.x * (1 - alpha) + crop2.x * alpha) - y = int(crop1.y * (1 - alpha) + crop2.y * alpha) - width = int(crop1.width * (1 - alpha) + crop2.width * alpha) - height = int(crop1.height * (1 - alpha) + crop2.height * alpha) + x = int(round(crop_a.x + (crop_b.x - crop_a.x) * alpha)) + y = int(round(crop_a.y + (crop_b.y - crop_a.y) * alpha)) + width = int(round(crop_a.width + (crop_b.width - crop_a.width) * alpha)) + height = int(round(crop_a.height + (crop_b.height - crop_a.height) * alpha)) h, w = frame.shape[:2] x = max(0, min(x, w - width)) @@ -440,7 +794,7 @@ class SmartFramer: framing_plan: FramingPlan ) -> VideoClip: """ - Apply split screen for two people. + Apply split screen for two people (side by side vertical split). 
Args: video_clip: Source video clip @@ -471,71 +825,53 @@ class SmartFramer: output = np.zeros((self.target_height, self.target_width, 3), dtype=np.uint8) - if context.selected_people and len(context.selected_people) >= 2: - selected_faces = [context.detected_faces[i] for i in context.selected_people[:2] - if i < len(context.detected_faces)] - - if len(selected_faces) >= 2: - faces = sorted(selected_faces, key=lambda f: f.center_x) - left_face = faces[0] - right_face = faces[1] - - for idx, face in enumerate([left_face, right_face]): - - half_width = self.target_width // 2 - half_aspect = self.target_height / half_width # Aspect ratio for half - - face_width = max(face.width, frame.shape[1] // 4) # At least 1/4 of frame width - crop_width = int(face_width * 2.5) # Add padding around face - crop_height = int(crop_width * half_aspect) # Maintain correct aspect - - max_crop_width = frame.shape[1] // 2 # Half the source width - max_crop_height = frame.shape[0] # Full source height - - if crop_width > max_crop_width: - crop_width = max_crop_width - crop_height = int(crop_width * half_aspect) - - if crop_height > max_crop_height: - crop_height = max_crop_height - crop_width = int(crop_height / half_aspect) - - x = max(0, face.center_x - crop_width // 2) - y = max(0, face.center_y - crop_height // 2) - - x = min(x, frame.shape[1] - crop_width) - y = min(y, frame.shape[0] - crop_height) - - cropped = frame[y:y + crop_height, x:x + crop_width] - resized = cv2.resize( - cropped, - (half_width, self.target_height), - interpolation=cv2.INTER_LINEAR - ) - - x_offset = idx * half_width - output[:, x_offset:x_offset + half_width] = resized - else: - if framing_plan.crop_regions: - crop_idx = max(0, min(frame_idx, len(framing_plan.crop_regions) - 1)) - crop = framing_plan.crop_regions[crop_idx] - cropped = frame[crop.y:crop.y + crop.height, crop.x:crop.x + crop.width] - else: - h, w = frame.shape[:2] - crop_h = int(w * self.target_aspect) - crop_w = w - if crop_h > h: - crop_h = h - crop_w = int(h / self.target_aspect) - y = (h - crop_h) // 2 - x = (w - crop_w) // 2 - cropped = frame[y:y + crop_h, x:x + crop_w] - output = cv2.resize( - cropped, - (self.target_width, self.target_height), - interpolation=cv2.INTER_LINEAR - ) + if context.active_speakers: + faces = [ + context.detected_faces[idx] + for idx in context.active_speakers + if 0 <= idx < len(context.detected_faces) + ][:2] else: + # Use top faces by confidence for stability + faces = sorted(context.detected_faces, key=lambda f: f.confidence, reverse=True)[:2] + + if len(faces) >= 2: + # Sort by X position (left to right) + faces_sorted = sorted(faces, key=lambda f: f.center_x) + left_face = faces_sorted[0] + right_face = faces_sorted[1] + + half_width = self.target_width // 2 + half_aspect = self.target_height / half_width + + for idx, face in enumerate([left_face, right_face]): + # Calculate crop region around face + crop_width = int(face.width * 3) # 3x face width for good framing + crop_height = int(crop_width * half_aspect) + + # Clamp to reasonable limits + crop_width = max(crop_width, frame.shape[1] // 4) + crop_width = min(crop_width, frame.shape[1]) + crop_height = min(crop_height, frame.shape[0]) + + # Ensure proper aspect ratio + if crop_height / crop_width > half_aspect: + crop_height = int(crop_width * half_aspect) + else: + crop_width = int(crop_height / half_aspect) + + # Center crop on face + x = max(0, min(face.center_x - crop_width // 2, frame.shape[1] - crop_width)) + y = max(0, min(face.center_y - crop_height // 2, 
frame.shape[0] - crop_height)) + + # Extract and resize + cropped = frame[y:y + crop_height, x:x + crop_width] + resized = cv2.resize(cropped, (half_width, self.target_height), interpolation=cv2.INTER_LINEAR) + + x_offset = idx * half_width + output[:, x_offset:x_offset + half_width] = resized + else: + # Fallback to single framing if framing_plan.crop_regions: crop_idx = max(0, min(frame_idx, len(framing_plan.crop_regions) - 1)) crop = framing_plan.crop_regions[crop_idx] @@ -550,11 +886,122 @@ class SmartFramer: y = (h - crop_h) // 2 x = (w - crop_w) // 2 cropped = frame[y:y + crop_h, x:x + crop_w] - output = cv2.resize( - cropped, - (self.target_width, self.target_height), - interpolation=cv2.INTER_LINEAR - ) + output = cv2.resize(cropped, (self.target_width, self.target_height), interpolation=cv2.INTER_LINEAR) + + return output + + new_clip = VideoClip(duration=video_clip.duration) + new_clip.size = (self.target_width, self.target_height) + new_clip.frame_function = make_frame + return new_clip + + def _apply_three_person_layout( + self, + video_clip: VideoFileClip, + framing_plan: FramingPlan + ) -> VideoClip: + """ + Apply layout for 3 people: 1 on top (full width), 2 on bottom (side by side). + + Args: + video_clip: Source video clip + framing_plan: Framing plan + + Returns: + Three-person layout video clip + """ + def make_frame(t): + frame = video_clip.get_frame(t) + exact_frame_idx = (t * framing_plan.fps) / self.frame_skip + frame_idx = int(exact_frame_idx) + + if not framing_plan.frame_contexts: + h, w = frame.shape[:2] + crop_h = int(w * self.target_aspect) + crop_w = w + if crop_h > h: + crop_h = h + crop_w = int(h / self.target_aspect) + y = (h - crop_h) // 2 + x = (w - crop_w) // 2 + cropped = frame[y:y + crop_h, x:x + crop_w] + return cv2.resize(cropped, (self.target_width, self.target_height), interpolation=cv2.INTER_LINEAR) + + frame_idx = max(0, min(frame_idx, len(framing_plan.frame_contexts) - 1)) + context = framing_plan.frame_contexts[frame_idx] + + output = np.zeros((self.target_height, self.target_width, 3), dtype=np.uint8) + + if context.active_speakers: + faces = [ + context.detected_faces[idx] + for idx in context.active_speakers + if 0 <= idx < len(context.detected_faces) + ][:3] + else: + faces = sorted(context.detected_faces, key=lambda f: f.confidence, reverse=True)[:3] # Max 3 faces + num_faces = len(faces) + + if num_faces >= 3: + # Sort faces by Y position (top to bottom), then X for bottom row + faces_sorted = sorted(faces, key=lambda f: f.center_y) + top_face = faces_sorted[0] # Topmost face + bottom_faces = sorted(faces_sorted[1:], key=lambda f: f.center_x) # Sort bottom by X + + # Top section: full width, half height + top_height = self.target_height // 2 + top_width = self.target_width + top_aspect = top_height / top_width + + # Crop around top face + crop_w = int(top_face.width * 3) # 3x face width for context + crop_h = int(crop_w * top_aspect) + crop_w = min(crop_w, frame.shape[1]) + crop_h = min(crop_h, frame.shape[0]) + + x = max(0, min(top_face.center_x - crop_w // 2, frame.shape[1] - crop_w)) + y = max(0, min(top_face.center_y - crop_h // 2, frame.shape[0] - crop_h)) + + cropped_top = frame[y:y + crop_h, x:x + crop_w] + resized_top = cv2.resize(cropped_top, (top_width, top_height), interpolation=cv2.INTER_LINEAR) + output[0:top_height, :] = resized_top + + # Bottom section: two halves + bottom_height = self.target_height - top_height + half_width = self.target_width // 2 + bottom_aspect = bottom_height / half_width + + for idx, face in 
enumerate(bottom_faces[:2]): + crop_w = int(face.width * 3) + crop_h = int(crop_w * bottom_aspect) + crop_w = min(crop_w, frame.shape[1] // 2) + crop_h = min(crop_h, frame.shape[0]) + + x = max(0, min(face.center_x - crop_w // 2, frame.shape[1] - crop_w)) + y = max(0, min(face.center_y - crop_h // 2, frame.shape[0] - crop_h)) + + cropped = frame[y:y + crop_h, x:x + crop_w] + resized = cv2.resize(cropped, (half_width, bottom_height), interpolation=cv2.INTER_LINEAR) + + x_offset = idx * half_width + output[top_height:, x_offset:x_offset + half_width] = resized + else: + # Fallback to single framing + if framing_plan.crop_regions: + crop_idx = max(0, min(frame_idx, len(framing_plan.crop_regions) - 1)) + crop = framing_plan.crop_regions[crop_idx] + cropped = frame[crop.y:crop.y + crop.height, crop.x:crop.x + crop.width] + else: + h, w = frame.shape[:2] + crop_h = int(w * self.target_aspect) + crop_w = w + if crop_h > h: + crop_h = h + crop_w = int(h / self.target_aspect) + y = (h - crop_h) // 2 + x = (w - crop_w) // 2 + cropped = frame[y:y + crop_h, x:x + crop_w] + output = cv2.resize(cropped, (self.target_width, self.target_height), interpolation=cv2.INTER_LINEAR) return output @@ -569,7 +1016,8 @@ class SmartFramer: framing_plan: FramingPlan ) -> VideoClip: """ - Apply grid layout for 3+ people. + Apply grid layout for 4 people (2x2 grid). + Layout: top-left, top-right, bottom-left, bottom-right Args: video_clip: Source video clip @@ -600,44 +1048,53 @@ class SmartFramer: output = np.zeros((self.target_height, self.target_width, 3), dtype=np.uint8) - num_faces = len(context.detected_faces) + if context.active_speakers: + faces = [ + context.detected_faces[idx] + for idx in context.active_speakers + if 0 <= idx < len(context.detected_faces) + ][:4] + else: + faces = sorted(context.detected_faces, key=lambda f: f.confidence, reverse=True)[:4] # Max 4 faces + num_faces = len(faces) - if num_faces >= 3: + if num_faces >= 4: cell_width = self.target_width // 2 cell_height = self.target_height // 2 + cell_aspect = cell_height / cell_width - for idx, face in enumerate(context.detected_faces[:4]): + # Sort faces into grid positions by their actual position + # First sort by Y (top row vs bottom row), then by X within each row + sorted_by_y = sorted(faces, key=lambda f: f.center_y) + top_row = sorted(sorted_by_y[:2], key=lambda f: f.center_x) + bottom_row = sorted(sorted_by_y[2:], key=lambda f: f.center_x) + grid_faces = top_row + bottom_row + + for idx, face in enumerate(grid_faces): row = idx // 2 col = idx % 2 - cell_aspect = cell_height / cell_width - - crop_width = frame.shape[1] // 2 + # Calculate crop region centered on face + crop_width = int(face.width * 3) # 3x face width crop_height = int(crop_width * cell_aspect) - max_crop_width = frame.shape[1] // 2 - max_crop_height = frame.shape[0] // 2 + # Clamp to reasonable limits + crop_width = max(crop_width, frame.shape[1] // 4) + crop_width = min(crop_width, frame.shape[1]) + crop_height = min(crop_height, frame.shape[0]) - if crop_width > max_crop_width: - crop_width = max_crop_width + # Ensure proper aspect ratio + if crop_height / crop_width > cell_aspect: crop_height = int(crop_width * cell_aspect) - - if crop_height > max_crop_height: - crop_height = max_crop_height + else: crop_width = int(crop_height / cell_aspect) - x = max(0, face.center_x - crop_width // 2) - y = max(0, face.center_y - crop_height // 2) - - x = min(x, frame.shape[1] - crop_width) - y = min(y, frame.shape[0] - crop_height) + # Center crop on face + x = max(0, 
min(face.center_x - crop_width // 2, frame.shape[1] - crop_width)) + y = max(0, min(face.center_y - crop_height // 2, frame.shape[0] - crop_height)) cropped = frame[y:y + crop_height, x:x + crop_width] - resized = cv2.resize( - cropped, - (cell_width, cell_height), - interpolation=cv2.INTER_LINEAR - ) + resized = cv2.resize(cropped, (cell_width, cell_height), interpolation=cv2.INTER_LINEAR) y_offset = row * cell_height x_offset = col * cell_width
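For context on the anti-jitter pipeline introduced in `_calculate_crop_regions`: crop centers are first passed through a Savitzky-Golay filter and then through an exponential moving average whose alpha is derived from the configured response time (`alpha = 1 - exp(-dt / response_time)`, with `dt = frame_skip / fps`). A self-contained sketch of that chain, assuming `frame_skip = 1` and the defaults from this patch (the `smooth_centers` helper is illustrative only, not repository code):

```python
import numpy as np
from scipy.signal import savgol_filter

def smooth_centers(xs, fps=30.0, response_time=0.6, window_length=61, polyorder=2):
    """Savitzky-Golay pass to strip detection noise, then a response-time based EMA."""
    xs = np.asarray(xs, dtype=float)
    # Savgol needs an odd window no longer than the signal itself
    wl = min(window_length, len(xs) if len(xs) % 2 == 1 else len(xs) - 1)
    if wl > polyorder:
        xs = savgol_filter(xs, wl, polyorder, mode="nearest")
    alpha = 1.0 - np.exp(-(1.0 / fps) / response_time)  # dt = frame_skip / fps, frame_skip = 1
    smoothed = [float(xs[0])]
    for value in xs[1:]:
        smoothed.append(smoothed[-1] + alpha * (float(value) - smoothed[-1]))
    return smoothed

# Example: a noisy horizontal face-center track at 30 FPS
noisy_track = (960 + 40 * np.random.randn(300)).tolist()
stable_track = smooth_centers(noisy_track)
```

With the patch defaults (`response_time=0.6` at 30 FPS) the EMA alpha works out to roughly 0.05, which is what keeps the crop window from chasing per-frame detection jitter.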