#v2 - Start v2 testing

- Add object tracking
- Face detection
- Interactive captions
- More precise cuts
- Prompt refinement
LeoMortari
2025-11-12 11:38:09 -03:00
parent 87c6a5e27c
commit c5d3e83a5f
15 changed files with 1739 additions and 313 deletions

View File

@@ -13,6 +13,8 @@ TEMP_ROOT = BASE_DIR / "temp"
@dataclass(frozen=True)
class RabbitMQSettings:
# host: str = os.environ.get("RABBITMQ_HOST", "154.12.229.181")
# port: int = int(os.environ.get("RABBITMQ_PORT", 32790))
host: str = os.environ.get("RABBITMQ_HOST", "rabbitmq")
port: int = int(os.environ.get("RABBITMQ_PORT", 5672))
user: str = os.environ.get("RABBITMQ_USER", "admin")
@@ -24,33 +26,19 @@ class RabbitMQSettings:
blocked_timeout: int = int(os.environ.get("RABBITMQ_BLOCKED_TIMEOUT", 300))
@dataclass(frozen=True)
class GeminiSettings:
api_key: str = os.environ.get("GEMINI_API_KEY", "")
model: str = os.environ.get("GEMINI_MODEL", "gemini-2.5-flash")
safety_settings: str | None = os.environ.get("GEMINI_SAFETY_SETTINGS")
temperature: float = float(os.environ.get("GEMINI_TEMPERATURE", 0.2))
top_k: int | None = (
int(os.environ["GEMINI_TOP_K"]) if os.environ.get("GEMINI_TOP_K") else None
)
top_p: float | None = (
float(os.environ["GEMINI_TOP_P"]) if os.environ.get("GEMINI_TOP_P") else None
)
prompt_path: str = os.environ.get("GEMINI_PROMPT_PATH", "prompts/generate.txt")
@dataclass(frozen=True)
class OpenRouterSettings:
api_key: str = os.environ.get("OPENROUTER_API_KEY", "")
api_key: str = os.environ.get("OPENROUTER_API_KEY", "https://openrouter.ai/api/v1/chat/completions")
model: str = os.environ.get(
"OPENROUTER_MODEL", "openai/gpt-oss-20b:free"
)
temperature: float = float(os.environ.get("OPENROUTER_TEMPERATURE", 0.6))
prompt_path: str = os.environ.get("OPENROUTER_PROMPT_PATH", "prompts/generate.txt")
@dataclass(frozen=True)
class WhisperSettings:
model_size: str = os.environ.get("FASTER_WHISPER_MODEL_SIZE", "small")
model_size: str = os.environ.get("FASTER_WHISPER_MODEL_SIZE", "medium")
device: str | None = os.environ.get("FASTER_WHISPER_DEVICE")
compute_type: str | None = os.environ.get("FASTER_WHISPER_COMPUTE_TYPE")
download_root: Path = Path(
@@ -67,19 +55,23 @@ class RenderingSettings:
audio_codec: str = os.environ.get("RENDER_AUDIO_CODEC", "aac")
bitrate: str = os.environ.get("RENDER_BITRATE", "5000k")
preset: str = os.environ.get("RENDER_PRESET", "faster")
highlight_color: str = os.environ.get("SUBTITLE_HIGHLIGHT_COLOR", "#FFD200")
highlight_color: str = os.environ.get("SUBTITLE_HIGHLIGHT_COLOR", "#00FF00")
base_color: str = os.environ.get("SUBTITLE_BASE_COLOR", "#FFFFFF")
font_path: Path = Path(os.environ.get("RENDER_FONT_PATH", "./Montserrat.ttf"))
title_font_size: int = int(os.environ.get("RENDER_TITLE_FONT_SIZE", 110))
subtitle_font_size: int = int(os.environ.get("RENDER_SUBTITLE_FONT_SIZE", 64))
caption_min_words: int = int(os.environ.get("CAPTION_MIN_WORDS", 3))
caption_max_words: int = int(os.environ.get("CAPTION_MAX_WORDS", 4))
caption_min_words: int = int(os.environ.get("CAPTION_MIN_WORDS", 2))
caption_max_words: int = int(os.environ.get("CAPTION_MAX_WORDS", 2))
# Smart framing settings
enable_smart_framing: bool = os.environ.get("ENABLE_SMART_FRAMING", "true").lower() in ("true", "1", "yes")
smart_framing_min_confidence: float = float(os.environ.get("SMART_FRAMING_MIN_CONFIDENCE", 0.5))
smart_framing_smoothing_window: int = int(os.environ.get("SMART_FRAMING_SMOOTHING_WINDOW", 20))
smart_framing_frame_skip: int = int(os.environ.get("SMART_FRAMING_FRAME_SKIP", 2)) # Process every Nth frame (CPU optimization)
@dataclass(frozen=True)
class Settings:
rabbitmq: RabbitMQSettings = RabbitMQSettings()
gemini: GeminiSettings = GeminiSettings()
openrouter: OpenRouterSettings = OpenRouterSettings()
whisper: WhisperSettings = WhisperSettings()
rendering: RenderingSettings = RenderingSettings()
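
A minimal usage sketch (editor's example, not part of this commit). The frozen dataclasses above read the environment at class-definition time, so overrides have to be exported before video_render.config is imported:

import os

# Must be set before the import below: the dataclass defaults call
# os.environ.get(...) once, when the module is first imported.
os.environ["SMART_FRAMING_FRAME_SKIP"] = "3"
os.environ["ENABLE_SMART_FRAMING"] = "true"

from video_render.config import Settings

settings = Settings()
print(settings.rendering.smart_framing_frame_skip)  # 3
print(settings.rendering.enable_smart_framing)      # True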

View File

@@ -0,0 +1,398 @@
"""
Context detection module for video analysis.
This module provides functionality to detect faces, track people,
and identify who is speaking in video content using MediaPipe and audio analysis.
"""
from __future__ import annotations
import logging
from dataclasses import dataclass
from typing import List, Optional, Tuple
import cv2
import mediapipe as mp
import numpy as np
from scipy import signal
logger = logging.getLogger(__name__)
@dataclass
class FaceDetection:
"""Represents a detected face in a frame."""
x: int
y: int
width: int
height: int
confidence: float
center_x: int
center_y: int
landmarks: Optional[List[Tuple[int, int]]] = None
@dataclass
class PersonTracking:
"""Tracks a person across frames."""
person_id: int
face: FaceDetection
is_speaking: bool
speaking_confidence: float
frame_number: int
@dataclass
class FrameContext:
"""Context information for a video frame."""
frame_number: int
timestamp: float
detected_faces: List[FaceDetection]
active_speakers: List[int] # indices of speaking faces
primary_focus: Optional[Tuple[int, int]] # (x, y) center point
layout_mode: str # "single", "dual_split", "grid"
class MediaPipeDetector:
"""Face and pose detection using MediaPipe."""
def __init__(self, min_detection_confidence: float = 0.5, min_tracking_confidence: float = 0.5):
self.min_detection_confidence = min_detection_confidence
self.min_tracking_confidence = min_tracking_confidence
self.mp_face_detection = mp.solutions.face_detection
self.mp_face_mesh = mp.solutions.face_mesh
self.face_detection = self.mp_face_detection.FaceDetection(
min_detection_confidence=min_detection_confidence,
model_selection=1
)
self.face_mesh = self.mp_face_mesh.FaceMesh(
max_num_faces=5,
min_detection_confidence=min_detection_confidence,
min_tracking_confidence=min_tracking_confidence,
static_image_mode=False
)
logger.info("MediaPipe detector initialized")
def detect_faces(self, frame: np.ndarray) -> List[FaceDetection]:
"""
Detect faces in a frame.
Args:
frame: RGB image array
Returns:
List of detected faces
"""
height, width = frame.shape[:2]
if len(frame.shape) == 2:
frame_rgb = cv2.cvtColor(frame, cv2.COLOR_GRAY2RGB)
elif frame.shape[2] == 4:
frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGRA2RGB)
else:
frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
results = self.face_detection.process(frame_rgb)
faces = []
if results.detections:
for detection in results.detections:
bbox = detection.location_data.relative_bounding_box
x = int(bbox.xmin * width)
y = int(bbox.ymin * height)
w = int(bbox.width * width)
h = int(bbox.height * height)
x = max(0, min(x, width - 1))
y = max(0, min(y, height - 1))
w = min(w, width - x)
h = min(h, height - y)
center_x = x + w // 2
center_y = y + h // 2
confidence = detection.score[0] if detection.score else 0.0
faces.append(FaceDetection(
x=x,
y=y,
width=w,
height=h,
confidence=confidence,
center_x=center_x,
center_y=center_y
))
return faces
def detect_face_landmarks(self, frame: np.ndarray) -> List[FaceDetection]:
"""
Detect faces with landmarks for lip sync detection.
Args:
frame: RGB image array
Returns:
List of detected faces with landmark information
"""
height, width = frame.shape[:2]
if len(frame.shape) == 2:
frame_rgb = cv2.cvtColor(frame, cv2.COLOR_GRAY2RGB)
elif frame.shape[2] == 4:
frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGRA2RGB)
else:
frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
results = self.face_mesh.process(frame_rgb)
faces = []
if results.multi_face_landmarks:
for face_landmarks in results.multi_face_landmarks:
xs = [lm.x for lm in face_landmarks.landmark]
ys = [lm.y for lm in face_landmarks.landmark]
x_min, x_max = min(xs), max(xs)
y_min, y_max = min(ys), max(ys)
x = int(x_min * width)
y = int(y_min * height)
w = int((x_max - x_min) * width)
h = int((y_max - y_min) * height)
center_x = x + w // 2
center_y = y + h // 2
lip_landmarks = []
for idx in [13, 14, 78, 308]:
lm = face_landmarks.landmark[idx]
lip_landmarks.append((int(lm.x * width), int(lm.y * height)))
faces.append(FaceDetection(
x=x,
y=y,
width=w,
height=h,
confidence=1.0,
center_x=center_x,
center_y=center_y,
landmarks=lip_landmarks
))
return faces
def close(self):
"""Release MediaPipe resources."""
self.face_detection.close()
self.face_mesh.close()
class AudioActivityDetector:
"""Detects speech activity in audio."""
def __init__(self, sample_rate: int = 44100, frame_duration_ms: int = 30):
self.sample_rate = sample_rate
self.frame_duration_ms = frame_duration_ms
self.frame_size = int(sample_rate * frame_duration_ms / 1000)
logger.info(f"Audio activity detector initialized (sr={sample_rate}, frame={frame_duration_ms}ms)")
def detect_speaking_periods(
self,
audio_samples: np.ndarray,
threshold: float = 0.02,
min_speech_duration: float = 0.1
) -> List[Tuple[float, float]]:
"""
Detect periods of speech in audio.
Args:
audio_samples: Audio samples array
threshold: Energy threshold for speech detection
min_speech_duration: Minimum duration of speech in seconds
Returns:
List of (start_time, end_time) tuples in seconds
"""
if audio_samples.ndim > 1:
audio_samples = audio_samples.mean(axis=1)
energies = []
for i in range(0, len(audio_samples), self.frame_size):
frame = audio_samples[i:i + self.frame_size]
if len(frame) > 0:
energy = np.sqrt(np.mean(frame ** 2))
energies.append(energy)
speaking_frames = [e > threshold for e in energies]
periods = []
start_frame = None
for i, is_speaking in enumerate(speaking_frames):
if is_speaking and start_frame is None:
start_frame = i
elif not is_speaking and start_frame is not None:
start_time = start_frame * self.frame_duration_ms / 1000
end_time = i * self.frame_duration_ms / 1000
if end_time - start_time >= min_speech_duration:
periods.append((start_time, end_time))
start_frame = None
if start_frame is not None:
start_time = start_frame * self.frame_duration_ms / 1000
end_time = len(speaking_frames) * self.frame_duration_ms / 1000
if end_time - start_time >= min_speech_duration:
periods.append((start_time, end_time))
return periods
def is_speaking_at_time(self, speaking_periods: List[Tuple[float, float]], time: float) -> bool:
"""Check if there is speech activity at a given time."""
for start, end in speaking_periods:
if start <= time <= end:
return True
return False
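# Editor's example (not part of this commit): exercising the RMS-energy
# voice-activity check above on synthetic audio - one second of low-level
# noise followed by one second of a louder 220 Hz tone.
import numpy as np
sr = 44100
quiet = 0.005 * np.random.randn(sr)
loud = 0.2 * np.sin(2 * np.pi * 220 * np.arange(sr) / sr)
detector = AudioActivityDetector(sample_rate=sr, frame_duration_ms=30)
periods = detector.detect_speaking_periods(np.concatenate([quiet, loud]), threshold=0.02)
print(periods)                                      # roughly [(1.0, 2.0)]
print(detector.is_speaking_at_time(periods, 1.5))   # True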
class ContextAnalyzer:
"""Analyzes video context to determine focus and layout."""
def __init__(self):
self.detector = MediaPipeDetector()
self.audio_detector = AudioActivityDetector()
self.previous_faces: List[FaceDetection] = []
logger.info("Context analyzer initialized")
def analyze_frame(
self,
frame: np.ndarray,
timestamp: float,
frame_number: int,
speaking_periods: Optional[List[Tuple[float, float]]] = None
) -> FrameContext:
"""
Analyze a single frame to extract context information.
Args:
frame: Video frame (BGR format from OpenCV)
timestamp: Frame timestamp in seconds
frame_number: Frame index
speaking_periods: List of (start, end) times where speech is detected
Returns:
FrameContext with detection results
"""
faces = self.detector.detect_face_landmarks(frame)
if not faces:
faces = self.detector.detect_faces(frame)
# Determine who is speaking
active_speakers = []
for i, face in enumerate(faces):
is_speaking = False
if speaking_periods and self.audio_detector.is_speaking_at_time(speaking_periods, timestamp):
is_speaking = True
if face.landmarks and len(self.previous_faces) > i:
is_speaking = is_speaking or self._detect_lip_movement(face, self.previous_faces[i])
if is_speaking:
active_speakers.append(i)
num_faces = len(faces)
num_speakers = len(active_speakers)
if num_faces == 0:
layout_mode = "single"
elif num_faces == 1:
layout_mode = "single"
elif num_faces == 2:
layout_mode = "dual_split"
elif num_faces >= 3:
layout_mode = "dual_split"
else:
layout_mode = "single"
primary_focus = self._calculate_focus_point(faces, active_speakers)
self.previous_faces = faces
return FrameContext(
frame_number=frame_number,
timestamp=timestamp,
detected_faces=faces,
active_speakers=active_speakers,
primary_focus=primary_focus,
layout_mode=layout_mode
)
def _detect_lip_movement(self, current_face: FaceDetection, previous_face: FaceDetection) -> bool:
"""
Detect lip movement by comparing landmarks between frames.
Args:
current_face: Current frame face detection
previous_face: Previous frame face detection
Returns:
True if significant lip movement detected
"""
if not current_face.landmarks or not previous_face.landmarks:
return False
def lip_distance(landmarks):
if len(landmarks) < 4:
return 0
upper = np.array(landmarks[0:2])
lower = np.array(landmarks[2:4])
return np.linalg.norm(upper.mean(axis=0) - lower.mean(axis=0))
current_dist = lip_distance(current_face.landmarks)
previous_dist = lip_distance(previous_face.landmarks)
threshold = 2.0
return abs(current_dist - previous_dist) > threshold
def _calculate_focus_point(
self,
faces: List[FaceDetection],
active_speakers: List[int]
) -> Optional[Tuple[int, int]]:
"""
Calculate the primary focus point based on detected faces and speakers.
IMPORTANT: This focuses on ONE person to avoid focusing on empty space (table).
When multiple people are present, we pick the most relevant person, not average positions.
Args:
faces: List of detected faces
active_speakers: Indices of faces that are speaking
Returns:
(x, y) tuple of focus center, or None if no faces
"""
if not faces:
return None
if active_speakers:
speaker_faces = [faces[i] for i in active_speakers if i < len(faces)]
if speaker_faces:
primary_speaker = max(speaker_faces, key=lambda f: f.confidence)
return (primary_speaker.center_x, primary_speaker.center_y)
most_confident = max(faces, key=lambda f: f.confidence)
return (most_confident.center_x, most_confident.center_y)
def close(self):
"""Release resources."""
self.detector.close()
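
A short usage sketch (editor's example, not part of this commit); it analyzes a single frame of a placeholder video and prints the detected layout:

import cv2
from video_render.context_detection import ContextAnalyzer

analyzer = ContextAnalyzer()
cap = cv2.VideoCapture("videos/podcast.mp4")  # placeholder path
ok, frame = cap.read()
if ok:
    ctx = analyzer.analyze_frame(frame, timestamp=0.0, frame_number=0)
    print(ctx.layout_mode, len(ctx.detected_faces), ctx.primary_focus)
cap.release()
analyzer.close()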

View File

@@ -2,11 +2,11 @@ from __future__ import annotations
import json
import logging
import time
import os
from pathlib import Path
from typing import Any, Dict, List, Optional
from typing import Dict, List
from google import genai
from google.genai import types as genai_types
import requests
from video_render.config import BASE_DIR, Settings
@@ -14,27 +14,24 @@ from video_render.transcription import TranscriptionResult
logger = logging.getLogger(__name__)
OPENROUTER_ENDPOINT = "https://openrouter.ai/api/v1/chat/completions"
OPENROUTER_ENDPOINT = os.environ.get("OPENROUTER_API_URL", "https://openrouter.ai/api/v1/chat/completions")
class GeminiHighlighter:
class OpenRouterCopywriter:
def __init__(self, settings: Settings) -> None:
if not settings.gemini.api_key:
raise RuntimeError("GEMINI_API_KEY nao foi definido")
prompt_path = Path(settings.gemini.prompt_path)
if not settings.openrouter.api_key:
raise RuntimeError("OPENROUTER_API_KEY nao foi definido")
self.settings = settings
prompt_path = Path(settings.openrouter.prompt_path)
if not prompt_path.is_absolute():
prompt_path = BASE_DIR / prompt_path
if not prompt_path.exists():
raise FileNotFoundError(f"Prompt do Gemini nao encontrado: {prompt_path}")
self.prompt_template = prompt_path.read_text(encoding="utf-8")
self.settings = settings
self.client = genai.Client()
raise FileNotFoundError(f"Prompt nao encontrado: {prompt_path}")
self.highlights_prompt_template = prompt_path.read_text(encoding="utf-8")
def generate_highlights(self, transcription: TranscriptionResult) -> List[Dict]:
"""Generate video highlights using OpenRouter GPT-OSS with retry logic."""
payload = {
"transcript": transcription.full_text,
"segments": [
@@ -47,93 +44,139 @@ class GeminiHighlighter:
],
}
try:
response = self._call_gemini(payload)
except Exception as exc:
logger.error("Gemini API request falhou: %s", exc)
raise RuntimeError("Gemini API request falhou") from exc
raw_text = self._extract_response_text(response)
parsed = self._extract_json(raw_text)
highlights = parsed.get("highlights")
if not isinstance(highlights, list):
raise ValueError("Resposta do Gemini invalida: campo 'highlights' ausente")
return highlights
def _call_gemini(self, payload: Dict[str, Any]) -> Any:
contents = [
{
"role": "user",
"parts": [
{"text": self.prompt_template},
{"text": json.dumps(payload, ensure_ascii=False)},
],
}
]
request_kwargs: Dict[str, Any] = {
"model": self.settings.gemini.model,
"contents": contents,
body = {
"model": self.settings.openrouter.model,
"temperature": self.settings.openrouter.temperature,
"messages": [
{"role": "system", "content": self.highlights_prompt_template},
{
"role": "user",
"content": json.dumps(payload, ensure_ascii=False),
},
],
}
config = self._build_generation_config()
if config is not None:
request_kwargs["config"] = config
headers = {
"Authorization": f"Bearer {self.settings.openrouter.api_key}",
"Content-Type": "application/json",
"X-Title": "Video Render - Highlights Detection"
}
return self.client.models.generate_content(**request_kwargs)
logger.info(f"Calling OpenRouter with model: {self.settings.openrouter.model}")
logger.debug(f"Request payload keys: transcript_length={len(payload['transcript'])}, segments_count={len(payload['segments'])}")
def _build_generation_config(self) -> Optional[genai_types.GenerateContentConfig]:
config_kwargs: Dict[str, Any] = {}
if self.settings.gemini.temperature is not None:
config_kwargs["temperature"] = self.settings.gemini.temperature
if self.settings.gemini.top_p is not None:
config_kwargs["top_p"] = self.settings.gemini.top_p
if self.settings.gemini.top_k is not None:
config_kwargs["top_k"] = self.settings.gemini.top_k
# Retry configuration for rate limits (especially free tier)
max_retries = 5
base_delay = 5 # Start with 5s delay
if not config_kwargs:
return None
for attempt in range(max_retries):
try:
response = requests.post(
url=OPENROUTER_ENDPOINT,
data=json.dumps(body),
headers=headers,
timeout=120,
)
response.raise_for_status()
data = response.json()
break
return genai_types.GenerateContentConfig(**config_kwargs)
except requests.exceptions.HTTPError as exc:
if exc.response.status_code == 429:
if attempt < max_retries - 1:
# Exponential backoff: 5s, 10s, 20s, 40s
delay = base_delay * (2 ** attempt)
logger.warning(f"Rate limit atingido (429). Aguardando {delay}s antes de tentar novamente (tentativa {attempt + 1}/{max_retries})")
time.sleep(delay)
continue
else:
logger.error("Rate limit atingido apos todas as tentativas")
logger.error("Solucao: Use um modelo pago ou adicione creditos na OpenRouter")
raise RuntimeError("OpenRouter rate limit excedido") from exc
else:
logger.error(f"OpenRouter API request falhou com status {exc.response.status_code}: {exc}")
raise RuntimeError("OpenRouter API request falhou") from exc
@staticmethod
def _extract_response_text(response: Any) -> str:
text = getattr(response, "text", None)
if text:
return str(text).strip()
except Exception as exc:
logger.error("OpenRouter API request falhou: %s", exc)
raise RuntimeError("OpenRouter API request falhou") from exc
candidates = getattr(response, "candidates", None) or []
for candidate in candidates:
content = getattr(candidate, "content", None)
if not content:
# Debug: log response structure
logger.info(f"OpenRouter response keys: {list(data.keys())}")
if "error" in data:
logger.error(f"OpenRouter API error: {data.get('error')}")
raise RuntimeError(f"OpenRouter API error: {data.get('error')}")
choices = data.get("choices") or []
if not choices:
logger.error(f"OpenRouter response completa: {json.dumps(data, indent=2)}")
raise RuntimeError("OpenRouter nao retornou escolhas")
message = choices[0].get("message", {}).get("content")
if not message:
raise RuntimeError("Resposta do OpenRouter sem conteudo")
parsed = self._extract_json(message)
highlights = parsed.get("highlights")
if not isinstance(highlights, list):
raise ValueError("Resposta do OpenRouter invalida: campo 'highlights' ausente")
valid_highlights = []
for highlight in highlights:
try:
start = float(highlight.get("start", 0))
end = float(highlight.get("end", 0))
summary = str(highlight.get("summary", "")).strip()
if start < 0 or end < 0:
logger.warning(f"Highlight ignorado: timestamps negativos (start={start}, end={end})")
continue
if end <= start:
logger.warning(f"Highlight ignorado: end <= start (start={start}, end={end})")
continue
duration = end - start
if duration < 45:
logger.warning(f"Highlight ignorado: muito curto ({duration}s, minimo 45s)")
continue
if duration > 120:
logger.warning(f"Highlight ignorado: muito longo ({duration}s, maximo 120s)")
continue
if not summary:
logger.warning(f"Highlight ignorado: summary vazio")
continue
valid_highlights.append({
"start": start,
"end": end,
"summary": summary
})
except (TypeError, ValueError) as e:
logger.warning(f"Highlight invalido ignorado: {highlight} - {e}")
continue
parts = getattr(content, "parts", None) or []
for part in parts:
part_text = getattr(part, "text", None)
if part_text:
return str(part_text).strip()
raise RuntimeError("Resposta do Gemini sem texto")
if not valid_highlights:
logger.warning("Nenhum highlight valido retornado pelo OpenRouter")
total_duration = 75.0
if transcription.segments:
total_duration = max(seg.end for seg in transcription.segments)
@staticmethod
def _extract_json(response_text: str) -> Dict:
try:
return json.loads(response_text)
except json.JSONDecodeError:
start = response_text.find("{")
end = response_text.rfind("}")
if start == -1 or end == -1:
raise
subset = response_text[start : end + 1]
return json.loads(subset)
fallback_end = min(75.0, total_duration)
if fallback_end < 60.0:
fallback_end = min(60.0, total_duration)
return [{
"start": 0.0,
"end": fallback_end,
"summary": "Trecho inicial do video (fallback automatico)"
}]
class OpenRouterCopywriter:
def __init__(self, settings: Settings) -> None:
if not settings.openrouter.api_key:
raise RuntimeError("OPENROUTER_API_KEY nao foi definido")
self.settings = settings
logger.info(f"OpenRouter retornou {len(valid_highlights)} highlights validos")
return valid_highlights
def generate_titles(self, highlights: List[Dict]) -> List[str]:
if not highlights:
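
For reference, the response shape that _extract_json and the validation loop above expect from the model (editor's sketch; timestamps are illustrative, the schema is inferred from the parsing code):

expected = {
    "highlights": [
        {"start": 120.0, "end": 195.0, "summary": "Guest explains the pricing pivot"},  # kept: 45-120 s long
        {"start": 300.0, "end": 320.0, "summary": "Quick aside"},                       # dropped: under 45 s
        {"start": 500.0, "end": 480.0, "summary": "Broken timestamps"},                 # dropped: end <= start
    ]
}
# If nothing survives validation, generate_highlights falls back to a single
# window covering roughly the first 60-75 seconds of the video.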

View File

@@ -35,11 +35,29 @@ class MediaPreparer:
sanitized_name = sanitize_filename(Path(filename).stem)
workspace_dir = ensure_workspace(self.settings.videos_dir, sanitized_name)
transcription_json = workspace_dir / "transcription.json"
transcription_txt = workspace_dir / "transcription.txt"
temp_transcription_json = None
temp_transcription_txt = None
if transcription_json.exists():
temp_transcription_json = workspace_dir.parent / f".{sanitized_name}_transcription.json.tmp"
shutil.copy2(transcription_json, temp_transcription_json)
if transcription_txt.exists():
temp_transcription_txt = workspace_dir.parent / f".{sanitized_name}_transcription.txt.tmp"
shutil.copy2(transcription_txt, temp_transcription_txt)
existing_children = list(workspace_dir.iterdir())
if existing_children:
logger.info("Limpando workspace existente para %s", sanitized_name)
remove_paths(existing_children)
if temp_transcription_json and temp_transcription_json.exists():
shutil.move(str(temp_transcription_json), str(transcription_json))
logger.info("Transcrição preservada em %s", transcription_json)
if temp_transcription_txt and temp_transcription_txt.exists():
shutil.move(str(temp_transcription_txt), str(transcription_txt))
destination_name = f"{sanitized_name}{source_path.suffix.lower()}"
working_video_path = workspace_dir / destination_name
shutil.copy2(source_path, working_video_path)
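
The park-wipe-restore pattern above can be exercised on its own; a throwaway sketch (editor's example) using a temporary directory:

import shutil
from pathlib import Path
from tempfile import TemporaryDirectory

with TemporaryDirectory() as tmp:
    workspace = Path(tmp) / "my_video"
    workspace.mkdir()
    transcription = workspace / "transcription.json"
    transcription.write_text('{"segments": []}')

    parked = workspace.parent / ".my_video_transcription.json.tmp"
    shutil.copy2(transcription, parked)            # park it outside the workspace
    for child in workspace.iterdir():              # wipe the workspace
        child.unlink()
    shutil.move(str(parked), str(transcription))   # restore it
    assert transcription.exists()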

View File

@@ -6,7 +6,7 @@ from pathlib import Path
from typing import Any, Dict, List, Optional
from video_render.config import Settings
from video_render.llm import GeminiHighlighter, OpenRouterCopywriter
from video_render.llm import OpenRouterCopywriter
from video_render.media import MediaPreparer, VideoWorkspace
from video_render.transcription import TranscriptionResult, TranscriptionService
from video_render.utils import remove_paths, sanitize_filename
@@ -55,8 +55,7 @@ class VideoPipeline:
self.settings = settings
self.media_preparer = MediaPreparer(settings)
self.transcriber = TranscriptionService(settings)
self.highlighter = GeminiHighlighter(settings)
self.copywriter = OpenRouterCopywriter(settings)
self.llm_service = OpenRouterCopywriter(settings) # Using OpenRouter for both highlights and titles
self.renderer = VideoRenderer(settings)
def process_message(self, message: Dict[str, Any]) -> Dict[str, Any]:
@@ -65,12 +64,11 @@ class VideoPipeline:
self._prepare_workspace(context)
self._generate_transcription(context)
self._determine_highlights(context)
self._generate_titles(context)
self._render_clips(context)
return self._build_success_payload(context)
except Exception as exc:
logger.exception("Falha ao processar vídeo %s", context.job.filename)
# return self._handle_failure(context, exc)
def _parse_job(self, message: Dict[str, Any]) -> JobMessage:
filename = message.get("filename")
@@ -102,7 +100,10 @@ class VideoPipeline:
context.transcription = existing
return
transcription = self.transcriber.transcribe(context.workspace.audio_path)
transcription = self.transcriber.transcribe(
context.workspace.audio_path,
output_dir=context.workspace.workspace_dir
)
TranscriptionService.persist(transcription, context.workspace.workspace_dir)
context.transcription = transcription
@@ -111,10 +112,10 @@ class VideoPipeline:
raise RuntimeError("Transcricao nao disponivel")
try:
highlights_raw = self.highlighter.generate_highlights(context.transcription)
highlights_raw = self.llm_service.generate_highlights(context.transcription)
except Exception:
logger.exception(
"Falha ao gerar destaques com Gemini; aplicando fallback padrao."
"Falha ao gerar destaques com OpenRouter; aplicando fallback padrao."
)
context.highlight_windows = [self._build_fallback_highlight(context)]
return
@@ -130,11 +131,13 @@ class VideoPipeline:
continue
summary = str(item.get("summary", "")).strip()
title = str(item.get("title", summary[:60])).strip()
if end <= start:
logger.debug("Highlight com intervalo invalido ignorado: %s", item)
continue
windows.append(HighlightWindow(start=start, end=end, summary=summary))
windows.append(HighlightWindow(start=start, end=end, summary=summary, title=title))
if not windows:
windows.append(self._build_fallback_highlight(context))
@@ -142,17 +145,12 @@ class VideoPipeline:
context.highlight_windows = windows
def _generate_titles(self, context: PipelineContext) -> None:
if not context.highlight_windows:
return
"""DEPRECATED: Titles are now generated together with highlights.
highlight_dicts = [
{"start": window.start, "end": window.end, "summary": window.summary}
for window in context.highlight_windows
]
titles = self.copywriter.generate_titles(highlight_dicts)
for window, title in zip(context.highlight_windows, titles):
window.title = title.strip()
This method is kept for backwards compatibility but does nothing.
Titles are extracted from highlights in _determine_highlights().
"""
pass
def _build_fallback_highlight(self, context: PipelineContext) -> HighlightWindow:
if not context.transcription:
@@ -167,6 +165,7 @@ class VideoPipeline:
start=0.0,
end=max(last_end, 10.0),
summary="Sem destaque identificado; fallback automatico.",
title="Confira este momento",
)
def _render_clips(self, context: PipelineContext) -> None:
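
A sketch of driving the pipeline directly (editor's example; the module path for VideoPipeline is assumed, and the source file is expected to already sit in the configured videos directory). _parse_job only requires the "filename" key:

from video_render.config import Settings
from video_render.pipeline import VideoPipeline  # module path assumed

settings = Settings()
pipeline = VideoPipeline(settings)
result = pipeline.process_message({"filename": "podcast_episode_12.mp4"})
print(result)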

View File

@@ -15,6 +15,7 @@ from PIL import Image, ImageColor, ImageDraw, ImageFont
from video_render.config import Settings
from video_render.transcription import TranscriptionResult, WordTiming
from video_render.smart_framing import SmartFramer, extract_audio_samples
logger = logging.getLogger(__name__)
@@ -54,7 +55,41 @@ class CaptionBuilder:
self.space_width = self.font.getbbox(" ")[2] - self.font.getbbox(" ")[0]
def build(self, words: Sequence[WordTiming], clip_start: float) -> List[CaptionClipSet]:
grouped = self._group_words(words)
# Filter out empty, whitespace-only, or very short words (likely noise)
valid_words = [
w for w in words
if w.word
and w.word.strip()
and len(w.word.strip()) >= 2 # At least 2 characters
and not w.word.strip() in ['...', '..', '.', ',', '-', 'hmm', 'hm', 'ah', 'eh', 'uh'] # Not just punctuation or filler
]
# Note: We don't filter out words based on gaps here
# Gap detection is handled in _group_words_with_gaps
# This ensures captions disappear during silence naturally
filtered_words = valid_words
# Calculate speech density (words per second)
# If density is too low, it's likely just noise/silence being misinterpreted
if filtered_words:
first_word_time = filtered_words[0].start
last_word_time = filtered_words[-1].end
duration = last_word_time - first_word_time
if duration > 0:
words_per_second = len(filtered_words) / duration
# Typical speech is 2-3 words per second
# If less than 0.5 words/second, it's probably silence/noise
if words_per_second < 0.5:
logger.debug(f"Captions suprimidas: densidade muito baixa ({words_per_second:.2f} palavras/seg)")
return []
# Only show captions if we have at least 3 valid words (reduced from 5 for 2-word groups)
# This prevents showing captions for noise/mumbling
if len(filtered_words) < 3:
return []
grouped = self._group_words_with_gaps(filtered_words)
clip_sets: List[CaptionClipSet] = []
for group in grouped:
@@ -101,6 +136,92 @@ class CaptionBuilder:
if len(widths) > 1:
total_width += self.space_width * (len(widths) - 1)
# Check if text needs to wrap to multiple lines
# If total width exceeds canvas width, break into 2 lines
needs_wrap = total_width > self.canvas_width
if needs_wrap:
# Split into 2 lines - try to balance the lines
mid_point = len(texts) // 2
line1_texts = texts[:mid_point]
line2_texts = texts[mid_point:]
line1_widths = widths[:mid_point]
line2_widths = widths[mid_point:]
# Calculate widths for each line
line1_width = sum(line1_widths)
if len(line1_widths) > 1:
line1_width += self.space_width * (len(line1_widths) - 1)
line2_width = sum(line2_widths)
if len(line2_widths) > 1:
line2_width += self.space_width * (len(line2_widths) - 1)
# Double the canvas height for 2 lines
canvas_height = self.canvas_height * 2
base_image = Image.new("RGBA", (self.canvas_width, canvas_height), (0, 0, 0, 0))
base_draw = ImageDraw.Draw(base_image)
highlight_images: List[Image.Image] = []
# Stroke settings: 8px black stroke for better readability
stroke_width = 8
stroke_color = (0, 0, 0, 255) # Black
# Draw line 1
x = max(0, (self.canvas_width - line1_width) // 2)
y = self.baseline
for i, (text, width) in enumerate(zip(line1_texts, line1_widths)):
base_draw.text(
(x, y),
text,
font=self.font,
fill=self.base_color,
stroke_width=stroke_width,
stroke_fill=stroke_color
)
highlight_image = Image.new("RGBA", base_image.size, (0, 0, 0, 0))
highlight_draw = ImageDraw.Draw(highlight_image)
highlight_draw.text(
(x, y),
text,
font=self.font,
fill=self.highlight_color,
stroke_width=stroke_width,
stroke_fill=stroke_color
)
highlight_images.append(highlight_image)
x += width + self.space_width
# Draw line 2
x = max(0, (self.canvas_width - line2_width) // 2)
y = self.baseline + self.text_height + 5 # 5px spacing between lines
for i, (text, width) in enumerate(zip(line2_texts, line2_widths)):
base_draw.text(
(x, y),
text,
font=self.font,
fill=self.base_color,
stroke_width=stroke_width,
stroke_fill=stroke_color
)
highlight_image = Image.new("RGBA", base_image.size, (0, 0, 0, 0))
highlight_draw = ImageDraw.Draw(highlight_image)
highlight_draw.text(
(x, y),
text,
font=self.font,
fill=self.highlight_color,
stroke_width=stroke_width,
stroke_fill=stroke_color
)
highlight_images.append(highlight_image)
x += width + self.space_width
return base_image, highlight_images
# Single line rendering (original code)
start_x = max(0, (self.canvas_width - total_width) // 2)
base_image = Image.new("RGBA", (self.canvas_width, self.canvas_height), (0, 0, 0, 0))
@@ -108,13 +229,31 @@ class CaptionBuilder:
highlight_images: List[Image.Image] = []
x = start_x
for text, width in zip(texts, widths):
base_draw.text((x, self.baseline), text, font=self.font, fill=self.base_color)
# Stroke settings: 8px black stroke for better readability
stroke_width = 8
stroke_color = (0, 0, 0, 255) # Black
for text, width in zip(texts, widths):
# Draw base text with stroke
base_draw.text(
(x, self.baseline),
text,
font=self.font,
fill=self.base_color,
stroke_width=stroke_width,
stroke_fill=stroke_color
)
# Draw highlight text with stroke
highlight_image = Image.new("RGBA", base_image.size, (0, 0, 0, 0))
highlight_draw = ImageDraw.Draw(highlight_image)
highlight_draw.text(
(x, self.baseline), text, font=self.font, fill=self.highlight_color
(x, self.baseline),
text,
font=self.font,
fill=self.highlight_color,
stroke_width=stroke_width,
stroke_fill=stroke_color
)
highlight_images.append(highlight_image)
@@ -153,6 +292,44 @@ class CaptionBuilder:
return grouped
def _group_words_with_gaps(self, words: Sequence[WordTiming]) -> List[List[WordTiming]]:
"""
Group words into 2-word chunks, respecting silence gaps.
Creates natural breaks where there are pauses > 1.5s
"""
if not words:
return []
grouped: List[List[WordTiming]] = []
buffer: List[WordTiming] = []
for i, word in enumerate(words):
# Check if there's a long pause before this word
if i > 0:
gap = word.start - words[i-1].end
# If gap > 1.5s, finish current buffer and start new group
if gap > 1.5:
if buffer:
grouped.append(buffer)
buffer = []
buffer.append(word)
# Group into 2 words maximum
if len(buffer) == 2:
grouped.append(buffer)
buffer = []
# Handle remaining words
if buffer:
if len(buffer) == 1 and grouped:
# Add single remaining word to last group
grouped[-1].append(buffer[0])
else:
grouped.append(buffer)
return [grp for grp in grouped if grp]
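# Editor's illustration (not part of this commit) of the grouping above.
# WordTiming is stood in for by a namedtuple exposing the three fields the
# method reads (word, start, end):
from collections import namedtuple
W = namedtuple("W", "word start end")
sample = [W("ola", 0.0, 0.3), W("pessoal", 0.3, 0.6),
          W("hoje", 0.7, 1.0),                       # a 2.0 s pause follows
          W("vamos", 3.0, 3.3), W("falar", 3.3, 3.6)]
# _group_words_with_gaps(sample) -> [[ola, pessoal], [hoje], [vamos, falar]]
# "hoje" ends up alone because the long gap closes its group before "vamos".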
@staticmethod
def _clean_word(text: str) -> str:
text = text.strip()
@@ -164,6 +341,12 @@ class VideoRenderer:
def __init__(self, settings: Settings) -> None:
self.settings = settings
self.captions = CaptionBuilder(settings)
self.smart_framer = SmartFramer(
target_width=settings.rendering.frame_width,
target_height=settings.rendering.frame_height,
frame_skip=settings.rendering.smart_framing_frame_skip,
smoothing_window=settings.rendering.smart_framing_smoothing_window
)
def render(
self,
@@ -234,26 +417,100 @@ class VideoRenderer:
duration = end - start
frame_w = self.settings.rendering.frame_width
frame_h = self.settings.rendering.frame_height
top_h = int(frame_h * 0.18)
# Removed top panel - no longer showing title
bottom_h = int(frame_h * 0.20)
video_area_h = max(1, frame_h - top_h - bottom_h)
scale_factor = min(
frame_w / subclip.w,
video_area_h / subclip.h,
)
resized_clip = subclip.resized(scale_factor)
video_y = top_h + (video_area_h - resized_clip.h) // 2
video_clip = resized_clip.with_position(
((frame_w - resized_clip.w) // 2, video_y)
)
# Use smart framing to create intelligent 9:16 video (if enabled)
if self.settings.rendering.enable_smart_framing:
logger.info(f"Creating smart framing plan for clip {index} ({start:.2f}s - {end:.2f}s)")
try:
# Extract audio for speech detection
audio_samples = extract_audio_samples(source_path, start, end)
# Create framing plan
framing_plan = self.smart_framer.create_framing_plan(
video_path=source_path,
start_time=start,
end_time=end,
audio_samples=audio_samples
)
# Apply smart framing based on detected layout
use_split_screen = framing_plan.layout_mode in ["dual_split", "grid"]
video_clip = self.smart_framer.apply_framing(
video_clip=subclip,
framing_plan=framing_plan,
use_split_screen=use_split_screen
)
logger.info(f"Smart framing applied: layout={framing_plan.layout_mode}, "
f"faces_detected={len(framing_plan.frame_contexts[0].detected_faces) if framing_plan.frame_contexts else 0}")
except Exception as exc:
logger.warning(f"Smart framing failed for clip {index}, falling back to center crop: {exc}", exc_info=True)
# Fallback to center crop (maintains aspect ratio, crops to fit)
video_area_h = max(1, frame_h - bottom_h)
# Use MAX to ensure video covers entire area (will crop excess)
scale_factor = max(
frame_w / subclip.w,
video_area_h / subclip.h,
)
# Resize to cover area
resized_clip = subclip.resized(scale_factor)
# Calculate crop region (center crop)
crop_x1 = max(0, (resized_clip.w - frame_w) // 2)
crop_y1 = max(0, (resized_clip.h - video_area_h) // 2)
crop_x2 = crop_x1 + frame_w
crop_y2 = crop_y1 + video_area_h
# Crop to fit target dimensions using MoviePy crop(x1, y1, x2, y2)
cropped_clip = resized_clip.cropped(
x1=crop_x1,
y1=crop_y1,
x2=crop_x2,
y2=crop_y2
)
video_clip = cropped_clip.with_position((0, 0))
resized_clip.close()
else:
# Use center crop (smart framing disabled)
logger.info(f"Using center crop for clip {index} (smart framing disabled)")
video_area_h = max(1, frame_h - bottom_h)
# Use MAX to ensure video covers entire area (will crop excess)
scale_factor = max(
frame_w / subclip.w,
video_area_h / subclip.h,
)
# Resize to cover area
resized_clip = subclip.resized(scale_factor)
# Calculate crop region (center crop)
crop_x1 = max(0, (resized_clip.w - frame_w) // 2)
crop_y1 = max(0, (resized_clip.h - video_area_h) // 2)
crop_x2 = crop_x1 + frame_w
crop_y2 = crop_y1 + video_area_h
# Crop to fit target dimensions using MoviePy crop(x1, y1, x2, y2)
cropped_clip = resized_clip.cropped(
x1=crop_x1,
y1=crop_y1,
x2=crop_x2,
y2=crop_y2
)
video_clip = cropped_clip.with_position((0, 0))
resized_clip.close()
background = ColorClip(size=(frame_w, frame_h), color=(0, 0, 0)).with_duration(duration)
top_panel = (
ColorClip(size=(frame_w, top_h), color=(12, 12, 12))
.with_duration(duration)
.with_opacity(0.85)
)
# Removed top panel and title - no longer needed
bottom_panel = (
ColorClip(size=(frame_w, bottom_h), color=(12, 12, 12))
.with_position((0, frame_h - bottom_h))
@@ -261,34 +518,42 @@ class VideoRenderer:
.with_opacity(0.85)
)
title_clip = self._build_title_clip(
title=title,
summary=summary,
duration=duration,
frame_width=frame_w,
top_panel_height=top_h,
)
title_clip = title_clip.with_position(
((frame_w - title_clip.w) // 2, (top_h - title_clip.h) // 2)
)
words = self._collect_words(transcription, start, end)
caption_sets = self.captions.build(words, clip_start=start)
# Calculate speech coverage: how much of the clip has actual speech?
# If less than 30% of the clip has speech, don't show captions
clip_duration = end - start
if words and clip_duration > 0:
# Calculate total time with speech
total_speech_time = sum(w.end - w.start for w in words)
speech_coverage = total_speech_time / clip_duration
if speech_coverage < 0.3: # Less than 30% speech
logger.debug(f"Captions suprimidas: cobertura de fala baixa ({speech_coverage:.1%})")
words = [] # Clear words to prevent captions
# Only build captions if there are actual words to display
# This prevents empty/placeholder captions from appearing
caption_sets = self.captions.build(words, clip_start=start) if words else []
caption_clips = []
caption_resources: List[ImageClip] = []
caption_area_top = frame_h - bottom_h
caption_area_height = bottom_h
# Position captions 120px below center (for 1920px height, center is 960px, so 1080px)
# This ensures they're visible, well-positioned, and don't interfere with faces
# Range: 100-150px as requested, using 120px for optimal positioning
center_y = frame_h // 2
caption_y = center_y + 120
caption_margin = 20
raw_caption_y = caption_area_top + (caption_area_height - self.captions.canvas_height) // 2
min_caption_y = caption_area_top + caption_margin
max_caption_y = (
caption_area_top + caption_area_height - self.captions.canvas_height - caption_margin
)
# Ensure captions stay within reasonable bounds (no top panel now)
min_caption_y = caption_margin
max_caption_y = frame_h - bottom_h - self.captions.canvas_height - caption_margin
if max_caption_y < min_caption_y:
caption_y = min_caption_y
else:
caption_y = min(max(raw_caption_y, min_caption_y), max_caption_y)
caption_y = min(max(caption_y, min_caption_y), max_caption_y)
for clip_set in caption_sets:
base_positioned = clip_set.base.with_position(("center", caption_y))
@@ -299,30 +564,20 @@ class VideoRenderer:
caption_clips.append(positioned)
caption_resources.append(highlight)
if not caption_clips:
fallback_text = self._wrap_text(summary or title, max_width=frame_w - 160)
caption_clips.append(
self._make_textclip(
text=fallback_text,
font_path=self.settings.rendering.font_path,
font_size=self.settings.rendering.subtitle_font_size,
color=self.settings.rendering.base_color,
size=(frame_w - 160, max(40, self.captions.canvas_height)),
)
.with_duration(duration)
.with_position(("center", caption_y))
)
# No fallback captions - if there are no dynamic captions, show nothing
# This matches Opus Clip behavior where captions only appear when there's actual speech
audio_clip, audio_needs_close = self._materialize_audio(
source_path=source_path,
start=start,
end=end,
duration=duration,
fallback_audio=video_clip.audio or resized_clip.audio or subclip.audio,
fallback_audio=video_clip.audio or subclip.audio,
)
# Composite with background, bottom panel, video, and captions only (no top panel or title)
composite = CompositeVideoClip(
[background, top_panel, bottom_panel, video_clip, title_clip, *caption_clips],
[background, bottom_panel, video_clip, *caption_clips],
size=(frame_w, frame_h),
)
if audio_clip is not None:
@@ -337,11 +592,8 @@ class VideoRenderer:
)
composite.close()
resized_clip.close()
video_clip.close()
title_clip.close()
background.close()
top_panel.close()
bottom_panel.close()
for clip in caption_clips:
clip.close()
@@ -352,95 +604,6 @@ class VideoRenderer:
return str(output_path)
def _build_title_clip(
self,
*,
title: str,
summary: str,
duration: float,
frame_width: int,
top_panel_height: int,
) -> ImageClip:
text = (title or summary or "").strip()
if not text:
text = summary or ""
max_width = max(200, frame_width - 160)
font_size = self.settings.rendering.title_font_size
min_font_size = max(28, int(font_size * 0.6))
target_height = max(80, top_panel_height - 40)
title_color = ImageColor.getrgb(self.settings.rendering.base_color)
font_path = self.settings.rendering.font_path
while True:
font = ImageFont.truetype(str(font_path), font_size)
lines = self._split_title_lines(text, font, max_width)
line_height = font.getbbox("Ay")[3] - font.getbbox("Ay")[1]
spacing = max(4, int(line_height * 0.25))
text_height = self._measure_text_height(len(lines), line_height, spacing)
if text_height <= target_height or font_size <= min_font_size:
break
font_size = max(min_font_size, font_size - 6)
# Recompute dimensions with final font size to ensure consistency
font = ImageFont.truetype(str(font_path), font_size)
lines = self._split_title_lines(text, font, max_width)
line_height = font.getbbox("Ay")[3] - font.getbbox("Ay")[1]
spacing = max(4, int(line_height * 0.25))
text_height = self._measure_text_height(len(lines), line_height, spacing)
canvas_height = max(1, text_height)
image = Image.new("RGBA", (max_width, canvas_height), (0, 0, 0, 0))
draw = ImageDraw.Draw(image)
y = 0
for idx, line in enumerate(lines):
bbox = font.getbbox(line)
line_width = bbox[2] - bbox[0]
x = max(0, (max_width - line_width) // 2)
draw.text((x, y - bbox[1]), line, font=font, fill=title_color)
y += line_height
if idx < len(lines) - 1:
y += spacing
return ImageClip(np.array(image)).with_duration(duration)
@staticmethod
def _measure_text_height(line_count: int, line_height: int, spacing: int) -> int:
if line_count <= 0:
return line_height
return line_count * line_height + max(0, line_count - 1) * spacing
@staticmethod
def _split_title_lines(
text: str, font: ImageFont.FreeTypeFont, max_width: int
) -> List[str]:
words = text.split()
if not words:
return [""]
lines: List[str] = []
current: List[str] = []
for word in words:
test_line = " ".join(current + [word]) if current else word
bbox = font.getbbox(test_line)
line_width = bbox[2] - bbox[0]
if line_width <= max_width or not current:
current.append(word)
if line_width > max_width and not current[:-1]:
lines.append(" ".join(current))
current = []
continue
lines.append(" ".join(current))
current = [word]
if current:
lines.append(" ".join(current))
return lines
def _materialize_audio(
self,
*,

View File

@@ -0,0 +1,687 @@
"""
Smart framing module for intelligent video cropping and composition.
This module provides functionality to create 9:16 vertical videos with
intelligent framing that follows the action and speakers.
"""
from __future__ import annotations
import logging
from dataclasses import dataclass
from typing import List, Optional, Tuple
import cv2
import numpy as np
from moviepy.video.VideoClip import VideoClip
from moviepy.video.io.VideoFileClip import VideoFileClip
from scipy import signal
from video_render.context_detection import ContextAnalyzer, FrameContext, FaceDetection
logger = logging.getLogger(__name__)
@dataclass
class CropRegion:
"""Defines a crop region for a frame."""
x: int
y: int
width: int
height: int
@dataclass
class FramingPlan:
"""Complete framing plan for a video segment."""
frame_contexts: List[FrameContext]
crop_regions: List[CropRegion]
layout_mode: str
fps: float
class SmartFramer:
"""Creates intelligent 9:16 framing for horizontal videos."""
def __init__(
self,
target_width: int = 1080,
target_height: int = 1920,
frame_skip: int = 2,
smoothing_window: int = 15
):
self.target_width = target_width
self.target_height = target_height
self.target_aspect = target_height / target_width
# Performance parameters
self.frame_skip = frame_skip # Process every Nth frame (CPU optimization)
# Smoothing parameters
self.smoothing_window = smoothing_window
self.max_velocity = 30 # pixels per frame (reduced for smoother transitions)
logger.info(f"Smart framer initialized (target: {target_width}x{target_height}, frame_skip={frame_skip})")
def create_framing_plan(
self,
video_path: str,
start_time: float,
end_time: float,
audio_samples: Optional[np.ndarray] = None
) -> FramingPlan:
"""
Analyze video and create a complete framing plan.
Args:
video_path: Path to video file
start_time: Start time in seconds
end_time: End time in seconds
audio_samples: Optional audio samples for speech detection
Returns:
FramingPlan with all frame contexts and crop regions
"""
analyzer = ContextAnalyzer()
# Detect speaking periods from audio if available
speaking_periods = None
if audio_samples is not None:
speaking_periods = analyzer.audio_detector.detect_speaking_periods(audio_samples)
# Open video with error suppression for AV1 codec warnings
import os
os.environ['OPENCV_FFMPEG_CAPTURE_OPTIONS'] = 'loglevel;quiet'
cap = cv2.VideoCapture(video_path)
fps = cap.get(cv2.CAP_PROP_FPS)
# Calculate frame range
start_frame = int(start_time * fps)
end_frame = int(end_time * fps)
# Set to start frame
cap.set(cv2.CAP_PROP_POS_FRAMES, start_frame)
frame_contexts = []
frame_number = start_frame
processed_count = 0
logger.info(f"Analyzing frames {start_frame} to {end_frame} (fps={fps}, skip={self.frame_skip})")
while frame_number < end_frame:
ret, frame = cap.read()
if not ret:
break
# Only process every Nth frame for performance (CPU optimization)
if processed_count % self.frame_skip == 0:
timestamp = frame_number / fps
context = analyzer.analyze_frame(frame, timestamp, frame_number, speaking_periods)
frame_contexts.append(context)
frame_number += 1
processed_count += 1
# Get video dimensions before releasing capture
source_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
source_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
cap.release()
analyzer.close()
# Determine overall layout mode (most common)
layout_modes = [ctx.layout_mode for ctx in frame_contexts]
if layout_modes:
overall_layout = max(set(layout_modes), key=layout_modes.count)
else:
overall_layout = "single"
# Calculate crop regions based on contexts
crop_regions = self._calculate_crop_regions(
frame_contexts,
source_width,
source_height
)
return FramingPlan(
frame_contexts=frame_contexts,
crop_regions=crop_regions,
layout_mode=overall_layout,
fps=fps
)
def _calculate_crop_regions(
self,
contexts: List[FrameContext],
source_width: int,
source_height: int
) -> List[CropRegion]:
"""
Calculate smooth crop regions for each frame.
Args:
contexts: List of frame contexts
source_width: Source video width
source_height: Source video height
Returns:
List of crop regions
"""
if not contexts:
return []
# Calculate ideal crop dimensions maintaining EXACT 9:16 aspect ratio
source_aspect = source_width / source_height
if source_aspect > self.target_aspect:
# Source is wider - crop horizontally (use full height)
crop_height = source_height
crop_width = int(crop_height / self.target_aspect)
# Ensure crop width fits within source
if crop_width > source_width:
crop_width = source_width
crop_height = int(crop_width * self.target_aspect)
else:
# Source is taller - crop vertically (use full width)
crop_width = source_width
crop_height = int(crop_width * self.target_aspect)
# Ensure crop height fits within source
if crop_height > source_height:
crop_height = source_height
crop_width = int(crop_height / self.target_aspect)
# Calculate center points for each frame
# Since we now always focus on ONE person directly (not averaging),
# we can use the focus point directly without complex validation
center_xs = []
center_ys = []
for ctx in contexts:
if ctx.primary_focus:
# Primary focus is now always a single person's center, never averaged
# This means it will never be on the table/empty space
center_xs.append(ctx.primary_focus[0])
center_ys.append(ctx.primary_focus[1])
else:
# Default to center only if no faces detected at all
center_xs.append(source_width // 2)
center_ys.append(source_height // 2)
# Smooth the center points
if len(center_xs) > self.smoothing_window:
kernel_size = min(self.smoothing_window, len(center_xs))
if kernel_size % 2 == 0:
kernel_size -= 1
center_xs = signal.medfilt(center_xs, kernel_size=kernel_size).tolist()
center_ys = signal.medfilt(center_ys, kernel_size=kernel_size).tolist()
# Limit velocity (prevent jarring movements)
center_xs = self._limit_velocity(center_xs, self.max_velocity)
center_ys = self._limit_velocity(center_ys, self.max_velocity)
# Convert to crop regions
crop_regions = []
for center_x, center_y in zip(center_xs, center_ys):
# Calculate top-left corner
x = int(center_x - crop_width // 2)
y = int(center_y - crop_height // 2)
# Clamp to valid bounds
x = max(0, min(x, source_width - crop_width))
y = max(0, min(y, source_height - crop_height))
crop_regions.append(CropRegion(
x=x,
y=y,
width=crop_width,
height=crop_height
))
return crop_regions
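# Editor's worked example (not part of this commit): crop size chosen by the
# aspect-ratio logic above for a 1920x1080 source and a 1080x1920 target.
# target_aspect = 1920/1080 ~= 1.778 and source_aspect = 1920/1080 ~= 1.778,
# so the else-branch runs, overshoots the height, and falls back to full height:
#   crop_width = 1920 -> crop_height = int(1920 * 1.778) = 3413 > 1080
#   crop_height = 1080 -> crop_width = int(1080 / 1.778) = 607
# A 607x1080 window is then panned across the source and later resized to 1080x1920.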
def _limit_velocity(self, positions: List[float], max_velocity: float) -> List[float]:
"""
Limit the velocity of position changes.
Args:
positions: List of positions
max_velocity: Maximum allowed change per frame
Returns:
Smoothed positions
"""
if len(positions) <= 1:
return positions
limited = [positions[0]]
for i in range(1, len(positions)):
delta = positions[i] - limited[i - 1]
if abs(delta) > max_velocity:
delta = max_velocity if delta > 0 else -max_velocity
limited.append(limited[i - 1] + delta)
return limited
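# Editor's illustration (not part of this commit): with max_velocity=30 the
# limiter turns an abrupt jump in focus into a bounded pan:
#   _limit_velocity([0, 100, 120, 120], max_velocity=30) -> [0, 30, 60, 90]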
def apply_framing(
self,
video_clip: VideoFileClip,
framing_plan: FramingPlan,
use_split_screen: bool = False
) -> VideoClip:
"""
Apply smart framing to a video clip.
Args:
video_clip: Source video clip
framing_plan: Framing plan to apply
use_split_screen: Whether to use split screen for multiple people
Returns:
Reframed video clip
"""
# Handle different layout modes
if framing_plan.layout_mode in ["single", "single_speaker"]:
# Single person or single speaker - use focused single framing
return self._apply_single_framing(video_clip, framing_plan)
elif framing_plan.layout_mode == "dual_split" and use_split_screen:
# Two people in conversation - use split screen
return self._apply_split_screen(video_clip, framing_plan)
elif framing_plan.layout_mode == "grid" and use_split_screen:
# 3+ people - use grid layout
return self._apply_grid_layout(video_clip, framing_plan)
else:
# Fallback to single framing
return self._apply_single_framing(video_clip, framing_plan)
def _apply_single_framing(
self,
video_clip: VideoFileClip,
framing_plan: FramingPlan
) -> VideoClip:
"""
Apply single-focus framing (following one person or action).
Args:
video_clip: Source video clip
framing_plan: Framing plan
Returns:
Reframed video clip
"""
def make_frame(t):
# Get the original frame
frame = video_clip.get_frame(t)
# Ensure we have valid crop regions
if not framing_plan.crop_regions:
# Fallback: return center crop
h, w = frame.shape[:2]
crop_h = int(w * self.target_aspect)
crop_w = w
if crop_h > h:
crop_h = h
crop_w = int(h / self.target_aspect)
y = (h - crop_h) // 2
x = (w - crop_w) // 2
cropped = frame[y:y + crop_h, x:x + crop_w]
else:
# Calculate exact frame index with decimal precision for interpolation
exact_frame_idx = (t * framing_plan.fps) / self.frame_skip
# Get the two adjacent analyzed frames
idx_floor = int(exact_frame_idx)
idx_ceil = idx_floor + 1
# Interpolation factor (0.0 to 1.0)
alpha = exact_frame_idx - idx_floor
# Clamp indices to valid range
idx_floor = max(0, min(idx_floor, len(framing_plan.crop_regions) - 1))
idx_ceil = max(0, min(idx_ceil, len(framing_plan.crop_regions) - 1))
# Get crop regions
crop1 = framing_plan.crop_regions[idx_floor]
crop2 = framing_plan.crop_regions[idx_ceil]
# Linear interpolation between crop regions
x = int(crop1.x * (1 - alpha) + crop2.x * alpha)
y = int(crop1.y * (1 - alpha) + crop2.y * alpha)
width = int(crop1.width * (1 - alpha) + crop2.width * alpha)
height = int(crop1.height * (1 - alpha) + crop2.height * alpha)
# Ensure crop stays within frame bounds
h, w = frame.shape[:2]
x = max(0, min(x, w - width))
y = max(0, min(y, h - height))
width = min(width, w - x)
height = min(height, h - y)
# Crop the frame
cropped = frame[y:y + height, x:x + width]
# Resize to target dimensions
resized = cv2.resize(
cropped,
(self.target_width, self.target_height),
interpolation=cv2.INTER_LINEAR
)
return resized
# MoviePy 2.x compatible way to create VideoClip
new_clip = VideoClip(duration=video_clip.duration)
new_clip.size = (self.target_width, self.target_height)
new_clip.frame_function = make_frame
return new_clip
def _apply_split_screen(
self,
video_clip: VideoFileClip,
framing_plan: FramingPlan
) -> VideoClip:
"""
Apply split screen for two people.
Args:
video_clip: Source video clip
framing_plan: Framing plan
Returns:
Split screen video clip
"""
def make_frame(t):
frame = video_clip.get_frame(t)
# Calculate exact frame index with decimal precision for smooth interpolation
exact_frame_idx = (t * framing_plan.fps) / self.frame_skip
frame_idx = int(exact_frame_idx)
# Ensure we have valid contexts
if not framing_plan.frame_contexts:
# Fallback to simple center crop
h, w = frame.shape[:2]
crop_h = int(w * self.target_aspect)
crop_w = w
if crop_h > h:
crop_h = h
crop_w = int(h / self.target_aspect)
y = (h - crop_h) // 2
x = (w - crop_w) // 2
cropped = frame[y:y + crop_h, x:x + crop_w]
return cv2.resize(cropped, (self.target_width, self.target_height), interpolation=cv2.INTER_LINEAR)
# Clamp index to valid range
frame_idx = max(0, min(frame_idx, len(framing_plan.frame_contexts) - 1))
context = framing_plan.frame_contexts[frame_idx]
# Create output frame
output = np.zeros((self.target_height, self.target_width, 3), dtype=np.uint8)
if len(context.detected_faces) >= 2:
# Split vertically 50/50 (two columns)
half_width = self.target_width // 2
# Select the 2 most relevant faces
# Priority: ALWAYS show active speaker first + most confident other person
if context.active_speakers and len(context.active_speakers) >= 1:
# Get the PRIMARY speaker (most confident among active speakers)
speaker_faces = [context.detected_faces[i] for i in context.active_speakers
if i < len(context.detected_faces)]
primary_speaker = max(speaker_faces, key=lambda f: f.confidence)
# Get OTHER faces (not the primary speaker)
other_faces = [f for f in context.detected_faces if f != primary_speaker]
if len(speaker_faces) >= 2:
# Multiple speakers: show primary + second most confident speaker
other_speakers = [f for f in speaker_faces if f != primary_speaker]
secondary_person = max(other_speakers, key=lambda f: f.confidence)
elif other_faces:
# One speaker: show speaker + most confident other person
secondary_person = max(other_faces, key=lambda f: f.confidence)
else:
# Fallback: only one person detected
secondary_person = primary_speaker
selected_faces = [primary_speaker, secondary_person]
else:
# No speakers: take 2 most confident faces
selected_faces = sorted(context.detected_faces, key=lambda f: f.confidence, reverse=True)[:2]
# Sort selected faces by horizontal position for consistent left/right placement
faces = sorted(selected_faces, key=lambda f: f.center_x)
left_face = faces[0]
right_face = faces[1]
# Process each person's frame
for idx, face in enumerate([left_face, right_face]):
# Calculate crop region focused on this person
# Each person gets half the width, full target aspect ratio (9:16)
# This ensures NO distortion when resizing
# For split screen: each side is half_width x full_height
# We need to maintain 9:16 aspect for each half
half_width = self.target_width // 2
half_aspect = self.target_height / half_width # Aspect ratio for half
# Determine crop size based on face with padding
face_width = max(face.width, frame.shape[1] // 4) # At least 1/4 of frame width
crop_width = int(face_width * 2.5) # Add padding around face
crop_height = int(crop_width * half_aspect) # Maintain correct aspect
# Ensure crop fits in frame, maintaining aspect ratio
max_crop_width = frame.shape[1] // 2 # Half the source width
max_crop_height = frame.shape[0] # Full source height
# If crop is too wide, scale down proportionally
if crop_width > max_crop_width:
crop_width = max_crop_width
crop_height = int(crop_width * half_aspect)
# If crop is too tall, scale down proportionally
if crop_height > max_crop_height:
crop_height = max_crop_height
crop_width = int(crop_height / half_aspect)
# Center crop on face
x = max(0, face.center_x - crop_width // 2)
y = max(0, face.center_y - crop_height // 2)
# Clamp to frame boundaries
x = min(x, frame.shape[1] - crop_width)
y = min(y, frame.shape[0] - crop_height)
# Extract and resize crop
cropped = frame[y:y + crop_height, x:x + crop_width]
resized = cv2.resize(
cropped,
(half_width, self.target_height),
interpolation=cv2.INTER_LINEAR
)
# Place in output at appropriate horizontal position
x_offset = idx * half_width
output[:, x_offset:x_offset + half_width] = resized
else:
# Fall back to single framing
if framing_plan.crop_regions:
crop_idx = max(0, min(frame_idx, len(framing_plan.crop_regions) - 1))
crop = framing_plan.crop_regions[crop_idx]
cropped = frame[crop.y:crop.y + crop.height, crop.x:crop.x + crop.width]
else:
# Fallback to center crop if no crop regions available
h, w = frame.shape[:2]
crop_h = int(w * self.target_aspect)
crop_w = w
if crop_h > h:
crop_h = h
crop_w = int(h / self.target_aspect)
y = (h - crop_h) // 2
x = (w - crop_w) // 2
cropped = frame[y:y + crop_h, x:x + crop_w]
output = cv2.resize(
cropped,
(self.target_width, self.target_height),
interpolation=cv2.INTER_LINEAR
)
return output
# MoviePy 2.x compatible way to create VideoClip
new_clip = VideoClip(duration=video_clip.duration)
new_clip.size = (self.target_width, self.target_height)
new_clip.frame_function = make_frame
return new_clip
def _apply_grid_layout(
self,
video_clip: VideoFileClip,
framing_plan: FramingPlan
) -> VideoClip:
"""
Apply grid layout for 3+ people.
Args:
video_clip: Source video clip
framing_plan: Framing plan
Returns:
Grid layout video clip
"""
def make_frame(t):
frame = video_clip.get_frame(t)
# Calculate exact frame index with decimal precision for smooth interpolation
exact_frame_idx = (t * framing_plan.fps) / self.frame_skip
frame_idx = int(exact_frame_idx)
# Ensure we have valid contexts
if not framing_plan.frame_contexts:
# Fallback to simple center crop
h, w = frame.shape[:2]
crop_h = int(w * self.target_aspect)
crop_w = w
if crop_h > h:
crop_h = h
crop_w = int(h / self.target_aspect)
y = (h - crop_h) // 2
x = (w - crop_w) // 2
cropped = frame[y:y + crop_h, x:x + crop_w]
return cv2.resize(cropped, (self.target_width, self.target_height), interpolation=cv2.INTER_LINEAR)
# Clamp index to valid range
frame_idx = max(0, min(frame_idx, len(framing_plan.frame_contexts) - 1))
context = framing_plan.frame_contexts[frame_idx]
output = np.zeros((self.target_height, self.target_width, 3), dtype=np.uint8)
num_faces = len(context.detected_faces)
if num_faces >= 3:
# Create 2x2 grid
cell_width = self.target_width // 2
cell_height = self.target_height // 2
for idx, face in enumerate(context.detected_faces[:4]):
# Calculate grid position
row = idx // 2
col = idx % 2
# Each grid cell keeps the target aspect ratio (cell_height / cell_width, the same 9:16 as the full frame)
cell_aspect = cell_height / cell_width
# Crop around face with correct aspect ratio
crop_width = frame.shape[1] // 2
crop_height = int(crop_width * cell_aspect)
# Ensure crop fits in frame, maintaining aspect
max_crop_width = frame.shape[1] // 2
max_crop_height = frame.shape[0] // 2
if crop_width > max_crop_width:
crop_width = max_crop_width
crop_height = int(crop_width * cell_aspect)
if crop_height > max_crop_height:
crop_height = max_crop_height
crop_width = int(crop_height / cell_aspect)
# Center crop on face
x = max(0, face.center_x - crop_width // 2)
y = max(0, face.center_y - crop_height // 2)
# Clamp to frame boundaries
x = min(x, frame.shape[1] - crop_width)
y = min(y, frame.shape[0] - crop_height)
cropped = frame[y:y + crop_height, x:x + crop_width]
resized = cv2.resize(
cropped,
(cell_width, cell_height),
interpolation=cv2.INTER_LINEAR
)
# Place in grid
y_offset = row * cell_height
x_offset = col * cell_width
output[y_offset:y_offset + cell_height, x_offset:x_offset + cell_width] = resized
else:
# Fall back to single framing
if framing_plan.crop_regions:
crop_idx = max(0, min(frame_idx, len(framing_plan.crop_regions) - 1))
crop = framing_plan.crop_regions[crop_idx]
cropped = frame[crop.y:crop.y + crop.height, crop.x:crop.x + crop.width]
else:
# Fallback to center crop if no crop regions available
h, w = frame.shape[:2]
crop_h = int(w * self.target_aspect)
crop_w = w
if crop_h > h:
crop_h = h
crop_w = int(h / self.target_aspect)
y = (h - crop_h) // 2
x = (w - crop_w) // 2
cropped = frame[y:y + crop_h, x:x + crop_w]
output = cv2.resize(
cropped,
(self.target_width, self.target_height),
interpolation=cv2.INTER_LINEAR
)
return output
# MoviePy 2.x compatible way to create VideoClip
new_clip = VideoClip(duration=video_clip.duration)
new_clip.size = (self.target_width, self.target_height)
new_clip.frame_function = make_frame
return new_clip
def extract_audio_samples(video_path: str, start_time: float, end_time: float) -> Optional[np.ndarray]:
"""
Extract audio samples from video for speech detection.
Args:
video_path: Path to video file
start_time: Start time in seconds
end_time: End time in seconds
Returns:
Audio samples array or None if no audio
"""
try:
from moviepy.audio.io.AudioFileClip import AudioFileClip
with AudioFileClip(video_path) as audio:
segment = audio.subclipped(start_time, end_time)
fps = getattr(segment, 'fps', 44100)
samples = segment.to_soundarray(fps=fps)
return samples
except Exception as exc:
logger.warning(f"Failed to extract audio: {exc}")
return None
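
A usage sketch for the framer (editor's example, not part of this commit), mirroring how rendering.py drives it; the source path and time window are placeholders:

from moviepy.video.io.VideoFileClip import VideoFileClip
from video_render.smart_framing import SmartFramer, extract_audio_samples

source = "videos/podcast.mp4"  # placeholder
framer = SmartFramer(target_width=1080, target_height=1920, frame_skip=2, smoothing_window=20)

audio = extract_audio_samples(source, 120.0, 185.0)
plan = framer.create_framing_plan(video_path=source, start_time=120.0, end_time=185.0, audio_samples=audio)

with VideoFileClip(source) as clip:
    subclip = clip.subclipped(120.0, 185.0)
    framed = framer.apply_framing(
        video_clip=subclip,
        framing_plan=plan,
        use_split_screen=plan.layout_mode in ["dual_split", "grid"],
    )
    framed.write_videofile("clip_vertical.mp4", fps=plan.fps)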

View File

@@ -56,7 +56,14 @@ class TranscriptionService:
)
return self._model
def transcribe(self, audio_path: Path) -> TranscriptionResult:
def transcribe(self, audio_path: Path, output_dir: Optional[Path] = None) -> TranscriptionResult:
if output_dir is not None:
existing_transcription = self.load(output_dir)
if existing_transcription is not None:
logger.info("Transcrição já existe em %s, reutilizando...", output_dir)
return existing_transcription
logger.info("Iniciando transcrição do áudio com FasterWhisper...")
model = self._load_model()
segments, _ = model.transcribe(
str(audio_path),
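
With the new output_dir parameter the service becomes idempotent per workspace: a second call finds the saved transcription and skips Whisper entirely. A short sketch (editor's example; paths are placeholders):

from pathlib import Path
from video_render.config import Settings
from video_render.transcription import TranscriptionService

settings = Settings()
service = TranscriptionService(settings)
workspace = Path("videos/podcast_episode_12")

result = service.transcribe(workspace / "audio.wav", output_dir=workspace)   # runs FasterWhisper
TranscriptionService.persist(result, workspace)                              # writes the transcription files
cached = service.transcribe(workspace / "audio.wav", output_dir=workspace)   # reuses them, no re-transcription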