from __future__ import annotations

import logging
import re
from dataclasses import dataclass
from typing import Dict, Iterable, List, Sequence, Tuple, Optional

import numpy as np
from moviepy.audio.AudioClip import AudioArrayClip, AudioClip
from moviepy.audio.io.AudioFileClip import AudioFileClip
from moviepy.video.VideoClip import ColorClip, ImageClip, TextClip
from moviepy.video.compositing.CompositeVideoClip import CompositeVideoClip
from moviepy.video.io.VideoFileClip import VideoFileClip
from PIL import Image, ImageColor, ImageDraw, ImageFont

from video_render.config import Settings
from video_render.transcription import TranscriptionResult, WordTiming
from video_render.smart_framing import SmartFramer, extract_audio_samples

logger = logging.getLogger(__name__)


def clamp_time(value: float, minimum: float = 0.0) -> float:
    return max(minimum, float(value))


@dataclass
class CaptionClipSet:
    base: ImageClip
    highlights: List[ImageClip]


class CaptionBuilder:
    def __init__(self, settings: Settings) -> None:
        self.settings = settings
        self.font_path = settings.rendering.font_path
        if not self.font_path.exists():
            raise FileNotFoundError(f"Font not found: {self.font_path}")
        self.font = ImageFont.truetype(
            str(self.font_path), settings.rendering.subtitle_font_size
        )
        self.base_color = ImageColor.getrgb(settings.rendering.base_color)
        self.highlight_color = ImageColor.getrgb(settings.rendering.highlight_color)
        self.canvas_width = settings.rendering.frame_width - 160
        self.canvas_height = int(settings.rendering.subtitle_font_size * 2.2)
        self.min_words = settings.rendering.caption_min_words
        self.max_words = settings.rendering.caption_max_words
        bbox = self.font.getbbox("Ay")
        self.text_height = bbox[3] - bbox[1]
        self.baseline = (self.canvas_height - self.text_height) // 2 - bbox[1]
        self.space_width = self.font.getbbox(" ")[2] - self.font.getbbox(" ")[0]

    def build(self, words: Sequence[WordTiming], clip_start: float) -> List[CaptionClipSet]:
        # Filter out empty, whitespace-only, or very short words (likely noise).
        valid_words = [
            w
            for w in words
            if w.word
            and w.word.strip()
            and len(w.word.strip()) >= 2  # At least 2 characters
            and w.word.strip() not in ['...', '..', '.', ',', '-', 'hmm', 'hm', 'ah', 'eh', 'uh']  # Not just punctuation or filler
        ]

        # Note: words are not filtered based on gaps here. Gap detection is handled
        # in _group_words_with_gaps, so captions disappear naturally during silence.
        filtered_words = valid_words

        # Calculate speech density (words per second). If the density is too low,
        # the segment is likely noise or silence being misinterpreted.
        if filtered_words:
            first_word_time = filtered_words[0].start
            last_word_time = filtered_words[-1].end
            duration = last_word_time - first_word_time
            if duration > 0:
                words_per_second = len(filtered_words) / duration
                # Typical speech runs at 2-3 words per second; below 0.5 words/second
                # it is probably silence or noise.
                if words_per_second < 0.5:
                    logger.debug(
                        f"Captions suppressed: density too low ({words_per_second:.2f} words/sec)"
                    )
                    return []

        # Only show captions if there are at least 3 valid words (reduced from 5 for
        # 2-word groups). This prevents showing captions for noise or mumbling.
        if len(filtered_words) < 3:
            return []

        grouped = self._group_words_with_gaps(filtered_words)
        clip_sets: List[CaptionClipSet] = []
        for group in grouped:
            group_start = clamp_time(group[0].start, minimum=clip_start)
            group_end = clamp_time(group[-1].end, minimum=group_start + 0.05)
            duration = max(0.05, group_end - group_start)
            start_offset = group_start - clip_start
            base_image, highlight_images = self._render_group(group)
            base_clip = (
                ImageClip(np.array(base_image))
                .with_start(start_offset)
                .with_duration(duration)
            )
            highlight_clips: List[ImageClip] = []
            for word, image in zip(group, highlight_images):
                h_start = clamp_time(word.start, minimum=clip_start) - clip_start
                h_end = clamp_time(word.end, minimum=word.start + 0.02) - clip_start
                h_duration = max(0.05, h_end - h_start)
                highlight_clip = (
                    ImageClip(np.array(image))
                    .with_start(h_start)
                    .with_duration(h_duration)
                )
                highlight_clips.append(highlight_clip)
            clip_sets.append(CaptionClipSet(base=base_clip, highlights=highlight_clips))
        return clip_sets

    def _render_group(self, group: Sequence[WordTiming]) -> Tuple[Image.Image, List[Image.Image]]:
        texts = [self._clean_word(word.word) for word in group]
        widths = []
        for text in texts:
            bbox = self.font.getbbox(text)
            widths.append(bbox[2] - bbox[0])
        total_width = sum(widths)
        if len(widths) > 1:
            total_width += self.space_width * (len(widths) - 1)

        # If the total width exceeds the canvas width, wrap onto 2 lines.
        needs_wrap = total_width > self.canvas_width

        if needs_wrap:
            # Split into 2 lines, trying to balance them.
            mid_point = len(texts) // 2
            line1_texts = texts[:mid_point]
            line2_texts = texts[mid_point:]
            line1_widths = widths[:mid_point]
            line2_widths = widths[mid_point:]

            # Calculate the width of each line.
            line1_width = sum(line1_widths)
            if len(line1_widths) > 1:
                line1_width += self.space_width * (len(line1_widths) - 1)
            line2_width = sum(line2_widths)
            if len(line2_widths) > 1:
                line2_width += self.space_width * (len(line2_widths) - 1)

            # Double the canvas height for 2 lines.
            canvas_height = self.canvas_height * 2
            base_image = Image.new("RGBA", (self.canvas_width, canvas_height), (0, 0, 0, 0))
            base_draw = ImageDraw.Draw(base_image)
            highlight_images: List[Image.Image] = []

            # Stroke settings: 8px black stroke for better readability.
            stroke_width = 8
            stroke_color = (0, 0, 0, 255)  # Black

            # Draw line 1.
            x = max(0, (self.canvas_width - line1_width) // 2)
            y = self.baseline
            for text, width in zip(line1_texts, line1_widths):
                base_draw.text(
                    (x, y),
                    text,
                    font=self.font,
                    fill=self.base_color,
                    stroke_width=stroke_width,
                    stroke_fill=stroke_color,
                )
                highlight_image = Image.new("RGBA", base_image.size, (0, 0, 0, 0))
                highlight_draw = ImageDraw.Draw(highlight_image)
                highlight_draw.text(
                    (x, y),
                    text,
                    font=self.font,
                    fill=self.highlight_color,
                    stroke_width=stroke_width,
                    stroke_fill=stroke_color,
                )
                highlight_images.append(highlight_image)
                x += width + self.space_width

            # Draw line 2.
            x = max(0, (self.canvas_width - line2_width) // 2)
            y = self.baseline + self.text_height + 5  # 5px spacing between lines
            for text, width in zip(line2_texts, line2_widths):
                base_draw.text(
                    (x, y),
                    text,
                    font=self.font,
                    fill=self.base_color,
                    stroke_width=stroke_width,
                    stroke_fill=stroke_color,
                )
                highlight_image = Image.new("RGBA", base_image.size, (0, 0, 0, 0))
                highlight_draw = ImageDraw.Draw(highlight_image)
                highlight_draw.text(
                    (x, y),
                    text,
                    font=self.font,
                    fill=self.highlight_color,
                    stroke_width=stroke_width,
                    stroke_fill=stroke_color,
                )
                highlight_images.append(highlight_image)
                x += width + self.space_width

            return base_image, highlight_images

        # Single-line rendering (original path).
        start_x = max(0, (self.canvas_width - total_width) // 2)
        base_image = Image.new("RGBA", (self.canvas_width, self.canvas_height), (0, 0, 0, 0))
        base_draw = ImageDraw.Draw(base_image)
        highlight_images: List[Image.Image] = []
        x = start_x

        # Stroke settings: 8px black stroke for better readability.
        stroke_width = 8
        stroke_color = (0, 0, 0, 255)  # Black

        for text, width in zip(texts, widths):
            # Draw base text with stroke.
            base_draw.text(
                (x, self.baseline),
                text,
                font=self.font,
                fill=self.base_color,
                stroke_width=stroke_width,
                stroke_fill=stroke_color,
            )
            # Draw highlight text with stroke.
            highlight_image = Image.new("RGBA", base_image.size, (0, 0, 0, 0))
            highlight_draw = ImageDraw.Draw(highlight_image)
            highlight_draw.text(
                (x, self.baseline),
                text,
                font=self.font,
                fill=self.highlight_color,
                stroke_width=stroke_width,
                stroke_fill=stroke_color,
            )
            highlight_images.append(highlight_image)
            x += width + self.space_width

        return base_image, highlight_images

    def _group_words(self, words: Sequence[WordTiming]) -> List[List[WordTiming]]:
        if not words:
            return []
        grouped: List[List[WordTiming]] = []
        buffer: List[WordTiming] = []
        for word in words:
            buffer.append(word)
            if len(buffer) == self.max_words:
                grouped.append(buffer)
                buffer = []
        if buffer:
            if len(buffer) == 1 and grouped:
                grouped[-1].extend(buffer)
            else:
                grouped.append(buffer)
        for idx, group in enumerate(grouped[:-1]):
            if len(group) < self.min_words and len(grouped[idx + 1]) > self.min_words:
                deficit = self.min_words - len(group)
                transfer = grouped[idx + 1][:deficit]
                grouped[idx] = group + transfer
                grouped[idx + 1] = grouped[idx + 1][deficit:]
        grouped = [grp for grp in grouped if grp]
        return grouped

    def _group_words_with_gaps(self, words: Sequence[WordTiming]) -> List[List[WordTiming]]:
        """Group words into 2-word chunks, respecting silence gaps.

        Creates natural breaks wherever there is a pause longer than 1.5s.
        """
        if not words:
            return []
        grouped: List[List[WordTiming]] = []
        buffer: List[WordTiming] = []
        for i, word in enumerate(words):
            # Check whether there is a long pause before this word.
            if i > 0:
                gap = word.start - words[i - 1].end
                # If the gap exceeds 1.5s, close the current buffer and start a new group.
                if gap > 1.5:
                    if buffer:
                        grouped.append(buffer)
                        buffer = []
            buffer.append(word)
            # Group into at most 2 words.
            if len(buffer) == 2:
                grouped.append(buffer)
                buffer = []
        # Handle any remaining words.
        if buffer:
            if len(buffer) == 1 and grouped:
                # Append a single leftover word to the last group.
                grouped[-1].append(buffer[0])
            else:
                grouped.append(buffer)
        return [grp for grp in grouped if grp]

    @staticmethod
    def _clean_word(text: str) -> str:
        text = text.strip()
        text = re.sub(r"\s+", " ", text)
        return text or "..."
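
# Usage sketch (illustrative only, kept as a comment so import behavior is unchanged):
# CaptionBuilder consumes WordTiming objects with absolute start/end times from the
# source video and returns CaptionClipSet items whose clips are offset relative to
# `clip_start`. Assuming `settings` is a loaded Settings instance and `words` is a
# list of WordTiming for a window starting at 42s:
#
#     builder = CaptionBuilder(settings)
#     clip_sets = builder.build(words, clip_start=42.0)
#     overlays = [c for cs in clip_sets for c in (cs.base, *cs.highlights)]
#     # `overlays` are positioned and composited by VideoRenderer below.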


class VideoRenderer:
    def __init__(self, settings: Settings) -> None:
        self.settings = settings
        self.captions = CaptionBuilder(settings)
        self.smart_framer = SmartFramer(
            target_width=settings.rendering.frame_width,
            target_height=settings.rendering.frame_height,
            frame_skip=settings.rendering.smart_framing_frame_skip,
            smoothing_window=settings.rendering.smart_framing_smoothing_window,
        )

    def render(
        self,
        workspace_path: str,
        highlight_windows: Sequence,
        transcription: TranscriptionResult,
        titles: Sequence[str],
        output_dir,
    ) -> List[Tuple[str, float, float, str, str, int]]:
        results: List[Tuple[str, float, float, str, str, int]] = []
        with VideoFileClip(workspace_path) as base_clip:
            video_duration = base_clip.duration or 0
            for index, window in enumerate(highlight_windows, start=1):
                start = clamp_time(window.start)
                end = clamp_time(window.end)
                start = min(start, video_duration)
                end = min(end, video_duration)
                if end <= start:
                    logger.info("Window skipped due to invalid interval: %s", window)
                    continue
                subclip = base_clip.subclipped(start, end)
                try:
                    rendered_path = self._render_single_clip(
                        subclip=subclip,
                        start=start,
                        end=end,
                        title=titles[index - 1] if index - 1 < len(titles) else window.summary,
                        summary=window.summary,
                        index=index,
                        transcription=transcription,
                        output_dir=output_dir,
                        source_path=workspace_path,
                    )
                finally:
                    subclip.close()
                results.append(
                    (
                        rendered_path,
                        float(start),
                        float(end),
                        titles[index - 1] if index - 1 < len(titles) else window.summary,
                        window.summary,
                        index,
                    )
                )
        return results

    def _render_single_clip(
        self,
        subclip: VideoFileClip,
        start: float,
        end: float,
        title: str,
        summary: str,
        index: int,
        transcription: TranscriptionResult,
        output_dir,
        source_path: str,
    ) -> str:
        duration = end - start
        frame_w = self.settings.rendering.frame_width
        frame_h = self.settings.rendering.frame_height
        # Top panel removed - the title is no longer shown.
        bottom_h = int(frame_h * 0.20)

        # Use smart framing to create an intelligent 9:16 video (if enabled).
        if self.settings.rendering.enable_smart_framing:
            logger.info(f"Creating smart framing plan for clip {index} ({start:.2f}s - {end:.2f}s)")
            try:
                # Extract audio for speech detection.
                audio_samples = extract_audio_samples(source_path, start, end)

                # Create the framing plan.
                framing_plan = self.smart_framer.create_framing_plan(
                    video_path=source_path,
                    start_time=start,
                    end_time=end,
                    audio_samples=audio_samples,
                )

                # Apply smart framing based on the detected layout.
                use_split_screen = framing_plan.layout_mode in ["dual_split", "grid"]
                video_clip = self.smart_framer.apply_framing(
                    video_clip=subclip,
                    framing_plan=framing_plan,
                    use_split_screen=use_split_screen,
                )
                logger.info(
                    f"Smart framing applied: layout={framing_plan.layout_mode}, "
                    f"faces_detected={len(framing_plan.frame_contexts[0].detected_faces) if framing_plan.frame_contexts else 0}"
                )
            except Exception as exc:
                logger.warning(
                    f"Smart framing failed for clip {index}, falling back to center crop: {exc}",
                    exc_info=True,
                )
                # Fallback to center crop (maintains aspect ratio, crops to fit).
                video_area_h = max(1, frame_h - bottom_h)

                # Use max() so the video covers the entire area (excess is cropped).
                scale_factor = max(
                    frame_w / subclip.w,
                    video_area_h / subclip.h,
                )

                # Resize to cover the area.
                resized_clip = subclip.resized(scale_factor)

                # Calculate the crop region (center crop).
                crop_x1 = max(0, (resized_clip.w - frame_w) // 2)
                crop_y1 = max(0, (resized_clip.h - video_area_h) // 2)
                crop_x2 = crop_x1 + frame_w
                crop_y2 = crop_y1 + video_area_h

                # Crop to the target dimensions using MoviePy cropped(x1, y1, x2, y2).
                cropped_clip = resized_clip.cropped(
                    x1=crop_x1, y1=crop_y1, x2=crop_x2, y2=crop_y2
                )
                video_clip = cropped_clip.with_position((0, 0))
                resized_clip.close()
        else:
            # Use center crop (smart framing disabled).
            logger.info(f"Using center crop for clip {index} (smart framing disabled)")
            video_area_h = max(1, frame_h - bottom_h)

            # Use max() so the video covers the entire area (excess is cropped).
            scale_factor = max(
                frame_w / subclip.w,
                video_area_h / subclip.h,
            )

            # Resize to cover the area.
            resized_clip = subclip.resized(scale_factor)

            # Calculate the crop region (center crop).
            crop_x1 = max(0, (resized_clip.w - frame_w) // 2)
            crop_y1 = max(0, (resized_clip.h - video_area_h) // 2)
            crop_x2 = crop_x1 + frame_w
            crop_y2 = crop_y1 + video_area_h

            # Crop to the target dimensions using MoviePy cropped(x1, y1, x2, y2).
            cropped_clip = resized_clip.cropped(
                x1=crop_x1, y1=crop_y1, x2=crop_x2, y2=crop_y2
            )
            video_clip = cropped_clip.with_position((0, 0))
            resized_clip.close()

        background = ColorClip(size=(frame_w, frame_h), color=(0, 0, 0)).with_duration(duration)

        # Top panel and title were removed; only the bottom panel is drawn.
        bottom_panel = (
            ColorClip(size=(frame_w, bottom_h), color=(12, 12, 12))
            .with_position((0, frame_h - bottom_h))
            .with_duration(duration)
            .with_opacity(0.85)
        )

        words = self._collect_words(transcription, start, end)

        # Speech coverage: how much of the clip actually contains speech?
        # If less than 30% of the clip has speech, do not show captions.
        clip_duration = end - start
        if words and clip_duration > 0:
            # Calculate the total time with speech.
            total_speech_time = sum(w.end - w.start for w in words)
            speech_coverage = total_speech_time / clip_duration
            if speech_coverage < 0.3:  # Less than 30% speech
                logger.debug(f"Captions suppressed: low speech coverage ({speech_coverage:.1%})")
                words = []  # Clear words to prevent captions

        # Only build captions if there are actual words to display.
        # This prevents empty/placeholder captions from appearing.
        caption_sets = self.captions.build(words, clip_start=start) if words else []
        caption_clips = []
        caption_resources: List[ImageClip] = []

        # Position captions 120px below center (for a 1920px-tall frame, center is 960px,
        # so captions sit at 1080px). This keeps them visible, well positioned, and clear
        # of faces. The requested range was 100-150px; 120px gives the best placement.
        center_y = frame_h // 2
        caption_y = center_y + 120
        caption_margin = 20

        # Keep captions within reasonable bounds (there is no top panel anymore).
        min_caption_y = caption_margin
        max_caption_y = frame_h - bottom_h - self.captions.canvas_height - caption_margin
        if max_caption_y < min_caption_y:
            caption_y = min_caption_y
        else:
            caption_y = min(max(caption_y, min_caption_y), max_caption_y)

        for clip_set in caption_sets:
            base_positioned = clip_set.base.with_position(("center", caption_y))
            caption_clips.append(base_positioned)
            caption_resources.append(clip_set.base)
            for highlight in clip_set.highlights:
                positioned = highlight.with_position(("center", caption_y))
                caption_clips.append(positioned)
                caption_resources.append(highlight)

        # No fallback captions: if there are no dynamic captions, nothing is shown.
        # This matches Opus Clip behavior, where captions only appear over actual speech.

        audio_clip, audio_needs_close = self._materialize_audio(
            source_path=source_path,
            start=start,
            end=end,
            duration=duration,
            fallback_audio=video_clip.audio or subclip.audio,
        )

        # Composite background, bottom panel, video, and captions only (no top panel or title).
        composite = CompositeVideoClip(
            [background, bottom_panel, video_clip, *caption_clips],
            size=(frame_w, frame_h),
        )
        if audio_clip is not None:
            composite = self._with_audio(composite, audio_clip)

        output_path = output_dir / f"clip_{index:02d}.mp4"
        self._write_with_fallback(
            composite=composite,
            output_path=output_path,
            index=index,
            output_dir=output_dir,
        )

        composite.close()
        video_clip.close()
        background.close()
        bottom_panel.close()
        for clip in caption_clips:
            clip.close()
        for clip in caption_resources:
            clip.close()
        if audio_clip is not None and audio_needs_close:
            audio_clip.close()
        return str(output_path)

    def _materialize_audio(
        self,
        *,
        source_path: str,
        start: float,
        end: float,
        duration: float,
        fallback_audio,
    ) -> Tuple[Optional[AudioClip], bool]:
        try:
            with AudioFileClip(source_path) as audio_file:
                segment = audio_file.subclipped(start, end)
                fps = (
                    getattr(segment, "fps", None)
                    or getattr(audio_file, "fps", None)
                    or 44100
                )
                samples = segment.to_soundarray(fps=fps)
        except Exception:
            logger.warning(
                "Failed to load standalone audio; falling back to the original stream",
                exc_info=True,
            )
            return fallback_audio, False
        audio_clip = AudioArrayClip(samples, fps=fps).with_duration(duration)
        return audio_clip, True

    def _collect_words(
        self, transcription: TranscriptionResult, start: float, end: float
    ) -> List[WordTiming]:
        collected: List[WordTiming] = []
        for segment in transcription.segments:
            if segment.end < start or segment.start > end:
                continue
            if segment.words:
                for word in segment.words:
                    if word.end < start or word.start > end:
                        continue
                    collected.append(
                        WordTiming(
                            start=max(start, word.start),
                            end=min(end, word.end),
                            word=word.word,
                        )
                    )
            else:
                collected.extend(
                    self._fallback_words(segment.text, segment.start, segment.end, start, end)
                )
        collected.sort(key=lambda w: w.start)
        return collected

    def _fallback_words(
        self,
        text: str,
        segment_start: float,
        segment_end: float,
        window_start: float,
        window_end: float,
    ) -> Iterable[WordTiming]:
        words = [w for w in re.split(r"\s+", text.strip()) if w]
        if not words:
            return []
        seg_start = max(segment_start, window_start)
        seg_end = min(segment_end, window_end)
        duration = max(0.01, seg_end - seg_start)
        step = duration / len(words)
        timings: List[WordTiming] = []
        for idx, word in enumerate(words):
            w_start = seg_start + idx * step
            w_end = min(seg_end, w_start + step)
            timings.append(WordTiming(start=w_start, end=w_end, word=word))
        return timings

    @staticmethod
    def _wrap_text(text: str, max_width: int) -> str:
        text = text.strip()
        if not text:
            return ""
        words = text.split()
        lines: List[str] = []
        current: List[str] = []
        for word in words:
            current.append(word)
            if len(" ".join(current)) > max_width // 18:
                lines.append(" ".join(current[:-1]))
                current = [current[-1]]
        if current:
            lines.append(" ".join(current))
        return "\n".join(lines)

    def _write_with_fallback(
        self,
        *,
        composite: CompositeVideoClip,
        output_path,
        index: int,
        output_dir,
    ) -> None:
        attempts = self._encoding_attempts()
        temp_audio_path = output_dir / f"temp_audio_{index:02d}.m4a"
        last_error: Exception | None = None
        for attempt in attempts:
            codec = attempt["codec"]
            bitrate = attempt["bitrate"]
            preset = attempt["preset"]
            ffmpeg_params = ["-pix_fmt", "yuv420p"]
            if preset:
                ffmpeg_params = ["-preset", preset, "-pix_fmt", "yuv420p"]
            try:
                logger.info(
                    "Rendering clip %02d with codec %s (bitrate=%s, preset=%s)",
                    index,
                    codec,
                    bitrate,
                    preset or "default",
                )
                composite.write_videofile(
                    str(output_path),
                    codec=codec,
                    audio_codec=self.settings.rendering.audio_codec,
                    fps=self.settings.rendering.fps,
                    bitrate=bitrate,
                    ffmpeg_params=ffmpeg_params,
                    temp_audiofile=str(temp_audio_path),
                    remove_temp=True,
                    threads=4,
                )
                return
            except Exception as exc:  # noqa: BLE001 - propagate after fallbacks
                last_error = exc
                logger.warning(
                    "Failed to render with codec %s: %s", codec, exc, exc_info=True
                )
                if output_path.exists():
                    output_path.unlink(missing_ok=True)
                if temp_audio_path.exists():
                    temp_audio_path.unlink(missing_ok=True)
        raise RuntimeError("All render attempts failed") from last_error

    def _encoding_attempts(self) -> List[Dict[str, str | None]]:
        settings = self.settings.rendering
        attempts: List[Dict[str, str | None]] = []
        attempts.append(
            {
                "codec": settings.video_codec,
                "bitrate": settings.bitrate,
                "preset": settings.preset,
            }
        )
        deduped: List[Dict[str, str | None]] = []
        seen = set()
        for attempt in attempts:
            key = (attempt["codec"], attempt["bitrate"], attempt["preset"])
            if key in seen:
                continue
            seen.add(key)
            deduped.append(attempt)
        return deduped

    @staticmethod
    def _with_audio(
        composite: CompositeVideoClip,
        audio_clip,
    ) -> CompositeVideoClip:
        """Attach audio to a composite clip across MoviePy versions."""
        if hasattr(composite, "with_audio"):
            return composite.with_audio(audio_clip)
        if hasattr(composite, "set_audio"):
            return composite.set_audio(audio_clip)
        raise AttributeError("CompositeVideoClip does not support audio assignment")

    @staticmethod
    def _make_textclip(
        *,
        text: str,
        font_path,
        font_size: int,
        color: str,
        size: Tuple[int, int],
    ) -> TextClip:
        """Create a TextClip compatible with MoviePy 1.x and 2.x.

        MoviePy 2.x removed the 'align' keyword from TextClip. We try with 'align'
        for older versions and fall back to a call without it when unsupported.
        """
        kwargs = dict(
            text=text,
            font=str(font_path),
            font_size=font_size,
            color=color,
            method="caption",
            size=size,
        )
        try:
            return TextClip(**kwargs, align="center")  # MoviePy 1.x style
        except TypeError:
            logger.debug("TextClip 'align' not supported; falling back without it")
            return TextClip(**kwargs)  # MoviePy 2.x style
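
# Usage sketch (illustrative only, kept as a comment so import behavior is unchanged).
# `settings`, `windows`, `transcription`, and `titles` are assumed to come from the
# surrounding pipeline; the fields read from each window (.start, .end, .summary) are
# inferred from how render() uses them above.
#
#     from pathlib import Path
#
#     renderer = VideoRenderer(settings)
#     results = renderer.render(
#         workspace_path="input.mp4",
#         highlight_windows=windows,      # objects exposing .start, .end, .summary
#         transcription=transcription,    # TranscriptionResult with word timings
#         titles=titles,                  # one title per window; falls back to summary
#         output_dir=Path("output"),
#     )
#     for path, start, end, title, summary, index in results:
#         print(f"clip {index:02d}: {path} ({start:.1f}s-{end:.1f}s)")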