video-render/video_render/rendering.py

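"""Clip rendering pipeline.

CaptionBuilder rasterizes word-highlight captions with Pillow (a base image per
caption group plus one highlight overlay per word). VideoRenderer cuts each
highlight window from the source video, applies smart framing (or a center-crop
fallback), composites the captions and bottom panel, and writes the final
vertical 9:16 clips with MoviePy.
"""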
from __future__ import annotations
import gc
import logging
import re
from dataclasses import dataclass
from typing import Dict, Iterable, List, Sequence, Tuple, Optional
import numpy as np
from moviepy.audio.AudioClip import AudioArrayClip, AudioClip
from moviepy.audio.io.AudioFileClip import AudioFileClip
from moviepy.video.VideoClip import ColorClip, ImageClip, TextClip
from moviepy.video.compositing.CompositeVideoClip import CompositeVideoClip
from moviepy.video.io.VideoFileClip import VideoFileClip
from PIL import Image, ImageColor, ImageDraw, ImageFont
from video_render.config import Settings
from video_render.transcription import TranscriptionResult, WordTiming
from video_render.smart_framing import SmartFramer, extract_audio_samples
logger = logging.getLogger(__name__)
def clamp_time(value: float, minimum: float = 0.0) -> float:
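"""Return value as a float, clamped to be at least minimum."""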
return max(minimum, float(value))
@dataclass
class CaptionClipSet:
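"""Caption clips for one word group: a base clip plus per-word highlight overlays."""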
base: ImageClip
highlights: List[ImageClip]
class CaptionBuilder:
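"""Builds timed caption overlays: Pillow-rendered text wrapped in MoviePy ImageClips."""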
def __init__(self, settings: Settings) -> None:
self.settings = settings
self.font_path = settings.rendering.font_path
if not self.font_path.exists():
raise FileNotFoundError(f"Fonte nao encontrada: {self.font_path}")
self.font = ImageFont.truetype(
str(self.font_path), settings.rendering.subtitle_font_size
)
self.base_color = ImageColor.getrgb(settings.rendering.base_color)
self.highlight_color = ImageColor.getrgb(settings.rendering.highlight_color)
self.canvas_width = settings.rendering.frame_width - 160
self.canvas_height = int(settings.rendering.subtitle_font_size * 2.2)
self.min_words = settings.rendering.caption_min_words
self.max_words = settings.rendering.caption_max_words
bbox = self.font.getbbox("Ay")
self.text_height = bbox[3] - bbox[1]
self.baseline = (self.canvas_height - self.text_height) // 2 - bbox[1]
self.space_width = self.font.getbbox(" ")[2] - self.font.getbbox(" ")[0]
def build(self, words: Sequence[WordTiming], clip_start: float) -> List[CaptionClipSet]:
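"""Build caption clip sets for words, with start times relative to clip_start.

Returns an empty list when the filtered words look like noise: fewer than three
valid words, or a speech density below roughly 0.5 words per second.
"""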
# Filter out empty or very short words, bare punctuation, and filler sounds
# (likely transcription noise)
filler_tokens = {'...', '..', '.', ',', '-', 'hmm', 'hm', 'ah', 'eh', 'uh'}
filtered_words = [
w for w in words
if w.word
and w.word.strip()
and len(w.word.strip()) >= 2  # at least 2 characters
and w.word.strip() not in filler_tokens
]
# Gaps are not filtered here; silence handling lives in _group_words_with_gaps,
# so captions disappear naturally during pauses
# Calculate speech density (words per second)
# If density is too low, it's likely just noise/silence being misinterpreted
if filtered_words:
first_word_time = filtered_words[0].start
last_word_time = filtered_words[-1].end
duration = last_word_time - first_word_time
if duration > 0:
words_per_second = len(filtered_words) / duration
# Typical speech is 2-3 words per second
# If less than 0.5 words/second, it's probably silence/noise
if words_per_second < 0.5:
logger.debug(f"Captions suprimidas: densidade muito baixa ({words_per_second:.2f} palavras/seg)")
return []
# Require at least 3 valid words before building captions;
# fewer is usually noise or mumbling
if len(filtered_words) < 3:
return []
grouped = self._group_words_with_gaps(filtered_words)
clip_sets: List[CaptionClipSet] = []
for group in grouped:
group_start = clamp_time(group[0].start, minimum=clip_start)
group_end = clamp_time(group[-1].end, minimum=group_start + 0.05)
duration = max(0.05, group_end - group_start)
start_offset = group_start - clip_start
base_image, highlight_images = self._render_group(group)
base_clip = (
ImageClip(np.array(base_image))
.with_start(start_offset)
.with_duration(duration)
)
highlight_clips: List[ImageClip] = []
for word, image in zip(group, highlight_images):
h_start = clamp_time(word.start, minimum=clip_start) - clip_start
h_end = clamp_time(word.end, minimum=word.start + 0.02) - clip_start
h_duration = max(0.05, h_end - h_start)
highlight_clip = (
ImageClip(np.array(image))
.with_start(h_start)
.with_duration(h_duration)
)
highlight_clips.append(highlight_clip)
clip_sets.append(CaptionClipSet(base=base_clip, highlights=highlight_clips))
return clip_sets
def _render_group(self, group: Sequence[WordTiming]) -> Tuple[Image.Image, List[Image.Image]]:
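"""Render one caption group: a base image plus one highlight image per word.

Text wraps onto two lines when the combined width exceeds the caption canvas.
"""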
texts = [self._clean_word(word.word) for word in group]
widths = []
for text in texts:
bbox = self.font.getbbox(text)
widths.append(bbox[2] - bbox[0])
total_width = sum(widths)
if len(widths) > 1:
total_width += self.space_width * (len(widths) - 1)
# Check if text needs to wrap to multiple lines
# If total width exceeds canvas width, break into 2 lines
needs_wrap = total_width > self.canvas_width
if needs_wrap:
# Split into 2 lines - try to balance the lines
mid_point = len(texts) // 2
line1_texts = texts[:mid_point]
line2_texts = texts[mid_point:]
line1_widths = widths[:mid_point]
line2_widths = widths[mid_point:]
# Calculate widths for each line
line1_width = sum(line1_widths)
if len(line1_widths) > 1:
line1_width += self.space_width * (len(line1_widths) - 1)
line2_width = sum(line2_widths)
if len(line2_widths) > 1:
line2_width += self.space_width * (len(line2_widths) - 1)
# Double the canvas height for 2 lines
canvas_height = self.canvas_height * 2
base_image = Image.new("RGBA", (self.canvas_width, canvas_height), (0, 0, 0, 0))
base_draw = ImageDraw.Draw(base_image)
highlight_images: List[Image.Image] = []
# Stroke settings: 8px black stroke for better readability
stroke_width = 8
stroke_color = (0, 0, 0, 255) # Black
# Draw line 1
x = max(0, (self.canvas_width - line1_width) // 2)
y = self.baseline
for i, (text, width) in enumerate(zip(line1_texts, line1_widths)):
base_draw.text(
(x, y),
text,
font=self.font,
fill=self.base_color,
stroke_width=stroke_width,
stroke_fill=stroke_color
)
highlight_image = Image.new("RGBA", base_image.size, (0, 0, 0, 0))
highlight_draw = ImageDraw.Draw(highlight_image)
highlight_draw.text(
(x, y),
text,
font=self.font,
fill=self.highlight_color,
stroke_width=stroke_width,
stroke_fill=stroke_color
)
highlight_images.append(highlight_image)
x += width + self.space_width
# Draw line 2
x = max(0, (self.canvas_width - line2_width) // 2)
y = self.baseline + self.text_height + 5 # 5px spacing between lines
for i, (text, width) in enumerate(zip(line2_texts, line2_widths)):
base_draw.text(
(x, y),
text,
font=self.font,
fill=self.base_color,
stroke_width=stroke_width,
stroke_fill=stroke_color
)
highlight_image = Image.new("RGBA", base_image.size, (0, 0, 0, 0))
highlight_draw = ImageDraw.Draw(highlight_image)
highlight_draw.text(
(x, y),
text,
font=self.font,
fill=self.highlight_color,
stroke_width=stroke_width,
stroke_fill=stroke_color
)
highlight_images.append(highlight_image)
x += width + self.space_width
return base_image, highlight_images
# Single line rendering (original code)
start_x = max(0, (self.canvas_width - total_width) // 2)
base_image = Image.new("RGBA", (self.canvas_width, self.canvas_height), (0, 0, 0, 0))
base_draw = ImageDraw.Draw(base_image)
highlight_images: List[Image.Image] = []
x = start_x
# Stroke settings: 8px black stroke for better readability
stroke_width = 8
stroke_color = (0, 0, 0, 255) # Black
for text, width in zip(texts, widths):
# Draw base text with stroke
base_draw.text(
(x, self.baseline),
text,
font=self.font,
fill=self.base_color,
stroke_width=stroke_width,
stroke_fill=stroke_color
)
# Draw highlight text with stroke
highlight_image = Image.new("RGBA", base_image.size, (0, 0, 0, 0))
highlight_draw = ImageDraw.Draw(highlight_image)
highlight_draw.text(
(x, self.baseline),
text,
font=self.font,
fill=self.highlight_color,
stroke_width=stroke_width,
stroke_fill=stroke_color
)
highlight_images.append(highlight_image)
x += width + self.space_width
return base_image, highlight_images
def _group_words(self, words: Sequence[WordTiming]) -> List[List[WordTiming]]:
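"""Group words into chunks of at most max_words.

A trailing single word is merged into the previous group, and groups below
min_words borrow words from the following group when possible.
"""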
if not words:
return []
grouped: List[List[WordTiming]] = []
buffer: List[WordTiming] = []
for word in words:
buffer.append(word)
if len(buffer) == self.max_words:
grouped.append(buffer)
buffer = []
if buffer:
if len(buffer) == 1 and grouped:
grouped[-1].extend(buffer)
else:
grouped.append(buffer)
for idx, group in enumerate(grouped[:-1]):
if len(group) < self.min_words and len(grouped[idx + 1]) > self.min_words:
deficit = self.min_words - len(group)
transfer = grouped[idx + 1][:deficit]
grouped[idx] = group + transfer
grouped[idx + 1] = grouped[idx + 1][deficit:]
grouped = [grp for grp in grouped if grp]
return grouped
def _group_words_with_gaps(self, words: Sequence[WordTiming]) -> List[List[WordTiming]]:
"""
Group words into chunks of up to two words, respecting silence gaps.
A pause longer than 1.5s between words forces a new group, so captions
disappear naturally during silence.
"""
if not words:
return []
grouped: List[List[WordTiming]] = []
buffer: List[WordTiming] = []
for i, word in enumerate(words):
# Check if there's a long pause before this word
if i > 0:
gap = word.start - words[i-1].end
# If gap > 1.5s, finish current buffer and start new group
if gap > 1.5:
if buffer:
grouped.append(buffer)
buffer = []
buffer.append(word)
# Group into 2 words maximum
if len(buffer) == 2:
grouped.append(buffer)
buffer = []
# Handle remaining words
if buffer:
if len(buffer) == 1 and grouped:
# Add single remaining word to last group
grouped[-1].append(buffer[0])
else:
grouped.append(buffer)
return [grp for grp in grouped if grp]
@staticmethod
def _clean_word(text: str) -> str:
text = text.strip()
text = re.sub(r"\s+", " ", text)
return text or "..."
class VideoRenderer:
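"""Renders highlight windows into vertical 9:16 clips with captions, panels, and audio."""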
def __init__(self, settings: Settings) -> None:
self.settings = settings
self.captions = CaptionBuilder(settings)
self.smart_framer = SmartFramer(
target_width=settings.rendering.frame_width,
target_height=settings.rendering.frame_height,
frame_skip=settings.rendering.smart_framing_frame_skip,
smoothing_window=settings.rendering.smart_framing_smoothing_window,
max_velocity=settings.rendering.smart_framing_max_velocity,
person_switch_cooldown=settings.rendering.smart_framing_person_switch_cooldown,
response_time=settings.rendering.smart_framing_response_time,
group_padding=settings.rendering.smart_framing_group_padding,
max_zoom_out=settings.rendering.smart_framing_max_zoom_out,
dead_zone=settings.rendering.smart_framing_dead_zone,
min_face_confidence=settings.rendering.smart_framing_min_confidence
)
def render(
self,
workspace_path: str,
highlight_windows: Sequence,
transcription: TranscriptionResult,
titles: Sequence[str],
output_dir,
) -> List[Tuple[str, float, float, str, str, int]]:
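"""Render every highlight window from workspace_path.

Returns one (path, start, end, title, summary, index) tuple per rendered clip;
windows with invalid or out-of-range intervals are skipped.
"""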
results: List[Tuple[str, float, float, str, str, int]] = []
with VideoFileClip(workspace_path) as base_clip:
video_duration = base_clip.duration or 0
for index, window in enumerate(highlight_windows, start=1):
start = clamp_time(window.start)
end = clamp_time(window.end)
start = min(start, video_duration)
end = min(end, video_duration)
if end <= start:
logger.info("Janela ignorada por intervalo invalido: %s", window)
continue
clip_title = titles[index - 1] if index - 1 < len(titles) else window.summary
subclip = base_clip.subclipped(start, end)
try:
rendered_path = self._render_single_clip(
subclip=subclip,
start=start,
end=end,
title=clip_title,
summary=window.summary,
index=index,
transcription=transcription,
output_dir=output_dir,
source_path=workspace_path,
)
finally:
subclip.close()
results.append(
(
rendered_path,
float(start),
float(end),
clip_title,
window.summary,
index,
)
)
return results
def _render_single_clip(
self,
subclip: VideoFileClip,
start: float,
end: float,
title: str,
summary: str,
index: int,
transcription: TranscriptionResult,
output_dir,
source_path: str,
) -> str:
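"""Compose and encode a single vertical clip: framed video, bottom panel, captions, and audio."""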
duration = end - start
frame_w = self.settings.rendering.frame_width
frame_h = self.settings.rendering.frame_height
# Layout has no top title panel; reserve ~20% of the frame height for the bottom panel
bottom_h = int(frame_h * 0.20)
# Use smart framing to create intelligent 9:16 video (if enabled)
if self.settings.rendering.enable_smart_framing:
logger.info(f"Creating smart framing plan for clip {index} ({start:.2f}s - {end:.2f}s)")
try:
# Extract audio for speech detection
audio_samples = extract_audio_samples(source_path, start, end)
# Create framing plan
framing_plan = self.smart_framer.create_framing_plan(
video_path=source_path,
start_time=start,
end_time=end,
audio_samples=audio_samples
)
# Apply smart framing (always single-person focus)
video_clip = self.smart_framer.apply_framing(
video_clip=subclip,
framing_plan=framing_plan
)
logger.info(f"Smart framing applied: layout={framing_plan.layout_mode}, "
f"faces_detected={len(framing_plan.frame_contexts[0].detected_faces) if framing_plan.frame_contexts else 0}")
except Exception as exc:
logger.warning(f"Smart framing failed for clip {index}, falling back to center crop: {exc}", exc_info=True)
# Fallback to center crop (maintains aspect ratio, crops to fit)
video_area_h = max(1, frame_h - bottom_h)
# Use MAX to ensure video covers entire area (will crop excess)
scale_factor = max(
frame_w / subclip.w,
video_area_h / subclip.h,
)
# Resize to cover area
resized_clip = subclip.resized(scale_factor)
# Calculate crop region (center crop)
crop_x1 = max(0, (resized_clip.w - frame_w) // 2)
crop_y1 = max(0, (resized_clip.h - video_area_h) // 2)
crop_x2 = crop_x1 + frame_w
crop_y2 = crop_y1 + video_area_h
# Crop to fit target dimensions using MoviePy crop(x1, y1, x2, y2)
cropped_clip = resized_clip.cropped(
x1=crop_x1,
y1=crop_y1,
x2=crop_x2,
y2=crop_y2
)
video_clip = cropped_clip.with_position((0, 0))
resized_clip.close()
else:
# Use center crop (smart framing disabled)
logger.info(f"Using center crop for clip {index} (smart framing disabled)")
video_area_h = max(1, frame_h - bottom_h)
# Use MAX to ensure video covers entire area (will crop excess)
scale_factor = max(
frame_w / subclip.w,
video_area_h / subclip.h,
)
# Resize to cover area
resized_clip = subclip.resized(scale_factor)
# Calculate crop region (center crop)
crop_x1 = max(0, (resized_clip.w - frame_w) // 2)
crop_y1 = max(0, (resized_clip.h - video_area_h) // 2)
crop_x2 = crop_x1 + frame_w
crop_y2 = crop_y1 + video_area_h
# Crop to fit target dimensions using MoviePy crop(x1, y1, x2, y2)
cropped_clip = resized_clip.cropped(
x1=crop_x1,
y1=crop_y1,
x2=crop_x2,
y2=crop_y2
)
video_clip = cropped_clip.with_position((0, 0))
resized_clip.close()
background = ColorClip(size=(frame_w, frame_h), color=(0, 0, 0)).with_duration(duration)
# Translucent bottom panel (no top panel or title in this layout)
bottom_panel = (
ColorClip(size=(frame_w, bottom_h), color=(12, 12, 12))
.with_position((0, frame_h - bottom_h))
.with_duration(duration)
.with_opacity(0.85)
)
words = self._collect_words(transcription, start, end)
# Calculate speech coverage: how much of the clip has actual speech?
# If less than 30% of the clip has speech, don't show captions
clip_duration = end - start
if words and clip_duration > 0:
# Calculate total time with speech
total_speech_time = sum(w.end - w.start for w in words)
speech_coverage = total_speech_time / clip_duration
if speech_coverage < 0.3: # Less than 30% speech
logger.debug(f"Captions suprimidas: cobertura de fala baixa ({speech_coverage:.1%})")
words = [] # Clear words to prevent captions
# Only build captions if there are actual words to display
# This prevents empty/placeholder captions from appearing
caption_sets = self.captions.build(words, clip_start=start) if words else []
caption_clips = []
caption_resources: List[ImageClip] = []
# Position captions 120px below the vertical center (1080px on a 1920px frame)
# so they stay clearly visible without covering faces
center_y = frame_h // 2
caption_y = center_y + 120
caption_margin = 20
# Ensure captions stay within reasonable bounds (no top panel now)
min_caption_y = caption_margin
max_caption_y = frame_h - bottom_h - self.captions.canvas_height - caption_margin
if max_caption_y < min_caption_y:
caption_y = min_caption_y
else:
caption_y = min(max(caption_y, min_caption_y), max_caption_y)
for clip_set in caption_sets:
base_positioned = clip_set.base.with_position(("center", caption_y))
caption_clips.append(base_positioned)
caption_resources.append(clip_set.base)
for highlight in clip_set.highlights:
positioned = highlight.with_position(("center", caption_y))
caption_clips.append(positioned)
caption_resources.append(highlight)
# No fallback captions - if there are no dynamic captions, show nothing
# This matches Opus Clip behavior where captions only appear when there's actual speech
audio_clip, audio_needs_close = self._materialize_audio(
source_path=source_path,
start=start,
end=end,
duration=duration,
fallback_audio=video_clip.audio or subclip.audio,
)
# Composite with background, bottom panel, video, and captions only (no top panel or title)
composite = CompositeVideoClip(
[background, bottom_panel, video_clip, *caption_clips],
size=(frame_w, frame_h),
)
if audio_clip is not None:
composite = self._with_audio(composite, audio_clip)
output_path = output_dir / f"clip_{index:02d}.mp4"
self._write_with_fallback(
composite=composite,
output_path=output_path,
index=index,
output_dir=output_dir,
)
composite.close()
video_clip.close()
background.close()
bottom_panel.close()
for clip in caption_clips:
clip.close()
for clip in caption_resources:
clip.close()
if audio_clip is not None and audio_needs_close:
audio_clip.close()
# Force garbage collection to free memory after rendering
gc.collect()
return str(output_path)
def _materialize_audio(
self,
*,
source_path: str,
start: float,
end: float,
duration: float,
fallback_audio,
) -> Tuple[Optional[AudioClip], bool]:
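"""Load the window's audio as an in-memory AudioArrayClip.

Returns (clip, needs_close); if the standalone audio cannot be read, the
original audio stream is returned with needs_close=False.
"""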
try:
with AudioFileClip(source_path) as audio_file:
segment = audio_file.subclipped(start, end)
fps = (
getattr(segment, "fps", None)
or getattr(audio_file, "fps", None)
or 44100
)
samples = segment.to_soundarray(fps=fps)
except Exception:
logger.warning(
"Falha ao carregar audio independente; utilizando fluxo original",
exc_info=True,
)
return fallback_audio, False
audio_clip = AudioArrayClip(samples, fps=fps).with_duration(duration)
return audio_clip, True
def _collect_words(
self, transcription: TranscriptionResult, start: float, end: float
) -> List[WordTiming]:
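"""Collect word timings overlapping [start, end], clamped to the window and sorted by start."""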
collected: List[WordTiming] = []
for segment in transcription.segments:
if segment.end < start or segment.start > end:
continue
if segment.words:
for word in segment.words:
if word.end < start or word.start > end:
continue
collected.append(
WordTiming(
start=max(start, word.start),
end=min(end, word.end),
word=word.word,
)
)
else:
collected.extend(self._fallback_words(segment.text, segment.start, segment.end, start, end))
collected.sort(key=lambda w: w.start)
return collected
def _fallback_words(
self,
text: str,
segment_start: float,
segment_end: float,
window_start: float,
window_end: float,
) -> Iterable[WordTiming]:
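"""Spread a segment's words evenly across its duration when per-word timings are missing."""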
words = [w for w in re.split(r"\s+", text.strip()) if w]
if not words:
return []
seg_start = max(segment_start, window_start)
seg_end = min(segment_end, window_end)
duration = max(0.01, seg_end - seg_start)
step = duration / len(words)
timings: List[WordTiming] = []
for idx, word in enumerate(words):
w_start = seg_start + idx * step
w_end = min(seg_end, w_start + step)
timings.append(WordTiming(start=w_start, end=w_end, word=word))
return timings
@staticmethod
def _wrap_text(text: str, max_width: int) -> str:
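"""Greedy word wrap using a rough character budget derived from max_width."""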
text = text.strip()
if not text:
return ""
words = text.split()
lines: List[str] = []
current: List[str] = []
for word in words:
current.append(word)
if len(" ".join(current)) > max_width // 18:
if current[:-1]:
lines.append(" ".join(current[:-1]))
current = [current[-1]]
if current:
lines.append(" ".join(current))
return "\n".join(lines)
def _write_with_fallback(
self,
*,
composite: CompositeVideoClip,
output_path,
index: int,
output_dir,
) -> None:
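"""Encode the composite, trying each configured encoding attempt before raising."""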
attempts = self._encoding_attempts()
temp_audio_path = output_dir / f"temp_audio_{index:02d}.m4a"
last_error: Exception | None = None
for attempt in attempts:
codec = attempt["codec"]
bitrate = attempt["bitrate"]
preset = attempt["preset"]
ffmpeg_params = ["-pix_fmt", "yuv420p"]
if preset:
ffmpeg_params = ["-preset", preset, "-pix_fmt", "yuv420p"]
try:
logger.info(
"Renderizando clip %02d com codec %s (bitrate=%s, preset=%s)",
index,
codec,
bitrate,
preset or "default",
)
composite.write_videofile(
str(output_path),
codec=codec,
audio_codec=self.settings.rendering.audio_codec,
fps=self.settings.rendering.fps,
bitrate=bitrate,
ffmpeg_params=ffmpeg_params,
temp_audiofile=str(temp_audio_path),
remove_temp=True,
threads=4,
)
return
except Exception as exc: # noqa: BLE001 - propagate after fallbacks
last_error = exc
logger.warning(
"Falha ao renderizar com codec %s: %s", codec, exc, exc_info=True
)
if output_path.exists():
output_path.unlink(missing_ok=True)
if temp_audio_path.exists():
temp_audio_path.unlink(missing_ok=True)
raise RuntimeError("Todas as tentativas de renderizacao falharam") from last_error
def _encoding_attempts(self) -> List[Dict[str, str | None]]:
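"""Return the ordered, de-duplicated encoder settings to try (currently only the configured codec/bitrate/preset)."""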
settings = self.settings.rendering
attempts: List[Dict[str, str | None]] = []
attempts.append(
{
"codec": settings.video_codec,
"bitrate": settings.bitrate,
"preset": settings.preset,
}
)
deduped: List[Dict[str, str | None]] = []
seen = set()
for attempt in attempts:
key = (attempt["codec"], attempt["bitrate"], attempt["preset"])
if key in seen:
continue
seen.add(key)
deduped.append(attempt)
return deduped
@staticmethod
def _with_audio(
composite: CompositeVideoClip,
audio_clip,
) -> CompositeVideoClip:
"""Attach audio to a composite clip across MoviePy versions."""
if hasattr(composite, "with_audio"):
return composite.with_audio(audio_clip)
if hasattr(composite, "set_audio"):
return composite.set_audio(audio_clip)
raise AttributeError("CompositeVideoClip does not support audio assignment")
@staticmethod
def _make_textclip(
*,
text: str,
font_path,
font_size: int,
color: str,
size: Tuple[int, int],
) -> TextClip:
"""Create a TextClip compatible with MoviePy 1.x and 2.x.
MoviePy 2.x removed the 'align' keyword from TextClip. We try with
'align' for older versions and fall back to a call without it when
unsupported.
"""
kwargs = dict(
text=text,
font=str(font_path),
font_size=font_size,
color=color,
method="caption",
size=size,
)
try:
return TextClip(**kwargs, align="center") # MoviePy 1.x style
except TypeError:
logger.debug("TextClip 'align' not supported; falling back without it")
return TextClip(**kwargs) # MoviePy 2.x style