from __future__ import annotations

import logging
import re
from dataclasses import dataclass
from typing import Dict, Iterable, List, Sequence, Tuple, Optional

import numpy as np
from moviepy.audio.AudioClip import AudioArrayClip, AudioClip
from moviepy.audio.io.AudioFileClip import AudioFileClip
from moviepy.video.VideoClip import ColorClip, ImageClip, TextClip
from moviepy.video.compositing.CompositeVideoClip import CompositeVideoClip
from moviepy.video.io.VideoFileClip import VideoFileClip
from PIL import Image, ImageColor, ImageDraw, ImageFont

from video_render.config import Settings
from video_render.transcription import TranscriptionResult, WordTiming
from video_render.smart_framing import SmartFramer, extract_audio_samples

logger = logging.getLogger(__name__)


def clamp_time(value: float, minimum: float = 0.0) -> float:
    return max(minimum, float(value))


@dataclass
class CaptionClipSet:
    base: ImageClip
    highlights: List[ImageClip]


class CaptionBuilder:
    def __init__(self, settings: Settings) -> None:
        self.settings = settings
        self.font_path = settings.rendering.font_path

        if not self.font_path.exists():
            raise FileNotFoundError(f"Fonte nao encontrada: {self.font_path}")

        self.font = ImageFont.truetype(
            str(self.font_path), settings.rendering.subtitle_font_size
        )
        self.base_color = ImageColor.getrgb(settings.rendering.base_color)
        self.highlight_color = ImageColor.getrgb(settings.rendering.highlight_color)
        self.canvas_width = settings.rendering.frame_width - 160
        self.canvas_height = int(settings.rendering.subtitle_font_size * 2.2)
        self.min_words = settings.rendering.caption_min_words
        self.max_words = settings.rendering.caption_max_words

        bbox = self.font.getbbox("Ay")

        self.text_height = bbox[3] - bbox[1]
        self.baseline = (self.canvas_height - self.text_height) // 2 - bbox[1]
        self.space_width = self.font.getbbox(" ")[2] - self.font.getbbox(" ")[0]
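
        # Sizing sketch for the values above, using hypothetical settings
        # (frame_width=1080, subtitle_font_size=110): canvas_width = 1080 - 160 = 920
        # and canvas_height = int(110 * 2.2) = 242. The real numbers come from Settings.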

    def build(self, words: Sequence[WordTiming], clip_start: float) -> List[CaptionClipSet]:
        # Filter out empty, whitespace-only, or very short words (likely noise)
        valid_words = [
            w for w in words
            if w.word
            and w.word.strip()
            and len(w.word.strip()) >= 2  # At least 2 characters
            and w.word.strip() not in ['...', '..', '.', ',', '-', 'hmm', 'hm', 'ah', 'eh', 'uh']  # Not just punctuation or filler
        ]

        # Note: we do not filter out words based on gaps here.
        # Gap detection is handled in _group_words_with_gaps,
        # which lets captions disappear naturally during silence.
        filtered_words = valid_words

        # Calculate speech density (words per second).
        # If density is too low, the audio is likely silence/noise being misinterpreted.
        if filtered_words:
            first_word_time = filtered_words[0].start
            last_word_time = filtered_words[-1].end
            duration = last_word_time - first_word_time

            if duration > 0:
                words_per_second = len(filtered_words) / duration
                # Typical speech is 2-3 words per second.
                # Below 0.5 words/second, it is probably silence/noise.
                if words_per_second < 0.5:
                    logger.debug(f"Captions suprimidas: densidade muito baixa ({words_per_second:.2f} palavras/seg)")
                    return []

        # Only show captions if we have at least 3 valid words (reduced from 5 for 2-word groups).
        # This prevents showing captions for noise/mumbling.
        if len(filtered_words) < 3:
            return []

        grouped = self._group_words_with_gaps(filtered_words)
        clip_sets: List[CaptionClipSet] = []

        for group in grouped:
            group_start = clamp_time(group[0].start, minimum=clip_start)
            group_end = clamp_time(group[-1].end, minimum=group_start + 0.05)
            duration = max(0.05, group_end - group_start)
            start_offset = group_start - clip_start

            base_image, highlight_images = self._render_group(group)

            base_clip = (
                ImageClip(np.array(base_image))
                .with_start(start_offset)
                .with_duration(duration)
            )

            highlight_clips: List[ImageClip] = []

            for word, image in zip(group, highlight_images):
                h_start = clamp_time(word.start, minimum=clip_start) - clip_start
                h_end = clamp_time(word.end, minimum=word.start + 0.02) - clip_start
                h_duration = max(0.05, h_end - h_start)
                highlight_clip = (
                    ImageClip(np.array(image))
                    .with_start(h_start)
                    .with_duration(h_duration)
                )
                highlight_clips.append(highlight_clip)

            clip_sets.append(CaptionClipSet(base=base_clip, highlights=highlight_clips))

        return clip_sets
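
    # Worked example of the density gate above (hypothetical numbers): if only 4 valid
    # words span from 2.0s to 12.0s, words_per_second = 4 / 10.0 = 0.4, which is below
    # the 0.5 threshold, so build() returns [] and no captions are shown.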

    def _render_group(self, group: Sequence[WordTiming]) -> Tuple[Image.Image, List[Image.Image]]:
        texts = [self._clean_word(word.word) for word in group]
        widths = []

        for text in texts:
            bbox = self.font.getbbox(text)
            widths.append(bbox[2] - bbox[0])

        total_width = sum(widths)

        if len(widths) > 1:
            total_width += self.space_width * (len(widths) - 1)

        # Check if the text needs to wrap: if the total width exceeds the canvas width,
        # break the group into 2 lines.
        needs_wrap = total_width > self.canvas_width

        if needs_wrap:
            # Split into 2 lines, trying to balance them.
            mid_point = len(texts) // 2
            line1_texts = texts[:mid_point]
            line2_texts = texts[mid_point:]
            line1_widths = widths[:mid_point]
            line2_widths = widths[mid_point:]

            # Calculate the width of each line
            line1_width = sum(line1_widths)
            if len(line1_widths) > 1:
                line1_width += self.space_width * (len(line1_widths) - 1)

            line2_width = sum(line2_widths)
            if len(line2_widths) > 1:
                line2_width += self.space_width * (len(line2_widths) - 1)

            # Double the canvas height for 2 lines
            canvas_height = self.canvas_height * 2
            base_image = Image.new("RGBA", (self.canvas_width, canvas_height), (0, 0, 0, 0))
            base_draw = ImageDraw.Draw(base_image)
            highlight_images: List[Image.Image] = []

            # Stroke settings: 8px black stroke for better readability
            stroke_width = 8
            stroke_color = (0, 0, 0, 255)  # Black

            # Draw line 1
            x = max(0, (self.canvas_width - line1_width) // 2)
            y = self.baseline
            for text, width in zip(line1_texts, line1_widths):
                base_draw.text(
                    (x, y),
                    text,
                    font=self.font,
                    fill=self.base_color,
                    stroke_width=stroke_width,
                    stroke_fill=stroke_color,
                )

                highlight_image = Image.new("RGBA", base_image.size, (0, 0, 0, 0))
                highlight_draw = ImageDraw.Draw(highlight_image)
                highlight_draw.text(
                    (x, y),
                    text,
                    font=self.font,
                    fill=self.highlight_color,
                    stroke_width=stroke_width,
                    stroke_fill=stroke_color,
                )
                highlight_images.append(highlight_image)
                x += width + self.space_width

            # Draw line 2
            x = max(0, (self.canvas_width - line2_width) // 2)
            y = self.baseline + self.text_height + 5  # 5px spacing between lines
            for text, width in zip(line2_texts, line2_widths):
                base_draw.text(
                    (x, y),
                    text,
                    font=self.font,
                    fill=self.base_color,
                    stroke_width=stroke_width,
                    stroke_fill=stroke_color,
                )

                highlight_image = Image.new("RGBA", base_image.size, (0, 0, 0, 0))
                highlight_draw = ImageDraw.Draw(highlight_image)
                highlight_draw.text(
                    (x, y),
                    text,
                    font=self.font,
                    fill=self.highlight_color,
                    stroke_width=stroke_width,
                    stroke_fill=stroke_color,
                )
                highlight_images.append(highlight_image)
                x += width + self.space_width

            return base_image, highlight_images

        # Single-line rendering (original path)
        start_x = max(0, (self.canvas_width - total_width) // 2)

        base_image = Image.new("RGBA", (self.canvas_width, self.canvas_height), (0, 0, 0, 0))
        base_draw = ImageDraw.Draw(base_image)
        highlight_images: List[Image.Image] = []
        x = start_x

        # Stroke settings: 8px black stroke for better readability
        stroke_width = 8
        stroke_color = (0, 0, 0, 255)  # Black

        for text, width in zip(texts, widths):
            # Draw base text with stroke
            base_draw.text(
                (x, self.baseline),
                text,
                font=self.font,
                fill=self.base_color,
                stroke_width=stroke_width,
                stroke_fill=stroke_color,
            )

            # Draw highlight text with stroke
            highlight_image = Image.new("RGBA", base_image.size, (0, 0, 0, 0))
            highlight_draw = ImageDraw.Draw(highlight_image)
            highlight_draw.text(
                (x, self.baseline),
                text,
                font=self.font,
                fill=self.highlight_color,
                stroke_width=stroke_width,
                stroke_fill=stroke_color,
            )
            highlight_images.append(highlight_image)

            x += width + self.space_width

        return base_image, highlight_images
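
    # Wrap-decision sketch with hypothetical numbers: for canvas_width = 920 and two
    # words measuring 500px and 480px plus a 20px space, total_width = 1000 > 920, so
    # the group is drawn on two lines; at 620px total it would stay on a single line.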

    def _group_words(self, words: Sequence[WordTiming]) -> List[List[WordTiming]]:
        if not words:
            return []

        grouped: List[List[WordTiming]] = []
        buffer: List[WordTiming] = []

        for word in words:
            buffer.append(word)

            if len(buffer) == self.max_words:
                grouped.append(buffer)
                buffer = []

        if buffer:
            if len(buffer) == 1 and grouped:
                grouped[-1].extend(buffer)
            else:
                grouped.append(buffer)

        for idx, group in enumerate(grouped[:-1]):
            if len(group) < self.min_words and len(grouped[idx + 1]) > self.min_words:
                deficit = self.min_words - len(group)
                transfer = grouped[idx + 1][:deficit]
                grouped[idx] = group + transfer
                grouped[idx + 1] = grouped[idx + 1][deficit:]

        grouped = [grp for grp in grouped if grp]

        return grouped

    def _group_words_with_gaps(self, words: Sequence[WordTiming]) -> List[List[WordTiming]]:
        """
        Group words into 2-word chunks, respecting silence gaps.
        Creates natural breaks where there are pauses longer than 1.5s.
        """
        if not words:
            return []

        grouped: List[List[WordTiming]] = []
        buffer: List[WordTiming] = []

        for i, word in enumerate(words):
            # Check if there is a long pause before this word
            if i > 0:
                gap = word.start - words[i - 1].end
                # If the gap exceeds 1.5s, finish the current buffer and start a new group
                if gap > 1.5:
                    if buffer:
                        grouped.append(buffer)
                        buffer = []

            buffer.append(word)

            # Group into 2 words maximum
            if len(buffer) == 2:
                grouped.append(buffer)
                buffer = []

        # Handle remaining words
        if buffer:
            if len(buffer) == 1 and grouped:
                # Add a single remaining word to the last group
                grouped[-1].append(buffer[0])
            else:
                grouped.append(buffer)

        return [grp for grp in grouped if grp]
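
    # Grouping sketch (hypothetical WordTiming values): if one word ends at 0.7s and the
    # next starts at 2.5s, the 1.8s gap exceeds the 1.5s threshold, so the buffer is
    # flushed and a new caption group begins with the later word.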

    @staticmethod
    def _clean_word(text: str) -> str:
        text = text.strip()
        text = re.sub(r"\s+", " ", text)
        return text or "..."


class VideoRenderer:
    def __init__(self, settings: Settings) -> None:
        self.settings = settings
        self.captions = CaptionBuilder(settings)
        self.smart_framer = SmartFramer(
            target_width=settings.rendering.frame_width,
            target_height=settings.rendering.frame_height,
            frame_skip=settings.rendering.smart_framing_frame_skip,
            smoothing_window=settings.rendering.smart_framing_smoothing_window,
            max_velocity=settings.rendering.smart_framing_max_velocity,
            person_switch_cooldown=settings.rendering.smart_framing_person_switch_cooldown,
        )

    def render(
        self,
        workspace_path: str,
        highlight_windows: Sequence,
        transcription: TranscriptionResult,
        titles: Sequence[str],
        output_dir,
    ) -> List[Tuple[str, float, float, str, str, int]]:
        results: List[Tuple[str, float, float, str, str, int]] = []

        with VideoFileClip(workspace_path) as base_clip:
            video_duration = base_clip.duration or 0

            for index, window in enumerate(highlight_windows, start=1):
                start = clamp_time(window.start)
                end = clamp_time(window.end)
                start = min(start, video_duration)
                end = min(end, video_duration)

                if end <= start:
                    logger.info("Janela ignorada por intervalo invalido: %s", window)
                    continue

                subclip = base_clip.subclipped(start, end)

                try:
                    rendered_path = self._render_single_clip(
                        subclip=subclip,
                        start=start,
                        end=end,
                        title=titles[index - 1] if index - 1 < len(titles) else window.summary,
                        summary=window.summary,
                        index=index,
                        transcription=transcription,
                        output_dir=output_dir,
                        source_path=workspace_path,
                    )
                finally:
                    subclip.close()

                results.append(
                    (
                        rendered_path,
                        float(start),
                        float(end),
                        titles[index - 1] if index - 1 < len(titles) else window.summary,
                        window.summary,
                        index,
                    )
                )

        return results
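
    # Shape of each result entry, mirroring the append above:
    #   (rendered_path, start_seconds, end_seconds, title, summary, clip_index)
    # e.g. ("/tmp/out/clip_01.mp4", 12.0, 47.5, "Titulo", "Resumo", 1); the concrete
    # values are illustrative only.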

    def _render_single_clip(
        self,
        subclip: VideoFileClip,
        start: float,
        end: float,
        title: str,
        summary: str,
        index: int,
        transcription: TranscriptionResult,
        output_dir,
        source_path: str,
    ) -> str:
        duration = end - start
        frame_w = self.settings.rendering.frame_width
        frame_h = self.settings.rendering.frame_height
        # Removed top panel - no longer showing title
        bottom_h = int(frame_h * 0.20)

        # Use smart framing to create an intelligent 9:16 video (if enabled)
        if self.settings.rendering.enable_smart_framing:
            logger.info(f"Creating smart framing plan for clip {index} ({start:.2f}s - {end:.2f}s)")

            try:
                # Extract audio for speech detection
                audio_samples = extract_audio_samples(source_path, start, end)

                # Create framing plan
                framing_plan = self.smart_framer.create_framing_plan(
                    video_path=source_path,
                    start_time=start,
                    end_time=end,
                    audio_samples=audio_samples,
                )

                # Apply smart framing (always single-person focus)
                video_clip = self.smart_framer.apply_framing(
                    video_clip=subclip,
                    framing_plan=framing_plan,
                )

                logger.info(
                    f"Smart framing applied: layout={framing_plan.layout_mode}, "
                    f"faces_detected={len(framing_plan.frame_contexts[0].detected_faces) if framing_plan.frame_contexts else 0}"
                )

            except Exception as exc:
                logger.warning(f"Smart framing failed for clip {index}, falling back to center crop: {exc}", exc_info=True)

                # Fall back to center crop (maintains aspect ratio, crops to fit)
                video_area_h = max(1, frame_h - bottom_h)

                # Use max() so the video covers the entire area (excess is cropped)
                scale_factor = max(
                    frame_w / subclip.w,
                    video_area_h / subclip.h,
                )

                # Resize to cover the area
                resized_clip = subclip.resized(scale_factor)

                # Calculate the crop region (center crop)
                crop_x1 = max(0, (resized_clip.w - frame_w) // 2)
                crop_y1 = max(0, (resized_clip.h - video_area_h) // 2)
                crop_x2 = crop_x1 + frame_w
                crop_y2 = crop_y1 + video_area_h

                # Crop to the target dimensions using MoviePy cropped(x1, y1, x2, y2)
                cropped_clip = resized_clip.cropped(
                    x1=crop_x1,
                    y1=crop_y1,
                    x2=crop_x2,
                    y2=crop_y2,
                )

                video_clip = cropped_clip.with_position((0, 0))
                resized_clip.close()
        else:
            # Use center crop (smart framing disabled)
            logger.info(f"Using center crop for clip {index} (smart framing disabled)")
            video_area_h = max(1, frame_h - bottom_h)

            # Use max() so the video covers the entire area (excess is cropped)
            scale_factor = max(
                frame_w / subclip.w,
                video_area_h / subclip.h,
            )

            # Resize to cover the area
            resized_clip = subclip.resized(scale_factor)

            # Calculate the crop region (center crop)
            crop_x1 = max(0, (resized_clip.w - frame_w) // 2)
            crop_y1 = max(0, (resized_clip.h - video_area_h) // 2)
            crop_x2 = crop_x1 + frame_w
            crop_y2 = crop_y1 + video_area_h

            # Crop to the target dimensions using MoviePy cropped(x1, y1, x2, y2)
            cropped_clip = resized_clip.cropped(
                x1=crop_x1,
                y1=crop_y1,
                x2=crop_x2,
                y2=crop_y2,
            )

            video_clip = cropped_clip.with_position((0, 0))
            resized_clip.close()

        background = ColorClip(size=(frame_w, frame_h), color=(0, 0, 0)).with_duration(duration)
        # The top panel and title were removed; only the bottom panel remains
        bottom_panel = (
            ColorClip(size=(frame_w, bottom_h), color=(12, 12, 12))
            .with_position((0, frame_h - bottom_h))
            .with_duration(duration)
            .with_opacity(0.85)
        )

        words = self._collect_words(transcription, start, end)

        # Calculate speech coverage: how much of the clip has actual speech?
        # If less than 30% of the clip has speech, do not show captions.
        clip_duration = end - start
        if words and clip_duration > 0:
            # Total time covered by speech
            total_speech_time = sum(w.end - w.start for w in words)
            speech_coverage = total_speech_time / clip_duration

            if speech_coverage < 0.3:  # Less than 30% speech
                logger.debug(f"Captions suprimidas: cobertura de fala baixa ({speech_coverage:.1%})")
                words = []  # Clear words to prevent captions

        # Only build captions if there are actual words to display.
        # This prevents empty/placeholder captions from appearing.
        caption_sets = self.captions.build(words, clip_start=start) if words else []

        caption_clips = []
        caption_resources: List[ImageClip] = []

        # Position captions 120px below center (for a 1920px-tall frame the center is
        # 960px, so captions sit at 1080px). This keeps them visible, well positioned,
        # and clear of faces; 120px falls within the intended 100-150px range.
        center_y = frame_h // 2
        caption_y = center_y + 120
        caption_margin = 20

        # Ensure captions stay within reasonable bounds (there is no top panel now)
        min_caption_y = caption_margin
        max_caption_y = frame_h - bottom_h - self.captions.canvas_height - caption_margin

        if max_caption_y < min_caption_y:
            caption_y = min_caption_y
        else:
            caption_y = min(max(caption_y, min_caption_y), max_caption_y)

        for clip_set in caption_sets:
            base_positioned = clip_set.base.with_position(("center", caption_y))
            caption_clips.append(base_positioned)
            caption_resources.append(clip_set.base)
            for highlight in clip_set.highlights:
                positioned = highlight.with_position(("center", caption_y))
                caption_clips.append(positioned)
                caption_resources.append(highlight)

        # No fallback captions: if there are no dynamic captions, show nothing.
        # This matches Opus Clip behavior, where captions only appear for actual speech.

        audio_clip, audio_needs_close = self._materialize_audio(
            source_path=source_path,
            start=start,
            end=end,
            duration=duration,
            fallback_audio=video_clip.audio or subclip.audio,
        )

        # Composite background, bottom panel, video, and captions (no top panel or title)
        composite = CompositeVideoClip(
            [background, bottom_panel, video_clip, *caption_clips],
            size=(frame_w, frame_h),
        )
        if audio_clip is not None:
            composite = self._with_audio(composite, audio_clip)

        output_path = output_dir / f"clip_{index:02d}.mp4"
        self._write_with_fallback(
            composite=composite,
            output_path=output_path,
            index=index,
            output_dir=output_dir,
        )

        composite.close()
        video_clip.close()
        background.close()
        bottom_panel.close()
        for clip in caption_clips:
            clip.close()
        for clip in caption_resources:
            clip.close()
        if audio_clip is not None and audio_needs_close:
            audio_clip.close()

        # Force garbage collection to free memory after rendering
        import gc

        gc.collect()

        return str(output_path)
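
    # Coverage-gate sketch with hypothetical numbers: a 40s clip whose words sum to 8s
    # of speech has speech_coverage = 8 / 40 = 0.2, which is under the 0.3 threshold,
    # so captions for that clip are suppressed.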

    def _materialize_audio(
        self,
        *,
        source_path: str,
        start: float,
        end: float,
        duration: float,
        fallback_audio,
    ) -> Tuple[Optional[AudioClip], bool]:
        try:
            with AudioFileClip(source_path) as audio_file:
                segment = audio_file.subclipped(start, end)
                fps = (
                    getattr(segment, "fps", None)
                    or getattr(audio_file, "fps", None)
                    or 44100
                )
                samples = segment.to_soundarray(fps=fps)
        except Exception:
            logger.warning(
                "Falha ao carregar audio independente; utilizando fluxo original",
                exc_info=True,
            )
            return fallback_audio, False

        audio_clip = AudioArrayClip(samples, fps=fps).with_duration(duration)
        return audio_clip, True
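
    # Note on the arrays above: to_soundarray returns one row per sample and one column
    # per channel, so a 2.0s stereo segment at fps=44100 yields an array of shape
    # (88200, 2), which AudioArrayClip turns back into a playable clip (illustrative
    # figures; the actual fps comes from the source file).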

    def _collect_words(
        self, transcription: TranscriptionResult, start: float, end: float
    ) -> List[WordTiming]:
        collected: List[WordTiming] = []
        for segment in transcription.segments:
            if segment.end < start or segment.start > end:
                continue

            if segment.words:
                for word in segment.words:
                    if word.end < start or word.start > end:
                        continue
                    collected.append(
                        WordTiming(
                            start=max(start, word.start),
                            end=min(end, word.end),
                            word=word.word,
                        )
                    )
            else:
                collected.extend(self._fallback_words(segment.text, segment.start, segment.end, start, end))

        collected.sort(key=lambda w: w.start)
        return collected

    def _fallback_words(
        self,
        text: str,
        segment_start: float,
        segment_end: float,
        window_start: float,
        window_end: float,
    ) -> Iterable[WordTiming]:
        words = [w for w in re.split(r"\s+", text.strip()) if w]
        if not words:
            return []

        seg_start = max(segment_start, window_start)
        seg_end = min(segment_end, window_end)
        duration = max(0.01, seg_end - seg_start)
        step = duration / len(words)

        timings: List[WordTiming] = []
        for idx, word in enumerate(words):
            w_start = seg_start + idx * step
            w_end = min(seg_end, w_start + step)
            timings.append(WordTiming(start=w_start, end=w_end, word=word))
        return timings
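
    # Even-spacing sketch (hypothetical values): a segment with 4 words spanning
    # 10.0s-12.0s gives step = 2.0 / 4 = 0.5, so the words receive the synthetic
    # timings 10.0-10.5, 10.5-11.0, 11.0-11.5 and 11.5-12.0.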

    @staticmethod
    def _wrap_text(text: str, max_width: int) -> str:
        text = text.strip()
        if not text:
            return ""

        words = text.split()
        lines: List[str] = []
        current: List[str] = []
        for word in words:
            current.append(word)
            # Rough heuristic: assume ~18px per character, so max_width // 18 is the
            # approximate character budget for a line.
            if len(" ".join(current)) > max_width // 18:
                lines.append(" ".join(current[:-1]))
                current = [current[-1]]
        if current:
            lines.append(" ".join(current))
        return "\n".join(lines)
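
    # Heuristic sketch: with a hypothetical max_width of 920px the character budget is
    # 920 // 18 = 51, so a line is flushed once the joined words exceed 51 characters.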

    def _write_with_fallback(
        self,
        *,
        composite: CompositeVideoClip,
        output_path,
        index: int,
        output_dir,
    ) -> None:
        attempts = self._encoding_attempts()
        temp_audio_path = output_dir / f"temp_audio_{index:02d}.m4a"
        last_error: Exception | None = None

        for attempt in attempts:
            codec = attempt["codec"]
            bitrate = attempt["bitrate"]
            preset = attempt["preset"]

            ffmpeg_params = ["-pix_fmt", "yuv420p"]
            if preset:
                ffmpeg_params = ["-preset", preset, "-pix_fmt", "yuv420p"]

            try:
                logger.info(
                    "Renderizando clip %02d com codec %s (bitrate=%s, preset=%s)",
                    index,
                    codec,
                    bitrate,
                    preset or "default",
                )
                composite.write_videofile(
                    str(output_path),
                    codec=codec,
                    audio_codec=self.settings.rendering.audio_codec,
                    fps=self.settings.rendering.fps,
                    bitrate=bitrate,
                    ffmpeg_params=ffmpeg_params,
                    temp_audiofile=str(temp_audio_path),
                    remove_temp=True,
                    threads=4,
                )
                return
            except Exception as exc:  # noqa: BLE001 - propagate after fallbacks
                last_error = exc
                logger.warning(
                    "Falha ao renderizar com codec %s: %s", codec, exc, exc_info=True
                )
                if output_path.exists():
                    output_path.unlink(missing_ok=True)
                if temp_audio_path.exists():
                    temp_audio_path.unlink(missing_ok=True)

        raise RuntimeError("Todas as tentativas de renderizacao falharam") from last_error

    def _encoding_attempts(self) -> List[Dict[str, str | None]]:
        settings = self.settings.rendering
        attempts: List[Dict[str, str | None]] = []

        attempts.append(
            {
                "codec": settings.video_codec,
                "bitrate": settings.bitrate,
                "preset": settings.preset,
            }
        )

        deduped: List[Dict[str, str | None]] = []
        seen = set()
        for attempt in attempts:
            key = (attempt["codec"], attempt["bitrate"], attempt["preset"])
            if key in seen:
                continue
            seen.add(key)
            deduped.append(attempt)

        return deduped
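
    # Each attempt is a plain dict built from Settings.rendering, for example (values
    # are illustrative, not the project defaults):
    #   {"codec": "libx264", "bitrate": "8000k", "preset": "medium"}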

    @staticmethod
    def _with_audio(
        composite: CompositeVideoClip,
        audio_clip,
    ) -> CompositeVideoClip:
        """Attach audio to a composite clip across MoviePy versions."""
        if hasattr(composite, "with_audio"):
            return composite.with_audio(audio_clip)
        if hasattr(composite, "set_audio"):
            return composite.set_audio(audio_clip)
        raise AttributeError("CompositeVideoClip does not support audio assignment")

    @staticmethod
    def _make_textclip(
        *,
        text: str,
        font_path,
        font_size: int,
        color: str,
        size: Tuple[int, int],
    ) -> TextClip:
        """Create a TextClip compatible with MoviePy 1.x and 2.x.

        MoviePy 2.x removed the 'align' keyword from TextClip. We try with
        'align' for older versions and fall back to a call without it when
        unsupported.
        """
        kwargs = dict(
            text=text,
            font=str(font_path),
            font_size=font_size,
            color=color,
            method="caption",
            size=size,
        )
        try:
            return TextClip(**kwargs, align="center")  # MoviePy 1.x style
        except TypeError:
            logger.debug("TextClip 'align' not supported; falling back without it")
            return TextClip(**kwargs)  # MoviePy 2.x style