#v2 - Inicia testes da v2

- Adiciona rastreamento de objetos
- Facial detection
- Legenda interativa
- Cortes mais precisos
- Refinamento do Prompt
This commit is contained in:
LeoMortari
2025-11-12 11:38:09 -03:00
parent 87c6a5e27c
commit c5d3e83a5f
15 changed files with 1739 additions and 313 deletions

View File

@@ -15,6 +15,7 @@ from PIL import Image, ImageColor, ImageDraw, ImageFont
from video_render.config import Settings
from video_render.transcription import TranscriptionResult, WordTiming
from video_render.smart_framing import SmartFramer, extract_audio_samples
logger = logging.getLogger(__name__)
@@ -54,7 +55,41 @@ class CaptionBuilder:
self.space_width = self.font.getbbox(" ")[2] - self.font.getbbox(" ")[0]
def build(self, words: Sequence[WordTiming], clip_start: float) -> List[CaptionClipSet]:
grouped = self._group_words(words)
# Filter out empty, whitespace-only, or very short words (likely noise)
valid_words = [
w for w in words
if w.word
and w.word.strip()
and len(w.word.strip()) >= 2 # At least 2 characters
and not w.word.strip() in ['...', '..', '.', ',', '-', 'hmm', 'hm', 'ah', 'eh', 'uh'] # Not just punctuation or filler
]
# Note: We don't filter out words based on gaps here
# Gap detection is handled in _group_words_with_gaps
# This ensures captions disappear during silence naturally
filtered_words = valid_words
# Calculate speech density (words per second)
# If density is too low, it's likely just noise/silence being misinterpreted
if filtered_words:
first_word_time = filtered_words[0].start
last_word_time = filtered_words[-1].end
duration = last_word_time - first_word_time
if duration > 0:
words_per_second = len(filtered_words) / duration
# Typical speech is 2-3 words per second
# If less than 0.5 words/second, it's probably silence/noise
if words_per_second < 0.5:
logger.debug(f"Captions suprimidas: densidade muito baixa ({words_per_second:.2f} palavras/seg)")
return []
# Only show captions if we have at least 3 valid words (reduced from 5 for 2-word groups)
# This prevents showing captions for noise/mumbling
if len(filtered_words) < 3:
return []
grouped = self._group_words_with_gaps(filtered_words)
clip_sets: List[CaptionClipSet] = []
for group in grouped:
@@ -101,6 +136,92 @@ class CaptionBuilder:
if len(widths) > 1:
total_width += self.space_width * (len(widths) - 1)
# Check if text needs to wrap to multiple lines
# If total width exceeds canvas width, break into 2 lines
needs_wrap = total_width > self.canvas_width
if needs_wrap:
# Split into 2 lines - try to balance the lines
mid_point = len(texts) // 2
line1_texts = texts[:mid_point]
line2_texts = texts[mid_point:]
line1_widths = widths[:mid_point]
line2_widths = widths[mid_point:]
# Calculate widths for each line
line1_width = sum(line1_widths)
if len(line1_widths) > 1:
line1_width += self.space_width * (len(line1_widths) - 1)
line2_width = sum(line2_widths)
if len(line2_widths) > 1:
line2_width += self.space_width * (len(line2_widths) - 1)
# Double the canvas height for 2 lines
canvas_height = self.canvas_height * 2
base_image = Image.new("RGBA", (self.canvas_width, canvas_height), (0, 0, 0, 0))
base_draw = ImageDraw.Draw(base_image)
highlight_images: List[Image.Image] = []
# Stroke settings: 8px black stroke for better readability
stroke_width = 8
stroke_color = (0, 0, 0, 255) # Black
# Draw line 1
x = max(0, (self.canvas_width - line1_width) // 2)
y = self.baseline
for i, (text, width) in enumerate(zip(line1_texts, line1_widths)):
base_draw.text(
(x, y),
text,
font=self.font,
fill=self.base_color,
stroke_width=stroke_width,
stroke_fill=stroke_color
)
highlight_image = Image.new("RGBA", base_image.size, (0, 0, 0, 0))
highlight_draw = ImageDraw.Draw(highlight_image)
highlight_draw.text(
(x, y),
text,
font=self.font,
fill=self.highlight_color,
stroke_width=stroke_width,
stroke_fill=stroke_color
)
highlight_images.append(highlight_image)
x += width + self.space_width
# Draw line 2
x = max(0, (self.canvas_width - line2_width) // 2)
y = self.baseline + self.text_height + 5 # 5px spacing between lines
for i, (text, width) in enumerate(zip(line2_texts, line2_widths)):
base_draw.text(
(x, y),
text,
font=self.font,
fill=self.base_color,
stroke_width=stroke_width,
stroke_fill=stroke_color
)
highlight_image = Image.new("RGBA", base_image.size, (0, 0, 0, 0))
highlight_draw = ImageDraw.Draw(highlight_image)
highlight_draw.text(
(x, y),
text,
font=self.font,
fill=self.highlight_color,
stroke_width=stroke_width,
stroke_fill=stroke_color
)
highlight_images.append(highlight_image)
x += width + self.space_width
return base_image, highlight_images
# Single line rendering (original code)
start_x = max(0, (self.canvas_width - total_width) // 2)
base_image = Image.new("RGBA", (self.canvas_width, self.canvas_height), (0, 0, 0, 0))
@@ -108,13 +229,31 @@ class CaptionBuilder:
highlight_images: List[Image.Image] = []
x = start_x
for text, width in zip(texts, widths):
base_draw.text((x, self.baseline), text, font=self.font, fill=self.base_color)
# Stroke settings: 8px black stroke for better readability
stroke_width = 8
stroke_color = (0, 0, 0, 255) # Black
for text, width in zip(texts, widths):
# Draw base text with stroke
base_draw.text(
(x, self.baseline),
text,
font=self.font,
fill=self.base_color,
stroke_width=stroke_width,
stroke_fill=stroke_color
)
# Draw highlight text with stroke
highlight_image = Image.new("RGBA", base_image.size, (0, 0, 0, 0))
highlight_draw = ImageDraw.Draw(highlight_image)
highlight_draw.text(
(x, self.baseline), text, font=self.font, fill=self.highlight_color
(x, self.baseline),
text,
font=self.font,
fill=self.highlight_color,
stroke_width=stroke_width,
stroke_fill=stroke_color
)
highlight_images.append(highlight_image)
@@ -153,6 +292,44 @@ class CaptionBuilder:
return grouped
def _group_words_with_gaps(self, words: Sequence[WordTiming]) -> List[List[WordTiming]]:
"""
Group words into 2-word chunks, respecting silence gaps.
Creates natural breaks where there are pauses > 1.5s
"""
if not words:
return []
grouped: List[List[WordTiming]] = []
buffer: List[WordTiming] = []
for i, word in enumerate(words):
# Check if there's a long pause before this word
if i > 0:
gap = word.start - words[i-1].end
# If gap > 1.5s, finish current buffer and start new group
if gap > 1.5:
if buffer:
grouped.append(buffer)
buffer = []
buffer.append(word)
# Group into 2 words maximum
if len(buffer) == 2:
grouped.append(buffer)
buffer = []
# Handle remaining words
if buffer:
if len(buffer) == 1 and grouped:
# Add single remaining word to last group
grouped[-1].append(buffer[0])
else:
grouped.append(buffer)
return [grp for grp in grouped if grp]
@staticmethod
def _clean_word(text: str) -> str:
text = text.strip()
@@ -164,6 +341,12 @@ class VideoRenderer:
def __init__(self, settings: Settings) -> None:
    """Keep the settings and create the caption builder and smart framer."""
    self.settings = settings
    self.captions = CaptionBuilder(settings)

    # The framer is configured entirely from the rendering section.
    rendering = settings.rendering
    framer_options = {
        "target_width": rendering.frame_width,
        "target_height": rendering.frame_height,
        "frame_skip": rendering.smart_framing_frame_skip,
        "smoothing_window": rendering.smart_framing_smoothing_window,
    }
    self.smart_framer = SmartFramer(**framer_options)
def render(
self,
@@ -234,26 +417,100 @@ class VideoRenderer:
duration = end - start
frame_w = self.settings.rendering.frame_width
frame_h = self.settings.rendering.frame_height
top_h = int(frame_h * 0.18)
# Removed top panel - no longer showing title
bottom_h = int(frame_h * 0.20)
video_area_h = max(1, frame_h - top_h - bottom_h)
scale_factor = min(
frame_w / subclip.w,
video_area_h / subclip.h,
)
resized_clip = subclip.resized(scale_factor)
video_y = top_h + (video_area_h - resized_clip.h) // 2
video_clip = resized_clip.with_position(
((frame_w - resized_clip.w) // 2, video_y)
)
# Use smart framing to create intelligent 9:16 video (if enabled)
if self.settings.rendering.enable_smart_framing:
logger.info(f"Creating smart framing plan for clip {index} ({start:.2f}s - {end:.2f}s)")
try:
# Extract audio for speech detection
audio_samples = extract_audio_samples(source_path, start, end)
# Create framing plan
framing_plan = self.smart_framer.create_framing_plan(
video_path=source_path,
start_time=start,
end_time=end,
audio_samples=audio_samples
)
# Apply smart framing based on detected layout
use_split_screen = framing_plan.layout_mode in ["dual_split", "grid"]
video_clip = self.smart_framer.apply_framing(
video_clip=subclip,
framing_plan=framing_plan,
use_split_screen=use_split_screen
)
logger.info(f"Smart framing applied: layout={framing_plan.layout_mode}, "
f"faces_detected={len(framing_plan.frame_contexts[0].detected_faces) if framing_plan.frame_contexts else 0}")
except Exception as exc:
logger.warning(f"Smart framing failed for clip {index}, falling back to center crop: {exc}", exc_info=True)
# Fallback to center crop (maintains aspect ratio, crops to fit)
video_area_h = max(1, frame_h - bottom_h)
# Use MAX to ensure video covers entire area (will crop excess)
scale_factor = max(
frame_w / subclip.w,
video_area_h / subclip.h,
)
# Resize to cover area
resized_clip = subclip.resized(scale_factor)
# Calculate crop region (center crop)
crop_x1 = max(0, (resized_clip.w - frame_w) // 2)
crop_y1 = max(0, (resized_clip.h - video_area_h) // 2)
crop_x2 = crop_x1 + frame_w
crop_y2 = crop_y1 + video_area_h
# Crop to fit target dimensions using MoviePy crop(x1, y1, x2, y2)
cropped_clip = resized_clip.cropped(
x1=crop_x1,
y1=crop_y1,
x2=crop_x2,
y2=crop_y2
)
video_clip = cropped_clip.with_position((0, 0))
resized_clip.close()
else:
# Use center crop (smart framing disabled)
logger.info(f"Using center crop for clip {index} (smart framing disabled)")
video_area_h = max(1, frame_h - bottom_h)
# Use MAX to ensure video covers entire area (will crop excess)
scale_factor = max(
frame_w / subclip.w,
video_area_h / subclip.h,
)
# Resize to cover area
resized_clip = subclip.resized(scale_factor)
# Calculate crop region (center crop)
crop_x1 = max(0, (resized_clip.w - frame_w) // 2)
crop_y1 = max(0, (resized_clip.h - video_area_h) // 2)
crop_x2 = crop_x1 + frame_w
crop_y2 = crop_y1 + video_area_h
# Crop to fit target dimensions using MoviePy crop(x1, y1, x2, y2)
cropped_clip = resized_clip.cropped(
x1=crop_x1,
y1=crop_y1,
x2=crop_x2,
y2=crop_y2
)
video_clip = cropped_clip.with_position((0, 0))
resized_clip.close()
background = ColorClip(size=(frame_w, frame_h), color=(0, 0, 0)).with_duration(duration)
top_panel = (
ColorClip(size=(frame_w, top_h), color=(12, 12, 12))
.with_duration(duration)
.with_opacity(0.85)
)
# Removed top panel and title - no longer needed
bottom_panel = (
ColorClip(size=(frame_w, bottom_h), color=(12, 12, 12))
.with_position((0, frame_h - bottom_h))
@@ -261,34 +518,42 @@ class VideoRenderer:
.with_opacity(0.85)
)
title_clip = self._build_title_clip(
title=title,
summary=summary,
duration=duration,
frame_width=frame_w,
top_panel_height=top_h,
)
title_clip = title_clip.with_position(
((frame_w - title_clip.w) // 2, (top_h - title_clip.h) // 2)
)
words = self._collect_words(transcription, start, end)
caption_sets = self.captions.build(words, clip_start=start)
# Calculate speech coverage: how much of the clip has actual speech?
# If less than 30% of the clip has speech, don't show captions
clip_duration = end - start
if words and clip_duration > 0:
# Calculate total time with speech
total_speech_time = sum(w.end - w.start for w in words)
speech_coverage = total_speech_time / clip_duration
if speech_coverage < 0.3: # Less than 30% speech
logger.debug(f"Captions suprimidas: cobertura de fala baixa ({speech_coverage:.1%})")
words = [] # Clear words to prevent captions
# Only build captions if there are actual words to display
# This prevents empty/placeholder captions from appearing
caption_sets = self.captions.build(words, clip_start=start) if words else []
caption_clips = []
caption_resources: List[ImageClip] = []
caption_area_top = frame_h - bottom_h
caption_area_height = bottom_h
# Position captions 120px below center (for 1920px height, center is 960px, so 1080px)
# This ensures they're visible, well-positioned, and don't interfere with faces
# Range: 100-150px as requested, using 120px for optimal positioning
center_y = frame_h // 2
caption_y = center_y + 120
caption_margin = 20
raw_caption_y = caption_area_top + (caption_area_height - self.captions.canvas_height) // 2
min_caption_y = caption_area_top + caption_margin
max_caption_y = (
caption_area_top + caption_area_height - self.captions.canvas_height - caption_margin
)
# Ensure captions stay within reasonable bounds (no top panel now)
min_caption_y = caption_margin
max_caption_y = frame_h - bottom_h - self.captions.canvas_height - caption_margin
if max_caption_y < min_caption_y:
caption_y = min_caption_y
else:
caption_y = min(max(raw_caption_y, min_caption_y), max_caption_y)
caption_y = min(max(caption_y, min_caption_y), max_caption_y)
for clip_set in caption_sets:
base_positioned = clip_set.base.with_position(("center", caption_y))
@@ -299,30 +564,20 @@ class VideoRenderer:
caption_clips.append(positioned)
caption_resources.append(highlight)
if not caption_clips:
fallback_text = self._wrap_text(summary or title, max_width=frame_w - 160)
caption_clips.append(
self._make_textclip(
text=fallback_text,
font_path=self.settings.rendering.font_path,
font_size=self.settings.rendering.subtitle_font_size,
color=self.settings.rendering.base_color,
size=(frame_w - 160, max(40, self.captions.canvas_height)),
)
.with_duration(duration)
.with_position(("center", caption_y))
)
# No fallback captions - if there are no dynamic captions, show nothing
# This matches Opus Clip behavior where captions only appear when there's actual speech
audio_clip, audio_needs_close = self._materialize_audio(
source_path=source_path,
start=start,
end=end,
duration=duration,
fallback_audio=video_clip.audio or resized_clip.audio or subclip.audio,
fallback_audio=video_clip.audio or subclip.audio,
)
# Composite with background, bottom panel, video, and captions only (no top panel or title)
composite = CompositeVideoClip(
[background, top_panel, bottom_panel, video_clip, title_clip, *caption_clips],
[background, bottom_panel, video_clip, *caption_clips],
size=(frame_w, frame_h),
)
if audio_clip is not None:
@@ -337,11 +592,8 @@ class VideoRenderer:
)
composite.close()
resized_clip.close()
video_clip.close()
title_clip.close()
background.close()
top_panel.close()
bottom_panel.close()
for clip in caption_clips:
clip.close()
@@ -352,95 +604,6 @@ class VideoRenderer:
return str(output_path)
def _build_title_clip(
    self,
    *,
    title: str,
    summary: str,
    duration: float,
    frame_width: int,
    top_panel_height: int,
) -> ImageClip:
    """
    Render the title (or the summary when the title is empty) as an image clip.

    The text is wrapped to ``max(200, frame_width - 160)`` pixels. Starting
    from the configured title font size, the font shrinks in steps of 6
    until the wrapped text is no taller than ``max(80, top_panel_height - 40)``
    or the minimum size (60% of the configured size, at least 28) is reached.
    Each line is centred horizontally on a transparent RGBA canvas.

    Returns:
        An ``ImageClip`` of the rendered text lasting ``duration``.
    """
    text = (title or summary or "").strip()
    if not text:
        text = summary or ""

    max_width = max(200, frame_width - 160)
    font_size = self.settings.rendering.title_font_size
    min_font_size = max(28, int(font_size * 0.6))
    target_height = max(80, top_panel_height - 40)
    title_color = ImageColor.getrgb(self.settings.rendering.base_color)
    font_path = self.settings.rendering.font_path

    def layout(size):
        # Load the face at this size and measure the wrapped text with it.
        face = ImageFont.truetype(str(font_path), size)
        rows = self._split_title_lines(text, face, max_width)
        row_height = face.getbbox("Ay")[3] - face.getbbox("Ay")[1]
        gap = max(4, int(row_height * 0.25))
        total = self._measure_text_height(len(rows), row_height, gap)
        return face, rows, row_height, gap, total

    # Shrink until the text fits or the minimum size is reached.
    while True:
        fitted_height = layout(font_size)[4]
        if fitted_height <= target_height or font_size <= min_font_size:
            break
        font_size = max(min_font_size, font_size - 6)

    # Measure once more at the final size; these values drive the drawing.
    font, lines, line_height, spacing, text_height = layout(font_size)

    image = Image.new("RGBA", (max_width, max(1, text_height)), (0, 0, 0, 0))
    draw = ImageDraw.Draw(image)

    last_index = len(lines) - 1
    y = 0
    for idx, line in enumerate(lines):
        bbox = font.getbbox(line)
        x = max(0, (max_width - (bbox[2] - bbox[0])) // 2)
        # Subtract the bbox top offset so the glyphs start exactly at y.
        draw.text((x, y - bbox[1]), line, font=font, fill=title_color)
        y += line_height
        if idx < last_index:
            y += spacing

    return ImageClip(np.array(image)).with_duration(duration)
@staticmethod
def _measure_text_height(line_count: int, line_height: int, spacing: int) -> int:
if line_count <= 0:
return line_height
return line_count * line_height + max(0, line_count - 1) * spacing
@staticmethod
def _split_title_lines(
text: str, font: ImageFont.FreeTypeFont, max_width: int
) -> List[str]:
words = text.split()
if not words:
return [""]
lines: List[str] = []
current: List[str] = []
for word in words:
test_line = " ".join(current + [word]) if current else word
bbox = font.getbbox(test_line)
line_width = bbox[2] - bbox[0]
if line_width <= max_width or not current:
current.append(word)
if line_width > max_width and not current[:-1]:
lines.append(" ".join(current))
current = []
continue
lines.append(" ".join(current))
current = [word]
if current:
lines.append(" ".join(current))
return lines
def _materialize_audio(
self,
*,