video-render/video_render/rendering.py

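"""Clip rendering pipeline.

CaptionBuilder rasterizes word-highlight captions with Pillow (a base image per
caption group plus one highlight overlay per word). VideoRenderer cuts each
highlight window from the source video, applies smart framing (or a center-crop
fallback), composites the captions and bottom panel, and writes the final
vertical 9:16 clips with MoviePy.
"""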
from __future__ import annotations
import gc
import logging
import re
from dataclasses import dataclass
from typing import Dict, Iterable, List, Sequence, Tuple, Optional
import numpy as np
from moviepy.audio.AudioClip import AudioArrayClip, AudioClip
from moviepy.audio.io.AudioFileClip import AudioFileClip
from moviepy.video.VideoClip import ColorClip, ImageClip, TextClip
from moviepy.video.compositing.CompositeVideoClip import CompositeVideoClip
from moviepy.video.io.VideoFileClip import VideoFileClip
from PIL import Image, ImageColor, ImageDraw, ImageFont
from video_render.config import Settings
from video_render.transcription import TranscriptionResult, WordTiming
from video_render.smart_framing import SmartFramer, extract_audio_samples
logger = logging.getLogger(__name__)
def clamp_time(value: float, minimum: float = 0.0) -> float:
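"""Return value as a float, clamped to be at least minimum."""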
return max(minimum, float(value))
@dataclass
class CaptionClipSet:
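"""Caption clips for one word group: a base clip plus per-word highlight overlays."""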
base: ImageClip
highlights: List[ImageClip]
class CaptionBuilder:
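"""Builds timed caption overlays: Pillow-rendered text wrapped in MoviePy ImageClips."""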
def __init__(self, settings: Settings) -> None:
self.settings = settings
self.font_path = settings.rendering.font_path
if not self.font_path.exists():
raise FileNotFoundError(f"Fonte nao encontrada: {self.font_path}")
self.font = ImageFont.truetype(
str(self.font_path), settings.rendering.subtitle_font_size
)
self.base_color = ImageColor.getrgb(settings.rendering.base_color)
self.highlight_color = ImageColor.getrgb(settings.rendering.highlight_color)
self.canvas_width = settings.rendering.frame_width - 160
self.canvas_height = int(settings.rendering.subtitle_font_size * 2.2)
self.min_words = settings.rendering.caption_min_words
self.max_words = settings.rendering.caption_max_words
bbox = self.font.getbbox("Ay")
self.text_height = bbox[3] - bbox[1]
self.baseline = (self.canvas_height - self.text_height) // 2 - bbox[1]
self.space_width = self.font.getbbox(" ")[2] - self.font.getbbox(" ")[0]
def build(self, words: Sequence[WordTiming], clip_start: float) -> List[CaptionClipSet]:
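"""Build caption clip sets for words, with start times relative to clip_start.

Returns an empty list when the filtered words look like noise: fewer than three
valid words, or a speech density below roughly 0.5 words per second.
"""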
# Filter out empty or very short words, bare punctuation, and filler sounds
# (likely transcription noise)
filler_tokens = {'...', '..', '.', ',', '-', 'hmm', 'hm', 'ah', 'eh', 'uh'}
filtered_words = [
w for w in words
if w.word
and w.word.strip()
and len(w.word.strip()) >= 2  # at least 2 characters
and w.word.strip() not in filler_tokens
]
# Gaps are not filtered here; silence handling lives in _group_words_with_gaps,
# so captions disappear naturally during pauses
# Calculate speech density (words per second)
# If density is too low, it's likely just noise/silence being misinterpreted
if filtered_words:
first_word_time = filtered_words[0].start
last_word_time = filtered_words[-1].end
duration = last_word_time - first_word_time
if duration > 0:
words_per_second = len(filtered_words) / duration
# Typical speech is 2-3 words per second
# If less than 0.5 words/second, it's probably silence/noise
if words_per_second < 0.5:
logger.debug(f"Captions suprimidas: densidade muito baixa ({words_per_second:.2f} palavras/seg)")
return []
# Require at least 3 valid words before building captions;
# fewer is usually noise or mumbling
if len(filtered_words) < 3:
return []
grouped = self._group_words_with_gaps(filtered_words)
clip_sets: List[CaptionClipSet] = []
for group in grouped:
group_start = clamp_time(group[0].start, minimum=clip_start)
group_end = clamp_time(group[-1].end, minimum=group_start + 0.05)
duration = max(0.05, group_end - group_start)
start_offset = group_start - clip_start
base_image, highlight_images = self._render_group(group)
base_clip = (
ImageClip(np.array(base_image))
.with_start(start_offset)
.with_duration(duration)
)
highlight_clips: List[ImageClip] = []
for word, image in zip(group, highlight_images):
h_start = clamp_time(word.start, minimum=clip_start) - clip_start
h_end = clamp_time(word.end, minimum=word.start + 0.02) - clip_start
h_duration = max(0.05, h_end - h_start)
highlight_clip = (
ImageClip(np.array(image))
.with_start(h_start)
.with_duration(h_duration)
)
highlight_clips.append(highlight_clip)
clip_sets.append(CaptionClipSet(base=base_clip, highlights=highlight_clips))
return clip_sets
def _render_group(self, group: Sequence[WordTiming]) -> Tuple[Image.Image, List[Image.Image]]:
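"""Render one caption group: a base image plus one highlight image per word.

Text wraps onto two lines when the combined width exceeds the caption canvas.
"""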
texts = [self._clean_word(word.word) for word in group]
widths = []
for text in texts:
bbox = self.font.getbbox(text)
widths.append(bbox[2] - bbox[0])
total_width = sum(widths)
if len(widths) > 1:
total_width += self.space_width * (len(widths) - 1)
# Check if text needs to wrap to multiple lines
# If total width exceeds canvas width, break into 2 lines
needs_wrap = total_width > self.canvas_width
if needs_wrap:
# Split into 2 lines - try to balance the lines
mid_point = len(texts) // 2
line1_texts = texts[:mid_point]
line2_texts = texts[mid_point:]
line1_widths = widths[:mid_point]
line2_widths = widths[mid_point:]
# Calculate widths for each line
line1_width = sum(line1_widths)
if len(line1_widths) > 1:
line1_width += self.space_width * (len(line1_widths) - 1)
line2_width = sum(line2_widths)
if len(line2_widths) > 1:
line2_width += self.space_width * (len(line2_widths) - 1)
# Double the canvas height for 2 lines
canvas_height = self.canvas_height * 2
base_image = Image.new("RGBA", (self.canvas_width, canvas_height), (0, 0, 0, 0))
base_draw = ImageDraw.Draw(base_image)
highlight_images: List[Image.Image] = []
# Stroke settings: 8px black stroke for better readability
stroke_width = 8
stroke_color = (0, 0, 0, 255) # Black
# Draw line 1
x = max(0, (self.canvas_width - line1_width) // 2)
y = self.baseline
for i, (text, width) in enumerate(zip(line1_texts, line1_widths)):
base_draw.text(
(x, y),
text,
font=self.font,
fill=self.base_color,
stroke_width=stroke_width,
stroke_fill=stroke_color
)
highlight_image = Image.new("RGBA", base_image.size, (0, 0, 0, 0))
highlight_draw = ImageDraw.Draw(highlight_image)
highlight_draw.text(
(x, y),
text,
font=self.font,
fill=self.highlight_color,
stroke_width=stroke_width,
stroke_fill=stroke_color
)
highlight_images.append(highlight_image)
x += width + self.space_width
# Draw line 2
x = max(0, (self.canvas_width - line2_width) // 2)
y = self.baseline + self.text_height + 5 # 5px spacing between lines
for i, (text, width) in enumerate(zip(line2_texts, line2_widths)):
base_draw.text(
(x, y),
text,
font=self.font,
fill=self.base_color,
stroke_width=stroke_width,
stroke_fill=stroke_color
)
highlight_image = Image.new("RGBA", base_image.size, (0, 0, 0, 0))
highlight_draw = ImageDraw.Draw(highlight_image)
highlight_draw.text(
(x, y),
text,
font=self.font,
fill=self.highlight_color,
stroke_width=stroke_width,
stroke_fill=stroke_color
)
highlight_images.append(highlight_image)
x += width + self.space_width
return base_image, highlight_images
# Single line rendering (original code)
start_x = max(0, (self.canvas_width - total_width) // 2)
base_image = Image.new("RGBA", (self.canvas_width, self.canvas_height), (0, 0, 0, 0))
base_draw = ImageDraw.Draw(base_image)
highlight_images: List[Image.Image] = []
x = start_x
# Stroke settings: 8px black stroke for better readability
stroke_width = 8
stroke_color = (0, 0, 0, 255) # Black
for text, width in zip(texts, widths):
# Draw base text with stroke
base_draw.text(
(x, self.baseline),
text,
font=self.font,
fill=self.base_color,
stroke_width=stroke_width,
stroke_fill=stroke_color
)
# Draw highlight text with stroke
highlight_image = Image.new("RGBA", base_image.size, (0, 0, 0, 0))
highlight_draw = ImageDraw.Draw(highlight_image)
highlight_draw.text(
(x, self.baseline),
text,
font=self.font,
fill=self.highlight_color,
stroke_width=stroke_width,
stroke_fill=stroke_color
)
highlight_images.append(highlight_image)
x += width + self.space_width
return base_image, highlight_images
def _group_words(self, words: Sequence[WordTiming]) -> List[List[WordTiming]]:
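"""Group words into chunks of at most max_words.

A trailing single word is merged into the previous group, and groups below
min_words borrow words from the following group when possible.
"""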
if not words:
return []
grouped: List[List[WordTiming]] = []
buffer: List[WordTiming] = []
for word in words:
buffer.append(word)
if len(buffer) == self.max_words:
grouped.append(buffer)
buffer = []
if buffer:
if len(buffer) == 1 and grouped:
grouped[-1].extend(buffer)
else:
grouped.append(buffer)
for idx, group in enumerate(grouped[:-1]):
if len(group) < self.min_words and len(grouped[idx + 1]) > self.min_words:
deficit = self.min_words - len(group)
transfer = grouped[idx + 1][:deficit]
grouped[idx] = group + transfer
grouped[idx + 1] = grouped[idx + 1][deficit:]
grouped = [grp for grp in grouped if grp]
return grouped
def _group_words_with_gaps(self, words: Sequence[WordTiming]) -> List[List[WordTiming]]:
"""
Group words into chunks of up to two words, respecting silence gaps.
A pause longer than 1.5s between words forces a new group, so captions
disappear naturally during silence.
"""
if not words:
return []
grouped: List[List[WordTiming]] = []
buffer: List[WordTiming] = []
for i, word in enumerate(words):
# Check if there's a long pause before this word
if i > 0:
gap = word.start - words[i-1].end
# If gap > 1.5s, finish current buffer and start new group
if gap > 1.5:
if buffer:
grouped.append(buffer)
buffer = []
buffer.append(word)
# Group into 2 words maximum
if len(buffer) == 2:
grouped.append(buffer)
buffer = []
# Handle remaining words
if buffer:
if len(buffer) == 1 and grouped:
# Add single remaining word to last group
grouped[-1].append(buffer[0])
else:
grouped.append(buffer)
return [grp for grp in grouped if grp]
@staticmethod
def _clean_word(text: str) -> str:
text = text.strip()
text = re.sub(r"\s+", " ", text)
return text or "..."
class VideoRenderer:
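"""Renders highlight windows into vertical 9:16 clips with captions, panels, and audio."""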
def __init__(self, settings: Settings) -> None:
self.settings = settings
self.captions = CaptionBuilder(settings)
self.smart_framer = SmartFramer(
target_width=settings.rendering.frame_width,
target_height=settings.rendering.frame_height,
frame_skip=settings.rendering.smart_framing_frame_skip,
smoothing_window=settings.rendering.smart_framing_smoothing_window,
max_velocity=settings.rendering.smart_framing_max_velocity,
person_switch_cooldown=settings.rendering.smart_framing_person_switch_cooldown,
response_time=settings.rendering.smart_framing_response_time,
group_padding=settings.rendering.smart_framing_group_padding,
max_zoom_out=settings.rendering.smart_framing_max_zoom_out,
dead_zone=settings.rendering.smart_framing_dead_zone,
min_face_confidence=settings.rendering.smart_framing_min_confidence
)
def render(
self,
workspace_path: str,
highlight_windows: Sequence,
transcription: TranscriptionResult,
titles: Sequence[str],
output_dir,
) -> List[Tuple[str, float, float, str, str, int]]:
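"""Render every highlight window from workspace_path.

Returns one (path, start, end, title, summary, index) tuple per rendered clip;
windows with invalid or out-of-range intervals are skipped.
"""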
results: List[Tuple[str, float, float, str, str, int]] = []
with VideoFileClip(workspace_path) as base_clip:
video_duration = base_clip.duration or 0
for index, window in enumerate(highlight_windows, start=1):
start = clamp_time(window.start)
end = clamp_time(window.end)
start = min(start, video_duration)
end = min(end, video_duration)
if end <= start:
logger.info("Janela ignorada por intervalo invalido: %s", window)
continue
clip_title = titles[index - 1] if index - 1 < len(titles) else window.summary
subclip = base_clip.subclipped(start, end)
try:
rendered_path = self._render_single_clip(
subclip=subclip,
start=start,
end=end,
title=clip_title,
summary=window.summary,
index=index,
transcription=transcription,
output_dir=output_dir,
source_path=workspace_path,
)
finally:
subclip.close()
results.append(
(
rendered_path,
float(start),
float(end),
clip_title,
window.summary,
index,
)
)
return results
def _render_single_clip(
self,
subclip: VideoFileClip,
start: float,
end: float,
title: str,
summary: str,
index: int,
transcription: TranscriptionResult,
output_dir,
source_path: str,
) -> str:
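"""Compose and encode a single vertical clip: framed video, bottom panel, captions, and audio."""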
duration = end - start
frame_w = self.settings.rendering.frame_width
frame_h = self.settings.rendering.frame_height
# Layout has no top title panel; reserve ~20% of the frame height for the bottom panel
bottom_h = int(frame_h * 0.20)
# Use smart framing to create intelligent 9:16 video (if enabled)
if self.settings.rendering.enable_smart_framing:
logger.info(f"Creating smart framing plan for clip {index} ({start:.2f}s - {end:.2f}s)")
try:
# Extract audio for speech detection
audio_samples = extract_audio_samples(source_path, start, end)
# Create framing plan
framing_plan = self.smart_framer.create_framing_plan(
video_path=source_path,
start_time=start,
end_time=end,
audio_samples=audio_samples
)
# Apply smart framing (always single-person focus)
video_clip = self.smart_framer.apply_framing(
video_clip=subclip,
framing_plan=framing_plan
)
logger.info(f"Smart framing applied: layout={framing_plan.layout_mode}, "
f"faces_detected={len(framing_plan.frame_contexts[0].detected_faces) if framing_plan.frame_contexts else 0}")
except Exception as exc:
logger.warning(f"Smart framing failed for clip {index}, falling back to center crop: {exc}", exc_info=True)
# Fallback to center crop (maintains aspect ratio, crops to fit)
video_area_h = max(1, frame_h - bottom_h)
# Use MAX to ensure video covers entire area (will crop excess)
scale_factor = max(
frame_w / subclip.w,
video_area_h / subclip.h,
)
# Resize to cover area
resized_clip = subclip.resized(scale_factor)
# Calculate crop region (center crop)
crop_x1 = max(0, (resized_clip.w - frame_w) // 2)
crop_y1 = max(0, (resized_clip.h - video_area_h) // 2)
crop_x2 = crop_x1 + frame_w
crop_y2 = crop_y1 + video_area_h
# Crop to fit target dimensions using MoviePy crop(x1, y1, x2, y2)
cropped_clip = resized_clip.cropped(
x1=crop_x1,
y1=crop_y1,
x2=crop_x2,
y2=crop_y2
)
video_clip = cropped_clip.with_position((0, 0))
resized_clip.close()
else:
# Use center crop (smart framing disabled)
logger.info(f"Using center crop for clip {index} (smart framing disabled)")
video_area_h = max(1, frame_h - bottom_h)
# Use MAX to ensure video covers entire area (will crop excess)
scale_factor = max(
frame_w / subclip.w,
video_area_h / subclip.h,
)
# Resize to cover area
resized_clip = subclip.resized(scale_factor)
# Calculate crop region (center crop)
crop_x1 = max(0, (resized_clip.w - frame_w) // 2)
crop_y1 = max(0, (resized_clip.h - video_area_h) // 2)
crop_x2 = crop_x1 + frame_w
crop_y2 = crop_y1 + video_area_h
# Crop to fit target dimensions using MoviePy crop(x1, y1, x2, y2)
cropped_clip = resized_clip.cropped(
x1=crop_x1,
y1=crop_y1,
x2=crop_x2,
y2=crop_y2
)
video_clip = cropped_clip.with_position((0, 0))
resized_clip.close()
background = ColorClip(size=(frame_w, frame_h), color=(0, 0, 0)).with_duration(duration)
# Translucent bottom panel (no top panel or title in this layout)
bottom_panel = (
ColorClip(size=(frame_w, bottom_h), color=(12, 12, 12))
.with_position((0, frame_h - bottom_h))
.with_duration(duration)
.with_opacity(0.85)
)
words = self._collect_words(transcription, start, end)
# Calculate speech coverage: how much of the clip has actual speech?
# If less than 30% of the clip has speech, don't show captions
clip_duration = end - start
if words and clip_duration > 0:
# Calculate total time with speech
total_speech_time = sum(w.end - w.start for w in words)
speech_coverage = total_speech_time / clip_duration
if speech_coverage < 0.3: # Less than 30% speech
logger.debug(f"Captions suprimidas: cobertura de fala baixa ({speech_coverage:.1%})")
words = [] # Clear words to prevent captions
# Only build captions if there are actual words to display
# This prevents empty/placeholder captions from appearing
caption_sets = self.captions.build(words, clip_start=start) if words else []
caption_clips = []
caption_resources: List[ImageClip] = []
# Position captions 120px below the vertical center (1080px on a 1920px frame)
# so they stay clearly visible without covering faces
center_y = frame_h // 2
caption_y = center_y + 120
caption_margin = 20
# Ensure captions stay within reasonable bounds (no top panel now)
min_caption_y = caption_margin
max_caption_y = frame_h - bottom_h - self.captions.canvas_height - caption_margin
if max_caption_y < min_caption_y:
caption_y = min_caption_y
else:
caption_y = min(max(caption_y, min_caption_y), max_caption_y)
for clip_set in caption_sets:
base_positioned = clip_set.base.with_position(("center", caption_y))
caption_clips.append(base_positioned)
caption_resources.append(clip_set.base)
for highlight in clip_set.highlights:
positioned = highlight.with_position(("center", caption_y))
caption_clips.append(positioned)
caption_resources.append(highlight)
# No fallback captions - if there are no dynamic captions, show nothing
# This matches Opus Clip behavior where captions only appear when there's actual speech
audio_clip, audio_needs_close = self._materialize_audio(
source_path=source_path,
start=start,
end=end,
duration=duration,
fallback_audio=video_clip.audio or subclip.audio,
)
# Composite with background, bottom panel, video, and captions only (no top panel or title)
composite = CompositeVideoClip(
[background, bottom_panel, video_clip, *caption_clips],
size=(frame_w, frame_h),
)
if audio_clip is not None:
composite = self._with_audio(composite, audio_clip)
output_path = output_dir / f"clip_{index:02d}.mp4"
self._write_with_fallback(
composite=composite,
output_path=output_path,
index=index,
output_dir=output_dir,
)
composite.close()
video_clip.close()
background.close()
bottom_panel.close()
for clip in caption_clips:
clip.close()
for clip in caption_resources:
clip.close()
if audio_clip is not None and audio_needs_close:
audio_clip.close()
# Force garbage collection to free memory after rendering
gc.collect()
return str(output_path)
def _materialize_audio(
self,
*,
source_path: str,
start: float,
end: float,
duration: float,
fallback_audio,
) -> Tuple[Optional[AudioClip], bool]:
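"""Load the window's audio as an in-memory AudioArrayClip.

Returns (clip, needs_close); if the standalone audio cannot be read, the
original audio stream is returned with needs_close=False.
"""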
try:
with AudioFileClip(source_path) as audio_file:
segment = audio_file.subclipped(start, end)
fps = (
getattr(segment, "fps", None)
or getattr(audio_file, "fps", None)
or 44100
)
samples = segment.to_soundarray(fps=fps)
except Exception:
logger.warning(
"Falha ao carregar audio independente; utilizando fluxo original",
exc_info=True,
)
return fallback_audio, False
audio_clip = AudioArrayClip(samples, fps=fps).with_duration(duration)
return audio_clip, True
def _collect_words(
self, transcription: TranscriptionResult, start: float, end: float
) -> List[WordTiming]:
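"""Collect word timings overlapping [start, end], clamped to the window and sorted by start."""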
collected: List[WordTiming] = []
for segment in transcription.segments:
if segment.end < start or segment.start > end:
continue
if segment.words:
for word in segment.words:
if word.end < start or word.start > end:
continue
collected.append(
WordTiming(
start=max(start, word.start),
end=min(end, word.end),
word=word.word,
)
)
else:
collected.extend(self._fallback_words(segment.text, segment.start, segment.end, start, end))
collected.sort(key=lambda w: w.start)
return collected
def _fallback_words(
self,
text: str,
segment_start: float,
segment_end: float,
window_start: float,
window_end: float,
) -> Iterable[WordTiming]:
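"""Spread a segment's words evenly across its duration when per-word timings are missing."""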
words = [w for w in re.split(r"\s+", text.strip()) if w]
if not words:
return []
seg_start = max(segment_start, window_start)
seg_end = min(segment_end, window_end)
duration = max(0.01, seg_end - seg_start)
step = duration / len(words)
timings: List[WordTiming] = []
for idx, word in enumerate(words):
w_start = seg_start + idx * step
w_end = min(seg_end, w_start + step)
timings.append(WordTiming(start=w_start, end=w_end, word=word))
return timings
@staticmethod
def _wrap_text(text: str, max_width: int) -> str:
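"""Greedy word wrap using a rough character budget derived from max_width."""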
text = text.strip()
if not text:
return ""
words = text.split()
lines: List[str] = []
current: List[str] = []
for word in words:
current.append(word)
if len(" ".join(current)) > max_width // 18:
if current[:-1]:
lines.append(" ".join(current[:-1]))
current = [current[-1]]
if current:
lines.append(" ".join(current))
return "\n".join(lines)
def _write_with_fallback(
self,
*,
composite: CompositeVideoClip,
output_path,
index: int,
output_dir,
) -> None:
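"""Encode the composite, trying each configured encoding attempt before raising."""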
attempts = self._encoding_attempts()
temp_audio_path = output_dir / f"temp_audio_{index:02d}.m4a"
last_error: Exception | None = None
for attempt in attempts:
codec = attempt["codec"]
bitrate = attempt["bitrate"]
preset = attempt["preset"]
ffmpeg_params = ["-pix_fmt", "yuv420p"]
if preset:
ffmpeg_params = ["-preset", preset, "-pix_fmt", "yuv420p"]
try:
logger.info(
"Renderizando clip %02d com codec %s (bitrate=%s, preset=%s)",
index,
codec,
bitrate,
preset or "default",
)
composite.write_videofile(
str(output_path),
codec=codec,
audio_codec=self.settings.rendering.audio_codec,
fps=self.settings.rendering.fps,
bitrate=bitrate,
ffmpeg_params=ffmpeg_params,
temp_audiofile=str(temp_audio_path),
remove_temp=True,
threads=4,
)
return
except Exception as exc: # noqa: BLE001 - propagate after fallbacks
last_error = exc
logger.warning(
"Falha ao renderizar com codec %s: %s", codec, exc, exc_info=True
)
if output_path.exists():
output_path.unlink(missing_ok=True)
if temp_audio_path.exists():
temp_audio_path.unlink(missing_ok=True)
raise RuntimeError("Todas as tentativas de renderizacao falharam") from last_error
def _encoding_attempts(self) -> List[Dict[str, str | None]]:
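"""Return the ordered, de-duplicated encoder settings to try (currently only the configured codec/bitrate/preset)."""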
settings = self.settings.rendering
attempts: List[Dict[str, str | None]] = []
attempts.append(
{
"codec": settings.video_codec,
"bitrate": settings.bitrate,
"preset": settings.preset,
}
)
deduped: List[Dict[str, str | None]] = []
seen = set()
for attempt in attempts:
key = (attempt["codec"], attempt["bitrate"], attempt["preset"])
if key in seen:
continue
seen.add(key)
deduped.append(attempt)
return deduped
@staticmethod
def _with_audio(
composite: CompositeVideoClip,
audio_clip,
) -> CompositeVideoClip:
"""Attach audio to a composite clip across MoviePy versions."""
if hasattr(composite, "with_audio"):
return composite.with_audio(audio_clip)
if hasattr(composite, "set_audio"):
return composite.set_audio(audio_clip)
raise AttributeError("CompositeVideoClip does not support audio assignment")
@staticmethod
def _make_textclip(
*,
text: str,
font_path,
font_size: int,
color: str,
size: Tuple[int, int],
) -> TextClip:
"""Create a TextClip compatible with MoviePy 1.x and 2.x.
MoviePy 2.x removed the 'align' keyword from TextClip. We try with
'align' for older versions and fall back to a call without it when
unsupported.
"""
kwargs = dict(
text=text,
font=str(font_path),
font_size=font_size,
color=color,
method="caption",
size=size,
)
try:
return TextClip(**kwargs, align="center") # MoviePy 1.x style
except TypeError:
logger.debug("TextClip 'align' not supported; falling back without it")
return TextClip(**kwargs) # MoviePy 2.x style