# video-render/video_render/transcription.py

from __future__ import annotations

import json
import logging
from dataclasses import dataclass
from pathlib import Path
from typing import List, Optional

from faster_whisper import WhisperModel

from .config import Settings

logger = logging.getLogger(__name__)


@dataclass(frozen=True)
class WordTiming:
    """Immutable timing record for a single transcribed word."""

    # Start timestamp of the word, copied from the model's word object
    # (presumably seconds from the start of the audio -- confirm).
    start: float
    # End timestamp of the word, same unit as ``start``.
    end: float
    # The word text with surrounding whitespace removed.
    word: str


@dataclass(frozen=True)
class TranscriptSegment:
    """Immutable record for one segment of a transcription."""

    # Zero-based position of the segment in the order the model produced it.
    id: int
    # Start timestamp of the segment (presumably seconds -- confirm).
    start: float
    # End timestamp of the segment, same unit as ``start``.
    end: float
    # Segment text with surrounding whitespace removed.
    text: str
    # Per-word timings for this segment; empty when the model reported none.
    # NOTE: the dataclass is frozen, but this list itself is still mutable.
    words: List[WordTiming]


@dataclass(frozen=True)
class TranscriptionResult:
    """Immutable container for a complete transcription."""

    # All segments, in the order they were produced.
    segments: List[TranscriptSegment]
    # The segment texts joined with spaces into one string.
    full_text: str


class TranscriptionService:
    """Transcribe audio with Faster-Whisper and persist the result to disk.

    The Whisper model is created lazily on the first call that needs it and
    is then reused for every later call on the same instance.
    """

    def __init__(self, settings: Settings) -> None:
        """Store *settings*; the model is not loaded until it is needed."""
        self.settings = settings
        self._model: Optional[WhisperModel] = None

    def _load_model(self) -> WhisperModel:
        """Return the cached model, constructing it on the first call."""
        if self._model is None:
            whisper = self.settings.whisper
            # Resolve the fallbacks once so the log line and the constructor
            # arguments can never disagree.
            device = whisper.device or "auto"
            compute_type = whisper.compute_type or "default"
            # str(None) would produce the literal directory name "None";
            # pass None through so the library uses its default cache.
            download_root = (
                str(whisper.download_root)
                if whisper.download_root is not None
                else None
            )
            logger.info(
                "Carregando modelo Faster-Whisper '%s' (device=%s, compute_type=%s)",
                whisper.model_size,
                device,
                compute_type,
            )
            self._model = WhisperModel(
                whisper.model_size,
                device=device,
                compute_type=compute_type,
                download_root=download_root,
            )
        return self._model

    def transcribe(self, audio_path: Path) -> TranscriptionResult:
        """Transcribe *audio_path* and return its segments and full text.

        Args:
            audio_path: Path of the audio file handed to the model.

        Returns:
            A ``TranscriptionResult`` whose segments are numbered from 0 in
            the order the model produced them, and whose ``full_text`` is
            the non-empty segment texts joined with single spaces.
        """
        model = self._load_model()
        segments, _ = model.transcribe(
            str(audio_path),
            beam_size=5,
            word_timestamps=True,
        )

        parsed_segments: List[TranscriptSegment] = []
        full_text_parts: List[str] = []

        for idx, segment in enumerate(segments):
            words: List[WordTiming] = []
            # ``segment.words`` may be None, hence the ``or []`` fallback.
            for raw_word in segment.words or []:
                token = raw_word.word.strip()
                if token:  # drop whitespace-only tokens
                    words.append(
                        WordTiming(
                            start=raw_word.start,
                            end=raw_word.end,
                            word=token,
                        )
                    )

            text = segment.text.strip()
            # Keep the segment itself even when empty (ids and timings stay
            # intact), but leave empty text out of the joined transcript so
            # it does not produce doubled spaces.
            if text:
                full_text_parts.append(text)
            parsed_segments.append(
                TranscriptSegment(
                    id=idx,
                    start=segment.start,
                    end=segment.end,
                    text=text,
                    words=words,
                )
            )

        return TranscriptionResult(
            segments=parsed_segments,
            full_text=" ".join(full_text_parts),
        )

    @staticmethod
    def persist(result: TranscriptionResult, destination: Path) -> None:
        """Write *result* into *destination* as JSON and plain text.

        Two files are written, both UTF-8 encoded: ``transcription.json``
        (segments, word timings and full text) and ``transcription.txt``
        (full text only). Existing files are overwritten.

        Args:
            result: The transcription to save.
            destination: Target directory; created, including parents, if
                it does not exist yet.

        Raises:
            OSError: If the directory or the files cannot be written.
        """
        # Create the target directory up front so a fresh output location
        # does not fail with FileNotFoundError on the first write.
        destination.mkdir(parents=True, exist_ok=True)

        json_path = destination / "transcription.json"
        text_path = destination / "transcription.txt"

        payload = {
            "segments": [
                {
                    "id": segment.id,
                    "start": segment.start,
                    "end": segment.end,
                    "text": segment.text,
                    "words": [
                        {"start": word.start, "end": word.end, "text": word.word}
                        for word in segment.words
                    ],
                }
                for segment in result.segments
            ],
            "full_text": result.full_text,
        }

        with json_path.open("w", encoding="utf-8") as fp:
            # ensure_ascii=False keeps accented characters readable on disk.
            json.dump(payload, fp, ensure_ascii=False, indent=2)

        with text_path.open("w", encoding="utf-8") as fp:
            fp.write(result.full_text)

        logger.info("Transcrição salva em %s", destination)