# Files
# video-render/video_render/transcription.py
# 2025-10-20 17:56:36 -03:00
#
# 123 lines
# 3.5 KiB
# Python

from __future__ import annotations
import json
import logging
from dataclasses import dataclass
from pathlib import Path
from typing import List, Optional
from faster_whisper import WhisperModel
from .config import Settings
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
@dataclass(frozen=True)
class WordTiming:
    """Timing of a single transcribed word.

    Instances are immutable (``frozen=True``). ``TranscriptionService``
    fills them from the word objects returned by faster-whisper.
    """

    # Start of the word (presumably seconds from the start of the audio,
    # as reported by faster-whisper -- confirm against the library docs).
    start: float
    # End of the word, same unit as ``start``.
    end: float
    # The word text; the service stores it with surrounding whitespace removed.
    word: str
@dataclass(frozen=True)
class TranscriptSegment:
    """One segment of a transcription together with its word timings.

    The dataclass is frozen, so attributes cannot be reassigned; note that
    the ``words`` list object itself can still be mutated in place.
    """

    # Zero-based position of the segment in the transcription.
    id: int
    # Segment start (presumably seconds, as reported by faster-whisper).
    start: float
    # Segment end, same unit as ``start``.
    end: float
    # Segment text; the service stores it with surrounding whitespace removed.
    text: str
    # Per-word timings belonging to this segment (may be empty).
    words: List[WordTiming]
@dataclass(frozen=True)
class TranscriptionResult:
    """Complete output of one transcription run.

    The dataclass is frozen, so attributes cannot be reassigned; the
    ``segments`` list object itself can still be mutated in place.
    """

    # Segments in the order they were produced by the model.
    segments: List[TranscriptSegment]
    # Text of all segments joined into a single string.
    full_text: str
class TranscriptionService:
    """Transcribe audio with Faster-Whisper and persist the result to disk.

    The Whisper model is created on the first transcription and then cached
    on the instance, so constructing the service itself is cheap.
    """

    def __init__(self, settings: Settings) -> None:
        """Store the settings; the model is loaded on first use.

        Args:
            settings: Application settings. Only ``settings.whisper`` is read
                here (``model_size``, ``device``, ``compute_type`` and
                ``download_root``).
        """
        self.settings = settings
        self._model: Optional[WhisperModel] = None

    def _load_model(self) -> WhisperModel:
        """Return the cached Whisper model, creating it on the first call."""
        if self._model is not None:
            return self._model

        whisper = self.settings.whisper
        # Resolve the fallbacks once so the log line and the constructor
        # call can never disagree.
        device = whisper.device or "auto"
        compute_type = whisper.compute_type or "default"
        # str(None) would become the literal directory name "None"; pass
        # None through so the library uses its default cache location.
        download_root = (
            str(whisper.download_root)
            if whisper.download_root is not None
            else None
        )

        logger.info(
            "Carregando modelo Faster-Whisper '%s' (device=%s, compute_type=%s)",
            whisper.model_size,
            device,
            compute_type,
        )
        self._model = WhisperModel(
            whisper.model_size,
            device=device,
            compute_type=compute_type,
            download_root=download_root,
        )
        return self._model

    @staticmethod
    def _parse_words(raw_words) -> List[WordTiming]:
        """Convert model word objects to ``WordTiming``, dropping blank words.

        Args:
            raw_words: Iterable of objects exposing ``start``, ``end`` and
                ``word``, or ``None`` when the segment carries no words.
        """
        timings: List[WordTiming] = []
        for raw in raw_words or []:
            token = raw.word.strip()  # strip once, reuse for test and value
            if token:
                timings.append(
                    WordTiming(start=raw.start, end=raw.end, word=token)
                )
        return timings

    def transcribe(self, audio_path: Path) -> TranscriptionResult:
        """Transcribe ``audio_path`` and return segments with word timings.

        Args:
            audio_path: Path of the audio file handed to the model.

        Returns:
            A ``TranscriptionResult`` whose segments are numbered from 0 in
            the order the model produced them, and whose ``full_text`` is the
            non-empty segment texts joined by single spaces.
        """
        model = self._load_model()
        # The second element of the returned pair is not needed here.
        segments, _ = model.transcribe(
            str(audio_path),
            beam_size=5,
            word_timestamps=True,
        )

        parsed_segments: List[TranscriptSegment] = []
        full_text_parts: List[str] = []
        # faster-whisper documents ``segments`` as a generator, so the actual
        # decoding work happens while this loop iterates.
        for idx, segment in enumerate(segments):
            text = segment.text.strip()
            if text:
                # Blank segments stay in the segment list (ids remain stable)
                # but must not inject double spaces into the full text.
                full_text_parts.append(text)
            parsed_segments.append(
                TranscriptSegment(
                    id=idx,
                    start=segment.start,
                    end=segment.end,
                    text=text,
                    words=self._parse_words(segment.words),
                )
            )

        return TranscriptionResult(
            segments=parsed_segments,
            full_text=" ".join(full_text_parts),
        )

    @staticmethod
    def persist(result: TranscriptionResult, destination: Path) -> None:
        """Write ``result`` into ``destination`` as JSON and plain text.

        Creates ``transcription.json`` (segments, word timings, full text)
        and ``transcription.txt`` (full text only), both UTF-8 encoded.

        Args:
            result: Transcription to serialise.
            destination: Target directory; created (with parents) if missing.
        """
        # Opening the files would fail with FileNotFoundError if the target
        # directory had not been created yet.
        destination.mkdir(parents=True, exist_ok=True)
        json_path = destination / "transcription.json"
        text_path = destination / "transcription.txt"

        payload = {
            "segments": [
                {
                    "id": segment.id,
                    "start": segment.start,
                    "end": segment.end,
                    "text": segment.text,
                    "words": [
                        # NOTE: the on-disk key is "text" although the
                        # dataclass field is ``word``; kept as-is so existing
                        # readers of the JSON keep working.
                        {"start": word.start, "end": word.end, "text": word.word}
                        for word in segment.words
                    ],
                }
                for segment in result.segments
            ],
            "full_text": result.full_text,
        }

        with json_path.open("w", encoding="utf-8") as fp:
            json.dump(payload, fp, ensure_ascii=False, indent=2)
        text_path.write_text(result.full_text, encoding="utf-8")

        logger.info("Transcrição salva em %s", destination)