Among them are the addition of faster-whisper, subtitle generation, and integration with Gemini and OpenRouter
"""Utilities for extracting audio from video and generating transcriptions.
|
|
|
|
This module handles two tasks:
|
|
|
|
1. Use FFMPEG to extract the audio track from a video file into a WAV file
|
|
suitable for consumption by the Whisper model. The audio is resampled to
|
|
16 kHz mono PCM as required by Whisper.
|
|
2. Use the Faster-Whisper implementation to generate a transcription with
|
|
word-level timestamps. The transcription is returned both as a list of
|
|
segments (for building an SRT) and as a flattened list of words (for
|
|
building dynamic subtitles).
|
|
|
|
If FFMPEG is not installed or fails, a ``RuntimeError`` is raised. The caller
|
|
is responsible for cleaning up the temporary files created in the working
|
|
directory.
|
|
"""

from __future__ import annotations

import os
import subprocess
from typing import Any, Dict, List, Tuple

from faster_whisper import WhisperModel


def extract_audio_ffmpeg(video_path: str, audio_path: str) -> None:
    """Use FFMPEG to extract audio from ``video_path`` into ``audio_path``.

    The output will be a 16 kHz mono WAV file in PCM S16LE format. Any
    existing file at ``audio_path`` will be overwritten. If ffmpeg returns
    a non-zero exit code, a ``RuntimeError`` is raised with the stderr.
    """
    cmd = [
        "ffmpeg",
        "-y",  # overwrite output
        "-i",
        video_path,
        "-vn",  # disable video recording
        "-acodec",
        "pcm_s16le",
        "-ar",
        "16000",
        "-ac",
        "1",
        audio_path,
    ]
    proc = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    if proc.returncode != 0:
        raise RuntimeError(f"FFMPEG error: {proc.stderr.decode(errors='ignore')}")


def load_whisper_model() -> WhisperModel:
    """Instantiate and cache a Faster-Whisper model.

    The model name and device can be configured via the ``WHISPER_MODEL`` and
    ``WHISPER_DEVICE`` environment variables. The default model is
    ``large-v3`` for best accuracy. The device can be ``cuda`` or ``cpu``.
    A module-level cache is used to prevent loading the model multiple times.
    """
    if hasattr(load_whisper_model, "_cache"):
        return load_whisper_model._cache  # type: ignore[attr-defined]
    model_name = os.environ.get("WHISPER_MODEL", "large-v3")
    device = os.environ.get("WHISPER_DEVICE", "cpu")
    # Compute type can be set via WHISPER_COMPUTE_TYPE; default to float16 on GPU
    compute_type = os.environ.get("WHISPER_COMPUTE_TYPE")
    # If not explicitly set, choose sensible defaults
    if compute_type is None:
        compute_type = "float16" if device == "cuda" else "int8"
    model = WhisperModel(model_name, device=device, compute_type=compute_type)
    load_whisper_model._cache = model  # type: ignore[attr-defined]
    return model
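
# Example configuration (illustrative values, not defaults enforced here):
#
#   export WHISPER_MODEL=medium
#   export WHISPER_DEVICE=cuda
#   export WHISPER_COMPUTE_TYPE=float16
#
# With none of these set, load_whisper_model falls back to ``large-v3`` on
# CPU with ``int8`` computation.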


def transcribe(video_path: str, work_dir: str) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
    """Transcribe a video file using Faster-Whisper.

    ``video_path`` is the path to the video to transcribe. ``work_dir`` is a
    directory where temporary files will be stored (audio file and
    transcription). The function returns a tuple ``(segments, words)`` where
    ``segments`` is a list of dictionaries with ``start``, ``end`` and
    ``text`` fields, and ``words`` is a flat list of dictionaries with
    ``start``, ``end`` and ``word`` fields covering the entire video.
    The timestamps are expressed in seconds as floats.
    """
    os.makedirs(work_dir, exist_ok=True)
    audio_path = os.path.join(work_dir, "audio.wav")
    # Extract audio
    extract_audio_ffmpeg(video_path, audio_path)
    # Load Whisper model
    model = load_whisper_model()
    # Run transcription with word-level timestamps
    segments, _info = model.transcribe(audio_path, word_timestamps=True)
    seg_list: List[Dict[str, Any]] = []
    words_list: List[Dict[str, Any]] = []
    for seg in segments:
        seg_list.append({
            "start": float(seg.start),
            "end": float(seg.end),
            "text": seg.text.strip(),
        })
        # Each segment may carry a ``words`` attribute with per-word timings
        for w in getattr(seg, "words", []) or []:
            words_list.append({
                "start": float(w.start),
                "end": float(w.end),
                "word": w.word,
            })
    # Sort words by start time to be safe
    words_list.sort(key=lambda d: d["start"])
    return seg_list, words_list
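

if __name__ == "__main__":
    # Minimal usage sketch with assumed file names ("input.mp4" and the
    # "work" directory are placeholders, not files this project ships):
    # transcribe a local video and print the first segments plus word count.
    import json

    segs, words = transcribe("input.mp4", "work")
    print(json.dumps(segs[:3], indent=2, ensure_ascii=False))
    print(f"{len(words)} words with timestamps")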