"""Utilities for extracting audio from video and generating transcriptions. This module handles two tasks: 1. Use FFMPEG to extract the audio track from a video file into a WAV file suitable for consumption by the Whisper model. The audio is resampled to 16 kHz mono PCM as required by Whisper. 2. Use the Faster-Whisper implementation to generate a transcription with word-level timestamps. The transcription is returned both as a list of segments (for building an SRT) and as a flattened list of words (for building dynamic subtitles). If FFMPEG is not installed or fails, a ``RuntimeError`` is raised. The caller is responsible for cleaning up the temporary files created in the working directory. """ from __future__ import annotations import os import subprocess from typing import Dict, List, Tuple from faster_whisper import WhisperModel def extract_audio_ffmpeg(video_path: str, audio_path: str) -> None: """Use FFMPEG to extract audio from ``video_path`` into ``audio_path``. The output will be a 16 kHz mono WAV file in PCM S16LE format. Any existing file at ``audio_path`` will be overwritten. If ffmpeg returns a non-zero exit code, a ``RuntimeError`` is raised with the stderr. """ cmd = [ "ffmpeg", "-y", # overwrite output "-i", video_path, "-vn", # disable video recording "-acodec", "pcm_s16le", "-ar", "16000", "-ac", "1", audio_path, ] proc = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) if proc.returncode != 0: raise RuntimeError(f"FFMPEG error: {proc.stderr.decode(errors='ignore')}") def load_whisper_model() -> WhisperModel: """Instantiate and cache a Faster-Whisper model. The model name and device can be configured via the ``WHISPER_MODEL`` and ``WHISPER_DEVICE`` environment variables. The default model is ``large-v3`` for best accuracy. The device can be ``cuda`` or ``cpu``. A module-level cache is used to prevent loading the model multiple times. """ if hasattr(load_whisper_model, "_cache"): return load_whisper_model._cache # type: ignore[attr-defined] model_name = os.environ.get("WHISPER_MODEL", "large-v3") device = os.environ.get("WHISPER_DEVICE", "cpu") # Compute type can be set via WHISPER_COMPUTE_TYPE; default to float16 on GPU compute_type = os.environ.get("WHISPER_COMPUTE_TYPE") # If not explicitly set, choose sensible defaults if compute_type is None: compute_type = "float16" if device == "cuda" else "int8" model = WhisperModel(model_name, device=device, compute_type=compute_type) load_whisper_model._cache = model # type: ignore[attr-defined] return model def transcribe(video_path: str, work_dir: str) -> Tuple[List[Dict[str, float]], List[Dict[str, float]]]: """Transcribe a video file using Faster-Whisper. ``video_path`` is the path to the video to transcribe. ``work_dir`` is a directory where temporary files will be stored (audio file and transcription). The function returns a tuple ``(segments, words)`` where ``segments`` is a list of dictionaries with ``start``, ``end`` and ``text`` fields, and ``words`` is a flat list of dictionaries with ``start``, ``end`` and ``word`` fields covering the entire video. The timestamps are expressed in seconds as floats. 
""" os.makedirs(work_dir, exist_ok=True) audio_path = os.path.join(work_dir, "audio.wav") # Extract audio extract_audio_ffmpeg(video_path, audio_path) # Load Whisper model model = load_whisper_model() # Run transcription with word-level timestamps segments, info = model.transcribe(audio_path, word_timestamps=True) seg_list: List[Dict[str, float]] = [] words_list: List[Dict[str, float]] = [] for seg in segments: seg_list.append({ "start": float(seg.start), "end": float(seg.end), "text": seg.text.strip(), }) # Each segment may contain words attribute for w in getattr(seg, "words", []) or []: words_list.append({ "start": float(w.start), "end": float(w.end), "word": w.word, }) # Sort words by start time to be safe words_list.sort(key=lambda d: d["start"]) return seg_list, words_list