video-render/transcribe.py

"""Utilities for extracting audio from video and generating transcriptions.
This module handles two tasks:
1. Use FFMPEG to extract the audio track from a video file into a WAV file
suitable for consumption by the Whisper model. The audio is resampled to
16 kHz mono PCM as required by Whisper.
2. Use the Faster-Whisper implementation to generate a transcription with
word-level timestamps. The transcription is returned both as a list of
segments (for building an SRT) and as a flattened list of words (for
building dynamic subtitles).
If FFMPEG is not installed or fails, a ``RuntimeError`` is raised. The caller
is responsible for cleaning up the temporary files created in the working
directory.
"""
from __future__ import annotations

import os
import subprocess
from typing import Dict, List, Tuple

from faster_whisper import WhisperModel


def extract_audio_ffmpeg(video_path: str, audio_path: str) -> None:
    """Use FFMPEG to extract audio from ``video_path`` into ``audio_path``.

    The output will be a 16 kHz mono WAV file in PCM S16LE format. Any
    existing file at ``audio_path`` will be overwritten. If ffmpeg returns
    a non-zero exit code, a ``RuntimeError`` is raised with ffmpeg's stderr.
    """
    cmd = [
        "ffmpeg",
        "-y",  # overwrite output
        "-i",
        video_path,
        "-vn",  # disable video recording
        "-acodec",
        "pcm_s16le",
        "-ar",
        "16000",
        "-ac",
        "1",
        audio_path,
    ]
    proc = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    if proc.returncode != 0:
        raise RuntimeError(f"FFMPEG error: {proc.stderr.decode(errors='ignore')}")
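
# For reference, the argument list built above corresponds to the shell
# command below (the file names here are illustrative only):
#
#     ffmpeg -y -i input.mp4 -vn -acodec pcm_s16le -ar 16000 -ac 1 audio.wav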


def load_whisper_model() -> WhisperModel:
    """Instantiate and cache a Faster-Whisper model.

    The model name and device can be configured via the ``WHISPER_MODEL`` and
    ``WHISPER_DEVICE`` environment variables. The default model is
    ``large-v3`` for best accuracy. The device can be ``cuda`` or ``cpu``.
    A module-level cache is used to prevent loading the model multiple times.
    """
    if hasattr(load_whisper_model, "_cache"):
        return load_whisper_model._cache  # type: ignore[attr-defined]
    model_name = os.environ.get("WHISPER_MODEL", "large-v3")
    device = os.environ.get("WHISPER_DEVICE", "cpu")
    # Compute type can be set via WHISPER_COMPUTE_TYPE; if unset, choose
    # sensible defaults: float16 on GPU, int8 on CPU.
    compute_type = os.environ.get("WHISPER_COMPUTE_TYPE")
    if compute_type is None:
        compute_type = "float16" if device == "cuda" else "int8"
    model = WhisperModel(model_name, device=device, compute_type=compute_type)
    load_whisper_model._cache = model  # type: ignore[attr-defined]
    return model
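
# A minimal configuration sketch (the values shown are examples, not project
# defaults): these variables are read once, on the first call to
# load_whisper_model(), so they must be set before that call.
#
#     os.environ.setdefault("WHISPER_MODEL", "medium")
#     os.environ.setdefault("WHISPER_DEVICE", "cuda")
#     os.environ.setdefault("WHISPER_COMPUTE_TYPE", "float16")
#     model = load_whisper_model()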


def transcribe(video_path: str, work_dir: str) -> Tuple[List[Dict[str, float]], List[Dict[str, float]]]:
    """Transcribe a video file using Faster-Whisper.

    ``video_path`` is the path to the video to transcribe. ``work_dir`` is a
    directory where temporary files will be stored (audio file and
    transcription). The function returns a tuple ``(segments, words)`` where
    ``segments`` is a list of dictionaries with ``start``, ``end`` and
    ``text`` fields, and ``words`` is a flat list of dictionaries with
    ``start``, ``end`` and ``word`` fields covering the entire video.
    The timestamps are expressed in seconds as floats.
    """
    os.makedirs(work_dir, exist_ok=True)
    audio_path = os.path.join(work_dir, "audio.wav")
    # Extract audio into a Whisper-ready WAV file.
    extract_audio_ffmpeg(video_path, audio_path)
    # Load the (cached) Whisper model.
    model = load_whisper_model()
    # Run transcription with word-level timestamps.
    segments, _info = model.transcribe(audio_path, word_timestamps=True)
    seg_list: List[Dict[str, float]] = []
    words_list: List[Dict[str, float]] = []
    for seg in segments:
        seg_list.append({
            "start": float(seg.start),
            "end": float(seg.end),
            "text": seg.text.strip(),
        })
        # Each segment may carry a ``words`` attribute; guard against None.
        for w in getattr(seg, "words", []) or []:
            words_list.append({
                "start": float(w.start),
                "end": float(w.end),
                "word": w.word,
            })
    # Sort words by start time to be safe.
    words_list.sort(key=lambda d: d["start"])
    return seg_list, words_list
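

if __name__ == "__main__":
    # A minimal command-line sketch, not part of the module's public API:
    # transcribe the video passed as the first argument and print an SRT
    # built from the returned segments. The "work" directory name is an
    # assumption for illustration.
    import sys

    def _fmt(ts: float) -> str:
        # SRT timestamps use the form HH:MM:SS,mmm.
        ms = int(round(ts * 1000))
        h, rem = divmod(ms, 3_600_000)
        m, rem = divmod(rem, 60_000)
        s, ms = divmod(rem, 1000)
        return f"{h:02d}:{m:02d}:{s:02d},{ms:03d}"

    segs, _words = transcribe(sys.argv[1], "work")
    for i, seg in enumerate(segs, start=1):
        print(i)
        print(f"{_fmt(seg['start'])} --> {_fmt(seg['end'])}")
        print(seg["text"])
        print()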