Among them are the addition of faster-whisper, subtitle generation, and integration with Gemini and OpenRouter
"""Utilities for extracting audio from video and generating transcriptions.
|
|
|
|
This module handles two tasks:
|
|
|
|
1. Use FFMPEG to extract the audio track from a video file into a WAV file
|
|
suitable for consumption by the Whisper model. The audio is resampled to
|
|
16 kHz mono PCM as required by Whisper.
|
|
2. Use the Faster-Whisper implementation to generate a transcription with
|
|
word-level timestamps. The transcription is returned both as a list of
|
|
segments (for building an SRT) and as a flattened list of words (for
|
|
building dynamic subtitles).
|
|
|
|
If FFMPEG is not installed or fails, a ``RuntimeError`` is raised. The caller
|
|
is responsible for cleaning up the temporary files created in the working
|
|
directory.
|
|
"""

from __future__ import annotations

import os
import subprocess
from typing import Any, Dict, List, Tuple

from faster_whisper import WhisperModel


def extract_audio_ffmpeg(video_path: str, audio_path: str) -> None:
    """Use FFMPEG to extract audio from ``video_path`` into ``audio_path``.

    The output will be a 16 kHz mono WAV file in PCM S16LE format. Any
    existing file at ``audio_path`` will be overwritten. If ffmpeg returns
    a non-zero exit code, a ``RuntimeError`` is raised with the stderr.
    """
    cmd = [
        "ffmpeg",
        "-y",  # overwrite output
        "-i",
        video_path,
        "-vn",  # disable video recording
        "-acodec",
        "pcm_s16le",
        "-ar",
        "16000",
        "-ac",
        "1",
        audio_path,
    ]
    proc = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    if proc.returncode != 0:
        raise RuntimeError(f"FFMPEG error: {proc.stderr.decode(errors='ignore')}")


def load_whisper_model() -> WhisperModel:
    """Instantiate and cache a Faster-Whisper model.

    The model name and device can be configured via the ``WHISPER_MODEL`` and
    ``WHISPER_DEVICE`` environment variables. The default model is
    ``large-v3`` for best accuracy. The device can be ``cuda`` or ``cpu``.
    A module-level cache is used to prevent loading the model multiple times.
    """
    if hasattr(load_whisper_model, "_cache"):
        return load_whisper_model._cache  # type: ignore[attr-defined]
    model_name = os.environ.get("WHISPER_MODEL", "large-v3")
    device = os.environ.get("WHISPER_DEVICE", "cpu")
    # Compute type can be set via WHISPER_COMPUTE_TYPE; default to float16 on GPU
    compute_type = os.environ.get("WHISPER_COMPUTE_TYPE")
    # If not explicitly set, choose sensible defaults
    if compute_type is None:
        compute_type = "float16" if device == "cuda" else "int8"
    model = WhisperModel(model_name, device=device, compute_type=compute_type)
    load_whisper_model._cache = model  # type: ignore[attr-defined]
    return model
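
# Example configuration (illustrative values, not defaults enforced here):
#
#   export WHISPER_MODEL=medium
#   export WHISPER_DEVICE=cuda
#   export WHISPER_COMPUTE_TYPE=float16
#
# With none of these set, load_whisper_model falls back to ``large-v3`` on
# CPU with ``int8`` computation.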


def transcribe(video_path: str, work_dir: str) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
    """Transcribe a video file using Faster-Whisper.

    ``video_path`` is the path to the video to transcribe. ``work_dir`` is a
    directory where temporary files will be stored (audio file and
    transcription). The function returns a tuple ``(segments, words)`` where
    ``segments`` is a list of dictionaries with ``start``, ``end`` and
    ``text`` fields, and ``words`` is a flat list of dictionaries with
    ``start``, ``end`` and ``word`` fields covering the entire video.
    The timestamps are expressed in seconds as floats.
    """
    os.makedirs(work_dir, exist_ok=True)
    audio_path = os.path.join(work_dir, "audio.wav")
    # Extract audio
    extract_audio_ffmpeg(video_path, audio_path)
    # Load Whisper model
    model = load_whisper_model()
    # Run transcription with word-level timestamps
    segments, _info = model.transcribe(audio_path, word_timestamps=True)
    seg_list: List[Dict[str, Any]] = []
    words_list: List[Dict[str, Any]] = []
    for seg in segments:
        seg_list.append({
            "start": float(seg.start),
            "end": float(seg.end),
            "text": seg.text.strip(),
        })
        # Each segment may carry a ``words`` attribute with per-word timings
        for w in getattr(seg, "words", []) or []:
            words_list.append({
                "start": float(w.start),
                "end": float(w.end),
                "word": w.word,
            })
    # Sort words by start time to be safe
    words_list.sort(key=lambda d: d["start"])
    return seg_list, words_list
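

if __name__ == "__main__":
    # Minimal usage sketch with assumed file names ("input.mp4" and the
    # "work" directory are placeholders, not files this project ships):
    # transcribe a local video and print the first segments plus word count.
    import json

    segs, words = transcribe("input.mp4", "work")
    print(json.dumps(segs[:3], indent=2, ensure_ascii=False))
    print(f"{len(words)} words with timestamps")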