Introduce new features
Among them: faster-whisper support, subtitle generation, and integration with Gemini and Open Router
transcribe.py (new file, +111 lines)
@@ -0,0 +1,111 @@
"""Utilities for extracting audio from video and generating transcriptions.

This module handles two tasks:

1. Use FFMPEG to extract the audio track from a video file into a WAV file
   suitable for consumption by the Whisper model. The audio is resampled to
   16 kHz mono PCM as required by Whisper.
2. Use the Faster-Whisper implementation to generate a transcription with
   word-level timestamps. The transcription is returned both as a list of
   segments (for building an SRT) and as a flattened list of words (for
   building dynamic subtitles).

If FFMPEG is not installed or fails, a ``RuntimeError`` is raised. The caller
is responsible for cleaning up the temporary files created in the working
directory.
"""

from __future__ import annotations

import os
import subprocess
from typing import Dict, List, Tuple

from faster_whisper import WhisperModel

def extract_audio_ffmpeg(video_path: str, audio_path: str) -> None:
    """Use FFMPEG to extract audio from ``video_path`` into ``audio_path``.

    The output will be a 16 kHz mono WAV file in PCM S16LE format. Any
    existing file at ``audio_path`` will be overwritten. If ffmpeg returns
    a non-zero exit code, a ``RuntimeError`` is raised with its stderr.
    """
    cmd = [
        "ffmpeg",
        "-y",  # overwrite output
        "-i",
        video_path,
        "-vn",  # disable video recording
        "-acodec",
        "pcm_s16le",
        "-ar",
        "16000",
        "-ac",
        "1",
        audio_path,
    ]
    proc = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    if proc.returncode != 0:
        raise RuntimeError(f"FFMPEG error: {proc.stderr.decode(errors='ignore')}")

def load_whisper_model() -> WhisperModel:
    """Instantiate and cache a Faster-Whisper model.

    The model name and device can be configured via the ``WHISPER_MODEL`` and
    ``WHISPER_DEVICE`` environment variables. The default model is
    ``large-v3`` for best accuracy. The device can be ``cuda`` or ``cpu``.
    A module-level cache is used to prevent loading the model multiple times.
    """
    if hasattr(load_whisper_model, "_cache"):
        return load_whisper_model._cache  # type: ignore[attr-defined]
    model_name = os.environ.get("WHISPER_MODEL", "large-v3")
    device = os.environ.get("WHISPER_DEVICE", "cpu")
    # Compute type can be set via WHISPER_COMPUTE_TYPE; if not explicitly set,
    # choose sensible defaults (float16 on GPU, int8 on CPU).
    compute_type = os.environ.get("WHISPER_COMPUTE_TYPE")
    if compute_type is None:
        compute_type = "float16" if device == "cuda" else "int8"
    model = WhisperModel(model_name, device=device, compute_type=compute_type)
    load_whisper_model._cache = model  # type: ignore[attr-defined]
    return model

def transcribe(video_path: str, work_dir: str) -> Tuple[List[Dict[str, float]], List[Dict[str, float]]]:
    """Transcribe a video file using Faster-Whisper.

    ``video_path`` is the path to the video to transcribe. ``work_dir`` is a
    directory where temporary files will be stored (audio file and
    transcription). The function returns a tuple ``(segments, words)`` where
    ``segments`` is a list of dictionaries with ``start``, ``end`` and
    ``text`` fields, and ``words`` is a flat list of dictionaries with
    ``start``, ``end`` and ``word`` fields covering the entire video.
    The timestamps are expressed in seconds as floats.
    """
    os.makedirs(work_dir, exist_ok=True)
    audio_path = os.path.join(work_dir, "audio.wav")
    # Extract audio
    extract_audio_ffmpeg(video_path, audio_path)
    # Load the Whisper model
    model = load_whisper_model()
    # Run transcription with word-level timestamps
    segments, info = model.transcribe(audio_path, word_timestamps=True)
    seg_list: List[Dict[str, float]] = []
    words_list: List[Dict[str, float]] = []
    for seg in segments:
        seg_list.append({
            "start": float(seg.start),
            "end": float(seg.end),
            "text": seg.text.strip(),
        })
        # Each segment may expose a ``words`` attribute with per-word timings
        for w in getattr(seg, "words", []) or []:
            words_list.append({
                "start": float(w.start),
                "end": float(w.end),
                "word": w.word,
            })
    # Sort words by start time to be safe
    words_list.sort(key=lambda d: d["start"])
    return seg_list, words_list
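
For reference, a minimal usage sketch of this module (the file name input.mp4 and the work directory are hypothetical; it assumes ffmpeg is on PATH and the faster-whisper package is installed):

    from transcribe import transcribe

    segments, words = transcribe("input.mp4", "work")
    for seg in segments:
        print(f"[{seg['start']:.2f} -> {seg['end']:.2f}] {seg['text']}")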
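
load_whisper_model reads all of its configuration from environment variables, so the defaults can be overridden before the first call; a sketch with illustrative values (the defaults in the comments come from the code above):

    import os

    os.environ["WHISPER_MODEL"] = "medium"          # default: "large-v3"
    os.environ["WHISPER_DEVICE"] = "cuda"           # default: "cpu"
    os.environ["WHISPER_COMPUTE_TYPE"] = "float16"  # default: float16 on cuda, int8 on cpu

    from transcribe import load_whisper_model
    model = load_whisper_model()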
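
The module docstring notes that the segments list is shaped for building an SRT; a minimal sketch of that conversion (the helpers _format_ts and segments_to_srt are hypothetical, not part of this commit):

    def _format_ts(seconds: float) -> str:
        # SRT timestamps use the form HH:MM:SS,mmm
        ms = int(round(seconds * 1000))
        h, rem = divmod(ms, 3_600_000)
        m, rem = divmod(rem, 60_000)
        s, ms = divmod(rem, 1000)
        return f"{h:02d}:{m:02d}:{s:02d},{ms:03d}"

    def segments_to_srt(segments) -> str:
        blocks = []
        for i, seg in enumerate(segments, start=1):
            start, end = _format_ts(seg["start"]), _format_ts(seg["end"])
            blocks.append(f"{i}\n{start} --> {end}\n{seg['text']}\n")
        return "\n".join(blocks)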