Introduce new features
Among them: faster-whisper support, subtitle generation, and integration with Gemini and Open Router
transcribe.py (new file, +111 lines)
@@ -0,0 +1,111 @@
"""Utilities for extracting audio from video and generating transcriptions.

This module handles two tasks:

1. Use FFMPEG to extract the audio track from a video file into a WAV file
   suitable for consumption by the Whisper model. The audio is resampled to
   16 kHz mono PCM as required by Whisper.
2. Use the Faster-Whisper implementation to generate a transcription with
   word-level timestamps. The transcription is returned both as a list of
   segments (for building an SRT) and as a flattened list of words (for
   building dynamic subtitles).

If FFMPEG is not installed or fails, a ``RuntimeError`` is raised. The caller
is responsible for cleaning up the temporary files created in the working
directory.
"""

from __future__ import annotations

import os
import subprocess
from typing import Dict, List, Tuple

from faster_whisper import WhisperModel

def extract_audio_ffmpeg(video_path: str, audio_path: str) -> None:
    """Use FFMPEG to extract audio from ``video_path`` into ``audio_path``.

    The output will be a 16 kHz mono WAV file in PCM S16LE format. Any
    existing file at ``audio_path`` will be overwritten. If ffmpeg returns
    a non-zero exit code, a ``RuntimeError`` is raised with its stderr.
    """
    cmd = [
        "ffmpeg",
        "-y",  # overwrite output
        "-i",
        video_path,
        "-vn",  # disable video recording
        "-acodec",
        "pcm_s16le",
        "-ar",
        "16000",
        "-ac",
        "1",
        audio_path,
    ]
    proc = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    if proc.returncode != 0:
        raise RuntimeError(f"FFMPEG error: {proc.stderr.decode(errors='ignore')}")

def load_whisper_model() -> WhisperModel:
    """Instantiate and cache a Faster-Whisper model.

    The model name and device can be configured via the ``WHISPER_MODEL`` and
    ``WHISPER_DEVICE`` environment variables. The default model is
    ``large-v3`` for best accuracy. The device can be ``cuda`` or ``cpu``.
    A module-level cache is used to prevent loading the model multiple times.
    """
    if hasattr(load_whisper_model, "_cache"):
        return load_whisper_model._cache  # type: ignore[attr-defined]
    model_name = os.environ.get("WHISPER_MODEL", "large-v3")
    device = os.environ.get("WHISPER_DEVICE", "cpu")
    # Compute type can be set via WHISPER_COMPUTE_TYPE; if not explicitly set,
    # choose sensible defaults (float16 on GPU, int8 on CPU).
    compute_type = os.environ.get("WHISPER_COMPUTE_TYPE")
    if compute_type is None:
        compute_type = "float16" if device == "cuda" else "int8"
    model = WhisperModel(model_name, device=device, compute_type=compute_type)
    load_whisper_model._cache = model  # type: ignore[attr-defined]
    return model

def transcribe(video_path: str, work_dir: str) -> Tuple[List[Dict[str, float]], List[Dict[str, float]]]:
    """Transcribe a video file using Faster-Whisper.

    ``video_path`` is the path to the video to transcribe. ``work_dir`` is a
    directory where temporary files will be stored (audio file and
    transcription). The function returns a tuple ``(segments, words)`` where
    ``segments`` is a list of dictionaries with ``start``, ``end`` and
    ``text`` fields, and ``words`` is a flat list of dictionaries with
    ``start``, ``end`` and ``word`` fields covering the entire video.
    The timestamps are expressed in seconds as floats.
    """
    os.makedirs(work_dir, exist_ok=True)
    audio_path = os.path.join(work_dir, "audio.wav")
    # Extract audio
    extract_audio_ffmpeg(video_path, audio_path)
    # Load the Whisper model
    model = load_whisper_model()
    # Run transcription with word-level timestamps
    segments, info = model.transcribe(audio_path, word_timestamps=True)
    seg_list: List[Dict[str, float]] = []
    words_list: List[Dict[str, float]] = []
    for seg in segments:
        seg_list.append({
            "start": float(seg.start),
            "end": float(seg.end),
            "text": seg.text.strip(),
        })
        # Each segment may expose a ``words`` attribute with per-word timings
        for w in getattr(seg, "words", []) or []:
            words_list.append({
                "start": float(w.start),
                "end": float(w.end),
                "word": w.word,
            })
    # Sort words by start time to be safe
    words_list.sort(key=lambda d: d["start"])
    return seg_list, words_list
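
For reference, a minimal usage sketch of this module (the file name input.mp4 and the work directory are hypothetical; it assumes ffmpeg is on PATH and the faster-whisper package is installed):

    from transcribe import transcribe

    segments, words = transcribe("input.mp4", "work")
    for seg in segments:
        print(f"[{seg['start']:.2f} -> {seg['end']:.2f}] {seg['text']}")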
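
load_whisper_model reads all of its configuration from environment variables, so the defaults can be overridden before the first call; a sketch with illustrative values (the defaults in the comments come from the code above):

    import os

    os.environ["WHISPER_MODEL"] = "medium"          # default: "large-v3"
    os.environ["WHISPER_DEVICE"] = "cuda"           # default: "cpu"
    os.environ["WHISPER_COMPUTE_TYPE"] = "float16"  # default: float16 on cuda, int8 on cpu

    from transcribe import load_whisper_model
    model = load_whisper_model()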
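
The module docstring notes that the segments list is shaped for building an SRT; a minimal sketch of that conversion (the helpers _format_ts and segments_to_srt are hypothetical, not part of this commit):

    def _format_ts(seconds: float) -> str:
        # SRT timestamps use the form HH:MM:SS,mmm
        ms = int(round(seconds * 1000))
        h, rem = divmod(ms, 3_600_000)
        m, rem = divmod(rem, 60_000)
        s, ms = divmod(rem, 1000)
        return f"{h:02d}:{m:02d}:{s:02d},{ms:03d}"

    def segments_to_srt(segments) -> str:
        blocks = []
        for i, seg in enumerate(segments, start=1):
            start, end = _format_ts(seg["start"]), _format_ts(seg["end"])
            blocks.append(f"{i}\n{start} --> {end}\n{seg['text']}\n")
        return "\n".join(blocks)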