from __future__ import annotations

import gc
import json
import logging
import subprocess
from dataclasses import dataclass
from pathlib import Path
from typing import List, Optional

import numpy as np
from faster_whisper import WhisperModel

from video_render.config import Settings

logger = logging.getLogger(__name__)


@dataclass(frozen=True)
class WordTiming:
    start: float
    end: float
    word: str


@dataclass(frozen=True)
class TranscriptSegment:
    id: int
    start: float
    end: float
    text: str
    words: List[WordTiming]


@dataclass(frozen=True)
class TranscriptionResult:
    segments: List[TranscriptSegment]
    full_text: str


class TranscriptionService:
    def __init__(self, settings: Settings) -> None:
        self.settings = settings
        self._model: Optional[WhisperModel] = None

    def _load_model(self) -> WhisperModel:
        if self._model is None:
            logger.info(
                "Loading Faster-Whisper model '%s' (device=%s, compute_type=%s)",
                self.settings.whisper.model_size,
                self.settings.whisper.device or "auto",
                self.settings.whisper.compute_type or "default",
            )
            self._model = WhisperModel(
                self.settings.whisper.model_size,
                device=self.settings.whisper.device or "auto",
                compute_type=self.settings.whisper.compute_type or "default",
                download_root=str(self.settings.whisper.download_root),
            )
        return self._model

    def unload_model(self) -> None:
        """Unload the Whisper model to free memory (reduces RAM usage by 1-3 GB)."""
        if self._model is not None:
            logger.info("Unloading Whisper model to free memory...")
            self._model = None
            # Force garbage collection to release GPU/CPU memory right away
            gc.collect()
            logger.info("Whisper model unloaded successfully")

    def transcribe(self, audio_path: Path, output_dir: Optional[Path] = None) -> TranscriptionResult:
        if output_dir is not None:
            existing_transcription = self.load(output_dir)
            if existing_transcription is not None:
                logger.info("Transcription already exists at %s, reusing...", output_dir)
                return existing_transcription

        # Get the audio duration to decide whether chunked processing is needed
        audio_duration = self._get_audio_duration(audio_path)
        chunk_duration_minutes = 30  # Process long videos in 30-minute chunks
        chunk_duration_seconds = chunk_duration_minutes * 60

        # For audio longer than 30 minutes, use chunked processing to avoid OOM
        if audio_duration > chunk_duration_seconds:
            logger.info(
                f"Long audio detected ({audio_duration / 60:.1f} min). "
                f"Processing in {chunk_duration_minutes}-minute chunks to avoid running out of memory..."
            )
            return self._transcribe_chunked(audio_path, chunk_duration_seconds)

        logger.info(f"Starting transcription of audio ({audio_duration / 60:.1f} min) with Faster-Whisper...")
        return self._transcribe_full(audio_path)
    def _get_audio_duration(self, audio_path: Path) -> float:
        """Return the audio duration in seconds, or 0.0 if it cannot be determined."""
        try:
            from moviepy.audio.io.AudioFileClip import AudioFileClip

            with AudioFileClip(str(audio_path)) as audio:
                return audio.duration or 0.0
        except Exception as e:
            logger.warning(f"Failed to read audio duration, assuming short audio: {e}")
            return 0.0  # Assume short if we can't determine the duration

    def _transcribe_full(self, audio_path: Path) -> TranscriptionResult:
        """Transcribe the entire audio in one pass (for shorter videos)."""
        model = self._load_model()
        segments, _ = model.transcribe(
            str(audio_path),
            beam_size=5,
            word_timestamps=True,
        )

        parsed_segments: List[TranscriptSegment] = []
        full_text_parts: List[str] = []

        for idx, segment in enumerate(segments):
            words = [
                WordTiming(start=w.start, end=w.end, word=w.word.strip())
                for w in segment.words or []
                if w.word.strip()
            ]
            text = segment.text.strip()
            full_text_parts.append(text)
            parsed_segments.append(
                TranscriptSegment(
                    id=idx,
                    start=segment.start,
                    end=segment.end,
                    text=text,
                    words=words,
                )
            )

        return TranscriptionResult(
            segments=parsed_segments,
            full_text=" ".join(full_text_parts).strip(),
        )

    def _transcribe_chunked(self, audio_path: Path, chunk_duration: float) -> TranscriptionResult:
        """Transcribe audio in chunks to avoid OOM on long videos."""
        model = self._load_model()
        all_segments: List[TranscriptSegment] = []
        full_text_parts: List[str] = []
        segment_id_counter = 0

        # Get the total duration and derive the number of chunks
        total_duration = self._get_audio_duration(audio_path)
        num_chunks = int(np.ceil(total_duration / chunk_duration))

        logger.info(f"Processing audio in {num_chunks} chunks...")

        for chunk_idx in range(num_chunks):
            start_time = chunk_idx * chunk_duration
            end_time = min((chunk_idx + 1) * chunk_duration, total_duration)

            logger.info(
                f"Processing chunk {chunk_idx + 1}/{num_chunks} "
                f"({start_time / 60:.1f} min - {end_time / 60:.1f} min)..."
            )

            # Extract the chunk using ffmpeg directly (more reliable than moviepy subclip)
            temp_chunk_path = audio_path.parent / f"temp_chunk_{chunk_idx}.wav"
            try:
                chunk_duration_actual = end_time - start_time
                ffmpeg_cmd = [
                    "ffmpeg",
                    "-y",  # Overwrite output file
                    "-ss", str(start_time),  # Start time
                    "-i", str(audio_path),  # Input file
                    "-t", str(chunk_duration_actual),  # Duration
                    "-acodec", "pcm_s16le",  # Audio codec
                    "-ar", "44100",  # Sample rate
                    "-ac", "2",  # Stereo
                    "-loglevel", "error",  # Only show errors
                    str(temp_chunk_path),
                ]
                subprocess.run(ffmpeg_cmd, check=True, capture_output=True)

                # Transcribe the chunk
                segments, _ = model.transcribe(
                    str(temp_chunk_path),
                    beam_size=5,
                    word_timestamps=True,
                )

                # Process segments, offsetting all timestamps by the chunk start
                for segment in segments:
                    words = [
                        WordTiming(
                            start=w.start + start_time,
                            end=w.end + start_time,
                            word=w.word.strip(),
                        )
                        for w in segment.words or []
                        if w.word.strip()
                    ]
                    text = segment.text.strip()
                    full_text_parts.append(text)
                    all_segments.append(
                        TranscriptSegment(
                            id=segment_id_counter,
                            start=segment.start + start_time,
                            end=segment.end + start_time,
                            text=text,
                            words=words,
                        )
                    )
                    segment_id_counter += 1

                # Force garbage collection after each chunk to keep memory flat
                gc.collect()
            except subprocess.CalledProcessError as e:
                logger.error(
                    f"Failed to extract chunk {chunk_idx}: "
                    f"{e.stderr.decode() if e.stderr else str(e)}"
                )
                raise
            finally:
                # Clean up the temporary chunk file
                if temp_chunk_path.exists():
                    temp_chunk_path.unlink()

        logger.info(f"Chunked transcription complete: {len(all_segments)} segments processed")

        return TranscriptionResult(
            segments=all_segments,
            full_text=" ".join(full_text_parts).strip(),
        )
    @staticmethod
    def persist(result: TranscriptionResult, destination: Path) -> None:
        json_path = destination / "transcription.json"
        text_path = destination / "transcription.txt"

        payload = {
            "segments": [
                {
                    "id": segment.id,
                    "start": segment.start,
                    "end": segment.end,
                    "text": segment.text,
                    "words": [
                        {"start": word.start, "end": word.end, "text": word.word}
                        for word in segment.words
                    ],
                }
                for segment in result.segments
            ],
            "full_text": result.full_text,
        }

        with json_path.open("w", encoding="utf-8") as fp:
            json.dump(payload, fp, ensure_ascii=False, indent=2)

        with text_path.open("w", encoding="utf-8") as fp:
            fp.write(result.full_text)

        logger.info("Transcription saved to %s", destination)
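    # On-disk schema written by persist() and consumed by load(), shown here
    # for reference. The field names mirror the code above; the values are
    # illustrative only:
    #
    #   {
    #     "segments": [
    #       {
    #         "id": 0,
    #         "start": 0.0,
    #         "end": 2.4,
    #         "text": "hello world",
    #         "words": [
    #           {"start": 0.0, "end": 0.5, "text": "hello"},
    #           {"start": 0.6, "end": 1.1, "text": "world"}
    #         ]
    #       }
    #     ],
    #     "full_text": "hello world"
    #   }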
    @staticmethod
    def load(source: Path) -> Optional[TranscriptionResult]:
        json_path = source / "transcription.json"
        if not json_path.exists():
            return None

        try:
            with json_path.open("r", encoding="utf-8") as fp:
                payload = json.load(fp)
        except (OSError, json.JSONDecodeError) as exc:
            logger.warning(
                "Failed to load existing transcription from %s: %s", json_path, exc
            )
            return None

        segments_payload = payload.get("segments", [])
        if not isinstance(segments_payload, list):
            logger.warning(
                "Unexpected format when loading transcription from %s: invalid 'segments'",
                json_path,
            )
            return None

        segments: List[TranscriptSegment] = []
        for idx, segment_data in enumerate(segments_payload):
            if not isinstance(segment_data, dict):
                logger.debug("Skipping invalid segment while loading: %s", segment_data)
                continue
            try:
                segment_id = int(segment_data.get("id", idx))
                start = float(segment_data["start"])
                end = float(segment_data["end"])
            except (KeyError, TypeError, ValueError):
                logger.debug("Skipping segment missing required fields: %s", segment_data)
                continue

            text = str(segment_data.get("text", "")).strip()
            words_payload = segment_data.get("words", [])
            words: List[WordTiming] = []
            if isinstance(words_payload, list):
                for word_data in words_payload:
                    if not isinstance(word_data, dict):
                        continue
                    try:
                        w_start = float(word_data["start"])
                        w_end = float(word_data["end"])
                    except (KeyError, TypeError, ValueError):
                        logger.debug(
                            "Skipping word missing required fields: %s", word_data
                        )
                        continue
                    word_text = str(word_data.get("text", "")).strip()
                    if not word_text:
                        continue
                    words.append(WordTiming(start=w_start, end=w_end, word=word_text))

            segments.append(
                TranscriptSegment(
                    id=segment_id,
                    start=start,
                    end=end,
                    text=text,
                    words=words,
                )
            )

        full_text = str(payload.get("full_text", "")).strip()
        return TranscriptionResult(segments=segments, full_text=full_text)
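
# ---------------------------------------------------------------------------
# Minimal usage sketch. Assumption: Settings() is constructible with defaults;
# adapt to however video_render.config.Settings is actually configured. The
# paths below are hypothetical placeholders.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)

    settings = Settings()  # assumption: default construction works
    service = TranscriptionService(settings)

    audio = Path("input/audio.wav")       # hypothetical input path
    out_dir = Path("output/transcripts")  # hypothetical output directory
    out_dir.mkdir(parents=True, exist_ok=True)

    result = service.transcribe(audio, output_dir=out_dir)
    TranscriptionService.persist(result, out_dir)
    service.unload_model()  # free the 1-3 GB held by the model once done

    print(f"{len(result.segments)} segments, {len(result.full_text)} characters")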