Realiza vários ajustes para melhorar o tracking e o render de vídeo

2025-12-18 02:26:25 -03:00
parent 78e35d65fd
commit 07d301f110
11 changed files with 984 additions and 316 deletions
--- a/video_render/transcription.py
+++ b/video_render/transcription.py
@@ -6,6 +6,7 @@ from dataclasses import dataclass
 from pathlib import Path
 from typing import List, Optional

+import numpy as np
 from faster_whisper import WhisperModel

 from video_render.config import Settings
@@ -56,6 +57,17 @@ class TranscriptionService:
            )
        return self._model

+    def unload_model(self) -> None:
+        """Release the cached Whisper model (if loaded) and trigger a GC pass."""
+        import gc
+        if self._model is None:
+            return
+        logger.info("Descarregando modelo Whisper para liberar memória...")
+        del self._model
+        self._model = None
+        gc.collect()  # collect now rather than waiting for the next automatic pass
+        logger.info("Modelo Whisper descarregado com sucesso")
+
    def transcribe(self, audio_path: Path, output_dir: Optional[Path] = None) -> TranscriptionResult:
        if output_dir is not None:
            existing_transcription = self.load(output_dir)
@@ -63,7 +75,34 @@ class TranscriptionService:
                logger.info("Transcrição já existe em %s, reutilizando...", output_dir)
                return existing_transcription

-        logger.info("Iniciando transcrição do áudio com FasterWhisper...")
+        # Get audio duration to decide if we need chunked processing
+        audio_duration = self._get_audio_duration(audio_path)
+        chunk_duration_minutes = 30  # Process in 30-minute chunks for long videos
+        chunk_duration_seconds = chunk_duration_minutes * 60
+
+        # For videos longer than 30 minutes, use chunked processing to avoid OOM
+        if audio_duration > chunk_duration_seconds:
+            logger.info(
+                f"Áudio longo detectado ({audio_duration/60:.1f} min). "
+                f"Processando em chunks de {chunk_duration_minutes} min para evitar erro de memória..."
+            )
+            return self._transcribe_chunked(audio_path, chunk_duration_seconds)
+        else:
+            logger.info(f"Iniciando transcrição do áudio ({audio_duration/60:.1f} min) com FasterWhisper...")
+            return self._transcribe_full(audio_path)
+
+    def _get_audio_duration(self, audio_path: Path) -> float:
+        """Return the audio duration in seconds, or 0.0 when it cannot be determined."""
+        try:
+            from moviepy.audio.io.AudioFileClip import AudioFileClip  # imported lazily, inside the guard
+            with AudioFileClip(str(audio_path)) as audio:
+                return float(audio.duration or 0.0)
+        except Exception as e:  # best-effort probe (incl. missing moviepy): never abort here
+            logger.warning("Falha ao obter duração do áudio, assumindo curto: %s", e)
+            return 0.0  # callers treat 0.0 as "short audio"
+
+    def _transcribe_full(self, audio_path: Path) -> TranscriptionResult:
+        """Transcribe entire audio at once (for shorter videos)."""
        model = self._load_model()
        segments, _ = model.transcribe(
            str(audio_path),
@@ -97,6 +136,101 @@ class TranscriptionService:
            full_text=" ".join(full_text_parts).strip(),
        )

+    def _transcribe_chunked(self, audio_path: Path, chunk_duration: float) -> TranscriptionResult:
+        """Transcribe long audio chunk by chunk to avoid running out of memory.
+
+        Each chunk is cut to a temporary WAV next to ``audio_path`` with ffmpeg and
+        transcribed; timestamps are shifted by the chunk start so they refer to
+        the original audio. Raises ``subprocess.CalledProcessError`` if ffmpeg fails.
+        """
+        import gc
+        import subprocess
+
+        model = self._load_model()
+        all_segments: List[TranscriptSegment] = []
+        full_text_parts: List[str] = []
+
+        total_duration = self._get_audio_duration(audio_path)
+        if total_duration <= 0:
+            # The duration probe returns 0.0 on failure; zero chunks would silently
+            # yield an empty transcription, so transcribe in a single pass instead.
+            return self._transcribe_full(audio_path)
+        num_chunks = int(np.ceil(total_duration / chunk_duration))
+
+        logger.info(f"Processando áudio em {num_chunks} chunks...")
+
+        for chunk_idx in range(num_chunks):
+            start_time = chunk_idx * chunk_duration
+            end_time = min((chunk_idx + 1) * chunk_duration, total_duration)
+
+            logger.info(
+                f"Processando chunk {chunk_idx + 1}/{num_chunks} "
+                f"({start_time/60:.1f}min - {end_time/60:.1f}min)..."
+            )
+
+            # Cut the chunk with ffmpeg; "-ss" placed before "-i" seeks on the input.
+            temp_chunk_path = audio_path.parent / f"temp_chunk_{chunk_idx}.wav"
+            ffmpeg_cmd = [
+                'ffmpeg',
+                '-y',  # Overwrite output file
+                '-ss', str(start_time),  # Start time
+                '-i', str(audio_path),  # Input file
+                '-t', str(end_time - start_time),  # Duration (last chunk may be shorter)
+                '-acodec', 'pcm_s16le',  # Audio codec
+                '-ar', '44100',  # Sample rate
+                '-ac', '2',  # Stereo
+                '-loglevel', 'error',  # Only show errors
+                str(temp_chunk_path),
+            ]
+            try:
+                subprocess.run(ffmpeg_cmd, check=True, capture_output=True)
+
+                segments, _ = model.transcribe(
+                    str(temp_chunk_path),
+                    beam_size=5,
+                    word_timestamps=True,
+                )
+
+                # Shift chunk-relative timestamps back onto the full-audio timeline.
+                for segment in segments:
+                    words = [
+                        WordTiming(
+                            start=w.start + start_time,
+                            end=w.end + start_time,
+                            word=w.word.strip(),
+                        )
+                        for w in segment.words or []
+                        if w.word.strip()
+                    ]
+                    text = segment.text.strip()
+                    full_text_parts.append(text)
+                    all_segments.append(
+                        TranscriptSegment(
+                            id=len(all_segments),  # sequential id across all chunks
+                            start=segment.start + start_time,
+                            end=segment.end + start_time,
+                            text=text,
+                            words=words,
+                        )
+                    )
+            except subprocess.CalledProcessError as e:
+                # errors="replace": undecodable ffmpeg output must not mask the real error.
+                detail = e.stderr.decode(errors="replace") if e.stderr else str(e)
+                logger.error(f"Erro ao extrair chunk {chunk_idx}: {detail}")
+                raise
+            finally:
+                # Always remove the temp chunk, then collect garbage left by this chunk.
+                if temp_chunk_path.exists():
+                    temp_chunk_path.unlink()
+                gc.collect()
+
+        logger.info(f"Transcrição em chunks concluída: {len(all_segments)} segmentos processados")
+
+        return TranscriptionResult(
+            segments=all_segments,
+            full_text=" ".join(full_text_parts).strip(),
+        )
+
    @staticmethod
    def persist(result: TranscriptionResult, destination: Path) -> None:
        json_path = destination / "transcription.json"