Realiza vários ajustes para melhorar o tracking e o render de vídeo
This commit is contained in:
@@ -6,6 +6,7 @@ from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
from typing import List, Optional
|
||||
|
||||
import numpy as np
|
||||
from faster_whisper import WhisperModel
|
||||
|
||||
from video_render.config import Settings
|
||||
@@ -56,6 +57,17 @@ class TranscriptionService:
|
||||
)
|
||||
return self._model
|
||||
|
||||
def unload_model(self) -> None:
    """Unload the Whisper model to free memory (reduces RAM usage by 1-3GB)."""
    if self._model is None:
        return

    logger.info("Descarregando modelo Whisper para liberar memória...")
    del self._model
    self._model = None
    # Collect immediately so GPU/CPU memory is reclaimed now,
    # not at the next interpreter GC cycle.
    import gc
    gc.collect()
    logger.info("Modelo Whisper descarregado com sucesso")
|
||||
|
||||
def transcribe(self, audio_path: Path, output_dir: Optional[Path] = None) -> TranscriptionResult:
    """Transcribe *audio_path*, reusing a cached transcription from *output_dir* when one exists.

    Audio longer than 30 minutes is processed in chunks to bound peak memory;
    shorter audio is transcribed in a single pass.

    Args:
        audio_path: Path to the audio file to transcribe.
        output_dir: Optional directory holding a previously persisted transcription.

    Returns:
        TranscriptionResult with segments and the concatenated full text.
    """
    if output_dir is not None:
        existing_transcription = self.load(output_dir)
        # NOTE(review): this guard sits on a diff-hunk boundary in the original;
        # it must reuse the cache only when load() actually returned a result — confirm.
        if existing_transcription is not None:
            logger.info("Transcrição já existe em %s, reutilizando...", output_dir)
            return existing_transcription

    # Duration decides between one-shot and chunked processing.
    audio_duration = self._get_audio_duration(audio_path)
    chunk_duration_minutes = 30  # 30-minute chunks keep long videos from OOMing
    chunk_duration_seconds = chunk_duration_minutes * 60

    if audio_duration > chunk_duration_seconds:
        logger.info(
            "Áudio longo detectado (%.1f min). "
            "Processando em chunks de %d min para evitar erro de memória...",
            audio_duration / 60,
            chunk_duration_minutes,
        )
        return self._transcribe_chunked(audio_path, chunk_duration_seconds)

    logger.info(
        "Iniciando transcrição do áudio (%.1f min) com FasterWhisper...",
        audio_duration / 60,
    )
    return self._transcribe_full(audio_path)
|
||||
|
||||
def _get_audio_duration(self, audio_path: Path) -> float:
    """Return the audio duration in seconds, or 0.0 when it cannot be determined.

    Best-effort by design: any failure (missing file, unreadable container,
    moviepy not importable) is logged and mapped to 0.0 so the caller falls
    back to the simpler one-shot transcription path instead of crashing.
    """
    try:
        # Local import keeps moviepy off the module import path.
        from moviepy.audio.io.AudioFileClip import AudioFileClip

        with AudioFileClip(str(audio_path)) as audio:
            # duration may be None for some containers; normalize to 0.0.
            return audio.duration or 0.0
    except Exception as exc:  # deliberate broad catch — probe must never raise
        logger.warning("Falha ao obter duração do áudio, assumindo curto: %s", exc)
        return 0.0
|
||||
|
||||
def _transcribe_full(self, audio_path: Path) -> TranscriptionResult:
|
||||
"""Transcribe entire audio at once (for shorter videos)."""
|
||||
model = self._load_model()
|
||||
segments, _ = model.transcribe(
|
||||
str(audio_path),
|
||||
@@ -97,6 +136,101 @@ class TranscriptionService:
|
||||
full_text=" ".join(full_text_parts).strip(),
|
||||
)
|
||||
|
||||
def _transcribe_chunked(self, audio_path: Path, chunk_duration: float) -> TranscriptionResult:
    """Transcribe audio in fixed-size chunks to avoid OOM on long videos.

    Each chunk is extracted with ffmpeg to a temporary WAV file, transcribed
    independently, and its segment/word timestamps are shifted by the chunk's
    start offset so the merged result carries absolute times.

    Args:
        audio_path: Source audio file.
        chunk_duration: Chunk length in seconds.

    Returns:
        TranscriptionResult with all chunks' segments merged in order.

    Raises:
        subprocess.CalledProcessError: if ffmpeg fails to extract a chunk.
    """
    # Hoisted out of the per-chunk loop (originally re-imported every iteration);
    # the unused AudioFileClip import was also dropped.
    import gc
    import subprocess

    model = self._load_model()
    all_segments: List[TranscriptSegment] = []
    full_text_parts: List[str] = []
    segment_id_counter = 0

    total_duration = self._get_audio_duration(audio_path)
    num_chunks = int(np.ceil(total_duration / chunk_duration))

    logger.info("Processando áudio em %d chunks...", num_chunks)

    for chunk_idx in range(num_chunks):
        start_time = chunk_idx * chunk_duration
        end_time = min((chunk_idx + 1) * chunk_duration, total_duration)

        logger.info(
            "Processando chunk %d/%d (%.1fmin - %.1fmin)...",
            chunk_idx + 1,
            num_chunks,
            start_time / 60,
            end_time / 60,
        )

        # Extract the chunk with ffmpeg directly (more reliable than moviepy subclip).
        temp_chunk_path = audio_path.parent / f"temp_chunk_{chunk_idx}.wav"
        try:
            ffmpeg_cmd = [
                'ffmpeg',
                '-y',                    # overwrite output file
                '-ss', str(start_time),  # seek before -i: fast input seeking
                '-i', str(audio_path),
                '-t', str(end_time - start_time),
                '-acodec', 'pcm_s16le',  # uncompressed PCM
                '-ar', '44100',          # sample rate
                '-ac', '2',              # stereo
                '-loglevel', 'error',    # only show errors
                str(temp_chunk_path),
            ]
            subprocess.run(ffmpeg_cmd, check=True, capture_output=True)

            segments, _ = model.transcribe(
                str(temp_chunk_path),
                beam_size=5,
                word_timestamps=True,
            )

            # Shift per-chunk timestamps to absolute positions in the source audio.
            for segment in segments:
                words = [
                    WordTiming(
                        start=w.start + start_time,
                        end=w.end + start_time,
                        word=w.word.strip(),
                    )
                    for w in segment.words or []
                    if w.word.strip()
                ]
                text = segment.text.strip()
                full_text_parts.append(text)
                all_segments.append(
                    TranscriptSegment(
                        id=segment_id_counter,
                        start=segment.start + start_time,
                        end=segment.end + start_time,
                        text=text,
                        words=words,
                    )
                )
                segment_id_counter += 1

            # Free chunk buffers promptly to keep peak memory flat.
            gc.collect()

        except subprocess.CalledProcessError as e:
            logger.error(
                "Erro ao extrair chunk %d: %s",
                chunk_idx,
                e.stderr.decode() if e.stderr else str(e),
            )
            raise
        finally:
            # Always remove the temp chunk, even on failure.
            if temp_chunk_path.exists():
                temp_chunk_path.unlink()

    logger.info(
        "Transcrição em chunks concluída: %d segmentos processados",
        len(all_segments),
    )

    return TranscriptionResult(
        segments=all_segments,
        full_text=" ".join(full_text_parts).strip(),
    )
|
||||
|
||||
@staticmethod
|
||||
def persist(result: TranscriptionResult, destination: Path) -> None:
|
||||
json_path = destination / "transcription.json"
|
||||
|
||||
Reference in New Issue
Block a user