Make several adjustments to improve tracking and video rendering
@@ -13,10 +13,10 @@ TEMP_ROOT = BASE_DIR / "temp"

@dataclass(frozen=True)
class RabbitMQSettings:
# host: str = os.environ.get("RABBITMQ_HOST", "154.12.229.181")
# port: int = int(os.environ.get("RABBITMQ_PORT", 32790))
host: str = os.environ.get("RABBITMQ_HOST", "rabbitmq")
port: int = int(os.environ.get("RABBITMQ_PORT", 5672))
host: str = os.environ.get("RABBITMQ_HOST", "154.12.229.181")
port: int = int(os.environ.get("RABBITMQ_PORT", 32790))
# host: str = os.environ.get("RABBITMQ_HOST", "rabbitmq")
# port: int = int(os.environ.get("RABBITMQ_PORT", 5672))
user: str = os.environ.get("RABBITMQ_USER", "admin")
password: str = os.environ.get("RABBITMQ_PASS")
consume_queue: str = os.environ.get("RABBITMQ_QUEUE", "to-render")
@@ -62,11 +62,13 @@ class RenderingSettings:
subtitle_font_size: int = int(os.environ.get("RENDER_SUBTITLE_FONT_SIZE", 64))
caption_min_words: int = int(os.environ.get("CAPTION_MIN_WORDS", 2))
caption_max_words: int = int(os.environ.get("CAPTION_MAX_WORDS", 2))
# Smart framing settings
# Smart framing settings - CONTAINMENT TRACKING mode
enable_smart_framing: bool = os.environ.get("ENABLE_SMART_FRAMING", "true").lower() in ("true", "1", "yes")
smart_framing_min_confidence: float = float(os.environ.get("SMART_FRAMING_MIN_CONFIDENCE", 0.5))
smart_framing_smoothing_window: int = int(os.environ.get("SMART_FRAMING_SMOOTHING_WINDOW", 20))
smart_framing_frame_skip: int = int(os.environ.get("SMART_FRAMING_FRAME_SKIP", 2)) # Process every Nth frame (CPU optimization)
smart_framing_min_confidence: float = float(os.environ.get("SMART_FRAMING_MIN_CONFIDENCE", 0.3)) # Lowered for better cartoon detection
smart_framing_smoothing_window: int = int(os.environ.get("SMART_FRAMING_SMOOTHING_WINDOW", 30)) # Reduced - not needed with containment tracking
smart_framing_frame_skip: int = int(os.environ.get("SMART_FRAMING_FRAME_SKIP", 1)) # Process every frame for smooth 30 FPS tracking
smart_framing_max_velocity: int = int(os.environ.get("SMART_FRAMING_MAX_VELOCITY", 20)) # Moderate - only used during transitions
smart_framing_person_switch_cooldown: int = int(os.environ.get("SMART_FRAMING_PERSON_SWITCH_COOLDOWN", 999999)) # DISABLED - never switch people


@dataclass(frozen=True)

@@ -7,7 +7,7 @@ and identify who is speaking in video content using MediaPipe and audio analysis
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
from dataclasses import dataclass
|
||||
from dataclasses import dataclass, field
|
||||
from typing import List, Optional, Tuple
|
||||
|
||||
import cv2
|
||||
@@ -50,20 +50,22 @@ class FrameContext:
|
||||
active_speakers: List[int] # indices of speaking faces
|
||||
primary_focus: Optional[Tuple[int, int]] # (x, y) center point
|
||||
layout_mode: str # "single", "dual_split", "grid"
|
||||
selected_people: List[int] = field(default_factory=list) # indices of people selected for display (max 2)
|
||||
|
||||
|
||||
class MediaPipeDetector:
|
||||
"""Face and pose detection using MediaPipe."""
|
||||
"""Face and pose detection using MediaPipe with OpenCV Haar Cascade fallback."""
|
||||
|
||||
def __init__(self, min_detection_confidence: float = 0.5, min_tracking_confidence: float = 0.5):
|
||||
def __init__(self, min_detection_confidence: float = 0.3, min_tracking_confidence: float = 0.3):
|
||||
self.min_detection_confidence = min_detection_confidence
|
||||
self.min_tracking_confidence = min_tracking_confidence
|
||||
self.mp_face_detection = mp.solutions.face_detection
|
||||
self.mp_face_mesh = mp.solutions.face_mesh
|
||||
|
||||
# MediaPipe detectors with lower confidence for better cartoon detection
|
||||
self.face_detection = self.mp_face_detection.FaceDetection(
|
||||
min_detection_confidence=min_detection_confidence,
|
||||
model_selection=1
|
||||
model_selection=0 # Changed to 0 for better detection of varied faces (including cartoons)
|
||||
)
|
||||
|
||||
self.face_mesh = self.mp_face_mesh.FaceMesh(
|
||||
@@ -73,11 +75,17 @@ class MediaPipeDetector:
|
||||
static_image_mode=False
|
||||
)
|
||||
|
||||
logger.info("MediaPipe detector initialized")
|
||||
# OpenCV Haar Cascade as fallback for cartoon/anime faces
|
||||
self.haar_cascade = cv2.CascadeClassifier(cv2.data.haarcascades + 'haarcascade_frontalface_default.xml')
|
||||
|
||||
# Alternative cascade for profile/side faces
|
||||
self.haar_cascade_profile = cv2.CascadeClassifier(cv2.data.haarcascades + 'haarcascade_profileface.xml')
|
||||
|
||||
logger.info(f"Hybrid detector initialized (MediaPipe confidence={min_detection_confidence}, OpenCV Haar Cascade enabled)")
|
||||
|
||||
def detect_faces(self, frame: np.ndarray) -> List[FaceDetection]:
|
||||
"""
|
||||
Detect faces in a frame.
|
||||
Detect faces in a frame using hybrid approach (MediaPipe + OpenCV Haar Cascade).
|
||||
|
||||
Args:
|
||||
frame: RGB image array
|
||||
@@ -94,6 +102,7 @@ class MediaPipeDetector:
|
||||
else:
|
||||
frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
|
||||
|
||||
# Try MediaPipe first
|
||||
results = self.face_detection.process(frame_rgb)
|
||||
|
||||
faces = []
|
||||
@@ -126,8 +135,111 @@ class MediaPipeDetector:
|
||||
center_y=center_y
|
||||
))
|
||||
|
||||
# Fallback to OpenCV Haar Cascade if MediaPipe found nothing
|
||||
if not faces:
|
||||
faces = self._detect_faces_haar_cascade(frame, width, height)
|
||||
|
||||
return faces
|
||||
|
||||
def _detect_faces_haar_cascade(self, frame: np.ndarray, width: int, height: int) -> List[FaceDetection]:
|
||||
"""
|
||||
Detect faces using OpenCV Haar Cascade (works better with cartoons/anime).
|
||||
|
||||
Args:
|
||||
frame: Image frame (BGR format)
|
||||
width: Frame width
|
||||
height: Frame height
|
||||
|
||||
Returns:
|
||||
List of detected faces
|
||||
"""
|
||||
# Convert to grayscale for Haar Cascade
|
||||
if len(frame.shape) == 3:
|
||||
gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
|
||||
else:
|
||||
gray = frame
|
||||
|
||||
# Detect frontal faces with more sensitive parameters
|
||||
frontal_faces = self.haar_cascade.detectMultiScale(
|
||||
gray,
|
||||
scaleFactor=1.05, # More sensitive to size variations
|
||||
minNeighbors=3, # Lower threshold for detection (more permissive)
|
||||
minSize=(30, 30), # Smaller minimum size
|
||||
flags=cv2.CASCADE_SCALE_IMAGE
|
||||
)
|
||||
|
||||
# Also try profile faces
|
||||
profile_faces = self.haar_cascade_profile.detectMultiScale(
|
||||
gray,
|
||||
scaleFactor=1.1,
|
||||
minNeighbors=3,
|
||||
minSize=(30, 30),
|
||||
flags=cv2.CASCADE_SCALE_IMAGE
|
||||
)
|
||||
|
||||
# Combine frontal and profile detections
|
||||
all_faces = []
|
||||
|
||||
for (x, y, w, h) in frontal_faces:
|
||||
x = max(0, min(x, width - 1))
|
||||
y = max(0, min(y, height - 1))
|
||||
w = min(w, width - x)
|
||||
h = min(h, height - y)
|
||||
|
||||
center_x = x + w // 2
|
||||
center_y = y + h // 2
|
||||
|
||||
all_faces.append(FaceDetection(
|
||||
x=x,
|
||||
y=y,
|
||||
width=w,
|
||||
height=h,
|
||||
confidence=0.7, # Haar Cascade doesn't provide confidence, use fixed value
|
||||
center_x=center_x,
|
||||
center_y=center_y
|
||||
))
|
||||
|
||||
for (x, y, w, h) in profile_faces:
|
||||
# Check if this face overlaps significantly with any frontal face
|
||||
overlap = False
|
||||
for existing_face in all_faces:
|
||||
# Calculate IoU (Intersection over Union)
|
||||
x1_overlap = max(x, existing_face.x)
|
||||
y1_overlap = max(y, existing_face.y)
|
||||
x2_overlap = min(x + w, existing_face.x + existing_face.width)
|
||||
y2_overlap = min(y + h, existing_face.y + existing_face.height)
|
||||
|
||||
if x1_overlap < x2_overlap and y1_overlap < y2_overlap:
|
||||
overlap_area = (x2_overlap - x1_overlap) * (y2_overlap - y1_overlap)
|
||||
face_area = w * h
|
||||
if overlap_area / face_area > 0.3: # 30% overlap threshold
|
||||
overlap = True
|
||||
break
|
||||
|
||||
if not overlap:
|
||||
x = max(0, min(x, width - 1))
|
||||
y = max(0, min(y, height - 1))
|
||||
w = min(w, width - x)
|
||||
h = min(h, height - y)
|
||||
|
||||
center_x = x + w // 2
|
||||
center_y = y + h // 2
|
||||
|
||||
all_faces.append(FaceDetection(
|
||||
x=x,
|
||||
y=y,
|
||||
width=w,
|
||||
height=h,
|
||||
confidence=0.6, # Slightly lower confidence for profile
|
||||
center_x=center_x,
|
||||
center_y=center_y
|
||||
))
|
||||
|
||||
if all_faces:
|
||||
logger.debug(f"Haar Cascade detected {len(all_faces)} faces (MediaPipe failed)")
|
||||
|
||||
return all_faces
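
The frontal/profile merge above drops a profile detection when it covers an already-accepted frontal box by more than 30% of its own area (intersection over the candidate's area, a cheaper proxy for true IoU). A minimal standalone sketch of that overlap test, with illustrative names:

from typing import Tuple

Box = Tuple[int, int, int, int]  # (x, y, width, height)

def overlap_ratio(candidate: Box, accepted: Box) -> float:
    """Intersection area divided by the candidate's own area (0.0 when disjoint)."""
    cx, cy, cw, ch = candidate
    ax, ay, aw, ah = accepted
    x1, y1 = max(cx, ax), max(cy, ay)
    x2, y2 = min(cx + cw, ax + aw), min(cy + ch, ay + ah)
    if x1 >= x2 or y1 >= y2:
        return 0.0
    return ((x2 - x1) * (y2 - y1)) / float(cw * ch)

# A profile box sharing half of its area with a frontal box exceeds the 0.3 threshold and is dropped.
assert overlap_ratio((50, 0, 100, 100), (0, 0, 100, 100)) == 0.5
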
def detect_face_landmarks(self, frame: np.ndarray) -> List[FaceDetection]:
|
||||
"""
|
||||
Detect faces with landmarks for lip sync detection.
|
||||
@@ -203,8 +315,8 @@ class AudioActivityDetector:
|
||||
def detect_speaking_periods(
|
||||
self,
|
||||
audio_samples: np.ndarray,
|
||||
threshold: float = 0.02,
|
||||
min_speech_duration: float = 0.1
|
||||
threshold: float = 0.01, # Reduced from 0.02 for better speech detection
|
||||
min_speech_duration: float = 0.05 # Reduced from 0.1 to catch shorter utterances
|
||||
) -> List[Tuple[float, float]]:
|
||||
"""
|
||||
Detect periods of speech in audio.
|
||||
@@ -250,6 +362,16 @@ class AudioActivityDetector:
|
||||
if end_time - start_time >= min_speech_duration:
|
||||
periods.append((start_time, end_time))
|
||||
|
||||
# Log detected speech periods for debugging
|
||||
if periods:
|
||||
total_speech_time = sum(end - start for start, end in periods)
|
||||
logger.info(f"Audio speech detection: {len(periods)} periods found, "
|
||||
f"total {total_speech_time:.1f}s of speech (threshold={threshold})")
|
||||
else:
|
||||
max_energy = max(energies) if energies else 0
|
||||
logger.warning(f"No speech detected! Max energy={max_energy:.4f}, threshold={threshold} "
|
||||
f"(try lowering threshold if speech should be present)")
|
||||
|
||||
return periods
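
The lowered threshold (0.01) and minimum duration (0.05 s) gate an energy-based detector. A rough standalone sketch of windowed RMS gating; the window length, sample rate, and mono float input are assumptions for illustration, not the project's exact implementation:

import numpy as np
from typing import List, Tuple

def speech_periods(samples: np.ndarray, sample_rate: int = 16000,
                   threshold: float = 0.01, min_speech_duration: float = 0.05,
                   window_s: float = 0.05) -> List[Tuple[float, float]]:
    """Return (start, end) speech periods where windowed RMS energy exceeds `threshold`."""
    win = max(1, int(window_s * sample_rate))
    periods: List[Tuple[float, float]] = []
    start = None
    for i in range(0, len(samples), win):
        rms = float(np.sqrt(np.mean(samples[i:i + win] ** 2)))
        t = i / sample_rate
        if rms >= threshold and start is None:
            start = t                       # speech onset
        elif rms < threshold and start is not None:
            if t - start >= min_speech_duration:
                periods.append((start, t))  # keep only sufficiently long segments
            start = None
    end = len(samples) / sample_rate
    if start is not None and end - start >= min_speech_duration:
        periods.append((start, end))        # audio ended while still speaking
    return periods
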
def is_speaking_at_time(self, speaking_periods: List[Tuple[float, float]], time: float) -> bool:
|
||||
@@ -263,12 +385,29 @@ class AudioActivityDetector:
|
||||
class ContextAnalyzer:
|
||||
"""Analyzes video context to determine focus and layout."""
|
||||
|
||||
def __init__(self):
|
||||
def __init__(self, person_switch_cooldown: int = 30):
|
||||
self.detector = MediaPipeDetector()
|
||||
self.audio_detector = AudioActivityDetector()
|
||||
self.previous_faces: List[FaceDetection] = []
|
||||
|
||||
logger.info("Context analyzer initialized")
|
||||
# Person tracking state
|
||||
self.current_selected_people: List[int] = [] # Indices of people currently on screen
|
||||
self.last_switch_frame: int = -999 # Frame when we last switched people
|
||||
self.person_switch_cooldown = person_switch_cooldown # Minimum frames before switching
|
||||
|
||||
# Stability tracking to prevent flip-flopping
|
||||
self.desired_people_history: List[List[int]] = [] # Track recent desired selections
|
||||
self.stability_threshold = 20 # Frames needed to confirm a switch (increased for more stability)
|
||||
self.last_switched_people: List[int] = [] # People we just switched FROM
|
||||
|
||||
# Focus stability: track recent focus points for temporal smoothing
|
||||
self.focus_history: List[Tuple[int, int]] = []
|
||||
self.focus_history_size: int = 5 # Keep last 5 focus points for smoothing
|
||||
|
||||
# Debug logging
|
||||
self.frame_log_interval = 30 # Log every N frames
|
||||
|
||||
logger.info(f"Context analyzer initialized (cooldown={person_switch_cooldown} frames, focus_smoothing={self.focus_history_size})")
|
||||
|
||||
def analyze_frame(
|
||||
self,
|
||||
@@ -296,33 +435,47 @@ class ContextAnalyzer:
|
||||
|
||||
# Determine who is speaking
|
||||
active_speakers = []
|
||||
has_audio_speech = speaking_periods and self.audio_detector.is_speaking_at_time(speaking_periods, timestamp)
|
||||
|
||||
for i, face in enumerate(faces):
|
||||
is_speaking = False
|
||||
|
||||
if speaking_periods and self.audio_detector.is_speaking_at_time(speaking_periods, timestamp):
|
||||
# Check audio-based speech detection
|
||||
if has_audio_speech:
|
||||
is_speaking = True
|
||||
|
||||
# Check lip movement (visual speech detection)
|
||||
if face.landmarks and len(self.previous_faces) > i:
|
||||
is_speaking = is_speaking or self._detect_lip_movement(face, self.previous_faces[i])
|
||||
|
||||
if is_speaking:
|
||||
active_speakers.append(i)
|
||||
|
||||
num_faces = len(faces)
|
||||
num_speakers = len(active_speakers)
|
||||
# Debug: Log speech detection
|
||||
if frame_number % 30 == 0: # Every second at 30fps
|
||||
logger.info(f"Speech detection - Frame {frame_number}: audio_active={has_audio_speech}, "
|
||||
f"speakers={active_speakers}, total_faces={len(faces)}")
|
||||
|
||||
if num_faces == 0:
|
||||
layout_mode = "single"
|
||||
elif num_faces == 1:
|
||||
layout_mode = "single"
|
||||
elif num_faces == 2:
|
||||
layout_mode = "dual_split"
|
||||
elif num_faces >= 3:
|
||||
layout_mode = "dual_split"
|
||||
else:
|
||||
layout_mode = "single"
|
||||
# Select THE person to focus on (always single person)
|
||||
# Priority: 1) Who is speaking, 2) Who is most centered
|
||||
selected_people = self._select_person_to_focus(
|
||||
faces,
|
||||
active_speakers,
|
||||
frame_number,
|
||||
frame.shape[1], # frame width for center calculation
|
||||
frame.shape[0] # frame height for center calculation
|
||||
)
|
||||
|
||||
primary_focus = self._calculate_focus_point(faces, active_speakers)
|
||||
# Always use single-person layout (no split screen)
|
||||
layout_mode = "single"
|
||||
|
||||
primary_focus = self._calculate_focus_point(faces, selected_people)
|
||||
|
||||
# Debug logging every N frames
|
||||
if frame_number % self.frame_log_interval == 0:
|
||||
focus_reason = "speaker" if active_speakers else "no_speech_detected"
|
||||
logger.info(f"Frame {frame_number}: {len(faces)} faces, "
|
||||
f"{len(active_speakers)} speakers, focus={selected_people}, reason={focus_reason}")
|
||||
|
||||
self.previous_faces = faces
|
||||
|
||||
@@ -332,7 +485,8 @@ class ContextAnalyzer:
|
||||
detected_faces=faces,
|
||||
active_speakers=active_speakers,
|
||||
primary_focus=primary_focus,
|
||||
layout_mode=layout_mode
|
||||
layout_mode=layout_mode,
|
||||
selected_people=selected_people
|
||||
)
|
||||
|
||||
def _detect_lip_movement(self, current_face: FaceDetection, previous_face: FaceDetection) -> bool:
|
||||
@@ -363,36 +517,309 @@ class ContextAnalyzer:
|
||||
threshold = 2.0
|
||||
return abs(current_dist - previous_dist) > threshold
|
||||
|
||||
def _calculate_focus_point(
|
||||
def _select_person_to_focus(
|
||||
self,
|
||||
faces: List[FaceDetection],
|
||||
active_speakers: List[int]
|
||||
) -> Optional[Tuple[int, int]]:
|
||||
active_speakers: List[int],
|
||||
frame_number: int,
|
||||
frame_width: int,
|
||||
frame_height: int
|
||||
) -> List[int]:
|
||||
"""
|
||||
Calculate the primary focus point based on detected faces and speakers.
|
||||
|
||||
IMPORTANT: This focuses on ONE person to avoid focusing on empty space (table).
|
||||
When multiple people are present, we pick the most relevant person, not average positions.
|
||||
Select THE single person to focus on.
|
||||
Priority: 1) Who is speaking, 2) Who is most centered in frame
|
||||
|
||||
Args:
|
||||
faces: List of detected faces
|
||||
active_speakers: Indices of faces that are speaking
|
||||
active_speakers: Indices of people currently speaking
|
||||
frame_number: Current frame number
|
||||
frame_width: Frame width for center calculation
|
||||
frame_height: Frame height for center calculation
|
||||
|
||||
Returns:
|
||||
List with single person index [idx], or empty list if no faces
|
||||
"""
|
||||
if not faces:
|
||||
self.current_selected_people = []
|
||||
return []
|
||||
|
||||
# If only 1 person, always focus on them
|
||||
if len(faces) == 1:
|
||||
self.current_selected_people = [0]
|
||||
return [0]
|
||||
|
||||
# Check if we can switch people (cooldown period)
|
||||
frames_since_last_switch = frame_number - self.last_switch_frame
|
||||
can_switch = frames_since_last_switch >= self.person_switch_cooldown
|
||||
|
||||
# Calculate frame center for distance comparison
|
||||
frame_center_x = frame_width / 2
|
||||
frame_center_y = frame_height / 2
|
||||
|
||||
# ULTRA-STABLE MODE: Select ONE person at start, NEVER switch
|
||||
# This completely eliminates switching-related instability
|
||||
desired_person_idx = None
|
||||
|
||||
# If we already have someone selected, ALWAYS KEEP THEM (never switch)
|
||||
if self.current_selected_people and len(self.current_selected_people) > 0:
|
||||
current_idx = self.current_selected_people[0]
|
||||
if current_idx < len(faces):
|
||||
# Current person still detected - keep them
|
||||
desired_person_idx = current_idx
|
||||
else:
|
||||
# Current person lost - try to find them again by position/size similarity
|
||||
# This handles temporary detection failures
|
||||
current_person_found = False
|
||||
if self.previous_faces and current_idx < len(self.previous_faces):
|
||||
prev_face = self.previous_faces[current_idx]
|
||||
# Find most similar face by position and size
|
||||
best_match_idx = None
|
||||
best_match_score = float('inf')
|
||||
for idx, face in enumerate(faces):
|
||||
# Distance between centers
|
||||
dx = face.center_x - prev_face.center_x
|
||||
dy = face.center_y - prev_face.center_y
|
||||
dist = np.sqrt(dx**2 + dy**2)
|
||||
# Size similarity
|
||||
size_diff = abs(face.width - prev_face.width) + abs(face.height - prev_face.height)
|
||||
score = dist + size_diff * 0.5
|
||||
if score < best_match_score:
|
||||
best_match_score = score
|
||||
best_match_idx = idx
|
||||
|
||||
if best_match_idx is not None and best_match_score < 1000:
|
||||
desired_person_idx = best_match_idx
|
||||
current_person_found = True
|
||||
|
||||
if not current_person_found:
|
||||
# Really lost - select most confident
|
||||
face_confidences = [(idx, face.confidence) for idx, face in enumerate(faces)]
|
||||
face_confidences.sort(key=lambda x: x[1], reverse=True)
|
||||
desired_person_idx = face_confidences[0][0]
|
||||
logger.warning(f"Current person permanently lost - selecting new: {desired_person_idx}")
|
||||
else:
|
||||
# First frame - select most confident person ONCE
|
||||
face_confidences = [(idx, face.confidence) for idx, face in enumerate(faces)]
|
||||
face_confidences.sort(key=lambda x: x[1], reverse=True)
|
||||
desired_person_idx = face_confidences[0][0]
|
||||
logger.info(f"INITIAL SELECTION - Person {desired_person_idx} (will be tracked throughout entire video)")
|
||||
|
||||
# IGNORE SPEECH DETECTION - it was causing instability
|
||||
# We now track ONE person from start to finish, regardless of who speaks
|
||||
|
||||
# OLD LOGIC (commented out - was causing issues):
|
||||
# This logic would switch based on "who is more centered" which caused constant switching
|
||||
if False: # Disabled
|
||||
# Calculate distance from center for each face
|
||||
center_distances = []
|
||||
for idx, face in enumerate(faces):
|
||||
# Euclidean distance from frame center
|
||||
dx = face.center_x - frame_center_x
|
||||
dy = face.center_y - frame_center_y
|
||||
distance = np.sqrt(dx**2 + dy**2)
|
||||
center_distances.append((idx, distance, face.confidence))
|
||||
|
||||
# Sort by distance (closest first), then by confidence as tiebreaker
|
||||
center_distances.sort(key=lambda x: (x[1], -x[2]))
|
||||
most_centered_idx = center_distances[0][0]
|
||||
most_centered_distance = center_distances[0][1]
|
||||
|
||||
# STICKY BEHAVIOR: If we already have someone selected, only switch if:
|
||||
# - New person is SIGNIFICANTLY more centered (30% closer to center)
|
||||
# - OR current person is now very far from center (>40% of frame width)
|
||||
if self.current_selected_people and len(self.current_selected_people) > 0:
|
||||
current_idx = self.current_selected_people[0]
|
||||
if current_idx < len(faces):
|
||||
current_face = faces[current_idx]
|
||||
current_dx = current_face.center_x - frame_center_x
|
||||
current_dy = current_face.center_y - frame_center_y
|
||||
current_distance = np.sqrt(current_dx**2 + current_dy**2)
|
||||
|
||||
# Define "significantly better" threshold
|
||||
max_acceptable_distance = frame_width * 0.4 # 40% of frame width
|
||||
improvement_threshold = 0.7 # New person must be 30% closer (0.7 ratio)
|
||||
|
||||
# Keep current person if they're still reasonably centered
|
||||
if current_distance < max_acceptable_distance:
|
||||
# Current person is still acceptable - only switch if new is MUCH better
|
||||
if most_centered_distance < current_distance * improvement_threshold:
|
||||
desired_person_idx = most_centered_idx
|
||||
logger.debug(f"Switching: new person MUCH more centered ({most_centered_distance:.0f} vs {current_distance:.0f})")
|
||||
else:
|
||||
desired_person_idx = current_idx # Keep current
|
||||
logger.debug(f"Keeping current person: still reasonably centered ({current_distance:.0f} px from center)")
|
||||
else:
|
||||
# Current person is too far from center - switch
|
||||
desired_person_idx = most_centered_idx
|
||||
logger.debug(f"Current person too far from center ({current_distance:.0f} px), switching")
|
||||
else:
|
||||
# Current selection invalid
|
||||
desired_person_idx = most_centered_idx
|
||||
else:
|
||||
# First time - select most centered
|
||||
desired_person_idx = most_centered_idx
|
||||
|
||||
# Wrap in list for compatibility with existing code
|
||||
desired_people = [desired_person_idx] if desired_person_idx is not None else []
|
||||
|
||||
# ULTRA-STABLE MODE: NO SWITCHING LOGIC AT ALL
|
||||
# Simply set the person and never change
|
||||
if not self.current_selected_people:
|
||||
# First time only
|
||||
self.current_selected_people = desired_people
|
||||
self.last_switch_frame = frame_number
|
||||
logger.info(f"Frame {frame_number}: LOCKED ON person {desired_people} - will never switch")
|
||||
else:
|
||||
# Already have someone - just update to desired (which is same person due to logic above)
|
||||
self.current_selected_people = desired_people
|
||||
|
||||
return self.current_selected_people.copy()
|
||||
|
||||
def _ensure_distinct_people(
|
||||
self,
|
||||
faces: List[FaceDetection],
|
||||
people_indices: List[int]
|
||||
) -> List[int]:
|
||||
"""
|
||||
Ensure selected people are distinct by checking minimum distance between them.
|
||||
Prevents showing the same person twice due to duplicate detection.
|
||||
|
||||
Args:
|
||||
faces: List of detected faces
|
||||
people_indices: Indices of people to validate
|
||||
|
||||
Returns:
|
||||
List of distinct people indices (max 2)
|
||||
"""
|
||||
if len(people_indices) <= 1:
|
||||
return people_indices
|
||||
|
||||
distinct_people = []
|
||||
|
||||
for idx in people_indices:
|
||||
if idx >= len(faces):
|
||||
continue
|
||||
|
||||
current_face = faces[idx]
|
||||
is_distinct = True
|
||||
|
||||
# Check if this person is too close to any already selected person
|
||||
for selected_idx in distinct_people:
|
||||
selected_face = faces[selected_idx]
|
||||
|
||||
# Calculate distance between face centers
|
||||
dx = current_face.center_x - selected_face.center_x
|
||||
dy = current_face.center_y - selected_face.center_y
|
||||
distance = np.sqrt(dx**2 + dy**2)
|
||||
|
||||
# Also check overlap via IoU (Intersection over Union)
|
||||
x1_overlap = max(current_face.x, selected_face.x)
|
||||
y1_overlap = max(current_face.y, selected_face.y)
|
||||
x2_overlap = min(current_face.x + current_face.width, selected_face.x + selected_face.width)
|
||||
y2_overlap = min(current_face.y + current_face.height, selected_face.y + selected_face.height)
|
||||
|
||||
overlap_area = 0
|
||||
if x1_overlap < x2_overlap and y1_overlap < y2_overlap:
|
||||
overlap_area = (x2_overlap - x1_overlap) * (y2_overlap - y1_overlap)
|
||||
|
||||
# Calculate areas
|
||||
area1 = current_face.width * current_face.height
|
||||
area2 = selected_face.width * selected_face.height
|
||||
min_area = min(area1, area2)
|
||||
|
||||
# If faces are very close OR significantly overlapping, they're likely the same person
|
||||
# Minimum distance: 1/4 of average face width
|
||||
min_distance = (current_face.width + selected_face.width) / 8
|
||||
overlap_threshold = 0.3 # 30% overlap
|
||||
|
||||
if distance < min_distance or (min_area > 0 and overlap_area / min_area > overlap_threshold):
|
||||
is_distinct = False
|
||||
logger.debug(f"Person {idx} too similar to person {selected_idx} (dist={distance:.1f}, overlap={overlap_area/min_area if min_area > 0 else 0:.2%})")
|
||||
break
|
||||
|
||||
if is_distinct:
|
||||
distinct_people.append(idx)
|
||||
|
||||
# Stop at 2 distinct people
|
||||
if len(distinct_people) >= 2:
|
||||
break
|
||||
|
||||
# If we couldn't find 2 distinct people, return at most 1
|
||||
if len(distinct_people) < 2 and len(people_indices) >= 2:
|
||||
logger.debug(f"Only {len(distinct_people)} distinct person(s) found from {len(people_indices)} detections")
|
||||
|
||||
return distinct_people
|
||||
|
||||
def _calculate_focus_point(
|
||||
self,
|
||||
faces: List[FaceDetection],
|
||||
selected_people: List[int]
|
||||
) -> Optional[Tuple[int, int]]:
|
||||
"""
|
||||
Calculate the primary focus point based on selected people with temporal smoothing.
|
||||
|
||||
Args:
|
||||
faces: List of detected faces
|
||||
selected_people: Indices of people selected for display
|
||||
|
||||
Returns:
|
||||
(x, y) tuple of focus center, or None if no faces
|
||||
"""
|
||||
if not faces:
|
||||
if not faces or not selected_people:
|
||||
return None
|
||||
|
||||
if active_speakers:
|
||||
speaker_faces = [faces[i] for i in active_speakers if i < len(faces)]
|
||||
if speaker_faces:
|
||||
primary_speaker = max(speaker_faces, key=lambda f: f.confidence)
|
||||
return (primary_speaker.center_x, primary_speaker.center_y)
|
||||
# Calculate raw focus point
|
||||
raw_focus_x = 0
|
||||
raw_focus_y = 0
|
||||
|
||||
most_confident = max(faces, key=lambda f: f.confidence)
|
||||
return (most_confident.center_x, most_confident.center_y)
|
||||
if len(selected_people) == 1:
|
||||
# Single person - focus on them
|
||||
if selected_people[0] < len(faces):
|
||||
primary = faces[selected_people[0]]
|
||||
raw_focus_x = primary.center_x
|
||||
raw_focus_y = primary.center_y
|
||||
else:
|
||||
# Fallback
|
||||
most_confident = max(faces, key=lambda f: f.confidence)
|
||||
raw_focus_x = most_confident.center_x
|
||||
raw_focus_y = most_confident.center_y
|
||||
else:
|
||||
# Multiple people - focus on the CENTER between them for stability
|
||||
# This prevents jarring movements when switching focus between people
|
||||
valid_people = [idx for idx in selected_people if idx < len(faces)]
|
||||
if valid_people:
|
||||
centers_x = [faces[idx].center_x for idx in valid_people]
|
||||
centers_y = [faces[idx].center_y for idx in valid_people]
|
||||
raw_focus_x = int(np.mean(centers_x))
|
||||
raw_focus_y = int(np.mean(centers_y))
|
||||
else:
|
||||
# Fallback
|
||||
most_confident = max(faces, key=lambda f: f.confidence)
|
||||
raw_focus_x = most_confident.center_x
|
||||
raw_focus_y = most_confident.center_y
|
||||
|
||||
# Apply temporal smoothing using focus history
|
||||
self.focus_history.append((raw_focus_x, raw_focus_y))
|
||||
if len(self.focus_history) > self.focus_history_size:
|
||||
self.focus_history.pop(0)
|
||||
|
||||
# Calculate smoothed focus as weighted average (more weight to recent frames)
|
||||
if len(self.focus_history) > 1:
|
||||
# Exponential weights: recent frames have more influence
|
||||
weights = [2 ** i for i in range(len(self.focus_history))]
|
||||
total_weight = sum(weights)
|
||||
|
||||
smoothed_x = sum(x * w for (x, y), w in zip(self.focus_history, weights)) / total_weight
|
||||
smoothed_y = sum(y * w for (x, y), w in zip(self.focus_history, weights)) / total_weight
|
||||
|
||||
return (int(smoothed_x), int(smoothed_y))
|
||||
else:
|
||||
return (raw_focus_x, raw_focus_y)
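
The smoothing above weights the newest focus points most heavily (weights of 2**i over the last five samples). A small self-contained sketch of that weighting, with illustrative names:

from collections import deque
from typing import Deque, Tuple

def smooth_focus(history: Deque[Tuple[int, int]], point: Tuple[int, int],
                 max_len: int = 5) -> Tuple[int, int]:
    """Append `point`, trim the history, and return the exponentially weighted average."""
    history.append(point)
    while len(history) > max_len:
        history.popleft()
    weights = [2 ** i for i in range(len(history))]    # newest sample gets the largest weight
    total = sum(weights)
    x = sum(p[0] * w for p, w in zip(history, weights)) / total
    y = sum(p[1] * w for p, w in zip(history, weights)) / total
    return int(x), int(y)

focus_history: Deque[Tuple[int, int]] = deque()
for raw in [(100, 200), (120, 200), (400, 210)]:       # sudden jump on the last frame
    print(smooth_focus(focus_history, raw))             # the jump is damped rather than followed instantly
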
def close(self):
|
||||
"""Release resources."""
|
||||
self.detector.close()
|
||||
# Clear tracking state to free memory
|
||||
self.previous_faces.clear()
|
||||
self.current_selected_people.clear()
|
||||
self.focus_history.clear()
|
||||
|
||||
@@ -141,8 +141,8 @@ class OpenRouterCopywriter:
logger.warning(f"Highlight ignorado: muito curto ({duration}s, minimo 45s)")
continue

if duration > 120:
logger.warning(f"Highlight ignorado: muito longo ({duration}s, maximo 120s)")
if duration > 90:
logger.warning(f"Highlight ignorado: muito longo ({duration}s, maximo 90s)")
continue

if not summary:

@@ -50,7 +50,10 @@ class MediaPreparer:
existing_children = list(workspace_dir.iterdir())
if existing_children:
logger.info("Limpando workspace existente para %s", sanitized_name)
remove_paths(existing_children)
try:
remove_paths(existing_children)
except Exception as e:
logger.warning(f"Não foi possível limpar workspace (não crítico): {e}")

if temp_transcription_json and temp_transcription_json.exists():
shutil.move(str(temp_transcription_json), str(transcription_json))
@@ -66,7 +69,10 @@ class MediaPreparer:
output_dir = ensure_workspace(self.settings.outputs_dir, sanitized_name)
existing_outputs = list(output_dir.iterdir())
if existing_outputs:
remove_paths(existing_outputs)
try:
remove_paths(existing_outputs)
except Exception as e:
logger.warning(f"Não foi possível limpar outputs antigos (não crítico): {e}")

audio_path = workspace_dir / "audio.wav"
extract_audio_to_wav(working_video_path, audio_path)

@@ -107,6 +107,9 @@ class VideoPipeline:
TranscriptionService.persist(transcription, context.workspace.workspace_dir)
context.transcription = transcription

# Unload Whisper model immediately after transcription to free memory (1-3GB)
self.transcriber.unload_model()

def _determine_highlights(self, context: PipelineContext) -> None:
if not context.transcription:
raise RuntimeError("Transcricao nao disponivel")

@@ -345,7 +345,9 @@ class VideoRenderer:
|
||||
target_width=settings.rendering.frame_width,
|
||||
target_height=settings.rendering.frame_height,
|
||||
frame_skip=settings.rendering.smart_framing_frame_skip,
|
||||
smoothing_window=settings.rendering.smart_framing_smoothing_window
|
||||
smoothing_window=settings.rendering.smart_framing_smoothing_window,
|
||||
max_velocity=settings.rendering.smart_framing_max_velocity,
|
||||
person_switch_cooldown=settings.rendering.smart_framing_person_switch_cooldown
|
||||
)
|
||||
|
||||
def render(
|
||||
@@ -436,12 +438,10 @@ class VideoRenderer:
|
||||
audio_samples=audio_samples
|
||||
)
|
||||
|
||||
# Apply smart framing based on detected layout
|
||||
use_split_screen = framing_plan.layout_mode in ["dual_split", "grid"]
|
||||
# Apply smart framing (always single-person focus)
|
||||
video_clip = self.smart_framer.apply_framing(
|
||||
video_clip=subclip,
|
||||
framing_plan=framing_plan,
|
||||
use_split_screen=use_split_screen
|
||||
framing_plan=framing_plan
|
||||
)
|
||||
|
||||
logger.info(f"Smart framing applied: layout={framing_plan.layout_mode}, "
|
||||
@@ -602,6 +602,10 @@ class VideoRenderer:
|
||||
if audio_clip is not None and audio_needs_close:
|
||||
audio_clip.close()
|
||||
|
||||
# Force garbage collection to free memory after rendering
|
||||
import gc
|
||||
gc.collect()
|
||||
|
||||
return str(output_path)
|
||||
|
||||
def _materialize_audio(
|
||||
|
||||
@@ -46,21 +46,20 @@ class SmartFramer:
|
||||
self,
|
||||
target_width: int = 1080,
|
||||
target_height: int = 1920,
|
||||
frame_skip: int = 2,
|
||||
smoothing_window: int = 15
|
||||
frame_skip: int = 1,
|
||||
smoothing_window: int = 30,
|
||||
max_velocity: int = 20,
|
||||
person_switch_cooldown: int = 999999
|
||||
):
|
||||
self.target_width = target_width
|
||||
self.target_height = target_height
|
||||
self.target_aspect = target_height / target_width
|
||||
|
||||
# Performance parameters
|
||||
self.frame_skip = frame_skip # Process every Nth frame (CPU optimization)
|
||||
|
||||
# Smoothing parameters
|
||||
self.frame_skip = frame_skip
|
||||
self.smoothing_window = smoothing_window
|
||||
self.max_velocity = 30 # pixels per frame (reduced for smoother transitions)
|
||||
self.max_velocity = max_velocity
|
||||
self.person_switch_cooldown = person_switch_cooldown
|
||||
|
||||
logger.info(f"Smart framer initialized (target: {target_width}x{target_height}, frame_skip={frame_skip})")
|
||||
logger.info(f"Smart framer initialized (target: {target_width}x{target_height}, frame_skip={frame_skip}, smoothing={smoothing_window}, velocity={max_velocity}, cooldown={person_switch_cooldown})")
|
||||
|
||||
def create_framing_plan(
|
||||
self,
|
||||
@@ -81,25 +80,21 @@ class SmartFramer:
|
||||
Returns:
|
||||
FramingPlan with all frame contexts and crop regions
|
||||
"""
|
||||
analyzer = ContextAnalyzer()
|
||||
analyzer = ContextAnalyzer(person_switch_cooldown=self.person_switch_cooldown)
|
||||
|
||||
# Detect speaking periods from audio if available
|
||||
speaking_periods = None
|
||||
if audio_samples is not None:
|
||||
speaking_periods = analyzer.audio_detector.detect_speaking_periods(audio_samples)
|
||||
|
||||
# Open video with error suppression for AV1 codec warnings
|
||||
import os
|
||||
os.environ['OPENCV_FFMPEG_CAPTURE_OPTIONS'] = 'loglevel;quiet'
|
||||
|
||||
cap = cv2.VideoCapture(video_path)
|
||||
fps = cap.get(cv2.CAP_PROP_FPS)
|
||||
|
||||
# Calculate frame range
|
||||
start_frame = int(start_time * fps)
|
||||
end_frame = int(end_time * fps)
|
||||
|
||||
# Set to start frame
|
||||
cap.set(cv2.CAP_PROP_POS_FRAMES, start_frame)
|
||||
|
||||
frame_contexts = []
|
||||
@@ -113,7 +108,6 @@ class SmartFramer:
|
||||
if not ret:
|
||||
break
|
||||
|
||||
# Only process every Nth frame for performance (CPU optimization)
|
||||
if processed_count % self.frame_skip == 0:
|
||||
timestamp = frame_number / fps
|
||||
context = analyzer.analyze_frame(frame, timestamp, frame_number, speaking_periods)
|
||||
@@ -122,35 +116,36 @@ class SmartFramer:
|
||||
frame_number += 1
|
||||
processed_count += 1
|
||||
|
||||
# Get video dimensions before releasing capture
|
||||
source_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
|
||||
source_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
|
||||
|
||||
cap.release()
|
||||
analyzer.close()
|
||||
|
||||
# Determine overall layout mode (most common)
|
||||
layout_modes = [ctx.layout_mode for ctx in frame_contexts]
|
||||
if layout_modes:
|
||||
overall_layout = max(set(layout_modes), key=layout_modes.count)
|
||||
else:
|
||||
overall_layout = "single"
|
||||
|
||||
# Calculate crop regions based on contexts
|
||||
|
||||
crop_regions = self._calculate_crop_regions(
|
||||
frame_contexts,
|
||||
source_width,
|
||||
source_height
|
||||
)
|
||||
|
||||
return FramingPlan(
|
||||
framing_plan = FramingPlan(
|
||||
frame_contexts=frame_contexts,
|
||||
crop_regions=crop_regions,
|
||||
layout_mode=overall_layout,
|
||||
fps=fps
|
||||
)
|
||||
|
||||
import gc
|
||||
gc.collect()
|
||||
|
||||
return framing_plan
|
||||
|
||||
def _calculate_crop_regions(
|
||||
self,
|
||||
contexts: List[FrameContext],
|
||||
@@ -171,66 +166,122 @@ class SmartFramer:
|
||||
if not contexts:
|
||||
return []
|
||||
|
||||
# Calculate ideal crop dimensions maintaining EXACT 9:16 aspect ratio
|
||||
source_aspect = source_width / source_height
|
||||
|
||||
if source_aspect > self.target_aspect:
|
||||
# Source is wider - crop horizontally (use full height)
|
||||
crop_height = source_height
|
||||
crop_width = int(crop_height / self.target_aspect)
|
||||
|
||||
# Ensure crop width fits within source
|
||||
if crop_width > source_width:
|
||||
crop_width = source_width
|
||||
crop_height = int(crop_width * self.target_aspect)
|
||||
else:
|
||||
# Source is taller - crop vertically (use full width)
|
||||
crop_width = source_width
|
||||
crop_height = int(crop_width * self.target_aspect)
|
||||
|
||||
# Ensure crop height fits within source
|
||||
if crop_height > source_height:
|
||||
crop_height = source_height
|
||||
crop_width = int(crop_height / self.target_aspect)
|
||||
|
||||
# Calculate center points for each frame
|
||||
# Since we now always focus on ONE person directly (not averaging),
|
||||
# we can use the focus point directly without complex validation
|
||||
center_xs = []
|
||||
center_ys = []
|
||||
safe_zone_margin_x = crop_width * 0.40
|
||||
safe_zone_margin_y = crop_height * 0.40
|
||||
|
||||
for ctx in contexts:
|
||||
if ctx.primary_focus:
|
||||
# Primary focus is now always a single person's center, never averaged
|
||||
# This means it will never be on the table/empty space
|
||||
center_xs.append(ctx.primary_focus[0])
|
||||
center_ys.append(ctx.primary_focus[1])
|
||||
dead_zone_threshold = 100
|
||||
|
||||
if contexts and contexts[0].primary_focus:
|
||||
current_crop_center_x = contexts[0].primary_focus[0]
|
||||
current_crop_center_y = contexts[0].primary_focus[1]
|
||||
else:
|
||||
current_crop_center_x = source_width // 2
|
||||
current_crop_center_y = source_height // 2
|
||||
|
||||
center_xs = [current_crop_center_x]
|
||||
center_ys = [current_crop_center_y]
|
||||
|
||||
for ctx in contexts[1:]:
|
||||
if ctx.primary_focus and ctx.selected_people and len(ctx.detected_faces) > 0:
|
||||
primary_person_idx = ctx.selected_people[0] if ctx.selected_people else 0
|
||||
if primary_person_idx < len(ctx.detected_faces):
|
||||
face = ctx.detected_faces[primary_person_idx]
|
||||
|
||||
face_left = face.x
|
||||
face_right = face.x + face.width
|
||||
face_top = face.y
|
||||
face_bottom = face.y + face.height
|
||||
|
||||
crop_left = current_crop_center_x - crop_width // 2
|
||||
crop_right = current_crop_center_x + crop_width // 2
|
||||
crop_top = current_crop_center_y - crop_height // 2
|
||||
crop_bottom = current_crop_center_y + crop_height // 2
|
||||
|
||||
face_rel_left = face_left - crop_left
|
||||
face_rel_right = face_right - crop_left
|
||||
face_rel_top = face_top - crop_top
|
||||
face_rel_bottom = face_bottom - crop_top
|
||||
|
||||
face_left_safe = face_rel_left >= safe_zone_margin_x
|
||||
face_right_safe = face_rel_right <= (crop_width - safe_zone_margin_x)
|
||||
face_top_safe = face_rel_top >= safe_zone_margin_y
|
||||
face_bottom_safe = face_rel_bottom <= (crop_height - safe_zone_margin_y)
|
||||
|
||||
face_fully_visible = face_left_safe and face_right_safe and face_top_safe and face_bottom_safe
|
||||
|
||||
if face_fully_visible:
|
||||
center_xs.append(current_crop_center_x)
|
||||
center_ys.append(current_crop_center_y)
|
||||
else:
|
||||
shift_x = 0
|
||||
shift_y = 0
|
||||
|
||||
if not face_left_safe:
|
||||
shift_x = face_rel_left - safe_zone_margin_x
|
||||
elif not face_right_safe:
|
||||
shift_x = face_rel_right - (crop_width - safe_zone_margin_x)
|
||||
|
||||
if not face_top_safe:
|
||||
shift_y = face_rel_top - safe_zone_margin_y
|
||||
elif not face_bottom_safe:
|
||||
shift_y = face_rel_bottom - (crop_height - safe_zone_margin_y)
|
||||
|
||||
if abs(shift_x) > dead_zone_threshold:
|
||||
current_crop_center_x += shift_x
|
||||
if abs(shift_y) > dead_zone_threshold:
|
||||
current_crop_center_y += shift_y
|
||||
|
||||
center_xs.append(current_crop_center_x)
|
||||
center_ys.append(current_crop_center_y)
|
||||
else:
|
||||
center_xs.append(current_crop_center_x)
|
||||
center_ys.append(current_crop_center_y)
|
||||
else:
|
||||
# Default to center only if no faces detected at all
|
||||
center_xs.append(source_width // 2)
|
||||
center_ys.append(source_height // 2)
|
||||
center_xs.append(current_crop_center_x)
|
||||
center_ys.append(current_crop_center_y)
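
The loop above is the containment-tracking rule: the crop center moves only when the tracked face leaves the 40% safe-zone margin, only by the overshoot, and only if that overshoot exceeds the 100 px dead zone. A one-axis sketch of the rule; the function name and sample values are illustrative:

def contain_axis(face_lo: float, face_hi: float, crop_center: float,
                 crop_size: float, margin_frac: float = 0.40,
                 dead_zone: float = 100.0) -> float:
    """Shift `crop_center` along one axis only when the face exits the inner safe zone."""
    margin = crop_size * margin_frac
    crop_lo = crop_center - crop_size / 2
    rel_lo = face_lo - crop_lo                       # face edges relative to the crop window
    rel_hi = face_hi - crop_lo
    shift = 0.0
    if rel_lo < margin:                              # face pushed past the left/top margin
        shift = rel_lo - margin
    elif rel_hi > crop_size - margin:                # face pushed past the right/bottom margin
        shift = rel_hi - (crop_size - margin)
    return crop_center + shift if abs(shift) > dead_zone else crop_center

print(contain_axis(500, 700, crop_center=600.0, crop_size=1080))   # 600.0 - face inside safe zone, camera holds
print(contain_axis(20, 220, crop_center=600.0, crop_size=1080))    # 128.0 - camera pans just enough to re-contain
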
# Smooth the center points
|
||||
if len(center_xs) > self.smoothing_window:
|
||||
kernel_size = min(self.smoothing_window, len(center_xs))
|
||||
if kernel_size % 2 == 0:
|
||||
kernel_size -= 1
|
||||
if len(center_xs) > 1:
|
||||
alpha = 0.002
|
||||
smoothed_xs = [center_xs[0]]
|
||||
smoothed_ys = [center_ys[0]]
|
||||
for i in range(1, len(center_xs)):
|
||||
if center_xs[i] != center_xs[i-1] or center_ys[i] != center_ys[i-1]:
|
||||
smoothed_xs.append(alpha * center_xs[i] + (1 - alpha) * smoothed_xs[i-1])
|
||||
smoothed_ys.append(alpha * center_ys[i] + (1 - alpha) * smoothed_ys[i-1])
|
||||
else:
|
||||
smoothed_xs.append(smoothed_xs[i-1])
|
||||
smoothed_ys.append(smoothed_ys[i-1])
|
||||
center_xs = smoothed_xs
|
||||
center_ys = smoothed_ys
|
||||
|
||||
center_xs = signal.medfilt(center_xs, kernel_size=kernel_size).tolist()
|
||||
center_ys = signal.medfilt(center_ys, kernel_size=kernel_size).tolist()
|
||||
center_xs = self._limit_velocity(center_xs, 2)
|
||||
center_ys = self._limit_velocity(center_ys, 2)
|
||||
|
||||
# Limit velocity (prevent jarring movements)
|
||||
center_xs = self._limit_velocity(center_xs, self.max_velocity)
|
||||
center_ys = self._limit_velocity(center_ys, self.max_velocity)
|
||||
center_xs = self._apply_dead_zone(center_xs, 5)
|
||||
center_ys = self._apply_dead_zone(center_ys, 5)
|
||||
|
||||
# Convert to crop regions
|
||||
crop_regions = []
|
||||
for center_x, center_y in zip(center_xs, center_ys):
|
||||
# Calculate top-left corner
|
||||
x = int(center_x - crop_width // 2)
|
||||
y = int(center_y - crop_height // 2)
|
||||
|
||||
# Clamp to valid bounds
|
||||
x = max(0, min(x, source_width - crop_width))
|
||||
y = max(0, min(y, source_height - crop_height))
|
||||
|
||||
@@ -241,8 +292,37 @@ class SmartFramer:
|
||||
height=crop_height
|
||||
))
|
||||
|
||||
center_xs.clear()
|
||||
center_ys.clear()
|
||||
|
||||
return crop_regions
|
||||
|
||||
def _apply_dead_zone(self, positions: List[float], threshold: float) -> List[float]:
|
||||
"""
|
||||
Apply dead zone to eliminate micro-movements.
|
||||
If change is smaller than threshold, keep previous position.
|
||||
|
||||
Args:
|
||||
positions: List of positions
|
||||
threshold: Minimum change needed to move (pixels)
|
||||
|
||||
Returns:
|
||||
Positions with dead zone applied
|
||||
"""
|
||||
if len(positions) <= 1:
|
||||
return positions
|
||||
|
||||
filtered = [positions[0]]
|
||||
|
||||
for i in range(1, len(positions)):
|
||||
delta = abs(positions[i] - filtered[i - 1])
|
||||
if delta < threshold:
|
||||
filtered.append(filtered[i - 1])
|
||||
else:
|
||||
filtered.append(positions[i])
|
||||
|
||||
return filtered
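
A quick standalone check of the dead-zone behaviour: jitter below the threshold is frozen, while genuine moves pass through unchanged (values are illustrative):

def apply_dead_zone(positions, threshold):
    """Same filtering rule as above: keep the previous value for sub-threshold changes."""
    out = positions[:1]
    for p in positions[1:]:
        out.append(out[-1] if abs(p - out[-1]) < threshold else p)
    return out

print(apply_dead_zone([500.0, 502.0, 503.0, 560.0, 561.0, 503.0], 5))
# -> [500.0, 500.0, 500.0, 560.0, 560.0, 503.0]
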
def _limit_velocity(self, positions: List[float], max_velocity: float) -> List[float]:
|
||||
"""
|
||||
Limit the velocity of position changes.
|
||||
@@ -271,33 +351,20 @@ class SmartFramer:
|
||||
def apply_framing(
|
||||
self,
|
||||
video_clip: VideoFileClip,
|
||||
framing_plan: FramingPlan,
|
||||
use_split_screen: bool = False
|
||||
framing_plan: FramingPlan
|
||||
) -> VideoClip:
|
||||
"""
|
||||
Apply smart framing to a video clip.
|
||||
Always uses single-person focus (no split screen).
|
||||
|
||||
Args:
|
||||
video_clip: Source video clip
|
||||
framing_plan: Framing plan to apply
|
||||
use_split_screen: Whether to use split screen for multiple people
|
||||
|
||||
Returns:
|
||||
Reframed video clip
|
||||
"""
|
||||
# Handle different layout modes
|
||||
if framing_plan.layout_mode in ["single", "single_speaker"]:
|
||||
# Single person or single speaker - use focused single framing
|
||||
return self._apply_single_framing(video_clip, framing_plan)
|
||||
elif framing_plan.layout_mode == "dual_split" and use_split_screen:
|
||||
# Two people in conversation - use split screen
|
||||
return self._apply_split_screen(video_clip, framing_plan)
|
||||
elif framing_plan.layout_mode == "grid" and use_split_screen:
|
||||
# 3+ people - use grid layout
|
||||
return self._apply_grid_layout(video_clip, framing_plan)
|
||||
else:
|
||||
# Fallback to single framing
|
||||
return self._apply_single_framing(video_clip, framing_plan)
|
||||
return self._apply_single_framing(video_clip, framing_plan)
|
||||
|
||||
def _apply_single_framing(
|
||||
self,
|
||||
@@ -315,12 +382,9 @@ class SmartFramer:
|
||||
Reframed video clip
|
||||
"""
|
||||
def make_frame(t):
|
||||
# Get the original frame
|
||||
frame = video_clip.get_frame(t)
|
||||
|
||||
# Ensure we have valid crop regions
|
||||
if not framing_plan.crop_regions:
|
||||
# Fallback: return center crop
|
||||
h, w = frame.shape[:2]
|
||||
crop_h = int(w * self.target_aspect)
|
||||
crop_w = w
|
||||
@@ -331,41 +395,32 @@ class SmartFramer:
|
||||
x = (w - crop_w) // 2
|
||||
cropped = frame[y:y + crop_h, x:x + crop_w]
|
||||
else:
|
||||
# Calculate exact frame index with decimal precision for interpolation
|
||||
exact_frame_idx = (t * framing_plan.fps) / self.frame_skip
|
||||
|
||||
# Get the two adjacent analyzed frames
|
||||
idx_floor = int(exact_frame_idx)
|
||||
idx_ceil = idx_floor + 1
|
||||
|
||||
# Interpolation factor (0.0 to 1.0)
|
||||
alpha = exact_frame_idx - idx_floor
|
||||
|
||||
# Clamp indices to valid range
|
||||
idx_floor = max(0, min(idx_floor, len(framing_plan.crop_regions) - 1))
|
||||
idx_ceil = max(0, min(idx_ceil, len(framing_plan.crop_regions) - 1))
|
||||
|
||||
# Get crop regions
|
||||
crop1 = framing_plan.crop_regions[idx_floor]
|
||||
crop2 = framing_plan.crop_regions[idx_ceil]
|
||||
|
||||
# Linear interpolation between crop regions
|
||||
x = int(crop1.x * (1 - alpha) + crop2.x * alpha)
|
||||
y = int(crop1.y * (1 - alpha) + crop2.y * alpha)
|
||||
width = int(crop1.width * (1 - alpha) + crop2.width * alpha)
|
||||
height = int(crop1.height * (1 - alpha) + crop2.height * alpha)
|
||||
|
||||
# Ensure crop stays within frame bounds
|
||||
h, w = frame.shape[:2]
|
||||
x = max(0, min(x, w - width))
|
||||
y = max(0, min(y, h - height))
|
||||
width = min(width, w - x)
|
||||
height = min(height, h - y)
|
||||
|
||||
# Crop the frame
|
||||
cropped = frame[y:y + height, x:x + width]
|
||||
|
||||
# Resize to target dimensions
|
||||
resized = cv2.resize(
|
||||
cropped,
|
||||
(self.target_width, self.target_height),
|
||||
@@ -374,7 +429,6 @@ class SmartFramer:
|
||||
|
||||
return resized
|
||||
|
||||
# MoviePy 2.x compatible way to create VideoClip
|
||||
new_clip = VideoClip(duration=video_clip.duration)
|
||||
new_clip.size = (self.target_width, self.target_height)
|
||||
new_clip.frame_function = make_frame
|
||||
@@ -397,13 +451,10 @@ class SmartFramer:
|
||||
"""
|
||||
def make_frame(t):
|
||||
frame = video_clip.get_frame(t)
|
||||
# Calculate exact frame index with decimal precision for smooth interpolation
|
||||
exact_frame_idx = (t * framing_plan.fps) / self.frame_skip
|
||||
frame_idx = int(exact_frame_idx)
|
||||
|
||||
# Ensure we have valid contexts
|
||||
if not framing_plan.frame_contexts:
|
||||
# Fallback to simple center crop
|
||||
h, w = frame.shape[:2]
|
||||
crop_h = int(w * self.target_aspect)
|
||||
crop_w = w
|
||||
@@ -415,107 +466,81 @@ class SmartFramer:
|
||||
cropped = frame[y:y + crop_h, x:x + crop_w]
|
||||
return cv2.resize(cropped, (self.target_width, self.target_height), interpolation=cv2.INTER_LINEAR)
|
||||
|
||||
# Clamp index to valid range
|
||||
frame_idx = max(0, min(frame_idx, len(framing_plan.frame_contexts) - 1))
|
||||
context = framing_plan.frame_contexts[frame_idx]
|
||||
|
||||
# Create output frame
|
||||
output = np.zeros((self.target_height, self.target_width, 3), dtype=np.uint8)
|
||||
|
||||
if len(context.detected_faces) >= 2:
|
||||
# Split vertically 50/50 (two columns)
|
||||
half_width = self.target_width // 2
|
||||
if context.selected_people and len(context.selected_people) >= 2:
|
||||
selected_faces = [context.detected_faces[i] for i in context.selected_people[:2]
|
||||
if i < len(context.detected_faces)]
|
||||
|
||||
# Select the 2 most relevant faces
|
||||
# Priority: ALWAYS show active speaker first + most confident other person
|
||||
if context.active_speakers and len(context.active_speakers) >= 1:
|
||||
# Get the PRIMARY speaker (most confident among active speakers)
|
||||
speaker_faces = [context.detected_faces[i] for i in context.active_speakers
|
||||
if i < len(context.detected_faces)]
|
||||
if len(selected_faces) >= 2:
|
||||
faces = sorted(selected_faces, key=lambda f: f.center_x)
|
||||
left_face = faces[0]
|
||||
right_face = faces[1]
|
||||
|
||||
primary_speaker = max(speaker_faces, key=lambda f: f.confidence)
|
||||
for idx, face in enumerate([left_face, right_face]):
|
||||
|
||||
# Get OTHER faces (not the primary speaker)
|
||||
other_faces = [f for f in context.detected_faces if f != primary_speaker]
|
||||
half_width = self.target_width // 2
|
||||
half_aspect = self.target_height / half_width # Aspect ratio for half
|
||||
|
||||
if len(speaker_faces) >= 2:
|
||||
# Multiple speakers: show primary + second most confident speaker
|
||||
other_speakers = [f for f in speaker_faces if f != primary_speaker]
|
||||
secondary_person = max(other_speakers, key=lambda f: f.confidence)
|
||||
elif other_faces:
|
||||
# One speaker: show speaker + most confident other person
|
||||
secondary_person = max(other_faces, key=lambda f: f.confidence)
|
||||
else:
|
||||
# Fallback: only one person detected
|
||||
secondary_person = primary_speaker
|
||||
face_width = max(face.width, frame.shape[1] // 4) # At least 1/4 of frame width
|
||||
crop_width = int(face_width * 2.5) # Add padding around face
|
||||
crop_height = int(crop_width * half_aspect) # Maintain correct aspect
|
||||
|
||||
selected_faces = [primary_speaker, secondary_person]
|
||||
max_crop_width = frame.shape[1] // 2 # Half the source width
|
||||
max_crop_height = frame.shape[0] # Full source height
|
||||
|
||||
if crop_width > max_crop_width:
|
||||
crop_width = max_crop_width
|
||||
crop_height = int(crop_width * half_aspect)
|
||||
|
||||
if crop_height > max_crop_height:
|
||||
crop_height = max_crop_height
|
||||
crop_width = int(crop_height / half_aspect)
|
||||
|
||||
x = max(0, face.center_x - crop_width // 2)
|
||||
y = max(0, face.center_y - crop_height // 2)
|
||||
|
||||
x = min(x, frame.shape[1] - crop_width)
|
||||
y = min(y, frame.shape[0] - crop_height)
|
||||
|
||||
cropped = frame[y:y + crop_height, x:x + crop_width]
|
||||
resized = cv2.resize(
|
||||
cropped,
|
||||
(half_width, self.target_height),
|
||||
interpolation=cv2.INTER_LINEAR
|
||||
)
|
||||
|
||||
x_offset = idx * half_width
|
||||
output[:, x_offset:x_offset + half_width] = resized
|
||||
else:
|
||||
# No speakers: take 2 most confident faces
|
||||
selected_faces = sorted(context.detected_faces, key=lambda f: f.confidence, reverse=True)[:2]
|
||||
|
||||
# Sort selected faces by horizontal position for consistent left/right placement
|
||||
faces = sorted(selected_faces, key=lambda f: f.center_x)
|
||||
left_face = faces[0]
|
||||
right_face = faces[1]
|
||||
|
||||
# Process each person's frame
|
||||
for idx, face in enumerate([left_face, right_face]):
|
||||
# Calculate crop region focused on this person
|
||||
# Each person gets half the width, full target aspect ratio (9:16)
|
||||
# This ensures NO distortion when resizing
|
||||
|
||||
# For split screen: each side is half_width x full_height
|
||||
# We need to maintain 9:16 aspect for each half
|
||||
half_width = self.target_width // 2
|
||||
half_aspect = self.target_height / half_width # Aspect ratio for half
|
||||
|
||||
# Determine crop size based on face with padding
|
||||
face_width = max(face.width, frame.shape[1] // 4) # At least 1/4 of frame width
|
||||
crop_width = int(face_width * 2.5) # Add padding around face
|
||||
crop_height = int(crop_width * half_aspect) # Maintain correct aspect
|
||||
|
||||
# Ensure crop fits in frame, maintaining aspect ratio
|
||||
max_crop_width = frame.shape[1] // 2 # Half the source width
|
||||
max_crop_height = frame.shape[0] # Full source height
|
||||
|
||||
# If crop is too wide, scale down proportionally
|
||||
if crop_width > max_crop_width:
|
||||
crop_width = max_crop_width
|
||||
crop_height = int(crop_width * half_aspect)
|
||||
|
||||
# If crop is too tall, scale down proportionally
|
||||
if crop_height > max_crop_height:
|
||||
crop_height = max_crop_height
|
||||
crop_width = int(crop_height / half_aspect)
|
||||
|
||||
# Center crop on face
|
||||
x = max(0, face.center_x - crop_width // 2)
|
||||
y = max(0, face.center_y - crop_height // 2)
|
||||
|
||||
# Clamp to frame boundaries
|
||||
x = min(x, frame.shape[1] - crop_width)
|
||||
y = min(y, frame.shape[0] - crop_height)
|
||||
|
||||
# Extract and resize crop
|
||||
cropped = frame[y:y + crop_height, x:x + crop_width]
|
||||
resized = cv2.resize(
|
||||
if framing_plan.crop_regions:
|
||||
crop_idx = max(0, min(frame_idx, len(framing_plan.crop_regions) - 1))
|
||||
crop = framing_plan.crop_regions[crop_idx]
|
||||
cropped = frame[crop.y:crop.y + crop.height, crop.x:crop.x + crop.width]
|
||||
else:
|
||||
h, w = frame.shape[:2]
|
||||
crop_h = int(w * self.target_aspect)
|
||||
crop_w = w
|
||||
if crop_h > h:
|
||||
crop_h = h
|
||||
crop_w = int(h / self.target_aspect)
|
||||
y = (h - crop_h) // 2
|
||||
x = (w - crop_w) // 2
|
||||
cropped = frame[y:y + crop_h, x:x + crop_w]
|
||||
output = cv2.resize(
|
||||
cropped,
|
||||
(half_width, self.target_height),
|
||||
(self.target_width, self.target_height),
|
||||
interpolation=cv2.INTER_LINEAR
|
||||
)
|
||||
|
||||
# Place in output at appropriate horizontal position
|
||||
x_offset = idx * half_width
|
||||
output[:, x_offset:x_offset + half_width] = resized
|
||||
else:
|
||||
# Fall back to single framing
|
||||
if framing_plan.crop_regions:
|
||||
crop_idx = max(0, min(frame_idx, len(framing_plan.crop_regions) - 1))
|
||||
crop = framing_plan.crop_regions[crop_idx]
|
||||
cropped = frame[crop.y:crop.y + crop.height, crop.x:crop.x + crop.width]
|
||||
else:
|
||||
# Fallback to center crop if no crop regions available
|
||||
h, w = frame.shape[:2]
|
||||
crop_h = int(w * self.target_aspect)
|
||||
crop_w = w
|
||||
@@ -533,7 +558,6 @@ class SmartFramer:
|
||||
|
||||
return output
|
||||
|
||||
# MoviePy 2.x compatible way to create VideoClip
|
||||
new_clip = VideoClip(duration=video_clip.duration)
|
||||
new_clip.size = (self.target_width, self.target_height)
|
||||
new_clip.frame_function = make_frame
|
||||
@@ -556,13 +580,10 @@ class SmartFramer:
|
||||
"""
|
||||
def make_frame(t):
|
||||
frame = video_clip.get_frame(t)
|
||||
# Calculate exact frame index with decimal precision for smooth interpolation
|
||||
exact_frame_idx = (t * framing_plan.fps) / self.frame_skip
|
||||
frame_idx = int(exact_frame_idx)
|
||||
|
||||
# Ensure we have valid contexts
|
||||
if not framing_plan.frame_contexts:
|
||||
# Fallback to simple center crop
|
||||
h, w = frame.shape[:2]
|
||||
crop_h = int(w * self.target_aspect)
|
||||
crop_w = w
|
||||
@@ -574,7 +595,6 @@ class SmartFramer:
|
||||
cropped = frame[y:y + crop_h, x:x + crop_w]
|
||||
return cv2.resize(cropped, (self.target_width, self.target_height), interpolation=cv2.INTER_LINEAR)
|
||||
|
||||
# Clamp index to valid range
|
||||
frame_idx = max(0, min(frame_idx, len(framing_plan.frame_contexts) - 1))
|
||||
context = framing_plan.frame_contexts[frame_idx]
|
||||
|
||||
@@ -583,23 +603,18 @@ class SmartFramer:
|
||||
num_faces = len(context.detected_faces)
|
||||
|
||||
if num_faces >= 3:
|
||||
# Create 2x2 grid
|
||||
cell_width = self.target_width // 2
|
||||
cell_height = self.target_height // 2
|
||||
|
||||
for idx, face in enumerate(context.detected_faces[:4]):
|
||||
# Calculate grid position
|
||||
row = idx // 2
|
||||
col = idx % 2
|
||||
|
||||
# Each grid cell maintains aspect ratio (square in this case: cell_width = cell_height)
|
||||
cell_aspect = cell_height / cell_width
|
||||
|
||||
# Crop around face with correct aspect ratio
|
||||
crop_width = frame.shape[1] // 2
|
||||
crop_height = int(crop_width * cell_aspect)
|
||||
|
||||
# Ensure crop fits in frame, maintaining aspect
|
||||
max_crop_width = frame.shape[1] // 2
|
||||
max_crop_height = frame.shape[0] // 2
|
||||
|
||||
@@ -611,11 +626,9 @@ class SmartFramer:
|
||||
crop_height = max_crop_height
crop_width = int(crop_height / cell_aspect)

# Center crop on face
x = max(0, face.center_x - crop_width // 2)
y = max(0, face.center_y - crop_height // 2)

# Clamp to frame boundaries
x = min(x, frame.shape[1] - crop_width)
y = min(y, frame.shape[0] - crop_height)

@@ -626,18 +639,15 @@ class SmartFramer:
interpolation=cv2.INTER_LINEAR
)

# Place in grid
y_offset = row * cell_height
x_offset = col * cell_width
output[y_offset:y_offset + cell_height, x_offset:x_offset + cell_width] = resized
else:
# Fall back to single framing
if framing_plan.crop_regions:
crop_idx = max(0, min(frame_idx, len(framing_plan.crop_regions) - 1))
crop = framing_plan.crop_regions[crop_idx]
cropped = frame[crop.y:crop.y + crop.height, crop.x:crop.x + crop.width]
else:
# Fallback to center crop if no crop regions available
h, w = frame.shape[:2]
crop_h = int(w * self.target_aspect)
crop_w = w
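
A compact sketch of the 2x2 grid composition used when three or more faces are detected. The source frame, the face centers, and the 1080x1920 target are made up for illustration.

import cv2
import numpy as np

target_width, target_height = 1080, 1920            # assumed 9:16 output
cell_w, cell_h = target_width // 2, target_height // 2
cell_aspect = cell_h / cell_w

frame = np.zeros((1080, 1920, 3), dtype=np.uint8)    # dummy source frame
face_centers = [(300, 400), (900, 350), (1500, 500), (700, 800)]  # made-up faces

output = np.zeros((target_height, target_width, 3), dtype=np.uint8)
for idx, (cx, cy) in enumerate(face_centers[:4]):
    row, col = idx // 2, idx % 2
    crop_w = frame.shape[1] // 2
    crop_h = int(crop_w * cell_aspect)
    if crop_h > frame.shape[0]:                      # keep the cell aspect when clamping
        crop_h = frame.shape[0]
        crop_w = int(crop_h / cell_aspect)
    x = min(max(0, cx - crop_w // 2), frame.shape[1] - crop_w)
    y = min(max(0, cy - crop_h // 2), frame.shape[0] - crop_h)
    resized = cv2.resize(frame[y:y + crop_h, x:x + crop_w], (cell_w, cell_h),
                         interpolation=cv2.INTER_LINEAR)
    output[row * cell_h:(row + 1) * cell_h, col * cell_w:(col + 1) * cell_w] = resized
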
@@ -655,7 +665,6 @@ class SmartFramer:

return output

# MoviePy 2.x compatible way to create VideoClip
new_clip = VideoClip(duration=video_clip.duration)
new_clip.size = (self.target_width, self.target_height)
new_clip.frame_function = make_frame
@@ -6,6 +6,7 @@ from dataclasses import dataclass
from pathlib import Path
from typing import List, Optional

import numpy as np
from faster_whisper import WhisperModel

from video_render.config import Settings
@@ -56,6 +57,17 @@ class TranscriptionService:
)
return self._model

def unload_model(self) -> None:
"""Unload the Whisper model to free memory (reduces RAM usage by 1-3GB)."""
if self._model is not None:
logger.info("Unloading Whisper model to free memory...")
del self._model
self._model = None
# Force garbage collection to immediately free GPU/CPU memory
import gc
gc.collect()
logger.info("Whisper model unloaded successfully")
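
A hedged usage sketch of why unload_model exists: transcribe first, drop the model, then run the memory-heavy render step. The orchestration function and render_fn are illustrative, not part of the service.

import gc

def run_pipeline(service, audio_path, workdir, render_fn):
    # service is assumed to be a TranscriptionService-like object.
    result = service.transcribe(audio_path, workdir)  # loads the Whisper model lazily
    service.unload_model()                            # release the 1-3 GB model before rendering
    gc.collect()                                      # redundant with unload_model, kept for clarity
    return render_fn(result)
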
def transcribe(self, audio_path: Path, output_dir: Optional[Path] = None) -> TranscriptionResult:
if output_dir is not None:
existing_transcription = self.load(output_dir)
@@ -63,7 +75,34 @@ class TranscriptionService:
logger.info("Transcription already exists at %s, reusing...", output_dir)
return existing_transcription

logger.info("Starting audio transcription with FasterWhisper...")
# Get audio duration to decide if we need chunked processing
audio_duration = self._get_audio_duration(audio_path)
chunk_duration_minutes = 30  # Process in 30-minute chunks for long videos
chunk_duration_seconds = chunk_duration_minutes * 60

# For videos longer than 30 minutes, use chunked processing to avoid OOM
if audio_duration > chunk_duration_seconds:
logger.info(
f"Long audio detected ({audio_duration/60:.1f} min). "
f"Processing in {chunk_duration_minutes}-minute chunks to avoid out-of-memory errors..."
)
return self._transcribe_chunked(audio_path, chunk_duration_seconds)
else:
logger.info(f"Starting transcription of the audio ({audio_duration/60:.1f} min) with FasterWhisper...")
return self._transcribe_full(audio_path)
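
A worked example of the 30-minute threshold: a 75-minute audio exceeds it, so it is split into ceil(75 / 30) = 3 chunks with the boundaries computed below.

import math

chunk_duration_seconds = 30 * 60
audio_duration = 75 * 60  # assumed input length in seconds

if audio_duration > chunk_duration_seconds:
    num_chunks = math.ceil(audio_duration / chunk_duration_seconds)
    bounds = [
        (i * chunk_duration_seconds, min((i + 1) * chunk_duration_seconds, audio_duration))
        for i in range(num_chunks)
    ]
    print(num_chunks, bounds)  # 3 [(0, 1800), (1800, 3600), (3600, 4500)]
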
def _get_audio_duration(self, audio_path: Path) -> float:
"""Get audio duration in seconds."""
try:
from moviepy.audio.io.AudioFileClip import AudioFileClip
with AudioFileClip(str(audio_path)) as audio:
return audio.duration or 0.0
except Exception as e:
logger.warning(f"Failed to get audio duration, assuming it is short: {e}")
return 0.0  # Assume short if we can't determine
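
An alternative sketch (not what the service does above): ffprobe can report the duration without opening the file through MoviePy, which avoids importing MoviePy just for this check.

import subprocess

def probe_duration(path: str) -> float:
    cmd = [
        "ffprobe", "-v", "error",
        "-show_entries", "format=duration",
        "-of", "default=noprint_wrappers=1:nokey=1",
        path,
    ]
    try:
        out = subprocess.run(cmd, check=True, capture_output=True, text=True).stdout.strip()
        return float(out)
    except (subprocess.CalledProcessError, ValueError):
        return 0.0  # same "assume short" fallback as the method above
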
def _transcribe_full(self, audio_path: Path) -> TranscriptionResult:
"""Transcribe entire audio at once (for shorter videos)."""
model = self._load_model()
segments, _ = model.transcribe(
str(audio_path),
@@ -97,6 +136,101 @@ class TranscriptionService:
full_text=" ".join(full_text_parts).strip(),
)
def _transcribe_chunked(self, audio_path: Path, chunk_duration: float) -> TranscriptionResult:
"""Transcribe audio in chunks to avoid OOM on long videos."""
import subprocess
from moviepy.audio.io.AudioFileClip import AudioFileClip

model = self._load_model()
all_segments: List[TranscriptSegment] = []
full_text_parts: List[str] = []
segment_id_counter = 0

# Get total duration
total_duration = self._get_audio_duration(audio_path)
num_chunks = int(np.ceil(total_duration / chunk_duration))

logger.info(f"Processing audio in {num_chunks} chunks...")

for chunk_idx in range(num_chunks):
start_time = chunk_idx * chunk_duration
end_time = min((chunk_idx + 1) * chunk_duration, total_duration)

logger.info(
f"Processing chunk {chunk_idx + 1}/{num_chunks} "
f"({start_time/60:.1f}min - {end_time/60:.1f}min)..."
)

# Extract chunk using ffmpeg directly (more reliable than moviepy subclip)
temp_chunk_path = audio_path.parent / f"temp_chunk_{chunk_idx}.wav"
try:
# Use ffmpeg to extract the chunk
chunk_duration_actual = end_time - start_time
ffmpeg_cmd = [
'ffmpeg',
'-y',  # Overwrite output file
'-ss', str(start_time),  # Start time
'-i', str(audio_path),  # Input file
'-t', str(chunk_duration_actual),  # Duration
'-acodec', 'pcm_s16le',  # Audio codec
'-ar', '44100',  # Sample rate
'-ac', '2',  # Stereo
'-loglevel', 'error',  # Only show errors
str(temp_chunk_path)
]

subprocess.run(ffmpeg_cmd, check=True, capture_output=True)
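
One design note on the command above: placing -ss before -i makes ffmpeg seek on the input side before decoding, so extraction stays fast even for chunks deep into a long recording. The command built for the second chunk would look roughly like this (paths are placeholders):

start_time = 1800.0           # chunk index 1 of a 30-minute chunking
chunk_len = 1800.0
ffmpeg_cmd = [
    "ffmpeg", "-y",
    "-ss", str(start_time),   # input-side seek: jump before decoding
    "-i", "/tmp/audio.wav",   # placeholder input path
    "-t", str(chunk_len),
    "-acodec", "pcm_s16le", "-ar", "44100", "-ac", "2",
    "-loglevel", "error",
    "/tmp/temp_chunk_1.wav",  # placeholder output path
]
print(" ".join(ffmpeg_cmd))
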
# Transcribe chunk
segments, _ = model.transcribe(
str(temp_chunk_path),
beam_size=5,
word_timestamps=True,
)

# Process segments with time offset
for segment in segments:
words = [
WordTiming(
start=w.start + start_time,
end=w.end + start_time,
word=w.word.strip()
)
for w in segment.words or []
if w.word.strip()
]
text = segment.text.strip()
full_text_parts.append(text)
all_segments.append(
TranscriptSegment(
id=segment_id_counter,
start=segment.start + start_time,
end=segment.end + start_time,
text=text,
words=words,
)
)
segment_id_counter += 1

# Force garbage collection after each chunk
import gc
gc.collect()

except subprocess.CalledProcessError as e:
logger.error(f"Error extracting chunk {chunk_idx}: {e.stderr.decode() if e.stderr else str(e)}")
raise
finally:
# Clean up temp chunk
if temp_chunk_path.exists():
temp_chunk_path.unlink()

logger.info(f"Chunked transcription finished: {len(all_segments)} segments processed")

return TranscriptionResult(
segments=all_segments,
full_text=" ".join(full_text_parts).strip(),
)
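
The key detail in the loop above is the start_time offset: each chunk's Whisper timestamps are local to that chunk, so shifting them by the chunk start keeps the merged transcript on a single timeline. A tiny illustration with made-up values:

chunk_start = 1800.0  # second chunk of a 30-minute chunking
local_segments = [(0.4, 2.1, "hello"), (2.5, 4.0, "world")]  # pretend Whisper output

global_segments = [(s + chunk_start, e + chunk_start, text) for s, e, text in local_segments]
print(global_segments)  # [(1800.4, 1802.1, 'hello'), (1802.5, 1804.0, 'world')]
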
@staticmethod
def persist(result: TranscriptionResult, destination: Path) -> None:
json_path = destination / "transcription.json"
@@ -23,16 +23,58 @@ def ensure_workspace(root: Path, folder_name: str) -> Path:


def remove_paths(paths: Iterable[Path]) -> None:
import logging
import time

logger = logging.getLogger(__name__)

for path in paths:
if not path.exists():
continue
if path.is_file() or path.is_symlink():
path.unlink(missing_ok=True)
else:
for child in sorted(path.rglob("*"), reverse=True):
if child.is_file() or child.is_symlink():
child.unlink(missing_ok=True)
elif child.is_dir():
child.rmdir()
path.rmdir()
# Try to remove with retries and better error handling
max_retries = 3
for attempt in range(max_retries):
try:
if path.is_file() or path.is_symlink():
path.unlink(missing_ok=True)
else:
for child in sorted(path.rglob("*"), reverse=True):
if child.is_file() or child.is_symlink():
try:
child.unlink(missing_ok=True)
except PermissionError:
logger.warning(f"Could not delete {child}: permission denied")
# Try to change permissions and retry
try:
child.chmod(0o777)
child.unlink(missing_ok=True)
except Exception as e:
logger.warning(f"Failed to force-delete {child}: {e}")
elif child.is_dir():
try:
child.rmdir()
except (PermissionError, OSError) as e:
logger.warning(f"Could not remove directory {child}: {e}")

try:
path.rmdir()
except (PermissionError, OSError) as e:
logger.warning(f"Could not remove directory {path}: {e}")
break  # Success, exit retry loop

except PermissionError as e:
if attempt < max_retries - 1:
logger.warning(f"Attempt {attempt + 1}/{max_retries} to delete {path} failed: {e}. Retrying...")
time.sleep(0.5)  # Wait a bit before retry
# Try to change permissions
try:
path.chmod(0o777)
except Exception:
pass
else:
logger.error(f"Could not delete {path} after {max_retries} attempts: {e}")
except Exception as e:
logger.error(f"Unexpected error while deleting {path}: {e}")
break  # Don't retry on unexpected errors
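
A more compact alternative to the retry loop above (not the commit's code): shutil.rmtree with an error hook that relaxes permissions and retries the failed operation once.

import logging
import os
import shutil
from pathlib import Path

logger = logging.getLogger(__name__)

def _retry_rm(func, failed_path, excinfo):
    # Called by rmtree on failure: loosen permissions, then retry the same call once.
    try:
        os.chmod(failed_path, 0o777)
        func(failed_path)
    except OSError as exc:
        logger.warning("Could not delete %s: %s", failed_path, exc)

def remove_path(path: Path) -> None:
    if not path.exists():
        return
    if path.is_file() or path.is_symlink():
        path.unlink(missing_ok=True)
    else:
        shutil.rmtree(path, onerror=_retry_rm)
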