Create new components

LeoMortari
2025-10-20 17:56:36 -03:00
parent 2b99d2ad78
commit b090f7c2cb
38 changed files with 1391 additions and 1024 deletions

.gitignore

@@ -1,98 +0,0 @@
__pycache__/
*.py[cod]
*$py.class
*.so
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
*.manifest
*.spec
pip-log.txt
pip-delete-this-directory.txt
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/
*.mo
*.pot
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
instance/
.webassets-cache
.scrapy
docs/_build/
.pybuilder/
target/
.ipynb_checkpoints
profile_default/
ipython_config.py
.pdm.toml
__pypackages__/
celerybeat-schedule
celerybeat.pid
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
.spyderproject
.spyproject
.ropeproject
/site
.mypy_cache/
.dmypy.json
dmypy.json
.pyre/
.pytype/
cython_debug/
.idea/
.vscode/
*.code-workspace
*.local
*.mp4
*.wav
*.mp3
*.srt
*.vtt
*.json
*.csv
*.xlsx
*.db
*.sqlite3


@@ -1 +0,0 @@
"""Top-level package for the video processing pipeline."""

4 binary files changed (contents not shown).

docker-compose.yml

@@ -1,3 +1,8 @@
+# GEMINI_API_KEY="AIzaSyB5TPjSPPZG1Qb6EtblhKFAjvCOdY15rcw"
+# YOUTUBE_API="https://totally-real-dingo.ngrok-free.app"
+# OPENROUTER_API_KEY="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"
+# OPENROUTER_MODEL="openai/gpt-oss-20b:free"
 services:
   video-render-new:
     restart: unless-stopped
@@ -6,19 +11,13 @@ services:
     environment:
       # RabbitMQ credentials
       - RABBITMQ_PASS=${RABBITMQ_PASS}
-      - RABBITMQ_HOST=${RABBITMQ_HOST}
-      - RABBITMQ_USER=${RABBITMQ_USER}
-      - RABBITMQ_PORT=${RABBITMQ_PORT}
-      - RABBITMQ_QUEUE=${RABBITMQ_QUEUE}
-      - RABBITMQ_UPLOAD_QUEUE=${RABBITMQ_UPLOAD_QUEUE}
-      # API keys for the LLMs
       - GEMINI_API_KEY=${GEMINI_API_KEY}
-      - GEMINI_MODEL=${GEMINI_MODEL:-gemini-2.5-pro}
       - OPENROUTER_API_KEY=${OPENROUTER_API_KEY}
-      - OPENROUTER_MODEL=${OPENROUTER_MODEL}
+      - OPENROUTER_MODEL=${OPENROUTER_MODEL:-openai/gpt-oss-20b:free}
-      # Optional whisper settings
-      - WHISPER_MODEL=${WHISPER_MODEL}
-      - WHISPER_DEVICE=${WHISPER_DEVICE}
-      - WHISPER_COMPUTE_TYPE=${WHISPER_COMPUTE_TYPE}
+      - FASTER_WHISPER_MODEL_SIZE=${FASTER_WHISPER_MODEL_SIZE:-small}
+    ports:
+      - "5000:5000"
     volumes:
       # Mount host directories into the container so that videos can be
       # provided and outputs collected. These paths can be customised when
@@ -27,9 +26,18 @@ services:
       - "/root/videos:/app/videos"
       - "/root/outputs:/app/outputs"
     command: "python -u main.py"
-    networks:
-      - dokploy-network
-networks:
-  dokploy-network:
-    external: true
+    # runtime: nvidia
+    # networks:
+    #   - dokploy-network
+    # deploy:
+    #   resources:
+    #     reservations:
+    #       devices:
+    #         - driver: nvidia
+    #           count: all
+    #           capabilities: [gpu]
+# networks:
+#   dokploy-network:
+#     external: true

Dockerfile

@@ -21,6 +21,10 @@ RUN apt-get update && \
    xdg-utils \
    wget \
    unzip \
+   ffmpeg \
+   libgomp1 \
+   libpq-dev \
+   vim \
    libmagick++-dev \
    imagemagick \
    fonts-liberation \

llm.py

@@ -1,234 +0,0 @@
"""High-level helpers for interacting with the Gemini and OpenRouter APIs.
This module encapsulates all of the logic needed to call the LLM endpoints
used throughout the application. It uses the OpenAI Python client under the
hood because both Gemini and OpenRouter expose OpenAI-compatible APIs.
Two functions are exposed:
* ``select_highlights`` takes an SRT-like string (the transcription of a
video) and returns a list of highlight objects with start and end
timestamps and their corresponding text. It uses the Gemini model to
identify which parts of the video are most likely to engage viewers on
social media.
* ``generate_titles`` takes a list of highlight objects and returns a list
of the same objects enriched with a ``topText`` field, which contains a
sensational title for the clip. It uses the OpenRouter API with a model
specified via the ``OPENROUTER_MODEL`` environment variable.
Both functions are resilient to malformed outputs from the models. They try
to extract the first JSON array found in the model responses; if that
fails, a descriptive exception is raised. These exceptions should be
handled by callers to post appropriate error messages back to the queue.
"""
from __future__ import annotations
import json
import os
import re
from typing import Any, Dict, List
import openai
class LLMError(Exception):
"""Raised when the LLM response cannot be parsed into the expected format."""
def _extract_json_array(text: str) -> Any:
"""Extract the first JSON array from a string.
LLMs sometimes return explanatory text before or after the JSON. This
helper uses a regular expression to find the first substring that
resembles a JSON array (i.e. starts with '[' and ends with ']'). It
returns the corresponding Python object if successful, otherwise
raises a ``LLMError``.
"""
# Remove Markdown code fences and other formatting noise
cleaned = text.replace("`", "").replace("json", "")
# Find the first [ ... ] block
match = re.search(r"\[.*\]", cleaned, re.DOTALL)
if not match:
raise LLMError("Não foi possível encontrar um JSON válido na resposta da IA.")
json_str = match.group(0)
try:
return json.loads(json_str)
except json.JSONDecodeError as exc:
raise LLMError(f"Erro ao decodificar JSON: {exc}")
def select_highlights(srt_text: str) -> List[Dict[str, Any]]:
"""Call the Gemini API to select highlight segments from a transcription.
The input ``srt_text`` should be a string containing the transcription
formatted like an SRT file, with lines of the form
``00:00:10,140 --> 00:01:00,990`` followed by the spoken text.
Returns a list of dictionaries, each with ``start``, ``end`` and
``text`` keys. On failure to parse the response, a ``LLMError`` is
raised.
"""
api_key = os.environ.get("GEMINI_API_KEY")
if not api_key:
raise ValueError("GEMINI_API_KEY não definido no ambiente")
model = os.environ.get("GEMINI_MODEL", "gemini-2.5-flash")
# Initialise client for Gemini. The base_url points to the
# generativelanguage API; see the official docs for details.
client = openai.OpenAI(api_key=api_key, base_url="https://generativelanguage.googleapis.com/v1beta/openai/")
# System prompt: instructs Gemini how to behave.
system_prompt = (
"Você é um assistente especializado em selecionar **HIGHLIGHTS** de vídeo "
"a partir da transcrição com timestamps.\n"
"Sua única função é **selecionar os trechos** conforme solicitado.\n"
"- **Não resuma, não interprete, não gere comentários ou textos complementares.**\n"
"- **Retorne a resposta exatamente no formato proposto pelo usuário**, sem adicionar ou remover nada além do pedido.\n"
"- Cada trecho selecionado deve ter **no mínimo 60 segundos e no máximo 120 segundos** de duração.\n"
"- Sempre responda **em português (PT-BR)**."
)
# Base prompt: describes how to select highlights and the format to return.
base_prompt = (
"Você assumirá o papel de um especialista em Marketing e Social Media, "
"sua tarefa é selecionar as melhores partes de uma transcrição que irei fornecer.\n\n"
"## Critérios de Seleção\n\n"
"- Escolha trechos baseando-se em:\n"
" - **Picos de emoção ou impacto**\n"
" - **Viradas de assunto**\n"
" - **Punchlines** (frases de efeito, momentos de virada)\n"
" - **Informações-chave**\n\n"
"## Regras Rápidas\n\n"
"- Sempre devolver pelo menos 3 trechos, não possui limite máximo\n"
"- Garanta que cada trecho fique com no MÍNIMO 60 segundos e no MÁXIMO 120 segundos.\n"
"- Nenhum outro texto além do JSON final.\n\n"
"## Restrições de Duração\n\n"
"- **Duração mínima do trecho escolhido:** 60 segundos\n"
"- **Duração máxima do trecho escolhido:** 90 a 120 segundos\n\n"
"## Tarefa\n\n"
"- Proponha o **máximo de trechos** com potencial, mas **sempre devolva no mínimo 3 trechos**.\n"
"- Extraia os trechos **apenas** da transcrição fornecida abaixo.\n\n"
"## IMPORTANTE\n"
"- Cada trecho deve ter no mínimo 60 segundos, e no máximo 120 segundos. Isso é indiscutível\n\n"
"## Entrada\n\n"
"- Transcrição:\n\n"
f"{srt_text}\n\n"
"## Saída\n\n"
"- Retorne **somente** a lista de trechos selecionados em formato JSON, conforme o exemplo abaixo.\n"
"- **Não escreva comentários ou qualquer texto extra.**\n"
"- No atributo \"text\", inclua o texto presente no trecho escolhido.\n\n"
"### Exemplo de Conversão\n\n"
"#### De SRT:\n"
"00:00:10,140 --> 00:01:00,990\n"
"Exemplo de escrita presente no trecho\n\n"
"#### Para JSON:\n"
"[\n"
" {\n"
" \"start\": \"00:00:10,140\",\n"
" \"end\": \"00:01:00,990\",\n"
" \"text\": \"Exemplo de escrita presente no trecho\"\n"
" }\n"
"]\n"
)
messages = [
{"role": "system", "content": system_prompt},
{"role": "user", "content": base_prompt},
]
try:
response = client.chat.completions.create(model=model, messages=messages)
except Exception as exc:
raise LLMError(f"Erro ao chamar a API Gemini: {exc}")
# Extract message content
content = response.choices[0].message.content if response.choices else None
if not content:
raise LLMError("A resposta da Gemini veio vazia.")
result = _extract_json_array(content)
if not isinstance(result, list):
raise LLMError("O JSON retornado pela Gemini não é uma lista.")
return result
def generate_titles(highlights: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
"""Call the OpenRouter API to generate a title (topText) for each highlight.
The ``highlights`` argument should be a list of dictionaries as returned
by ``select_highlights``, each containing ``start``, ``end`` and ``text``.
This function adds a ``topText`` field to each dictionary using the
OpenRouter model specified via the ``OPENROUTER_MODEL`` environment
variable. If parsing fails, an ``LLMError`` is raised.
"""
api_key = os.environ.get("OPENROUTER_API_KEY")
if not api_key:
raise ValueError("OPENROUTER_API_KEY não definido no ambiente")
model = os.environ.get("OPENROUTER_MODEL")
if not model:
raise ValueError("OPENROUTER_MODEL não definido no ambiente")
# Create client for OpenRouter
client = openai.OpenAI(api_key=api_key, base_url="https://openrouter.ai/api/v1")
# Compose prompt: instruct to generate titles only
prompt_header = (
"Você é um especialista em Marketing Digital e Criação de Conteúdo Viral.\n\n"
"Sua tarefa é criar **títulos sensacionalistas** (*topText*) para cada trecho "
"de transcrição recebido em formato JSON.\n\n"
"## Instruções\n\n"
"- O texto deve ser **chamativo, impactante** e com alto potencial de viralização "
"em redes sociais, **mas sem sair do contexto do trecho**.\n"
"- Use expressões fortes e curiosas, mas **nunca palavras de baixo calão**.\n"
"- Cada *topText* deve ter **no máximo 2 linhas**.\n"
"- Utilize **exclusivamente** o conteúdo do trecho; não invente fatos.\n"
"- Não adicione comentários, explicações, ou qualquer texto extra na resposta.\n"
"- Responda **apenas** no seguinte formato (mantendo as chaves e colchetes):\n\n"
"[\n {\n \"start\": \"00:00:10,140\",\n \"end\": \"00:01:00,990\",\n \"topText\": \"Título impactante\"\n }\n]\n\n"
"## Observações:\n\n"
"- Nunca fuja do contexto do trecho.\n"
"- Não invente informações.\n"
"- Não utilize palavrões.\n"
"- Não escreva nada além do JSON de saída.\n\n"
"Aqui estão os trechos em JSON:\n"
)
# Compose input JSON for the model
json_input = json.dumps(highlights, ensure_ascii=False)
full_message = prompt_header + json_input
messages = [
{
"role": "system",
"content": "Você é um assistente útil e objetivo."
},
{
"role": "user",
"content": full_message
},
]
try:
response = client.chat.completions.create(
model=model,
messages=messages,
temperature=0.7,
)
except Exception as exc:
raise LLMError(f"Erro ao chamar a API OpenRouter: {exc}")
content = response.choices[0].message.content if response.choices else None
if not content:
raise LLMError("A resposta da OpenRouter veio vazia.")
result = _extract_json_array(content)
if not isinstance(result, list):
raise LLMError("O JSON retornado pela OpenRouter não é uma lista.")
# Merge topText back into highlights
# We assume the result list has the same order and length as input highlights
enriched: List[Dict[str, Any]] = []
input_map = {(item["start"], item["end"]): item for item in highlights}
for item in result:
key = (item.get("start"), item.get("end"))
original = input_map.get(key)
if original is None:
# If the model returns unexpected entries, skip them
continue
enriched_item = original.copy()
# Only topText is expected
enriched_item["topText"] = item.get("topText", "").strip()
enriched.append(enriched_item)
return enriched

main.py

@@ -1,265 +1,16 @@
"""Entry point for the video processing pipeline. from video_render.config import load_settings
from video_render.logging_utils import setup_logging
This script listens to a RabbitMQ queue for new video processing tasks. When from video_render.messaging import RabbitMQWorker
a message arrives, it performs the following steps: from video_render.pipeline import VideoPipeline
1. Creates a working directory for the video based off of its filename.
2. Extracts the audio track with FFMPEG and runs Faster-Whisper to produce
a transcription with word-level timestamps.
3. Uses the Gemini model to determine which parts of the video have the
highest potential for engagement. These highlight segments are
represented as a list of objects containing start/end timestamps and
text.
4. Uses the OpenRouter model to generate a sensational title for each
highlight. Only the ``topText`` field is kept; the description is
intentionally omitted since the caption will be burned into the video.
5. Cuts the original video into individual clips corresponding to each
highlight and renders them vertically with a title above and a dynamic
caption below.
6. Publishes a message to the upload queue with information about the
generated clips. On success, this message contains the list of output
files. On failure, ``hasError`` will be set to ``True`` and the
``error`` field will describe what went wrong.
7. Cleans up temporary files (audio, transcript, working directory) and
deletes the original source video from the ``videos`` directory to
conserve disk space.
The queue names and RabbitMQ credentials are configured via environment
variables. See the accompanying ``docker-compose.yml`` for defaults.
"""
from __future__ import annotations
import json
import os
import shutil
import time
import traceback
from typing import Any, Dict, List
import pika
from .utils import sanitize_filename, seconds_to_timestamp, timestamp_to_seconds
from .transcribe import transcribe
from .llm import LLMError, select_highlights, generate_titles
from .render import render_clip
+def main() -> None:
+    setup_logging()
+    settings = load_settings()
+    pipeline = VideoPipeline(settings)
+    worker = RabbitMQWorker(settings)
+    worker.consume_forever(pipeline.process_message)

# Environment variables with sensible defaults
RABBITMQ_HOST = os.environ.get("RABBITMQ_HOST", "rabbitmq")
RABBITMQ_PORT = int(os.environ.get("RABBITMQ_PORT", 5672))
RABBITMQ_USER = os.environ.get("RABBITMQ_USER", "admin")
RABBITMQ_PASS = os.environ.get("RABBITMQ_PASS")
RABBITMQ_QUEUE = os.environ.get("RABBITMQ_QUEUE", "to-render")
RABBITMQ_UPLOAD_QUEUE = os.environ.get("RABBITMQ_UPLOAD_QUEUE", "to-upload")

if not RABBITMQ_PASS:
    raise RuntimeError("RABBITMQ_PASS não definido no ambiente")
def get_next_message() -> Any:
"""Retrieve a single message from the RABBITMQ_QUEUE.
Returns ``None`` if no messages are available. This helper opens a new
connection for each call to avoid keeping stale connections alive.
"""
credentials = pika.PlainCredentials(RABBITMQ_USER, RABBITMQ_PASS)
parameters = pika.ConnectionParameters(
host=RABBITMQ_HOST,
port=RABBITMQ_PORT,
credentials=credentials,
heartbeat=60,
blocked_connection_timeout=300,
)
connection = pika.BlockingConnection(parameters)
channel = connection.channel()
method_frame, _, body = channel.basic_get(RABBITMQ_QUEUE)
if method_frame:
channel.basic_ack(method_frame.delivery_tag)
connection.close()
return body
connection.close()
return None
def publish_to_queue(payload: Dict[str, Any]) -> None:
"""Publish a JSON-serialisable payload to the RABBITMQ_UPLOAD_QUEUE."""
credentials = pika.PlainCredentials(RABBITMQ_USER, RABBITMQ_PASS)
parameters = pika.ConnectionParameters(
host=RABBITMQ_HOST,
port=RABBITMQ_PORT,
credentials=credentials,
heartbeat=60,
blocked_connection_timeout=300,
)
connection = pika.BlockingConnection(parameters)
channel = connection.channel()
channel.queue_declare(queue=RABBITMQ_UPLOAD_QUEUE, durable=True)
channel.basic_publish(
exchange="",
routing_key=RABBITMQ_UPLOAD_QUEUE,
body=json.dumps(payload),
properties=pika.BasicProperties(delivery_mode=2),
)
connection.close()
def build_srt(segments: List[Dict[str, Any]]) -> str:
"""Build an SRT-like string from a list of segments.
Each segment should have ``start``, ``end`` and ``text`` fields. The
timestamps are converted to the ``HH:MM:SS,mmm`` format expected by
the Gemini prompt. Segments are separated by a blank line.
"""
lines = []
for seg in segments:
start_ts = seconds_to_timestamp(seg["start"])
end_ts = seconds_to_timestamp(seg["end"])
lines.append(f"{start_ts} --> {end_ts}\n{seg['text']}")
return "\n\n".join(lines)
def process_message(data: Dict[str, Any]) -> Dict[str, Any]:
"""Process a single video task described in ``data``.
Returns the payload to be sent to the upload queue. Raises an
exception on failure; the caller is responsible for catching it and
posting an error payload.
"""
filename = data.get("filename")
if not filename:
raise ValueError("Campo 'filename' ausente na mensagem")
url = data.get("url")
video_id = data.get("videoId")
# Determine source video path; n8n stores videos in the 'videos' directory
video_path = os.path.join("videos", filename)
if not os.path.exists(video_path):
raise FileNotFoundError(f"Arquivo de vídeo não encontrado: {video_path}")
# Sanitize the filename to use as directory name
base_no_ext = os.path.splitext(filename)[0]
sanitized = sanitize_filename(base_no_ext)
work_dir = os.path.join("app", "videos", sanitized)
# Transcribe video
segments, words = transcribe(video_path, work_dir)
# Build SRT string
srt_str = build_srt(segments)
# Call Gemini to select highlights
highlights = select_highlights(srt_str)
# Convert start/end times to floats and keep original strings for openrouter
for item in highlights:
item["start"] = item["start"].strip()
item["end"] = item["end"].strip()
# Generate titles
titles = generate_titles(highlights)
# Render clips
output_dir = os.path.join("outputs", sanitized)
processed_files: List[str] = []
for idx, item in enumerate(titles, start=1):
start_sec = timestamp_to_seconds(item.get("start"))
end_sec = timestamp_to_seconds(item.get("end"))
# Extract relative words for caption
relative_words = []
for w in words:
# Word must overlap clip interval
if w["end"] <= start_sec or w["start"] >= end_sec:
continue
rel_start = max(0.0, w["start"] - start_sec)
rel_end = max(0.0, w["end"] - start_sec)
relative_words.append({
"start": rel_start,
"end": rel_end,
"word": w["word"],
})
# If no words found (e.g. silence), create a dummy word to avoid errors
if not relative_words:
relative_words.append({"start": 0.0, "end": end_sec - start_sec, "word": ""})
out_path = render_clip(
video_path=video_path,
start=start_sec,
end=end_sec,
top_text=item.get("topText", ""),
words=relative_words,
out_dir=output_dir,
base_name=sanitized,
idx=idx,
)
processed_files.append(out_path)
# Compose payload
payload = {
"videosProcessedQuantity": len(processed_files),
"filename": filename,
"processedFiles": processed_files,
"url": url,
"videoId": video_id,
"hasError": False,
"error": None,
}
# Clean up working directory and original video
shutil.rmtree(work_dir, ignore_errors=True)
try:
os.remove(video_path)
except FileNotFoundError:
pass
return payload
def main():
print(" [*] Esperando mensagens. Para sair: CTRL+C")
while True:
body = get_next_message()
if body is None:
time.sleep(5)
continue
try:
data = json.loads(body)
except Exception:
print("⚠️ Mensagem inválida recebida (não é JSON)")
continue
try:
result = process_message(data)
except Exception as exc:
# Print stack trace for debugging
traceback.print_exc()
# Attempt to clean up any directories based on filename
filename = data.get("filename")
sanitized = sanitize_filename(os.path.splitext(filename or "")[0]) if filename else ""
work_dir = os.path.join("app", "videos", sanitized) if sanitized else None
output_dir = os.path.join("outputs", sanitized) if sanitized else None
# Remove working and output directories
if work_dir:
shutil.rmtree(work_dir, ignore_errors=True)
if output_dir:
shutil.rmtree(output_dir, ignore_errors=True)
# Remove original video if present
video_path = os.path.join("videos", filename) if filename else None
if video_path and os.path.exists(video_path):
try:
os.remove(video_path)
except Exception:
pass
# Build error payload
error_payload = {
"videosProcessedQuantity": 0,
"filename": filename,
"processedFiles": [],
"url": data.get("url"),
"videoId": data.get("videoId"),
"hasError": True,
"error": str(exc),
}
try:
publish_to_queue(error_payload)
print(f"Mensagem de erro publicada na fila '{RABBITMQ_UPLOAD_QUEUE}'.")
except Exception as publish_err:
print(f"Erro ao publicar mensagem de erro: {publish_err}")
continue
# On success publish payload
try:
publish_to_queue(result)
print(f"Mensagem publicada na fila '{RABBITMQ_UPLOAD_QUEUE}'.")
except Exception as publish_err:
print(f"Erro ao publicar na fila '{RABBITMQ_UPLOAD_QUEUE}': {publish_err}")
# Loop continues
if __name__ == "__main__":

prompts/generate.txt Normal file

@@ -0,0 +1,35 @@
Voce e um estrategista de conteudo especializado em identificar cortes curtos de videos longos que performam bem em redes sociais.
FUNCAO:
- Analisar a transcricao completa de um video.
- Escolher trechos curtos (entre 20s e 90s) com maior chance de engajamento.
- Responder APENAS em JSON valido.
FORMATO DA RESPOSTA:
{
"highlights": [
{
"start": <segundos_inicio_float>,
"end": <segundos_fim_float>,
"summary": "Resumo conciso do porque este trecho engaja"
}
]
}
REGRAS:
- Liste no maximo 6 destaques.
- Respeite a ordem cronologica.
- Nunca deixe listas vazias; se nada for relevante, inclua uma entrada com start = 0, end = 0 e summary explicando a ausencia de cortes.
- Utilize apenas valores numericos simples (ponto como separador decimal).
- Nao repita um mesmo trecho.
PERSPECTIVA DE ANALISE:
- Concentre-se em momentos com gatilhos emocionais, insights, storytelling ou chamadas para acao fortes.
- Prefira trechos com comeco, meio e fim claros.
- Evite partes redundantes, silenciosas ou extremamente tecnicas.
TAREFA:
- Leia a transcricao recebida no campo "transcript".
- Use a lista de marcas de tempo detalhadas no campo "segments" para embasar suas escolhas.
- Produza a saida JSON descrita acima.
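
For reference, a sketch of an answer that satisfies the contract above, together with the kind of sanity checks a consumer could run against it (values are illustrative only):

import json

raw = '{"highlights": [{"start": 12.5, "end": 58.0, "summary": "Virada emocional com storytelling forte"}]}'

data = json.loads(raw)
highlights = data["highlights"]
assert 1 <= len(highlights) <= 6                                   # at most 6 highlights, never an empty list
assert all(h["end"] >= h["start"] for h in highlights)             # valid intervals
assert highlights == sorted(highlights, key=lambda h: h["start"])  # chronological order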

render.py

@@ -1,205 +0,0 @@
"""Rendering logic for producing vertical clips with dynamic captions.
This module defines a single function ``render_clip`` which takes a video
segment and produces a vertical clip suitable for social media. Each clip
contains three regions:
* A top region (480px high) showing a title generated by an LLM.
* A middle region (960px high) containing the original video, scaled to
fit horizontally while preserving aspect ratio and centred vertically.
* A bottom region (480px high) showing a dynamic caption. The caption
displays a sliding window of three to five words from the transcript,
colouring the currently spoken word differently to draw the viewer's
attention.
The function uses the MoviePy library to compose the various elements and
writes the resulting video to disk. It returns the path to the created
file.
"""
from __future__ import annotations
import os
from typing import Dict, List
import numpy as np
from moviepy.video.io.VideoFileClip import VideoFileClip
from moviepy.video.VideoClip import ColorClip, VideoClip
from moviepy.video.compositing.CompositeVideoClip import CompositeVideoClip
from moviepy.video.VideoClip import TextClip
from PIL import Image, ImageDraw, ImageFont
from .utils import wrap_text
def render_clip(
video_path: str,
start: float,
end: float,
top_text: str,
words: List[Dict[str, float]],
out_dir: str,
base_name: str,
idx: int,
# Use a widely available system font by default. DejaVuSans is installed
# in most Debian-based containers. The caller can override this path.
font_path: str = "/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf",
final_width: int = 1080,
final_height: int = 1920,
top_h: int = 480,
middle_h: int = 960,
bottom_h: int = 480,
video_codec: str = "libx264",
bitrate: str = "3000k",
) -> str:
"""Render a single clip with title and dynamic caption.
Parameters
----------
video_path: str
Path to the source video file.
start: float
Start time of the clip in seconds.
end: float
End time of the clip in seconds.
top_text: str
The title to display in the top region.
words: List[Dict[str, float]]
List of word-level timestamps for this clip. Each dict must have
``start``, ``end`` and ``word`` keys. The start and end values
should be relative to the beginning of this clip (i.e. start at 0).
out_dir: str
Directory where the output file should be saved. The function
creates this directory if it doesn't exist.
base_name: str
Base name of the original video (sanitized). Used to build the
output filename.
idx: int
Index of the clip. Output will be named ``clip_{idx}.mp4``.
font_path: str
Path to the TrueType font to use for both title and caption.
final_width: int
Width of the final video in pixels.
final_height: int
Height of the final video in pixels.
top_h: int
Height of the title area in pixels.
middle_h: int
Height of the video area in pixels.
bottom_h: int
Height of the caption area in pixels.
video_codec: str
FFmpeg codec to use when writing the video.
bitrate: str
Bitrate for the output video.
Returns
-------
str
The path to the rendered video file.
"""
os.makedirs(out_dir, exist_ok=True)
# Extract the segment from the source video
with VideoFileClip(video_path) as clip:
segment = clip.subclip(start, end)
dur = segment.duration
# Background
bg = ColorClip(size=(final_width, final_height), color=(0, 0, 0), duration=dur)
# Resize video to fit width
video_resized = segment.resize(width=final_width)
# Compute vertical position to centre in the middle region
y = top_h + (middle_h - video_resized.h) // 2
video_resized = video_resized.set_position((0, y))
# Build title clip
# Wrap the title to avoid overflow
wrapped_lines = wrap_text(top_text, max_chars=40)
wrapped_title = "\n".join(wrapped_lines)
title_clip = TextClip(
wrapped_title,
font=font_path,
fontsize=70,
color="white",
method="caption",
size=(final_width, top_h),
align="center",
).set_duration(dur).set_position((0, 0))
# Prepare font for caption rendering
pil_font = ImageFont.truetype(font_path, size=60)
default_color = (255, 255, 255) # white
highlight_color = (255, 215, 0) # gold-like yellow
# Precompute widths of a space and bounding box height for vertical centering
space_width = pil_font.getbbox(" ")[2] - pil_font.getbbox(" ")[0]
bbox = pil_font.getbbox("A")
text_height = bbox[3] - bbox[1]
def make_caption_frame(t: float):
"""Generate an image for the caption at time t."""
# Determine current word index
idx_cur = 0
for i, w in enumerate(words):
if w["start"] <= t < w["end"]:
idx_cur = i
break
if t >= w["end"]:
idx_cur = i
# Define window of words to display: show up to 5 words
start_idx = max(0, idx_cur - 2)
end_idx = min(len(words), idx_cur + 3)
window = words[start_idx:end_idx]
# Compute widths for each word
word_sizes = []
for w in window:
bbox = pil_font.getbbox(w["word"])
word_width = bbox[2] - bbox[0]
word_sizes.append(word_width)
total_width = sum(word_sizes) + space_width * (len(window) - 1 if window else 0)
# Create blank image for caption area
img = Image.new("RGB", (final_width, bottom_h), color=(0, 0, 0))
draw = ImageDraw.Draw(img)
x = int((final_width - total_width) / 2)
y_pos = int((bottom_h - text_height) / 2)
for j, w in enumerate(window):
color = highlight_color if (start_idx + j) == idx_cur else default_color
draw.text((x, y_pos), w["word"], font=pil_font, fill=color)
x += word_sizes[j] + space_width
return np.array(img)
caption_clip = VideoClip(make_frame=make_caption_frame, duration=dur)
caption_clip = caption_clip.set_position((0, final_height - bottom_h))
# Compose final clip
final = CompositeVideoClip([
bg,
video_resized,
title_clip,
caption_clip,
], size=(final_width, final_height))
# Use the original audio from the video segment
final_audio = segment.audio
if final_audio is not None:
final = final.set_audio(final_audio)
# Define output path
out_path = os.path.join(out_dir, f"clip_{idx}.mp4")
# Write to disk
final.write_videofile(
out_path,
codec=video_codec,
fps=30,
bitrate=bitrate,
audio_codec="aac",
preset="ultrafast",
ffmpeg_params=[
"-tune", "zerolatency",
"-pix_fmt", "yuv420p",
"-profile:v", "high",
"-level", "4.1",
],
threads=4,
)
# Close clips to free resources
final.close()
segment.close()
return out_path
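
A minimal sketch of how the removed render_clip was invoked; the import path, file paths and word timings below are illustrative only:

from render import render_clip

words = [
    {"start": 0.0, "end": 0.4, "word": "Exemplo"},
    {"start": 0.4, "end": 0.9, "word": "de"},
    {"start": 0.9, "end": 1.5, "word": "legenda"},
]
out_path = render_clip(
    video_path="videos/podcast.mp4",
    start=10.14,
    end=75.0,
    top_text="Título impactante",
    words=words,                 # timestamps relative to the clip start
    out_dir="outputs/podcast",
    base_name="podcast",
    idx=1,
)
print(out_path)                  # outputs/podcast/clip_1.mp4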

requirements.txt

@@ -1,7 +1,6 @@
-pika==1.3.2
-moviepy==2.0.0
-faster-whisper==1.2.0
-openai==1.16.0
-numpy==1.26.4
-Pillow==10.1.0
-unidecode==1.3.6
+moviepy==2.2.0
+pillow==10.3.0
+numpy>=1.26.0
+requests>=2.31.0
+pika>=1.3.2
+faster-whisper==1.0.0

transcribe.py

@@ -1,111 +0,0 @@
"""Utilities for extracting audio from video and generating transcriptions.
This module handles two tasks:
1. Use FFMPEG to extract the audio track from a video file into a WAV file
suitable for consumption by the Whisper model. The audio is resampled to
16 kHz mono PCM as required by Whisper.
2. Use the Faster-Whisper implementation to generate a transcription with
word-level timestamps. The transcription is returned both as a list of
segments (for building an SRT) and as a flattened list of words (for
building dynamic subtitles).
If FFMPEG is not installed or fails, a ``RuntimeError`` is raised. The caller
is responsible for cleaning up the temporary files created in the working
directory.
"""
from __future__ import annotations
import os
import subprocess
from typing import Dict, List, Tuple
from faster_whisper import WhisperModel
def extract_audio_ffmpeg(video_path: str, audio_path: str) -> None:
"""Use FFMPEG to extract audio from ``video_path`` into ``audio_path``.
The output will be a 16 kHz mono WAV file in PCM S16LE format. Any
existing file at ``audio_path`` will be overwritten. If ffmpeg returns
a non-zero exit code, a ``RuntimeError`` is raised with the stderr.
"""
cmd = [
"ffmpeg",
"-y", # overwrite output
"-i",
video_path,
"-vn", # disable video recording
"-acodec",
"pcm_s16le",
"-ar",
"16000",
"-ac",
"1",
audio_path,
]
proc = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
if proc.returncode != 0:
raise RuntimeError(f"FFMPEG error: {proc.stderr.decode(errors='ignore')}")
def load_whisper_model() -> WhisperModel:
"""Instantiate and cache a Faster-Whisper model.
The model name and device can be configured via the ``WHISPER_MODEL`` and
``WHISPER_DEVICE`` environment variables. The default model is
``large-v3`` for best accuracy. The device can be ``cuda`` or ``cpu``.
A module-level cache is used to prevent loading the model multiple times.
"""
if hasattr(load_whisper_model, "_cache"):
return load_whisper_model._cache # type: ignore[attr-defined]
model_name = os.environ.get("WHISPER_MODEL", "large-v3")
device = os.environ.get("WHISPER_DEVICE", "cpu")
# Compute type can be set via WHISPER_COMPUTE_TYPE; default to float16 on GPU
compute_type = os.environ.get("WHISPER_COMPUTE_TYPE")
# If not explicitly set, choose sensible defaults
if compute_type is None:
compute_type = "float16" if device == "cuda" else "int8"
model = WhisperModel(model_name, device=device, compute_type=compute_type)
load_whisper_model._cache = model # type: ignore[attr-defined]
return model
def transcribe(video_path: str, work_dir: str) -> Tuple[List[Dict[str, float]], List[Dict[str, float]]]:
"""Transcribe a video file using Faster-Whisper.
``video_path`` is the path to the video to transcribe. ``work_dir`` is a
directory where temporary files will be stored (audio file and
transcription). The function returns a tuple ``(segments, words)`` where
``segments`` is a list of dictionaries with ``start``, ``end`` and
``text`` fields, and ``words`` is a flat list of dictionaries with
``start``, ``end`` and ``word`` fields covering the entire video.
The timestamps are expressed in seconds as floats.
"""
os.makedirs(work_dir, exist_ok=True)
audio_path = os.path.join(work_dir, "audio.wav")
# Extract audio
extract_audio_ffmpeg(video_path, audio_path)
# Load Whisper model
model = load_whisper_model()
# Run transcription with word-level timestamps
segments, info = model.transcribe(audio_path, word_timestamps=True)
seg_list: List[Dict[str, float]] = []
words_list: List[Dict[str, float]] = []
for seg in segments:
seg_list.append({
"start": float(seg.start),
"end": float(seg.end),
"text": seg.text.strip(),
})
# Each segment may contain words attribute
for w in getattr(seg, "words", []) or []:
words_list.append({
"start": float(w.start),
"end": float(w.end),
"word": w.word,
})
# Sort words by start time to be safe
words_list.sort(key=lambda d: d["start"])
return seg_list, words_list
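
A short usage sketch for the removed transcribe helper (import path and file locations assumed):

from transcribe import transcribe

segments, words = transcribe("videos/podcast.mp4", "app/videos/podcast")
for seg in segments[:3]:
    print(f"{seg['start']:.2f}-{seg['end']:.2f}: {seg['text']}")
print(len(words), "word-level timestamps")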

utils.py

@@ -1,93 +0,0 @@
import re
import unicodedata
from typing import List, Tuple
def sanitize_filename(name: str) -> str:
"""Return a sanitized version of a filename.
This helper removes accents, converts to lowercase, replaces spaces
with underscores and removes any non alphanumeric characters except
underscores and dots. This makes the directory names safe to use on
most filesystems and matches the behaviour described in the spec.
"""
if not name:
return ""
# Decompose Unicode characters and strip accents
nfkd_form = unicodedata.normalize("NFKD", name)
no_accents = "".join(c for c in nfkd_form if not unicodedata.combining(c))
# Replace spaces with underscores
no_spaces = no_accents.replace(" ", "_")
# Lowercase and remove any character that is not a letter, digit, dot or underscore
sanitized = re.sub(r"[^A-Za-z0-9_.]+", "", no_spaces)
return sanitized
def timestamp_to_seconds(ts: str) -> float:
"""Convert a timestamp in HH:MM:SS,mmm format to seconds.
The Gemini and OpenRouter prompts use timestamps formatted with a comma
as the decimal separator. This helper splits the string into hours,
minutes and seconds and returns a float expressed in seconds.
"""
if ts is None:
return 0.0
ts = ts.strip()
if not ts:
return 0.0
# Replace comma by dot for decimal seconds
ts = ts.replace(",", ".")
parts = ts.split(":")
parts = [float(p) for p in parts]
if len(parts) == 3:
h, m, s = parts
return h * 3600 + m * 60 + s
elif len(parts) == 2:
m, s = parts
return m * 60 + s
else:
# only seconds
return parts[0]
def seconds_to_timestamp(seconds: float) -> str:
"""Convert a time in seconds to HH:MM:SS,mmm format expected by SRT."""
if seconds < 0:
seconds = 0
h = int(seconds // 3600)
m = int((seconds % 3600) // 60)
s = seconds % 60
# Format with comma as decimal separator and three decimal places
return f"{h:02d}:{m:02d}:{s:06.3f}".replace(".", ",")
def wrap_text(text: str, max_chars: int = 80) -> List[str]:
"""Simple word-wrap for a string.
Splits ``text`` into a list of lines, each at most ``max_chars``
characters long. This does not attempt to hyphenate words; a word
longer than ``max_chars`` will occupy its own line. The return value
is a list of lines without trailing whitespace.
"""
if not text:
return []
words = text.split()
lines: List[str] = []
current: List[str] = []
current_len = 0
for word in words:
# If adding this word would exceed the max, flush current line
if current and current_len + 1 + len(word) > max_chars:
lines.append(" ".join(current))
current = [word]
current_len = len(word)
else:
# Add to current line
if current:
current_len += 1 + len(word)
else:
current_len = len(word)
current.append(word)
if current:
lines.append(" ".join(current))
return lines
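
A few worked examples of the removed helpers (import path assumed):

from utils import sanitize_filename, seconds_to_timestamp, timestamp_to_seconds, wrap_text

print(sanitize_filename("Meu Vídeo Épico.mp4"))         # Meu_Video_Epico.mp4
print(timestamp_to_seconds("00:01:30,500"))             # 90.5
print(seconds_to_timestamp(90.5))                       # 00:01:30,500
print(wrap_text("um dois tres quatro", max_chars=10))   # ['um dois', 'tres', 'quatro']

Note that, despite what the docstring says, this implementation never lowercases the sanitized name.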

video_render/__init__.py Normal file

@@ -0,0 +1,4 @@
"""
Core package for the revamped video rendering pipeline.
"""

11 binary files changed (contents not shown).

video_render/config.py Normal file

@@ -0,0 +1,103 @@
from __future__ import annotations
import os
from dataclasses import dataclass
from pathlib import Path
BASE_DIR = Path(__file__).resolve().parent.parent
VIDEOS_ROOT = BASE_DIR / "videos"
OUTPUTS_ROOT = BASE_DIR / "outputs"
TEMP_ROOT = BASE_DIR / "temp"
@dataclass(frozen=True)
class RabbitMQSettings:
host: str = os.environ.get("RABBITMQ_HOST", "rabbitmq")
port: int = int(os.environ.get("RABBITMQ_PORT", 5672))
user: str = os.environ.get("RABBITMQ_USER", "admin")
password: str = os.environ.get("RABBITMQ_PASS", "")
consume_queue: str = os.environ.get("RABBITMQ_QUEUE", "to-render")
publish_queue: str = os.environ.get("RABBITMQ_UPLOAD_QUEUE", "to-upload")
prefetch_count: int = int(os.environ.get("RABBITMQ_PREFETCH", 1))
heartbeat: int = int(os.environ.get("RABBITMQ_HEARTBEAT", 60))
blocked_timeout: int = int(os.environ.get("RABBITMQ_BLOCKED_TIMEOUT", 300))
@dataclass(frozen=True)
class GeminiSettings:
api_key: str = os.environ.get("GEMINI_API_KEY", "")
model: str = os.environ.get("GEMINI_MODEL", "gemini-1.5-pro-latest")
safety_settings: str | None = os.environ.get("GEMINI_SAFETY_SETTINGS")
temperature: float = float(os.environ.get("GEMINI_TEMPERATURE", 0.2))
top_k: int | None = (
int(os.environ["GEMINI_TOP_K"]) if os.environ.get("GEMINI_TOP_K") else None
)
top_p: float | None = (
float(os.environ["GEMINI_TOP_P"]) if os.environ.get("GEMINI_TOP_P") else None
)
prompt_path: str = os.environ.get("GEMINI_PROMPT_PATH", "prompts/generate.txt")
@dataclass(frozen=True)
class OpenRouterSettings:
api_key: str = os.environ.get("OPENROUTER_API_KEY", "")
model: str = os.environ.get(
"OPENROUTER_MODEL", "anthropic/claude-3-haiku:beta"
)
temperature: float = float(os.environ.get("OPENROUTER_TEMPERATURE", 0.6))
max_output_tokens: int = int(os.environ.get("OPENROUTER_MAX_OUTPUT_TOKENS", 256))
@dataclass(frozen=True)
class WhisperSettings:
model_size: str = os.environ.get("FASTER_WHISPER_MODEL_SIZE", "medium")
device: str | None = os.environ.get("FASTER_WHISPER_DEVICE")
compute_type: str | None = os.environ.get("FASTER_WHISPER_COMPUTE_TYPE")
download_root: Path = Path(
os.environ.get("FASTER_WHISPER_DOWNLOAD_ROOT", str(BASE_DIR / ".whisper"))
)
@dataclass(frozen=True)
class RenderingSettings:
frame_width: int = int(os.environ.get("RENDER_WIDTH", 1080))
frame_height: int = int(os.environ.get("RENDER_HEIGHT", 1920))
fps: int = int(os.environ.get("RENDER_FPS", 30))
video_codec: str = os.environ.get("RENDER_CODEC", "libx264")
audio_codec: str = os.environ.get("RENDER_AUDIO_CODEC", "aac")
bitrate: str = os.environ.get("RENDER_BITRATE", "5000k")
preset: str = os.environ.get("RENDER_PRESET", "faster")
highlight_color: str = os.environ.get("SUBTITLE_HIGHLIGHT_COLOR", "#FFD200")
base_color: str = os.environ.get("SUBTITLE_BASE_COLOR", "#FFFFFF")
font_path: Path = Path(os.environ.get("RENDER_FONT_PATH", "./Montserrat.ttf"))
title_font_size: int = int(os.environ.get("RENDER_TITLE_FONT_SIZE", 110))
subtitle_font_size: int = int(os.environ.get("RENDER_SUBTITLE_FONT_SIZE", 64))
caption_min_words: int = int(os.environ.get("CAPTION_MIN_WORDS", 3))
caption_max_words: int = int(os.environ.get("CAPTION_MAX_WORDS", 4))
@dataclass(frozen=True)
class Settings:
rabbitmq: RabbitMQSettings = RabbitMQSettings()
gemini: GeminiSettings = GeminiSettings()
openrouter: OpenRouterSettings = OpenRouterSettings()
whisper: WhisperSettings = WhisperSettings()
rendering: RenderingSettings = RenderingSettings()
videos_dir: Path = VIDEOS_ROOT
outputs_dir: Path = OUTPUTS_ROOT
temp_dir: Path = TEMP_ROOT
def load_settings() -> Settings:
settings = Settings()
if not settings.rabbitmq.password:
raise RuntimeError("RABBITMQ_PASS must be provided")
settings.videos_dir.mkdir(parents=True, exist_ok=True)
settings.outputs_dir.mkdir(parents=True, exist_ok=True)
settings.temp_dir.mkdir(parents=True, exist_ok=True)
return settings
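
A minimal sketch of how these settings are consumed. Because the dataclass defaults read the environment when the module is imported, variables such as RABBITMQ_PASS must be exported before the import (the value below is a placeholder):

import os

os.environ.setdefault("RABBITMQ_PASS", "example-password")  # placeholder, must precede the import

from video_render.config import load_settings

settings = load_settings()   # also creates videos/, outputs/ and temp/ if missing
print(settings.rabbitmq.consume_queue)                                   # to-render (default)
print(settings.rendering.frame_width, settings.rendering.frame_height)   # 1080 1920 (defaults)
print(settings.whisper.model_size)                                       # medium unless FASTER_WHISPER_MODEL_SIZE is set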

video_render/ffmpeg.py Normal file

@@ -0,0 +1,54 @@
from __future__ import annotations
import logging
import shlex
import subprocess
from pathlib import Path
from typing import Sequence
logger = logging.getLogger(__name__)
def _run_ffmpeg(args: Sequence[str]) -> None:
cmd = ["ffmpeg", "-hide_banner", "-loglevel", "error", *args]
logger.debug("Executando ffmpeg: %s", " ".join(shlex.quote(part) for part in cmd))
completed = subprocess.run(cmd, check=False)
if completed.returncode != 0:
raise RuntimeError(f"ffmpeg falhou com exit code {completed.returncode}")
def extract_audio_to_wav(input_video: Path, output_wav: Path) -> Path:
_run_ffmpeg(
[
"-y",
"-i",
str(input_video),
"-ac",
"1",
"-ar",
"16000",
"-vn",
str(output_wav),
]
)
return output_wav
def create_video_segment(input_video: Path, start: float, end: float, output_path: Path) -> Path:
duration = max(0.01, end - start)
_run_ffmpeg(
[
"-y",
"-i",
str(input_video),
"-ss",
f"{start:.3f}",
"-t",
f"{duration:.3f}",
"-c",
"copy",
str(output_path),
]
)
return output_path
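
A usage sketch for these wrappers; the paths are illustrative, and note that create_video_segment uses stream copy (-c copy), so cut points snap to keyframes:

from pathlib import Path

from video_render.ffmpeg import create_video_segment, extract_audio_to_wav

work = Path("temp/example")
work.mkdir(parents=True, exist_ok=True)

source = Path("videos/podcast.mp4")                      # assumed to exist
wav = extract_audio_to_wav(source, work / "audio.wav")   # 16 kHz mono WAV for Whisper
clip = create_video_segment(source, 12.5, 58.0, work / "clip_1.mp4")
print(wav, clip)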

video_render/llm.py Normal file

@@ -0,0 +1,187 @@
from __future__ import annotations
import json
import logging
from pathlib import Path
from typing import Dict, List
import requests
from .config import BASE_DIR, Settings
from .transcription import TranscriptionResult
logger = logging.getLogger(__name__)
GEMINI_ENDPOINT_TEMPLATE = "https://generativelanguage.googleapis.com/v1beta/models/{model}:generateContent"
OPENROUTER_ENDPOINT = "https://openrouter.ai/api/v1/chat/completions"
class GeminiHighlighter:
def __init__(self, settings: Settings) -> None:
if not settings.gemini.api_key:
raise RuntimeError("GEMINI_API_KEY nao foi definido")
prompt_path = Path(settings.gemini.prompt_path)
if not prompt_path.is_absolute():
prompt_path = BASE_DIR / prompt_path
if not prompt_path.exists():
raise FileNotFoundError(f"Prompt do Gemini nao encontrado: {prompt_path}")
self.prompt_template = prompt_path.read_text(encoding="utf-8")
self.settings = settings
def generate_highlights(self, transcription: TranscriptionResult) -> List[Dict]:
payload = {
"transcript": transcription.full_text,
"segments": [
{
"start": segment.start,
"end": segment.end,
"text": segment.text,
}
for segment in transcription.segments
],
}
body = {
"contents": [
{
"role": "user",
"parts": [
{"text": self.prompt_template},
{"text": json.dumps(payload, ensure_ascii=False)},
],
}
]
}
if self.settings.gemini.temperature is not None:
body["generationConfig"] = {
"temperature": self.settings.gemini.temperature,
}
if self.settings.gemini.top_p is not None:
body["generationConfig"]["topP"] = self.settings.gemini.top_p
if self.settings.gemini.top_k is not None:
body["generationConfig"]["topK"] = self.settings.gemini.top_k
url = GEMINI_ENDPOINT_TEMPLATE.format(model=self.settings.gemini.model)
params = {"key": self.settings.gemini.api_key}
response = requests.post(url, params=params, json=body, timeout=120)
response.raise_for_status()
data = response.json()
candidates = data.get("candidates") or []
if not candidates:
raise RuntimeError("Gemini nao retornou candidatos")
text_parts = candidates[0].get("content", {}).get("parts", [])
if not text_parts:
raise RuntimeError("Resposta do Gemini sem conteudo")
raw_text = text_parts[0].get("text")
if not raw_text:
raise RuntimeError("Resposta do Gemini sem texto")
parsed = self._extract_json(raw_text)
highlights = parsed.get("highlights")
if not isinstance(highlights, list):
raise ValueError("Resposta do Gemini invalida: campo 'highlights' ausente")
return highlights
@staticmethod
def _extract_json(response_text: str) -> Dict:
try:
return json.loads(response_text)
except json.JSONDecodeError:
start = response_text.find("{")
end = response_text.rfind("}")
if start == -1 or end == -1:
raise
subset = response_text[start : end + 1]
return json.loads(subset)
class OpenRouterCopywriter:
def __init__(self, settings: Settings) -> None:
if not settings.openrouter.api_key:
raise RuntimeError("OPENROUTER_API_KEY nao foi definido")
self.settings = settings
def generate_titles(self, highlights: List[Dict]) -> List[str]:
if not highlights:
return []
prompt = (
"Voce e um copywriter especializado em titulos curtos e virais para reels.\n"
"Recebera uma lista de trechos destacados de um video com resumo e tempo.\n"
"Produza um titulo envolvente (ate 60 caracteres) para cada item.\n"
"Responda apenas em JSON com a seguinte estrutura:\n"
'{"titles": ["titulo 1", "titulo 2"]}\n'
"Titulos devem ser em portugues, usar verbos fortes e refletir o resumo."
)
user_payload = {
"highlights": [
{
"start": item.get("start"),
"end": item.get("end"),
"summary": item.get("summary"),
}
for item in highlights
]
}
body = {
"model": self.settings.openrouter.model,
"temperature": self.settings.openrouter.temperature,
"max_tokens": self.settings.openrouter.max_output_tokens,
"messages": [
{"role": "system", "content": prompt},
{
"role": "user",
"content": json.dumps(user_payload, ensure_ascii=False),
},
],
}
headers = {
"Authorization": f"Bearer {self.settings.openrouter.api_key}",
"Content-Type": "application/json",
"HTTP-Referer": "https://localhost",
"X-Title": "video-render-pipeline",
}
response = requests.post(
OPENROUTER_ENDPOINT, json=body, headers=headers, timeout=120
)
response.raise_for_status()
data = response.json()
choices = data.get("choices") or []
if not choices:
raise RuntimeError("OpenRouter nao retornou escolhas")
message = choices[0].get("message", {}).get("content")
if not message:
raise RuntimeError("Resposta do OpenRouter sem conteudo")
parsed = self._extract_json(message)
titles = parsed.get("titles")
if not isinstance(titles, list):
raise ValueError("Resposta do OpenRouter invalida: campo 'titles'")
return [str(title) for title in titles]
@staticmethod
def _extract_json(response_text: str) -> Dict:
try:
return json.loads(response_text)
except json.JSONDecodeError:
start = response_text.find("{")
end = response_text.rfind("}")
if start == -1 or end == -1:
raise
subset = response_text[start : end + 1]
return json.loads(subset)
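
The _extract_json fallback is what keeps chatty model replies usable; a small sketch of that behaviour, calling the static helper directly purely for illustration:

from video_render.llm import GeminiHighlighter

messy_reply = (
    "Claro! Segue o resultado:\n"
    '{"highlights": [{"start": 1.0, "end": 65.0, "summary": "Insight sobre carreira"}]}\n'
    "Espero ter ajudado."
)
parsed = GeminiHighlighter._extract_json(messy_reply)   # strips the surrounding prose
print(parsed["highlights"][0]["summary"])               # Insight sobre carreira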

video_render/logging_utils.py Normal file

@@ -0,0 +1,13 @@
from __future__ import annotations
import logging
import os
def setup_logging() -> None:
log_level = os.environ.get("LOG_LEVEL", "INFO").upper()
logging.basicConfig(
level=log_level,
format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
)

video_render/media.py Normal file

@@ -0,0 +1,64 @@
from __future__ import annotations
import logging
import shutil
from dataclasses import dataclass
from pathlib import Path
from .config import Settings
from .ffmpeg import extract_audio_to_wav
from .utils import ensure_workspace, remove_paths, sanitize_filename
logger = logging.getLogger(__name__)
@dataclass
class VideoWorkspace:
original_filename: str
sanitized_name: str
workspace_dir: Path
output_dir: Path
source_path: Path
working_video_path: Path
audio_path: Path
class MediaPreparer:
def __init__(self, settings: Settings) -> None:
self.settings = settings
def prepare(self, filename: str) -> VideoWorkspace:
source_path = self.settings.videos_dir / filename
if not source_path.exists():
raise FileNotFoundError(f"Arquivo de vídeo não encontrado: {source_path}")
sanitized_name = sanitize_filename(Path(filename).stem)
workspace_dir = ensure_workspace(self.settings.videos_dir, sanitized_name)
existing_children = list(workspace_dir.iterdir())
if existing_children:
logger.info("Limpando workspace existente para %s", sanitized_name)
remove_paths(existing_children)
destination_name = f"{sanitized_name}{source_path.suffix.lower()}"
working_video_path = workspace_dir / destination_name
shutil.copy2(source_path, working_video_path)
logger.info("Cópia do vídeo criada em %s", working_video_path)
output_dir = ensure_workspace(self.settings.outputs_dir, sanitized_name)
existing_outputs = list(output_dir.iterdir())
if existing_outputs:
remove_paths(existing_outputs)
audio_path = workspace_dir / "audio.wav"
extract_audio_to_wav(working_video_path, audio_path)
return VideoWorkspace(
original_filename=filename,
sanitized_name=sanitized_name,
workspace_dir=workspace_dir,
output_dir=output_dir,
source_path=source_path,
working_video_path=working_video_path,
audio_path=audio_path,
)
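
A sketch of how MediaPreparer is typically driven. The password is a placeholder, podcast.mp4 is assumed to sit in the videos/ directory, and ensure_workspace is assumed to create <videos_dir>/<sanitized_name> as the code above implies:

import os

os.environ.setdefault("RABBITMQ_PASS", "example-password")  # required before the config import

from video_render.config import load_settings
from video_render.media import MediaPreparer

preparer = MediaPreparer(load_settings())
workspace = preparer.prepare("podcast.mp4")
print(workspace.working_video_path)   # copy of the source inside the workspace
print(workspace.audio_path)           # <workspace>/audio.wav, extracted via ffmpeg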

video_render/messaging.py Normal file

@@ -0,0 +1,85 @@
from __future__ import annotations
import json
import logging
from typing import Any, Callable, Dict
import pika
from .config import Settings
logger = logging.getLogger(__name__)
MessageHandler = Callable[[Dict[str, Any]], Dict[str, Any]]
class RabbitMQWorker:
def __init__(self, settings: Settings) -> None:
self.settings = settings
self._params = pika.ConnectionParameters(
host=settings.rabbitmq.host,
port=settings.rabbitmq.port,
credentials=pika.PlainCredentials(
settings.rabbitmq.user, settings.rabbitmq.password
),
heartbeat=settings.rabbitmq.heartbeat,
blocked_connection_timeout=settings.rabbitmq.blocked_timeout,
)
def consume_forever(self, handler: MessageHandler) -> None:
while True:
try:
with pika.BlockingConnection(self._params) as connection:
channel = connection.channel()
channel.queue_declare(queue=self.settings.rabbitmq.consume_queue, durable=True)
channel.queue_declare(queue=self.settings.rabbitmq.publish_queue, durable=True)
channel.basic_qos(prefetch_count=self.settings.rabbitmq.prefetch_count)
def _on_message(ch: pika.adapters.blocking_connection.BlockingChannel, method, properties, body):
try:
message = json.loads(body)
except json.JSONDecodeError:
logger.error("Mensagem inválida recebida: %s", body)
ch.basic_ack(delivery_tag=method.delivery_tag)
return
logger.info("Mensagem recebida: %s", message.get("filename", "<sem_nome>"))
try:
response = handler(message)
except Exception:
logger.exception("Erro não tratado durante o processamento")
response = {
"hasError": True,
"error": "Erro não tratado no pipeline",
"filename": message.get("filename"),
"videoId": message.get("videoId"),
"url": message.get("url"),
"processedFiles": [],
}
try:
payload = json.dumps(response)
ch.basic_publish(
exchange="",
routing_key=self.settings.rabbitmq.publish_queue,
body=payload,
properties=pika.BasicProperties(delivery_mode=2),
)
logger.info("Resposta publicada para '%s'", self.settings.rabbitmq.publish_queue)
except Exception:
logger.exception("Falha ao publicar a resposta na fila de upload")
finally:
ch.basic_ack(delivery_tag=method.delivery_tag)
channel.basic_consume(
queue=self.settings.rabbitmq.consume_queue,
on_message_callback=_on_message,
auto_ack=False,
)
logger.info("Consumidor iniciado. Aguardando mensagens...")
channel.start_consuming()
except pika.exceptions.AMQPConnectionError:
logger.exception("Conexão com RabbitMQ perdida. Tentando reconectar...")
except KeyboardInterrupt:
logger.info("Encerrando consumidor por interrupção do usuário.")
break
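
The worker only needs a callable that maps an incoming message dict to a response dict; a minimal sketch with a placeholder password and a trivial handler:

import os

os.environ.setdefault("RABBITMQ_PASS", "example-password")  # must be set before the config import

from video_render.config import load_settings
from video_render.messaging import RabbitMQWorker


def handler(message: dict) -> dict:
    # Whatever is returned here is published as JSON to the upload queue.
    return {"hasError": False, "filename": message.get("filename"), "processedFiles": []}


worker = RabbitMQWorker(load_settings())
worker.consume_forever(handler)   # blocks; reconnects on AMQP errors, stops on Ctrl+C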

video_render/pipeline.py Normal file

@@ -0,0 +1,236 @@
from __future__ import annotations
import logging
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any, Dict, List, Optional
from .config import Settings
from .llm import GeminiHighlighter, OpenRouterCopywriter
from .media import MediaPreparer, VideoWorkspace
from .transcription import TranscriptionResult, TranscriptionService
from .utils import remove_paths, sanitize_filename
from .rendering import VideoRenderer
logger = logging.getLogger(__name__)
@dataclass
class JobMessage:
filename: str
url: Optional[str]
video_id: Optional[str]
extras: Dict[str, Any] = field(default_factory=dict)
@dataclass
class HighlightWindow:
start: float
end: float
summary: str
title: Optional[str] = None
@dataclass
class RenderedClip:
path: Path
start: float
end: float
title: str
summary: str
index: int
@dataclass
class PipelineContext:
job: JobMessage
workspace: Optional[VideoWorkspace] = None
transcription: Optional[TranscriptionResult] = None
highlight_windows: List[HighlightWindow] = field(default_factory=list)
rendered_clips: List[RenderedClip] = field(default_factory=list)
class VideoPipeline:
def __init__(self, settings: Settings) -> None:
self.settings = settings
self.media_preparer = MediaPreparer(settings)
self.transcriber = TranscriptionService(settings)
self.highlighter = GeminiHighlighter(settings)
self.copywriter = OpenRouterCopywriter(settings)
self.renderer = VideoRenderer(settings)
def process_message(self, message: Dict[str, Any]) -> Dict[str, Any]:
context = PipelineContext(job=self._parse_job(message))
try:
self._prepare_workspace(context)
self._generate_transcription(context)
self._determine_highlights(context)
self._generate_titles(context)
self._render_clips(context)
return self._build_success_payload(context)
except Exception as exc:
logger.exception("Falha ao processar vídeo %s", context.job.filename)
return self._handle_failure(context, exc)
def _parse_job(self, message: Dict[str, Any]) -> JobMessage:
filename = message.get("filename")
if not filename:
raise ValueError("Mensagem inválida: 'filename' é obrigatório")
url = message.get("url")
video_id = message.get("videoId") or message.get("video_id")
extras = {
key: value
for key, value in message.items()
if key not in {"filename", "url", "videoId", "video_id"}
}
return JobMessage(filename=filename, url=url, video_id=video_id, extras=extras)
def _prepare_workspace(self, context: PipelineContext) -> None:
context.workspace = self.media_preparer.prepare(context.job.filename)
def _generate_transcription(self, context: PipelineContext) -> None:
if not context.workspace:
raise RuntimeError("Workspace não preparado")
transcription = self.transcriber.transcribe(context.workspace.audio_path)
TranscriptionService.persist(transcription, context.workspace.workspace_dir)
context.transcription = transcription
def _determine_highlights(self, context: PipelineContext) -> None:
if not context.transcription:
raise RuntimeError("Transcricao nao disponivel")
highlights_raw = self.highlighter.generate_highlights(context.transcription)
windows: List[HighlightWindow] = []
for item in highlights_raw:
try:
start = float(item.get("start", 0)) # type: ignore[arg-type]
end = float(item.get("end", start)) # type: ignore[arg-type]
except (TypeError, ValueError):
logger.warning("Highlight invalido ignorado: %s", item)
continue
summary = str(item.get("summary", "")).strip()
if end <= start:
logger.debug("Highlight com intervalo invalido ignorado: %s", item)
continue
windows.append(HighlightWindow(start=start, end=end, summary=summary))
if not windows:
last_end = (
context.transcription.segments[-1].end
if context.transcription.segments
else 0
)
windows.append(
HighlightWindow(
start=0.0,
end=max(last_end, 10.0),
summary="Sem destaque identificado; fallback automatico.",
)
)
context.highlight_windows = windows
def _generate_titles(self, context: PipelineContext) -> None:
if not context.highlight_windows:
return
highlight_dicts = [
{"start": window.start, "end": window.end, "summary": window.summary}
for window in context.highlight_windows
]
titles = self.copywriter.generate_titles(highlight_dicts)
for window, title in zip(context.highlight_windows, titles):
window.title = title.strip()
def _render_clips(self, context: PipelineContext) -> None:
if not context.workspace or not context.highlight_windows or not context.transcription:
return
titles = [
window.title or window.summary for window in context.highlight_windows
]
render_results = self.renderer.render(
workspace_path=str(context.workspace.working_video_path),
highlight_windows=context.highlight_windows,
transcription=context.transcription,
titles=titles,
output_dir=context.workspace.output_dir,
)
context.rendered_clips = [
RenderedClip(
path=Path(path),
start=start,
end=end,
title=title,
summary=summary,
index=index,
)
for path, start, end, title, summary, index in render_results
]
def _build_success_payload(self, context: PipelineContext) -> Dict[str, Any]:
return {
"hasError": False,
"videosProcessedQuantity": len(context.rendered_clips),
"filename": context.job.filename,
"videoId": context.job.video_id,
"url": context.job.url,
"workspaceFolder": context.workspace.sanitized_name if context.workspace else None,
"outputDirectory": self._relative_path(context.workspace.output_dir) if context.workspace else None,
"processedFiles": [
{
"path": self._relative_path(clip.path),
"start": clip.start,
"end": clip.end,
"title": clip.title,
"summary": clip.summary,
"clipIndex": clip.index,
}
for clip in context.rendered_clips
],
}
def _handle_failure(self, context: PipelineContext, exc: Exception) -> Dict[str, Any]:
logger.error("Erro no pipeline: %s", exc)
cleanup_targets: List[Path] = []
if context.workspace:
cleanup_targets.append(context.workspace.workspace_dir)
cleanup_targets.append(context.workspace.output_dir)
original_path = context.workspace.source_path
if original_path.exists():
cleanup_targets.append(original_path)
else:
sanitized = sanitize_filename(Path(context.job.filename).stem)
job_output_dir = self.settings.outputs_dir / sanitized
if job_output_dir.exists():
cleanup_targets.append(job_output_dir)
original_path = self.settings.videos_dir / context.job.filename
if original_path.exists():
cleanup_targets.append(original_path)
remove_paths(cleanup_targets)
return {
"hasError": True,
"error": str(exc),
"filename": context.job.filename,
"videoId": context.job.video_id,
"url": context.job.url,
"processedFiles": [],
}
def _relative_path(self, path: Path) -> str:
base = self.settings.videos_dir.parent
try:
return str(path.relative_to(base))
except ValueError:
return str(path)
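
For orientation, a minimal sketch of the message and payload shapes these helpers exchange over the queue. The field names are taken from _parse_job and _build_success_payload above; the concrete values are purely illustrative:

# Incoming job message (any extra keys end up in JobMessage.extras)
job_message = {
    "filename": "podcast_ep_42.mp4",
    "url": "https://example.com/watch?v=abc123",  # optional
    "videoId": "abc123",                          # "video_id" is also accepted
}

# Result published on success by _build_success_payload (illustrative values)
success_payload = {
    "hasError": False,
    "videosProcessedQuantity": 1,
    "filename": "podcast_ep_42.mp4",
    "videoId": "abc123",
    "url": "https://example.com/watch?v=abc123",
    "workspaceFolder": "podcast_ep_42",
    "outputDirectory": "outputs/podcast_ep_42",
    "processedFiles": [
        {
            "path": "outputs/podcast_ep_42/clip_01.mp4",
            "start": 12.5,
            "end": 47.0,
            "title": "...",
            "summary": "...",
            "clipIndex": 1,
        }
    ],
}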

406
video_render/rendering.py Normal file
View File

@@ -0,0 +1,406 @@
from __future__ import annotations
import logging
import math
import re
from dataclasses import dataclass
from typing import Iterable, List, Sequence, Tuple
import numpy as np
# MoviePy >= 2.0 exposes these classes at the package top level (the old
# moviepy.editor module was removed); the with_*/resized/subclipped calls
# used below are the 2.x API.
from moviepy import (
    ColorClip,
    CompositeVideoClip,
    ImageClip,
    TextClip,
    VideoFileClip,
)
from PIL import Image, ImageColor, ImageDraw, ImageFont
from .config import Settings
from .transcription import TranscriptionResult, WordTiming
logger = logging.getLogger(__name__)
def clamp_time(value: float, minimum: float = 0.0) -> float:
return max(minimum, float(value))
@dataclass
class CaptionClipSet:
base: ImageClip
highlights: List[ImageClip]
class CaptionBuilder:
def __init__(self, settings: Settings) -> None:
self.settings = settings
self.font_path = settings.rendering.font_path
if not self.font_path.exists():
raise FileNotFoundError(f"Fonte não encontrada: {self.font_path}")
self.font = ImageFont.truetype(
str(self.font_path), settings.rendering.subtitle_font_size
)
self.base_color = ImageColor.getrgb(settings.rendering.base_color)
self.highlight_color = ImageColor.getrgb(settings.rendering.highlight_color)
self.canvas_width = settings.rendering.frame_width - 160
self.canvas_height = int(settings.rendering.subtitle_font_size * 2.2)
self.min_words = settings.rendering.caption_min_words
self.max_words = settings.rendering.caption_max_words
bbox = self.font.getbbox("Ay")
self.text_height = bbox[3] - bbox[1]
self.baseline = (self.canvas_height - self.text_height) // 2 - bbox[1]
self.space_width = self.font.getbbox(" ")[2] - self.font.getbbox(" ")[0]
def build(self, words: Sequence[WordTiming], clip_start: float) -> List[CaptionClipSet]:
grouped = self._group_words(words)
clip_sets: List[CaptionClipSet] = []
for group in grouped:
group_start = clamp_time(group[0].start, minimum=clip_start)
group_end = clamp_time(group[-1].end, minimum=group_start + 0.05)
duration = max(0.05, group_end - group_start)
start_offset = group_start - clip_start
base_image, highlight_images = self._render_group(group)
base_clip = (
ImageClip(np.array(base_image))
.with_start(start_offset)
.with_duration(duration)
)
highlight_clips: List[ImageClip] = []
for word, image in zip(group, highlight_images):
h_start = clamp_time(word.start, minimum=clip_start) - clip_start
h_end = clamp_time(word.end, minimum=word.start + 0.02) - clip_start
h_duration = max(0.05, h_end - h_start)
highlight_clip = (
ImageClip(np.array(image))
.with_start(h_start)
.with_duration(h_duration)
)
highlight_clips.append(highlight_clip)
clip_sets.append(CaptionClipSet(base=base_clip, highlights=highlight_clips))
return clip_sets
def _render_group(self, group: Sequence[WordTiming]) -> Tuple[Image.Image, List[Image.Image]]:
texts = [self._clean_word(word.word) for word in group]
widths = []
for text in texts:
bbox = self.font.getbbox(text)
widths.append(bbox[2] - bbox[0])
total_width = sum(widths)
if len(widths) > 1:
total_width += self.space_width * (len(widths) - 1)
start_x = max(0, (self.canvas_width - total_width) // 2)
base_image = Image.new("RGBA", (self.canvas_width, self.canvas_height), (0, 0, 0, 0))
base_draw = ImageDraw.Draw(base_image)
highlight_images: List[Image.Image] = []
x = start_x
for text, width in zip(texts, widths):
base_draw.text((x, self.baseline), text, font=self.font, fill=self.base_color)
highlight_image = Image.new("RGBA", base_image.size, (0, 0, 0, 0))
highlight_draw = ImageDraw.Draw(highlight_image)
highlight_draw.text(
(x, self.baseline), text, font=self.font, fill=self.highlight_color
)
highlight_images.append(highlight_image)
x += width + self.space_width
return base_image, highlight_images
def _group_words(self, words: Sequence[WordTiming]) -> List[List[WordTiming]]:
if not words:
return []
grouped: List[List[WordTiming]] = []
buffer: List[WordTiming] = []
for word in words:
buffer.append(word)
if len(buffer) == self.max_words:
grouped.append(buffer)
buffer = []
if buffer:
if len(buffer) == 1 and grouped:
grouped[-1].extend(buffer)
else:
grouped.append(buffer)
# Rebalance groups to respect minimum size when possible
for idx, group in enumerate(grouped[:-1]):
if len(group) < self.min_words and len(grouped[idx + 1]) > self.min_words:
deficit = self.min_words - len(group)
transfer = grouped[idx + 1][:deficit]
grouped[idx] = group + transfer
grouped[idx + 1] = grouped[idx + 1][deficit:]
grouped = [grp for grp in grouped if grp]
return grouped
@staticmethod
def _clean_word(text: str) -> str:
text = text.strip()
text = re.sub(r"\s+", " ", text)
return text or "..."
class VideoRenderer:
def __init__(self, settings: Settings) -> None:
self.settings = settings
self.captions = CaptionBuilder(settings)
def render(
self,
workspace_path: str,
highlight_windows: Sequence,
transcription: TranscriptionResult,
titles: Sequence[str],
output_dir,
) -> List[Tuple[str, float, float, str, str, int]]:
results: List[Tuple[str, float, float, str, str, int]] = []
with VideoFileClip(workspace_path) as base_clip:
video_duration = base_clip.duration or 0
for index, window in enumerate(highlight_windows, start=1):
start = clamp_time(window.start)
end = clamp_time(window.end)
start = min(start, video_duration)
end = min(end, video_duration)
if end <= start:
logger.info("Janela ignorada por intervalo invalido: %s", window)
continue
subclip = base_clip.subclipped(start, end)
try:
rendered_path = self._render_single_clip(
subclip=subclip,
start=start,
end=end,
title=titles[index - 1] if index - 1 < len(titles) else window.summary,
summary=window.summary,
index=index,
transcription=transcription,
output_dir=output_dir,
)
finally:
subclip.close()
results.append(
(
rendered_path,
float(start),
float(end),
titles[index - 1] if index - 1 < len(titles) else window.summary,
window.summary,
index,
)
)
return results
def _render_single_clip(
self,
subclip: VideoFileClip,
start: float,
end: float,
title: str,
summary: str,
index: int,
transcription: TranscriptionResult,
output_dir,
) -> str:
duration = end - start
frame_w = self.settings.rendering.frame_width
frame_h = self.settings.rendering.frame_height
top_h = int(frame_h * 0.18)
bottom_h = int(frame_h * 0.20)
video_area_h = frame_h - top_h - bottom_h
scale_factor = min(
frame_w / subclip.w,
video_area_h / subclip.h,
)
resized_clip = subclip.resized(scale_factor)
video_y = top_h + (video_area_h - resized_clip.h) // 2
video_clip = resized_clip.with_position(
((frame_w - resized_clip.w) // 2, video_y)
)
background = ColorClip(size=(frame_w, frame_h), color=(0, 0, 0)).with_duration(duration)
top_panel = (
ColorClip(size=(frame_w, top_h), color=(12, 12, 12))
.with_duration(duration)
.with_opacity(0.85)
)
bottom_panel = (
ColorClip(size=(frame_w, bottom_h), color=(12, 12, 12))
.with_position((0, frame_h - bottom_h))
.with_duration(duration)
.with_opacity(0.85)
)
title_text = title or summary
wrapped_title = self._wrap_text(title_text, max_width=frame_w - 160)
title_clip = (
TextClip(
text=wrapped_title,
font=str(self.settings.rendering.font_path),
font_size=self.settings.rendering.title_font_size,
color=self.settings.rendering.base_color,
method="caption",
size=(frame_w - 160, top_h - 40),
)
.with_duration(duration)
)
title_clip = title_clip.with_position(
((frame_w - title_clip.w) // 2, (top_h - title_clip.h) // 2)
)
words = self._collect_words(transcription, start, end)
caption_sets = self.captions.build(words, clip_start=start)
caption_clips = []
caption_resources: List[ImageClip] = []
caption_y = frame_h - bottom_h + (bottom_h - self.captions.canvas_height) // 2
for clip_set in caption_sets:
base_positioned = clip_set.base.with_position(("center", caption_y))
caption_clips.append(base_positioned)
caption_resources.append(clip_set.base)
for highlight in clip_set.highlights:
positioned = highlight.with_position(("center", caption_y))
caption_clips.append(positioned)
caption_resources.append(highlight)
if not caption_clips:
fallback_text = self._wrap_text(summary or title, max_width=frame_w - 160)
caption_clips.append(
TextClip(
text=fallback_text,
font=str(self.settings.rendering.font_path),
font_size=self.settings.rendering.subtitle_font_size,
color=self.settings.rendering.base_color,
method="caption",
size=(frame_w - 160, bottom_h - 40),
)
.with_duration(duration)
.with_position(("center", caption_y))
)
composite = CompositeVideoClip(
[background, top_panel, bottom_panel, video_clip, title_clip, *caption_clips],
size=(frame_w, frame_h),
)
output_path = output_dir / f"clip_{index:02d}.mp4"
composite.write_videofile(
str(output_path),
codec=self.settings.rendering.video_codec,
audio_codec=self.settings.rendering.audio_codec,
fps=self.settings.rendering.fps,
bitrate=self.settings.rendering.bitrate,
ffmpeg_params=[
"-preset",
self.settings.rendering.preset,
"-pix_fmt",
"yuv420p",
],
temp_audiofile=str(output_dir / f"temp_audio_{index:02d}.m4a"),
remove_temp=True,
threads=4,
)
composite.close()
resized_clip.close()
video_clip.close()
title_clip.close()
background.close()
top_panel.close()
bottom_panel.close()
for clip in caption_clips:
clip.close()
for clip in caption_resources:
clip.close()
return str(output_path)
def _collect_words(
self, transcription: TranscriptionResult, start: float, end: float
) -> List[WordTiming]:
collected: List[WordTiming] = []
for segment in transcription.segments:
if segment.end < start or segment.start > end:
continue
if segment.words:
for word in segment.words:
if word.end < start or word.start > end:
continue
collected.append(
WordTiming(
start=max(start, word.start),
end=min(end, word.end),
word=word.word,
)
)
else:
collected.extend(self._fallback_words(segment.text, segment.start, segment.end, start, end))
collected.sort(key=lambda w: w.start)
return collected
def _fallback_words(
self,
text: str,
segment_start: float,
segment_end: float,
window_start: float,
window_end: float,
) -> Iterable[WordTiming]:
words = [w for w in re.split(r"\s+", text.strip()) if w]
if not words:
return []
seg_start = max(segment_start, window_start)
seg_end = min(segment_end, window_end)
duration = max(0.01, seg_end - seg_start)
step = duration / len(words)
timings: List[WordTiming] = []
for idx, word in enumerate(words):
w_start = seg_start + idx * step
w_end = min(seg_end, w_start + step)
timings.append(WordTiming(start=w_start, end=w_end, word=word))
return timings
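# Worked example: a segment whose text has three words, clipped to 10.0-11.5 s,
# is spread evenly with step = 1.5 / 3 = 0.5 s, yielding timings
# (10.0-10.5), (10.5-11.0) and (11.0-11.5).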
@staticmethod
def _wrap_text(text: str, max_width: int) -> str:
text = text.strip()
if not text:
return ""
words = text.split()
lines: List[str] = []
current: List[str] = []
for word in words:
current.append(word)
if len(" ".join(current)) > max_width // 18:
lines.append(" ".join(current[:-1]))
current = [current[-1]]
if current:
lines.append(" ".join(current))
return "\n".join(lines)

122
video_render/transcription.py Normal file
View File

@@ -0,0 +1,122 @@
from __future__ import annotations
import json
import logging
from dataclasses import dataclass
from pathlib import Path
from typing import List, Optional
from faster_whisper import WhisperModel
from .config import Settings
logger = logging.getLogger(__name__)
@dataclass(frozen=True)
class WordTiming:
start: float
end: float
word: str
@dataclass(frozen=True)
class TranscriptSegment:
id: int
start: float
end: float
text: str
words: List[WordTiming]
@dataclass(frozen=True)
class TranscriptionResult:
segments: List[TranscriptSegment]
full_text: str
class TranscriptionService:
def __init__(self, settings: Settings) -> None:
self.settings = settings
self._model: Optional[WhisperModel] = None
def _load_model(self) -> WhisperModel:
if self._model is None:
logger.info(
"Carregando modelo Faster-Whisper '%s' (device=%s, compute_type=%s)",
self.settings.whisper.model_size,
self.settings.whisper.device or "auto",
self.settings.whisper.compute_type or "default",
)
self._model = WhisperModel(
self.settings.whisper.model_size,
device=self.settings.whisper.device or "auto",
compute_type=self.settings.whisper.compute_type or "default",
download_root=str(self.settings.whisper.download_root),
)
return self._model
def transcribe(self, audio_path: Path) -> TranscriptionResult:
model = self._load_model()
segments, _ = model.transcribe(
str(audio_path),
beam_size=5,
word_timestamps=True,
)
parsed_segments: List[TranscriptSegment] = []
full_text_parts: List[str] = []
for idx, segment in enumerate(segments):
words = [
WordTiming(start=w.start, end=w.end, word=w.word.strip())
for w in segment.words or []
if w.word.strip()
]
text = segment.text.strip()
full_text_parts.append(text)
parsed_segments.append(
TranscriptSegment(
id=idx,
start=segment.start,
end=segment.end,
text=text,
words=words,
)
)
return TranscriptionResult(
segments=parsed_segments,
full_text=" ".join(full_text_parts).strip(),
)
@staticmethod
def persist(result: TranscriptionResult, destination: Path) -> None:
json_path = destination / "transcription.json"
text_path = destination / "transcription.txt"
payload = {
"segments": [
{
"id": segment.id,
"start": segment.start,
"end": segment.end,
"text": segment.text,
"words": [
{"start": word.start, "end": word.end, "text": word.word}
for word in segment.words
],
}
for segment in result.segments
],
"full_text": result.full_text,
}
with json_path.open("w", encoding="utf-8") as fp:
json.dump(payload, fp, ensure_ascii=False, indent=2)
with text_path.open("w", encoding="utf-8") as fp:
fp.write(result.full_text)
logger.info("Transcrição salva em %s", destination)

38
video_render/utils.py Normal file
View File

@@ -0,0 +1,38 @@
from __future__ import annotations
import re
import unicodedata
from pathlib import Path
from typing import Iterable
def sanitize_filename(name: str) -> str:
normalized = unicodedata.normalize("NFKD", name)
ascii_text = normalized.encode("ASCII", "ignore").decode()
ascii_text = ascii_text.lower()
ascii_text = ascii_text.replace(" ", "_")
ascii_text = re.sub(r"[^a-z0-9_\-\.]", "", ascii_text)
ascii_text = re.sub(r"_+", "_", ascii_text)
return ascii_text.strip("_") or "video"
def ensure_workspace(root: Path, folder_name: str) -> Path:
workspace = root / folder_name
workspace.mkdir(parents=True, exist_ok=True)
return workspace
def remove_paths(paths: Iterable[Path]) -> None:
for path in paths:
if not path.exists():
continue
if path.is_file() or path.is_symlink():
path.unlink(missing_ok=True)
else:
for child in sorted(path.rglob("*"), reverse=True):
if child.is_file() or child.is_symlink():
child.unlink(missing_ok=True)
elif child.is_dir():
child.rmdir()
path.rmdir()
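
A couple of hand-worked examples of sanitize_filename, following the rules above:

sanitize_filename("Meu Vídeo Épico.mp4")  # -> "meu_video_epico.mp4" (accents dropped, spaces -> "_", lowercased)
sanitize_filename("???")                  # -> "video" (everything is stripped, so the fallback name is used)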