Introduce new features
Among them: addition of faster-whisper, caption generation, and integration with Gemini and OpenRouter
1 __init__.py Normal file
@@ -0,0 +1 @@
"""Top-level package for the video processing pipeline."""
BIN __pycache__/llm.cpython-311.pyc Normal file
Binary file not shown.
BIN __pycache__/main.cpython-311.pyc Normal file
Binary file not shown.
BIN __pycache__/render.cpython-311.pyc Normal file
Binary file not shown.
BIN __pycache__/transcribe.cpython-311.pyc Normal file
Binary file not shown.
BIN __pycache__/utils.cpython-311.pyc Normal file
Binary file not shown.
35 docker-compose.yml Normal file
@@ -0,0 +1,35 @@
services:
  video-render-new:
    restart: unless-stopped
    build: .
    container_name: video-render-new
    environment:
      # RabbitMQ credentials
      - RABBITMQ_PASS=${RABBITMQ_PASS}
      - RABBITMQ_HOST=${RABBITMQ_HOST}
      - RABBITMQ_USER=${RABBITMQ_USER}
      - RABBITMQ_PORT=${RABBITMQ_PORT}
      - RABBITMQ_QUEUE=${RABBITMQ_QUEUE}
      - RABBITMQ_UPLOAD_QUEUE=${RABBITMQ_UPLOAD_QUEUE}
      # API keys for the LLMs
      - GEMINI_API_KEY=${GEMINI_API_KEY}
      - OPENROUTER_API_KEY=${OPENROUTER_API_KEY}
      - OPENROUTER_MODEL=${OPENROUTER_MODEL}
      # Optional whisper settings
      - WHISPER_MODEL=${WHISPER_MODEL}
      - WHISPER_DEVICE=${WHISPER_DEVICE}
      - WHISPER_COMPUTE_TYPE=${WHISPER_COMPUTE_TYPE}
    volumes:
      # Mount host directories into the container so that videos can be
      # provided and outputs collected. These paths can be customised when
      # deploying the stack. The defaults assume /root/videos and
      # /root/outputs on the host.
      - "/root/videos:/app/videos"
      - "/root/outputs:/app/outputs"
    command: "python -u main.py"
    networks:
      - dokploy-network

networks:
  dokploy-network:
    external: true
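For context, here is a minimal sketch of how a task could be enqueued for this stack with pika. The producer is not part of this commit; the message fields simply mirror what main.py reads (filename, url, videoId), and the queue_declare settings are an assumption:

import json
import os

import pika

credentials = pika.PlainCredentials(os.environ["RABBITMQ_USER"], os.environ["RABBITMQ_PASS"])
parameters = pika.ConnectionParameters(
    host=os.environ.get("RABBITMQ_HOST", "rabbitmq"),
    port=int(os.environ.get("RABBITMQ_PORT", 5672)),
    credentials=credentials,
)
queue = os.environ.get("RABBITMQ_QUEUE", "to-render")
connection = pika.BlockingConnection(parameters)
channel = connection.channel()
channel.queue_declare(queue=queue, durable=True)  # assumed; the consumer only does basic_get
channel.basic_publish(
    exchange="",
    routing_key=queue,
    body=json.dumps({"filename": "exemplo.mp4", "url": None, "videoId": None}),
    properties=pika.BasicProperties(delivery_mode=2),
)
connection.close()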
45 dockerfile Normal file
@@ -0,0 +1,45 @@
FROM python:3.11-slim

# Create and set the working directory
WORKDIR /app

# Prevent some interactive prompts during package installation
ENV DEBIAN_FRONTEND=noninteractive

# Install ffmpeg and other system dependencies. The list largely mirrors
# the original project but omits PostgreSQL development headers which are
# unused here. We include libgl1 and libglib2.0-0 so that MoviePy
# (through its dependencies) can find OpenGL and GLib when using the
# Pillow and numpy backends.
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
        ffmpeg \
        libgl1 \
        libglib2.0-0 \
        build-essential \
        xvfb \
        xdg-utils \
        wget \
        unzip \
        libmagick++-dev \
        imagemagick \
        fonts-liberation \
        sox \
        bc \
        gsfonts && \
    rm -rf /var/lib/apt/lists/*

# Copy dependency specification and install Python dependencies
COPY requirements.txt ./
RUN pip install --no-cache-dir -r requirements.txt

# Copy the rest of the application code
COPY . .

# Declare volumes for videos and outputs. These paths correspond to the
# mount points defined in the docker-compose file. Using VOLUME here
# documents the intended persistent storage locations.
VOLUME ["/app/videos", "/app/outputs"]

# The default command starts the consumer loop
CMD ["python", "-u", "main.py"]
234 llm.py Normal file
@@ -0,0 +1,234 @@
"""High-level helpers for interacting with the Gemini and OpenRouter APIs.

This module encapsulates all of the logic needed to call the LLM endpoints
used throughout the application. It uses the OpenAI Python client under the
hood because both Gemini and OpenRouter expose OpenAI-compatible APIs.

Two functions are exposed:

* ``select_highlights`` takes an SRT-like string (the transcription of a
  video) and returns a list of highlight objects with start and end
  timestamps and their corresponding text. It uses the Gemini model to
  identify which parts of the video are most likely to engage viewers on
  social media.
* ``generate_titles`` takes a list of highlight objects and returns a list
  of the same objects enriched with a ``topText`` field, which contains a
  sensational title for the clip. It uses the OpenRouter API with a model
  specified via the ``OPENROUTER_MODEL`` environment variable.

Both functions are resilient to malformed outputs from the models. They try
to extract the first JSON array found in the model responses; if that
fails, a descriptive exception is raised. These exceptions should be
handled by callers to post appropriate error messages back to the queue.
"""

from __future__ import annotations

import json
import os
import re
from typing import Any, Dict, List

import openai


class LLMError(Exception):
    """Raised when the LLM response cannot be parsed into the expected format."""


def _extract_json_array(text: str) -> Any:
    """Extract the first JSON array from a string.

    LLMs sometimes return explanatory text before or after the JSON. This
    helper uses a regular expression to find the first substring that
    resembles a JSON array (i.e. starts with '[' and ends with ']'). It
    returns the corresponding Python object if successful, otherwise
    raises an ``LLMError``.
    """
    # Remove Markdown code fences and other formatting noise
    cleaned = text.replace("`", "").replace("json", "")
    # Find the first [ ... ] block
    match = re.search(r"\[.*\]", cleaned, re.DOTALL)
    if not match:
        raise LLMError("Não foi possível encontrar um JSON válido na resposta da IA.")
    json_str = match.group(0)
    try:
        return json.loads(json_str)
    except json.JSONDecodeError as exc:
        raise LLMError(f"Erro ao decodificar JSON: {exc}")


def select_highlights(srt_text: str) -> List[Dict[str, Any]]:
    """Call the Gemini API to select highlight segments from a transcription.

    The input ``srt_text`` should be a string containing the transcription
    formatted like an SRT file, with lines of the form
    ``00:00:10,140 --> 00:01:00,990`` followed by the spoken text.

    Returns a list of dictionaries, each with ``start``, ``end`` and
    ``text`` keys. On failure to parse the response, an ``LLMError`` is
    raised.
    """
    api_key = os.environ.get("GEMINI_API_KEY")
    if not api_key:
        raise ValueError("GEMINI_API_KEY não definido no ambiente")

    model = os.environ.get("GEMINI_MODEL", "gemini-2.5-flash")

    # Initialise client for Gemini. The base_url points to the
    # generativelanguage API; see the official docs for details.
    client = openai.OpenAI(api_key=api_key, base_url="https://generativelanguage.googleapis.com/v1beta/openai/")

    # System prompt: instructs Gemini how to behave.
    system_prompt = (
        "Você é um assistente especializado em selecionar **HIGHLIGHTS** de vídeo "
        "a partir da transcrição com timestamps.\n"
        "Sua única função é **selecionar os trechos** conforme solicitado.\n"
        "- **Não resuma, não interprete, não gere comentários ou textos complementares.**\n"
        "- **Retorne a resposta exatamente no formato proposto pelo usuário**, sem adicionar ou remover nada além do pedido.\n"
        "- Cada trecho selecionado deve ter **no mínimo 60 segundos e no máximo 120 segundos** de duração.\n"
        "- Sempre responda **em português (PT-BR)**."
    )

    # Base prompt: describes how to select highlights and the format to return.
    base_prompt = (
        "Você assumirá o papel de um especialista em Marketing e Social Media, "
        "sua tarefa é selecionar as melhores partes de uma transcrição que irei fornecer.\n\n"
        "## Critérios de Seleção\n\n"
        "- Escolha trechos baseando-se em:\n"
        "  - **Picos de emoção ou impacto**\n"
        "  - **Viradas de assunto**\n"
        "  - **Punchlines** (frases de efeito, momentos de virada)\n"
        "  - **Informações-chave**\n\n"
        "## Regras Rápidas\n\n"
        "- Sempre devolver pelo menos 3 trechos, não possui limite máximo\n"
        "- Garanta que cada trecho fique com no MÍNIMO 60 segundos e no MÁXIMO 120 segundos.\n"
        "- Nenhum outro texto além do JSON final.\n\n"
        "## Restrições de Duração\n\n"
        "- **Duração mínima do trecho escolhido:** 60 segundos\n"
        "- **Duração máxima do trecho escolhido:** 90 a 120 segundos\n\n"
        "## Tarefa\n\n"
        "- Proponha o **máximo de trechos** com potencial, mas **sempre devolva no mínimo 3 trechos**.\n"
        "- Extraia os trechos **apenas** da transcrição fornecida abaixo.\n\n"
        "## IMPORTANTE\n"
        "- Cada trecho deve ter no mínimo 60 segundos, e no máximo 120 segundos. Isso é indiscutível\n\n"
        "## Entrada\n\n"
        "- Transcrição:\n\n"
        f"{srt_text}\n\n"
        "## Saída\n\n"
        "- Retorne **somente** a lista de trechos selecionados em formato JSON, conforme o exemplo abaixo.\n"
        "- **Não escreva comentários ou qualquer texto extra.**\n"
        "- No atributo \"text\", inclua o texto presente no trecho escolhido.\n\n"
        "### Exemplo de Conversão\n\n"
        "#### De SRT:\n"
        "00:00:10,140 --> 00:01:00,990\n"
        "Exemplo de escrita presente no trecho\n\n"
        "#### Para JSON:\n"
        "[\n"
        "  {\n"
        "    \"start\": \"00:00:10,140\",\n"
        "    \"end\": \"00:01:00,990\",\n"
        "    \"text\": \"Exemplo de escrita presente no trecho\"\n"
        "  }\n"
        "]\n"
    )

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": base_prompt},
    ]
    try:
        response = client.chat.completions.create(model=model, messages=messages)
    except Exception as exc:
        raise LLMError(f"Erro ao chamar a API Gemini: {exc}")
    # Extract message content
    content = response.choices[0].message.content if response.choices else None
    if not content:
        raise LLMError("A resposta da Gemini veio vazia.")
    result = _extract_json_array(content)
    if not isinstance(result, list):
        raise LLMError("O JSON retornado pela Gemini não é uma lista.")
    return result


def generate_titles(highlights: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Call the OpenRouter API to generate a title (topText) for each highlight.

    The ``highlights`` argument should be a list of dictionaries as returned
    by ``select_highlights``, each containing ``start``, ``end`` and ``text``.
    This function adds a ``topText`` field to each dictionary using the
    OpenRouter model specified via the ``OPENROUTER_MODEL`` environment
    variable. If parsing fails, an ``LLMError`` is raised.
    """
    api_key = os.environ.get("OPENROUTER_API_KEY")
    if not api_key:
        raise ValueError("OPENROUTER_API_KEY não definido no ambiente")
    model = os.environ.get("OPENROUTER_MODEL")
    if not model:
        raise ValueError("OPENROUTER_MODEL não definido no ambiente")
    # Create client for OpenRouter
    client = openai.OpenAI(api_key=api_key, base_url="https://openrouter.ai/api/v1")

    # Compose prompt: instruct to generate titles only
    prompt_header = (
        "Você é um especialista em Marketing Digital e Criação de Conteúdo Viral.\n\n"
        "Sua tarefa é criar **títulos sensacionalistas** (*topText*) para cada trecho "
        "de transcrição recebido em formato JSON.\n\n"
        "## Instruções\n\n"
        "- O texto deve ser **chamativo, impactante** e com alto potencial de viralização "
        "em redes sociais, **mas sem sair do contexto do trecho**.\n"
        "- Use expressões fortes e curiosas, mas **nunca palavras de baixo calão**.\n"
        "- Cada *topText* deve ter **no máximo 2 linhas**.\n"
        "- Utilize **exclusivamente** o conteúdo do trecho; não invente fatos.\n"
        "- Não adicione comentários, explicações, ou qualquer texto extra na resposta.\n"
        "- Responda **apenas** no seguinte formato (mantendo as chaves e colchetes):\n\n"
        "[\n  {\n    \"start\": \"00:00:10,140\",\n    \"end\": \"00:01:00,990\",\n    \"topText\": \"Título impactante\"\n  }\n]\n\n"
        "## Observações:\n\n"
        "- Nunca fuja do contexto do trecho.\n"
        "- Não invente informações.\n"
        "- Não utilize palavrões.\n"
        "- Não escreva nada além do JSON de saída.\n\n"
        "Aqui estão os trechos em JSON:\n"
    )
    # Compose input JSON for the model
    json_input = json.dumps(highlights, ensure_ascii=False)
    full_message = prompt_header + json_input
    messages = [
        {
            "role": "system",
            "content": "Você é um assistente útil e objetivo."
        },
        {
            "role": "user",
            "content": full_message
        },
    ]
    try:
        response = client.chat.completions.create(
            model=model,
            messages=messages,
            temperature=0.7,
        )
    except Exception as exc:
        raise LLMError(f"Erro ao chamar a API OpenRouter: {exc}")
    content = response.choices[0].message.content if response.choices else None
    if not content:
        raise LLMError("A resposta da OpenRouter veio vazia.")
    result = _extract_json_array(content)
    if not isinstance(result, list):
        raise LLMError("O JSON retornado pela OpenRouter não é uma lista.")
    # Merge topText back into highlights
    # We assume the result list has the same order and length as input highlights
    enriched: List[Dict[str, Any]] = []
    input_map = {(item["start"], item["end"]): item for item in highlights}
    for item in result:
        key = (item.get("start"), item.get("end"))
        original = input_map.get(key)
        if original is None:
            # If the model returns unexpected entries, skip them
            continue
        enriched_item = original.copy()
        # Only topText is expected
        enriched_item["topText"] = item.get("topText", "").strip()
        enriched.append(enriched_item)
    return enriched
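A minimal usage sketch for the two helpers above, assuming GEMINI_API_KEY, OPENROUTER_API_KEY and OPENROUTER_MODEL are set in the environment and the modules are importable as top-level modules; the SRT text and printed fields are illustrative only:

from llm import LLMError, select_highlights, generate_titles

srt_text = (
    "00:00:10,140 --> 00:01:15,990\n"
    "Primeiro trecho transcrito do vídeo...\n"
    "\n"
    "00:01:16,000 --> 00:02:20,500\n"
    "Segundo trecho transcrito do vídeo..."
)

try:
    highlights = select_highlights(srt_text)   # [{"start": ..., "end": ..., "text": ...}, ...]
    titled = generate_titles(highlights)       # same items, enriched with "topText"
    for clip in titled:
        print(clip["start"], clip["end"], clip["topText"])
except LLMError as exc:
    print(f"Falha ao consultar os modelos: {exc}")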
266 main.py Normal file
@@ -0,0 +1,266 @@
"""Entry point for the video processing pipeline.

This script listens to a RabbitMQ queue for new video processing tasks. When
a message arrives, it performs the following steps:

1. Creates a working directory for the video based on its filename.
2. Extracts the audio track with FFMPEG and runs Faster-Whisper to produce
   a transcription with word-level timestamps.
3. Uses the Gemini model to determine which parts of the video have the
   highest potential for engagement. These highlight segments are
   represented as a list of objects containing start/end timestamps and
   text.
4. Uses the OpenRouter model to generate a sensational title for each
   highlight. Only the ``topText`` field is kept; the description is
   intentionally omitted since the caption will be burned into the video.
5. Cuts the original video into individual clips corresponding to each
   highlight and renders them vertically with a title above and a dynamic
   caption below.
6. Publishes a message to the upload queue with information about the
   generated clips. On success, this message contains the list of output
   files. On failure, ``hasError`` will be set to ``True`` and the
   ``error`` field will describe what went wrong.
7. Cleans up temporary files (audio, transcript, working directory) and
   deletes the original source video from the ``videos`` directory to
   conserve disk space.

The queue names and RabbitMQ credentials are configured via environment
variables. See the accompanying ``docker-compose.yml`` for defaults.
"""

from __future__ import annotations

import json
import os
import shutil
import time
import traceback
from typing import Any, Dict, List

import pika

from .utils import sanitize_filename, seconds_to_timestamp, timestamp_to_seconds
from .transcribe import transcribe
from .llm import LLMError, select_highlights, generate_titles
from .render import render_clip


# Environment variables with sensible defaults
RABBITMQ_HOST = os.environ.get("RABBITMQ_HOST", "rabbitmq")
RABBITMQ_PORT = int(os.environ.get("RABBITMQ_PORT", 5672))
RABBITMQ_USER = os.environ.get("RABBITMQ_USER", "admin")
RABBITMQ_PASS = os.environ.get("RABBITMQ_PASS")
RABBITMQ_QUEUE = os.environ.get("RABBITMQ_QUEUE", "to-render")
RABBITMQ_UPLOAD_QUEUE = os.environ.get("RABBITMQ_UPLOAD_QUEUE", "to-upload")

if not RABBITMQ_PASS:
    raise RuntimeError("RABBITMQ_PASS não definido no ambiente")


def get_next_message() -> Any:
    """Retrieve a single message from the RABBITMQ_QUEUE.

    Returns ``None`` if no messages are available. This helper opens a new
    connection for each call to avoid keeping stale connections alive.
    """
    credentials = pika.PlainCredentials(RABBITMQ_USER, RABBITMQ_PASS)
    parameters = pika.ConnectionParameters(
        host=RABBITMQ_HOST,
        port=RABBITMQ_PORT,
        credentials=credentials,
        heartbeat=60,
        blocked_connection_timeout=300,
    )
    connection = pika.BlockingConnection(parameters)
    channel = connection.channel()
    method_frame, _, body = channel.basic_get(RABBITMQ_QUEUE)
    if method_frame:
        channel.basic_ack(method_frame.delivery_tag)
        connection.close()
        return body
    connection.close()
    return None


def publish_to_queue(payload: Dict[str, Any]) -> None:
    """Publish a JSON-serialisable payload to the RABBITMQ_UPLOAD_QUEUE."""
    credentials = pika.PlainCredentials(RABBITMQ_USER, RABBITMQ_PASS)
    parameters = pika.ConnectionParameters(
        host=RABBITMQ_HOST,
        port=RABBITMQ_PORT,
        credentials=credentials,
        heartbeat=60,
        blocked_connection_timeout=300,
    )
    connection = pika.BlockingConnection(parameters)
    channel = connection.channel()
    channel.queue_declare(queue=RABBITMQ_UPLOAD_QUEUE, durable=True)
    channel.basic_publish(
        exchange="",
        routing_key=RABBITMQ_UPLOAD_QUEUE,
        body=json.dumps(payload),
        properties=pika.BasicProperties(delivery_mode=2),
    )
    connection.close()


def build_srt(segments: List[Dict[str, Any]]) -> str:
    """Build an SRT-like string from a list of segments.

    Each segment should have ``start``, ``end`` and ``text`` fields. The
    timestamps are converted to the ``HH:MM:SS,mmm`` format expected by
    the Gemini prompt. Segments are separated by a blank line.
    """
    lines = []
    for seg in segments:
        start_ts = seconds_to_timestamp(seg["start"])
        end_ts = seconds_to_timestamp(seg["end"])
        lines.append(f"{start_ts} --> {end_ts}\n{seg['text']}")
    return "\n\n".join(lines)


def process_message(data: Dict[str, Any]) -> Dict[str, Any]:
    """Process a single video task described in ``data``.

    Returns the payload to be sent to the upload queue. Raises an
    exception on failure; the caller is responsible for catching it and
    posting an error payload.
    """
    filename = data.get("filename")
    if not filename:
        raise ValueError("Campo 'filename' ausente na mensagem")
    url = data.get("url")
    video_id = data.get("videoId")
    # Determine source video path; n8n stores videos in the 'videos' directory
    video_path = os.path.join("videos", filename)
    if not os.path.exists(video_path):
        raise FileNotFoundError(f"Arquivo de vídeo não encontrado: {video_path}")
    # Sanitize the filename to use as directory name
    base_no_ext = os.path.splitext(filename)[0]
    sanitized = sanitize_filename(base_no_ext)
    work_dir = os.path.join("app", "videos", sanitized)
    # Transcribe video
    segments, words = transcribe(video_path, work_dir)
    # Build SRT string
    srt_str = build_srt(segments)
    # Call Gemini to select highlights
    highlights = select_highlights(srt_str)
    # Convert start/end times to floats and keep original strings for openrouter
    for item in highlights:
        item["start"] = item["start"].strip()
        item["end"] = item["end"].strip()
    # Generate titles
    titles = generate_titles(highlights)
    # Render clips
    output_dir = os.path.join("outputs", sanitized)
    processed_files: List[str] = []
    for idx, item in enumerate(titles, start=1):
        start_sec = timestamp_to_seconds(item.get("start"))
        end_sec = timestamp_to_seconds(item.get("end"))
        # Extract relative words for caption
        relative_words = []
        for w in words:
            # Word must overlap clip interval
            if w["end"] <= start_sec or w["start"] >= end_sec:
                continue
            rel_start = max(0.0, w["start"] - start_sec)
            rel_end = max(0.0, w["end"] - start_sec)
            relative_words.append({
                "start": rel_start,
                "end": rel_end,
                "word": w["word"],
            })
        # If no words found (e.g. silence), create a dummy word to avoid errors
        if not relative_words:
            relative_words.append({"start": 0.0, "end": end_sec - start_sec, "word": ""})
        out_path = render_clip(
            video_path=video_path,
            start=start_sec,
            end=end_sec,
            top_text=item.get("topText", ""),
            words=relative_words,
            out_dir=output_dir,
            base_name=sanitized,
            idx=idx,
        )
        processed_files.append(out_path)
    # Compose payload
    payload = {
        "videosProcessedQuantity": len(processed_files),
        "filename": filename,
        "processedFiles": processed_files,
        "url": url,
        "videoId": video_id,
        "hasError": False,
        "error": None,
    }
    # Clean up working directory and original video
    shutil.rmtree(work_dir, ignore_errors=True)
    try:
        os.remove(video_path)
    except FileNotFoundError:
        pass
    return payload


def main():
    print(" [*] Esperando mensagens. Para sair: CTRL+C")
    while True:
        body = get_next_message()
        if body is None:
            time.sleep(5)
            continue
        try:
            data = json.loads(body)
        except Exception:
            print("⚠️ Mensagem inválida recebida (não é JSON)")
            continue
        try:
            result = process_message(data)
        except Exception as exc:
            # Print stack trace for debugging
            traceback.print_exc()
            # Attempt to clean up any directories based on filename
            filename = data.get("filename")
            sanitized = sanitize_filename(os.path.splitext(filename or "")[0]) if filename else ""
            work_dir = os.path.join("app", "videos", sanitized) if sanitized else None
            output_dir = os.path.join("outputs", sanitized) if sanitized else None
            # Remove working and output directories
            if work_dir:
                shutil.rmtree(work_dir, ignore_errors=True)
            if output_dir:
                shutil.rmtree(output_dir, ignore_errors=True)
            # Remove original video if present
            video_path = os.path.join("videos", filename) if filename else None
            if video_path and os.path.exists(video_path):
                try:
                    os.remove(video_path)
                except Exception:
                    pass
            # Build error payload
            error_payload = {
                "videosProcessedQuantity": 0,
                "filename": filename,
                "processedFiles": [],
                "url": data.get("url"),
                "videoId": data.get("videoId"),
                "hasError": True,
                "error": str(exc),
            }
            try:
                publish_to_queue(error_payload)
                print(f"Mensagem de erro publicada na fila '{RABBITMQ_UPLOAD_QUEUE}'.")
            except Exception as publish_err:
                print(f"Erro ao publicar mensagem de erro: {publish_err}")
            continue
        # On success publish payload
        try:
            publish_to_queue(result)
            print(f"Mensagem publicada na fila '{RABBITMQ_UPLOAD_QUEUE}'.")
        except Exception as publish_err:
            print(f"Erro ao publicar na fila '{RABBITMQ_UPLOAD_QUEUE}': {publish_err}")
        # Loop continues


if __name__ == "__main__":
    main()
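For clarity, the message contract implied by process_message, shown as illustrative Python dicts (all values are made up):

# Task consumed from RABBITMQ_QUEUE; only "filename" is mandatory and must exist under ./videos
task = {
    "filename": "entrevista_epica.mp4",
    "url": "https://example.com/watch?v=abc123",
    "videoId": "abc123",
}

# Payload published to RABBITMQ_UPLOAD_QUEUE on success
payload = {
    "videosProcessedQuantity": 3,
    "filename": "entrevista_epica.mp4",
    "processedFiles": [
        "outputs/entrevista_epica/clip_1.mp4",
        "outputs/entrevista_epica/clip_2.mp4",
        "outputs/entrevista_epica/clip_3.mp4",
    ],
    "url": "https://example.com/watch?v=abc123",
    "videoId": "abc123",
    "hasError": False,
    "error": None,
}

On failure the same fields are published with videosProcessedQuantity set to 0, processedFiles empty, hasError set to True and error carrying the exception message.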
205 render.py Normal file
@@ -0,0 +1,205 @@
"""Rendering logic for producing vertical clips with dynamic captions.

This module defines a single function ``render_clip`` which takes a video
segment and produces a vertical clip suitable for social media. Each clip
contains three regions:

* A top region (480px high) showing a title generated by an LLM.
* A middle region (960px high) containing the original video, scaled to
  fit horizontally while preserving aspect ratio and centred vertically.
* A bottom region (480px high) showing a dynamic caption. The caption
  displays a sliding window of three to five words from the transcript,
  colouring the currently spoken word differently to draw the viewer's
  attention.

The function uses the MoviePy library to compose the various elements and
writes the resulting video to disk. It returns the path to the created
file.
"""

from __future__ import annotations

import os
from typing import Dict, List

import numpy as np
from moviepy.video.io.VideoFileClip import VideoFileClip
from moviepy.video.VideoClip import ColorClip, VideoClip
from moviepy.video.compositing.CompositeVideoClip import CompositeVideoClip
from moviepy.video.VideoClip import TextClip
from PIL import Image, ImageDraw, ImageFont

from .utils import wrap_text


def render_clip(
    video_path: str,
    start: float,
    end: float,
    top_text: str,
    words: List[Dict[str, float]],
    out_dir: str,
    base_name: str,
    idx: int,
    # Use a widely available system font by default. DejaVuSans is installed
    # in most Debian-based containers. The caller can override this path.
    font_path: str = "/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf",
    final_width: int = 1080,
    final_height: int = 1920,
    top_h: int = 480,
    middle_h: int = 960,
    bottom_h: int = 480,
    video_codec: str = "libx264",
    bitrate: str = "3000k",
) -> str:
    """Render a single clip with title and dynamic caption.

    Parameters
    ----------
    video_path: str
        Path to the source video file.
    start: float
        Start time of the clip in seconds.
    end: float
        End time of the clip in seconds.
    top_text: str
        The title to display in the top region.
    words: List[Dict[str, float]]
        List of word-level timestamps for this clip. Each dict must have
        ``start``, ``end`` and ``word`` keys. The start and end values
        should be relative to the beginning of this clip (i.e. start at 0).
    out_dir: str
        Directory where the output file should be saved. The function
        creates this directory if it doesn't exist.
    base_name: str
        Base name of the original video (sanitized). Used to build the
        output filename.
    idx: int
        Index of the clip. Output will be named ``clip_{idx}.mp4``.
    font_path: str
        Path to the TrueType font to use for both title and caption.
    final_width: int
        Width of the final video in pixels.
    final_height: int
        Height of the final video in pixels.
    top_h: int
        Height of the title area in pixels.
    middle_h: int
        Height of the video area in pixels.
    bottom_h: int
        Height of the caption area in pixels.
    video_codec: str
        FFmpeg codec to use when writing the video.
    bitrate: str
        Bitrate for the output video.

    Returns
    -------
    str
        The path to the rendered video file.
    """
    os.makedirs(out_dir, exist_ok=True)
    # Extract the segment from the source video
    with VideoFileClip(video_path) as clip:
        segment = clip.subclip(start, end)
        dur = segment.duration
        # Background
        bg = ColorClip(size=(final_width, final_height), color=(0, 0, 0), duration=dur)
        # Resize video to fit width
        video_resized = segment.resize(width=final_width)
        # Compute vertical position to centre in the middle region
        y = top_h + (middle_h - video_resized.h) // 2
        video_resized = video_resized.set_position((0, y))

        # Build title clip
        # Wrap the title to avoid overflow
        wrapped_lines = wrap_text(top_text, max_chars=40)
        wrapped_title = "\n".join(wrapped_lines)
        title_clip = TextClip(
            wrapped_title,
            font=font_path,
            fontsize=70,
            color="white",
            method="caption",
            size=(final_width, top_h),
            align="center",
        ).set_duration(dur).set_position((0, 0))

        # Prepare font for caption rendering
        pil_font = ImageFont.truetype(font_path, size=60)
        default_color = (255, 255, 255)  # white
        highlight_color = (255, 215, 0)  # gold-like yellow

        # Precompute widths of a space and bounding box height for vertical centering
        space_width = pil_font.getbbox(" ")[2] - pil_font.getbbox(" ")[0]
        bbox = pil_font.getbbox("A")
        text_height = bbox[3] - bbox[1]

        def make_caption_frame(t: float):
            """Generate an image for the caption at time t."""
            # Determine current word index
            idx_cur = 0
            for i, w in enumerate(words):
                if w["start"] <= t < w["end"]:
                    idx_cur = i
                    break
                if t >= w["end"]:
                    idx_cur = i
            # Define window of words to display: show up to 5 words
            start_idx = max(0, idx_cur - 2)
            end_idx = min(len(words), idx_cur + 3)
            window = words[start_idx:end_idx]
            # Compute widths for each word
            word_sizes = []
            for w in window:
                bbox = pil_font.getbbox(w["word"])
                word_width = bbox[2] - bbox[0]
                word_sizes.append(word_width)
            total_width = sum(word_sizes) + space_width * (len(window) - 1 if window else 0)
            # Create blank image for caption area
            img = Image.new("RGB", (final_width, bottom_h), color=(0, 0, 0))
            draw = ImageDraw.Draw(img)
            x = int((final_width - total_width) / 2)
            y_pos = int((bottom_h - text_height) / 2)
            for j, w in enumerate(window):
                color = highlight_color if (start_idx + j) == idx_cur else default_color
                draw.text((x, y_pos), w["word"], font=pil_font, fill=color)
                x += word_sizes[j] + space_width
            return np.array(img)

        caption_clip = VideoClip(make_frame=make_caption_frame, duration=dur)
        caption_clip = caption_clip.set_position((0, final_height - bottom_h))

        # Compose final clip
        final = CompositeVideoClip([
            bg,
            video_resized,
            title_clip,
            caption_clip,
        ], size=(final_width, final_height))
        # Use the original audio from the video segment
        final_audio = segment.audio
        if final_audio is not None:
            final = final.set_audio(final_audio)
        # Define output path
        out_path = os.path.join(out_dir, f"clip_{idx}.mp4")
        # Write to disk
        final.write_videofile(
            out_path,
            codec=video_codec,
            fps=30,
            bitrate=bitrate,
            audio_codec="aac",
            preset="ultrafast",
            ffmpeg_params=[
                "-tune", "zerolatency",
                "-pix_fmt", "yuv420p",
                "-profile:v", "high",
                "-level", "4.1",
            ],
            threads=4,
        )
        # Close clips to free resources
        final.close()
        segment.close()
        return out_path
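A minimal sketch of calling render_clip directly, with a hypothetical word list whose timestamps are relative to the clip start (as prepared in main.py); paths and values are illustrative:

from render import render_clip

words = [
    {"start": 0.0, "end": 0.4, "word": "Olá"},
    {"start": 0.4, "end": 0.9, "word": "pessoal,"},
    {"start": 0.9, "end": 1.6, "word": "bem-vindos"},
]

out_path = render_clip(
    video_path="videos/entrevista_epica.mp4",
    start=10.14,
    end=75.99,
    top_text="O momento que ninguém esperava",
    words=words,
    out_dir="outputs/entrevista_epica",
    base_name="entrevista_epica",
    idx=1,
)
print(out_path)  # outputs/entrevista_epica/clip_1.mp4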
7 requirements.txt Normal file
@@ -0,0 +1,7 @@
pika==1.3.2
moviepy==1.0.3
faster-whisper==1.2.0
openai==1.16.0
numpy==1.26.4
Pillow==10.1.0
unidecode==1.3.6
111 transcribe.py Normal file
@@ -0,0 +1,111 @@
"""Utilities for extracting audio from video and generating transcriptions.

This module handles two tasks:

1. Use FFMPEG to extract the audio track from a video file into a WAV file
   suitable for consumption by the Whisper model. The audio is resampled to
   16 kHz mono PCM as required by Whisper.
2. Use the Faster-Whisper implementation to generate a transcription with
   word-level timestamps. The transcription is returned both as a list of
   segments (for building an SRT) and as a flattened list of words (for
   building dynamic subtitles).

If FFMPEG is not installed or fails, a ``RuntimeError`` is raised. The caller
is responsible for cleaning up the temporary files created in the working
directory.
"""

from __future__ import annotations

import os
import subprocess
from typing import Dict, List, Tuple

from faster_whisper import WhisperModel


def extract_audio_ffmpeg(video_path: str, audio_path: str) -> None:
    """Use FFMPEG to extract audio from ``video_path`` into ``audio_path``.

    The output will be a 16 kHz mono WAV file in PCM S16LE format. Any
    existing file at ``audio_path`` will be overwritten. If ffmpeg returns
    a non-zero exit code, a ``RuntimeError`` is raised with the stderr.
    """
    cmd = [
        "ffmpeg",
        "-y",  # overwrite output
        "-i",
        video_path,
        "-vn",  # disable video recording
        "-acodec",
        "pcm_s16le",
        "-ar",
        "16000",
        "-ac",
        "1",
        audio_path,
    ]
    proc = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    if proc.returncode != 0:
        raise RuntimeError(f"FFMPEG error: {proc.stderr.decode(errors='ignore')}")


def load_whisper_model() -> WhisperModel:
    """Instantiate and cache a Faster-Whisper model.

    The model name and device can be configured via the ``WHISPER_MODEL`` and
    ``WHISPER_DEVICE`` environment variables. The default model is
    ``large-v3`` for best accuracy. The device can be ``cuda`` or ``cpu``.
    A module-level cache is used to prevent loading the model multiple times.
    """
    if hasattr(load_whisper_model, "_cache"):
        return load_whisper_model._cache  # type: ignore[attr-defined]
    model_name = os.environ.get("WHISPER_MODEL", "large-v3")
    device = os.environ.get("WHISPER_DEVICE", "cpu")
    # Compute type can be set via WHISPER_COMPUTE_TYPE; default to float16 on GPU
    compute_type = os.environ.get("WHISPER_COMPUTE_TYPE")
    # If not explicitly set, choose sensible defaults
    if compute_type is None:
        compute_type = "float16" if device == "cuda" else "int8"
    model = WhisperModel(model_name, device=device, compute_type=compute_type)
    load_whisper_model._cache = model  # type: ignore[attr-defined]
    return model


def transcribe(video_path: str, work_dir: str) -> Tuple[List[Dict[str, float]], List[Dict[str, float]]]:
    """Transcribe a video file using Faster-Whisper.

    ``video_path`` is the path to the video to transcribe. ``work_dir`` is a
    directory where temporary files will be stored (audio file and
    transcription). The function returns a tuple ``(segments, words)`` where
    ``segments`` is a list of dictionaries with ``start``, ``end`` and
    ``text`` fields, and ``words`` is a flat list of dictionaries with
    ``start``, ``end`` and ``word`` fields covering the entire video.
    The timestamps are expressed in seconds as floats.
    """
    os.makedirs(work_dir, exist_ok=True)
    audio_path = os.path.join(work_dir, "audio.wav")
    # Extract audio
    extract_audio_ffmpeg(video_path, audio_path)
    # Load Whisper model
    model = load_whisper_model()
    # Run transcription with word-level timestamps
    segments, info = model.transcribe(audio_path, word_timestamps=True)
    seg_list: List[Dict[str, float]] = []
    words_list: List[Dict[str, float]] = []
    for seg in segments:
        seg_list.append({
            "start": float(seg.start),
            "end": float(seg.end),
            "text": seg.text.strip(),
        })
        # Each segment may contain words attribute
        for w in getattr(seg, "words", []) or []:
            words_list.append({
                "start": float(w.start),
                "end": float(w.end),
                "word": w.word,
            })
    # Sort words by start time to be safe
    words_list.sort(key=lambda d: d["start"])
    return seg_list, words_list
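A short sketch of running transcribe on its own; the work directory is created if missing and receives the intermediate audio.wav (paths and printed values are illustrative):

from transcribe import transcribe

segments, words = transcribe("videos/entrevista_epica.mp4", "app/videos/entrevista_epica")
print(len(segments), "segments /", len(words), "words")
print(segments[0])  # e.g. {"start": 0.0, "end": 4.2, "text": "Olá pessoal, bem-vindos..."}
print(words[0])     # e.g. {"start": 0.0, "end": 0.4, "word": "Olá"}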
93 utils.py Normal file
@@ -0,0 +1,93 @@
import re
import unicodedata
from typing import List


def sanitize_filename(name: str) -> str:
    """Return a sanitized version of a filename.

    This helper removes accents, converts to lowercase, replaces spaces
    with underscores and removes any non alphanumeric characters except
    underscores and dots. This makes the directory names safe to use on
    most filesystems and matches the behaviour described in the spec.
    """
    if not name:
        return ""
    # Decompose Unicode characters and strip accents
    nfkd_form = unicodedata.normalize("NFKD", name)
    no_accents = "".join(c for c in nfkd_form if not unicodedata.combining(c))
    # Replace spaces with underscores
    no_spaces = no_accents.replace(" ", "_")
    # Lowercase and remove any character that is not a letter, digit, dot or underscore
    sanitized = re.sub(r"[^A-Za-z0-9_.]+", "", no_spaces).lower()
    return sanitized


def timestamp_to_seconds(ts: str) -> float:
    """Convert a timestamp in HH:MM:SS,mmm format to seconds.

    The Gemini and OpenRouter prompts use timestamps formatted with a comma
    as the decimal separator. This helper splits the string into hours,
    minutes and seconds and returns a float expressed in seconds.
    """
    if ts is None:
        return 0.0
    ts = ts.strip()
    if not ts:
        return 0.0
    # Replace comma by dot for decimal seconds
    ts = ts.replace(",", ".")
    parts = ts.split(":")
    parts = [float(p) for p in parts]
    if len(parts) == 3:
        h, m, s = parts
        return h * 3600 + m * 60 + s
    elif len(parts) == 2:
        m, s = parts
        return m * 60 + s
    else:
        # only seconds
        return parts[0]


def seconds_to_timestamp(seconds: float) -> str:
    """Convert a time in seconds to HH:MM:SS,mmm format expected by SRT."""
    if seconds < 0:
        seconds = 0
    h = int(seconds // 3600)
    m = int((seconds % 3600) // 60)
    s = seconds % 60
    # Format with comma as decimal separator and three decimal places
    return f"{h:02d}:{m:02d}:{s:06.3f}".replace(".", ",")


def wrap_text(text: str, max_chars: int = 80) -> List[str]:
    """Simple word-wrap for a string.

    Splits ``text`` into a list of lines, each at most ``max_chars``
    characters long. This does not attempt to hyphenate words – a word
    longer than ``max_chars`` will occupy its own line. The return value
    is a list of lines without trailing whitespace.
    """
    if not text:
        return []
    words = text.split()
    lines: List[str] = []
    current: List[str] = []
    current_len = 0
    for word in words:
        # If adding this word would exceed the max, flush current line
        if current and current_len + 1 + len(word) > max_chars:
            lines.append(" ".join(current))
            current = [word]
            current_len = len(word)
        else:
            # Add to current line
            if current:
                current_len += 1 + len(word)
            else:
                current_len = len(word)
            current.append(word)
    if current:
        lines.append(" ".join(current))
    return lines
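Illustrative inputs and outputs for the helpers above:

from utils import sanitize_filename, timestamp_to_seconds, seconds_to_timestamp, wrap_text

print(sanitize_filename("Meu Vídeo 01"))     # meu_video_01
print(timestamp_to_seconds("00:01:30,500"))  # 90.5
print(seconds_to_timestamp(90.5))            # 00:01:30,500
print(wrap_text("um titulo bem chamativo para o clipe", max_chars=15))
# ['um titulo bem', 'chamativo para', 'o clipe']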