# video-render/llm.py

"""High-level helpers for interacting with the Gemini and OpenRouter APIs.

This module encapsulates all of the logic needed to call the LLM endpoints
used throughout the application. It uses the OpenAI Python client under the
hood because both Gemini and OpenRouter expose OpenAI-compatible APIs.

Two functions are exposed:

* ``select_highlights`` takes an SRT-like string (the transcription of a
  video) and returns a list of highlight objects with start and end
  timestamps and their corresponding text. It uses the Gemini model to
  identify which parts of the video are most likely to engage viewers on
  social media.
* ``generate_titles`` takes a list of highlight objects and returns a list
  of the same objects enriched with a ``topText`` field, which contains a
  sensational title for the clip. It uses the OpenRouter API with a model
  specified via the ``OPENROUTER_MODEL`` environment variable.

Both functions are resilient to malformed outputs from the models. They try
to extract the first JSON array found in the model responses; if that
fails, a descriptive exception is raised. These exceptions should be
handled by callers to post appropriate error messages back to the queue.
"""

from __future__ import annotations

import json
import os
import re
from typing import Any, Dict, List

import openai


class LLMError(Exception):
    """Signal that an LLM call did not yield usable, well-formed data.

    The helpers in this module raise it when the API request fails, when
    the response comes back empty, or when no JSON list can be decoded
    from the model output.
    """


def _extract_json_array(text: str) -> Any:
    """Extract the first JSON array from a string.

    LLMs sometimes return explanatory text before or after the JSON. This
    helper uses a regular expression to find the first substring that
    resembles a JSON array (i.e. starts with '[' and ends with ']'). It
    returns the corresponding Python object if successful, otherwise
    raises a ``LLMError``.
    """
    # Remove Markdown code fences and other formatting noise
    cleaned = text.replace("`", "").replace("json", "")
    # Find the first [ ... ] block
    match = re.search(r"\[.*\]", cleaned, re.DOTALL)
    if not match:
        raise LLMError("Não foi possível encontrar um JSON válido na resposta da IA.")
    json_str = match.group(0)
    try:
        return json.loads(json_str)
    except json.JSONDecodeError as exc:
        raise LLMError(f"Erro ao decodificar JSON: {exc}")


def select_highlights(srt_text: str) -> List[Dict[str, Any]]:
    """Ask the Gemini API to pick highlight segments from a transcription.

    The request goes through the OpenAI-compatible Gemini endpoint. The
    model name is read from the ``GEMINI_MODEL`` environment variable and
    falls back to ``gemini-2.5-flash``.

    Args:
        srt_text: Transcription formatted like an SRT file, i.e. lines of
            the form ``00:00:10,140 --> 00:01:00,990`` followed by the
            spoken text. It is embedded verbatim in the user prompt.

    Returns:
        The list decoded from the model response. Every element is a
        dictionary containing at least the ``start``, ``end`` and ``text``
        keys.

    Raises:
        ValueError: If ``GEMINI_API_KEY`` is not set in the environment.
        LLMError: If the API call fails, the response is empty, no JSON
            list can be extracted from it, or an element of the list is
            not a dictionary with the ``start``, ``end`` and ``text`` keys.
    """
    api_key = os.environ.get("GEMINI_API_KEY")
    if not api_key:
        raise ValueError("GEMINI_API_KEY não definido no ambiente")

    model = os.environ.get("GEMINI_MODEL", "gemini-2.5-flash")

    # Initialise client for Gemini. The base_url points to the
    # generativelanguage API; see the official docs for details.
    client = openai.OpenAI(api_key=api_key, base_url="https://generativelanguage.googleapis.com/v1beta/openai/")

    # System prompt: instructs Gemini how to behave.
    system_prompt = (
        "Você é um assistente especializado em selecionar **HIGHLIGHTS** de vídeo "
        "a partir da transcrição com timestamps.\n"
        "Sua única função é **selecionar os trechos** conforme solicitado.\n"
        "- **Não resuma, não interprete, não gere comentários ou textos complementares.**\n"
        "- **Retorne a resposta exatamente no formato proposto pelo usuário**, sem adicionar ou remover nada além do pedido.\n"
        "- Cada trecho selecionado deve ter **no mínimo 60 segundos e no máximo 120 segundos** de duração.\n"
        "- Sempre responda **em português (PT-BR)**."
    )

    # Base prompt: describes how to select highlights and the format to return.
    base_prompt = (
        "Você assumirá o papel de um especialista em Marketing e Social Media, "
        "sua tarefa é selecionar as melhores partes de uma transcrição que irei fornecer.\n\n"
        "## Critérios de Seleção\n\n"
        "- Escolha trechos baseando-se em:\n"
        "  - **Picos de emoção ou impacto**\n"
        "  - **Viradas de assunto**\n"
        "  - **Punchlines** (frases de efeito, momentos de virada)\n"
        "  - **Informações-chave**\n\n"
        "## Regras Rápidas\n\n"
        "- Sempre devolver pelo menos 3 trechos, não possui limite máximo\n"
        "- Garanta que cada trecho fique com no MÍNIMO 60 segundos e no MÁXIMO 120 segundos.\n"
        "- Nenhum outro texto além do JSON final.\n\n"
        "## Restrições de Duração\n\n"
        "- **Duração mínima do trecho escolhido:** 60 segundos\n"
        "- **Duração máxima do trecho escolhido:** 90 a 120 segundos\n\n"
        "## Tarefa\n\n"
        "- Proponha o **máximo de trechos** com potencial, mas **sempre devolva no mínimo 3 trechos**.\n"
        "- Extraia os trechos **apenas** da transcrição fornecida abaixo.\n\n"
        "## IMPORTANTE\n"
        "- Cada trecho deve ter no mínimo 60 segundos, e no máximo 120 segundos. Isso é indiscutível\n\n"
        "## Entrada\n\n"
        "- Transcrição:\n\n"
        f"{srt_text}\n\n"
        "## Saída\n\n"
        "- Retorne **somente** a lista de trechos selecionados em formato JSON, conforme o exemplo abaixo.\n"
        "- **Não escreva comentários ou qualquer texto extra.**\n"
        "- No atributo \"text\", inclua o texto presente no trecho escolhido.\n\n"
        "### Exemplo de Conversão\n\n"
        "#### De SRT:\n"
        "00:00:10,140 --> 00:01:00,990\n"
        "Exemplo de escrita presente no trecho\n\n"
        "#### Para JSON:\n"
        "[\n"
        "  {\n"
        "    \"start\": \"00:00:10,140\",\n"
        "    \"end\": \"00:01:00,990\",\n"
        "    \"text\": \"Exemplo de escrita presente no trecho\"\n"
        "  }\n"
        "]\n"
    )

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": base_prompt},
    ]
    try:
        response = client.chat.completions.create(model=model, messages=messages)
    except Exception as exc:
        # Boundary with the remote service: any client failure is reported
        # as LLMError, keeping the original exception as the cause.
        raise LLMError(f"Erro ao chamar a API Gemini: {exc}") from exc
    # Extract message content
    content = response.choices[0].message.content if response.choices else None
    if not content:
        raise LLMError("A resposta da Gemini veio vazia.")
    result = _extract_json_array(content)
    if not isinstance(result, list):
        raise LLMError("O JSON retornado pela Gemini não é uma lista.")
    # Enforce the documented shape here so malformed model output surfaces
    # as LLMError instead of a KeyError/AttributeError further downstream.
    required_keys = {"start", "end", "text"}
    for item in result:
        if not isinstance(item, dict) or not required_keys <= item.keys():
            raise LLMError(
                "A Gemini retornou um trecho fora do formato esperado (start, end, text)."
            )
    return result


def generate_titles(highlights: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Ask the OpenRouter API for a title (``topText``) for each highlight.

    The highlights are serialised to JSON and appended to the prompt. The
    model answer is matched back to the input by its ``(start, end)``
    pair, not by position.

    Args:
        highlights: Dictionaries as returned by ``select_highlights``;
            each one must contain the ``start`` and ``end`` keys.

    Returns:
        Shallow copies of the matched input highlights with an added
        ``topText`` string, in the order the model returned them. Entries
        in the model answer that match no input highlight are skipped, and
        highlights the model left out are absent from the result. An empty
        ``highlights`` list yields an empty list without calling the API.

    Raises:
        ValueError: If ``OPENROUTER_API_KEY`` or ``OPENROUTER_MODEL`` is
            not set in the environment.
        LLMError: If the API call fails, the response is empty, or no JSON
            list can be extracted from it.
    """
    api_key = os.environ.get("OPENROUTER_API_KEY")
    if not api_key:
        raise ValueError("OPENROUTER_API_KEY não definido no ambiente")
    model = os.environ.get("OPENROUTER_MODEL")
    if not model:
        raise ValueError("OPENROUTER_MODEL não definido no ambiente")
    if not highlights:
        # Nothing to title: avoid a pointless (and billable) API round trip.
        return []
    # Create client for OpenRouter
    client = openai.OpenAI(api_key=api_key, base_url="https://openrouter.ai/api/v1")

    # Compose prompt: instruct to generate titles only
    prompt_header = (
        "Você é um especialista em Marketing Digital e Criação de Conteúdo Viral.\n\n"
        "Sua tarefa é criar **títulos sensacionalistas** (*topText*) para cada trecho "
        "de transcrição recebido em formato JSON.\n\n"
        "## Instruções\n\n"
        "- O texto deve ser **chamativo, impactante** e com alto potencial de viralização "
        "em redes sociais, **mas sem sair do contexto do trecho**.\n"
        "- Use expressões fortes e curiosas, mas **nunca palavras de baixo calão**.\n"
        "- Cada *topText* deve ter **no máximo 2 linhas**.\n"
        "- Utilize **exclusivamente** o conteúdo do trecho; não invente fatos.\n"
        "- Não adicione comentários, explicações, ou qualquer texto extra na resposta.\n"
        "- Responda **apenas** no seguinte formato (mantendo as chaves e colchetes):\n\n"
        "[\n  {\n    \"start\": \"00:00:10,140\",\n    \"end\": \"00:01:00,990\",\n    \"topText\": \"Título impactante\"\n  }\n]\n\n"
        "## Observações:\n\n"
        "- Nunca fuja do contexto do trecho.\n"
        "- Não invente informações.\n"
        "- Não utilize palavrões.\n"
        "- Não escreva nada além do JSON de saída.\n\n"
        "Aqui estão os trechos em JSON:\n"
    )
    # Compose input JSON for the model
    json_input = json.dumps(highlights, ensure_ascii=False)
    full_message = prompt_header + json_input
    messages = [
        {
            "role": "system",
            "content": "Você é um assistente útil e objetivo."
        },
        {
            "role": "user",
            "content": full_message
        },
    ]
    try:
        response = client.chat.completions.create(
            model=model,
            messages=messages,
            temperature=0.7,
        )
    except Exception as exc:
        # Boundary with the remote service: any client failure is reported
        # as LLMError, keeping the original exception as the cause.
        raise LLMError(f"Erro ao chamar a API OpenRouter: {exc}") from exc
    content = response.choices[0].message.content if response.choices else None
    if not content:
        raise LLMError("A resposta da OpenRouter veio vazia.")
    result = _extract_json_array(content)
    if not isinstance(result, list):
        raise LLMError("O JSON retornado pela OpenRouter não é uma lista.")
    # Merge topText back into the highlights, matching on (start, end) so
    # the model may reorder, omit or add entries without breaking the merge.
    enriched: List[Dict[str, Any]] = []
    input_map = {(item["start"], item["end"]): item for item in highlights}
    for item in result:
        if not isinstance(item, dict):
            # The model may emit stray scalars/strings; they carry no title.
            continue
        original = input_map.get((item.get("start"), item.get("end")))
        if original is None:
            # If the model returns unexpected entries, skip them
            continue
        title = item.get("topText")
        enriched_item = original.copy()
        # A missing, null or non-string topText becomes an empty title
        # rather than crashing on .strip().
        enriched_item["topText"] = title.strip() if isinstance(title, str) else ""
        enriched.append(enriched_item)
    return enriched