diff --git a/.gitignore b/.gitignore deleted file mode 100644 index d4d72bc..0000000 --- a/.gitignore +++ /dev/null @@ -1,98 +0,0 @@ -__pycache__/ -*.py[cod] -*$py.class -*.so -.Python -build/ -develop-eggs/ -dist/ -downloads/ -eggs/ -.eggs/ -lib/ -lib64/ -parts/ -sdist/ -var/ -wheels/ -share/python-wheels/ -*.egg-info/ -.installed.cfg -*.egg -MANIFEST -*.manifest -*.spec -pip-log.txt -pip-delete-this-directory.txt -htmlcov/ -.tox/ -.nox/ -.coverage -.coverage.* -.cache -nosetests.xml -coverage.xml -*.cover -*.py,cover -.hypothesis/ -.pytest_cache/ -cover/ -*.mo -*.pot -*.log -local_settings.py -db.sqlite3 -db.sqlite3-journal -instance/ -.webassets-cache -.scrapy -docs/_build/ -.pybuilder/ -target/ -.ipynb_checkpoints -profile_default/ -ipython_config.py - -.pdm.toml - -__pypackages__/ - -celerybeat-schedule -celerybeat.pid - -.env -.venv -env/ -venv/ -ENV/ -env.bak/ -venv.bak/ -.spyderproject -.spyproject -.ropeproject - -/site - -.mypy_cache/ -.dmypy.json -dmypy.json - -.pyre/ - -.pytype/ - -cython_debug/ -.idea/ -.vscode/ -*.code-workspace -*.local -*.mp4 -*.wav -*.mp3 -*.srt -*.vtt -*.json -*.csv -*.xlsx -*.db -*.sqlite3 diff --git a/__init__.py b/__init__.py deleted file mode 100644 index b437409..0000000 --- a/__init__.py +++ /dev/null @@ -1 +0,0 @@ -"""Top-level package for the video processing pipeline.""" \ No newline at end of file diff --git a/__pycache__/llm.cpython-311.pyc b/__pycache__/llm.cpython-311.pyc deleted file mode 100644 index 36d44a6..0000000 Binary files a/__pycache__/llm.cpython-311.pyc and /dev/null differ diff --git a/__pycache__/main.cpython-311.pyc b/__pycache__/main.cpython-311.pyc deleted file mode 100644 index 02fec65..0000000 Binary files a/__pycache__/main.cpython-311.pyc and /dev/null differ diff --git a/__pycache__/render.cpython-311.pyc b/__pycache__/render.cpython-311.pyc deleted file mode 100644 index 634cd2e..0000000 Binary files a/__pycache__/render.cpython-311.pyc and /dev/null differ diff --git a/__pycache__/transcribe.cpython-311.pyc b/__pycache__/transcribe.cpython-311.pyc deleted file mode 100644 index cac6337..0000000 Binary files a/__pycache__/transcribe.cpython-311.pyc and /dev/null differ diff --git a/__pycache__/utils.cpython-311.pyc b/__pycache__/utils.cpython-311.pyc deleted file mode 100644 index 3c4f202..0000000 Binary files a/__pycache__/utils.cpython-311.pyc and /dev/null differ diff --git a/docker-compose.yml b/docker-compose.yml index 5d575cc..b74bd0d 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -1,3 +1,8 @@ +# GEMINI_API_KEY="AIzaSyB5TPjSPPZG1Qb6EtblhKFAjvCOdY15rcw" +# YOUTUBE_API="https://totally-real-dingo.ngrok-free.app" +# OPENROUTER_API_KEY="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" +# OPENROUTER_MODEL="openai/gpt-oss-20b:free" + services: video-render-new: restart: unless-stopped @@ -6,19 +11,13 @@ services: environment: # RabbitMQ credentials - RABBITMQ_PASS=${RABBITMQ_PASS} - - RABBITMQ_HOST=${RABBITMQ_HOST} - - RABBITMQ_USER=${RABBITMQ_USER} - - RABBITMQ_PORT=${RABBITMQ_PORT} - - RABBITMQ_QUEUE=${RABBITMQ_QUEUE} - - RABBITMQ_UPLOAD_QUEUE=${RABBITMQ_UPLOAD_QUEUE} - # API keys for the LLMs - GEMINI_API_KEY=${GEMINI_API_KEY} + - GEMINI_MODEL=${GEMINI_MODEL:-gemini-2.5-pro} - OPENROUTER_API_KEY=${OPENROUTER_API_KEY} - - OPENROUTER_MODEL=${OPENROUTER_MODEL} - # Optional whisper settings - - WHISPER_MODEL=${WHISPER_MODEL} - - WHISPER_DEVICE=${WHISPER_DEVICE} - - WHISPER_COMPUTE_TYPE=${WHISPER_COMPUTE_TYPE} + - OPENROUTER_MODEL=${OPENROUTER_MODEL:-openai/gpt-oss-20b:free} + - 
FASTER_WHISPER_MODEL_SIZE=${FASTER_WHISPER_MODEL_SIZE:-small} + ports: + - "5000:5000" volumes: # Mount host directories into the container so that videos can be # provided and outputs collected. These paths can be customised when @@ -27,9 +26,18 @@ services: - "/root/videos:/app/videos" - "/root/outputs:/app/outputs" command: "python -u main.py" - networks: - - dokploy-network + # runtime: nvidia -networks: - dokploy-network: - external: true \ No newline at end of file + # networks: + # - dokploy-network + + # deploy: + # resources: + # reservations: + # devices: + # - driver: nvidia + # count: all + # capabilities: [gpu] +# networks: +# dokploy-network: +# external: true diff --git a/dockerfile b/dockerfile index dc30f99..048fdd3 100644 --- a/dockerfile +++ b/dockerfile @@ -21,6 +21,10 @@ RUN apt-get update && \ xdg-utils \ wget \ unzip \ + ffmpeg \ + libgomp1 \ + libpq-dev \ + vim \ libmagick++-dev \ imagemagick \ fonts-liberation \ diff --git a/llm.py b/llm.py deleted file mode 100644 index f0a5a2a..0000000 --- a/llm.py +++ /dev/null @@ -1,234 +0,0 @@ -"""High-level helpers for interacting with the Gemini and OpenRouter APIs. - -This module encapsulates all of the logic needed to call the LLM endpoints -used throughout the application. It uses the OpenAI Python client under the -hood because both Gemini and OpenRouter expose OpenAI-compatible APIs. - -Two functions are exposed: - -* ``select_highlights`` takes an SRT-like string (the transcription of a - video) and returns a list of highlight objects with start and end - timestamps and their corresponding text. It uses the Gemini model to - identify which parts of the video are most likely to engage viewers on - social media. -* ``generate_titles`` takes a list of highlight objects and returns a list - of the same objects enriched with a ``topText`` field, which contains a - sensational title for the clip. It uses the OpenRouter API with a model - specified via the ``OPENROUTER_MODEL`` environment variable. - -Both functions are resilient to malformed outputs from the models. They try -to extract the first JSON array found in the model responses; if that -fails, a descriptive exception is raised. These exceptions should be -handled by callers to post appropriate error messages back to the queue. -""" - -from __future__ import annotations - -import json -import os -import re -from typing import Any, Dict, List - -import openai - - -class LLMError(Exception): - """Raised when the LLM response cannot be parsed into the expected format.""" - - -def _extract_json_array(text: str) -> Any: - """Extract the first JSON array from a string. - - LLMs sometimes return explanatory text before or after the JSON. This - helper uses a regular expression to find the first substring that - resembles a JSON array (i.e. starts with '[' and ends with ']'). It - returns the corresponding Python object if successful, otherwise - raises a ``LLMError``. - """ - # Remove Markdown code fences and other formatting noise - cleaned = text.replace("`", "").replace("json", "") - # Find the first [ ... ] block - match = re.search(r"\[.*\]", cleaned, re.DOTALL) - if not match: - raise LLMError("Não foi possível encontrar um JSON válido na resposta da IA.") - json_str = match.group(0) - try: - return json.loads(json_str) - except json.JSONDecodeError as exc: - raise LLMError(f"Erro ao decodificar JSON: {exc}") - - -def select_highlights(srt_text: str) -> List[Dict[str, Any]]: - """Call the Gemini API to select highlight segments from a transcription. 
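For context, a minimal standalone sketch of what the removed `_extract_json_array` helper above does with a typical fenced model reply. This is a reimplementation for illustration only (the original raises `LLMError` rather than `ValueError`):

```python
import json
import re


def extract_json_array(text: str):
    # Mirrors the removed helper: strip backticks and the literal substring
    # "json" (a quirk of the original), then parse the first [...] block.
    cleaned = text.replace("`", "").replace("json", "")
    match = re.search(r"\[.*\]", cleaned, re.DOTALL)
    if not match:
        raise ValueError("no JSON array in model reply")
    return json.loads(match.group(0))


reply = '```json\n[{"start": "00:00:10,140", "end": "00:01:00,990", "text": "fala marcante"}]\n```'
print(extract_json_array(reply))
# -> [{'start': '00:00:10,140', 'end': '00:01:00,990', 'text': 'fala marcante'}]
```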
- - The input ``srt_text`` should be a string containing the transcription - formatted like an SRT file, with lines of the form - ``00:00:10,140 --> 00:01:00,990`` followed by the spoken text. - - Returns a list of dictionaries, each with ``start``, ``end`` and - ``text`` keys. On failure to parse the response, a ``LLMError`` is - raised. - """ - api_key = os.environ.get("GEMINI_API_KEY") - if not api_key: - raise ValueError("GEMINI_API_KEY não definido no ambiente") - - model = os.environ.get("GEMINI_MODEL", "gemini-2.5-flash") - - # Initialise client for Gemini. The base_url points to the - # generativelanguage API; see the official docs for details. - client = openai.OpenAI(api_key=api_key, base_url="https://generativelanguage.googleapis.com/v1beta/openai/") - - # System prompt: instructs Gemini how to behave. - system_prompt = ( - "Você é um assistente especializado em selecionar **HIGHLIGHTS** de vídeo " - "a partir da transcrição com timestamps.\n" - "Sua única função é **selecionar os trechos** conforme solicitado.\n" - "- **Não resuma, não interprete, não gere comentários ou textos complementares.**\n" - "- **Retorne a resposta exatamente no formato proposto pelo usuário**, sem adicionar ou remover nada além do pedido.\n" - "- Cada trecho selecionado deve ter **no mínimo 60 segundos e no máximo 120 segundos** de duração.\n" - "- Sempre responda **em português (PT-BR)**." - ) - - # Base prompt: describes how to select highlights and the format to return. - base_prompt = ( - "Você assumirá o papel de um especialista em Marketing e Social Media, " - "sua tarefa é selecionar as melhores partes de uma transcrição que irei fornecer.\n\n" - "## Critérios de Seleção\n\n" - "- Escolha trechos baseando-se em:\n" - " - **Picos de emoção ou impacto**\n" - " - **Viradas de assunto**\n" - " - **Punchlines** (frases de efeito, momentos de virada)\n" - " - **Informações-chave**\n\n" - "## Regras Rápidas\n\n" - "- Sempre devolver pelo menos 3 trechos, não possui limite máximo\n" - "- Garanta que cada trecho fique com no MÍNIMO 60 segundos e no MÁXIMO 120 segundos.\n" - "- Nenhum outro texto além do JSON final.\n\n" - "## Restrições de Duração\n\n" - "- **Duração mínima do trecho escolhido:** 60 segundos\n" - "- **Duração máxima do trecho escolhido:** 90 a 120 segundos\n\n" - "## Tarefa\n\n" - "- Proponha o **máximo de trechos** com potencial, mas **sempre devolva no mínimo 3 trechos**.\n" - "- Extraia os trechos **apenas** da transcrição fornecida abaixo.\n\n" - "## IMPORTANTE\n" - "- Cada trecho deve ter no mínimo 60 segundos, e no máximo 120 segundos. 
Isso é indiscutível\n\n" - "## Entrada\n\n" - "- Transcrição:\n\n" - f"{srt_text}\n\n" - "## Saída\n\n" - "- Retorne **somente** a lista de trechos selecionados em formato JSON, conforme o exemplo abaixo.\n" - "- **Não escreva comentários ou qualquer texto extra.**\n" - "- No atributo \"text\", inclua o texto presente no trecho escolhido.\n\n" - "### Exemplo de Conversão\n\n" - "#### De SRT:\n" - "00:00:10,140 --> 00:01:00,990\n" - "Exemplo de escrita presente no trecho\n\n" - "#### Para JSON:\n" - "[\n" - " {\n" - " \"start\": \"00:00:10,140\",\n" - " \"end\": \"00:01:00,990\",\n" - " \"text\": \"Exemplo de escrita presente no trecho\"\n" - " }\n" - "]\n" - ) - - messages = [ - {"role": "system", "content": system_prompt}, - {"role": "user", "content": base_prompt}, - ] - try: - response = client.chat.completions.create(model=model, messages=messages) - except Exception as exc: - raise LLMError(f"Erro ao chamar a API Gemini: {exc}") - # Extract message content - content = response.choices[0].message.content if response.choices else None - if not content: - raise LLMError("A resposta da Gemini veio vazia.") - result = _extract_json_array(content) - if not isinstance(result, list): - raise LLMError("O JSON retornado pela Gemini não é uma lista.") - return result - - -def generate_titles(highlights: List[Dict[str, Any]]) -> List[Dict[str, Any]]: - """Call the OpenRouter API to generate a title (topText) for each highlight. - - The ``highlights`` argument should be a list of dictionaries as returned - by ``select_highlights``, each containing ``start``, ``end`` and ``text``. - This function adds a ``topText`` field to each dictionary using the - OpenRouter model specified via the ``OPENROUTER_MODEL`` environment - variable. If parsing fails, an ``LLMError`` is raised. 
- """ - api_key = os.environ.get("OPENROUTER_API_KEY") - if not api_key: - raise ValueError("OPENROUTER_API_KEY não definido no ambiente") - model = os.environ.get("OPENROUTER_MODEL") - if not model: - raise ValueError("OPENROUTER_MODEL não definido no ambiente") - # Create client for OpenRouter - client = openai.OpenAI(api_key=api_key, base_url="https://openrouter.ai/api/v1") - - # Compose prompt: instruct to generate titles only - prompt_header = ( - "Você é um especialista em Marketing Digital e Criação de Conteúdo Viral.\n\n" - "Sua tarefa é criar **títulos sensacionalistas** (*topText*) para cada trecho " - "de transcrição recebido em formato JSON.\n\n" - "## Instruções\n\n" - "- O texto deve ser **chamativo, impactante** e com alto potencial de viralização " - "em redes sociais, **mas sem sair do contexto do trecho**.\n" - "- Use expressões fortes e curiosas, mas **nunca palavras de baixo calão**.\n" - "- Cada *topText* deve ter **no máximo 2 linhas**.\n" - "- Utilize **exclusivamente** o conteúdo do trecho; não invente fatos.\n" - "- Não adicione comentários, explicações, ou qualquer texto extra na resposta.\n" - "- Responda **apenas** no seguinte formato (mantendo as chaves e colchetes):\n\n" - "[\n {\n \"start\": \"00:00:10,140\",\n \"end\": \"00:01:00,990\",\n \"topText\": \"Título impactante\"\n }\n]\n\n" - "## Observações:\n\n" - "- Nunca fuja do contexto do trecho.\n" - "- Não invente informações.\n" - "- Não utilize palavrões.\n" - "- Não escreva nada além do JSON de saída.\n\n" - "Aqui estão os trechos em JSON:\n" - ) - # Compose input JSON for the model - json_input = json.dumps(highlights, ensure_ascii=False) - full_message = prompt_header + json_input - messages = [ - { - "role": "system", - "content": "Você é um assistente útil e objetivo." - }, - { - "role": "user", - "content": full_message - }, - ] - try: - response = client.chat.completions.create( - model=model, - messages=messages, - temperature=0.7, - ) - except Exception as exc: - raise LLMError(f"Erro ao chamar a API OpenRouter: {exc}") - content = response.choices[0].message.content if response.choices else None - if not content: - raise LLMError("A resposta da OpenRouter veio vazia.") - result = _extract_json_array(content) - if not isinstance(result, list): - raise LLMError("O JSON retornado pela OpenRouter não é uma lista.") - # Merge topText back into highlights - # We assume the result list has the same order and length as input highlights - enriched: List[Dict[str, Any]] = [] - input_map = {(item["start"], item["end"]): item for item in highlights} - for item in result: - key = (item.get("start"), item.get("end")) - original = input_map.get(key) - if original is None: - # If the model returns unexpected entries, skip them - continue - enriched_item = original.copy() - # Only topText is expected - enriched_item["topText"] = item.get("topText", "").strip() - enriched.append(enriched_item) - return enriched \ No newline at end of file diff --git a/main.py b/main.py index 32fd1d1..1ef531b 100644 --- a/main.py +++ b/main.py @@ -1,265 +1,16 @@ -"""Entry point for the video processing pipeline. - -This script listens to a RabbitMQ queue for new video processing tasks. When -a message arrives, it performs the following steps: - -1. Creates a working directory for the video based off of its filename. -2. Extracts the audio track with FFMPEG and runs Faster-Whisper to produce - a transcription with word-level timestamps. -3. 
Uses the Gemini model to determine which parts of the video have the - highest potential for engagement. These highlight segments are - represented as a list of objects containing start/end timestamps and - text. -4. Uses the OpenRouter model to generate a sensational title for each - highlight. Only the ``topText`` field is kept; the description is - intentionally omitted since the caption will be burned into the video. -5. Cuts the original video into individual clips corresponding to each - highlight and renders them vertically with a title above and a dynamic - caption below. -6. Publishes a message to the upload queue with information about the - generated clips. On success, this message contains the list of output - files. On failure, ``hasError`` will be set to ``True`` and the - ``error`` field will describe what went wrong. -7. Cleans up temporary files (audio, transcript, working directory) and - deletes the original source video from the ``videos`` directory to - conserve disk space. - -The queue names and RabbitMQ credentials are configured via environment -variables. See the accompanying ``docker-compose.yml`` for defaults. -""" - -from __future__ import annotations - -import json -import os -import shutil -import time -import traceback -from typing import Any, Dict, List - -import pika - -from .utils import sanitize_filename, seconds_to_timestamp, timestamp_to_seconds -from .transcribe import transcribe -from .llm import LLMError, select_highlights, generate_titles -from .render import render_clip +from video_render.config import load_settings +from video_render.logging_utils import setup_logging +from video_render.messaging import RabbitMQWorker +from video_render.pipeline import VideoPipeline -# Environment variables with sensible defaults -RABBITMQ_HOST = os.environ.get("RABBITMQ_HOST", "rabbitmq") -RABBITMQ_PORT = int(os.environ.get("RABBITMQ_PORT", 5672)) -RABBITMQ_USER = os.environ.get("RABBITMQ_USER", "admin") -RABBITMQ_PASS = os.environ.get("RABBITMQ_PASS") -RABBITMQ_QUEUE = os.environ.get("RABBITMQ_QUEUE", "to-render") -RABBITMQ_UPLOAD_QUEUE = os.environ.get("RABBITMQ_UPLOAD_QUEUE", "to-upload") +def main() -> None: + setup_logging() + settings = load_settings() -if not RABBITMQ_PASS: - raise RuntimeError("RABBITMQ_PASS não definido no ambiente") - - -def get_next_message() -> Any: - """Retrieve a single message from the RABBITMQ_QUEUE. - - Returns ``None`` if no messages are available. This helper opens a new - connection for each call to avoid keeping stale connections alive. 
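To make the queue contract concrete, here is a hypothetical pair of messages matching the fields used by `process_message` later in this diff. The filename, URL, IDs and paths are invented:

```python
# Message consumed from the "to-render" queue.
incoming = {
    "filename": "entrevista.mp4",
    "url": "https://example.com/watch?v=abc123",
    "videoId": "abc123",
}

# Payload published to the "to-upload" queue on success.
published_on_success = {
    "videosProcessedQuantity": 3,
    "filename": "entrevista.mp4",
    "processedFiles": [
        "outputs/entrevista/clip_1.mp4",
        "outputs/entrevista/clip_2.mp4",
        "outputs/entrevista/clip_3.mp4",
    ],
    "url": "https://example.com/watch?v=abc123",
    "videoId": "abc123",
    "hasError": False,
    "error": None,
}
```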
- """ - credentials = pika.PlainCredentials(RABBITMQ_USER, RABBITMQ_PASS) - parameters = pika.ConnectionParameters( - host=RABBITMQ_HOST, - port=RABBITMQ_PORT, - credentials=credentials, - heartbeat=60, - blocked_connection_timeout=300, - ) - connection = pika.BlockingConnection(parameters) - channel = connection.channel() - method_frame, _, body = channel.basic_get(RABBITMQ_QUEUE) - if method_frame: - channel.basic_ack(method_frame.delivery_tag) - connection.close() - return body - connection.close() - return None - - -def publish_to_queue(payload: Dict[str, Any]) -> None: - """Publish a JSON-serialisable payload to the RABBITMQ_UPLOAD_QUEUE.""" - credentials = pika.PlainCredentials(RABBITMQ_USER, RABBITMQ_PASS) - parameters = pika.ConnectionParameters( - host=RABBITMQ_HOST, - port=RABBITMQ_PORT, - credentials=credentials, - heartbeat=60, - blocked_connection_timeout=300, - ) - connection = pika.BlockingConnection(parameters) - channel = connection.channel() - channel.queue_declare(queue=RABBITMQ_UPLOAD_QUEUE, durable=True) - channel.basic_publish( - exchange="", - routing_key=RABBITMQ_UPLOAD_QUEUE, - body=json.dumps(payload), - properties=pika.BasicProperties(delivery_mode=2), - ) - connection.close() - - -def build_srt(segments: List[Dict[str, Any]]) -> str: - """Build an SRT-like string from a list of segments. - - Each segment should have ``start``, ``end`` and ``text`` fields. The - timestamps are converted to the ``HH:MM:SS,mmm`` format expected by - the Gemini prompt. Segments are separated by a blank line. - """ - lines = [] - for seg in segments: - start_ts = seconds_to_timestamp(seg["start"]) - end_ts = seconds_to_timestamp(seg["end"]) - lines.append(f"{start_ts} --> {end_ts}\n{seg['text']}") - return "\n\n".join(lines) - - -def process_message(data: Dict[str, Any]) -> Dict[str, Any]: - """Process a single video task described in ``data``. - - Returns the payload to be sent to the upload queue. Raises an - exception on failure; the caller is responsible for catching it and - posting an error payload. 
- """ - filename = data.get("filename") - if not filename: - raise ValueError("Campo 'filename' ausente na mensagem") - url = data.get("url") - video_id = data.get("videoId") - # Determine source video path; n8n stores videos in the 'videos' directory - video_path = os.path.join("videos", filename) - if not os.path.exists(video_path): - raise FileNotFoundError(f"Arquivo de vídeo não encontrado: {video_path}") - # Sanitize the filename to use as directory name - base_no_ext = os.path.splitext(filename)[0] - sanitized = sanitize_filename(base_no_ext) - work_dir = os.path.join("app", "videos", sanitized) - # Transcribe video - segments, words = transcribe(video_path, work_dir) - # Build SRT string - srt_str = build_srt(segments) - # Call Gemini to select highlights - highlights = select_highlights(srt_str) - # Convert start/end times to floats and keep original strings for openrouter - for item in highlights: - item["start"] = item["start"].strip() - item["end"] = item["end"].strip() - # Generate titles - titles = generate_titles(highlights) - # Render clips - output_dir = os.path.join("outputs", sanitized) - processed_files: List[str] = [] - for idx, item in enumerate(titles, start=1): - start_sec = timestamp_to_seconds(item.get("start")) - end_sec = timestamp_to_seconds(item.get("end")) - # Extract relative words for caption - relative_words = [] - for w in words: - # Word must overlap clip interval - if w["end"] <= start_sec or w["start"] >= end_sec: - continue - rel_start = max(0.0, w["start"] - start_sec) - rel_end = max(0.0, w["end"] - start_sec) - relative_words.append({ - "start": rel_start, - "end": rel_end, - "word": w["word"], - }) - # If no words found (e.g. silence), create a dummy word to avoid errors - if not relative_words: - relative_words.append({"start": 0.0, "end": end_sec - start_sec, "word": ""}) - out_path = render_clip( - video_path=video_path, - start=start_sec, - end=end_sec, - top_text=item.get("topText", ""), - words=relative_words, - out_dir=output_dir, - base_name=sanitized, - idx=idx, - ) - processed_files.append(out_path) - # Compose payload - payload = { - "videosProcessedQuantity": len(processed_files), - "filename": filename, - "processedFiles": processed_files, - "url": url, - "videoId": video_id, - "hasError": False, - "error": None, - } - # Clean up working directory and original video - shutil.rmtree(work_dir, ignore_errors=True) - try: - os.remove(video_path) - except FileNotFoundError: - pass - return payload - - -def main(): - print(" [*] Esperando mensagens. 
Para sair: CTRL+C") - while True: - body = get_next_message() - if body is None: - time.sleep(5) - continue - try: - data = json.loads(body) - except Exception: - print("⚠️ Mensagem inválida recebida (não é JSON)") - continue - try: - result = process_message(data) - except Exception as exc: - # Print stack trace for debugging - traceback.print_exc() - # Attempt to clean up any directories based on filename - filename = data.get("filename") - sanitized = sanitize_filename(os.path.splitext(filename or "")[0]) if filename else "" - work_dir = os.path.join("app", "videos", sanitized) if sanitized else None - output_dir = os.path.join("outputs", sanitized) if sanitized else None - # Remove working and output directories - if work_dir: - shutil.rmtree(work_dir, ignore_errors=True) - if output_dir: - shutil.rmtree(output_dir, ignore_errors=True) - # Remove original video if present - video_path = os.path.join("videos", filename) if filename else None - if video_path and os.path.exists(video_path): - try: - os.remove(video_path) - except Exception: - pass - # Build error payload - error_payload = { - "videosProcessedQuantity": 0, - "filename": filename, - "processedFiles": [], - "url": data.get("url"), - "videoId": data.get("videoId"), - "hasError": True, - "error": str(exc), - } - try: - publish_to_queue(error_payload) - print(f"Mensagem de erro publicada na fila '{RABBITMQ_UPLOAD_QUEUE}'.") - except Exception as publish_err: - print(f"Erro ao publicar mensagem de erro: {publish_err}") - continue - # On success publish payload - try: - publish_to_queue(result) - print(f"Mensagem publicada na fila '{RABBITMQ_UPLOAD_QUEUE}'.") - except Exception as publish_err: - print(f"Erro ao publicar na fila '{RABBITMQ_UPLOAD_QUEUE}': {publish_err}") - # Loop continues + pipeline = VideoPipeline(settings) + worker = RabbitMQWorker(settings) + worker.consume_forever(pipeline.process_message) if __name__ == "__main__": diff --git a/prompts/generate.txt b/prompts/generate.txt new file mode 100644 index 0000000..ed2853b --- /dev/null +++ b/prompts/generate.txt @@ -0,0 +1,35 @@ +Voce e um estrategista de conteudo especializado em identificar cortes curtos de videos longos que performam bem em redes sociais. + +FUNCAO: +- Analisar a transcricao completa de um video. +- Escolher trechos curtos (entre 20s e 90s) com maior chance de engajamento. +- Responder APENAS em JSON valido. + +FORMATO DA RESPOSTA: +{ + "highlights": [ + { + "start": , + "end": , + "summary": "Resumo conciso do porque este trecho engaja" + } + ] +} + +REGRAS: +- Liste no maximo 6 destaques. +- Respeite a ordem cronologica. +- Nunca deixe listas vazias; se nada for relevante, inclua uma entrada com start = 0, end = 0 e summary explicando a ausencia de cortes. +- Utilize apenas valores numericos simples (ponto como separador decimal). +- Nao repita um mesmo trecho. + +PERSPECTIVA DE ANALISE: +- Concentre-se em momentos com gatilhos emocionais, insights, storytelling ou chamadas para acao fortes. +- Prefira trechos com comeco, meio e fim claros. +- Evite partes redundantes, silenciosas ou extremamente tecnicas. + +TAREFA: +- Leia a transcricao recebida no campo "transcript". +- Use a lista de marcas de tempo detalhadas no campo "segments" para embasar suas escolhas. +- Produza a saida JSON descrita acima. + diff --git a/render.py b/render.py deleted file mode 100644 index 539324e..0000000 --- a/render.py +++ /dev/null @@ -1,205 +0,0 @@ -"""Rendering logic for producing vertical clips with dynamic captions. 
- -This module defines a single function ``render_clip`` which takes a video -segment and produces a vertical clip suitable for social media. Each clip -contains three regions: - -* A top region (480px high) showing a title generated by an LLM. -* A middle region (960px high) containing the original video, scaled to - fit horizontally while preserving aspect ratio and centred vertically. -* A bottom region (480px high) showing a dynamic caption. The caption - displays a sliding window of three to five words from the transcript, - colouring the currently spoken word differently to draw the viewer's - attention. - -The function uses the MoviePy library to compose the various elements and -writes the resulting video to disk. It returns the path to the created -file. -""" - -from __future__ import annotations - -import os -from typing import Dict, List - -import numpy as np -from moviepy.video.io.VideoFileClip import VideoFileClip -from moviepy.video.VideoClip import ColorClip, VideoClip -from moviepy.video.compositing.CompositeVideoClip import CompositeVideoClip -from moviepy.video.VideoClip import TextClip -from PIL import Image, ImageDraw, ImageFont - -from .utils import wrap_text - - -def render_clip( - video_path: str, - start: float, - end: float, - top_text: str, - words: List[Dict[str, float]], - out_dir: str, - base_name: str, - idx: int, - # Use a widely available system font by default. DejaVuSans is installed - # in most Debian-based containers. The caller can override this path. - font_path: str = "/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf", - final_width: int = 1080, - final_height: int = 1920, - top_h: int = 480, - middle_h: int = 960, - bottom_h: int = 480, - video_codec: str = "libx264", - bitrate: str = "3000k", -) -> str: - """Render a single clip with title and dynamic caption. - - Parameters - ---------- - video_path: str - Path to the source video file. - start: float - Start time of the clip in seconds. - end: float - End time of the clip in seconds. - top_text: str - The title to display in the top region. - words: List[Dict[str, float]] - List of word-level timestamps for this clip. Each dict must have - ``start``, ``end`` and ``word`` keys. The start and end values - should be relative to the beginning of this clip (i.e. start at 0). - out_dir: str - Directory where the output file should be saved. The function - creates this directory if it doesn't exist. - base_name: str - Base name of the original video (sanitized). Used to build the - output filename. - idx: int - Index of the clip. Output will be named ``clip_{idx}.mp4``. - font_path: str - Path to the TrueType font to use for both title and caption. - final_width: int - Width of the final video in pixels. - final_height: int - Height of the final video in pixels. - top_h: int - Height of the title area in pixels. - middle_h: int - Height of the video area in pixels. - bottom_h: int - Height of the caption area in pixels. - video_codec: str - FFmpeg codec to use when writing the video. - bitrate: str - Bitrate for the output video. - - Returns - ------- - str - The path to the rendered video file. 
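For reference, a hypothetical invocation of `render_clip` with the parameters documented above. It assumes the pre-change `render.py` is importable; the paths, title and word timings are invented, and the `words` list is expressed relative to the clip start as the docstring requires:

```python
from render import render_clip  # module shown above (pre-change)

out_path = render_clip(
    video_path="videos/entrevista.mp4",
    start=75.0,
    end=150.0,
    top_text="O SEGREDO QUE NINGUÉM TE CONTOU",
    words=[
        {"start": 0.0, "end": 0.35, "word": "O"},
        {"start": 0.35, "end": 0.90, "word": "segredo"},
        {"start": 0.90, "end": 1.40, "word": "que"},
    ],
    out_dir="outputs/entrevista",
    base_name="entrevista",
    idx=1,
)
print(out_path)  # outputs/entrevista/clip_1.mp4
```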
- """ - os.makedirs(out_dir, exist_ok=True) - # Extract the segment from the source video - with VideoFileClip(video_path) as clip: - segment = clip.subclip(start, end) - dur = segment.duration - # Background - bg = ColorClip(size=(final_width, final_height), color=(0, 0, 0), duration=dur) - # Resize video to fit width - video_resized = segment.resize(width=final_width) - # Compute vertical position to centre in the middle region - y = top_h + (middle_h - video_resized.h) // 2 - video_resized = video_resized.set_position((0, y)) - - # Build title clip - # Wrap the title to avoid overflow - wrapped_lines = wrap_text(top_text, max_chars=40) - wrapped_title = "\n".join(wrapped_lines) - title_clip = TextClip( - wrapped_title, - font=font_path, - fontsize=70, - color="white", - method="caption", - size=(final_width, top_h), - align="center", - ).set_duration(dur).set_position((0, 0)) - - # Prepare font for caption rendering - pil_font = ImageFont.truetype(font_path, size=60) - default_color = (255, 255, 255) # white - highlight_color = (255, 215, 0) # gold-like yellow - - # Precompute widths of a space and bounding box height for vertical centering - space_width = pil_font.getbbox(" ")[2] - pil_font.getbbox(" ")[0] - bbox = pil_font.getbbox("A") - text_height = bbox[3] - bbox[1] - - def make_caption_frame(t: float): - """Generate an image for the caption at time t.""" - # Determine current word index - idx_cur = 0 - for i, w in enumerate(words): - if w["start"] <= t < w["end"]: - idx_cur = i - break - if t >= w["end"]: - idx_cur = i - # Define window of words to display: show up to 5 words - start_idx = max(0, idx_cur - 2) - end_idx = min(len(words), idx_cur + 3) - window = words[start_idx:end_idx] - # Compute widths for each word - word_sizes = [] - for w in window: - bbox = pil_font.getbbox(w["word"]) - word_width = bbox[2] - bbox[0] - word_sizes.append(word_width) - total_width = sum(word_sizes) + space_width * (len(window) - 1 if window else 0) - # Create blank image for caption area - img = Image.new("RGB", (final_width, bottom_h), color=(0, 0, 0)) - draw = ImageDraw.Draw(img) - x = int((final_width - total_width) / 2) - y_pos = int((bottom_h - text_height) / 2) - for j, w in enumerate(window): - color = highlight_color if (start_idx + j) == idx_cur else default_color - draw.text((x, y_pos), w["word"], font=pil_font, fill=color) - x += word_sizes[j] + space_width - return np.array(img) - - caption_clip = VideoClip(make_frame=make_caption_frame, duration=dur) - caption_clip = caption_clip.set_position((0, final_height - bottom_h)) - - # Compose final clip - final = CompositeVideoClip([ - bg, - video_resized, - title_clip, - caption_clip, - ], size=(final_width, final_height)) - # Use the original audio from the video segment - final_audio = segment.audio - if final_audio is not None: - final = final.set_audio(final_audio) - # Define output path - out_path = os.path.join(out_dir, f"clip_{idx}.mp4") - # Write to disk - final.write_videofile( - out_path, - codec=video_codec, - fps=30, - bitrate=bitrate, - audio_codec="aac", - preset="ultrafast", - ffmpeg_params=[ - "-tune", "zerolatency", - "-pix_fmt", "yuv420p", - "-profile:v", "high", - "-level", "4.1", - ], - threads=4, - ) - # Close clips to free resources - final.close() - segment.close() - return out_path \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index f5ce0c5..f329669 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,6 @@ -pika==1.3.2 -moviepy==2.0.0 -faster-whisper==1.2.0 -openai==1.16.0 
-numpy==1.26.4 -Pillow==10.1.0 -unidecode==1.3.6 \ No newline at end of file +moviepy==2.2.0 +pillow==10.3.0 +numpy>=1.26.0 +requests>=2.31.0 +pika>=1.3.2 +faster-whisper==1.0.0 diff --git a/transcribe.py b/transcribe.py deleted file mode 100644 index 8cb4739..0000000 --- a/transcribe.py +++ /dev/null @@ -1,111 +0,0 @@ -"""Utilities for extracting audio from video and generating transcriptions. - -This module handles two tasks: - -1. Use FFMPEG to extract the audio track from a video file into a WAV file - suitable for consumption by the Whisper model. The audio is resampled to - 16 kHz mono PCM as required by Whisper. -2. Use the Faster-Whisper implementation to generate a transcription with - word-level timestamps. The transcription is returned both as a list of - segments (for building an SRT) and as a flattened list of words (for - building dynamic subtitles). - -If FFMPEG is not installed or fails, a ``RuntimeError`` is raised. The caller -is responsible for cleaning up the temporary files created in the working -directory. -""" - -from __future__ import annotations - -import os -import subprocess -from typing import Dict, List, Tuple - -from faster_whisper import WhisperModel - - -def extract_audio_ffmpeg(video_path: str, audio_path: str) -> None: - """Use FFMPEG to extract audio from ``video_path`` into ``audio_path``. - - The output will be a 16 kHz mono WAV file in PCM S16LE format. Any - existing file at ``audio_path`` will be overwritten. If ffmpeg returns - a non-zero exit code, a ``RuntimeError`` is raised with the stderr. - """ - cmd = [ - "ffmpeg", - "-y", # overwrite output - "-i", - video_path, - "-vn", # disable video recording - "-acodec", - "pcm_s16le", - "-ar", - "16000", - "-ac", - "1", - audio_path, - ] - proc = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) - if proc.returncode != 0: - raise RuntimeError(f"FFMPEG error: {proc.stderr.decode(errors='ignore')}") - - -def load_whisper_model() -> WhisperModel: - """Instantiate and cache a Faster-Whisper model. - - The model name and device can be configured via the ``WHISPER_MODEL`` and - ``WHISPER_DEVICE`` environment variables. The default model is - ``large-v3`` for best accuracy. The device can be ``cuda`` or ``cpu``. - A module-level cache is used to prevent loading the model multiple times. - """ - if hasattr(load_whisper_model, "_cache"): - return load_whisper_model._cache # type: ignore[attr-defined] - model_name = os.environ.get("WHISPER_MODEL", "large-v3") - device = os.environ.get("WHISPER_DEVICE", "cpu") - # Compute type can be set via WHISPER_COMPUTE_TYPE; default to float16 on GPU - compute_type = os.environ.get("WHISPER_COMPUTE_TYPE") - # If not explicitly set, choose sensible defaults - if compute_type is None: - compute_type = "float16" if device == "cuda" else "int8" - model = WhisperModel(model_name, device=device, compute_type=compute_type) - load_whisper_model._cache = model # type: ignore[attr-defined] - return model - - -def transcribe(video_path: str, work_dir: str) -> Tuple[List[Dict[str, float]], List[Dict[str, float]]]: - """Transcribe a video file using Faster-Whisper. - - ``video_path`` is the path to the video to transcribe. ``work_dir`` is a - directory where temporary files will be stored (audio file and - transcription). 
The function returns a tuple ``(segments, words)`` where - ``segments`` is a list of dictionaries with ``start``, ``end`` and - ``text`` fields, and ``words`` is a flat list of dictionaries with - ``start``, ``end`` and ``word`` fields covering the entire video. - The timestamps are expressed in seconds as floats. - """ - os.makedirs(work_dir, exist_ok=True) - audio_path = os.path.join(work_dir, "audio.wav") - # Extract audio - extract_audio_ffmpeg(video_path, audio_path) - # Load Whisper model - model = load_whisper_model() - # Run transcription with word-level timestamps - segments, info = model.transcribe(audio_path, word_timestamps=True) - seg_list: List[Dict[str, float]] = [] - words_list: List[Dict[str, float]] = [] - for seg in segments: - seg_list.append({ - "start": float(seg.start), - "end": float(seg.end), - "text": seg.text.strip(), - }) - # Each segment may contain words attribute - for w in getattr(seg, "words", []) or []: - words_list.append({ - "start": float(w.start), - "end": float(w.end), - "word": w.word, - }) - # Sort words by start time to be safe - words_list.sort(key=lambda d: d["start"]) - return seg_list, words_list \ No newline at end of file diff --git a/utils.py b/utils.py deleted file mode 100644 index c8f9dbc..0000000 --- a/utils.py +++ /dev/null @@ -1,93 +0,0 @@ -import re -import unicodedata -from typing import List, Tuple - - -def sanitize_filename(name: str) -> str: - """Return a sanitized version of a filename. - - This helper removes accents, converts to lowercase, replaces spaces - with underscores and removes any non alphanumeric characters except - underscores and dots. This makes the directory names safe to use on - most filesystems and matches the behaviour described in the spec. - """ - if not name: - return "" - # Decompose Unicode characters and strip accents - nfkd_form = unicodedata.normalize("NFKD", name) - no_accents = "".join(c for c in nfkd_form if not unicodedata.combining(c)) - # Replace spaces with underscores - no_spaces = no_accents.replace(" ", "_") - # Lowercase and remove any character that is not a letter, digit, dot or underscore - sanitized = re.sub(r"[^A-Za-z0-9_.]+", "", no_spaces) - return sanitized - - -def timestamp_to_seconds(ts: str) -> float: - """Convert a timestamp in HH:MM:SS,mmm format to seconds. - - The Gemini and OpenRouter prompts use timestamps formatted with a comma - as the decimal separator. This helper splits the string into hours, - minutes and seconds and returns a float expressed in seconds. - """ - if ts is None: - return 0.0 - ts = ts.strip() - if not ts: - return 0.0 - # Replace comma by dot for decimal seconds - ts = ts.replace(",", ".") - parts = ts.split(":") - parts = [float(p) for p in parts] - if len(parts) == 3: - h, m, s = parts - return h * 3600 + m * 60 + s - elif len(parts) == 2: - m, s = parts - return m * 60 + s - else: - # only seconds - return parts[0] - - -def seconds_to_timestamp(seconds: float) -> str: - """Convert a time in seconds to HH:MM:SS,mmm format expected by SRT.""" - if seconds < 0: - seconds = 0 - h = int(seconds // 3600) - m = int((seconds % 3600) // 60) - s = seconds % 60 - # Format with comma as decimal separator and three decimal places - return f"{h:02d}:{m:02d}:{s:06.3f}".replace(".", ",") - - -def wrap_text(text: str, max_chars: int = 80) -> List[str]: - """Simple word-wrap for a string. - - Splits ``text`` into a list of lines, each at most ``max_chars`` - characters long. 
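A quick worked example of the two timestamp converters above, assuming the pre-change `utils.py` is on the import path (values chosen for illustration):

```python
from utils import seconds_to_timestamp, timestamp_to_seconds  # module shown above

assert timestamp_to_seconds("00:01:30,500") == 90.5
assert timestamp_to_seconds("01:02:03,250") == 3723.25
assert seconds_to_timestamp(3723.25) == "01:02:03,250"
assert seconds_to_timestamp(0) == "00:00:00,000"
```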
This does not attempt to hyphenate words – a word - longer than ``max_chars`` will occupy its own line. The return value - is a list of lines without trailing whitespace. - """ - if not text: - return [] - words = text.split() - lines: List[str] = [] - current: List[str] = [] - current_len = 0 - for word in words: - # If adding this word would exceed the max, flush current line - if current and current_len + 1 + len(word) > max_chars: - lines.append(" ".join(current)) - current = [word] - current_len = len(word) - else: - # Add to current line - if current: - current_len += 1 + len(word) - else: - current_len = len(word) - current.append(word) - if current: - lines.append(" ".join(current)) - return lines \ No newline at end of file diff --git a/video_render/__init__.py b/video_render/__init__.py new file mode 100644 index 0000000..e6a2b67 --- /dev/null +++ b/video_render/__init__.py @@ -0,0 +1,4 @@ +""" +Core package for the revamped video rendering pipeline. +""" + diff --git a/video_render/__pycache__/__init__.cpython-39.pyc b/video_render/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000..c56007a Binary files /dev/null and b/video_render/__pycache__/__init__.cpython-39.pyc differ diff --git a/video_render/__pycache__/config.cpython-39.pyc b/video_render/__pycache__/config.cpython-39.pyc new file mode 100644 index 0000000..dbce374 Binary files /dev/null and b/video_render/__pycache__/config.cpython-39.pyc differ diff --git a/video_render/__pycache__/ffmpeg.cpython-39.pyc b/video_render/__pycache__/ffmpeg.cpython-39.pyc new file mode 100644 index 0000000..e0ceeeb Binary files /dev/null and b/video_render/__pycache__/ffmpeg.cpython-39.pyc differ diff --git a/video_render/__pycache__/llm.cpython-39.pyc b/video_render/__pycache__/llm.cpython-39.pyc new file mode 100644 index 0000000..72379c3 Binary files /dev/null and b/video_render/__pycache__/llm.cpython-39.pyc differ diff --git a/video_render/__pycache__/logging_utils.cpython-39.pyc b/video_render/__pycache__/logging_utils.cpython-39.pyc new file mode 100644 index 0000000..90fdd43 Binary files /dev/null and b/video_render/__pycache__/logging_utils.cpython-39.pyc differ diff --git a/video_render/__pycache__/media.cpython-39.pyc b/video_render/__pycache__/media.cpython-39.pyc new file mode 100644 index 0000000..b3ae715 Binary files /dev/null and b/video_render/__pycache__/media.cpython-39.pyc differ diff --git a/video_render/__pycache__/messaging.cpython-39.pyc b/video_render/__pycache__/messaging.cpython-39.pyc new file mode 100644 index 0000000..5b011f5 Binary files /dev/null and b/video_render/__pycache__/messaging.cpython-39.pyc differ diff --git a/video_render/__pycache__/pipeline.cpython-39.pyc b/video_render/__pycache__/pipeline.cpython-39.pyc new file mode 100644 index 0000000..4e176c7 Binary files /dev/null and b/video_render/__pycache__/pipeline.cpython-39.pyc differ diff --git a/video_render/__pycache__/rendering.cpython-39.pyc b/video_render/__pycache__/rendering.cpython-39.pyc new file mode 100644 index 0000000..18da5d3 Binary files /dev/null and b/video_render/__pycache__/rendering.cpython-39.pyc differ diff --git a/video_render/__pycache__/transcription.cpython-39.pyc b/video_render/__pycache__/transcription.cpython-39.pyc new file mode 100644 index 0000000..c15552a Binary files /dev/null and b/video_render/__pycache__/transcription.cpython-39.pyc differ diff --git a/video_render/__pycache__/utils.cpython-39.pyc b/video_render/__pycache__/utils.cpython-39.pyc new file mode 100644 index 0000000..0ce9f5a 
Binary files /dev/null and b/video_render/__pycache__/utils.cpython-39.pyc differ diff --git a/video_render/config.py b/video_render/config.py new file mode 100644 index 0000000..ee42f54 --- /dev/null +++ b/video_render/config.py @@ -0,0 +1,103 @@ +from __future__ import annotations + +import os +from dataclasses import dataclass +from pathlib import Path + + +BASE_DIR = Path(__file__).resolve().parent.parent +VIDEOS_ROOT = BASE_DIR / "videos" +OUTPUTS_ROOT = BASE_DIR / "outputs" +TEMP_ROOT = BASE_DIR / "temp" + + +@dataclass(frozen=True) +class RabbitMQSettings: + host: str = os.environ.get("RABBITMQ_HOST", "rabbitmq") + port: int = int(os.environ.get("RABBITMQ_PORT", 5672)) + user: str = os.environ.get("RABBITMQ_USER", "admin") + password: str = os.environ.get("RABBITMQ_PASS", "") + consume_queue: str = os.environ.get("RABBITMQ_QUEUE", "to-render") + publish_queue: str = os.environ.get("RABBITMQ_UPLOAD_QUEUE", "to-upload") + prefetch_count: int = int(os.environ.get("RABBITMQ_PREFETCH", 1)) + heartbeat: int = int(os.environ.get("RABBITMQ_HEARTBEAT", 60)) + blocked_timeout: int = int(os.environ.get("RABBITMQ_BLOCKED_TIMEOUT", 300)) + + +@dataclass(frozen=True) +class GeminiSettings: + api_key: str = os.environ.get("GEMINI_API_KEY", "") + model: str = os.environ.get("GEMINI_MODEL", "gemini-1.5-pro-latest") + safety_settings: str | None = os.environ.get("GEMINI_SAFETY_SETTINGS") + temperature: float = float(os.environ.get("GEMINI_TEMPERATURE", 0.2)) + top_k: int | None = ( + int(os.environ["GEMINI_TOP_K"]) if os.environ.get("GEMINI_TOP_K") else None + ) + top_p: float | None = ( + float(os.environ["GEMINI_TOP_P"]) if os.environ.get("GEMINI_TOP_P") else None + ) + prompt_path: str = os.environ.get("GEMINI_PROMPT_PATH", "prompts/generate.txt") + + +@dataclass(frozen=True) +class OpenRouterSettings: + api_key: str = os.environ.get("OPENROUTER_API_KEY", "") + model: str = os.environ.get( + "OPENROUTER_MODEL", "anthropic/claude-3-haiku:beta" + ) + temperature: float = float(os.environ.get("OPENROUTER_TEMPERATURE", 0.6)) + max_output_tokens: int = int(os.environ.get("OPENROUTER_MAX_OUTPUT_TOKENS", 256)) + + +@dataclass(frozen=True) +class WhisperSettings: + model_size: str = os.environ.get("FASTER_WHISPER_MODEL_SIZE", "medium") + device: str | None = os.environ.get("FASTER_WHISPER_DEVICE") + compute_type: str | None = os.environ.get("FASTER_WHISPER_COMPUTE_TYPE") + download_root: Path = Path( + os.environ.get("FASTER_WHISPER_DOWNLOAD_ROOT", str(BASE_DIR / ".whisper")) + ) + + +@dataclass(frozen=True) +class RenderingSettings: + frame_width: int = int(os.environ.get("RENDER_WIDTH", 1080)) + frame_height: int = int(os.environ.get("RENDER_HEIGHT", 1920)) + fps: int = int(os.environ.get("RENDER_FPS", 30)) + video_codec: str = os.environ.get("RENDER_CODEC", "libx264") + audio_codec: str = os.environ.get("RENDER_AUDIO_CODEC", "aac") + bitrate: str = os.environ.get("RENDER_BITRATE", "5000k") + preset: str = os.environ.get("RENDER_PRESET", "faster") + highlight_color: str = os.environ.get("SUBTITLE_HIGHLIGHT_COLOR", "#FFD200") + base_color: str = os.environ.get("SUBTITLE_BASE_COLOR", "#FFFFFF") + font_path: Path = Path(os.environ.get("RENDER_FONT_PATH", "./Montserrat.ttf")) + title_font_size: int = int(os.environ.get("RENDER_TITLE_FONT_SIZE", 110)) + subtitle_font_size: int = int(os.environ.get("RENDER_SUBTITLE_FONT_SIZE", 64)) + caption_min_words: int = int(os.environ.get("CAPTION_MIN_WORDS", 3)) + caption_max_words: int = int(os.environ.get("CAPTION_MAX_WORDS", 4)) + + +@dataclass(frozen=True) +class 
Settings: + rabbitmq: RabbitMQSettings = RabbitMQSettings() + gemini: GeminiSettings = GeminiSettings() + openrouter: OpenRouterSettings = OpenRouterSettings() + whisper: WhisperSettings = WhisperSettings() + rendering: RenderingSettings = RenderingSettings() + + videos_dir: Path = VIDEOS_ROOT + outputs_dir: Path = OUTPUTS_ROOT + temp_dir: Path = TEMP_ROOT + + +def load_settings() -> Settings: + settings = Settings() + + if not settings.rabbitmq.password: + raise RuntimeError("RABBITMQ_PASS must be provided") + + settings.videos_dir.mkdir(parents=True, exist_ok=True) + settings.outputs_dir.mkdir(parents=True, exist_ok=True) + settings.temp_dir.mkdir(parents=True, exist_ok=True) + + return settings diff --git a/video_render/ffmpeg.py b/video_render/ffmpeg.py new file mode 100644 index 0000000..358d7a5 --- /dev/null +++ b/video_render/ffmpeg.py @@ -0,0 +1,54 @@ +from __future__ import annotations + +import logging +import shlex +import subprocess +from pathlib import Path +from typing import Sequence + +logger = logging.getLogger(__name__) + + +def _run_ffmpeg(args: Sequence[str]) -> None: + cmd = ["ffmpeg", "-hide_banner", "-loglevel", "error", *args] + logger.debug("Executando ffmpeg: %s", " ".join(shlex.quote(part) for part in cmd)) + completed = subprocess.run(cmd, check=False) + if completed.returncode != 0: + raise RuntimeError(f"ffmpeg falhou com exit code {completed.returncode}") + + +def extract_audio_to_wav(input_video: Path, output_wav: Path) -> Path: + _run_ffmpeg( + [ + "-y", + "-i", + str(input_video), + "-ac", + "1", + "-ar", + "16000", + "-vn", + str(output_wav), + ] + ) + return output_wav + + +def create_video_segment(input_video: Path, start: float, end: float, output_path: Path) -> Path: + duration = max(0.01, end - start) + _run_ffmpeg( + [ + "-y", + "-i", + str(input_video), + "-ss", + f"{start:.3f}", + "-t", + f"{duration:.3f}", + "-c", + "copy", + str(output_path), + ] + ) + return output_path + diff --git a/video_render/llm.py b/video_render/llm.py new file mode 100644 index 0000000..8a7b143 --- /dev/null +++ b/video_render/llm.py @@ -0,0 +1,187 @@ +from __future__ import annotations + +import json +import logging +from pathlib import Path +from typing import Dict, List + +import requests + +from .config import BASE_DIR, Settings +from .transcription import TranscriptionResult + +logger = logging.getLogger(__name__) + +GEMINI_ENDPOINT_TEMPLATE = "https://generativelanguage.googleapis.com/v1beta/models/{model}:generateContent" +OPENROUTER_ENDPOINT = "https://openrouter.ai/api/v1/chat/completions" + + +class GeminiHighlighter: + def __init__(self, settings: Settings) -> None: + if not settings.gemini.api_key: + raise RuntimeError("GEMINI_API_KEY nao foi definido") + + prompt_path = Path(settings.gemini.prompt_path) + + if not prompt_path.is_absolute(): + prompt_path = BASE_DIR / prompt_path + + if not prompt_path.exists(): + raise FileNotFoundError(f"Prompt do Gemini nao encontrado: {prompt_path}") + + self.prompt_template = prompt_path.read_text(encoding="utf-8") + self.settings = settings + + def generate_highlights(self, transcription: TranscriptionResult) -> List[Dict]: + payload = { + "transcript": transcription.full_text, + "segments": [ + { + "start": segment.start, + "end": segment.end, + "text": segment.text, + } + for segment in transcription.segments + ], + } + + body = { + "contents": [ + { + "role": "user", + "parts": [ + {"text": self.prompt_template}, + {"text": json.dumps(payload, ensure_ascii=False)}, + ], + } + ] + } + + if 
self.settings.gemini.temperature is not None: + body["generationConfig"] = { + "temperature": self.settings.gemini.temperature, + } + if self.settings.gemini.top_p is not None: + body["generationConfig"]["topP"] = self.settings.gemini.top_p + if self.settings.gemini.top_k is not None: + body["generationConfig"]["topK"] = self.settings.gemini.top_k + + url = GEMINI_ENDPOINT_TEMPLATE.format(model=self.settings.gemini.model) + params = {"key": self.settings.gemini.api_key} + + response = requests.post(url, params=params, json=body, timeout=120) + response.raise_for_status() + data = response.json() + + candidates = data.get("candidates") or [] + if not candidates: + raise RuntimeError("Gemini nao retornou candidatos") + + text_parts = candidates[0].get("content", {}).get("parts", []) + if not text_parts: + raise RuntimeError("Resposta do Gemini sem conteudo") + + raw_text = text_parts[0].get("text") + if not raw_text: + raise RuntimeError("Resposta do Gemini sem texto") + + parsed = self._extract_json(raw_text) + highlights = parsed.get("highlights") + if not isinstance(highlights, list): + raise ValueError("Resposta do Gemini invalida: campo 'highlights' ausente") + return highlights + + @staticmethod + def _extract_json(response_text: str) -> Dict: + try: + return json.loads(response_text) + except json.JSONDecodeError: + start = response_text.find("{") + end = response_text.rfind("}") + if start == -1 or end == -1: + raise + subset = response_text[start : end + 1] + return json.loads(subset) + + +class OpenRouterCopywriter: + def __init__(self, settings: Settings) -> None: + if not settings.openrouter.api_key: + raise RuntimeError("OPENROUTER_API_KEY nao foi definido") + self.settings = settings + + def generate_titles(self, highlights: List[Dict]) -> List[str]: + if not highlights: + return [] + + prompt = ( + "Voce e um copywriter especializado em titulos curtos e virais para reels.\n" + "Recebera uma lista de trechos destacados de um video com resumo e tempo.\n" + "Produza um titulo envolvente (ate 60 caracteres) para cada item.\n" + "Responda apenas em JSON com a seguinte estrutura:\n" + '{"titles": ["titulo 1", "titulo 2"]}\n' + "Titulos devem ser em portugues, usar verbos fortes e refletir o resumo." 
+ ) + + user_payload = { + "highlights": [ + { + "start": item.get("start"), + "end": item.get("end"), + "summary": item.get("summary"), + } + for item in highlights + ] + } + + body = { + "model": self.settings.openrouter.model, + "temperature": self.settings.openrouter.temperature, + "max_tokens": self.settings.openrouter.max_output_tokens, + "messages": [ + {"role": "system", "content": prompt}, + { + "role": "user", + "content": json.dumps(user_payload, ensure_ascii=False), + }, + ], + } + + headers = { + "Authorization": f"Bearer {self.settings.openrouter.api_key}", + "Content-Type": "application/json", + "HTTP-Referer": "https://localhost", + "X-Title": "video-render-pipeline", + } + + response = requests.post( + OPENROUTER_ENDPOINT, json=body, headers=headers, timeout=120 + ) + response.raise_for_status() + data = response.json() + + choices = data.get("choices") or [] + if not choices: + raise RuntimeError("OpenRouter nao retornou escolhas") + + message = choices[0].get("message", {}).get("content") + if not message: + raise RuntimeError("Resposta do OpenRouter sem conteudo") + + parsed = self._extract_json(message) + titles = parsed.get("titles") + if not isinstance(titles, list): + raise ValueError("Resposta do OpenRouter invalida: campo 'titles'") + return [str(title) for title in titles] + + @staticmethod + def _extract_json(response_text: str) -> Dict: + try: + return json.loads(response_text) + except json.JSONDecodeError: + start = response_text.find("{") + end = response_text.rfind("}") + if start == -1 or end == -1: + raise + subset = response_text[start : end + 1] + return json.loads(subset) diff --git a/video_render/logging_utils.py b/video_render/logging_utils.py new file mode 100644 index 0000000..c3967f4 --- /dev/null +++ b/video_render/logging_utils.py @@ -0,0 +1,13 @@ +from __future__ import annotations + +import logging +import os + + +def setup_logging() -> None: + log_level = os.environ.get("LOG_LEVEL", "INFO").upper() + logging.basicConfig( + level=log_level, + format="%(asctime)s [%(levelname)s] %(name)s: %(message)s", + ) + diff --git a/video_render/media.py b/video_render/media.py new file mode 100644 index 0000000..360b231 --- /dev/null +++ b/video_render/media.py @@ -0,0 +1,64 @@ +from __future__ import annotations + +import logging +import shutil +from dataclasses import dataclass +from pathlib import Path + +from .config import Settings +from .ffmpeg import extract_audio_to_wav +from .utils import ensure_workspace, remove_paths, sanitize_filename + +logger = logging.getLogger(__name__) + + +@dataclass +class VideoWorkspace: + original_filename: str + sanitized_name: str + workspace_dir: Path + output_dir: Path + source_path: Path + working_video_path: Path + audio_path: Path + + +class MediaPreparer: + def __init__(self, settings: Settings) -> None: + self.settings = settings + + def prepare(self, filename: str) -> VideoWorkspace: + source_path = self.settings.videos_dir / filename + if not source_path.exists(): + raise FileNotFoundError(f"Arquivo de vídeo não encontrado: {source_path}") + + sanitized_name = sanitize_filename(Path(filename).stem) + workspace_dir = ensure_workspace(self.settings.videos_dir, sanitized_name) + + existing_children = list(workspace_dir.iterdir()) + if existing_children: + logger.info("Limpando workspace existente para %s", sanitized_name) + remove_paths(existing_children) + + destination_name = f"{sanitized_name}{source_path.suffix.lower()}" + working_video_path = workspace_dir / destination_name + shutil.copy2(source_path, 
working_video_path) + logger.info("Cópia do vídeo criada em %s", working_video_path) + + output_dir = ensure_workspace(self.settings.outputs_dir, sanitized_name) + existing_outputs = list(output_dir.iterdir()) + if existing_outputs: + remove_paths(existing_outputs) + + audio_path = workspace_dir / "audio.wav" + extract_audio_to_wav(working_video_path, audio_path) + + return VideoWorkspace( + original_filename=filename, + sanitized_name=sanitized_name, + workspace_dir=workspace_dir, + output_dir=output_dir, + source_path=source_path, + working_video_path=working_video_path, + audio_path=audio_path, + ) diff --git a/video_render/messaging.py b/video_render/messaging.py new file mode 100644 index 0000000..28470f5 --- /dev/null +++ b/video_render/messaging.py @@ -0,0 +1,85 @@ +from __future__ import annotations + +import json +import logging +from typing import Any, Callable, Dict + +import pika + +from .config import Settings + +logger = logging.getLogger(__name__) + +MessageHandler = Callable[[Dict[str, Any]], Dict[str, Any]] + + +class RabbitMQWorker: + def __init__(self, settings: Settings) -> None: + self.settings = settings + self._params = pika.ConnectionParameters( + host=settings.rabbitmq.host, + port=settings.rabbitmq.port, + credentials=pika.PlainCredentials( + settings.rabbitmq.user, settings.rabbitmq.password + ), + heartbeat=settings.rabbitmq.heartbeat, + blocked_connection_timeout=settings.rabbitmq.blocked_timeout, + ) + + def consume_forever(self, handler: MessageHandler) -> None: + while True: + try: + with pika.BlockingConnection(self._params) as connection: + channel = connection.channel() + channel.queue_declare(queue=self.settings.rabbitmq.consume_queue, durable=True) + channel.queue_declare(queue=self.settings.rabbitmq.publish_queue, durable=True) + channel.basic_qos(prefetch_count=self.settings.rabbitmq.prefetch_count) + + def _on_message(ch: pika.adapters.blocking_connection.BlockingChannel, method, properties, body): + try: + message = json.loads(body) + except json.JSONDecodeError: + logger.error("Mensagem inválida recebida: %s", body) + ch.basic_ack(delivery_tag=method.delivery_tag) + return + + logger.info("Mensagem recebida: %s", message.get("filename", "")) + try: + response = handler(message) + except Exception: + logger.exception("Erro não tratado durante o processamento") + response = { + "hasError": True, + "error": "Erro não tratado no pipeline", + "filename": message.get("filename"), + "videoId": message.get("videoId"), + "url": message.get("url"), + "processedFiles": [], + } + + try: + payload = json.dumps(response) + ch.basic_publish( + exchange="", + routing_key=self.settings.rabbitmq.publish_queue, + body=payload, + properties=pika.BasicProperties(delivery_mode=2), + ) + logger.info("Resposta publicada para '%s'", self.settings.rabbitmq.publish_queue) + except Exception: + logger.exception("Falha ao publicar a resposta na fila de upload") + finally: + ch.basic_ack(delivery_tag=method.delivery_tag) + + channel.basic_consume( + queue=self.settings.rabbitmq.consume_queue, + on_message_callback=_on_message, + auto_ack=False, + ) + logger.info("Consumidor iniciado. Aguardando mensagens...") + channel.start_consuming() + except pika.exceptions.AMQPConnectionError: + logger.exception("Conexão com RabbitMQ perdida. 
Tentando reconectar...") + except KeyboardInterrupt: + logger.info("Encerrando consumidor por interrupção do usuário.") + break diff --git a/video_render/pipeline.py b/video_render/pipeline.py new file mode 100644 index 0000000..0d4b7cd --- /dev/null +++ b/video_render/pipeline.py @@ -0,0 +1,236 @@ +from __future__ import annotations + +import logging +from dataclasses import dataclass, field +from pathlib import Path +from typing import Any, Dict, List, Optional + +from .config import Settings +from .llm import GeminiHighlighter, OpenRouterCopywriter +from .media import MediaPreparer, VideoWorkspace +from .transcription import TranscriptionResult, TranscriptionService +from .utils import remove_paths, sanitize_filename +from .rendering import VideoRenderer + +logger = logging.getLogger(__name__) + + +@dataclass +class JobMessage: + filename: str + url: Optional[str] + video_id: Optional[str] + extras: Dict[str, Any] = field(default_factory=dict) + + +@dataclass +class HighlightWindow: + start: float + end: float + summary: str + title: Optional[str] = None + + +@dataclass +class RenderedClip: + path: Path + start: float + end: float + title: str + summary: str + index: int + + +@dataclass +class PipelineContext: + job: JobMessage + workspace: Optional[VideoWorkspace] = None + transcription: Optional[TranscriptionResult] = None + highlight_windows: List[HighlightWindow] = field(default_factory=list) + rendered_clips: List[RenderedClip] = field(default_factory=list) + + +class VideoPipeline: + def __init__(self, settings: Settings) -> None: + self.settings = settings + self.media_preparer = MediaPreparer(settings) + self.transcriber = TranscriptionService(settings) + self.highlighter = GeminiHighlighter(settings) + self.copywriter = OpenRouterCopywriter(settings) + self.renderer = VideoRenderer(settings) + + def process_message(self, message: Dict[str, Any]) -> Dict[str, Any]: + context = PipelineContext(job=self._parse_job(message)) + try: + self._prepare_workspace(context) + self._generate_transcription(context) + self._determine_highlights(context) + self._generate_titles(context) + self._render_clips(context) + return self._build_success_payload(context) + except Exception as exc: + logger.exception("Falha ao processar vídeo %s", context.job.filename) + return self._handle_failure(context, exc) + + def _parse_job(self, message: Dict[str, Any]) -> JobMessage: + filename = message.get("filename") + if not filename: + raise ValueError("Mensagem inválida: 'filename' é obrigatório") + + url = message.get("url") + video_id = message.get("videoId") or message.get("video_id") + extras = { + key: value + for key, value in message.items() + if key not in {"filename", "url", "videoId", "video_id"} + } + return JobMessage(filename=filename, url=url, video_id=video_id, extras=extras) + + def _prepare_workspace(self, context: PipelineContext) -> None: + context.workspace = self.media_preparer.prepare(context.job.filename) + + def _generate_transcription(self, context: PipelineContext) -> None: + if not context.workspace: + raise RuntimeError("Workspace não preparado") + transcription = self.transcriber.transcribe(context.workspace.audio_path) + TranscriptionService.persist(transcription, context.workspace.workspace_dir) + context.transcription = transcription + + def _determine_highlights(self, context: PipelineContext) -> None: + if not context.transcription: + raise RuntimeError("Transcricao nao disponivel") + + highlights_raw = self.highlighter.generate_highlights(context.transcription) + windows: 
List[HighlightWindow] = [] + + for item in highlights_raw: + try: + start = float(item.get("start", 0)) # type: ignore[arg-type] + end = float(item.get("end", start)) # type: ignore[arg-type] + except (TypeError, ValueError): + logger.warning("Highlight invalido ignorado: %s", item) + continue + + summary = str(item.get("summary", "")).strip() + if end <= start: + logger.debug("Highlight com intervalo invalido ignorado: %s", item) + continue + + windows.append(HighlightWindow(start=start, end=end, summary=summary)) + + if not windows: + last_end = ( + context.transcription.segments[-1].end + if context.transcription.segments + else 0 + ) + windows.append( + HighlightWindow( + start=0.0, + end=max(last_end, 10.0), + summary="Sem destaque identificado; fallback automatico.", + ) + ) + + context.highlight_windows = windows + + def _generate_titles(self, context: PipelineContext) -> None: + if not context.highlight_windows: + return + + highlight_dicts = [ + {"start": window.start, "end": window.end, "summary": window.summary} + for window in context.highlight_windows + ] + titles = self.copywriter.generate_titles(highlight_dicts) + + for window, title in zip(context.highlight_windows, titles): + window.title = title.strip() + + + def _render_clips(self, context: PipelineContext) -> None: + if not context.workspace or not context.highlight_windows or not context.transcription: + return + + titles = [ + window.title or window.summary for window in context.highlight_windows + ] + + render_results = self.renderer.render( + workspace_path=str(context.workspace.working_video_path), + highlight_windows=context.highlight_windows, + transcription=context.transcription, + titles=titles, + output_dir=context.workspace.output_dir, + ) + + context.rendered_clips = [ + RenderedClip( + path=Path(path), + start=start, + end=end, + title=title, + summary=summary, + index=index, + ) + for path, start, end, title, summary, index in render_results + ] + + def _build_success_payload(self, context: PipelineContext) -> Dict[str, Any]: + return { + "hasError": False, + "videosProcessedQuantity": len(context.rendered_clips), + "filename": context.job.filename, + "videoId": context.job.video_id, + "url": context.job.url, + "workspaceFolder": context.workspace.sanitized_name if context.workspace else None, + "outputDirectory": self._relative_path(context.workspace.output_dir) if context.workspace else None, + "processedFiles": [ + { + "path": self._relative_path(clip.path), + "start": clip.start, + "end": clip.end, + "title": clip.title, + "summary": clip.summary, + "clipIndex": clip.index, + } + for clip in context.rendered_clips + ], + } + + def _handle_failure(self, context: PipelineContext, exc: Exception) -> Dict[str, Any]: + logger.error("Erro no pipeline: %s", exc) + cleanup_targets: List[Path] = [] + + if context.workspace: + cleanup_targets.append(context.workspace.workspace_dir) + cleanup_targets.append(context.workspace.output_dir) + original_path = context.workspace.source_path + if original_path.exists(): + cleanup_targets.append(original_path) + else: + sanitized = sanitize_filename(Path(context.job.filename).stem) + job_output_dir = self.settings.outputs_dir / sanitized + if job_output_dir.exists(): + cleanup_targets.append(job_output_dir) + original_path = self.settings.videos_dir / context.job.filename + if original_path.exists(): + cleanup_targets.append(original_path) + + remove_paths(cleanup_targets) + + return { + "hasError": True, + "error": str(exc), + "filename": context.job.filename, + 
"videoId": context.job.video_id, + "url": context.job.url, + "processedFiles": [], + } + + def _relative_path(self, path: Path) -> str: + base = self.settings.videos_dir.parent + try: + return str(path.relative_to(base)) + except ValueError: + return str(path) diff --git a/video_render/rendering.py b/video_render/rendering.py new file mode 100644 index 0000000..b59abb7 --- /dev/null +++ b/video_render/rendering.py @@ -0,0 +1,406 @@ +from __future__ import annotations + +import logging +import math +import re +from dataclasses import dataclass +from typing import Iterable, List, Sequence, Tuple + +import numpy as np +from moviepy.editor import ( + ColorClip, + CompositeVideoClip, + ImageClip, + TextClip, + VideoFileClip, +) +from PIL import Image, ImageColor, ImageDraw, ImageFont + +from .config import Settings +from .transcription import TranscriptionResult, WordTiming + +logger = logging.getLogger(__name__) + + +def clamp_time(value: float, minimum: float = 0.0) -> float: + return max(minimum, float(value)) + + +@dataclass +class CaptionClipSet: + base: ImageClip + highlights: List[ImageClip] + + +class CaptionBuilder: + def __init__(self, settings: Settings) -> None: + self.settings = settings + self.font_path = settings.rendering.font_path + if not self.font_path.exists(): + raise FileNotFoundError(f"Fonte nao encontrada: {self.font_path}") + + self.font = ImageFont.truetype( + str(self.font_path), settings.rendering.subtitle_font_size + ) + self.base_color = ImageColor.getrgb(settings.rendering.base_color) + self.highlight_color = ImageColor.getrgb(settings.rendering.highlight_color) + self.canvas_width = settings.rendering.frame_width - 160 + self.canvas_height = int(settings.rendering.subtitle_font_size * 2.2) + self.min_words = settings.rendering.caption_min_words + self.max_words = settings.rendering.caption_max_words + + bbox = self.font.getbbox("Ay") + self.text_height = bbox[3] - bbox[1] + self.baseline = (self.canvas_height - self.text_height) // 2 - bbox[1] + self.space_width = self.font.getbbox(" ")[2] - self.font.getbbox(" ")[0] + + def build(self, words: Sequence[WordTiming], clip_start: float) -> List[CaptionClipSet]: + grouped = self._group_words(words) + clip_sets: List[CaptionClipSet] = [] + + for group in grouped: + group_start = clamp_time(group[0].start, minimum=clip_start) + group_end = clamp_time(group[-1].end, minimum=group_start + 0.05) + duration = max(0.05, group_end - group_start) + start_offset = group_start - clip_start + + base_image, highlight_images = self._render_group(group) + + base_clip = ( + ImageClip(np.array(base_image)) + .with_start(start_offset) + .with_duration(duration) + ) + + highlight_clips: List[ImageClip] = [] + for word, image in zip(group, highlight_images): + h_start = clamp_time(word.start, minimum=clip_start) - clip_start + h_end = clamp_time(word.end, minimum=word.start + 0.02) - clip_start + h_duration = max(0.05, h_end - h_start) + highlight_clip = ( + ImageClip(np.array(image)) + .with_start(h_start) + .with_duration(h_duration) + ) + highlight_clips.append(highlight_clip) + + clip_sets.append(CaptionClipSet(base=base_clip, highlights=highlight_clips)) + + return clip_sets + + def _render_group(self, group: Sequence[WordTiming]) -> Tuple[Image.Image, List[Image.Image]]: + texts = [self._clean_word(word.word) for word in group] + + widths = [] + for text in texts: + bbox = self.font.getbbox(text) + widths.append(bbox[2] - bbox[0]) + + total_width = sum(widths) + if len(widths) > 1: + total_width += self.space_width * (len(widths) 
- 1) + + start_x = max(0, (self.canvas_width - total_width) // 2) + + base_image = Image.new("RGBA", (self.canvas_width, self.canvas_height), (0, 0, 0, 0)) + base_draw = ImageDraw.Draw(base_image) + highlight_images: List[Image.Image] = [] + + x = start_x + for text, width in zip(texts, widths): + base_draw.text((x, self.baseline), text, font=self.font, fill=self.base_color) + + highlight_image = Image.new("RGBA", base_image.size, (0, 0, 0, 0)) + highlight_draw = ImageDraw.Draw(highlight_image) + highlight_draw.text( + (x, self.baseline), text, font=self.font, fill=self.highlight_color + ) + highlight_images.append(highlight_image) + + x += width + self.space_width + + return base_image, highlight_images + + def _group_words(self, words: Sequence[WordTiming]) -> List[List[WordTiming]]: + if not words: + return [] + + grouped: List[List[WordTiming]] = [] + buffer: List[WordTiming] = [] + + for word in words: + buffer.append(word) + if len(buffer) == self.max_words: + grouped.append(buffer) + buffer = [] + + if buffer: + if len(buffer) == 1 and grouped: + grouped[-1].extend(buffer) + else: + grouped.append(buffer) + + # Rebalance groups to respect minimum size when possible + for idx, group in enumerate(grouped[:-1]): + if len(group) < self.min_words and len(grouped[idx + 1]) > self.min_words: + deficit = self.min_words - len(group) + transfer = grouped[idx + 1][:deficit] + grouped[idx] = group + transfer + grouped[idx + 1] = grouped[idx + 1][deficit:] + + grouped = [grp for grp in grouped if grp] + return grouped + + @staticmethod + def _clean_word(text: str) -> str: + text = text.strip() + text = re.sub(r"\s+", " ", text) + return text or "..." + + +class VideoRenderer: + def __init__(self, settings: Settings) -> None: + self.settings = settings + self.captions = CaptionBuilder(settings) + + def render( + self, + workspace_path: str, + highlight_windows: Sequence, + transcription: TranscriptionResult, + titles: Sequence[str], + output_dir, + ) -> List[Tuple[str, float, float, str, str, int]]: + results: List[Tuple[str, float, float, str, str, int]] = [] + + with VideoFileClip(workspace_path) as base_clip: + video_duration = base_clip.duration or 0 + for index, window in enumerate(highlight_windows, start=1): + start = clamp_time(window.start) + end = clamp_time(window.end) + start = min(start, video_duration) + end = min(end, video_duration) + if end <= start: + logger.info("Janela ignorada por intervalo invalido: %s", window) + continue + + subclip = base_clip.subclipped(start, end) + try: + rendered_path = self._render_single_clip( + subclip=subclip, + start=start, + end=end, + title=titles[index - 1] if index - 1 < len(titles) else window.summary, + summary=window.summary, + index=index, + transcription=transcription, + output_dir=output_dir, + ) + finally: + subclip.close() + + results.append( + ( + rendered_path, + float(start), + float(end), + titles[index - 1] if index - 1 < len(titles) else window.summary, + window.summary, + index, + ) + ) + + return results + + def _render_single_clip( + self, + subclip: VideoFileClip, + start: float, + end: float, + title: str, + summary: str, + index: int, + transcription: TranscriptionResult, + output_dir, + ) -> str: + duration = end - start + frame_w = self.settings.rendering.frame_width + frame_h = self.settings.rendering.frame_height + top_h = int(frame_h * 0.18) + bottom_h = int(frame_h * 0.20) + video_area_h = frame_h - top_h - bottom_h + + scale_factor = min( + frame_w / subclip.w, + video_area_h / subclip.h, + ) + resized_clip = 
subclip.resized(scale_factor) + video_y = top_h + (video_area_h - resized_clip.h) // 2 + + video_clip = resized_clip.with_position( + ((frame_w - resized_clip.w) // 2, video_y) + ) + + background = ColorClip(size=(frame_w, frame_h), color=(0, 0, 0)).with_duration(duration) + top_panel = ( + ColorClip(size=(frame_w, top_h), color=(12, 12, 12)) + .with_duration(duration) + .with_opacity(0.85) + ) + bottom_panel = ( + ColorClip(size=(frame_w, bottom_h), color=(12, 12, 12)) + .with_position((0, frame_h - bottom_h)) + .with_duration(duration) + .with_opacity(0.85) + ) + + title_text = title or summary + wrapped_title = self._wrap_text(title_text, max_width=frame_w - 160) + title_clip = ( + TextClip( + text=wrapped_title, + font=str(self.settings.rendering.font_path), + font_size=self.settings.rendering.title_font_size, + color=self.settings.rendering.base_color, + method="caption", + size=(frame_w - 160, top_h - 40), + ) + .with_duration(duration) + ) + title_clip = title_clip.with_position( + ((frame_w - title_clip.w) // 2, (top_h - title_clip.h) // 2) + ) + + words = self._collect_words(transcription, start, end) + caption_sets = self.captions.build(words, clip_start=start) + + caption_clips = [] + caption_resources: List[ImageClip] = [] + caption_y = frame_h - bottom_h + (bottom_h - self.captions.canvas_height) // 2 + for clip_set in caption_sets: + base_positioned = clip_set.base.with_position(("center", caption_y)) + caption_clips.append(base_positioned) + caption_resources.append(clip_set.base) + for highlight in clip_set.highlights: + positioned = highlight.with_position(("center", caption_y)) + caption_clips.append(positioned) + caption_resources.append(highlight) + + if not caption_clips: + fallback_text = self._wrap_text(summary or title, max_width=frame_w - 160) + caption_clips.append( + TextClip( + text=fallback_text, + font=str(self.settings.rendering.font_path), + font_size=self.settings.rendering.subtitle_font_size, + color=self.settings.rendering.base_color, + method="caption", + size=(frame_w - 160, bottom_h - 40), + ) + .with_duration(duration) + .with_position(("center", caption_y)) + ) + + composite = CompositeVideoClip( + [background, top_panel, bottom_panel, video_clip, title_clip, *caption_clips], + size=(frame_w, frame_h), + ) + + output_path = output_dir / f"clip_{index:02d}.mp4" + composite.write_videofile( + str(output_path), + codec=self.settings.rendering.video_codec, + audio_codec=self.settings.rendering.audio_codec, + fps=self.settings.rendering.fps, + bitrate=self.settings.rendering.bitrate, + ffmpeg_params=[ + "-preset", + self.settings.rendering.preset, + "-pix_fmt", + "yuv420p", + ], + temp_audiofile=str(output_dir / f"temp_audio_{index:02d}.m4a"), + remove_temp=True, + threads=4, + ) + + composite.close() + resized_clip.close() + video_clip.close() + title_clip.close() + background.close() + top_panel.close() + bottom_panel.close() + for clip in caption_clips: + clip.close() + for clip in caption_resources: + clip.close() + + return str(output_path) + + def _collect_words( + self, transcription: TranscriptionResult, start: float, end: float + ) -> List[WordTiming]: + collected: List[WordTiming] = [] + for segment in transcription.segments: + if segment.end < start or segment.start > end: + continue + + if segment.words: + for word in segment.words: + if word.end < start or word.start > end: + continue + collected.append( + WordTiming( + start=max(start, word.start), + end=min(end, word.end), + word=word.word, + ) + ) + else: + 
collected.extend(self._fallback_words(segment.text, segment.start, segment.end, start, end)) + + collected.sort(key=lambda w: w.start) + return collected + + def _fallback_words( + self, + text: str, + segment_start: float, + segment_end: float, + window_start: float, + window_end: float, + ) -> Iterable[WordTiming]: + words = [w for w in re.split(r"\s+", text.strip()) if w] + if not words: + return [] + + seg_start = max(segment_start, window_start) + seg_end = min(segment_end, window_end) + duration = max(0.01, seg_end - seg_start) + step = duration / len(words) + + timings: List[WordTiming] = [] + for idx, word in enumerate(words): + w_start = seg_start + idx * step + w_end = min(seg_end, w_start + step) + timings.append(WordTiming(start=w_start, end=w_end, word=word)) + return timings + + @staticmethod + def _wrap_text(text: str, max_width: int) -> str: + text = text.strip() + if not text: + return "" + + words = text.split() + lines: List[str] = [] + current: List[str] = [] + for word in words: + current.append(word) + if len(" ".join(current)) > max_width // 18: + lines.append(" ".join(current[:-1])) + current = [current[-1]] + if current: + lines.append(" ".join(current)) + return "\n".join(lines) diff --git a/video_render/transcription.py b/video_render/transcription.py new file mode 100644 index 0000000..bf5d695 --- /dev/null +++ b/video_render/transcription.py @@ -0,0 +1,122 @@ +from __future__ import annotations + +import json +import logging +from dataclasses import dataclass +from pathlib import Path +from typing import List, Optional + +from faster_whisper import WhisperModel + +from .config import Settings + +logger = logging.getLogger(__name__) + + +@dataclass(frozen=True) +class WordTiming: + start: float + end: float + word: str + + +@dataclass(frozen=True) +class TranscriptSegment: + id: int + start: float + end: float + text: str + words: List[WordTiming] + + +@dataclass(frozen=True) +class TranscriptionResult: + segments: List[TranscriptSegment] + full_text: str + + +class TranscriptionService: + def __init__(self, settings: Settings) -> None: + self.settings = settings + self._model: Optional[WhisperModel] = None + + def _load_model(self) -> WhisperModel: + if self._model is None: + logger.info( + "Carregando modelo Faster-Whisper '%s' (device=%s, compute_type=%s)", + self.settings.whisper.model_size, + self.settings.whisper.device or "auto", + self.settings.whisper.compute_type or "default", + ) + self._model = WhisperModel( + self.settings.whisper.model_size, + device=self.settings.whisper.device or "auto", + compute_type=self.settings.whisper.compute_type or "default", + download_root=str(self.settings.whisper.download_root), + ) + return self._model + + def transcribe(self, audio_path: Path) -> TranscriptionResult: + model = self._load_model() + segments, _ = model.transcribe( + str(audio_path), + beam_size=5, + word_timestamps=True, + ) + + parsed_segments: List[TranscriptSegment] = [] + full_text_parts: List[str] = [] + + for idx, segment in enumerate(segments): + words = [ + WordTiming(start=w.start, end=w.end, word=w.word.strip()) + for w in segment.words or [] + if w.word.strip() + ] + text = segment.text.strip() + full_text_parts.append(text) + parsed_segments.append( + TranscriptSegment( + id=idx, + start=segment.start, + end=segment.end, + text=text, + words=words, + ) + ) + + return TranscriptionResult( + segments=parsed_segments, + full_text=" ".join(full_text_parts).strip(), + ) + + @staticmethod + def persist(result: TranscriptionResult, destination: 
Path) -> None: + json_path = destination / "transcription.json" + text_path = destination / "transcription.txt" + + payload = { + "segments": [ + { + "id": segment.id, + "start": segment.start, + "end": segment.end, + "text": segment.text, + "words": [ + {"start": word.start, "end": word.end, "text": word.word} + for word in segment.words + ], + } + for segment in result.segments + ], + "full_text": result.full_text, + } + + with json_path.open("w", encoding="utf-8") as fp: + json.dump(payload, fp, ensure_ascii=False, indent=2) + + with text_path.open("w", encoding="utf-8") as fp: + fp.write(result.full_text) + + logger.info("Transcrição salva em %s", destination) + diff --git a/video_render/utils.py b/video_render/utils.py new file mode 100644 index 0000000..8d8a4fd --- /dev/null +++ b/video_render/utils.py @@ -0,0 +1,38 @@ +from __future__ import annotations + +import re +import unicodedata +from pathlib import Path +from typing import Iterable + + +def sanitize_filename(name: str) -> str: + normalized = unicodedata.normalize("NFKD", name) + ascii_text = normalized.encode("ASCII", "ignore").decode() + ascii_text = ascii_text.lower() + ascii_text = ascii_text.replace(" ", "_") + ascii_text = re.sub(r"[^a-z0-9_\-\.]", "", ascii_text) + ascii_text = re.sub(r"_+", "_", ascii_text) + return ascii_text.strip("_") or "video" + + +def ensure_workspace(root: Path, folder_name: str) -> Path: + workspace = root / folder_name + workspace.mkdir(parents=True, exist_ok=True) + return workspace + + +def remove_paths(paths: Iterable[Path]) -> None: + for path in paths: + if not path.exists(): + continue + if path.is_file() or path.is_symlink(): + path.unlink(missing_ok=True) + else: + for child in sorted(path.rglob("*"), reverse=True): + if child.is_file() or child.is_symlink(): + child.unlink(missing_ok=True) + elif child.is_dir(): + child.rmdir() + path.rmdir() +
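
A quick, hedged illustration of the helpers added in video_render/utils.py above. The filename is invented for the example; the expected output follows directly from the normalization steps in sanitize_filename (NFKD decomposition, ASCII strip, lowercasing, space-to-underscore, punctuation removal).

    # Illustrative only: the filename below is a made-up example.
    from pathlib import Path

    from video_render.utils import ensure_workspace, remove_paths, sanitize_filename

    name = sanitize_filename(Path("Meu Vídeo (Final) 01.mp4").stem)
    print(name)  # -> "meu_video_final_01" (accents stripped, spaces -> "_", parentheses removed)

    # Creates /tmp/videos/meu_video_final_01 if it does not exist yet.
    workspace = ensure_workspace(Path("/tmp/videos"), name)

    # Deletes files and symlinks depth-first, then removes the directory itself.
    remove_paths([workspace])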
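The entry point that wires these modules together is not part of this hunk, so the following is only a minimal sketch of how they compose. The `Settings()` no-argument constructor is an assumption, since video_render/config.py is not shown here; the handler signature matches MessageHandler in video_render/messaging.py.

    # Hedged sketch of an entry point wiring the new modules together.
    # Assumption: Settings can be built with a no-argument constructor; the real
    # video_render/config.py (not shown in this diff section) may expose a loader instead.
    from video_render.config import Settings
    from video_render.logging_utils import setup_logging
    from video_render.messaging import RabbitMQWorker
    from video_render.pipeline import VideoPipeline


    def main() -> None:
        setup_logging()
        settings = Settings()  # assumption: construction may differ in config.py
        pipeline = VideoPipeline(settings)
        worker = RabbitMQWorker(settings)
        # consume_forever decodes each queue message, passes it to the handler,
        # and publishes whatever dict the handler returns to the upload queue.
        worker.consume_forever(pipeline.process_message)


    if __name__ == "__main__":
        main()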
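For reference, a hedged example of the queue contract implied by VideoPipeline._parse_job and _build_success_payload. Only the keys come from the code above; the values are invented, and the relative output paths depend on how videos_dir and outputs_dir are configured.

    # Invented sample values; only the field names are taken from the pipeline code above.
    incoming_message = {
        "filename": "interview.mp4",  # required: _parse_job raises ValueError without it
        "videoId": "abc123",          # optional ("video_id" is accepted as well)
        "url": "https://example.invalid/interview",  # optional, echoed back untouched
    }

    # On success, process_message returns a payload roughly shaped like this,
    # which RabbitMQWorker publishes to the upload queue:
    success_payload = {
        "hasError": False,
        "videosProcessedQuantity": 1,
        "filename": "interview.mp4",
        "videoId": "abc123",
        "url": "https://example.invalid/interview",
        "workspaceFolder": "interview",
        "outputDirectory": "outputs/interview",
        "processedFiles": [
            {
                "path": "outputs/interview/clip_01.mp4",
                "start": 42.0,
                "end": 87.5,
                "title": "Generated clip title",
                "summary": "Short summary of the highlight",
                "clipIndex": 1,
            }
        ],
    }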