From d5eebb5cc493ec5eb26f94ed681edddcde991e2f Mon Sep 17 00:00:00 2001
From: LeoMortari
Date: Tue, 2 Dec 2025 00:38:09 -0300
Subject: [PATCH] Faz ajustes de tempo entre testes de Proxy e ajusta compose

---
Review notes (this section is ignored by git am):
* test_connection.py: connection settings are now read from the POSTGRES_*
  environment variables. The host, user and password that were previously
  committed in this file must be considered leaked and rotated.
* docker-compose.yml: "${POSTGRES_PORT:}" is not a valid interpolation form
  and is now "${POSTGRES_PORT:-5432}"; DEVELOPMENT is interpolated from the
  environment instead of being a hard-coded, unquoted YAML boolean.
* src/scrapers.py: the fallback chromedriver search now stops at the first
  match (the inner "break" did not stop os.walk) and raises
  FileNotFoundError when no binary is found instead of using the wrong file.
* Line breaks and indentation of this patch were restored by hand, and the
  blob ids on the "index" lines of the three amended files are stale.
  Verify context whitespace against the tree (or apply with
  "git apply --ignore-whitespace") and regenerate with git format-patch.

 .env.example       |  5 +++
 Dockerfile         |  7 ++--
 docker-compose.yml | 15 +++++---
 src/config.py      |  5 ++-
 src/main.py        | 12 ++++--
 src/scrapers.py    | 56 ++++++++++++++++++++++++++-
 test_connection.py | 96 ++++++++++++++++++++++++++++++++++++++++++++++
 7 files changed, 180 insertions(+), 16 deletions(-)
 create mode 100644 test_connection.py

diff --git a/.env.example b/.env.example
index a15a0e8..bc8f1f4 100644
--- a/.env.example
+++ b/.env.example
@@ -5,6 +5,11 @@ POSTGRES_DB=proxies
 POSTGRES_USER=postgres
 POSTGRES_PASSWORD=your_secure_password_here
 
+# Development Mode
+# Set to True to disable scheduler and run scraping job once then exit
+# Set to False for production mode with daily scheduled jobs
+DEVELOPMENT=False
+
 # Proxy Validation Settings
 PROXY_TIMEOUT=10
 VALIDATION_URL=http://httpbin.org/ip
diff --git a/Dockerfile b/Dockerfile
index 42dba99..f72a044 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -47,11 +47,12 @@ RUN apt-get update && apt-get install -y \
     xdg-utils \
     && rm -rf /var/lib/apt/lists/*
 
-RUN wget -q -O - https://dl-ssl.google.com/linux/linux_signing_key.pub | apt-key add - \
-    && echo "deb [arch=amd64] http://dl.google.com/linux/chrome/deb/ stable main" >> /etc/apt/sources.list.d/google-chrome.list \
+RUN wget -q -O /tmp/google-chrome.gpg https://dl-ssl.google.com/linux/linux_signing_key.pub \
+    && gpg --dearmor -o /usr/share/keyrings/google-chrome.gpg /tmp/google-chrome.gpg \
+    && echo "deb [arch=amd64 signed-by=/usr/share/keyrings/google-chrome.gpg] http://dl.google.com/linux/chrome/deb/ stable main" > /etc/apt/sources.list.d/google-chrome.list \
     && apt-get update \
     && apt-get install -y google-chrome-stable \
-    && rm -rf /var/lib/apt/lists/*
+    && rm -rf /var/lib/apt/lists/* /tmp/google-chrome.gpg
 
 WORKDIR /app
 
diff --git a/docker-compose.yml b/docker-compose.yml
index a7bd33c..f0c274c 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -6,11 +6,14 @@ services:
     container_name: proxy-scraper
     environment:
       # PostgreSQL Connection
-      POSTGRES_HOST: postgres
-      POSTGRES_PORT: 5432
-      POSTGRES_DB: ${POSTGRES_DB:-proxies}
-      POSTGRES_USER: ${POSTGRES_USER:-postgres}
-      POSTGRES_PASSWORD: ${POSTGRES_PASSWORD:-postgres}
+      POSTGRES_HOST: ${POSTGRES_HOST}
+      POSTGRES_PORT: ${POSTGRES_PORT:-5432}
+      POSTGRES_DB: ${POSTGRES_DB}
+      POSTGRES_USER: ${POSTGRES_USER}
+      POSTGRES_PASSWORD: ${POSTGRES_PASSWORD}
+
+      # Development Mode
+      DEVELOPMENT: ${DEVELOPMENT:-False}
 
       # Proxy Validation
       PROXY_TIMEOUT: ${PROXY_TIMEOUT:-10}
@@ -34,7 +37,7 @@ services:
       - scraper_logs:/app/logs
     restart: unless-stopped
     networks:
-      - proxy-network
+      - dokploy-network
 
 networks:
   dokploy-network:
diff --git a/src/config.py b/src/config.py
index b602d5d..917e1dd 100644
--- a/src/config.py
+++ b/src/config.py
@@ -32,12 +32,13 @@ class Settings(BaseSettings):
     # File paths
     proxies_file: str = Field(default="/app/proxies.txt", alias="PROXIES_FILE")
 
+    # Development mode
+    development: bool = Field(default=False, alias="DEVELOPMENT")
+
     # Logging
     log_level: str = Field(default="INFO", alias="LOG_LEVEL")
 
     class Config:
-        env_file = ".env"
-        env_file_encoding = "utf-8"
         case_sensitive = False
 
 
diff --git a/src/main.py b/src/main.py
index 2313c6b..e013f7e 100644
--- a/src/main.py
+++ b/src/main.py
@@ -231,16 +231,20 @@ def main():
     )
     logger.info(f" Proxy Timeout: {settings.proxy_timeout}s")
     logger.info(f" Validation URL: {settings.validation_url}")
+    logger.info(f" Development Mode: {settings.development}")
 
     service = ProxyScrapingService()
 
-    # Check for command line arguments
-    if len(sys.argv) > 1 and sys.argv[1] == "--immediate":
-        # Run immediately and exit
+    # Check for development mode or command line arguments
+    if settings.development or (len(sys.argv) > 1 and sys.argv[1] == "--immediate"):
+        # Run immediately and exit (development mode)
+        if settings.development:
+            logger.info("Running in DEVELOPMENT mode - executing scraping job and exiting")
         service.run_immediate()
         service.db.close()
+        logger.info("Development run completed. Exiting.")
     else:
-        # Start scheduler for recurring jobs
+        # Start scheduler for recurring jobs (production mode)
         service.start_scheduler()
 
 
diff --git a/src/scrapers.py b/src/scrapers.py
index 938e691..aab084b 100644
--- a/src/scrapers.py
+++ b/src/scrapers.py
@@ -218,7 +218,61 @@ class SeleniumScraper(ProxyScraper):
                 "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
             )
 
-            service = Service(ChromeDriverManager().install())
+            # Get chromedriver path and fix webdriver-manager bug
+            import os
+            import stat
+
+            chromedriver_path = ChromeDriverManager().install()
+            logger.debug(f"Initial chromedriver path from WDM: {chromedriver_path}")
+
+            # Fix for webdriver-manager bug that may return wrong file
+            # Check if the returned file is the actual binary (not THIRD_PARTY_NOTICES or LICENSE)
+            driver_names = ("chromedriver", "chromedriver.exe")
+            filename = os.path.basename(chromedriver_path)
+
+            # If filename contains anything other than just "chromedriver" or "chromedriver.exe", it's wrong
+            if filename not in driver_names:
+                logger.warning(f"WDM returned wrong file: {filename}. Searching for correct chromedriver...")
+
+                # Navigate to the base directory containing the chromedriver
+                base_dir = os.path.dirname(chromedriver_path)
+                found_path = None
+
+                # Look for chromedriver in the same directory
+                for potential_file in driver_names:
+                    potential_path = os.path.join(base_dir, potential_file)
+                    if os.path.exists(potential_path):
+                        found_path = potential_path
+                        logger.info(f"Found correct chromedriver: {found_path}")
+                        break
+
+                if found_path is None:
+                    # If not found, search in parent directory or subdirectories.
+                    # Stop the whole walk at the first match so that a later
+                    # directory cannot overwrite the path that was already found.
+                    parent_dir = os.path.dirname(base_dir)
+                    for root, _dirs, files in os.walk(parent_dir):
+                        match = next((name for name in files if name in driver_names), None)
+                        if match is not None:
+                            found_path = os.path.join(root, match)
+                            logger.info(f"Found chromedriver at: {found_path}")
+                            break
+
+                if found_path is None:
+                    raise FileNotFoundError(f"Chromedriver not found near: {chromedriver_path}")
+
+                chromedriver_path = found_path
+
+            logger.info(f"Using chromedriver: {chromedriver_path}")
+
+            # Ensure the file has execute permissions
+            if os.path.exists(chromedriver_path):
+                current_permissions = os.stat(chromedriver_path).st_mode
+                os.chmod(chromedriver_path, current_permissions | stat.S_IXUSR | stat.S_IXGRP | stat.S_IXOTH)
+            else:
+                raise FileNotFoundError(f"Chromedriver not found at: {chromedriver_path}")
+
+            service = Service(chromedriver_path)
             self.driver = webdriver.Chrome(service=service, options=chrome_options)
             self.driver.implicitly_wait(10)
 
diff --git a/test_connection.py b/test_connection.py
new file mode 100644
index 0000000..35e5f12
--- /dev/null
+++ b/test_connection.py
@@ -0,0 +1,96 @@
+"""Test database connection and list available databases."""
+import os
+
+import psycopg2
+from psycopg2.extensions import ISOLATION_LEVEL_AUTOCOMMIT
+
+# Connection details are read from the environment (the POSTGRES_* variables
+# used by docker-compose.yml); credentials must never be committed.
+HOST = os.environ["POSTGRES_HOST"]
+PORT = int(os.environ.get("POSTGRES_PORT", "5432"))
+USER = os.environ["POSTGRES_USER"]
+PASSWORD = os.environ["POSTGRES_PASSWORD"]
+
+print(f"Testing connection to {HOST}:{PORT}")
+print(f"User: {USER}")
+print("-" * 50)
+
+try:
+    # First, connect to postgres database to list all databases
+    print("\n1. Connecting to 'postgres' database...")
+    conn = psycopg2.connect(
+        host=HOST,
+        port=PORT,
+        database="postgres",
+        user=USER,
+        password=PASSWORD
+    )
+    conn.set_isolation_level(ISOLATION_LEVEL_AUTOCOMMIT)
+
+    cursor = conn.cursor()
+
+    # List all databases
+    print("\n2. Available databases:")
+    cursor.execute("SELECT datname FROM pg_database WHERE datistemplate = false;")
+    databases = cursor.fetchall()
+    for db in databases:
+        print(f" - {db[0]}")
+
+    # Check if 'proxies' database exists
+    cursor.execute("SELECT 1 FROM pg_database WHERE datname = 'proxies';")
+    exists = cursor.fetchone()
+
+    if exists:
+        print("\n✓ Database 'proxies' EXISTS")
+
+        # Try to connect to proxies database
+        print("\n3. Connecting to 'proxies' database...")
+        conn.close()
+
+        conn_proxies = psycopg2.connect(
+            host=HOST,
+            port=PORT,
+            database="proxies",
+            user=USER,
+            password=PASSWORD
+        )
+        cursor_proxies = conn_proxies.cursor()
+
+        # Check if table exists
+        cursor_proxies.execute("""
+            SELECT EXISTS (
+                SELECT FROM information_schema.tables
+                WHERE table_name = 'proxies'
+            );
+        """)
+        table_exists = cursor_proxies.fetchone()[0]
+
+        if table_exists:
+            print("✓ Table 'proxies' EXISTS")
+
+            # Get row count
+            cursor_proxies.execute("SELECT COUNT(*) FROM proxies;")
+            count = cursor_proxies.fetchone()[0]
+            print(f"✓ Table has {count} rows")
+        else:
+            print("✗ Table 'proxies' DOES NOT EXIST")
+            print("\nYou need to run init-db.sql")
+
+        cursor_proxies.close()
+        conn_proxies.close()
+
+    else:
+        print("\n✗ Database 'proxies' DOES NOT EXIST")
+        print("\nTo create it, run:")
+        print(f" CREATE DATABASE proxies;")
+
+    cursor.close()
+    conn.close()
+
+    print("\n" + "=" * 50)
+    print("Connection test completed successfully!")
+
+except psycopg2.OperationalError as e:
+    print(f"\n✗ Connection ERROR: {e}")
+except Exception as e:
+    print(f"\n✗ Unexpected ERROR: {e}")