init repo
src/main.py  248 lines  Normal file
@@ -0,0 +1,248 @@
"""
Main application service for proxy scraping, validation, and storage.
"""
import logging
import sys
import random
from datetime import datetime

from apscheduler.schedulers.blocking import BlockingScheduler
from apscheduler.triggers.cron import CronTrigger
import pytz
import colorlog

from config import settings
from database import DatabaseManager
from validator import ProxyValidator
from scrapers import scrape_from_file
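Note: config.py is not included in this commit, so the exact shape of settings is not shown here. The fields referenced throughout this module imply roughly the structure below; this is a minimal sketch only, with field names taken from their usage in this file and defaults invented as placeholders.

# Assumed shape of config.settings (config.py is not part of this diff; defaults are placeholders).
from dataclasses import dataclass


@dataclass
class Settings:
    log_level: str = "INFO"
    proxies_file: str = "proxies.txt"
    schedule_hour_start: int = 0
    schedule_hour_end: int = 24
    proxy_timeout: int = 10  # seconds
    validation_url: str = "https://example.org/ip"
    postgres_host: str = "localhost"
    postgres_port: int = 5432
    postgres_db: str = "proxies"


settings = Settings()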
def setup_logging():
    """Configure colored logging."""
    handler = colorlog.StreamHandler()
    handler.setFormatter(
        colorlog.ColoredFormatter(
            "%(log_color)s%(asctime)s - %(name)s - %(levelname)s - %(message)s",
            datefmt="%Y-%m-%d %H:%M:%S",
            log_colors={
                "DEBUG": "cyan",
                "INFO": "green",
                "WARNING": "yellow",
                "ERROR": "red",
                "CRITICAL": "red,bg_white",
            },
        )
    )

    root_logger = logging.getLogger()
    root_logger.addHandler(handler)
    root_logger.setLevel(getattr(logging, settings.log_level.upper()))


logger = logging.getLogger(__name__)


class ProxyScrapingService:
    """Main service for orchestrating proxy scraping operations."""

    def __init__(self):
        """Initialize the proxy scraping service."""
        self.db = DatabaseManager()
        self.validator = ProxyValidator()
        self.scheduler = BlockingScheduler(timezone=pytz.UTC)

    def run_scraping_job(self):
        """Execute the complete scraping, validation, and storage workflow."""
        job_start = datetime.now()
        logger.info("=" * 80)
        logger.info(f"Starting proxy scraping job at {job_start}")
        logger.info("=" * 80)

        try:
            # Step 1: Scrape proxies from sources
            logger.info("Step 1: Scraping proxies from sources...")
            raw_proxies = scrape_from_file(settings.proxies_file)

            if not raw_proxies:
                logger.warning("No proxies scraped from sources")
                return

            logger.info(f"Scraped {len(raw_proxies)} proxies from sources")

            # Step 2: Remove duplicates based on IP:PORT:PROTOCOL
            logger.info("Step 2: Removing duplicates...")
            unique_proxies = self._deduplicate_proxies(raw_proxies)
            logger.info(
                f"Reduced to {len(unique_proxies)} unique proxies "
                f"(removed {len(raw_proxies) - len(unique_proxies)} duplicates)"
            )

            # Step 3: Validate proxies
            logger.info("Step 3: Validating proxies for connectivity and anonymity...")
            validated_proxies = self.validator.validate_proxies_bulk(
                unique_proxies, max_workers=20
            )

            if not validated_proxies:
                logger.warning("No proxies passed validation")
                return

            logger.info(
                f"{len(validated_proxies)} proxies validated successfully "
                f"({len(validated_proxies) / len(unique_proxies) * 100:.1f}% success rate)"
            )

            # Step 4: Store in database
            logger.info("Step 4: Storing validated proxies in database...")
            inserted_count = 0

            for proxy in validated_proxies:
                if self.db.insert_proxy(proxy):
                    inserted_count += 1

            logger.info(
                f"Inserted {inserted_count} new anonymous proxies into database "
                f"({len(validated_proxies) - inserted_count} already existed)"
            )

            # Step 5: Display statistics
            logger.info("Step 5: Database statistics...")
            stats = self.db.get_stats()
            self._display_stats(stats)

        except Exception as e:
            logger.error(f"Error during scraping job: {e}", exc_info=True)
        finally:
            job_end = datetime.now()
            duration = (job_end - job_start).total_seconds()
            logger.info("=" * 80)
            logger.info(f"Scraping job completed at {job_end}")
            logger.info(f"Total duration: {duration:.2f} seconds")
            logger.info("=" * 80)
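Note: the scrapers, validator, and database modules are not part of this diff, so only the contract that run_scraping_job relies on can be read off the calls above. A sketch of those implied interfaces follows (signatures inferred from usage in this file; anything beyond the ip_address/port/protocol keys is an assumption, not the actual implementation).

# Implied interfaces only; the real scrapers/validator/database modules are not in this commit.
from typing import Protocol


def scrape_from_file(path: str) -> list[dict]:
    """Assumed: parses the sources listed in `path` and returns raw proxy dicts
    containing at least 'ip_address', 'port', and 'protocol'."""
    ...


class ProxyValidatorLike(Protocol):
    def validate_proxies_bulk(self, proxies: list[dict], max_workers: int = 20) -> list[dict]:
        """Assumed: returns only the proxies that pass connectivity and anonymity checks."""
        ...


class DatabaseManagerLike(Protocol):
    def insert_proxy(self, proxy: dict) -> bool:
        """Assumed: True if a new row was inserted, False if the proxy already existed."""
        ...

    def get_stats(self) -> dict: ...

    def close(self) -> None: ...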
    def _deduplicate_proxies(self, proxies: list) -> list:
        """
        Remove duplicate proxies based on IP:PORT:PROTOCOL.

        Args:
            proxies: List of proxy dictionaries

        Returns:
            List of unique proxies
        """
        seen = set()
        unique = []

        for proxy in proxies:
            key = (
                proxy["ip_address"],
                proxy["port"],
                proxy["protocol"],
            )
            if key not in seen:
                seen.add(key)
                unique.append(proxy)

        return unique
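For illustration, the key keeps only the first occurrence of each (ip_address, port, protocol) triple; the same host on a different port or protocol is kept. Example with made-up data:

# Hypothetical input: entries 1 and 2 collide on the dedup key, entry 3 differs by port/protocol.
raw = [
    {"ip_address": "203.0.113.1", "port": 8080, "protocol": "http", "source": "list-a"},
    {"ip_address": "203.0.113.1", "port": 8080, "protocol": "http", "source": "list-b"},
    {"ip_address": "203.0.113.1", "port": 1080, "protocol": "socks5"},
]
# _deduplicate_proxies(raw) would return the first and third entries (2 unique of 3).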
    def _display_stats(self, stats: dict):
        """
        Display database statistics.

        Args:
            stats: Statistics dictionary from database
        """
        logger.info("Database Statistics:")
        logger.info(f" Total Proxies: {stats.get('total_proxies', 0)}")
        logger.info(f" Active Proxies: {stats.get('active_proxies', 0)}")
        logger.info(f" Anonymous Proxies: {stats.get('anonymous_proxies', 0)}")
        logger.info(f" Unique Protocols: {stats.get('unique_protocols', 0)}")
        logger.info(f" Unique Countries: {stats.get('unique_countries', 0)}")

        avg_response = stats.get("avg_response_time")
        if avg_response:
            logger.info(f" Avg Response Time: {avg_response:.2f}ms")

    def schedule_daily_job(self):
        """Schedule the scraping job to run once daily between configured hours."""
        # Generate random time between start and end hour
        random_hour = random.randint(
            settings.schedule_hour_start, settings.schedule_hour_end - 1
        )
        random_minute = random.randint(0, 59)

        logger.info(
            f"Scheduling daily scraping job at {random_hour:02d}:{random_minute:02d} UTC"
        )

        # Create cron trigger for daily execution
        trigger = CronTrigger(
            hour=random_hour, minute=random_minute, timezone=pytz.UTC
        )

        self.scheduler.add_job(
            self.run_scraping_job,
            trigger=trigger,
            id="daily_proxy_scraping",
            name="Daily Proxy Scraping Job",
            replace_existing=True,
        )
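A concrete example of the scheduling window, with hypothetical config values; the end hour is exclusive because of the "- 1":

# With schedule_hour_start=2 and schedule_hour_end=6, the hour is drawn from randint(2, 5),
# so the job lands somewhere between 02:00 and 05:59 UTC; 06:xx is never chosen.
import random
hour = random.randint(2, 6 - 1)
minute = random.randint(0, 59)
print(f"example fire time: {hour:02d}:{minute:02d} UTC")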
    def run_immediate(self):
        """Run scraping job immediately (for testing or manual execution)."""
        logger.info("Running immediate scraping job...")
        self.run_scraping_job()

    def start_scheduler(self):
        """Start the scheduler and wait for scheduled jobs."""
        try:
            self.schedule_daily_job()

            logger.info("Scheduler started. Waiting for scheduled jobs...")
            logger.info("Press Ctrl+C to exit")

            # Also run immediately on startup
            logger.info("Running initial scraping job on startup...")
            self.run_scraping_job()

            # Start scheduler
            self.scheduler.start()

        except (KeyboardInterrupt, SystemExit):
            logger.info("Scheduler shutdown requested")
            self.scheduler.shutdown()
            self.db.close()
        except Exception as e:
            logger.error(f"Scheduler error: {e}", exc_info=True)
            self.db.close()
            sys.exit(1)


def main():
    """Main entry point for the application."""
    setup_logging()

    logger.info("Proxy Scraping Service Starting...")
    logger.info("Configuration:")
    logger.info(f" PostgreSQL: {settings.postgres_host}:{settings.postgres_port}")
    logger.info(f" Database: {settings.postgres_db}")
    logger.info(f" Proxies File: {settings.proxies_file}")
    logger.info(
        f" Schedule: Daily between {settings.schedule_hour_start:02d}:00 - {settings.schedule_hour_end:02d}:00 UTC"
    )
    logger.info(f" Proxy Timeout: {settings.proxy_timeout}s")
    logger.info(f" Validation URL: {settings.validation_url}")

    service = ProxyScrapingService()

    # Check for command line arguments
    if len(sys.argv) > 1 and sys.argv[1] == "--immediate":
        # Run immediately and exit
        service.run_immediate()
        service.db.close()
    else:
        # Start scheduler for recurring jobs
        service.start_scheduler()


if __name__ == "__main__":
    main()
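Usage note: as wired up in main(), running python src/main.py --immediate performs a single scrape-validate-store pass and exits, while running the module with no arguments performs one pass right away and then blocks on the daily scheduler until interrupted.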