init repo
src/main.py  248 lines  Normal file
@@ -0,0 +1,248 @@
"""
Main application service for proxy scraping, validation, and storage.
"""
import logging
import sys
import random
from datetime import datetime

from apscheduler.schedulers.blocking import BlockingScheduler
from apscheduler.triggers.cron import CronTrigger
import pytz
import colorlog

from config import settings
from database import DatabaseManager
from validator import ProxyValidator
from scrapers import scrape_from_file
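Note: config.py is not included in this commit, so the exact shape of settings is not shown here. The fields referenced throughout this module imply roughly the structure below; this is a minimal sketch only, with field names taken from their usage in this file and defaults invented as placeholders.

# Assumed shape of config.settings (config.py is not part of this diff; defaults are placeholders).
from dataclasses import dataclass


@dataclass
class Settings:
    log_level: str = "INFO"
    proxies_file: str = "proxies.txt"
    schedule_hour_start: int = 0
    schedule_hour_end: int = 24
    proxy_timeout: int = 10  # seconds
    validation_url: str = "https://example.org/ip"
    postgres_host: str = "localhost"
    postgres_port: int = 5432
    postgres_db: str = "proxies"


settings = Settings()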
def setup_logging():
    """Configure colored logging."""
    handler = colorlog.StreamHandler()
    handler.setFormatter(
        colorlog.ColoredFormatter(
            "%(log_color)s%(asctime)s - %(name)s - %(levelname)s - %(message)s",
            datefmt="%Y-%m-%d %H:%M:%S",
            log_colors={
                "DEBUG": "cyan",
                "INFO": "green",
                "WARNING": "yellow",
                "ERROR": "red",
                "CRITICAL": "red,bg_white",
            },
        )
    )

    root_logger = logging.getLogger()
    root_logger.addHandler(handler)
    root_logger.setLevel(getattr(logging, settings.log_level.upper()))


logger = logging.getLogger(__name__)


class ProxyScrapingService:
    """Main service for orchestrating proxy scraping operations."""

    def __init__(self):
        """Initialize the proxy scraping service."""
        self.db = DatabaseManager()
        self.validator = ProxyValidator()
        self.scheduler = BlockingScheduler(timezone=pytz.UTC)

    def run_scraping_job(self):
        """Execute the complete scraping, validation, and storage workflow."""
        job_start = datetime.now()
        logger.info("=" * 80)
        logger.info(f"Starting proxy scraping job at {job_start}")
        logger.info("=" * 80)

        try:
            # Step 1: Scrape proxies from sources
            logger.info("Step 1: Scraping proxies from sources...")
            raw_proxies = scrape_from_file(settings.proxies_file)

            if not raw_proxies:
                logger.warning("No proxies scraped from sources")
                return

            logger.info(f"Scraped {len(raw_proxies)} proxies from sources")

            # Step 2: Remove duplicates based on IP:PORT:PROTOCOL
            logger.info("Step 2: Removing duplicates...")
            unique_proxies = self._deduplicate_proxies(raw_proxies)
            logger.info(
                f"Reduced to {len(unique_proxies)} unique proxies "
                f"(removed {len(raw_proxies) - len(unique_proxies)} duplicates)"
            )

            # Step 3: Validate proxies
            logger.info("Step 3: Validating proxies for connectivity and anonymity...")
            validated_proxies = self.validator.validate_proxies_bulk(
                unique_proxies, max_workers=20
            )

            if not validated_proxies:
                logger.warning("No proxies passed validation")
                return

            logger.info(
                f"{len(validated_proxies)} proxies validated successfully "
                f"({len(validated_proxies) / len(unique_proxies) * 100:.1f}% success rate)"
            )

            # Step 4: Store in database
            logger.info("Step 4: Storing validated proxies in database...")
            inserted_count = 0

            for proxy in validated_proxies:
                if self.db.insert_proxy(proxy):
                    inserted_count += 1

            logger.info(
                f"Inserted {inserted_count} new anonymous proxies into database "
                f"({len(validated_proxies) - inserted_count} already existed)"
            )

            # Step 5: Display statistics
            logger.info("Step 5: Database statistics...")
            stats = self.db.get_stats()
            self._display_stats(stats)

        except Exception as e:
            logger.error(f"Error during scraping job: {e}", exc_info=True)
        finally:
            job_end = datetime.now()
            duration = (job_end - job_start).total_seconds()
            logger.info("=" * 80)
            logger.info(f"Scraping job completed at {job_end}")
            logger.info(f"Total duration: {duration:.2f} seconds")
            logger.info("=" * 80)
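Note: the scrapers, validator, and database modules are not part of this diff, so only the contract that run_scraping_job relies on can be read off the calls above. A sketch of those implied interfaces follows (signatures inferred from usage in this file; anything beyond the ip_address/port/protocol keys is an assumption, not the actual implementation).

# Implied interfaces only; the real scrapers/validator/database modules are not in this commit.
from typing import Protocol


def scrape_from_file(path: str) -> list[dict]:
    """Assumed: parses the sources listed in `path` and returns raw proxy dicts
    containing at least 'ip_address', 'port', and 'protocol'."""
    ...


class ProxyValidatorLike(Protocol):
    def validate_proxies_bulk(self, proxies: list[dict], max_workers: int = 20) -> list[dict]:
        """Assumed: returns only the proxies that pass connectivity and anonymity checks."""
        ...


class DatabaseManagerLike(Protocol):
    def insert_proxy(self, proxy: dict) -> bool:
        """Assumed: True if a new row was inserted, False if the proxy already existed."""
        ...

    def get_stats(self) -> dict: ...

    def close(self) -> None: ...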
    def _deduplicate_proxies(self, proxies: list) -> list:
        """
        Remove duplicate proxies based on IP:PORT:PROTOCOL.

        Args:
            proxies: List of proxy dictionaries

        Returns:
            List of unique proxies
        """
        seen = set()
        unique = []

        for proxy in proxies:
            key = (
                proxy["ip_address"],
                proxy["port"],
                proxy["protocol"],
            )
            if key not in seen:
                seen.add(key)
                unique.append(proxy)

        return unique
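For illustration, the key keeps only the first occurrence of each (ip_address, port, protocol) triple; the same host on a different port or protocol is kept. Example with made-up data:

# Hypothetical input: entries 1 and 2 collide on the dedup key, entry 3 differs by port/protocol.
raw = [
    {"ip_address": "203.0.113.1", "port": 8080, "protocol": "http", "source": "list-a"},
    {"ip_address": "203.0.113.1", "port": 8080, "protocol": "http", "source": "list-b"},
    {"ip_address": "203.0.113.1", "port": 1080, "protocol": "socks5"},
]
# _deduplicate_proxies(raw) would return the first and third entries (2 unique of 3).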
    def _display_stats(self, stats: dict):
        """
        Display database statistics.

        Args:
            stats: Statistics dictionary from database
        """
        logger.info("Database Statistics:")
        logger.info(f" Total Proxies: {stats.get('total_proxies', 0)}")
        logger.info(f" Active Proxies: {stats.get('active_proxies', 0)}")
        logger.info(f" Anonymous Proxies: {stats.get('anonymous_proxies', 0)}")
        logger.info(f" Unique Protocols: {stats.get('unique_protocols', 0)}")
        logger.info(f" Unique Countries: {stats.get('unique_countries', 0)}")

        avg_response = stats.get("avg_response_time")
        if avg_response:
            logger.info(f" Avg Response Time: {avg_response:.2f}ms")

    def schedule_daily_job(self):
        """Schedule the scraping job to run once daily between configured hours."""
        # Generate random time between start and end hour
        random_hour = random.randint(
            settings.schedule_hour_start, settings.schedule_hour_end - 1
        )
        random_minute = random.randint(0, 59)

        logger.info(
            f"Scheduling daily scraping job at {random_hour:02d}:{random_minute:02d} UTC"
        )

        # Create cron trigger for daily execution
        trigger = CronTrigger(
            hour=random_hour, minute=random_minute, timezone=pytz.UTC
        )

        self.scheduler.add_job(
            self.run_scraping_job,
            trigger=trigger,
            id="daily_proxy_scraping",
            name="Daily Proxy Scraping Job",
            replace_existing=True,
        )
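A concrete example of the scheduling window, with hypothetical config values; the end hour is exclusive because of the "- 1":

# With schedule_hour_start=2 and schedule_hour_end=6, the hour is drawn from randint(2, 5),
# so the job lands somewhere between 02:00 and 05:59 UTC; 06:xx is never chosen.
import random
hour = random.randint(2, 6 - 1)
minute = random.randint(0, 59)
print(f"example fire time: {hour:02d}:{minute:02d} UTC")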
    def run_immediate(self):
        """Run scraping job immediately (for testing or manual execution)."""
        logger.info("Running immediate scraping job...")
        self.run_scraping_job()

    def start_scheduler(self):
        """Start the scheduler and wait for scheduled jobs."""
        try:
            self.schedule_daily_job()

            logger.info("Scheduler started. Waiting for scheduled jobs...")
            logger.info("Press Ctrl+C to exit")

            # Also run immediately on startup
            logger.info("Running initial scraping job on startup...")
            self.run_scraping_job()

            # Start scheduler
            self.scheduler.start()

        except (KeyboardInterrupt, SystemExit):
            logger.info("Scheduler shutdown requested")
            self.scheduler.shutdown()
            self.db.close()
        except Exception as e:
            logger.error(f"Scheduler error: {e}", exc_info=True)
            self.db.close()
            sys.exit(1)


def main():
    """Main entry point for the application."""
    setup_logging()

    logger.info("Proxy Scraping Service Starting...")
    logger.info("Configuration:")
    logger.info(f" PostgreSQL: {settings.postgres_host}:{settings.postgres_port}")
    logger.info(f" Database: {settings.postgres_db}")
    logger.info(f" Proxies File: {settings.proxies_file}")
    logger.info(
        f" Schedule: Daily between {settings.schedule_hour_start:02d}:00 - {settings.schedule_hour_end:02d}:00 UTC"
    )
    logger.info(f" Proxy Timeout: {settings.proxy_timeout}s")
    logger.info(f" Validation URL: {settings.validation_url}")

    service = ProxyScrapingService()

    # Check for command line arguments
    if len(sys.argv) > 1 and sys.argv[1] == "--immediate":
        # Run immediately and exit
        service.run_immediate()
        service.db.close()
    else:
        # Start scheduler for recurring jobs
        service.start_scheduler()


if __name__ == "__main__":
    main()
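Usage note: as wired up in main(), running python src/main.py --immediate performs a single scrape-validate-store pass and exits, while running the module with no arguments performs one pass right away and then blocks on the daily scheduler until interrupted.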