Files
proxy-scraper/QUERIES.md
LeoMortari 23c8133f3d init repo
2025-11-21 18:23:19 -03:00

11 KiB

SQL Queries Úteis - Proxy Scraping Service

Este documento contém queries SQL úteis para gerenciar e consultar a base de dados de proxies.

Acesso ao Banco

Via Docker Compose

docker-compose exec postgres psql -U postgres -d proxies

Via pgAdmin

  1. Inicie com: docker-compose --profile admin up -d
  2. Acesse: http://localhost:5050
  3. Login: admin@admin.com / admin
  4. Conecte ao servidor postgres

Queries de Consulta

1. Listar Proxies Ativos e Anônimos

SELECT
    ip_address,
    port,
    protocol,
    country_name,
    response_time_ms,
    last_checked_at,
    source
FROM proxies
WHERE is_active = TRUE
  AND is_anonymous = TRUE
ORDER BY response_time_ms ASC
LIMIT 50;

2. Proxies Mais Rápidos por Protocolo

SELECT
    protocol,
    ip_address,
    port,
    response_time_ms,
    country_name
FROM proxies
WHERE is_active = TRUE
  AND is_anonymous = TRUE
ORDER BY protocol, response_time_ms ASC;

3. Estatísticas Gerais

SELECT
    COUNT(*) as total_proxies,
    COUNT(*) FILTER (WHERE is_active = TRUE) as active_proxies,
    COUNT(*) FILTER (WHERE is_anonymous = TRUE) as anonymous_proxies,
    COUNT(*) FILTER (WHERE is_active = TRUE AND is_anonymous = TRUE) as usable_proxies,
    COUNT(DISTINCT protocol) as unique_protocols,
    COUNT(DISTINCT country_code) as unique_countries,
    ROUND(AVG(response_time_ms)) as avg_response_time_ms,
    MIN(response_time_ms) as min_response_time_ms,
    MAX(response_time_ms) as max_response_time_ms
FROM proxies;

4. Proxies por País

SELECT
    country_name,
    country_code,
    COUNT(*) as total,
    COUNT(*) FILTER (WHERE is_active = TRUE) as active,
    COUNT(*) FILTER (WHERE is_anonymous = TRUE) as anonymous,
    ROUND(AVG(response_time_ms)) as avg_response_time
FROM proxies
WHERE country_name IS NOT NULL
GROUP BY country_name, country_code
ORDER BY total DESC;

5. Proxies por Protocolo

SELECT
    protocol,
    COUNT(*) as total,
    COUNT(*) FILTER (WHERE is_active = TRUE) as active,
    COUNT(*) FILTER (WHERE is_anonymous = TRUE) as anonymous,
    ROUND(AVG(response_time_ms)) as avg_response_time
FROM proxies
GROUP BY protocol
ORDER BY protocol;

6. Proxies por Fonte

SELECT
    source,
    COUNT(*) as total,
    COUNT(*) FILTER (WHERE is_active = TRUE) as active,
    COUNT(*) FILTER (WHERE is_anonymous = TRUE) as anonymous,
    ROUND(AVG(response_time_ms)) as avg_response_time,
    MAX(created_at) as last_scraped
FROM proxies
WHERE source IS NOT NULL
GROUP BY source
ORDER BY total DESC;

7. Top 10 Proxies com Melhor Histórico

SELECT
    ip_address,
    port,
    protocol,
    country_name,
    success_count,
    failure_count,
    ROUND(success_count::numeric / NULLIF(success_count + failure_count, 0) * 100, 2) as success_rate,
    response_time_ms,
    last_successful_at
FROM proxies
WHERE is_active = TRUE
  AND is_anonymous = TRUE
  AND (success_count + failure_count) >= 5
ORDER BY success_rate DESC, response_time_ms ASC
LIMIT 10;

8. Proxies Adicionados Recentemente

SELECT
    ip_address,
    port,
    protocol,
    country_name,
    is_active,
    is_anonymous,
    response_time_ms,
    source,
    created_at
FROM proxies
WHERE created_at >= NOW() - INTERVAL '24 hours'
ORDER BY created_at DESC;

9. Proxies que Falharam Recentemente

SELECT
    ip_address,
    port,
    protocol,
    country_name,
    failure_count,
    last_checked_at,
    response_time_ms
FROM proxies
WHERE is_active = FALSE
  AND last_checked_at >= NOW() - INTERVAL '24 hours'
ORDER BY last_checked_at DESC
LIMIT 50;

10. Distribuição por Faixa de Resposta

SELECT
    CASE
        WHEN response_time_ms < 100 THEN '< 100ms'
        WHEN response_time_ms BETWEEN 100 AND 500 THEN '100-500ms'
        WHEN response_time_ms BETWEEN 501 AND 1000 THEN '501-1000ms'
        WHEN response_time_ms BETWEEN 1001 AND 2000 THEN '1-2s'
        WHEN response_time_ms > 2000 THEN '> 2s'
    END as response_range,
    COUNT(*) as count
FROM proxies
WHERE is_active = TRUE
  AND is_anonymous = TRUE
  AND response_time_ms IS NOT NULL
GROUP BY response_range
ORDER BY
    CASE
        WHEN response_time_ms < 100 THEN 1
        WHEN response_time_ms BETWEEN 100 AND 500 THEN 2
        WHEN response_time_ms BETWEEN 501 AND 1000 THEN 3
        WHEN response_time_ms BETWEEN 1001 AND 2000 THEN 4
        WHEN response_time_ms > 2000 THEN 5
    END;

Queries de Manutenção

11. Remover Proxies Inativos Antigos

-- Ver quantos seriam removidos
SELECT COUNT(*)
FROM proxies
WHERE is_active = FALSE
  AND last_checked_at < NOW() - INTERVAL '30 days';

-- Remover (CUIDADO!)
DELETE FROM proxies
WHERE is_active = FALSE
  AND last_checked_at < NOW() - INTERVAL '30 days';

12. Resetar Contadores de Falha

-- Para proxies que voltaram a funcionar
UPDATE proxies
SET failure_count = 0
WHERE is_active = TRUE
  AND failure_count > 0
  AND last_successful_at > NOW() - INTERVAL '24 hours';

13. Marcar Proxies como Inativos (Limpeza)

-- Proxies que não foram testados há muito tempo
UPDATE proxies
SET is_active = FALSE
WHERE last_checked_at < NOW() - INTERVAL '7 days'
  AND is_active = TRUE;

14. Limpar Duplicatas (Se houver)

-- Identificar duplicatas
SELECT
    ip_address,
    port,
    protocol,
    COUNT(*) as count
FROM proxies
GROUP BY ip_address, port, protocol
HAVING COUNT(*) > 1;

-- Manter apenas a entrada mais recente de cada duplicata
DELETE FROM proxies
WHERE id IN (
    SELECT id
    FROM (
        SELECT id,
               ROW_NUMBER() OVER (
                   PARTITION BY ip_address, port, protocol
                   ORDER BY created_at DESC
               ) as rn
        FROM proxies
    ) t
    WHERE t.rn > 1
);

15. Vacuum e Análise (Otimização)

-- Limpar espaço não utilizado
VACUUM FULL proxies;

-- Atualizar estatísticas para query planner
ANALYZE proxies;

Queries de Relatório

16. Relatório Diário

SELECT
    DATE(created_at) as date,
    COUNT(*) as new_proxies,
    COUNT(*) FILTER (WHERE is_active = TRUE) as active,
    COUNT(*) FILTER (WHERE is_anonymous = TRUE) as anonymous,
    ROUND(AVG(response_time_ms)) as avg_response
FROM proxies
WHERE created_at >= NOW() - INTERVAL '7 days'
GROUP BY DATE(created_at)
ORDER BY date DESC;

17. Relatório por Hora (Últimas 24h)

SELECT
    DATE_TRUNC('hour', created_at) as hour,
    COUNT(*) as new_proxies,
    COUNT(*) FILTER (WHERE is_active = TRUE AND is_anonymous = TRUE) as usable
FROM proxies
WHERE created_at >= NOW() - INTERVAL '24 hours'
GROUP BY DATE_TRUNC('hour', created_at)
ORDER BY hour DESC;

18. Qualidade por Fonte

SELECT
    source,
    COUNT(*) as total,
    COUNT(*) FILTER (WHERE is_active = TRUE AND is_anonymous = TRUE) as usable,
    ROUND(
        COUNT(*) FILTER (WHERE is_active = TRUE AND is_anonymous = TRUE)::numeric /
        COUNT(*)::numeric * 100,
        2
    ) as quality_percentage,
    ROUND(AVG(response_time_ms)) as avg_response,
    MAX(created_at) as last_update
FROM proxies
WHERE source IS NOT NULL
GROUP BY source
ORDER BY quality_percentage DESC;

Queries de Exportação

19. Exportar Proxies Prontos para Uso (CSV)

\copy (
    SELECT
        ip_address || ':' || port as proxy,
        protocol,
        country_name,
        response_time_ms
    FROM proxies
    WHERE is_active = TRUE
      AND is_anonymous = TRUE
    ORDER BY response_time_ms ASC
) TO '/tmp/proxies_export.csv' WITH CSV HEADER;

20. Exportar em Formato URL

SELECT
    CASE
        WHEN protocol = 'HTTP' THEN 'http://' || ip_address || ':' || port
        WHEN protocol = 'HTTPS' THEN 'https://' || ip_address || ':' || port
        WHEN protocol = 'SOCKS4' THEN 'socks4://' || ip_address || ':' || port
        WHEN protocol = 'SOCKS5' THEN 'socks5://' || ip_address || ':' || port
    END as proxy_url,
    country_name,
    response_time_ms
FROM proxies
WHERE is_active = TRUE
  AND is_anonymous = TRUE
ORDER BY response_time_ms ASC;

Queries de Monitoramento

21. Verificar Saúde do Banco

-- Tamanho da tabela
SELECT
    pg_size_pretty(pg_total_relation_size('proxies')) as total_size,
    pg_size_pretty(pg_relation_size('proxies')) as table_size,
    pg_size_pretty(pg_indexes_size('proxies')) as indexes_size;

-- Número de linhas
SELECT
    schemaname,
    tablename,
    n_live_tup as row_count,
    n_dead_tup as dead_rows
FROM pg_stat_user_tables
WHERE tablename = 'proxies';

22. Performance dos Índices

SELECT
    indexname,
    idx_scan as index_scans,
    idx_tup_read as tuples_read,
    idx_tup_fetch as tuples_fetched
FROM pg_stat_user_indexes
WHERE tablename = 'proxies'
ORDER BY idx_scan DESC;

23. Queries Lentas (Diagnóstico)

-- Ativar tracking de queries (se não estiver ativo)
-- ALTER DATABASE proxies SET log_min_duration_statement = 1000;

-- Ver queries ativas
SELECT
    pid,
    usename,
    application_name,
    state,
    query_start,
    NOW() - query_start as duration,
    query
FROM pg_stat_activity
WHERE datname = 'proxies'
  AND state != 'idle'
ORDER BY duration DESC;

Uso com Python

24. Exemplo de Integração Python

import psycopg2
from psycopg2.extras import RealDictCursor

# Conectar
conn = psycopg2.connect(
    host="localhost",
    port=5432,
    database="proxies",
    user="postgres",
    password="your_password"
)

# Buscar proxies
with conn.cursor(cursor_factory=RealDictCursor) as cursor:
    cursor.execute("""
        SELECT ip_address, port, protocol, response_time_ms
        FROM proxies
        WHERE is_active = TRUE AND is_anonymous = TRUE
        ORDER BY response_time_ms ASC
        LIMIT 10
    """)

    proxies = cursor.fetchall()

    for proxy in proxies:
        print(f"{proxy['protocol']}://{proxy['ip_address']}:{proxy['port']}")

conn.close()

Backup e Restore

25. Backup do Banco

# Via Docker
docker-compose exec postgres pg_dump -U postgres proxies > backup_proxies.sql

# Com compressão
docker-compose exec postgres pg_dump -U postgres proxies | gzip > backup_proxies.sql.gz

26. Restore do Banco

# Via Docker
docker-compose exec -T postgres psql -U postgres proxies < backup_proxies.sql

# De arquivo comprimido
gunzip -c backup_proxies.sql.gz | docker-compose exec -T postgres psql -U postgres proxies

Tips & Tricks

Performance

  • Use EXPLAIN ANALYZE antes de queries complexas
  • Mantenha os índices atualizados com ANALYZE
  • Use LIMIT em queries exploratórias
  • Prefira índices compostos para queries com múltiplos filtros

Segurança

  • Sempre use prepared statements em código
  • Limite privilégios do usuário do banco
  • Faça backup regular
  • Use transações para operações críticas

Manutenção

  • Execute VACUUM semanalmente
  • Monitore tamanho do banco
  • Limpe dados antigos periodicamente
  • Documente queries customizadas

Nota: Todas as queries foram testadas no PostgreSQL 16. Ajuste conforme necessário para outras versões.