11 KiB
11 KiB
SQL Queries Úteis - Proxy Scraping Service
Este documento contém queries SQL úteis para gerenciar e consultar a base de dados de proxies.
Acesso ao Banco
Via Docker Compose
docker-compose exec postgres psql -U postgres -d proxies
Via pgAdmin
- Inicie com:
docker-compose --profile admin up -d - Acesse: http://localhost:5050
- Login: admin@admin.com / admin
- Conecte ao servidor postgres
Queries de Consulta
1. Listar Proxies Ativos e Anônimos
SELECT
ip_address,
port,
protocol,
country_name,
response_time_ms,
last_checked_at,
source
FROM proxies
WHERE is_active = TRUE
AND is_anonymous = TRUE
ORDER BY response_time_ms ASC
LIMIT 50;
2. Proxies Mais Rápidos por Protocolo
SELECT
protocol,
ip_address,
port,
response_time_ms,
country_name
FROM proxies
WHERE is_active = TRUE
AND is_anonymous = TRUE
ORDER BY protocol, response_time_ms ASC;
3. Estatísticas Gerais
SELECT
COUNT(*) as total_proxies,
COUNT(*) FILTER (WHERE is_active = TRUE) as active_proxies,
COUNT(*) FILTER (WHERE is_anonymous = TRUE) as anonymous_proxies,
COUNT(*) FILTER (WHERE is_active = TRUE AND is_anonymous = TRUE) as usable_proxies,
COUNT(DISTINCT protocol) as unique_protocols,
COUNT(DISTINCT country_code) as unique_countries,
ROUND(AVG(response_time_ms)) as avg_response_time_ms,
MIN(response_time_ms) as min_response_time_ms,
MAX(response_time_ms) as max_response_time_ms
FROM proxies;
4. Proxies por País
SELECT
country_name,
country_code,
COUNT(*) as total,
COUNT(*) FILTER (WHERE is_active = TRUE) as active,
COUNT(*) FILTER (WHERE is_anonymous = TRUE) as anonymous,
ROUND(AVG(response_time_ms)) as avg_response_time
FROM proxies
WHERE country_name IS NOT NULL
GROUP BY country_name, country_code
ORDER BY total DESC;
5. Proxies por Protocolo
SELECT
protocol,
COUNT(*) as total,
COUNT(*) FILTER (WHERE is_active = TRUE) as active,
COUNT(*) FILTER (WHERE is_anonymous = TRUE) as anonymous,
ROUND(AVG(response_time_ms)) as avg_response_time
FROM proxies
GROUP BY protocol
ORDER BY protocol;
6. Proxies por Fonte
SELECT
source,
COUNT(*) as total,
COUNT(*) FILTER (WHERE is_active = TRUE) as active,
COUNT(*) FILTER (WHERE is_anonymous = TRUE) as anonymous,
ROUND(AVG(response_time_ms)) as avg_response_time,
MAX(created_at) as last_scraped
FROM proxies
WHERE source IS NOT NULL
GROUP BY source
ORDER BY total DESC;
7. Top 10 Proxies com Melhor Histórico
SELECT
ip_address,
port,
protocol,
country_name,
success_count,
failure_count,
ROUND(success_count::numeric / NULLIF(success_count + failure_count, 0) * 100, 2) as success_rate,
response_time_ms,
last_successful_at
FROM proxies
WHERE is_active = TRUE
AND is_anonymous = TRUE
AND (success_count + failure_count) >= 5
ORDER BY success_rate DESC, response_time_ms ASC
LIMIT 10;
8. Proxies Adicionados Recentemente
SELECT
ip_address,
port,
protocol,
country_name,
is_active,
is_anonymous,
response_time_ms,
source,
created_at
FROM proxies
WHERE created_at >= NOW() - INTERVAL '24 hours'
ORDER BY created_at DESC;
9. Proxies que Falharam Recentemente
SELECT
ip_address,
port,
protocol,
country_name,
failure_count,
last_checked_at,
response_time_ms
FROM proxies
WHERE is_active = FALSE
AND last_checked_at >= NOW() - INTERVAL '24 hours'
ORDER BY last_checked_at DESC
LIMIT 50;
10. Distribuição por Faixa de Resposta
SELECT
CASE
WHEN response_time_ms < 100 THEN '< 100ms'
WHEN response_time_ms BETWEEN 100 AND 500 THEN '100-500ms'
WHEN response_time_ms BETWEEN 501 AND 1000 THEN '501-1000ms'
WHEN response_time_ms BETWEEN 1001 AND 2000 THEN '1-2s'
WHEN response_time_ms > 2000 THEN '> 2s'
END as response_range,
COUNT(*) as count
FROM proxies
WHERE is_active = TRUE
AND is_anonymous = TRUE
AND response_time_ms IS NOT NULL
GROUP BY response_range
ORDER BY
CASE
WHEN response_time_ms < 100 THEN 1
WHEN response_time_ms BETWEEN 100 AND 500 THEN 2
WHEN response_time_ms BETWEEN 501 AND 1000 THEN 3
WHEN response_time_ms BETWEEN 1001 AND 2000 THEN 4
WHEN response_time_ms > 2000 THEN 5
END;
Queries de Manutenção
11. Remover Proxies Inativos Antigos
-- Ver quantos seriam removidos
SELECT COUNT(*)
FROM proxies
WHERE is_active = FALSE
AND last_checked_at < NOW() - INTERVAL '30 days';
-- Remover (CUIDADO!)
DELETE FROM proxies
WHERE is_active = FALSE
AND last_checked_at < NOW() - INTERVAL '30 days';
12. Resetar Contadores de Falha
-- Para proxies que voltaram a funcionar
UPDATE proxies
SET failure_count = 0
WHERE is_active = TRUE
AND failure_count > 0
AND last_successful_at > NOW() - INTERVAL '24 hours';
13. Marcar Proxies como Inativos (Limpeza)
-- Proxies que não foram testados há muito tempo
UPDATE proxies
SET is_active = FALSE
WHERE last_checked_at < NOW() - INTERVAL '7 days'
AND is_active = TRUE;
14. Limpar Duplicatas (Se houver)
-- Identificar duplicatas
SELECT
ip_address,
port,
protocol,
COUNT(*) as count
FROM proxies
GROUP BY ip_address, port, protocol
HAVING COUNT(*) > 1;
-- Manter apenas a entrada mais recente de cada duplicata
DELETE FROM proxies
WHERE id IN (
SELECT id
FROM (
SELECT id,
ROW_NUMBER() OVER (
PARTITION BY ip_address, port, protocol
ORDER BY created_at DESC
) as rn
FROM proxies
) t
WHERE t.rn > 1
);
15. Vacuum e Análise (Otimização)
-- Limpar espaço não utilizado
VACUUM FULL proxies;
-- Atualizar estatísticas para query planner
ANALYZE proxies;
Queries de Relatório
16. Relatório Diário
SELECT
DATE(created_at) as date,
COUNT(*) as new_proxies,
COUNT(*) FILTER (WHERE is_active = TRUE) as active,
COUNT(*) FILTER (WHERE is_anonymous = TRUE) as anonymous,
ROUND(AVG(response_time_ms)) as avg_response
FROM proxies
WHERE created_at >= NOW() - INTERVAL '7 days'
GROUP BY DATE(created_at)
ORDER BY date DESC;
17. Relatório por Hora (Últimas 24h)
SELECT
DATE_TRUNC('hour', created_at) as hour,
COUNT(*) as new_proxies,
COUNT(*) FILTER (WHERE is_active = TRUE AND is_anonymous = TRUE) as usable
FROM proxies
WHERE created_at >= NOW() - INTERVAL '24 hours'
GROUP BY DATE_TRUNC('hour', created_at)
ORDER BY hour DESC;
18. Qualidade por Fonte
SELECT
source,
COUNT(*) as total,
COUNT(*) FILTER (WHERE is_active = TRUE AND is_anonymous = TRUE) as usable,
ROUND(
COUNT(*) FILTER (WHERE is_active = TRUE AND is_anonymous = TRUE)::numeric /
COUNT(*)::numeric * 100,
2
) as quality_percentage,
ROUND(AVG(response_time_ms)) as avg_response,
MAX(created_at) as last_update
FROM proxies
WHERE source IS NOT NULL
GROUP BY source
ORDER BY quality_percentage DESC;
Queries de Exportação
19. Exportar Proxies Prontos para Uso (CSV)
\copy (
SELECT
ip_address || ':' || port as proxy,
protocol,
country_name,
response_time_ms
FROM proxies
WHERE is_active = TRUE
AND is_anonymous = TRUE
ORDER BY response_time_ms ASC
) TO '/tmp/proxies_export.csv' WITH CSV HEADER;
20. Exportar em Formato URL
SELECT
CASE
WHEN protocol = 'HTTP' THEN 'http://' || ip_address || ':' || port
WHEN protocol = 'HTTPS' THEN 'https://' || ip_address || ':' || port
WHEN protocol = 'SOCKS4' THEN 'socks4://' || ip_address || ':' || port
WHEN protocol = 'SOCKS5' THEN 'socks5://' || ip_address || ':' || port
END as proxy_url,
country_name,
response_time_ms
FROM proxies
WHERE is_active = TRUE
AND is_anonymous = TRUE
ORDER BY response_time_ms ASC;
Queries de Monitoramento
21. Verificar Saúde do Banco
-- Tamanho da tabela
SELECT
pg_size_pretty(pg_total_relation_size('proxies')) as total_size,
pg_size_pretty(pg_relation_size('proxies')) as table_size,
pg_size_pretty(pg_indexes_size('proxies')) as indexes_size;
-- Número de linhas
SELECT
schemaname,
tablename,
n_live_tup as row_count,
n_dead_tup as dead_rows
FROM pg_stat_user_tables
WHERE tablename = 'proxies';
22. Performance dos Índices
SELECT
indexname,
idx_scan as index_scans,
idx_tup_read as tuples_read,
idx_tup_fetch as tuples_fetched
FROM pg_stat_user_indexes
WHERE tablename = 'proxies'
ORDER BY idx_scan DESC;
23. Queries Lentas (Diagnóstico)
-- Ativar tracking de queries (se não estiver ativo)
-- ALTER DATABASE proxies SET log_min_duration_statement = 1000;
-- Ver queries ativas
SELECT
pid,
usename,
application_name,
state,
query_start,
NOW() - query_start as duration,
query
FROM pg_stat_activity
WHERE datname = 'proxies'
AND state != 'idle'
ORDER BY duration DESC;
Uso com Python
24. Exemplo de Integração Python
import psycopg2
from psycopg2.extras import RealDictCursor
# Conectar
conn = psycopg2.connect(
host="localhost",
port=5432,
database="proxies",
user="postgres",
password="your_password"
)
# Buscar proxies
with conn.cursor(cursor_factory=RealDictCursor) as cursor:
cursor.execute("""
SELECT ip_address, port, protocol, response_time_ms
FROM proxies
WHERE is_active = TRUE AND is_anonymous = TRUE
ORDER BY response_time_ms ASC
LIMIT 10
""")
proxies = cursor.fetchall()
for proxy in proxies:
print(f"{proxy['protocol']}://{proxy['ip_address']}:{proxy['port']}")
conn.close()
Backup e Restore
25. Backup do Banco
# Via Docker
docker-compose exec postgres pg_dump -U postgres proxies > backup_proxies.sql
# Com compressão
docker-compose exec postgres pg_dump -U postgres proxies | gzip > backup_proxies.sql.gz
26. Restore do Banco
# Via Docker
docker-compose exec -T postgres psql -U postgres proxies < backup_proxies.sql
# De arquivo comprimido
gunzip -c backup_proxies.sql.gz | docker-compose exec -T postgres psql -U postgres proxies
Tips & Tricks
Performance
- Use
EXPLAIN ANALYZEantes de queries complexas - Mantenha os índices atualizados com
ANALYZE - Use
LIMITem queries exploratórias - Prefira índices compostos para queries com múltiplos filtros
Segurança
- Sempre use prepared statements em código
- Limite privilégios do usuário do banco
- Faça backup regular
- Use transações para operações críticas
Manutenção
- Execute VACUUM semanalmente
- Monitore tamanho do banco
- Limpe dados antigos periodicamente
- Documente queries customizadas
Nota: Todas as queries foram testadas no PostgreSQL 16. Ajuste conforme necessário para outras versões.