497 lines
11 KiB
Markdown
497 lines
11 KiB
Markdown
# SQL Queries Úteis - Proxy Scraping Service
|
|
|
|
Este documento contém queries SQL úteis para gerenciar e consultar a base de dados de proxies.
|
|
|
|
## Acesso ao Banco
|
|
|
|
### Via Docker Compose
|
|
```bash
|
|
docker-compose exec postgres psql -U postgres -d proxies
|
|
```
|
|
|
|
### Via pgAdmin
|
|
1. Inicie com: `docker-compose --profile admin up -d`
|
|
2. Acesse: http://localhost:5050
|
|
3. Login: admin@admin.com / admin
|
|
4. Conecte ao servidor postgres
|
|
|
|
## Queries de Consulta
|
|
|
|
### 1. Listar Proxies Ativos e Anônimos
|
|
|
|
```sql
|
|
SELECT
|
|
ip_address,
|
|
port,
|
|
protocol,
|
|
country_name,
|
|
response_time_ms,
|
|
last_checked_at,
|
|
source
|
|
FROM proxies
|
|
WHERE is_active = TRUE
|
|
AND is_anonymous = TRUE
|
|
ORDER BY response_time_ms ASC
|
|
LIMIT 50;
|
|
```
|
|
|
|
### 2. Proxies Mais Rápidos por Protocolo
|
|
|
|
```sql
|
|
SELECT
|
|
protocol,
|
|
ip_address,
|
|
port,
|
|
response_time_ms,
|
|
country_name
|
|
FROM proxies
|
|
WHERE is_active = TRUE
|
|
AND is_anonymous = TRUE
|
|
ORDER BY protocol, response_time_ms ASC;
|
|
```
|
|
|
|
### 3. Estatísticas Gerais
|
|
|
|
```sql
|
|
SELECT
|
|
COUNT(*) as total_proxies,
|
|
COUNT(*) FILTER (WHERE is_active = TRUE) as active_proxies,
|
|
COUNT(*) FILTER (WHERE is_anonymous = TRUE) as anonymous_proxies,
|
|
COUNT(*) FILTER (WHERE is_active = TRUE AND is_anonymous = TRUE) as usable_proxies,
|
|
COUNT(DISTINCT protocol) as unique_protocols,
|
|
COUNT(DISTINCT country_code) as unique_countries,
|
|
ROUND(AVG(response_time_ms)) as avg_response_time_ms,
|
|
MIN(response_time_ms) as min_response_time_ms,
|
|
MAX(response_time_ms) as max_response_time_ms
|
|
FROM proxies;
|
|
```
|
|
|
|
### 4. Proxies por País
|
|
|
|
```sql
|
|
SELECT
|
|
country_name,
|
|
country_code,
|
|
COUNT(*) as total,
|
|
COUNT(*) FILTER (WHERE is_active = TRUE) as active,
|
|
COUNT(*) FILTER (WHERE is_anonymous = TRUE) as anonymous,
|
|
ROUND(AVG(response_time_ms)) as avg_response_time
|
|
FROM proxies
|
|
WHERE country_name IS NOT NULL
|
|
GROUP BY country_name, country_code
|
|
ORDER BY total DESC;
|
|
```
|
|
|
|
### 5. Proxies por Protocolo
|
|
|
|
```sql
|
|
SELECT
|
|
protocol,
|
|
COUNT(*) as total,
|
|
COUNT(*) FILTER (WHERE is_active = TRUE) as active,
|
|
COUNT(*) FILTER (WHERE is_anonymous = TRUE) as anonymous,
|
|
ROUND(AVG(response_time_ms)) as avg_response_time
|
|
FROM proxies
|
|
GROUP BY protocol
|
|
ORDER BY protocol;
|
|
```
|
|
|
|
### 6. Proxies por Fonte
|
|
|
|
```sql
|
|
SELECT
|
|
source,
|
|
COUNT(*) as total,
|
|
COUNT(*) FILTER (WHERE is_active = TRUE) as active,
|
|
COUNT(*) FILTER (WHERE is_anonymous = TRUE) as anonymous,
|
|
ROUND(AVG(response_time_ms)) as avg_response_time,
|
|
MAX(created_at) as last_scraped
|
|
FROM proxies
|
|
WHERE source IS NOT NULL
|
|
GROUP BY source
|
|
ORDER BY total DESC;
|
|
```
|
|
|
|
### 7. Top 10 Proxies com Melhor Histórico
|
|
|
|
```sql
|
|
SELECT
|
|
ip_address,
|
|
port,
|
|
protocol,
|
|
country_name,
|
|
success_count,
|
|
failure_count,
|
|
ROUND(success_count::numeric / NULLIF(success_count + failure_count, 0) * 100, 2) as success_rate,
|
|
response_time_ms,
|
|
last_successful_at
|
|
FROM proxies
|
|
WHERE is_active = TRUE
|
|
AND is_anonymous = TRUE
|
|
AND (success_count + failure_count) >= 5
|
|
ORDER BY success_rate DESC, response_time_ms ASC
|
|
LIMIT 10;
|
|
```
|
|
|
|
### 8. Proxies Adicionados Recentemente
|
|
|
|
```sql
|
|
SELECT
|
|
ip_address,
|
|
port,
|
|
protocol,
|
|
country_name,
|
|
is_active,
|
|
is_anonymous,
|
|
response_time_ms,
|
|
source,
|
|
created_at
|
|
FROM proxies
|
|
WHERE created_at >= NOW() - INTERVAL '24 hours'
|
|
ORDER BY created_at DESC;
|
|
```
|
|
|
|
### 9. Proxies que Falharam Recentemente
|
|
|
|
```sql
|
|
SELECT
|
|
ip_address,
|
|
port,
|
|
protocol,
|
|
country_name,
|
|
failure_count,
|
|
last_checked_at,
|
|
response_time_ms
|
|
FROM proxies
|
|
WHERE is_active = FALSE
|
|
AND last_checked_at >= NOW() - INTERVAL '24 hours'
|
|
ORDER BY last_checked_at DESC
|
|
LIMIT 50;
|
|
```
|
|
|
|
### 10. Distribuição por Faixa de Resposta
|
|
|
|
```sql
|
|
SELECT
|
|
CASE
|
|
WHEN response_time_ms < 100 THEN '< 100ms'
|
|
WHEN response_time_ms BETWEEN 100 AND 500 THEN '100-500ms'
|
|
WHEN response_time_ms BETWEEN 501 AND 1000 THEN '501-1000ms'
|
|
WHEN response_time_ms BETWEEN 1001 AND 2000 THEN '1-2s'
|
|
WHEN response_time_ms > 2000 THEN '> 2s'
|
|
END as response_range,
|
|
COUNT(*) as count
|
|
FROM proxies
|
|
WHERE is_active = TRUE
|
|
AND is_anonymous = TRUE
|
|
AND response_time_ms IS NOT NULL
|
|
GROUP BY response_range
|
|
ORDER BY
|
|
CASE
|
|
WHEN response_time_ms < 100 THEN 1
|
|
WHEN response_time_ms BETWEEN 100 AND 500 THEN 2
|
|
WHEN response_time_ms BETWEEN 501 AND 1000 THEN 3
|
|
WHEN response_time_ms BETWEEN 1001 AND 2000 THEN 4
|
|
WHEN response_time_ms > 2000 THEN 5
|
|
END;
|
|
```
|
|
|
|
## Queries de Manutenção
|
|
|
|
### 11. Remover Proxies Inativos Antigos
|
|
|
|
```sql
|
|
-- Ver quantos seriam removidos
|
|
SELECT COUNT(*)
|
|
FROM proxies
|
|
WHERE is_active = FALSE
|
|
AND last_checked_at < NOW() - INTERVAL '30 days';
|
|
|
|
-- Remover (CUIDADO!)
|
|
DELETE FROM proxies
|
|
WHERE is_active = FALSE
|
|
AND last_checked_at < NOW() - INTERVAL '30 days';
|
|
```
|
|
|
|
### 12. Resetar Contadores de Falha
|
|
|
|
```sql
|
|
-- Para proxies que voltaram a funcionar
|
|
UPDATE proxies
|
|
SET failure_count = 0
|
|
WHERE is_active = TRUE
|
|
AND failure_count > 0
|
|
AND last_successful_at > NOW() - INTERVAL '24 hours';
|
|
```
|
|
|
|
### 13. Marcar Proxies como Inativos (Limpeza)
|
|
|
|
```sql
|
|
-- Proxies que não foram testados há muito tempo
|
|
UPDATE proxies
|
|
SET is_active = FALSE
|
|
WHERE last_checked_at < NOW() - INTERVAL '7 days'
|
|
AND is_active = TRUE;
|
|
```
|
|
|
|
### 14. Limpar Duplicatas (Se houver)
|
|
|
|
```sql
|
|
-- Identificar duplicatas
|
|
SELECT
|
|
ip_address,
|
|
port,
|
|
protocol,
|
|
COUNT(*) as count
|
|
FROM proxies
|
|
GROUP BY ip_address, port, protocol
|
|
HAVING COUNT(*) > 1;
|
|
|
|
-- Manter apenas a entrada mais recente de cada duplicata
|
|
DELETE FROM proxies
|
|
WHERE id IN (
|
|
SELECT id
|
|
FROM (
|
|
SELECT id,
|
|
ROW_NUMBER() OVER (
|
|
PARTITION BY ip_address, port, protocol
|
|
ORDER BY created_at DESC
|
|
) as rn
|
|
FROM proxies
|
|
) t
|
|
WHERE t.rn > 1
|
|
);
|
|
```
|
|
|
|
### 15. Vacuum e Análise (Otimização)
|
|
|
|
```sql
|
|
-- Limpar espaço não utilizado
|
|
VACUUM FULL proxies;
|
|
|
|
-- Atualizar estatísticas para query planner
|
|
ANALYZE proxies;
|
|
```
|
|
|
|
## Queries de Relatório
|
|
|
|
### 16. Relatório Diário
|
|
|
|
```sql
|
|
SELECT
|
|
DATE(created_at) as date,
|
|
COUNT(*) as new_proxies,
|
|
COUNT(*) FILTER (WHERE is_active = TRUE) as active,
|
|
COUNT(*) FILTER (WHERE is_anonymous = TRUE) as anonymous,
|
|
ROUND(AVG(response_time_ms)) as avg_response
|
|
FROM proxies
|
|
WHERE created_at >= NOW() - INTERVAL '7 days'
|
|
GROUP BY DATE(created_at)
|
|
ORDER BY date DESC;
|
|
```
|
|
|
|
### 17. Relatório por Hora (Últimas 24h)
|
|
|
|
```sql
|
|
SELECT
|
|
DATE_TRUNC('hour', created_at) as hour,
|
|
COUNT(*) as new_proxies,
|
|
COUNT(*) FILTER (WHERE is_active = TRUE AND is_anonymous = TRUE) as usable
|
|
FROM proxies
|
|
WHERE created_at >= NOW() - INTERVAL '24 hours'
|
|
GROUP BY DATE_TRUNC('hour', created_at)
|
|
ORDER BY hour DESC;
|
|
```
|
|
|
|
### 18. Qualidade por Fonte
|
|
|
|
```sql
|
|
SELECT
|
|
source,
|
|
COUNT(*) as total,
|
|
COUNT(*) FILTER (WHERE is_active = TRUE AND is_anonymous = TRUE) as usable,
|
|
ROUND(
|
|
COUNT(*) FILTER (WHERE is_active = TRUE AND is_anonymous = TRUE)::numeric /
|
|
COUNT(*)::numeric * 100,
|
|
2
|
|
) as quality_percentage,
|
|
ROUND(AVG(response_time_ms)) as avg_response,
|
|
MAX(created_at) as last_update
|
|
FROM proxies
|
|
WHERE source IS NOT NULL
|
|
GROUP BY source
|
|
ORDER BY quality_percentage DESC;
|
|
```
|
|
|
|
## Queries de Exportação
|
|
|
|
### 19. Exportar Proxies Prontos para Uso (CSV)
|
|
|
|
```sql
|
|
\copy (
|
|
SELECT
|
|
ip_address || ':' || port as proxy,
|
|
protocol,
|
|
country_name,
|
|
response_time_ms
|
|
FROM proxies
|
|
WHERE is_active = TRUE
|
|
AND is_anonymous = TRUE
|
|
ORDER BY response_time_ms ASC
|
|
) TO '/tmp/proxies_export.csv' WITH CSV HEADER;
|
|
```
|
|
|
|
### 20. Exportar em Formato URL
|
|
|
|
```sql
|
|
SELECT
|
|
CASE
|
|
WHEN protocol = 'HTTP' THEN 'http://' || ip_address || ':' || port
|
|
WHEN protocol = 'HTTPS' THEN 'https://' || ip_address || ':' || port
|
|
WHEN protocol = 'SOCKS4' THEN 'socks4://' || ip_address || ':' || port
|
|
WHEN protocol = 'SOCKS5' THEN 'socks5://' || ip_address || ':' || port
|
|
END as proxy_url,
|
|
country_name,
|
|
response_time_ms
|
|
FROM proxies
|
|
WHERE is_active = TRUE
|
|
AND is_anonymous = TRUE
|
|
ORDER BY response_time_ms ASC;
|
|
```
|
|
|
|
## Queries de Monitoramento
|
|
|
|
### 21. Verificar Saúde do Banco
|
|
|
|
```sql
|
|
-- Tamanho da tabela
|
|
SELECT
|
|
pg_size_pretty(pg_total_relation_size('proxies')) as total_size,
|
|
pg_size_pretty(pg_relation_size('proxies')) as table_size,
|
|
pg_size_pretty(pg_indexes_size('proxies')) as indexes_size;
|
|
|
|
-- Número de linhas
|
|
SELECT
|
|
schemaname,
|
|
tablename,
|
|
n_live_tup as row_count,
|
|
n_dead_tup as dead_rows
|
|
FROM pg_stat_user_tables
|
|
WHERE tablename = 'proxies';
|
|
```
|
|
|
|
### 22. Performance dos Índices
|
|
|
|
```sql
|
|
SELECT
|
|
indexname,
|
|
idx_scan as index_scans,
|
|
idx_tup_read as tuples_read,
|
|
idx_tup_fetch as tuples_fetched
|
|
FROM pg_stat_user_indexes
|
|
WHERE tablename = 'proxies'
|
|
ORDER BY idx_scan DESC;
|
|
```
|
|
|
|
### 23. Queries Lentas (Diagnóstico)
|
|
|
|
```sql
|
|
-- Ativar tracking de queries (se não estiver ativo)
|
|
-- ALTER DATABASE proxies SET log_min_duration_statement = 1000;
|
|
|
|
-- Ver queries ativas
|
|
SELECT
|
|
pid,
|
|
usename,
|
|
application_name,
|
|
state,
|
|
query_start,
|
|
NOW() - query_start as duration,
|
|
query
|
|
FROM pg_stat_activity
|
|
WHERE datname = 'proxies'
|
|
AND state != 'idle'
|
|
ORDER BY duration DESC;
|
|
```
|
|
|
|
## Uso com Python
|
|
|
|
### 24. Exemplo de Integração Python
|
|
|
|
```python
|
|
import psycopg2
|
|
from psycopg2.extras import RealDictCursor
|
|
|
|
# Conectar
|
|
conn = psycopg2.connect(
|
|
host="localhost",
|
|
port=5432,
|
|
database="proxies",
|
|
user="postgres",
|
|
password="your_password"
|
|
)
|
|
|
|
# Buscar proxies
|
|
with conn.cursor(cursor_factory=RealDictCursor) as cursor:
|
|
cursor.execute("""
|
|
SELECT ip_address, port, protocol, response_time_ms
|
|
FROM proxies
|
|
WHERE is_active = TRUE AND is_anonymous = TRUE
|
|
ORDER BY response_time_ms ASC
|
|
LIMIT 10
|
|
""")
|
|
|
|
proxies = cursor.fetchall()
|
|
|
|
for proxy in proxies:
|
|
print(f"{proxy['protocol']}://{proxy['ip_address']}:{proxy['port']}")
|
|
|
|
conn.close()
|
|
```
|
|
|
|
## Backup e Restore
|
|
|
|
### 25. Backup do Banco
|
|
|
|
```bash
|
|
# Via Docker
|
|
docker-compose exec postgres pg_dump -U postgres proxies > backup_proxies.sql
|
|
|
|
# Com compressão
|
|
docker-compose exec postgres pg_dump -U postgres proxies | gzip > backup_proxies.sql.gz
|
|
```
|
|
|
|
### 26. Restore do Banco
|
|
|
|
```bash
|
|
# Via Docker
|
|
docker-compose exec -T postgres psql -U postgres proxies < backup_proxies.sql
|
|
|
|
# De arquivo comprimido
|
|
gunzip -c backup_proxies.sql.gz | docker-compose exec -T postgres psql -U postgres proxies
|
|
```
|
|
|
|
## Tips & Tricks
|
|
|
|
### Performance
|
|
- Use `EXPLAIN ANALYZE` antes de queries complexas
|
|
- Mantenha os índices atualizados com `ANALYZE`
|
|
- Use `LIMIT` em queries exploratórias
|
|
- Prefira índices compostos para queries com múltiplos filtros
|
|
|
|
### Segurança
|
|
- Sempre use prepared statements em código
|
|
- Limite privilégios do usuário do banco
|
|
- Faça backup regular
|
|
- Use transações para operações críticas
|
|
|
|
### Manutenção
|
|
- Execute VACUUM semanalmente
|
|
- Monitore tamanho do banco
|
|
- Limpe dados antigos periodicamente
|
|
- Documente queries customizadas
|
|
|
|
---
|
|
|
|
**Nota**: Todas as queries foram testadas no PostgreSQL 16. Ajuste conforme necessário para outras versões.
|