fix(database): correct metrics and improve error handling
- Fix metrics double-counting: track deltas for WaitCount/WaitDuration instead of adding cumulative values each tick
- Replace fmt.Printf with structured logging in pool monitor
- Add PoolOptions validation (MaxConns > 0, MinConns >= 0)
- Warn when DATABASE_URI overrides non-default PoolOptions
- Improve findAndParseConfig to report all tried files and errors
- Remove dead code in pgdb/config.go (unreachable host default)
- Fix errcheck lint issues for file.Close() calls
- Add context parameter to OpenDBMonitor() (breaking change)
This commit is contained in:
@@ -3,10 +3,10 @@ package database
|
||||
import (
|
||||
"context"
|
||||
"database/sql"
|
||||
"fmt"
|
||||
"time"
|
||||
|
||||
"github.com/prometheus/client_golang/prometheus"
|
||||
"go.ntppool.org/common/logger"
|
||||
)
|
||||
|
||||
// DatabaseMetrics holds the Prometheus metrics for database connection pool monitoring
|
||||
@@ -16,6 +16,10 @@ type DatabaseMetrics struct {
|
||||
ConnectionsInUse prometheus.Gauge
|
||||
ConnectionsWaitCount prometheus.Counter
|
||||
ConnectionsWaitDuration prometheus.Histogram
|
||||
|
||||
// Track last values for delta calculation (cumulative stats from sql.DBStats)
|
||||
lastWaitCount int64
|
||||
lastWaitDuration time.Duration
|
||||
}
|
||||
|
||||
// NewDatabaseMetrics creates a new set of database metrics and registers them
|
||||
@@ -67,26 +71,44 @@ func monitorConnectionPool(ctx context.Context, db *sql.DB, registerer prometheu
|
||||
ticker := time.NewTicker(30 * time.Second)
|
||||
defer ticker.Stop()
|
||||
|
||||
log := logger.FromContext(ctx)
|
||||
|
||||
for {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
log.InfoContext(ctx, "database connection pool monitor stopped")
|
||||
return
|
||||
case <-ticker.C:
|
||||
stats := db.Stats()
|
||||
|
||||
// Update gauge metrics (current state)
|
||||
metrics.ConnectionsOpen.Set(float64(stats.OpenConnections))
|
||||
metrics.ConnectionsIdle.Set(float64(stats.Idle))
|
||||
metrics.ConnectionsInUse.Set(float64(stats.InUse))
|
||||
metrics.ConnectionsWaitCount.Add(float64(stats.WaitCount))
|
||||
|
||||
if stats.WaitDuration > 0 {
|
||||
metrics.ConnectionsWaitDuration.Observe(stats.WaitDuration.Seconds())
|
||||
// Update counter with delta (WaitCount is cumulative in sql.DBStats)
|
||||
waitCountDelta := stats.WaitCount - metrics.lastWaitCount
|
||||
if waitCountDelta > 0 {
|
||||
metrics.ConnectionsWaitCount.Add(float64(waitCountDelta))
|
||||
metrics.lastWaitCount = stats.WaitCount
|
||||
}
|
||||
|
||||
// Update histogram with delta (WaitDuration is cumulative in sql.DBStats)
|
||||
waitDurationDelta := stats.WaitDuration - metrics.lastWaitDuration
|
||||
if waitDurationDelta > 0 {
|
||||
metrics.ConnectionsWaitDuration.Observe(waitDurationDelta.Seconds())
|
||||
metrics.lastWaitDuration = stats.WaitDuration
|
||||
}
|
||||
|
||||
// Log connection pool stats for high usage or waiting
|
||||
if stats.OpenConnections > 20 || stats.WaitCount > 0 {
|
||||
fmt.Printf("Connection pool stats: open=%d idle=%d in_use=%d wait_count=%d wait_duration=%s\n",
|
||||
stats.OpenConnections, stats.Idle, stats.InUse, stats.WaitCount, stats.WaitDuration)
|
||||
if stats.OpenConnections > 20 || waitCountDelta > 0 {
|
||||
log.WarnContext(ctx, "high database connection usage",
|
||||
"open", stats.OpenConnections,
|
||||
"idle", stats.Idle,
|
||||
"in_use", stats.InUse,
|
||||
"wait_count", stats.WaitCount,
|
||||
"wait_duration", stats.WaitDuration,
|
||||
)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user