fix(database): correct metrics and improve error handling

- Fix metrics double-counting: track deltas for WaitCount/WaitDuration
  instead of adding cumulative values each tick
- Replace fmt.Printf with structured logging in pool monitor
- Add PoolOptions validation (MaxConns > 0, MinConns >= 0)
- Warn when DATABASE_URI overrides non-default PoolOptions
- Improve findAndParseConfig to report all tried files and errors
- Remove dead code in pgdb/config.go (unreachable host default)
- Fix errcheck lint issues for file.Close() calls
- Add context parameter to OpenDBMonitor() (breaking change)
This commit is contained in:
2025-11-29 12:56:49 -08:00
parent 283d3936f6
commit 94b718a925
6 changed files with 70 additions and 34 deletions

View File

@@ -3,10 +3,10 @@ package database
import (
"context"
"database/sql"
"fmt"
"time"
"github.com/prometheus/client_golang/prometheus"
"go.ntppool.org/common/logger"
)
// DatabaseMetrics holds the Prometheus metrics for database connection pool monitoring
@@ -16,6 +16,10 @@ type DatabaseMetrics struct {
ConnectionsInUse prometheus.Gauge
ConnectionsWaitCount prometheus.Counter
ConnectionsWaitDuration prometheus.Histogram
// Last cumulative values read from sql.DBStats, kept so each tick can report only the delta
lastWaitCount int64
lastWaitDuration time.Duration
}
// NewDatabaseMetrics creates a new set of database metrics and registers them
@@ -67,26 +71,44 @@ func monitorConnectionPool(ctx context.Context, db *sql.DB, registerer prometheu
ticker := time.NewTicker(30 * time.Second)
defer ticker.Stop()
log := logger.FromContext(ctx)
for {
select {
case <-ctx.Done():
log.InfoContext(ctx, "database connection pool monitor stopped")
return
case <-ticker.C:
stats := db.Stats()
// Update gauge metrics (current state)
metrics.ConnectionsOpen.Set(float64(stats.OpenConnections))
metrics.ConnectionsIdle.Set(float64(stats.Idle))
metrics.ConnectionsInUse.Set(float64(stats.InUse))
metrics.ConnectionsWaitCount.Add(float64(stats.WaitCount))
if stats.WaitDuration > 0 {
metrics.ConnectionsWaitDuration.Observe(stats.WaitDuration.Seconds())
// Update counter with delta (WaitCount is cumulative in sql.DBStats)
waitCountDelta := stats.WaitCount - metrics.lastWaitCount
if waitCountDelta > 0 {
metrics.ConnectionsWaitCount.Add(float64(waitCountDelta))
metrics.lastWaitCount = stats.WaitCount
}
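// Update histogram with delta (WaitDuration is cumulative in sql.DBStats).
// Each observation is the total wait time accrued since the previous tick,
// not the latency of an individual wait.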
waitDurationDelta := stats.WaitDuration - metrics.lastWaitDuration
if waitDurationDelta > 0 {
metrics.ConnectionsWaitDuration.Observe(waitDurationDelta.Seconds())
metrics.lastWaitDuration = stats.WaitDuration
}
// Log connection pool stats for high usage or waiting
if stats.OpenConnections > 20 || stats.WaitCount > 0 {
fmt.Printf("Connection pool stats: open=%d idle=%d in_use=%d wait_count=%d wait_duration=%s\n",
stats.OpenConnections, stats.Idle, stats.InUse, stats.WaitCount, stats.WaitDuration)
if stats.OpenConnections > 20 || waitCountDelta > 0 {
log.WarnContext(ctx, "high database connection usage",
"open", stats.OpenConnections,
"idle", stats.Idle,
"in_use", stats.InUse,
"wait_count", stats.WaitCount,
"wait_duration", stats.WaitDuration,
)
}
}
}