- Fix metrics double-counting: track deltas for WaitCount/WaitDuration instead of adding cumulative values each tick
- Replace fmt.Printf with structured logging in pool monitor
- Add PoolOptions validation (MaxConns > 0, MinConns >= 0)
- Warn when DATABASE_URI overrides non-default PoolOptions
- Improve findAndParseConfig to report all tried files and errors
- Remove dead code in pgdb/config.go (unreachable host default)
- Fix errcheck lint issues for file.Close() calls
- Add context parameter to OpenDBMonitor() (breaking change)
116 lines
3.6 KiB
Go
116 lines
3.6 KiB
Go
package database
|
|
|
|
import (
|
|
"context"
|
|
"database/sql"
|
|
"time"
|
|
|
|
"github.com/prometheus/client_golang/prometheus"
|
|
"go.ntppool.org/common/logger"
|
|
)
|
|
|
|
// DatabaseMetrics holds the Prometheus metrics for database connection pool monitoring
|
|
type DatabaseMetrics struct {
|
|
ConnectionsOpen prometheus.Gauge
|
|
ConnectionsIdle prometheus.Gauge
|
|
ConnectionsInUse prometheus.Gauge
|
|
ConnectionsWaitCount prometheus.Counter
|
|
ConnectionsWaitDuration prometheus.Histogram
|
|
|
|
// Track last values for delta calculation (cumulative stats from sql.DBStats)
|
|
lastWaitCount int64
|
|
lastWaitDuration time.Duration
|
|
}
|
|
|
|
// NewDatabaseMetrics creates a new set of database metrics and registers them
|
|
func NewDatabaseMetrics(registerer prometheus.Registerer) *DatabaseMetrics {
|
|
metrics := &DatabaseMetrics{
|
|
ConnectionsOpen: prometheus.NewGauge(prometheus.GaugeOpts{
|
|
Name: "database_connections_open",
|
|
Help: "Number of open database connections",
|
|
}),
|
|
ConnectionsIdle: prometheus.NewGauge(prometheus.GaugeOpts{
|
|
Name: "database_connections_idle",
|
|
Help: "Number of idle database connections",
|
|
}),
|
|
ConnectionsInUse: prometheus.NewGauge(prometheus.GaugeOpts{
|
|
Name: "database_connections_in_use",
|
|
Help: "Number of database connections in use",
|
|
}),
|
|
ConnectionsWaitCount: prometheus.NewCounter(prometheus.CounterOpts{
|
|
Name: "database_connections_wait_count_total",
|
|
Help: "Total number of times a connection had to wait",
|
|
}),
|
|
ConnectionsWaitDuration: prometheus.NewHistogram(prometheus.HistogramOpts{
|
|
Name: "database_connections_wait_duration_seconds",
|
|
Help: "Time spent waiting for a database connection",
|
|
Buckets: prometheus.DefBuckets,
|
|
}),
|
|
}
|
|
|
|
if registerer != nil {
|
|
registerer.MustRegister(
|
|
metrics.ConnectionsOpen,
|
|
metrics.ConnectionsIdle,
|
|
metrics.ConnectionsInUse,
|
|
metrics.ConnectionsWaitCount,
|
|
metrics.ConnectionsWaitDuration,
|
|
)
|
|
}
|
|
|
|
return metrics
|
|
}
|
|
|
|
// monitorConnectionPool runs a background goroutine to collect connection pool metrics
|
|
func monitorConnectionPool(ctx context.Context, db *sql.DB, registerer prometheus.Registerer) {
|
|
if registerer == nil {
|
|
return // No metrics collection if no registerer provided
|
|
}
|
|
|
|
metrics := NewDatabaseMetrics(registerer)
|
|
ticker := time.NewTicker(30 * time.Second)
|
|
defer ticker.Stop()
|
|
|
|
log := logger.FromContext(ctx)
|
|
|
|
for {
|
|
select {
|
|
case <-ctx.Done():
|
|
log.InfoContext(ctx, "database connection pool monitor stopped")
|
|
return
|
|
case <-ticker.C:
|
|
stats := db.Stats()
|
|
|
|
// Update gauge metrics (current state)
|
|
metrics.ConnectionsOpen.Set(float64(stats.OpenConnections))
|
|
metrics.ConnectionsIdle.Set(float64(stats.Idle))
|
|
metrics.ConnectionsInUse.Set(float64(stats.InUse))
|
|
|
|
// Update counter with delta (WaitCount is cumulative in sql.DBStats)
|
|
waitCountDelta := stats.WaitCount - metrics.lastWaitCount
|
|
if waitCountDelta > 0 {
|
|
metrics.ConnectionsWaitCount.Add(float64(waitCountDelta))
|
|
metrics.lastWaitCount = stats.WaitCount
|
|
}
|
|
|
|
// Update histogram with delta (WaitDuration is cumulative in sql.DBStats)
|
|
waitDurationDelta := stats.WaitDuration - metrics.lastWaitDuration
|
|
if waitDurationDelta > 0 {
|
|
metrics.ConnectionsWaitDuration.Observe(waitDurationDelta.Seconds())
|
|
metrics.lastWaitDuration = stats.WaitDuration
|
|
}
|
|
|
|
// Log connection pool stats for high usage or waiting
|
|
if stats.OpenConnections > 20 || waitCountDelta > 0 {
|
|
log.WarnContext(ctx, "high database connection usage",
|
|
"open", stats.OpenConnections,
|
|
"idle", stats.Idle,
|
|
"in_use", stats.InUse,
|
|
"wait_count", stats.WaitCount,
|
|
"wait_duration", stats.WaitDuration,
|
|
)
|
|
}
|
|
}
|
|
}
|
|
}
|