feat(metrics): add OTLP metrics support with centralized config

- Create new metrics/ package for OpenTelemetry-native metrics with OTLP export
- Refactor OTLP configuration to internal/tracerconfig/ to eliminate code duplication
- Add consistent retry configuration across all HTTP OTLP exporters
- Add configuration validation and improved error messages
- Include test coverage for all new functionality
- Make OpenTelemetry metrics dependencies explicit in go.mod

Designed for new applications requiring structured metrics export to observability backends via the OTLP protocol.
2025-08-02 09:29:27 -07:00
parent 796b2a8412
commit c6230be91e
7 changed files with 1274 additions and 224 deletions
--- a/metrics/metrics.go
+++ b/metrics/metrics.go
@@ -0,0 +1,122 @@
+// Package metrics provides OpenTelemetry-native metrics with OTLP export support.
+//
+// This package implements a metrics system using the OpenTelemetry metrics data model
+// with OTLP export capabilities. It's designed for new applications that want to use
+// structured metrics export to observability backends.
+//
+// Key features:
+//   - OpenTelemetry native metric types (Counter, Histogram, Gauge, etc.)
+//   - OTLP export for sending metrics to observability backends
+//   - Resource detection and correlation with traces/logs
+//   - Graceful handling when OTLP configuration is not available
+//
+// Example usage:
+//
+//	// Initialize metrics along with tracing
+//	shutdown, err := tracing.InitTracer(ctx, cfg)
+//	if err != nil {
+//		log.Fatal(err)
+//	}
+//	defer shutdown(ctx)
+//
+//	// Get a meter and create instruments
+//	meter := metrics.GetMeter("my-service")
+//	counter, _ := meter.Int64Counter("requests_total")
+//	counter.Add(ctx, 1, metric.WithAttributes(attribute.String("method", "GET")))
+package metrics
+
+import (
+	"context"
+	"log/slog"
+	"sync"
+	"time"
+
+	"go.ntppool.org/common/internal/tracerconfig"
+	"go.opentelemetry.io/otel"
+	"go.opentelemetry.io/otel/metric"
+	sdkmetric "go.opentelemetry.io/otel/sdk/metric"
+)
+
+var (
+	// meterProvider is the provider GetMeter and Shutdown operate on. It is nil
+	// until initializeMetrics assigns either an SDK provider or the global one.
+	meterProvider metric.MeterProvider
+	// setupOnce makes Setup run initializeMetrics at most once.
+	setupOnce     sync.Once
+	// setupErr holds the result of that single initialization so that every
+	// Setup call can return it.
+	setupErr      error
+)
+
+// Setup configures the package's metrics provider, exporting over OTLP when
+// the tracing package has stored a usable configuration.
+//
+// Initialization happens on the first call only. Every later call returns the
+// result recorded by that first call without doing any further work, so it is
+// safe to call Setup repeatedly.
+//
+// When no tracing configuration is stored, or the exporter cannot be created,
+// the problem is logged and the global OpenTelemetry provider is used instead.
+// A nil error therefore does not guarantee that metrics are being exported.
+func Setup(ctx context.Context) error {
+	initialize := func() { setupErr = initializeMetrics(ctx) }
+	setupOnce.Do(initialize)
+	return setupErr
+}
+
+// GetMeter returns the meter registered under name, from which metric
+// instruments (counters, histograms, ...) are created.
+//
+// The meter comes from the provider chosen by Setup. If Setup has not assigned
+// a provider yet, the global OpenTelemetry provider is used instead.
+func GetMeter(name string, opts ...metric.MeterOption) metric.Meter {
+	provider := meterProvider
+	if provider == nil {
+		// Not set up yet: defer to whatever provider is registered globally.
+		provider = otel.GetMeterProvider()
+	}
+	return provider.Meter(name, opts...)
+}
+
+// initializeMetrics builds the metrics provider used by this package.
+//
+// It asks tracerconfig for the stored configuration, context and metric
+// exporter factory. If any of them is missing, or the factory fails, the
+// reason is logged and the package falls back to the global OpenTelemetry
+// provider. Otherwise it creates an SDK provider whose periodic reader pushes
+// to the exporter every 15 seconds, installs it as the global provider and
+// stores it in meterProvider.
+//
+// The returned error is always nil; failures are reported through the log only.
+func initializeMetrics(ctx context.Context) error {
+	logger := slog.Default()
+
+	// All three pieces must be present before an exporter can be built.
+	cfg, storedCtx, newExporter := tracerconfig.GetMetricExporter()
+	haveConfig := cfg != nil && storedCtx != nil && newExporter != nil
+	if !haveConfig {
+		logger.Warn("metrics setup: tracing configuration not available, using no-op provider")
+		// Nothing to export to: hand out meters from the global provider.
+		meterProvider = otel.GetMeterProvider()
+		return nil
+	}
+
+	exp, err := newExporter(ctx, cfg)
+	if err != nil {
+		logger.Error("metrics setup: failed to create OTLP exporter", "error", err)
+		// Keep the application running without export rather than failing.
+		meterProvider = otel.GetMeterProvider()
+		return nil
+	}
+
+	reader := sdkmetric.NewPeriodicReader(exp, sdkmetric.WithInterval(15*time.Second))
+	sdkProvider := sdkmetric.NewMeterProvider(sdkmetric.WithReader(reader))
+
+	// Register globally so code using otel.GetMeterProvider sees the same provider.
+	otel.SetMeterProvider(sdkProvider)
+	meterProvider = sdkProvider
+
+	logger.Info("metrics setup: OTLP metrics provider initialized")
+	return nil
+}
+
+// Shutdown stops the metrics provider created by Setup.
+//
+// Call it while the application is shutting down. It delegates to the SDK
+// provider's Shutdown and returns its error. When the package is not holding
+// an SDK provider (Setup was never called, or it fell back to the global
+// provider), there is nothing to stop and nil is returned.
+func Shutdown(ctx context.Context) error {
+	sdkProvider, isSDK := meterProvider.(*sdkmetric.MeterProvider)
+	if !isSDK {
+		return nil
+	}
+	return sdkProvider.Shutdown(ctx)
+}
--- a/metrics/metrics_test.go
+++ b/metrics/metrics_test.go
@@ -0,0 +1,296 @@
+package metrics
+
+import (
+	"context"
+	"os"
+	"testing"
+	"time"
+
+	"go.ntppool.org/common/internal/tracerconfig"
+	"go.opentelemetry.io/otel/attribute"
+	"go.opentelemetry.io/otel/metric"
+	sdkmetric "go.opentelemetry.io/otel/sdk/metric"
+	"go.opentelemetry.io/otel/sdk/metric/metricdata"
+)
+
+// TestSetup_NoConfiguration checks that Setup reports no error and GetMeter
+// still returns a meter after the stored tracer configuration was cleared.
+func TestSetup_NoConfiguration(t *testing.T) {
+	// Start from a state with no stored configuration.
+	tracerconfig.Clear()
+
+	// A missing configuration must not surface as an error.
+	if err := Setup(context.Background()); err != nil {
+		t.Errorf("Setup() returned unexpected error: %v", err)
+	}
+
+	// The meter may be backed by a no-op provider, but it must exist.
+	if got := GetMeter("test-meter"); got == nil {
+		t.Error("GetMeter() returned nil")
+	}
+}
+
+// TestGetMeter checks that GetMeter returns a usable meter when no tracer
+// configuration is stored: a counter can be created from it and incremented.
+func TestGetMeter(t *testing.T) {
+	// Clear any existing configuration
+	tracerconfig.Clear()
+
+	ctx := context.Background()
+	// The Setup result is not what this test is about; GetMeter must work
+	// regardless of it, so the error is deliberately ignored.
+	_ = Setup(ctx)
+
+	meter := GetMeter("test-service")
+	if meter == nil {
+		t.Fatal("GetMeter() returned nil")
+	}
+
+	// Test creating a counter instrument. Stop here on failure: the returned
+	// counter is not meaningful when err is non-nil and must not be used.
+	counter, err := meter.Int64Counter("test_counter")
+	if err != nil {
+		t.Fatalf("Failed to create counter: %v", err)
+	}
+
+	// Test using the counter (should not error even with no-op provider)
+	counter.Add(ctx, 1, metric.WithAttributes(attribute.String("test", "value")))
+}
+
+// TestSetup_MultipleCallsSafe calls Setup three times in a row and expects
+// every call to succeed and GetMeter to keep returning a meter afterwards.
+func TestSetup_MultipleCallsSafe(t *testing.T) {
+	// Start from a state with no stored configuration.
+	tracerconfig.Clear()
+
+	ctx := context.Background()
+
+	// One failure message per call, in call order.
+	failureFormats := []string{
+		"First Setup() call returned error: %v",
+		"Second Setup() call returned error: %v",
+		"Third Setup() call returned error: %v",
+	}
+	for _, format := range failureFormats {
+		if err := Setup(ctx); err != nil {
+			t.Errorf(format, err)
+		}
+	}
+
+	// Repeated setup must not break meter lookup.
+	if got := GetMeter("test-meter"); got == nil {
+		t.Error("GetMeter() returned nil after multiple Setup() calls")
+	}
+}
+
+// TestSetup_WithConfiguration stores a configuration with a mock exporter
+// factory, runs Setup, creates and uses a counter, a histogram and an up/down
+// counter, and finally shuts the metrics provider down.
+//
+// NOTE(review): Setup is guarded by a sync.Once, so when another test in this
+// process has already called Setup, the configuration stored here is not
+// picked up and this test runs against the previously chosen provider.
+// Confirm whether the package state should be reset between tests.
+func TestSetup_WithConfiguration(t *testing.T) {
+	// Clear any existing configuration
+	tracerconfig.Clear()
+
+	ctx := context.Background()
+	config := &tracerconfig.Config{
+		ServiceName: "test-metrics-service",
+		Environment: "test",
+		Endpoint:    "localhost:4317", // Never dialed: the mock factory below ignores it
+	}
+
+	// Create a mock exporter factory that returns a working exporter
+	mockFactory := func(ctx context.Context, cfg *tracerconfig.Config) (sdkmetric.Exporter, error) {
+		// Create a simple in-memory exporter for testing
+		return &mockMetricExporter{}, nil
+	}
+
+	// Store configuration with mock factory
+	tracerconfig.Store(ctx, config, nil, mockFactory, nil)
+
+	// Setup metrics
+	err := Setup(ctx)
+	if err != nil {
+		t.Errorf("Setup() returned error: %v", err)
+	}
+
+	// Should be able to get a meter
+	meter := GetMeter("test-service")
+	if meter == nil {
+		t.Fatal("GetMeter() returned nil")
+	}
+
+	// Create the instruments. Each failure is fatal: an instrument returned
+	// together with a non-nil error is not meaningful and is used below.
+	counter, err := meter.Int64Counter("test_counter")
+	if err != nil {
+		t.Fatalf("Failed to create counter: %v", err)
+	}
+
+	histogram, err := meter.Float64Histogram("test_histogram")
+	if err != nil {
+		t.Fatalf("Failed to create histogram: %v", err)
+	}
+
+	// An up/down counter stands in for a gauge here.
+	gauge, err := meter.Int64UpDownCounter("test_gauge")
+	if err != nil {
+		t.Fatalf("Failed to create gauge: %v", err)
+	}
+
+	// Use the instruments
+	counter.Add(ctx, 1, metric.WithAttributes(attribute.String("test", "value")))
+	histogram.Record(ctx, 1.5, metric.WithAttributes(attribute.String("test", "value")))
+	gauge.Add(ctx, 10, metric.WithAttributes(attribute.String("test", "value")))
+
+	// Test shutdown
+	err = Shutdown(ctx)
+	if err != nil {
+		t.Errorf("Shutdown() returned error: %v", err)
+	}
+}
+
+// TestSetup_WithRealOTLPConfig runs Setup with the real OTLP metric exporter
+// factory pointed at localhost:4318. No collector is required: Setup and
+// Shutdown errors are only logged. The test is skipped in short mode because
+// it may open network connections.
+//
+// NOTE(review): Setup is guarded by a sync.Once, so when another test in this
+// process has already called Setup, the real factory stored here is never
+// invoked. Confirm whether the package state should be reset between tests.
+func TestSetup_WithRealOTLPConfig(t *testing.T) {
+	// Skip this test in short mode since it may try to make network connections
+	if testing.Short() {
+		t.Skip("Skipping integration test in short mode")
+	}
+
+	// Clear any existing configuration
+	tracerconfig.Clear()
+
+	// setEnv overrides one environment variable for the duration of the test
+	// and restores the previous state afterwards. LookupEnv distinguishes
+	// "unset" from "set to the empty string", so both are restored faithfully.
+	setEnv := func(key, value string) {
+		previous, wasSet := os.LookupEnv(key)
+		t.Cleanup(func() {
+			if wasSet {
+				os.Setenv(key, previous)
+				return
+			}
+			os.Unsetenv(key)
+		})
+		os.Setenv(key, value)
+	}
+
+	// Set environment variables for OTLP configuration
+	setEnv("OTEL_EXPORTER_OTLP_ENDPOINT", "http://localhost:4318") // HTTP endpoint
+	setEnv("OTEL_EXPORTER_OTLP_PROTOCOL", "http/protobuf")
+
+	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
+	defer cancel()
+
+	config := &tracerconfig.Config{
+		ServiceName: "test-metrics-e2e",
+		Environment: "test",
+		Endpoint:    "localhost:4318",
+	}
+
+	// Store configuration with real factory
+	tracerconfig.Store(ctx, config, nil, tracerconfig.CreateOTLPMetricExporter, nil)
+
+	// Setup metrics - this may fail if no OTLP collector is running, which is okay
+	err := Setup(ctx)
+	if err != nil {
+		t.Logf("Setup() returned error (expected if no OTLP collector): %v", err)
+	}
+
+	// Should still be able to get a meter
+	meter := GetMeter("test-service-e2e")
+	if meter == nil {
+		t.Fatal("GetMeter() returned nil")
+	}
+
+	// Create the counter. Failure is fatal: a counter returned together with
+	// a non-nil error is not meaningful and is used in the loop below.
+	counter, err := meter.Int64Counter("e2e_test_counter")
+	if err != nil {
+		t.Fatalf("Failed to create counter: %v", err)
+	}
+
+	// Add some metrics
+	for i := 0; i < 5; i++ {
+		counter.Add(ctx, 1, metric.WithAttributes(
+			// i stays below 10, so '0'+i is always a single decimal digit.
+			attribute.String("iteration", string(rune('0'+i))),
+			attribute.String("test_type", "e2e"),
+		))
+	}
+
+	// Give some time for export (if collector is running)
+	time.Sleep(100 * time.Millisecond)
+
+	// Test shutdown
+	err = Shutdown(ctx)
+	if err != nil {
+		t.Logf("Shutdown() returned error (may be expected): %v", err)
+	}
+}
+
+// TestConcurrentMetricUsage increments one counter from several goroutines at
+// the same time and then shuts the metrics provider down. It is most useful
+// when run with the race detector enabled.
+func TestConcurrentMetricUsage(t *testing.T) {
+	// Clear any existing configuration
+	tracerconfig.Clear()
+
+	ctx := context.Background()
+	config := &tracerconfig.Config{
+		ServiceName: "concurrent-test",
+	}
+
+	// Use mock factory
+	mockFactory := func(ctx context.Context, cfg *tracerconfig.Config) (sdkmetric.Exporter, error) {
+		return &mockMetricExporter{}, nil
+	}
+
+	tracerconfig.Store(ctx, config, nil, mockFactory, nil)
+	// Do not drop the Setup result silently: a failed setup would make the
+	// rest of the test meaningless.
+	if err := Setup(ctx); err != nil {
+		t.Fatalf("Setup() returned error: %v", err)
+	}
+
+	meter := GetMeter("concurrent-test")
+	counter, err := meter.Int64Counter("concurrent_counter")
+	if err != nil {
+		t.Fatalf("Failed to create counter: %v", err)
+	}
+
+	// Test concurrent metric usage
+	const numGoroutines = 10
+	const metricsPerGoroutine = 100
+
+	// Buffered to numGoroutines so no worker ever blocks on its completion signal.
+	done := make(chan struct{}, numGoroutines)
+
+	for i := 0; i < numGoroutines; i++ {
+		go func(goroutineID int) {
+			for j := 0; j < metricsPerGoroutine; j++ {
+				counter.Add(ctx, 1, metric.WithAttributes(
+					attribute.Int("goroutine", goroutineID),
+					attribute.Int("iteration", j),
+				))
+			}
+			done <- struct{}{}
+		}(i)
+	}
+
+	// Wait for all goroutines to complete
+	for i := 0; i < numGoroutines; i++ {
+		<-done
+	}
+
+	// Shutdown
+	err = Shutdown(ctx)
+	if err != nil {
+		t.Errorf("Shutdown() returned error: %v", err)
+	}
+}
+
+// mockMetricExporter is a test double for sdkmetric.Exporter that accepts
+// every call and discards whatever it is given.
+type mockMetricExporter struct{}
+
+// Compile-time check that the mock implements the exporter interface.
+var _ sdkmetric.Exporter = (*mockMetricExporter)(nil)
+
+// Export drops the metrics and reports success.
+func (*mockMetricExporter) Export(context.Context, *metricdata.ResourceMetrics) error {
+	return nil
+}
+
+// ForceFlush has nothing buffered to flush and reports success.
+func (*mockMetricExporter) ForceFlush(context.Context) error {
+	return nil
+}
+
+// Shutdown holds no resources to release and reports success.
+func (*mockMetricExporter) Shutdown(context.Context) error {
+	return nil
+}
+
+// Temporality reports cumulative temporality for every instrument kind.
+func (*mockMetricExporter) Temporality(sdkmetric.InstrumentKind) metricdata.Temporality {
+	return metricdata.CumulativeTemporality
+}
+
+// Aggregation returns the SDK's default aggregation for the instrument kind.
+func (*mockMetricExporter) Aggregation(kind sdkmetric.InstrumentKind) sdkmetric.Aggregation {
+	return sdkmetric.DefaultAggregationSelector(kind)
+}