common/tracing/tracing.go
Ask Bjørn Hansen 6a3bc7bab3 feat(logger): add buffering exporter with TLS support for OTLP logs
Add buffering exporter to queue OTLP logs until tracing is configured.
Support TLS configuration for OpenTelemetry log export with client
certificate authentication. Improve logfmt formatting and tracing setup.
2025-07-27 16:36:18 -07:00

426 lines
15 KiB
Go

// Package tracing provides OpenTelemetry distributed tracing setup with OTLP export support.
//
// This package handles the complete OpenTelemetry SDK initialization including:
// - Trace provider configuration with batching and resource detection
// - Log provider setup for structured log export via OTLP
// - Automatic resource discovery (service name, version, host, container, process info)
// - Support for both gRPC and HTTP OTLP exporters with TLS configuration
// - Propagation context setup for distributed tracing across services
// - Graceful shutdown handling for all telemetry components
//
// The package supports various deployment scenarios:
// - Development: Local OTLP collectors or observability backends
// - Production: Secure OTLP export with mutual TLS authentication
// - Container environments: Automatic container and Kubernetes resource detection
//
// Configuration is primarily handled via standard OpenTelemetry environment variables:
// - OTEL_SERVICE_NAME: Service identification
// - OTEL_EXPORTER_OTLP_PROTOCOL: Protocol selection (grpc, http/protobuf)
// - OTEL_TRACES_EXPORTER: Exporter type (otlp, autoexport)
// - OTEL_RESOURCE_ATTRIBUTES: Additional resource attributes
//
// Example usage:
//
// cfg := &tracing.TracerConfig{
// ServiceName: "my-service",
// Environment: "production",
// Endpoint: "https://otlp.example.com:4317",
// }
// shutdown, err := tracing.InitTracer(ctx, cfg)
// if err != nil {
// log.Fatal(err)
// }
// defer shutdown(ctx)
package tracing
// todo, review:
// https://github.com/ttys3/tracing-go/blob/main/tracing.go#L136
import (
"context"
"crypto/tls"
"crypto/x509"
"errors"
"log/slog"
"os"
"slices"
"time"
"go.ntppool.org/common/internal/tracerconfig"
"go.ntppool.org/common/version"
"google.golang.org/grpc/credentials"
"go.opentelemetry.io/contrib/exporters/autoexport"
"go.opentelemetry.io/otel"
"go.opentelemetry.io/otel/attribute"
"go.opentelemetry.io/otel/exporters/otlp/otlplog/otlploggrpc"
"go.opentelemetry.io/otel/exporters/otlp/otlplog/otlploghttp"
"go.opentelemetry.io/otel/exporters/otlp/otlptrace"
"go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc"
"go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp"
"go.opentelemetry.io/otel/log/global"
"go.opentelemetry.io/otel/propagation"
sdklog "go.opentelemetry.io/otel/sdk/log"
"go.opentelemetry.io/otel/sdk/resource"
sdktrace "go.opentelemetry.io/otel/sdk/trace"
semconv "go.opentelemetry.io/otel/semconv/v1.26.0"
"go.opentelemetry.io/otel/trace"
)
const (
// svcNameKey is the environment variable name that Service Name information will be read from.
svcNameKey = "OTEL_SERVICE_NAME"
otelExporterOTLPProtoEnvKey = "OTEL_EXPORTER_OTLP_PROTOCOL"
otelExporterOTLPTracesProtoEnvKey = "OTEL_EXPORTER_OTLP_TRACES_PROTOCOL"
otelExporterOTLPLogsProtoEnvKey = "OTEL_EXPORTER_OTLP_LOGS_PROTOCOL"
)
var errInvalidOTLPProtocol = errors.New("invalid OTLP protocol - should be one of ['grpc', 'http/protobuf']")
// createOTLPLogExporter creates an OTLP log exporter using the provided configuration.
// This function is used as the ExporterFactory for the tracerconfig bridge.
func createOTLPLogExporter(ctx context.Context, cfg *tracerconfig.Config) (sdklog.Exporter, error) {
// Convert bridge config to local TracerConfig
tracerCfg := &TracerConfig{
ServiceName: cfg.ServiceName,
Environment: cfg.Environment,
Endpoint: cfg.Endpoint,
EndpointURL: cfg.EndpointURL,
CertificateProvider: cfg.CertificateProvider,
RootCAs: cfg.RootCAs,
}
return newOTLPLogExporter(ctx, tracerCfg)
}
// https://github.com/open-telemetry/opentelemetry-go/blob/main/exporters/otlp/otlptrace/otlptracehttp/example_test.go
// TpShutdownFunc represents a function that gracefully shuts down telemetry providers.
// It should be called during application shutdown to ensure all telemetry data is flushed
// and exporters are properly closed. The context can be used to set shutdown timeouts.
type TpShutdownFunc func(ctx context.Context) error
// Tracer returns the configured OpenTelemetry tracer for the NTP Pool project.
// This tracer should be used for creating spans and distributed tracing throughout
// the application. It uses the global tracer provider set up by InitTracer/SetupSDK.
func Tracer() trace.Tracer {
traceProvider := otel.GetTracerProvider()
return traceProvider.Tracer("ntppool-tracer")
}
// Start creates a new span with the given name and options using the configured tracer.
// This is a convenience function that wraps the standard OpenTelemetry span creation.
// It returns a new context containing the span and the span itself for further configuration.
//
// The returned context should be used for downstream operations to maintain trace correlation.
func Start(ctx context.Context, spanName string, opts ...trace.SpanStartOption) (context.Context, trace.Span) {
return Tracer().Start(ctx, spanName, opts...)
}
// GetClientCertificate is an alias for the type defined in tracerconfig.
// This maintains backward compatibility for existing code.
type GetClientCertificate = tracerconfig.GetClientCertificate
// TracerConfig provides configuration options for OpenTelemetry tracing setup.
// It supplements standard OpenTelemetry environment variables with additional
// NTP Pool-specific configuration including TLS settings for secure OTLP export.
type TracerConfig struct {
ServiceName string // Service name for resource identification (overrides OTEL_SERVICE_NAME)
Environment string // Deployment environment (development, staging, production)
Endpoint string // OTLP endpoint hostname/port (e.g., "otlp.example.com:4317")
EndpointURL string // Complete OTLP endpoint URL (e.g., "https://otlp.example.com:4317/v1/traces")
CertificateProvider GetClientCertificate // Client certificate provider for mutual TLS
RootCAs *x509.CertPool // CA certificate pool for server verification
}
// InitTracer initializes the OpenTelemetry SDK with the provided configuration.
// This is the main entry point for setting up distributed tracing in applications.
//
// The function configures trace and log providers, sets up OTLP exporters,
// and returns a shutdown function that must be called during application termination.
//
// Returns a shutdown function and an error. The shutdown function should be called
// with a context that has an appropriate timeout for graceful shutdown.
func InitTracer(ctx context.Context, cfg *TracerConfig) (TpShutdownFunc, error) {
// todo: setup environment from cfg
return SetupSDK(ctx, cfg)
}
// SetupSDK performs the complete OpenTelemetry SDK initialization including resource
// discovery, exporter configuration, provider setup, and shutdown function creation.
//
// The function automatically discovers system resources (service info, host, container,
// process details) and configures both trace and log exporters. It supports multiple
// OTLP protocols (gRPC, HTTP) and handles TLS configuration for secure deployments.
//
// The returned shutdown function coordinates graceful shutdown of all telemetry
// components in the reverse order of their initialization.
func SetupSDK(ctx context.Context, cfg *TracerConfig) (shutdown TpShutdownFunc, err error) {
if cfg == nil {
cfg = &TracerConfig{}
}
// Store configuration for use by logger package via bridge
bridgeConfig := &tracerconfig.Config{
ServiceName: cfg.ServiceName,
Environment: cfg.Environment,
Endpoint: cfg.Endpoint,
EndpointURL: cfg.EndpointURL,
CertificateProvider: cfg.CertificateProvider,
RootCAs: cfg.RootCAs,
}
tracerconfig.Store(ctx, bridgeConfig, createOTLPLogExporter)
log := slog.Default()
if serviceName := os.Getenv(svcNameKey); len(serviceName) == 0 {
if len(cfg.ServiceName) > 0 {
os.Setenv(svcNameKey, cfg.ServiceName)
}
}
resources := []resource.Option{
resource.WithFromEnv(), // Discover and provide attributes from OTEL_RESOURCE_ATTRIBUTES and OTEL_SERVICE_NAME environment variables.
resource.WithTelemetrySDK(), // Discover and provide information about the OpenTelemetry SDK used.
resource.WithProcess(), // Discover and provide process information.
resource.WithOS(), // Discover and provide OS information.
resource.WithContainer(), // Discover and provide container information.
resource.WithHost(), // Discover and provide host information.
// set above via os.Setenv() for WithFromEnv to find
// resource.WithAttributes(semconv.ServiceNameKey.String(cfg.ServiceName)),
resource.WithAttributes(semconv.ServiceVersionKey.String(version.Version())),
}
if len(cfg.Environment) > 0 {
resources = append(resources,
resource.WithAttributes(attribute.String("environment", cfg.Environment)),
)
}
res, err := resource.New(
context.Background(),
resources...,
)
if errors.Is(err, resource.ErrPartialResource) || errors.Is(err, resource.ErrSchemaURLConflict) {
log.Warn("otel resource setup", "err", err) // Log non-fatal issues.
} else if err != nil {
log.Error("otel resource setup", "err", err)
return
}
var shutdownFuncs []func(context.Context) error
shutdown = func(ctx context.Context) error {
// Force flush the global logger provider before shutting down anything else
if loggerProvider := global.GetLoggerProvider(); loggerProvider != nil {
if sdkProvider, ok := loggerProvider.(*sdklog.LoggerProvider); ok {
if flushErr := sdkProvider.ForceFlush(ctx); flushErr != nil {
log.Warn("logger provider force flush failed", "err", flushErr)
}
}
}
var err error
// need to shutdown the providers first,
// exporters after which is the opposite
// order they are setup.
slices.Reverse(shutdownFuncs)
for _, fn := range shutdownFuncs {
err = errors.Join(err, fn(ctx))
}
shutdownFuncs = nil
if err != nil {
log.Warn("shutdown returned errors", "err", err)
}
return err
}
// handleErr calls shutdown for cleanup and makes sure that all errors are returned.
handleErr := func(inErr error) {
err = errors.Join(inErr, shutdown(ctx))
}
prop := newPropagator()
otel.SetTextMapPropagator(prop)
var spanExporter sdktrace.SpanExporter
switch os.Getenv("OTEL_TRACES_EXPORTER") {
case "":
spanExporter, err = newOLTPExporter(ctx, cfg)
case "otlp":
spanExporter, err = newOLTPExporter(ctx, cfg)
default:
// log.Debug("OTEL_TRACES_EXPORTER", "fallback", os.Getenv("OTEL_TRACES_EXPORTER"))
spanExporter, err = autoexport.NewSpanExporter(ctx)
}
if err != nil {
handleErr(err)
return
}
shutdownFuncs = append(shutdownFuncs, spanExporter.Shutdown)
// Set up trace provider.
tracerProvider, err := newTraceProvider(spanExporter, res)
if err != nil {
handleErr(err)
return
}
shutdownFuncs = append(shutdownFuncs, tracerProvider.Shutdown)
otel.SetTracerProvider(tracerProvider)
if err != nil {
handleErr(err)
return
}
return
}
func newOLTPExporter(ctx context.Context, cfg *TracerConfig) (sdktrace.SpanExporter, error) {
log := slog.Default()
var tlsConfig *tls.Config
if cfg.CertificateProvider != nil {
tlsConfig = &tls.Config{
GetClientCertificate: cfg.CertificateProvider,
RootCAs: cfg.RootCAs,
}
}
proto := os.Getenv(otelExporterOTLPTracesProtoEnvKey)
if proto == "" {
proto = os.Getenv(otelExporterOTLPProtoEnvKey)
}
// Fallback to default, http/protobuf.
if proto == "" {
proto = "http/protobuf"
}
var client otlptrace.Client
switch proto {
case "grpc":
opts := []otlptracegrpc.Option{
otlptracegrpc.WithCompressor("gzip"),
}
if tlsConfig != nil {
opts = append(opts, otlptracegrpc.WithTLSCredentials(credentials.NewTLS(tlsConfig)))
}
if len(cfg.Endpoint) > 0 {
log.Info("adding option", "Endpoint", cfg.Endpoint)
opts = append(opts, otlptracegrpc.WithEndpoint(cfg.Endpoint))
}
if len(cfg.EndpointURL) > 0 {
log.Info("adding option", "EndpointURL", cfg.EndpointURL)
opts = append(opts, otlptracegrpc.WithEndpointURL(cfg.EndpointURL))
}
client = otlptracegrpc.NewClient(opts...)
case "http/protobuf", "http/json":
opts := []otlptracehttp.Option{
otlptracehttp.WithCompression(otlptracehttp.GzipCompression),
}
if tlsConfig != nil {
opts = append(opts, otlptracehttp.WithTLSClientConfig(tlsConfig))
}
if len(cfg.Endpoint) > 0 {
opts = append(opts, otlptracehttp.WithEndpoint(cfg.Endpoint))
}
if len(cfg.EndpointURL) > 0 {
opts = append(opts, otlptracehttp.WithEndpointURL(cfg.EndpointURL))
}
client = otlptracehttp.NewClient(opts...)
default:
return nil, errInvalidOTLPProtocol
}
exporter, err := otlptrace.New(ctx, client)
if err != nil {
log.ErrorContext(ctx, "creating OTLP trace exporter", "err", err)
}
return exporter, err
}
func newOTLPLogExporter(ctx context.Context, cfg *TracerConfig) (sdklog.Exporter, error) {
log := slog.Default()
var tlsConfig *tls.Config
if cfg.CertificateProvider != nil {
tlsConfig = &tls.Config{
GetClientCertificate: cfg.CertificateProvider,
RootCAs: cfg.RootCAs,
}
}
proto := os.Getenv(otelExporterOTLPLogsProtoEnvKey)
if proto == "" {
proto = os.Getenv(otelExporterOTLPProtoEnvKey)
}
// Fallback to default, http/protobuf.
if proto == "" {
proto = "http/protobuf"
}
switch proto {
case "grpc":
opts := []otlploggrpc.Option{
otlploggrpc.WithCompressor("gzip"),
}
if tlsConfig != nil {
opts = append(opts, otlploggrpc.WithTLSCredentials(credentials.NewTLS(tlsConfig)))
}
if len(cfg.Endpoint) > 0 {
log.Info("adding log option", "Endpoint", cfg.Endpoint)
opts = append(opts, otlploggrpc.WithEndpoint(cfg.Endpoint))
}
if len(cfg.EndpointURL) > 0 {
log.Info("adding log option", "EndpointURL", cfg.EndpointURL)
opts = append(opts, otlploggrpc.WithEndpointURL(cfg.EndpointURL))
}
return otlploggrpc.New(ctx, opts...)
case "http/protobuf", "http/json":
opts := []otlploghttp.Option{
otlploghttp.WithCompression(otlploghttp.GzipCompression),
}
if tlsConfig != nil {
opts = append(opts, otlploghttp.WithTLSClientConfig(tlsConfig))
}
if len(cfg.Endpoint) > 0 {
opts = append(opts, otlploghttp.WithEndpoint(cfg.Endpoint))
}
if len(cfg.EndpointURL) > 0 {
opts = append(opts, otlploghttp.WithEndpointURL(cfg.EndpointURL))
}
return otlploghttp.New(ctx, opts...)
default:
return nil, errInvalidOTLPProtocol
}
}
func newTraceProvider(traceExporter sdktrace.SpanExporter, res *resource.Resource) (*sdktrace.TracerProvider, error) {
traceProvider := sdktrace.NewTracerProvider(
sdktrace.WithResource(res),
sdktrace.WithBatcher(traceExporter,
sdktrace.WithBatchTimeout(time.Second*3),
),
)
return traceProvider, nil
}
func newPropagator() propagation.TextMapPropagator {
return propagation.NewCompositeTextMapPropagator(
propagation.TraceContext{},
propagation.Baggage{},
)
}