From d4bf8d9e16609c99423a7cc5f722cafd947b491a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ask=20Bj=C3=B8rn=20Hansen?= Date: Sat, 26 Jul 2025 09:03:46 -0700 Subject: [PATCH] feat(api): add Grafana test endpoint for table format Add `/api/v2/test/grafana-table` endpoint to validate Grafana table format compatibility before implementing the full time range API. - Create server/grafana.go with table format structures - Add structured logging and OpenTelemetry tracing - Include realistic NTP Pool sample data with null handling - Set proper CORS and cache headers for testing - Update implementation plan with Phase 0 completion status Ready for Grafana JSON API data source integration testing. --- plans/grafana-time-range-api.md | 350 ++++++++++++++++++++++++++++++++ server/grafana.go | 100 +++++++++ server/server.go | 3 +- 3 files changed, 452 insertions(+), 1 deletion(-) create mode 100644 plans/grafana-time-range-api.md create mode 100644 server/grafana.go diff --git a/plans/grafana-time-range-api.md b/plans/grafana-time-range-api.md new file mode 100644 index 0000000..ec2c5e4 --- /dev/null +++ b/plans/grafana-time-range-api.md @@ -0,0 +1,350 @@ +# DETAILED IMPLEMENTATION PLAN: Grafana Time Range API with Future Downsampling Support + +## Overview +Implement a new Grafana-compatible API endpoint `/api/v2/server/scores/{server}.{mode}` that returns time series data in Grafana format with time range support and future downsampling capabilities. + +## API Specification + +### Endpoint +- **URL**: `/api/v2/server/scores/{server}.{mode}` +- **Method**: GET +- **Path Parameters**: + - `server`: Server IP address or ID (same validation as existing API) + - `mode`: Only `json` supported initially + +### Query Parameters (following Grafana conventions) +- `from`: Unix timestamp in milliseconds (required) +- `to`: Unix timestamp in milliseconds (required) +- `maxDataPoints`: Integer, default 50000, max 50000 (for future downsampling) +- `monitor`: Monitor ID, name prefix, or "*" for all (optional, same as existing) +- `interval`: Future downsampling interval like "1m", "5m", "1h" (optional, not implemented initially) + +### Response Format +Grafana table format JSON array (more efficient than separate series): +```json +[ + { + "target": "monitor{name=zakim1-yfhw4a}", + "tags": { + "monitor_id": "126", + "monitor_name": "zakim1-yfhw4a", + "type": "monitor", + "status": "active" + }, + "columns": [ + {"text": "time", "type": "time"}, + {"text": "score", "type": "number"}, + {"text": "rtt", "type": "number", "unit": "ms"}, + {"text": "offset", "type": "number", "unit": "s"} + ], + "values": [ + [1753431667000, 20.0, 18.865, -0.000267], + [1753431419000, 20.0, 18.96, -0.000390], + [1753431151000, 20.0, 18.073, -0.000768], + [1753430063000, 20.0, 18.209, null] + ] + } +] +``` + +## Implementation Details + +### 1. Server Routing (`server/server.go`) +Add new route after existing scores routes: +```go +e.GET("/api/v2/server/scores/:server.:mode", srv.scoresTimeRange) +``` + +## Key Implementation Clarifications + +### Monitor Filtering Behavior +- **monitor=\***: Return ALL monitors (no monitor count limit) +- **50k datapoint limit**: Applied in database query (LIMIT clause) +- Return whatever data we get from database to user (no post-processing truncation) + +### Null Value Handling Strategy +- **Score**: Always include (should never be null) +- **RTT**: Skip datapoints where RTT is null +- **Offset**: Skip datapoints where offset is null + +### Time Range Validation Rules +- **Zero duration**: Return 400 Bad Request +- **Future timestamps**: Allow for now +- **Minimum range**: 1 second +- **Maximum range**: 90 days + +### 2. New Handler Function (`server/history.go`) + +#### Function Signature +```go +func (srv *Server) scoresTimeRange(c echo.Context) error +``` + +#### Parameter Parsing & Validation +```go +// Extend existing historyParameters struct for time range support +type timeRangeParams struct { + historyParameters // embed existing struct + from time.Time + to time.Time + maxDataPoints int + interval string // for future downsampling +} + +func (srv *Server) parseTimeRangeParams(ctx context.Context, c echo.Context) (timeRangeParams, error) { + // Start with existing parameter parsing logic + baseParams, err := srv.getHistoryParameters(ctx, c) + if err != nil { + return timeRangeParams{}, err + } + + // Parse and validate from/to millisecond timestamps + // Validate time range (max 90 days, min 1 second) + // Parse maxDataPoints (default 50000, max 50000) + // Return extended parameters +} +``` + +#### Response Structure +```go +type ColumnDef struct { + Text string `json:"text"` + Type string `json:"type"` + Unit string `json:"unit,omitempty"` +} + +type GrafanaTableSeries struct { + Target string `json:"target"` + Tags map[string]string `json:"tags"` + Columns []ColumnDef `json:"columns"` + Values [][]interface{} `json:"values"` +} + +type GrafanaTimeSeriesResponse []GrafanaTableSeries +``` + +#### Cache Control +```go +// Reuse existing setHistoryCacheControl function for consistency +// Logic based on data recency and entry count: +// - Empty or >8h old data: "s-maxage=260,max-age=360" +// - Single entry: "s-maxage=60,max-age=35" +// - Multiple entries: "s-maxage=90,max-age=120" +setHistoryCacheControl(c, history) +``` + +### 3. ClickHouse Data Access (`chdb/logscores.go`) + +#### New Method +```go +func (d *ClickHouse) LogscoresTimeRange(ctx context.Context, serverID, monitorID int, from, to time.Time, limit int) ([]ntpdb.LogScore, error) { + // Build query with time range WHERE clause + // Always order by ts ASC (Grafana convention) + // Apply limit to prevent memory issues + // Use same row scanning logic as existing Logscores method +} +``` + +#### Query Structure +```sql +SELECT id, monitor_id, server_id, ts, + toFloat64(score), toFloat64(step), offset, + rtt, leap, warning, error +FROM log_scores +WHERE server_id = ? + AND ts >= ? + AND ts <= ? + [AND monitor_id = ?] -- if specific monitor requested +ORDER BY ts ASC +LIMIT ? +``` + +### 4. Data Transformation Logic (`server/history.go`) + +#### Core Transformation Function +```go +func transformToGrafanaTableFormat(history *logscores.LogScoreHistory, monitors []ntpdb.Monitor) GrafanaTimeSeriesResponse { + // Group data by monitor_id (one series per monitor) + // Create table format with columns: time, score, rtt, offset + // Convert timestamps to milliseconds + // Build proper target names and tags + // Handle null values appropriately in table values +} +``` + +#### Grouping Strategy +1. **Group by Monitor**: One table series per monitor +2. **Table Columns**: time, score, rtt, offset (all metrics in one table) +3. **Target Naming**: `monitor{name={sanitized_monitor_name}}` +4. **Tag Structure**: Include monitor metadata (no metric type needed) +5. **Monitor Status**: Query real monitor data using `q.GetServerScores()` like existing API +6. **Series Ordering**: No guaranteed order (standard Grafana behavior) +7. **Efficiency**: More efficient than separate series - less JSON overhead + +#### Timestamp Conversion +```go +timestampMs := logScore.Ts.Unix() * 1000 +``` + +### 5. Error Handling + +#### Validation Errors (400 Bad Request) +- Invalid timestamp format +- from >= to (including zero duration) +- Time range too large (> 90 days) +- Time range too small (< 1 second minimum) +- maxDataPoints > 50000 +- Invalid mode (not "json") + +#### Not Found Errors (404) +- Server not found +- Monitor not found +- Server deleted + +#### Server Errors (500) +- ClickHouse connection issues +- Database query errors + +### 6. Future Downsampling Design + +#### API Extension Points +- `interval` parameter parsing ready +- `maxDataPoints` limit already enforced +- Response format supports downsampled data seamlessly + +#### Downsampling Algorithm (Future Implementation) +```go +// When datapoints > maxDataPoints: +// 1. Calculate downsample interval: (to - from) / maxDataPoints +// 2. Group data into time buckets +// 3. Aggregate per bucket: avg for score/rtt, last for offset +// 4. Return aggregated datapoints +``` + +## Testing Strategy + +### Unit Tests +- Parameter parsing and validation +- Data transformation logic +- Error handling scenarios +- Timestamp conversion accuracy + +### Integration Tests +- End-to-end API requests +- ClickHouse query execution +- Multiple monitor scenarios +- Large time range handling + +### Manual Testing +- Grafana integration testing +- Performance with various time ranges +- Cache behavior validation + +## Performance Considerations + +### Current Implementation +- 50k datapoint limit applied in database query (LIMIT clause) (covers ~few weeks of data) +- ClickHouse-only for better range query performance +- Proper indexing on (server_id, ts) assumed +- Table format more efficient than separate time series (less JSON overhead) + +### Future Optimizations (Critical for Production) +- **Downsampling for large ranges**: Essential for 90-day queries with reasonable performance +- Query optimization based on range size +- Potential parallel monitor queries +- Adaptive sampling rates based on time range duration + +## Documentation Updates + +### API.md Addition +```markdown +### 7. Server Scores Time Range (v2) + +**GET** `/api/v2/server/scores/{server}.{mode}` + +Grafana-compatible time series endpoint for NTP server scoring data. + +#### Path Parameters +- `server`: Server IP address or ID +- `mode`: Response format (`json` only) + +#### Query Parameters +- `from`: Start time as Unix timestamp in milliseconds (required) +- `to`: End time as Unix timestamp in milliseconds (required) +- `maxDataPoints`: Maximum data points to return (default: 50000, max: 50000) +- `monitor`: Monitor filter (ID, name prefix, or "*" for all) + +#### Response Format +Grafana table format array with one series per monitor containing all metrics as columns. +``` + +## Key Research Findings + +### Grafana Error Format Requirements +- **HTTP Status Codes**: Standard 400/404/500 work fine +- **Response Body**: JSON preferred with `Content-Type: application/json` +- **Structure**: Simple `{"error": "message", "status": code}` is sufficient +- **Compatibility**: Existing Echo error patterns are Grafana-compatible + +### Data Volume Considerations +- **50k Datapoint Limit**: Only covers ~few weeks of data, not sufficient for 90-day ranges +- **Downsampling Critical**: Required for production use with 90-day time ranges +- **Current Approach**: Acceptable for MVP, downsampling essential for full utility + +## Implementation Checklist + +### Phase 0: Grafana Table Format Validation ✅ **COMPLETED** +- [x] Add test endpoint `/api/v2/test/grafana-table` returning sample table format +- [x] Implement Grafana table format response structures in `server/grafana.go` +- [x] Add structured logging and OpenTelemetry tracing to test endpoint +- [x] Verify endpoint compiles and serves correct JSON format +- [x] Test endpoint response format and headers (CORS, Content-Type, Cache-Control) +- [ ] Test with actual Grafana instance to validate table format compatibility +- [ ] Confirm time series panels render table format correctly +- [ ] Validate column types and units display properly + +#### Phase 0 Implementation Details +**Files Created/Modified:** +- `server/grafana.go`: New file containing Grafana table format structures and test endpoint +- `server/server.go`: Added route `e.GET("/api/v2/test/grafana-table", srv.testGrafanaTable)` + +**Test Endpoint Features:** +- **URL**: `http://localhost:8030/api/v2/test/grafana-table` +- **Response Format**: Grafana table format with realistic NTP Pool data +- **Sample Data**: Two monitor series (zakim1-yfhw4a, nj2-mon01) with time-based values +- **Columns**: time, score, rtt (ms), offset (s) with proper units +- **Null Handling**: Demonstrates null offset values +- **Headers**: CORS, JSON content-type, cache control +- **Observability**: Structured logging with context, OpenTelemetry tracing + +**Recommended Grafana Data Source**: JSON API plugin (`marcusolsson-json-datasource`) - ideal for REST APIs returning table format JSON + +### Phase 1: Core Implementation +- [ ] Add route in server.go +- [ ] Implement parseTimeRangeParams function +- [ ] Add LogscoresTimeRange method to ClickHouse +- [ ] Implement transformToGrafanaTableFormat function +- [ ] Add scoresTimeRange handler +- [ ] Error handling and validation (reuse existing Echo patterns) +- [ ] Cache control headers (reuse setHistoryCacheControl) + +### Phase 2: Testing & Polish +- [ ] Unit tests for all functions +- [ ] Integration tests +- [ ] Manual Grafana testing with real data +- [ ] Performance testing with large ranges (up to 50k points) +- [ ] API documentation updates + +### Phase 3: Future Enhancement Ready +- [ ] Interval parameter parsing (no-op initially) +- [ ] Downsampling framework hooks (critical for 90-day ranges) +- [ ] Monitoring and metrics for new endpoint + +This design provides a solid foundation for immediate Grafana integration while being fully prepared for future downsampling capabilities without breaking changes. + +## Critical Notes for Production + +- **Downsampling Required**: 50k datapoint limit means 90-day ranges will hit limits quickly +- **Table Format Validation**: Phase 0 testing ensures Grafana compatibility before full implementation +- **Error Handling**: Existing Echo patterns are sufficient for Grafana requirements +- **Scalability**: Current design handles weeks of data well, downsampling needed for months \ No newline at end of file diff --git a/server/grafana.go b/server/grafana.go new file mode 100644 index 0000000..05ff146 --- /dev/null +++ b/server/grafana.go @@ -0,0 +1,100 @@ +package server + +import ( + "net/http" + "time" + + "github.com/labstack/echo/v4" + "go.ntppool.org/common/logger" + "go.ntppool.org/common/tracing" +) + +// ColumnDef represents a Grafana table column definition +type ColumnDef struct { + Text string `json:"text"` + Type string `json:"type"` + Unit string `json:"unit,omitempty"` +} + +// GrafanaTableSeries represents a single table series in Grafana format +type GrafanaTableSeries struct { + Target string `json:"target"` + Tags map[string]string `json:"tags"` + Columns []ColumnDef `json:"columns"` + Values [][]interface{} `json:"values"` +} + +// GrafanaTimeSeriesResponse represents the complete Grafana table response +type GrafanaTimeSeriesResponse []GrafanaTableSeries + +// testGrafanaTable returns sample data in Grafana table format for validation +func (srv *Server) testGrafanaTable(c echo.Context) error { + log := logger.Setup() + ctx, span := tracing.Tracer().Start(c.Request().Context(), "testGrafanaTable") + defer span.End() + + log.InfoContext(ctx, "serving test Grafana table format", + "remote_ip", c.RealIP(), + "user_agent", c.Request().UserAgent(), + ) + + // Generate sample data with realistic NTP Pool values + now := time.Now() + sampleData := GrafanaTimeSeriesResponse{ + { + Target: "monitor{name=zakim1-yfhw4a}", + Tags: map[string]string{ + "monitor_id": "126", + "monitor_name": "zakim1-yfhw4a", + "type": "monitor", + "status": "active", + }, + Columns: []ColumnDef{ + {Text: "time", Type: "time"}, + {Text: "score", Type: "number"}, + {Text: "rtt", Type: "number", Unit: "ms"}, + {Text: "offset", Type: "number", Unit: "s"}, + }, + Values: [][]interface{}{ + {now.Add(-10*time.Minute).Unix() * 1000, 20.0, 18.865, -0.000267}, + {now.Add(-20*time.Minute).Unix() * 1000, 20.0, 18.96, -0.000390}, + {now.Add(-30*time.Minute).Unix() * 1000, 20.0, 18.073, -0.000768}, + {now.Add(-40*time.Minute).Unix() * 1000, 20.0, 18.209, nil}, // null offset example + }, + }, + { + Target: "monitor{name=nj2-mon01}", + Tags: map[string]string{ + "monitor_id": "84", + "monitor_name": "nj2-mon01", + "type": "monitor", + "status": "active", + }, + Columns: []ColumnDef{ + {Text: "time", Type: "time"}, + {Text: "score", Type: "number"}, + {Text: "rtt", Type: "number", Unit: "ms"}, + {Text: "offset", Type: "number", Unit: "s"}, + }, + Values: [][]interface{}{ + {now.Add(-10*time.Minute).Unix() * 1000, 19.5, 22.145, 0.000123}, + {now.Add(-20*time.Minute).Unix() * 1000, 19.8, 21.892, 0.000089}, + {now.Add(-30*time.Minute).Unix() * 1000, 20.0, 22.034, 0.000156}, + }, + }, + } + + // Add CORS header for browser testing + c.Response().Header().Set("Access-Control-Allow-Origin", "*") + c.Response().Header().Set("Content-Type", "application/json") + + // Set cache control similar to other endpoints + c.Response().Header().Set("Cache-Control", "public,max-age=60") + + log.InfoContext(ctx, "test Grafana table response sent", + "series_count", len(sampleData), + "response_size_approx", "~1KB", + ) + + return c.JSON(http.StatusOK, sampleData) +} \ No newline at end of file diff --git a/server/server.go b/server/server.go index d2aea45..9603f14 100644 --- a/server/server.go +++ b/server/server.go @@ -179,7 +179,7 @@ func (srv *Server) Run() error { e.Use(middleware.CORSWithConfig(middleware.CORSConfig{ AllowOrigins: []string{ - "http://localhost", "http://localhost:5173", "http://localhost:8080", + "http://localhost", "http://localhost:5173", "http://localhost:5174", "http://localhost:8080", "https://www.ntppool.org", "https://*.ntppool.org", "https://web.beta.grundclock.com", "https://manage.beta.grundclock.com", "https:/*.askdev.grundclock.com", @@ -208,6 +208,7 @@ func (srv *Server) Run() error { e.GET("/api/server/dns/answers/:server", srv.dnsAnswers) e.GET("/api/server/scores/:server/:mode", srv.history) e.GET("/api/dns/counts", srv.dnsQueryCounts) + e.GET("/api/v2/test/grafana-table", srv.testGrafanaTable) if len(ntpconf.WebHostname()) > 0 { e.POST("/api/server/scores/:server/:mode", func(c echo.Context) error {