Description
Implementation ideas
gRPC is the most critical protocol in celestia-node: it acts as the main gateway between celestia-app and the node and is essential for all blockchain operations, yet failures cannot easily be spotted today because there is no telemetry available for it. This kind of observability would greatly benefit node operator teams and protocol performance engineering during testing, as it would provide out-of-the-box visibility.
Some PoCs
1. Connection Health Metrics
// Connection state gauge - current connection status
celestia_grpc_connection_state = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Name: "celestia_grpc_connection_state",
Help: "Current gRPC connection state (1=connected, 0=disconnected)",
},
[]string{"target", "node_type"}, // target="celestia-app:9090", node_type="bridge|full|light"
)
// Example values:
// celestia_grpc_connection_state{target="celestia-app:9090",node_type="bridge"} = 1
// Connection attempt counter - total connection attempts over time
celestia_grpc_connection_attempts_total = prometheus.NewCounterVec(
prometheus.CounterOpts{
Name: "celestia_grpc_connection_attempts_total",
Help: "Total number of gRPC connection attempts",
},
[]string{"target", "result"}, // result="success|failure"
)
// Example values:
// celestia_grpc_connection_attempts_total{target="celestia-app:9090",result="success"} = 45
// celestia_grpc_connection_attempts_total{target="celestia-app:9090",result="failure"} = 3
// Connection recovery time histogram - how long reconnections take
celestia_grpc_connection_recovery_seconds = prometheus.NewHistogramVec(
prometheus.HistogramOpts{
Name: "celestia_grpc_connection_recovery_seconds",
Help: "Time taken to recover gRPC connection after failure",
Buckets: prometheus.ExponentialBuckets(0.1, 2, 10), // 0.1s to 51.2s
},
[]string{"target"},
)
// Example values:
// celestia_grpc_connection_recovery_seconds{target="celestia-app:9090"} histogram with P50=1.2s, P99=5.8s
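As a minimal sketch of how the connection-state gauge could be fed, a dedicated goroutine could watch the client connection through grpc-go's connectivity API. monitorConnState is a hypothetical helper, it assumes the gauge above is registered, and it needs context, google.golang.org/grpc and google.golang.org/grpc/connectivity:
// monitorConnState updates the connection-state gauge every time the
// client connection transitions between connectivity states.
func monitorConnState(ctx context.Context, conn *grpc.ClientConn, target, nodeType string) {
    for {
        state := conn.GetState()
        value := 0.0
        if state == connectivity.Ready {
            value = 1.0
        }
        celestia_grpc_connection_state.WithLabelValues(target, nodeType).Set(value)
        // Blocks until the state changes; returns false once ctx is done.
        if !conn.WaitForStateChange(ctx, state) {
            return
        }
    }
}
Connection attempts and recovery time could be derived from the same loop by timestamping transitions out of and back into the Ready state.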
2. gRPC Stream Health Metrics
// Active streams gauge - how many streams are currently open
celestia_grpc_streams_active = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Name: "celestia_grpc_streams_active",
Help: "Number of active gRPC streams",
},
[]string{"method", "node_type"}, // method="SubscribeNewHeights", node_type="bridge"
)
// Example values:
// celestia_grpc_streams_active{method="SubscribeNewHeights",node_type="bridge"} = 1
// Stream lifecycle duration histogram - how long streams stay alive
celestia_grpc_stream_duration_seconds = prometheus.NewHistogramVec(
prometheus.HistogramOpts{
Name: "celestia_grpc_stream_duration_seconds",
Help: "Duration of gRPC streams before they close/reset",
Buckets: prometheus.ExponentialBuckets(1, 2, 15), // 1s to 16384s (~4.5 hours)
},
[]string{"method", "close_reason"}, // close_reason="normal|reset|error|timeout"
)
// Example values:
// celestia_grpc_stream_duration_seconds{method="SubscribeNewHeights",close_reason="reset"} histogram
// Stream messages counter - messages received per stream
celestia_grpc_stream_messages_total = prometheus.NewCounterVec(
prometheus.CounterOpts{
Name: "celestia_grpc_stream_messages_total",
Help: "Total messages received on gRPC streams",
},
[]string{"method", "direction"}, // direction="sent|received"
)
// Example values:
// celestia_grpc_stream_messages_total{method="SubscribeNewHeights",direction="received"} = 1440
// Stream interruption counter - when streams break unexpectedly
celestia_grpc_stream_interruptions_total = prometheus.NewCounterVec(
prometheus.CounterOpts{
Name: "celestia_grpc_stream_interruptions_total",
Help: "Total number of unexpected stream interruptions",
},
[]string{"method", "error_code"}, // error_code="cancelled|deadline_exceeded|unavailable"
)
// Example values:
// celestia_grpc_stream_interruptions_total{method="SubscribeNewHeights",error_code="unavailable"} = 5
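The stream metrics could be populated by a grpc.StreamClientInterceptor that wraps every client stream. A rough sketch follows; monitoredStream is an illustrative name, a production version would also guard against double-counting once a stream has already finished, and the snippet needs io, time, google.golang.org/grpc and google.golang.org/grpc/status:
func streamMetricsInterceptor(nodeType string) grpc.StreamClientInterceptor {
    return func(ctx context.Context, desc *grpc.StreamDesc, cc *grpc.ClientConn,
        method string, streamer grpc.Streamer, opts ...grpc.CallOption) (grpc.ClientStream, error) {
        cs, err := streamer(ctx, desc, cc, method, opts...)
        if err != nil {
            return nil, err
        }
        celestia_grpc_streams_active.WithLabelValues(method, nodeType).Inc()
        return &monitoredStream{ClientStream: cs, method: method, nodeType: nodeType, started: time.Now()}, nil
    }
}

// monitoredStream counts received messages and records stream duration and close reason.
type monitoredStream struct {
    grpc.ClientStream
    method, nodeType string
    started          time.Time
}

func (s *monitoredStream) RecvMsg(m interface{}) error {
    err := s.ClientStream.RecvMsg(m)
    switch {
    case err == nil:
        celestia_grpc_stream_messages_total.WithLabelValues(s.method, "received").Inc()
    case err == io.EOF:
        s.finish("normal")
    default:
        // status.Code yields e.g. "Unavailable"; lower-casing the label is omitted for brevity.
        celestia_grpc_stream_interruptions_total.WithLabelValues(s.method, status.Code(err).String()).Inc()
        s.finish("error")
    }
    return err
}

func (s *monitoredStream) finish(reason string) {
    celestia_grpc_streams_active.WithLabelValues(s.method, s.nodeType).Dec()
    celestia_grpc_stream_duration_seconds.WithLabelValues(s.method, reason).Observe(time.Since(s.started).Seconds())
}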
3. gRPC Request Performance Metrics
// Request latency histogram - the most important metric for SLOs
celestia_grpc_request_duration_seconds = prometheus.NewHistogramVec(
prometheus.HistogramOpts{
Name: "celestia_grpc_request_duration_seconds",
Help: "Latency of gRPC requests",
Buckets: []float64{0.01, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10, 30, 60}, // Custom buckets for celestia-node patterns
},
[]string{"method", "sync_scenario", "node_type"},
)
// Example values:
// celestia_grpc_request_duration_seconds{method="GetSignedBlock",sync_scenario="normal",node_type="bridge"} histogram P99=0.8s
// celestia_grpc_request_duration_seconds{method="GetSignedBlock",sync_scenario="genesis_sync",node_type="bridge"} histogram P99=15.2s
// Request success/failure counter - track different gRPC status codes
celestia_grpc_requests_total = prometheus.NewCounterVec(
prometheus.CounterOpts{
Name: "celestia_grpc_requests_total",
Help: "Total gRPC requests by result",
},
[]string{"method", "code", "sync_scenario"}, // code="OK|DeadlineExceeded|Unavailable|..."
)
// Example values:
// celestia_grpc_requests_total{method="GetSignedBlock",code="OK",sync_scenario="genesis_sync"} = 89543
// celestia_grpc_requests_total{method="GetSignedBlock",code="DeadlineExceeded",sync_scenario="genesis_sync"} = 247
// celestia_grpc_requests_total{method="Commit",code="OK",sync_scenario="normal"} = 1440
// Concurrent requests gauge - how many requests are in-flight
celestia_grpc_concurrent_requests = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Name: "celestia_grpc_concurrent_requests",
Help: "Number of gRPC requests currently in progress",
},
[]string{"method", "target"},
)
// Example values:
// celestia_grpc_concurrent_requests{method="GetSignedBlock",target="celestia-app:9090"} = 25
// Request size histogram - track message sizes (celestia-node has 64MB limit)
celestia_grpc_request_size_bytes = prometheus.NewHistogramVec(
prometheus.HistogramOpts{
Name: "celestia_grpc_request_size_bytes",
Help: "Size of gRPC request messages",
Buckets: prometheus.ExponentialBuckets(64, 4, 12), // 64B to 256MB
},
[]string{"method"},
)
celestia_grpc_response_size_bytes = prometheus.NewHistogramVec(
prometheus.HistogramOpts{
Name: "celestia_grpc_response_size_bytes",
Help: "Size of gRPC response messages",
Buckets: prometheus.ExponentialBuckets(64, 4, 12), // 64B to 256MB
},
[]string{"method"},
)
// Example values:
// celestia_grpc_response_size_bytes{method="GetSignedBlock"} histogram P50=2.1MB, P99=16.7MB
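Request latency, status codes, in-flight counts, and message sizes could all be captured in a single grpc.UnaryClientInterceptor. This is a sketch only: it assumes the metrics above are registered, takes the sync scenario as a fixed parameter (in practice it would likely be looked up dynamically), and needs context, time, google.golang.org/grpc, google.golang.org/grpc/status and google.golang.org/protobuf/proto:
func unaryMetricsInterceptor(syncScenario, nodeType, target string) grpc.UnaryClientInterceptor {
    return func(ctx context.Context, method string, req, reply interface{},
        cc *grpc.ClientConn, invoker grpc.UnaryInvoker, opts ...grpc.CallOption) error {
        celestia_grpc_concurrent_requests.WithLabelValues(method, target).Inc()
        start := time.Now()

        err := invoker(ctx, method, req, reply, cc, opts...)

        celestia_grpc_concurrent_requests.WithLabelValues(method, target).Dec()
        celestia_grpc_request_duration_seconds.WithLabelValues(method, syncScenario, nodeType).
            Observe(time.Since(start).Seconds())
        // status.Code returns codes.OK for a nil error.
        celestia_grpc_requests_total.WithLabelValues(method, status.Code(err).String(), syncScenario).Inc()

        // Observe proto-encoded message sizes where the payloads are proto messages.
        if m, ok := req.(proto.Message); ok {
            celestia_grpc_request_size_bytes.WithLabelValues(method).Observe(float64(proto.Size(m)))
        }
        if m, ok := reply.(proto.Message); ok && err == nil {
            celestia_grpc_response_size_bytes.WithLabelValues(method).Observe(float64(proto.Size(m)))
        }
        return err
    }
}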
4. HTTP/2 and gRPC Protocol Health Metrics
// HTTP/2 protocol events counter - low-level transport issues
celestia_grpc_http2_events_total = prometheus.NewCounterVec(
prometheus.CounterOpts{
Name: "celestia_grpc_http2_events_total",
Help: "HTTP/2 protocol events on gRPC connections",
},
[]string{"event_type", "target"}, // event_type="goaway|rst_stream|window_update|settings"
)
// Example values:
// celestia_grpc_http2_events_total{event_type="goaway",target="celestia-app:9090"} = 3
// celestia_grpc_http2_events_total{event_type="rst_stream",target="celestia-app:9090"} = 12
// Flow control state gauge - HTTP/2 flow control windows
celestia_grpc_flow_control_window_bytes = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Name: "celestia_grpc_flow_control_window_bytes",
Help: "Current HTTP/2 flow control window size",
},
[]string{"window_type", "target"}, // window_type="connection|stream"
)
// Example values:
// celestia_grpc_flow_control_window_bytes{window_type="connection",target="celestia-app:9090"} = 1048576
// Flow control blocked counter - when flow control prevents sending
celestia_grpc_flow_control_blocked_total = prometheus.NewCounterVec(
prometheus.CounterOpts{
Name: "celestia_grpc_flow_control_blocked_total",
Help: "Number of times flow control blocked sending data",
},
[]string{"target", "level"}, // level="connection|stream"
)
// Example values:
// celestia_grpc_flow_control_blocked_total{target="celestia-app:9090",level="stream"} = 47
// gRPC retry behavior counter - client-side retry logic
celestia_grpc_retry_attempts_total = prometheus.NewCounterVec(
prometheus.CounterOpts{
Name: "celestia_grpc_retry_attempts_total",
Help: "Number of gRPC retry attempts",
},
[]string{"method", "retry_reason", "attempt_number"}, // retry_reason="unavailable|deadline_exceeded"
)
// Example values:
// celestia_grpc_retry_attempts_total{method="GetSignedBlock",retry_reason="unavailable",attempt_number="1"} = 156
// celestia_grpc_retry_attempts_total{method="GetSignedBlock",retry_reason="unavailable",attempt_number="2"} = 89
// celestia_grpc_retry_attempts_total{method="GetSignedBlock",retry_reason="unavailable",attempt_number="3"} = 23
// Circuit breaker state gauge - if using circuit breaker pattern
celestia_grpc_circuit_breaker_state = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Name: "celestia_grpc_circuit_breaker_state",
Help: "Circuit breaker state (0=closed, 1=open, 2=half-open)",
},
[]string{"method", "target"},
)
// Example values:
// celestia_grpc_circuit_breaker_state{method="GetSignedBlock",target="celestia-app:9090"} = 0
// Large message events counter - messages approaching/exceeding size limits
celestia_grpc_large_message_events_total = prometheus.NewCounterVec(
prometheus.CounterOpts{
Name: "celestia_grpc_large_message_events_total",
Help: "Messages exceeding size thresholds",
},
[]string{"method", "direction", "threshold"}, // threshold="10mb|32mb|64mb_limit"
)
// Example values:
// celestia_grpc_large_message_events_total{method="GetSignedBlock",direction="response",threshold="32mb"} = 8
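To my knowledge grpc-go does not expose raw HTTP/2 frame events (GOAWAY, RST_STREAM, window updates) through a public API, so those counters would likely need channelz or transport-level hooks. The closest public hook is the stats.Handler interface, which at least surfaces connection begin/end and per-message wire sizes. A sketch of what it could cover; protocolStatsHandler is hypothetical and needs context and google.golang.org/grpc/stats:
type protocolStatsHandler struct {
    target, nodeType string
}

type methodKey struct{}

func (h *protocolStatsHandler) TagRPC(ctx context.Context, info *stats.RPCTagInfo) context.Context {
    // Stash the method name so HandleRPC can label its observations.
    return context.WithValue(ctx, methodKey{}, info.FullMethodName)
}

func (h *protocolStatsHandler) HandleRPC(ctx context.Context, s stats.RPCStats) {
    method, _ := ctx.Value(methodKey{}).(string)
    // On the client side an InPayload is a received (response) message.
    if p, ok := s.(*stats.InPayload); ok && p.WireLength > 32<<20 {
        celestia_grpc_large_message_events_total.WithLabelValues(method, "response", "32mb").Inc()
    }
}

func (h *protocolStatsHandler) TagConn(ctx context.Context, _ *stats.ConnTagInfo) context.Context {
    return ctx
}

func (h *protocolStatsHandler) HandleConn(_ context.Context, s stats.ConnStats) {
    switch s.(type) {
    case *stats.ConnBegin:
        celestia_grpc_connection_state.WithLabelValues(h.target, h.nodeType).Set(1)
    case *stats.ConnEnd:
        celestia_grpc_connection_state.WithLabelValues(h.target, h.nodeType).Set(0)
    }
}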
gRPC protocol architecture
graph TB
subgraph "celestia-node (gRPC Client)"
A[BlockFetcher] --> B[gRPC Client Connection]
C[Core Listener] --> B
D[State Access] --> B
E[Gas Estimator] --> B
B --> F[HTTP/2 Transport Layer]
F --> G[Connection Pool]
F --> H[Stream Multiplexer]
F --> I[Flow Control Manager]
subgraph "Current Telemetry Gap"
J[🚫 Connection State]
K[🚫 Stream Health]
L[🚫 Request Latency]
M[🚫 Protocol Events]
end
end
subgraph "Network Layer"
N[TCP Connection] --> O[TLS Layer]
O --> P[HTTP/2 Frames]
P --> Q[gRPC Messages]
end
subgraph "celestia-app"
R[gRPC Server]
end
%% Data Flow
G --> N
N --> R
%% gRPC Communication Patterns
R -.->|SubscribeNewHeights Stream| H
H -.->|Block events| C
R -.->|GetSignedBlock calls| H
H -.->|Block data| A
%% Protocol Health Issues
CC[Connection Drops] -.->|🚫 No Telemetry| G
DD[Stream Resets] -.->|🚫 No Telemetry| H
EE[Flow Control Blocks] -.->|🚫 No Telemetry| I
FF[Retry Storms] -.->|🚫 No Telemetry| B
classDef problem fill:#ff6b6b,stroke:#d63031,stroke-width:2px,color:#ffffff
classDef critical fill:#fdcb6e,stroke:#e17055,stroke-width:2px,color:#2d3436
classDef missing fill:#fd79a8,stroke:#e84393,stroke-width:2px,color:#ffffff
class CC,DD,EE,FF problem
class R critical
class J,K,L,M missing
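For completeness, a rough sketch of how the hooks above could be attached to the client connection celestia-node opens towards celestia-app. The DialOptions are real grpc-go options; the helper names and the target are the illustrative ones used throughout this issue:
conn, err := grpc.Dial(
    "celestia-app:9090",
    grpc.WithTransportCredentials(insecure.NewCredentials()),
    grpc.WithUnaryInterceptor(unaryMetricsInterceptor("normal", "bridge", "celestia-app:9090")),
    grpc.WithStreamInterceptor(streamMetricsInterceptor("bridge")),
    grpc.WithStatsHandler(&protocolStatsHandler{target: "celestia-app:9090", nodeType: "bridge"}),
)
if err != nil {
    return err
}
// Feed the connection-state gauge for the lifetime of the process.
go monitorConnState(ctx, conn, "celestia-app:9090", "bridge")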
Overall, this would add a valuable and powerful layer of visibility: it would give us real-time monitoring of gRPC health, help keep protocol performance optimal, and catch degradations before they spread to other core components.