11package metrics
22
33import (
4+ "context"
5+ "errors"
46 "fmt"
57 "time"
68
79 "github.com/prometheus/client_golang/prometheus"
810 "github.com/prometheus/client_golang/prometheus/push"
11+ dto "github.com/prometheus/client_model/go"
912 "github.com/ydb-platform/ydb-go-sdk/v3"
13+
14+ "slo/internal/log"
1015)
1116
1217const (
2025 ref string
2126 label string
2227
23- errorsTotal * prometheus.CounterVec
28+ errorsTotal * prometheus.CounterVec
29+ timeoutsTotal * prometheus.CounterVec
2430
2531 operationsTotal * prometheus.CounterVec
2632 operationsSuccessTotal * prometheus.CounterVec
@@ -60,6 +66,13 @@ func New(url, ref, label, jobName string) (*Metrics, error) {
6066 },
6167 []string {"error_type" },
6268 )
69+ m .timeoutsTotal = prometheus .NewCounterVec (
70+ prometheus.CounterOpts {
71+ Name : "sdk_timeouts_total" ,
72+ Help : "Total number of timeout errors" ,
73+ },
74+ []string {},
75+ )
6376
6477 m .operationsTotal = prometheus .NewCounterVec (
6578 prometheus.CounterOpts {
@@ -208,6 +221,9 @@ func (j Span) Finish(err error, attempts int) {
208221 j .m .retryAttemptsTotal .WithLabelValues (j .name ).Add (float64 (attempts ))
209222
210223 if err != nil {
224+ if errors .Is (err , context .DeadlineExceeded ) {
225+ j .m .timeoutsTotal .WithLabelValues ().Add (1 )
226+ }
211227 j .m .errorsTotal .WithLabelValues (err .Error ()).Add (1 )
212228 j .m .retriesFailureTotal .WithLabelValues (j .name ).Add (float64 (attempts ))
213229 j .m .operationsFailureTotal .WithLabelValues (j .name ).Add (1 )
@@ -218,3 +234,47 @@ func (j Span) Finish(err error, attempts int) {
218234 j .m .operationLatencySeconds .WithLabelValues (j .name , OperationStatusSuccess ).Observe (latency .Seconds ())
219235 }
220236}
237+
238+ func getCounterVecTotal (counterVec * prometheus.CounterVec ) float64 {
239+ ch := make (chan prometheus.Metric , 100 )
240+ go func () {
241+ counterVec .Collect (ch )
242+ close (ch )
243+ }()
244+
245+ var total float64
246+ for m := range ch {
247+ pb := & dto.Metric {}
248+ _ = m .Write (pb )
249+ if pb .GetCounter () != nil {
250+ total += pb .GetCounter ().GetValue ()
251+ }
252+ }
253+
254+ return total
255+ }
256+
257+ func (m * Metrics ) OperationsTotal () float64 {
258+ return getCounterVecTotal (m .operationsTotal )
259+ }
260+
261+ func (m * Metrics ) ErrorsTotal () float64 {
262+ return getCounterVecTotal (m .errorsTotal )
263+ }
264+
265+ func (m * Metrics ) TimeoutsTotal () float64 {
266+ return getCounterVecTotal (m .timeoutsTotal )
267+ }
268+
269+ func (m * Metrics ) FailOnError () {
270+ if m .ErrorsTotal () > 0 {
271+ log .Panicf (
272+ "unretriable (or not successfully retried) errors: %.0f errors out of %.0f operations" ,
273+ m .ErrorsTotal (),
274+ m .OperationsTotal (),
275+ )
276+ }
277+ if m .TimeoutsTotal () > 0 {
278+ log .Panicf ("there are user timeouts: %.0f timeouts" , m .TimeoutsTotal ())
279+ }
280+ }
0 commit comments