package metrics

import (
	"context"
	"fmt"
	"time"

	"github.com/AdguardTeam/golibs/container"
	"github.com/AdguardTeam/golibs/errors"
	"github.com/prometheus/client_golang/prometheus"
)

// UpdateMetrics is an alias for a structure that describes a single user
// profiles update operation.
//
// See [profiledb.UpdateMetrics].
type UpdateMetrics = struct {
	Duration    time.Duration
	ProfilesNum uint
	DevicesNum  uint
	IsSuccess   bool
	IsFullSync  bool
}

// ProfileDB is the Prometheus-based implementation of the [profiledb.Metrics]
// interface.
type ProfileDB struct {
	// devicesCount is a gauge with the total number of user devices loaded
	// from the backend.
	devicesCount prometheus.Gauge

	// devicesNewCount is a gauge with the number of user devices downloaded
	// during the last sync.
	devicesNewCount prometheus.Gauge

	// profilesCount is a gauge with the total number of user profiles loaded
	// from the backend.
	profilesCount prometheus.Gauge

	// profilesNewCount is a gauge with the number of user profiles downloaded
	// during the last sync.
	profilesNewCount prometheus.Gauge

	// profilesDeletedTotal is a counter with the total number of user
	// profiles marked as deleted which have been loaded from the backend.
	//
	// TODO(d.kolyshev): Add a metric for deleted devices.
	profilesDeletedTotal prometheus.Counter

	// profilesSyncTime is a gauge with the timestamp of the most recent
	// profiles sync.
	profilesSyncTime prometheus.Gauge

	// profilesSyncStatus is a gauge with the profiles sync status.  It is set
	// to 1 if the sync was successful and to 0 otherwise.
	profilesSyncStatus prometheus.Gauge

	// profilesSyncDuration is a histogram with the duration of a profiles
	// sync.
	profilesSyncDuration prometheus.Histogram

	// profilesFullSyncDuration is a gauge with the duration of the last full
	// sync.  It is a gauge because full syncs are not expected to be common.
	profilesFullSyncDuration prometheus.Gauge

	// profilesSyncFullTimeouts is a gauge with the total number of timeout
	// errors that occurred during full profiles syncs.
	profilesSyncFullTimeouts prometheus.Gauge

	// profilesSyncPartTimeouts is a gauge with the total number of timeout
	// errors that occurred during partial profiles syncs.
	profilesSyncPartTimeouts prometheus.Gauge
}

// NewProfileDB registers the user profiles metrics in reg and returns a
// properly initialized [ProfileDB].  All collectors are attempted, and the
// registration errors, if any, are joined into err, in which case m is nil.
func NewProfileDB(namespace string, reg prometheus.Registerer) (m *ProfileDB, err error) {
	const (
		nameDevices          = "devices_total"
		nameDevicesNew       = "devices_newly_synced_total"
		nameProfiles         = "profiles_total"
		nameProfilesNew      = "profiles_newly_synced_total"
		nameProfilesDeleted  = "profiles_deleted_total"
		nameSyncTime         = "profiles_sync_timestamp"
		nameSyncStatus       = "profiles_sync_status"
		nameSyncDuration     = "profiles_sync_duration_seconds"
		nameFullSyncDuration = "profiles_full_sync_duration_seconds"
		nameSyncTimeouts     = "profiles_sync_timeouts_total"

		labelIsFullSync = "is_full_sync"
	)

	// timeoutsVec is a gauge vector with the total number of timeout errors
	// that occurred during profiles sync.  The full and partial syncs are
	// told apart by the value of the label.
	timeoutsVec := prometheus.NewGaugeVec(prometheus.GaugeOpts{
		Namespace: namespace,
		Subsystem: subsystemBackend,
		Name:      nameSyncTimeouts,
		Help:      "The total number of timeout errors during profiles sync.",
	}, []string{labelIsFullSync})

	// newGauge builds an unregistered gauge within the backend subsystem of
	// namespace.
	newGauge := func(name, help string) (g prometheus.Gauge) {
		return prometheus.NewGauge(prometheus.GaugeOpts{
			Namespace: namespace,
			Subsystem: subsystemBackend,
			Name:      name,
			Help:      help,
		})
	}

	m = &ProfileDB{
		devicesCount: newGauge(
			nameDevices,
			"The total number of user devices loaded from the backend.",
		),
		devicesNewCount: newGauge(
			nameDevicesNew,
			"The number of user devices that were changed or added since "+
				"the previous sync.",
		),
		profilesCount: newGauge(
			nameProfiles,
			"The total number of user profiles loaded from the backend.",
		),
		profilesNewCount: newGauge(
			nameProfilesNew,
			"The number of user profiles that were changed or added since "+
				"the previous sync.",
		),
		profilesDeletedTotal: prometheus.NewCounter(prometheus.CounterOpts{
			Namespace: namespace,
			Subsystem: subsystemBackend,
			Name:      nameProfilesDeleted,
			Help:      "The total number of deleted user profiles loaded from the backend.",
		}),
		profilesSyncTime: newGauge(
			nameSyncTime,
			"The time when the user profiles were synced last time.",
		),
		profilesSyncStatus: newGauge(
			nameSyncStatus,
			"Status of the last profiles sync. 1 is okay, 0 means there was an error",
		),
		profilesSyncDuration: prometheus.NewHistogram(prometheus.HistogramOpts{
			Namespace: namespace,
			Subsystem: subsystemBackend,
			Name:      nameSyncDuration,
			Help:      "Time elapsed on syncing user profiles with the backend.",
			// Profiles sync may take some time since the list of users may
			// be massive.  This is why the buckets go up to 240 seconds.
			Buckets: []float64{0.01, 0.1, 1, 5, 10, 30, 60, 120, 240},
		}),
		profilesFullSyncDuration: newGauge(
			nameFullSyncDuration,
			"Time elapsed on fully syncing user profiles with the backend, in seconds.",
		),
		profilesSyncFullTimeouts: timeoutsVec.With(prometheus.Labels{labelIsFullSync: "1"}),
		profilesSyncPartTimeouts: timeoutsVec.With(prometheus.Labels{labelIsFullSync: "0"}),
	}

	// The timeout gauges are registered through their parent vector rather
	// than individually.
	toRegister := container.KeyValues[string, prometheus.Collector]{
		{Key: nameDevices, Value: m.devicesCount},
		{Key: nameDevicesNew, Value: m.devicesNewCount},
		{Key: nameProfiles, Value: m.profilesCount},
		{Key: nameProfilesNew, Value: m.profilesNewCount},
		{Key: nameProfilesDeleted, Value: m.profilesDeletedTotal},
		{Key: nameSyncTime, Value: m.profilesSyncTime},
		{Key: nameSyncStatus, Value: m.profilesSyncStatus},
		{Key: nameSyncDuration, Value: m.profilesSyncDuration},
		{Key: nameFullSyncDuration, Value: m.profilesFullSyncDuration},
		{Key: nameSyncTimeouts, Value: timeoutsVec},
	}

	// Keep going after a failure so that the caller sees every registration
	// problem at once.
	var regErrs []error
	for _, kv := range toRegister {
		if regErr := reg.Register(kv.Value); regErr != nil {
			regErrs = append(regErrs, fmt.Errorf("registering metrics %q: %w", kv.Key, regErr))
		}
	}

	if err = errors.Join(regErrs...); err != nil {
		return nil, err
	}

	return m, nil
}

// HandleProfilesUpdate implements the [profiledb.Metrics] interface for
// *ProfileDB.
func (m *ProfileDB) HandleProfilesUpdate(_ context.Context, u *UpdateMetrics) { m.profilesSyncTime.SetToCurrentTime() m.profilesNewCount.Set(float64(u.ProfilesNum)) m.devicesNewCount.Set(float64(u.DevicesNum)) if u.IsSuccess { m.profilesSyncStatus.Set(1) } else { m.profilesSyncStatus.Set(0) } dur := u.Duration.Seconds() m.profilesSyncDuration.Observe(dur) if u.IsFullSync { m.profilesFullSyncDuration.Set(dur) } } // SetProfilesAndDevicesNum implements the [profilesdb.Metrics] interface for // *ProfileDB. func (m *ProfileDB) SetProfilesAndDevicesNum(_ context.Context, profNum, devNum uint) { m.profilesCount.Set(float64(profNum)) m.devicesCount.Set(float64(devNum)) } // IncrementSyncTimeouts implements the [profilesdb.Metrics] interface for // *ProfileDB. func (m *ProfileDB) IncrementSyncTimeouts(_ context.Context, isFullSync bool) { if isFullSync { m.profilesSyncFullTimeouts.Inc() } else { m.profilesSyncPartTimeouts.Inc() } } // IncrementDeleted implements the [profilesdb.Metrics] interface for // *ProfileDB. func (m *ProfileDB) IncrementDeleted(_ context.Context) { m.profilesDeletedTotal.Inc() } // BackendProfileDB is the Prometheus-based implementation of the // [backendpb.ProfileDBMetrics] interface. type BackendProfileDB struct { // devicesInvalidTotal is a gauge with the number of invalid user devices // loaded from the backend. devicesInvalidTotal prometheus.Counter // grpcAvgProfileRecvDuration is a histogram with the average duration of a // receive of a single profile during a backend call. grpcAvgProfileRecvDuration prometheus.Histogram // grpcAvgProfileDecDuration is a histogram with the average duration of // decoding a single profile during a backend call. grpcAvgProfileDecDuration prometheus.Histogram } // NewBackendProfileDB registers the protobuf errors metrics in reg and returns // a properly initialized [BackendProfileDB]. 
func NewBackendProfileDB( namespace string, reg prometheus.Registerer, ) (m *BackendProfileDB, err error) { const ( devicesInvalidTotal = "devices_invalid_total" grpcAvgProfileRecvDuration = "grpc_avg_profile_recv_duration_seconds" grpcAvgProfileDecDuration = "grpc_avg_profile_dec_duration_seconds" ) m = &BackendProfileDB{ devicesInvalidTotal: prometheus.NewCounter(prometheus.CounterOpts{ Name: devicesInvalidTotal, Subsystem: subsystemBackend, Namespace: namespace, Help: "The total number of invalid user devices loaded from the backend.", }), grpcAvgProfileRecvDuration: prometheus.NewHistogram(prometheus.HistogramOpts{ Name: grpcAvgProfileRecvDuration, Subsystem: subsystemBackend, Namespace: namespace, Help: "The average duration of a receive of a profile during a call to the backend, " + "in seconds.", Buckets: []float64{0.000_001, 0.000_010, 0.000_100, 0.001}, }), grpcAvgProfileDecDuration: prometheus.NewHistogram(prometheus.HistogramOpts{ Name: grpcAvgProfileDecDuration, Subsystem: subsystemBackend, Namespace: namespace, Help: "The average duration of decoding one profile during a call to the backend, " + "in seconds.", Buckets: []float64{0.000_001, 0.000_01, 0.000_1, 0.001}, }), } var errs []error collectors := container.KeyValues[string, prometheus.Collector]{{ Key: devicesInvalidTotal, Value: m.devicesInvalidTotal, }, { Key: grpcAvgProfileRecvDuration, Value: m.grpcAvgProfileRecvDuration, }, { Key: grpcAvgProfileDecDuration, Value: m.grpcAvgProfileDecDuration, }} for _, c := range collectors { err = reg.Register(c.Value) if err != nil { errs = append(errs, fmt.Errorf("registering metrics %q: %w", c.Key, err)) } } if err = errors.Join(errs...); err != nil { return nil, err } return m, nil } // IncrementInvalidDevicesCount implements the [backendpb.ProfileDBMetrics] // interface for BackendProfileDB. 
func (m *BackendProfileDB) IncrementInvalidDevicesCount(_ context.Context) {
	invalid := m.devicesInvalidTotal
	invalid.Inc()
}

// UpdateStats implements the [backendpb.ProfileDBMetrics] interface for
// *BackendProfileDB.  Both durations are recorded in seconds.
func (m *BackendProfileDB) UpdateStats(_ context.Context, avgRecv, avgDec time.Duration) {
	recvSeconds, decSeconds := avgRecv.Seconds(), avgDec.Seconds()

	m.grpcAvgProfileRecvDuration.Observe(recvSeconds)
	m.grpcAvgProfileDecDuration.Observe(decSeconds)
}