AdGuardDNS/internal/metrics/profiledb.go
Andrey Meshkov f1791135af Sync v2.11.0
2024-12-05 14:19:25 +03:00

348 lines
11 KiB
Go

package metrics
import (
"context"
"fmt"
"time"
"github.com/AdguardTeam/golibs/container"
"github.com/AdguardTeam/golibs/errors"
"github.com/prometheus/client_golang/prometheus"
)
// UpdateMetrics is an alias for a structure that contains the information about
// a user profiles update operation.
//
// See [profiledb.UpdateMetrics].
type UpdateMetrics = struct {
Duration time.Duration
ProfilesNum uint
DevicesNum uint
IsSuccess bool
IsFullSync bool
}
// ProfileDB is the Prometheus-based implementation of the [profiledb.Metrics]
// interface.
type ProfileDB struct {
// devicesCount is a gauge with the total number of user devices loaded from
// the backend.
devicesCount prometheus.Gauge
// devicesNewCount is a gauge with the number of user devices downloaded
// during the last sync.
devicesNewCount prometheus.Gauge
// profilesCount is a gauge with the total number of user profiles loaded
// from the backend.
profilesCount prometheus.Gauge
// profilesNewCount is a gauge with the number of user profiles downloaded
// during the last sync.
profilesNewCount prometheus.Gauge
// profilesDeletedTotal is a counter with the total number of user profiles
// marked as deleted which have been loaded from the backend.
//
// TODO(d.kolyshev): Add a metric for deleted devices.
profilesDeletedTotal prometheus.Counter
// profilesSyncTime is a gauge with the timestamp when the profiles were
// synced last time.
profilesSyncTime prometheus.Gauge
// profilesSyncStatus is a gauge with the profiles sync status. Set it to 1
// if the sync was successful. Otherwise, set it to 0.
profilesSyncStatus prometheus.Gauge
// profilesSyncDuration is a histogram with the duration of a profiles sync.
profilesSyncDuration prometheus.Histogram
// profilesFullSyncDuration is a gauge with the duration of the last full
// sync. It is a gauge because full syncs are not expected to be common.
profilesFullSyncDuration prometheus.Gauge
// profilesSyncFullTimeouts is a gauge with the total number of timeout
// errors occurred during full profiles sync.
profilesSyncFullTimeouts prometheus.Gauge
// profilesSyncPartTimeouts is a gauge with the total number of timeout
// errors occurred during partial profiles sync.
profilesSyncPartTimeouts prometheus.Gauge
}
// NewProfileDB registers the user profiles metrics in reg and returns a
// properly initialized [ProfileDB].
func NewProfileDB(namespace string, reg prometheus.Registerer) (m *ProfileDB, err error) {
const (
devicesCount = "devices_total"
devicesNewCount = "devices_newly_synced_total"
profilesCount = "profiles_total"
profilesNewCount = "profiles_newly_synced_total"
profilesDeletedTotal = "profiles_deleted_total"
profilesSyncTime = "profiles_sync_timestamp"
profilesSyncStatus = "profiles_sync_status"
profilesSyncDuration = "profiles_sync_duration_seconds"
profilesFullSyncDuration = "profiles_full_sync_duration_seconds"
profilesSyncTimeouts = "profiles_sync_timeouts_total"
)
// profilesSyncTimeoutsGaugeVec is a gauge with the total number of timeout
// errors occurred during profiles sync, either full or partial.
profilesSyncTimeoutsGaugeVec := prometheus.NewGaugeVec(prometheus.GaugeOpts{
Name: profilesSyncTimeouts,
Namespace: namespace,
Subsystem: subsystemBackend,
Help: "The total number of timeout errors during profiles sync.",
}, []string{"is_full_sync"})
m = &ProfileDB{
devicesCount: prometheus.NewGauge(prometheus.GaugeOpts{
Name: devicesCount,
Subsystem: subsystemBackend,
Namespace: namespace,
Help: "The total number of user devices loaded from the backend.",
}),
devicesNewCount: prometheus.NewGauge(prometheus.GaugeOpts{
Name: devicesNewCount,
Subsystem: subsystemBackend,
Namespace: namespace,
Help: "The number of user devices that were changed or added since " +
"the previous sync.",
}),
profilesCount: prometheus.NewGauge(prometheus.GaugeOpts{
Name: profilesCount,
Subsystem: subsystemBackend,
Namespace: namespace,
Help: "The total number of user profiles loaded from the backend.",
}),
profilesNewCount: prometheus.NewGauge(prometheus.GaugeOpts{
Name: profilesNewCount,
Subsystem: subsystemBackend,
Namespace: namespace,
Help: "The number of user profiles that were changed or added since " +
"the previous sync.",
}),
profilesDeletedTotal: prometheus.NewCounter(prometheus.CounterOpts{
Name: profilesDeletedTotal,
Subsystem: subsystemBackend,
Namespace: namespace,
Help: "The total number of deleted user profiles loaded from the backend.",
}),
profilesSyncTime: prometheus.NewGauge(prometheus.GaugeOpts{
Name: profilesSyncTime,
Subsystem: subsystemBackend,
Namespace: namespace,
Help: "The time when the user profiles were synced last time.",
}),
profilesSyncStatus: prometheus.NewGauge(prometheus.GaugeOpts{
Name: profilesSyncStatus,
Subsystem: subsystemBackend,
Namespace: namespace,
Help: "Status of the last profiles sync. 1 is okay, 0 means there was an error",
}),
profilesSyncDuration: prometheus.NewHistogram(prometheus.HistogramOpts{
Name: profilesSyncDuration,
Subsystem: subsystemBackend,
Namespace: namespace,
Help: "Time elapsed on syncing user profiles with the backend.",
// Profiles sync may take some time since the list of users may be
// massive. This is why the buckets go up to 240 seconds.
Buckets: []float64{0.01, 0.1, 1, 5, 10, 30, 60, 120, 240},
}),
profilesFullSyncDuration: prometheus.NewGauge(prometheus.GaugeOpts{
Name: profilesFullSyncDuration,
Subsystem: subsystemBackend,
Namespace: namespace,
Help: "Time elapsed on fully syncing user profiles with the backend, in seconds.",
}),
profilesSyncFullTimeouts: profilesSyncTimeoutsGaugeVec.With(prometheus.Labels{
"is_full_sync": "1",
}),
profilesSyncPartTimeouts: profilesSyncTimeoutsGaugeVec.With(prometheus.Labels{
"is_full_sync": "0",
}),
}
collectors := container.KeyValues[string, prometheus.Collector]{{
Key: devicesCount,
Value: m.devicesCount,
}, {
Key: devicesNewCount,
Value: m.devicesNewCount,
}, {
Key: profilesCount,
Value: m.profilesCount,
}, {
Key: profilesNewCount,
Value: m.profilesNewCount,
}, {
Key: profilesDeletedTotal,
Value: m.profilesDeletedTotal,
}, {
Key: profilesSyncTime,
Value: m.profilesSyncTime,
}, {
Key: profilesSyncStatus,
Value: m.profilesSyncStatus,
}, {
Key: profilesSyncDuration,
Value: m.profilesSyncDuration,
}, {
Key: profilesFullSyncDuration,
Value: m.profilesFullSyncDuration,
}, {
Key: profilesSyncTimeouts,
Value: profilesSyncTimeoutsGaugeVec,
}}
var errs []error
for _, c := range collectors {
err = reg.Register(c.Value)
if err != nil {
errs = append(errs, fmt.Errorf("registering metrics %q: %w", c.Key, err))
}
}
if err = errors.Join(errs...); err != nil {
return nil, err
}
return m, nil
}
// HandleProfilesUpdate implements the [profilesdb.Metrics] interface for
// *ProfileDB.
func (m *ProfileDB) HandleProfilesUpdate(_ context.Context, u *UpdateMetrics) {
m.profilesSyncTime.SetToCurrentTime()
m.profilesNewCount.Set(float64(u.ProfilesNum))
m.devicesNewCount.Set(float64(u.DevicesNum))
if u.IsSuccess {
m.profilesSyncStatus.Set(1)
} else {
m.profilesSyncStatus.Set(0)
}
dur := u.Duration.Seconds()
m.profilesSyncDuration.Observe(dur)
if u.IsFullSync {
m.profilesFullSyncDuration.Set(dur)
}
}
// SetProfilesAndDevicesNum implements the [profilesdb.Metrics] interface for
// *ProfileDB.
func (m *ProfileDB) SetProfilesAndDevicesNum(_ context.Context, profNum, devNum uint) {
m.profilesCount.Set(float64(profNum))
m.devicesCount.Set(float64(devNum))
}
// IncrementSyncTimeouts implements the [profilesdb.Metrics] interface for
// *ProfileDB.
func (m *ProfileDB) IncrementSyncTimeouts(_ context.Context, isFullSync bool) {
if isFullSync {
m.profilesSyncFullTimeouts.Inc()
} else {
m.profilesSyncPartTimeouts.Inc()
}
}
// IncrementDeleted implements the [profilesdb.Metrics] interface for
// *ProfileDB.
func (m *ProfileDB) IncrementDeleted(_ context.Context) {
m.profilesDeletedTotal.Inc()
}
// BackendProfileDB is the Prometheus-based implementation of the
// [backendpb.ProfileDBMetrics] interface.
type BackendProfileDB struct {
// devicesInvalidTotal is a gauge with the number of invalid user devices
// loaded from the backend.
devicesInvalidTotal prometheus.Counter
// grpcAvgProfileRecvDuration is a histogram with the average duration of a
// receive of a single profile during a backend call.
grpcAvgProfileRecvDuration prometheus.Histogram
// grpcAvgProfileDecDuration is a histogram with the average duration of
// decoding a single profile during a backend call.
grpcAvgProfileDecDuration prometheus.Histogram
}
// NewBackendProfileDB registers the protobuf errors metrics in reg and returns
// a properly initialized [BackendProfileDB].
func NewBackendProfileDB(
namespace string,
reg prometheus.Registerer,
) (m *BackendProfileDB, err error) {
const (
devicesInvalidTotal = "devices_invalid_total"
grpcAvgProfileRecvDuration = "grpc_avg_profile_recv_duration_seconds"
grpcAvgProfileDecDuration = "grpc_avg_profile_dec_duration_seconds"
)
m = &BackendProfileDB{
devicesInvalidTotal: prometheus.NewCounter(prometheus.CounterOpts{
Name: devicesInvalidTotal,
Subsystem: subsystemBackend,
Namespace: namespace,
Help: "The total number of invalid user devices loaded from the backend.",
}),
grpcAvgProfileRecvDuration: prometheus.NewHistogram(prometheus.HistogramOpts{
Name: grpcAvgProfileRecvDuration,
Subsystem: subsystemBackend,
Namespace: namespace,
Help: "The average duration of a receive of a profile during a call to the backend, " +
"in seconds.",
Buckets: []float64{0.000_001, 0.000_010, 0.000_100, 0.001},
}),
grpcAvgProfileDecDuration: prometheus.NewHistogram(prometheus.HistogramOpts{
Name: grpcAvgProfileDecDuration,
Subsystem: subsystemBackend,
Namespace: namespace,
Help: "The average duration of decoding one profile during a call to the backend, " +
"in seconds.",
Buckets: []float64{0.000_001, 0.000_01, 0.000_1, 0.001},
}),
}
var errs []error
collectors := container.KeyValues[string, prometheus.Collector]{{
Key: devicesInvalidTotal,
Value: m.devicesInvalidTotal,
}, {
Key: grpcAvgProfileRecvDuration,
Value: m.grpcAvgProfileRecvDuration,
}, {
Key: grpcAvgProfileDecDuration,
Value: m.grpcAvgProfileDecDuration,
}}
for _, c := range collectors {
err = reg.Register(c.Value)
if err != nil {
errs = append(errs, fmt.Errorf("registering metrics %q: %w", c.Key, err))
}
}
if err = errors.Join(errs...); err != nil {
return nil, err
}
return m, nil
}
// IncrementInvalidDevicesCount implements the [backendpb.ProfileDBMetrics]
// interface for BackendProfileDB.
func (m *BackendProfileDB) IncrementInvalidDevicesCount(_ context.Context) {
m.devicesInvalidTotal.Inc()
}
// UpdateStats implements the [backendpb.ProfileDBMetrics] interface for
// BackendProfileDB.
func (m *BackendProfileDB) UpdateStats(_ context.Context, avgRecv, avgDec time.Duration) {
m.grpcAvgProfileRecvDuration.Observe(avgRecv.Seconds())
m.grpcAvgProfileDecDuration.Observe(avgDec.Seconds())
}