mirror of
https://github.com/AdguardTeam/AdGuardDNS.git
synced 2025-02-20 11:23:36 +08:00
348 lines
11 KiB
Go
348 lines
11 KiB
Go
package metrics
|
|
|
|
import (
|
|
"context"
|
|
"fmt"
|
|
"time"
|
|
|
|
"github.com/AdguardTeam/golibs/container"
|
|
"github.com/AdguardTeam/golibs/errors"
|
|
"github.com/prometheus/client_golang/prometheus"
|
|
)
|
|
|
|
// UpdateMetrics is an alias for the structure that carries the results of a
// single user profiles update operation.
//
// NOTE: This is an alias of an unnamed struct type, so it is presumably meant
// to be identical to [profiledb.UpdateMetrics]; keep the order, names, and
// types of the fields in sync with that declaration.
//
// See [profiledb.UpdateMetrics].
type UpdateMetrics = struct {
	// Duration is the time spent on the update.  It is reported in seconds by
	// [ProfileDB.HandleProfilesUpdate].
	Duration time.Duration

	// ProfilesNum and DevicesNum are the numbers of profiles and devices that
	// [ProfileDB.HandleProfilesUpdate] reports as the newly-synced gauges.
	ProfilesNum, DevicesNum uint

	// IsSuccess tells whether the update has finished without an error, and
	// IsFullSync tells whether the update was a full sync as opposed to a
	// partial one.
	IsSuccess, IsFullSync bool
}
|
|
|
|
// ProfileDB is the Prometheus-based implementation of the [profiledb.Metrics]
|
|
// interface.
|
|
type ProfileDB struct {
|
|
// devicesCount is a gauge with the total number of user devices loaded from
|
|
// the backend.
|
|
devicesCount prometheus.Gauge
|
|
|
|
// devicesNewCount is a gauge with the number of user devices downloaded
|
|
// during the last sync.
|
|
devicesNewCount prometheus.Gauge
|
|
|
|
// profilesCount is a gauge with the total number of user profiles loaded
|
|
// from the backend.
|
|
profilesCount prometheus.Gauge
|
|
|
|
// profilesNewCount is a gauge with the number of user profiles downloaded
|
|
// during the last sync.
|
|
profilesNewCount prometheus.Gauge
|
|
|
|
// profilesDeletedTotal is a counter with the total number of user profiles
|
|
// marked as deleted which have been loaded from the backend.
|
|
//
|
|
// TODO(d.kolyshev): Add a metric for deleted devices.
|
|
profilesDeletedTotal prometheus.Counter
|
|
|
|
// profilesSyncTime is a gauge with the timestamp when the profiles were
|
|
// synced last time.
|
|
profilesSyncTime prometheus.Gauge
|
|
|
|
// profilesSyncStatus is a gauge with the profiles sync status. Set it to 1
|
|
// if the sync was successful. Otherwise, set it to 0.
|
|
profilesSyncStatus prometheus.Gauge
|
|
|
|
// profilesSyncDuration is a histogram with the duration of a profiles sync.
|
|
profilesSyncDuration prometheus.Histogram
|
|
|
|
// profilesFullSyncDuration is a gauge with the duration of the last full
|
|
// sync. It is a gauge because full syncs are not expected to be common.
|
|
profilesFullSyncDuration prometheus.Gauge
|
|
|
|
// profilesSyncFullTimeouts is a gauge with the total number of timeout
|
|
// errors occurred during full profiles sync.
|
|
profilesSyncFullTimeouts prometheus.Gauge
|
|
|
|
// profilesSyncPartTimeouts is a gauge with the total number of timeout
|
|
// errors occurred during partial profiles sync.
|
|
profilesSyncPartTimeouts prometheus.Gauge
|
|
}
|
|
|
|
// NewProfileDB registers the user profiles metrics in reg and returns a
|
|
// properly initialized [ProfileDB].
|
|
func NewProfileDB(namespace string, reg prometheus.Registerer) (m *ProfileDB, err error) {
|
|
const (
|
|
devicesCount = "devices_total"
|
|
devicesNewCount = "devices_newly_synced_total"
|
|
profilesCount = "profiles_total"
|
|
profilesNewCount = "profiles_newly_synced_total"
|
|
profilesDeletedTotal = "profiles_deleted_total"
|
|
profilesSyncTime = "profiles_sync_timestamp"
|
|
profilesSyncStatus = "profiles_sync_status"
|
|
profilesSyncDuration = "profiles_sync_duration_seconds"
|
|
profilesFullSyncDuration = "profiles_full_sync_duration_seconds"
|
|
profilesSyncTimeouts = "profiles_sync_timeouts_total"
|
|
)
|
|
|
|
// profilesSyncTimeoutsGaugeVec is a gauge with the total number of timeout
|
|
// errors occurred during profiles sync, either full or partial.
|
|
profilesSyncTimeoutsGaugeVec := prometheus.NewGaugeVec(prometheus.GaugeOpts{
|
|
Name: profilesSyncTimeouts,
|
|
Namespace: namespace,
|
|
Subsystem: subsystemBackend,
|
|
Help: "The total number of timeout errors during profiles sync.",
|
|
}, []string{"is_full_sync"})
|
|
|
|
m = &ProfileDB{
|
|
devicesCount: prometheus.NewGauge(prometheus.GaugeOpts{
|
|
Name: devicesCount,
|
|
Subsystem: subsystemBackend,
|
|
Namespace: namespace,
|
|
Help: "The total number of user devices loaded from the backend.",
|
|
}),
|
|
devicesNewCount: prometheus.NewGauge(prometheus.GaugeOpts{
|
|
Name: devicesNewCount,
|
|
Subsystem: subsystemBackend,
|
|
Namespace: namespace,
|
|
Help: "The number of user devices that were changed or added since " +
|
|
"the previous sync.",
|
|
}),
|
|
profilesCount: prometheus.NewGauge(prometheus.GaugeOpts{
|
|
Name: profilesCount,
|
|
Subsystem: subsystemBackend,
|
|
Namespace: namespace,
|
|
Help: "The total number of user profiles loaded from the backend.",
|
|
}),
|
|
profilesNewCount: prometheus.NewGauge(prometheus.GaugeOpts{
|
|
Name: profilesNewCount,
|
|
Subsystem: subsystemBackend,
|
|
Namespace: namespace,
|
|
Help: "The number of user profiles that were changed or added since " +
|
|
"the previous sync.",
|
|
}),
|
|
profilesDeletedTotal: prometheus.NewCounter(prometheus.CounterOpts{
|
|
Name: profilesDeletedTotal,
|
|
Subsystem: subsystemBackend,
|
|
Namespace: namespace,
|
|
Help: "The total number of deleted user profiles loaded from the backend.",
|
|
}),
|
|
profilesSyncTime: prometheus.NewGauge(prometheus.GaugeOpts{
|
|
Name: profilesSyncTime,
|
|
Subsystem: subsystemBackend,
|
|
Namespace: namespace,
|
|
Help: "The time when the user profiles were synced last time.",
|
|
}),
|
|
profilesSyncStatus: prometheus.NewGauge(prometheus.GaugeOpts{
|
|
Name: profilesSyncStatus,
|
|
Subsystem: subsystemBackend,
|
|
Namespace: namespace,
|
|
Help: "Status of the last profiles sync. 1 is okay, 0 means there was an error",
|
|
}),
|
|
profilesSyncDuration: prometheus.NewHistogram(prometheus.HistogramOpts{
|
|
Name: profilesSyncDuration,
|
|
Subsystem: subsystemBackend,
|
|
Namespace: namespace,
|
|
Help: "Time elapsed on syncing user profiles with the backend.",
|
|
// Profiles sync may take some time since the list of users may be
|
|
// massive. This is why the buckets go up to 240 seconds.
|
|
Buckets: []float64{0.01, 0.1, 1, 5, 10, 30, 60, 120, 240},
|
|
}),
|
|
profilesFullSyncDuration: prometheus.NewGauge(prometheus.GaugeOpts{
|
|
Name: profilesFullSyncDuration,
|
|
Subsystem: subsystemBackend,
|
|
Namespace: namespace,
|
|
Help: "Time elapsed on fully syncing user profiles with the backend, in seconds.",
|
|
}),
|
|
profilesSyncFullTimeouts: profilesSyncTimeoutsGaugeVec.With(prometheus.Labels{
|
|
"is_full_sync": "1",
|
|
}),
|
|
profilesSyncPartTimeouts: profilesSyncTimeoutsGaugeVec.With(prometheus.Labels{
|
|
"is_full_sync": "0",
|
|
}),
|
|
}
|
|
|
|
collectors := container.KeyValues[string, prometheus.Collector]{{
|
|
Key: devicesCount,
|
|
Value: m.devicesCount,
|
|
}, {
|
|
Key: devicesNewCount,
|
|
Value: m.devicesNewCount,
|
|
}, {
|
|
Key: profilesCount,
|
|
Value: m.profilesCount,
|
|
}, {
|
|
Key: profilesNewCount,
|
|
Value: m.profilesNewCount,
|
|
}, {
|
|
Key: profilesDeletedTotal,
|
|
Value: m.profilesDeletedTotal,
|
|
}, {
|
|
Key: profilesSyncTime,
|
|
Value: m.profilesSyncTime,
|
|
}, {
|
|
Key: profilesSyncStatus,
|
|
Value: m.profilesSyncStatus,
|
|
}, {
|
|
Key: profilesSyncDuration,
|
|
Value: m.profilesSyncDuration,
|
|
}, {
|
|
Key: profilesFullSyncDuration,
|
|
Value: m.profilesFullSyncDuration,
|
|
}, {
|
|
Key: profilesSyncTimeouts,
|
|
Value: profilesSyncTimeoutsGaugeVec,
|
|
}}
|
|
|
|
var errs []error
|
|
for _, c := range collectors {
|
|
err = reg.Register(c.Value)
|
|
if err != nil {
|
|
errs = append(errs, fmt.Errorf("registering metrics %q: %w", c.Key, err))
|
|
}
|
|
}
|
|
|
|
if err = errors.Join(errs...); err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
return m, nil
|
|
}
|
|
|
|
// HandleProfilesUpdate implements the [profilesdb.Metrics] interface for
|
|
// *ProfileDB.
|
|
func (m *ProfileDB) HandleProfilesUpdate(_ context.Context, u *UpdateMetrics) {
|
|
m.profilesSyncTime.SetToCurrentTime()
|
|
m.profilesNewCount.Set(float64(u.ProfilesNum))
|
|
m.devicesNewCount.Set(float64(u.DevicesNum))
|
|
|
|
if u.IsSuccess {
|
|
m.profilesSyncStatus.Set(1)
|
|
} else {
|
|
m.profilesSyncStatus.Set(0)
|
|
}
|
|
|
|
dur := u.Duration.Seconds()
|
|
m.profilesSyncDuration.Observe(dur)
|
|
if u.IsFullSync {
|
|
m.profilesFullSyncDuration.Set(dur)
|
|
}
|
|
}
|
|
|
|
// SetProfilesAndDevicesNum implements the [profilesdb.Metrics] interface for
|
|
// *ProfileDB.
|
|
func (m *ProfileDB) SetProfilesAndDevicesNum(_ context.Context, profNum, devNum uint) {
|
|
m.profilesCount.Set(float64(profNum))
|
|
m.devicesCount.Set(float64(devNum))
|
|
}
|
|
|
|
// IncrementSyncTimeouts implements the [profilesdb.Metrics] interface for
|
|
// *ProfileDB.
|
|
func (m *ProfileDB) IncrementSyncTimeouts(_ context.Context, isFullSync bool) {
|
|
if isFullSync {
|
|
m.profilesSyncFullTimeouts.Inc()
|
|
} else {
|
|
m.profilesSyncPartTimeouts.Inc()
|
|
}
|
|
}
|
|
|
|
// IncrementDeleted implements the [profilesdb.Metrics] interface for
|
|
// *ProfileDB.
|
|
func (m *ProfileDB) IncrementDeleted(_ context.Context) {
|
|
m.profilesDeletedTotal.Inc()
|
|
}
|
|
|
|
// BackendProfileDB is the Prometheus-based implementation of the
|
|
// [backendpb.ProfileDBMetrics] interface.
|
|
type BackendProfileDB struct {
|
|
// devicesInvalidTotal is a gauge with the number of invalid user devices
|
|
// loaded from the backend.
|
|
devicesInvalidTotal prometheus.Counter
|
|
|
|
// grpcAvgProfileRecvDuration is a histogram with the average duration of a
|
|
// receive of a single profile during a backend call.
|
|
grpcAvgProfileRecvDuration prometheus.Histogram
|
|
|
|
// grpcAvgProfileDecDuration is a histogram with the average duration of
|
|
// decoding a single profile during a backend call.
|
|
grpcAvgProfileDecDuration prometheus.Histogram
|
|
}
|
|
|
|
// NewBackendProfileDB registers the protobuf errors metrics in reg and returns
|
|
// a properly initialized [BackendProfileDB].
|
|
func NewBackendProfileDB(
|
|
namespace string,
|
|
reg prometheus.Registerer,
|
|
) (m *BackendProfileDB, err error) {
|
|
const (
|
|
devicesInvalidTotal = "devices_invalid_total"
|
|
grpcAvgProfileRecvDuration = "grpc_avg_profile_recv_duration_seconds"
|
|
grpcAvgProfileDecDuration = "grpc_avg_profile_dec_duration_seconds"
|
|
)
|
|
|
|
m = &BackendProfileDB{
|
|
devicesInvalidTotal: prometheus.NewCounter(prometheus.CounterOpts{
|
|
Name: devicesInvalidTotal,
|
|
Subsystem: subsystemBackend,
|
|
Namespace: namespace,
|
|
Help: "The total number of invalid user devices loaded from the backend.",
|
|
}),
|
|
grpcAvgProfileRecvDuration: prometheus.NewHistogram(prometheus.HistogramOpts{
|
|
Name: grpcAvgProfileRecvDuration,
|
|
Subsystem: subsystemBackend,
|
|
Namespace: namespace,
|
|
Help: "The average duration of a receive of a profile during a call to the backend, " +
|
|
"in seconds.",
|
|
Buckets: []float64{0.000_001, 0.000_010, 0.000_100, 0.001},
|
|
}),
|
|
grpcAvgProfileDecDuration: prometheus.NewHistogram(prometheus.HistogramOpts{
|
|
Name: grpcAvgProfileDecDuration,
|
|
Subsystem: subsystemBackend,
|
|
Namespace: namespace,
|
|
Help: "The average duration of decoding one profile during a call to the backend, " +
|
|
"in seconds.",
|
|
Buckets: []float64{0.000_001, 0.000_01, 0.000_1, 0.001},
|
|
}),
|
|
}
|
|
|
|
var errs []error
|
|
collectors := container.KeyValues[string, prometheus.Collector]{{
|
|
Key: devicesInvalidTotal,
|
|
Value: m.devicesInvalidTotal,
|
|
}, {
|
|
Key: grpcAvgProfileRecvDuration,
|
|
Value: m.grpcAvgProfileRecvDuration,
|
|
}, {
|
|
Key: grpcAvgProfileDecDuration,
|
|
Value: m.grpcAvgProfileDecDuration,
|
|
}}
|
|
|
|
for _, c := range collectors {
|
|
err = reg.Register(c.Value)
|
|
if err != nil {
|
|
errs = append(errs, fmt.Errorf("registering metrics %q: %w", c.Key, err))
|
|
}
|
|
}
|
|
|
|
if err = errors.Join(errs...); err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
return m, nil
|
|
}
|
|
|
|
// IncrementInvalidDevicesCount implements the [backendpb.ProfileDBMetrics]
|
|
// interface for BackendProfileDB.
|
|
func (m *BackendProfileDB) IncrementInvalidDevicesCount(_ context.Context) {
|
|
m.devicesInvalidTotal.Inc()
|
|
}
|
|
|
|
// UpdateStats implements the [backendpb.ProfileDBMetrics] interface for
|
|
// BackendProfileDB.
|
|
func (m *BackendProfileDB) UpdateStats(_ context.Context, avgRecv, avgDec time.Duration) {
|
|
m.grpcAvgProfileRecvDuration.Observe(avgRecv.Seconds())
|
|
m.grpcAvgProfileDecDuration.Observe(avgDec.Seconds())
|
|
}
|