// Source: seaweedfs/weed/stats/metrics.go

package stats
import (
	"log"
	"net"
	"net/http"
	"os"
	"strconv"
	"strings"
	"sync"
	"time"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/collectors"
	"github.com/prometheus/client_golang/prometheus/promhttp"
	"github.com/prometheus/client_golang/prometheus/push"

	"github.com/seaweedfs/seaweedfs/weed/glog"
)
const (
	// Namespace is the Prometheus namespace used by the metrics declared in
	// this file.
	Namespace = "SeaweedFS"

	// Read-only volume type names; the same four values are collected in
	// readOnlyVolumeTypes.
	IsReadOnly       = "IsReadOnly"
	NoWriteOrDelete  = "noWriteOrDelete"
	NoWriteCanDelete = "noWriteCanDelete"
	IsDiskSpaceLow   = "isDiskSpaceLow"

	// bucketAtiveTTL is how long a bucket may stay without recorded activity
	// before bucketMetricTTLControl drops its per-bucket S3 metrics; it is also
	// the pause between two expiry scans. The identifier spelling (sic) is kept
	// because other code in this file refers to it by this name.
	bucketAtiveTTL = 10 * time.Minute
)

// readOnlyVolumeTypes lists every read-only volume type name declared above.
var readOnlyVolumeTypes = [4]string{IsReadOnly, NoWriteOrDelete, NoWriteCanDelete, IsDiskSpaceLow}

// bucketLastActiveTsNs maps a bucket name to the Unix-nanosecond timestamp of
// its most recently recorded activity (see RecordBucketActiveTime).
var bucketLastActiveTsNs = map[string]int64{}
var (
2020-10-01 03:59:39 +08:00
Gather = prometheus.NewRegistry()
2019-06-14 15:54:56 +08:00
2022-01-24 22:09:43 +08:00
MasterClientConnectCounter = prometheus.NewCounterVec(
prometheus.CounterOpts{
2022-10-07 19:20:34 +08:00
Namespace: Namespace,
2022-01-24 22:09:43 +08:00
Subsystem: "wdclient",
2022-01-25 17:42:47 +08:00
Name: "connect_updates",
2022-01-24 22:09:43 +08:00
Help: "Counter of master client leader updates.",
}, []string{"type"})
2022-01-24 23:13:07 +08:00
MasterRaftIsleader = prometheus.NewGauge(
prometheus.GaugeOpts{
2022-10-07 19:20:34 +08:00
Namespace: Namespace,
2022-01-24 23:13:07 +08:00
Subsystem: "master",
2022-01-25 17:42:47 +08:00
Name: "is_leader",
2022-01-24 23:13:07 +08:00
Help: "is leader",
})
MasterAdminLock = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Namespace: Namespace,
Subsystem: "master",
Name: "admin_lock",
Help: "admin lock",
}, []string{"client"})
2022-01-24 22:09:43 +08:00
MasterReceivedHeartbeatCounter = prometheus.NewCounterVec(
prometheus.CounterOpts{
2022-10-07 19:20:34 +08:00
Namespace: Namespace,
2022-01-24 22:09:43 +08:00
Subsystem: "master",
2022-01-25 17:42:47 +08:00
Name: "received_heartbeats",
2022-01-24 22:09:43 +08:00
Help: "Counter of master received heartbeat.",
}, []string{"type"})
MasterReplicaPlacementMismatch = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
2022-10-07 19:20:34 +08:00
Namespace: Namespace,
Subsystem: "master",
Name: "replica_placement_mismatch",
Help: "replica placement mismatch",
}, []string{"collection", "id"})
MasterVolumeLayoutWritable = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Namespace: Namespace,
Subsystem: "master",
Name: "volume_layout_writable",
Help: "Number of writable volumes in volume layouts",
}, []string{"collection", "disk", "rp", "ttl"})
MasterVolumeLayoutCrowded = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Namespace: Namespace,
Subsystem: "master",
Name: "volume_layout_crowded",
Help: "Number of crowded volumes in volume layouts",
}, []string{"collection", "disk", "rp", "ttl"})
MasterPickForWriteErrorCounter = prometheus.NewCounter(
prometheus.CounterOpts{
Namespace: Namespace,
Subsystem: "master",
Name: "pick_for_write_error",
Help: "Counter of master pick for write error",
})
MasterBroadcastToFullErrorCounter = prometheus.NewCounter(
prometheus.CounterOpts{
Namespace: Namespace,
Subsystem: "master",
Name: "broadcast_to_full",
Help: "Counter of master broadcast send to full message channel err",
})
2022-01-24 22:09:43 +08:00
MasterLeaderChangeCounter = prometheus.NewCounterVec(
prometheus.CounterOpts{
2022-10-07 19:20:34 +08:00
Namespace: Namespace,
2022-01-24 22:09:43 +08:00
Subsystem: "master",
2022-01-25 17:42:47 +08:00
Name: "leader_changes",
2022-01-24 22:09:43 +08:00
Help: "Counter of master leader changes.",
}, []string{"type"})
2019-06-16 03:21:44 +08:00
FilerRequestCounter = prometheus.NewCounterVec(
prometheus.CounterOpts{
2022-10-07 19:20:34 +08:00
Namespace: Namespace,
Subsystem: "filer",
Name: "request_total",
Help: "Counter of filer requests.",
}, []string{"type", "code"})
FilerHandlerCounter = prometheus.NewCounterVec(
prometheus.CounterOpts{
Namespace: Namespace,
Subsystem: "filer",
Name: "handler_total",
Help: "Counter of filer handlers.",
}, []string{"type"})
2019-06-16 03:21:44 +08:00
FilerRequestHistogram = prometheus.NewHistogramVec(
prometheus.HistogramOpts{
2022-10-07 19:20:34 +08:00
Namespace: Namespace,
Subsystem: "filer",
Name: "request_seconds",
Help: "Bucketed histogram of filer request processing time.",
2019-06-14 15:54:56 +08:00
Buckets: prometheus.ExponentialBuckets(0.0001, 2, 24),
}, []string{"type"})
FilerInFlightRequestsGauge = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Namespace: Namespace,
Subsystem: "filer",
Name: "in_flight_requests",
Help: "Current number of in-flight requests being handled by filer.",
}, []string{"type"})
FilerServerLastSendTsOfSubscribeGauge = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
2022-10-07 19:20:34 +08:00
Namespace: Namespace,
Subsystem: "filer",
Name: "last_send_timestamp_of_subscribe",
Help: "The last send timestamp of the filer subscription.",
}, []string{"sourceFiler", "clientName", "path"})
2019-06-23 03:23:25 +08:00
FilerStoreCounter = prometheus.NewCounterVec(
prometheus.CounterOpts{
2022-10-07 19:20:34 +08:00
Namespace: Namespace,
2019-06-23 03:23:25 +08:00
Subsystem: "filerStore",
Name: "request_total",
Help: "Counter of filer store requests.",
}, []string{"store", "type"})
FilerStoreHistogram = prometheus.NewHistogramVec(
prometheus.HistogramOpts{
2022-10-07 19:20:34 +08:00
Namespace: Namespace,
2019-06-23 03:23:25 +08:00
Subsystem: "filerStore",
Name: "request_seconds",
Help: "Bucketed histogram of filer store request processing time.",
Buckets: prometheus.ExponentialBuckets(0.0001, 2, 24),
}, []string{"store", "type"})
FilerSyncOffsetGauge = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
2022-10-07 19:20:34 +08:00
Namespace: Namespace,
Subsystem: "filerSync",
Name: "sync_offset",
Help: "The offset of the filer synchronization service.",
}, []string{"sourceFiler", "targetFiler", "clientName", "path"})
2019-06-16 03:21:44 +08:00
VolumeServerRequestCounter = prometheus.NewCounterVec(
2019-06-14 15:54:56 +08:00
prometheus.CounterOpts{
2022-10-07 19:20:34 +08:00
Namespace: Namespace,
2019-06-14 15:54:56 +08:00
Subsystem: "volumeServer",
Name: "request_total",
2019-06-24 15:26:03 +08:00
Help: "Counter of volume server requests.",
}, []string{"type", "code"})
VolumeServerHandlerCounter = prometheus.NewCounterVec(
prometheus.CounterOpts{
Namespace: Namespace,
Subsystem: "volumeServer",
Name: "handler_total",
Help: "Counter of volume server handlers.",
2019-06-14 15:54:56 +08:00
}, []string{"type"})
VolumeServerVacuumingCompactCounter = prometheus.NewCounterVec(
prometheus.CounterOpts{
Namespace: Namespace,
Subsystem: "volumeServer",
Name: "vacuuming_compact_count",
Help: "Counter of volume vacuuming Compact counter",
}, []string{"success"})
VolumeServerVacuumingCommitCounter = prometheus.NewCounterVec(
prometheus.CounterOpts{
Namespace: Namespace,
Subsystem: "volumeServer",
Name: "vacuuming_commit_count",
Help: "Counter of volume vacuuming commit counter",
}, []string{"success"})
VolumeServerVacuumingHistogram = prometheus.NewHistogramVec(
prometheus.HistogramOpts{
Namespace: Namespace,
Subsystem: "volumeServer",
Name: "vacuuming_seconds",
Help: "Bucketed histogram of volume server vacuuming processing time.",
Buckets: prometheus.ExponentialBuckets(0.0001, 2, 24),
}, []string{"type"})
2019-06-16 03:21:44 +08:00
VolumeServerRequestHistogram = prometheus.NewHistogramVec(
2019-06-14 15:54:56 +08:00
prometheus.HistogramOpts{
2022-10-07 19:20:34 +08:00
Namespace: Namespace,
2019-06-14 15:54:56 +08:00
Subsystem: "volumeServer",
Name: "request_seconds",
2019-06-24 15:26:03 +08:00
Help: "Bucketed histogram of volume server request processing time.",
2019-06-14 15:54:56 +08:00
Buckets: prometheus.ExponentialBuckets(0.0001, 2, 24),
}, []string{"type"})
2019-06-15 04:06:01 +08:00
VolumeServerInFlightRequestsGauge = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Namespace: Namespace,
Subsystem: "volumeServer",
Name: "in_flight_requests",
Help: "Current number of in-flight requests being handled by volume server.",
}, []string{"type"})
2024-04-17 19:49:50 +08:00
VolumeServerVolumeGauge = prometheus.NewGaugeVec(
2019-06-15 04:06:01 +08:00
prometheus.GaugeOpts{
2022-10-07 19:20:34 +08:00
Namespace: Namespace,
2019-06-15 04:06:01 +08:00
Subsystem: "volumeServer",
Name: "volumes",
2019-06-18 12:02:50 +08:00
Help: "Number of volumes or shards.",
}, []string{"collection", "type"})
2019-06-16 17:24:15 +08:00
2020-10-15 18:32:02 +08:00
VolumeServerReadOnlyVolumeGauge = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
2022-10-07 19:20:34 +08:00
Namespace: Namespace,
2020-10-15 18:32:02 +08:00
Subsystem: "volumeServer",
Name: "read_only_volumes",
Help: "Number of read only volumes.",
}, []string{"collection", "type"})
2019-06-18 12:02:50 +08:00
VolumeServerMaxVolumeCounter = prometheus.NewGauge(
2019-06-16 17:24:15 +08:00
prometheus.GaugeOpts{
2022-10-07 19:20:34 +08:00
Namespace: Namespace,
2019-06-16 17:24:15 +08:00
Subsystem: "volumeServer",
2019-06-21 00:56:49 +08:00
Name: "max_volumes",
2019-06-18 12:02:50 +08:00
Help: "Maximum number of volumes.",
2019-06-16 17:24:15 +08:00
})
VolumeServerDiskSizeGauge = prometheus.NewGaugeVec(
2019-06-16 17:44:20 +08:00
prometheus.GaugeOpts{
2022-10-07 19:20:34 +08:00
Namespace: Namespace,
2019-06-16 17:44:20 +08:00
Subsystem: "volumeServer",
Name: "total_disk_size",
2019-06-16 17:44:20 +08:00
Help: "Actual disk size used by volumes.",
}, []string{"collection", "type"})
2020-09-18 15:09:04 +08:00
VolumeServerResourceGauge = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
2022-10-07 19:20:34 +08:00
Namespace: Namespace,
Subsystem: "volumeServer",
Name: "resource",
Help: "Resource usage",
}, []string{"name", "type"})
2020-09-18 15:09:04 +08:00
S3RequestCounter = prometheus.NewCounterVec(
prometheus.CounterOpts{
2022-10-07 19:20:34 +08:00
Namespace: Namespace,
2020-09-18 15:09:04 +08:00
Subsystem: "s3",
Name: "request_total",
Help: "Counter of s3 requests.",
}, []string{"type", "code", "bucket"})
S3HandlerCounter = prometheus.NewCounterVec(
prometheus.CounterOpts{
Namespace: Namespace,
Subsystem: "s3",
Name: "handler_total",
Help: "Counter of s3 server handlers.",
}, []string{"type"})
2020-09-18 15:09:04 +08:00
S3RequestHistogram = prometheus.NewHistogramVec(
prometheus.HistogramOpts{
2022-10-07 19:20:34 +08:00
Namespace: Namespace,
2020-09-18 15:09:04 +08:00
Subsystem: "s3",
Name: "request_seconds",
Help: "Bucketed histogram of s3 request processing time.",
Buckets: prometheus.ExponentialBuckets(0.0001, 2, 24),
}, []string{"type", "bucket"})
S3TimeToFirstByteHistogram = prometheus.NewHistogramVec(
prometheus.HistogramOpts{
Namespace: Namespace,
Subsystem: "s3",
Name: "time_to_first_byte_millisecond",
Help: "Bucketed histogram of s3 time to first byte request processing time.",
Buckets: prometheus.ExponentialBuckets(0.001, 2, 27),
}, []string{"type", "bucket"})
S3InFlightRequestsGauge = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Namespace: Namespace,
Subsystem: "s3",
Name: "in_flight_requests",
Help: "Current number of in-flight requests being handled by s3.",
}, []string{"type"})
)
func init() {
2022-01-24 23:13:07 +08:00
Gather.MustRegister(MasterClientConnectCounter)
Gather.MustRegister(MasterRaftIsleader)
Gather.MustRegister(MasterAdminLock)
2022-01-24 23:13:07 +08:00
Gather.MustRegister(MasterReceivedHeartbeatCounter)
Gather.MustRegister(MasterLeaderChangeCounter)
Gather.MustRegister(MasterReplicaPlacementMismatch)
Gather.MustRegister(MasterVolumeLayoutWritable)
Gather.MustRegister(MasterVolumeLayoutCrowded)
Gather.MustRegister(MasterPickForWriteErrorCounter)
Gather.MustRegister(MasterBroadcastToFullErrorCounter)
2022-01-24 23:13:07 +08:00
2020-09-25 01:21:23 +08:00
Gather.MustRegister(FilerRequestCounter)
Gather.MustRegister(FilerHandlerCounter)
2020-09-25 01:21:23 +08:00
Gather.MustRegister(FilerRequestHistogram)
Gather.MustRegister(FilerInFlightRequestsGauge)
2020-09-25 01:21:23 +08:00
Gather.MustRegister(FilerStoreCounter)
Gather.MustRegister(FilerStoreHistogram)
Gather.MustRegister(FilerSyncOffsetGauge)
Gather.MustRegister(FilerServerLastSendTsOfSubscribeGauge)
2021-06-22 15:54:13 +08:00
Gather.MustRegister(collectors.NewGoCollector())
Gather.MustRegister(collectors.NewProcessCollector(collectors.ProcessCollectorOpts{}))
2019-06-14 15:54:56 +08:00
2020-09-25 01:21:23 +08:00
Gather.MustRegister(VolumeServerRequestCounter)
Gather.MustRegister(VolumeServerHandlerCounter)
2020-09-25 01:21:23 +08:00
Gather.MustRegister(VolumeServerRequestHistogram)
Gather.MustRegister(VolumeServerInFlightRequestsGauge)
Gather.MustRegister(VolumeServerVacuumingCompactCounter)
Gather.MustRegister(VolumeServerVacuumingCommitCounter)
Gather.MustRegister(VolumeServerVacuumingHistogram)
2024-04-17 19:49:50 +08:00
Gather.MustRegister(VolumeServerVolumeGauge)
2020-09-25 01:21:23 +08:00
Gather.MustRegister(VolumeServerMaxVolumeCounter)
2020-10-15 18:32:02 +08:00
Gather.MustRegister(VolumeServerReadOnlyVolumeGauge)
2020-09-25 01:21:23 +08:00
Gather.MustRegister(VolumeServerDiskSizeGauge)
Gather.MustRegister(VolumeServerResourceGauge)
2019-06-15 04:06:01 +08:00
2020-09-25 01:21:23 +08:00
Gather.MustRegister(S3RequestCounter)
Gather.MustRegister(S3HandlerCounter)
2020-09-25 01:21:23 +08:00
Gather.MustRegister(S3RequestHistogram)
Gather.MustRegister(S3InFlightRequestsGauge)
Gather.MustRegister(S3TimeToFirstByteHistogram)
go bucketMetricTTLControl()
2019-06-15 04:06:01 +08:00
}
2020-09-25 01:21:23 +08:00
func LoopPushingMetric(name, instance, addr string, intervalSeconds int) {
2020-09-16 16:39:30 +08:00
if addr == "" || intervalSeconds == 0 {
2019-06-24 06:29:49 +08:00
return
}
2020-09-21 06:38:59 +08:00
glog.V(0).Infof("%s server sends metrics to %s every %d seconds", name, addr, intervalSeconds)
2020-09-25 01:21:23 +08:00
pusher := push.New(addr, name).Gatherer(Gather).Grouping("instance", instance)
2019-06-14 15:54:56 +08:00
2019-06-15 04:06:01 +08:00
for {
2020-09-16 16:39:30 +08:00
err := pusher.Push()
if err != nil && !strings.HasPrefix(err.Error(), "unexpected status code 200") {
glog.V(0).Infof("could not push metrics to prometheus push gateway %s: %v", addr, err)
}
if intervalSeconds <= 0 {
intervalSeconds = 15
2019-06-15 04:06:01 +08:00
}
time.Sleep(time.Duration(intervalSeconds) * time.Second)
}
}
2019-06-16 12:46:55 +08:00
// JoinHostPort builds a "host:port" address. A host that is already wrapped in
// square brackets (for example "[::1]") is used as is, so it does not get
// bracketed a second time; every other host goes through net.JoinHostPort,
// which adds brackets where needed.
func JoinHostPort(host string, port int) string {
	p := strconv.Itoa(port)

	alreadyBracketed := strings.HasPrefix(host, "[") && strings.HasSuffix(host, "]")
	if !alreadyBracketed {
		return net.JoinHostPort(host, p)
	}
	return host + ":" + p
}
func StartMetricsServer(ip string, port int) {
2020-09-24 20:45:39 +08:00
if port == 0 {
return
}
2020-09-25 01:21:23 +08:00
http.Handle("/metrics", promhttp.HandlerFor(Gather, promhttp.HandlerOpts{}))
log.Fatal(http.ListenAndServe(JoinHostPort(ip, port), nil))
2020-09-24 20:45:39 +08:00
}
// SourceName returns "<hostname>:<port>" for the local machine, using
// os.Hostname for the host part. It returns "unknown" when the hostname cannot
// be determined.
func SourceName(port uint32) string {
	hostname, err := os.Hostname()
	if err != nil {
		return "unknown"
	}
	return net.JoinHostPort(hostname, strconv.Itoa(int(port)))
}
// bucketLastActiveLock guards bucketLastActiveTsNs. The map is written by
// RecordBucketActiveTime and concurrently scanned and pruned by the
// bucketMetricTTLControl goroutine; unsynchronised concurrent map access is a
// fatal runtime error in Go.
var bucketLastActiveLock sync.Mutex

// RecordBucketActiveTime stamps bucket with the current time so that its
// per-bucket metrics are not expired by bucketMetricTTLControl.
// It is safe for concurrent use.
func RecordBucketActiveTime(bucket string) {
	bucketLastActiveLock.Lock()
	bucketLastActiveTsNs[bucket] = time.Now().UnixNano()
	bucketLastActiveLock.Unlock()
}

// DeleteCollectionMetrics removes, from every collection-labelled gauge in
// this file, all series whose "collection" label equals collection, and logs
// how many series were deleted.
func DeleteCollectionMetrics(collection string) {
	labels := prometheus.Labels{"collection": collection}
	c := MasterReplicaPlacementMismatch.DeletePartialMatch(labels)
	c += MasterVolumeLayoutWritable.DeletePartialMatch(labels)
	c += MasterVolumeLayoutCrowded.DeletePartialMatch(labels)
	c += VolumeServerDiskSizeGauge.DeletePartialMatch(labels)
	c += VolumeServerVolumeGauge.DeletePartialMatch(labels)
	c += VolumeServerReadOnlyVolumeGauge.DeletePartialMatch(labels)
	glog.V(0).Infof("delete collection metrics, %s: %d", collection, c)
}

// bucketMetricTTLControl loops forever: once per bucketAtiveTTL it drops the
// per-bucket S3 series of every bucket whose last recorded activity is older
// than bucketAtiveTTL, and forgets that bucket's timestamp.
func bucketMetricTTLControl() {
	ttlNs := bucketAtiveTTL.Nanoseconds()
	for {
		now := time.Now().UnixNano()

		// Hold the lock for the whole scan so RecordBucketActiveTime cannot
		// write to the map while it is being ranged over. Deleting the entry
		// currently being visited is allowed during a range.
		bucketLastActiveLock.Lock()
		for bucket, ts := range bucketLastActiveTsNs {
			if now-ts <= ttlNs {
				continue
			}
			delete(bucketLastActiveTsNs, bucket)

			labels := prometheus.Labels{"bucket": bucket}
			c := S3RequestCounter.DeletePartialMatch(labels)
			c += S3RequestHistogram.DeletePartialMatch(labels)
			c += S3TimeToFirstByteHistogram.DeletePartialMatch(labels)
			glog.V(0).Infof("delete inactive bucket metrics, %s: %d", bucket, c)
		}
		bucketLastActiveLock.Unlock()

		time.Sleep(bucketAtiveTTL)
	}
}