This repository has been archived by the owner on Apr 18, 2024. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 2
Revamp caboose metrics #41
Merged
Merged
Changes from all commits
Commits
Show all changes
11 commits
Select commit
Hold shift + click to select a range
fc2b20c
revamp caboose metrics
aarshkshah1992 4d65686
fix CI
aarshkshah1992 ed65721
address lidel review
aarshkshah1992 910252d
metrics collection
aarshkshah1992 f33ee6c
Merge remote-tracking branch 'origin/main' into feat/metrics-collecti…
aarshkshah1992 88ef8ba
ttfb metrics
aarshkshah1992 3565ec6
wrap up metrics
aarshkshah1992 67d62dd
Merge branch 'lock-cleanup' into feat/metrics-collection-work
aarshkshah1992 efa828c
refactor: explicit names of block metrics
lidel 5804ca7
fix: switch blockSizeHistogram to LinearBuckets
lidel 707e9dd
fix: decouple buckets from app code
lidel File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -5,6 +5,17 @@ import ( | |
) | ||
|
||
var ( | ||
// Size buckets from 256 KiB (default chunk in Kubo) to 4MiB (maxBlockSize), 256 KiB wide each | ||
blockSizeHistogram = prometheus.LinearBuckets(262144, 262144, 16) | ||
|
||
// TODO: Speed max bucket could use some further refinement, | ||
// for now we don't expect speed to be higher than transferring 4MiB (max block) in 500ms | ||
speedHistogram = prometheus.ExponentialBucketsRange(1, 4194304/500, 20) | ||
|
||
// Duration max bucket is informed by the timeouts per block and per peer request/retry | ||
durationPerBlockHistogram = prometheus.ExponentialBucketsRange(1, 60000, 10) | ||
durationPerBlockPerPeerHistogram = prometheus.ExponentialBucketsRange(1, 20000, 10) | ||
Comment on lines
+11
to
+17
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. ℹ️ I've just remembered that it is good practice to avoid defining buckets with values that may change. If we ever need to change buckets, or add a new one with a bigger max, while keeping the old ones intact, we can define them as an explicit list of values (like we did with the legacy ones in Kubo a while ago): []float64{0.05, 0.1, 0.25, 0.5, 1, 2, 5, 10, 30, 60} |
||
|
||
CabooseMetrics = prometheus.NewRegistry() | ||
|
||
poolErrorMetric = prometheus.NewCounter(prometheus.CounterOpts{ | ||
|
@@ -22,25 +33,65 @@ var ( | |
Help: "Health of the caboose pool", | ||
}, []string{"weight"}) | ||
|
||
// TODO: if we add CARs, we need to split this one into two, or add two dedicated ones | ||
fetchResponseMetric = prometheus.NewCounterVec(prometheus.CounterOpts{ | ||
Name: prometheus.BuildFQName("ipfs", "caboose", "fetch_errors"), | ||
Help: "Errors fetching from Caboose Peers", | ||
}, []string{"code"}) | ||
|
||
fetchSpeedMetric = prometheus.NewHistogram(prometheus.HistogramOpts{ | ||
Name: prometheus.BuildFQName("ipfs", "caboose", "fetch_speed"), | ||
Help: "Speed observed during caboose fetches", | ||
Buckets: prometheus.DefBuckets, | ||
fetchSpeedPerBlockMetric = prometheus.NewHistogram(prometheus.HistogramOpts{ | ||
Name: prometheus.BuildFQName("ipfs", "caboose", "fetch_speed_block"), | ||
Help: "Speed observed during caboose fetches for a block across multiple peers and retries", | ||
Buckets: speedHistogram, | ||
}) | ||
fetchLatencyMetric = prometheus.NewHistogram(prometheus.HistogramOpts{ | ||
Name: prometheus.BuildFQName("ipfs", "caboose", "fetch_latency"), | ||
Help: "Latency observed during caboose fetches", | ||
Buckets: prometheus.ExponentialBucketsRange(1, 10000, 10), | ||
|
||
fetchSpeedPerBlockPerPeerMetric = prometheus.NewHistogram(prometheus.HistogramOpts{ | ||
Name: prometheus.BuildFQName("ipfs", "caboose", "fetch_speed_block_peer"), | ||
Help: "Speed observed during caboose fetches for fetching a block from a single peer", | ||
Buckets: speedHistogram, | ||
}) | ||
|
||
// TODO: if we add CARs, we need to split this one into two, or add two dedicated ones | ||
fetchSizeMetric = prometheus.NewHistogram(prometheus.HistogramOpts{ | ||
Name: prometheus.BuildFQName("ipfs", "caboose", "fetch_size"), | ||
Help: "Size in bytes of caboose fetches", | ||
Buckets: prometheus.ExponentialBucketsRange(1, 4000000, 16), | ||
Help: "Size in bytes of caboose block fetches", | ||
Buckets: blockSizeHistogram, | ||
}) | ||
|
||
fetchDurationPerBlockPerPeerSuccessMetric = prometheus.NewHistogram(prometheus.HistogramOpts{ | ||
Name: prometheus.BuildFQName("ipfs", "caboose", "fetch_duration_block_peer_success"), | ||
Help: "Latency observed during successful caboose fetches from a single peer", | ||
Buckets: durationPerBlockPerPeerHistogram, | ||
}) | ||
|
||
fetchDurationPerBlockPerPeerFailureMetric = prometheus.NewHistogram(prometheus.HistogramOpts{ | ||
Name: prometheus.BuildFQName("ipfs", "caboose", "fetch_duration_block_peer_failure"), | ||
Help: "Latency observed during failed caboose fetches from a single peer", | ||
Buckets: durationPerBlockPerPeerHistogram, | ||
}) | ||
|
||
fetchDurationBlockSuccessMetric = prometheus.NewHistogram(prometheus.HistogramOpts{ | ||
Name: prometheus.BuildFQName("ipfs", "caboose", "fetch_duration_block_success"), | ||
Help: "Latency observed during successful caboose fetches for a block", | ||
Buckets: durationPerBlockHistogram, | ||
}) | ||
|
||
fetchDurationBlockFailureMetric = prometheus.NewHistogram(prometheus.HistogramOpts{ | ||
Name: prometheus.BuildFQName("ipfs", "caboose", "fetch_duration_block_failure"), | ||
Help: "Latency observed during failed caboose fetches for a block", | ||
Buckets: durationPerBlockHistogram, | ||
}) | ||
|
||
fetchTTFBPerBlockPerPeerSuccessMetric = prometheus.NewHistogram(prometheus.HistogramOpts{ | ||
Name: prometheus.BuildFQName("ipfs", "caboose", "fetch_ttfb_block_peer_success"), | ||
Help: "TTFB observed during a successful caboose fetch from a single peer", | ||
Buckets: durationPerBlockPerPeerHistogram, | ||
}) | ||
|
||
fetchTTFBPerBlockPerPeerFailureMetric = prometheus.NewHistogram(prometheus.HistogramOpts{ | ||
Name: prometheus.BuildFQName("ipfs", "caboose", "fetch_ttfb_block_peer_failure"), | ||
Help: "TTFB observed during a failed caboose fetch from a single peer", | ||
Buckets: durationPerBlockPerPeerHistogram, | ||
}) | ||
) | ||
|
||
|
@@ -49,7 +100,14 @@ func init() { | |
CabooseMetrics.MustRegister(poolSizeMetric) | ||
CabooseMetrics.MustRegister(poolHealthMetric) | ||
CabooseMetrics.MustRegister(fetchResponseMetric) | ||
CabooseMetrics.MustRegister(fetchSpeedMetric) | ||
CabooseMetrics.MustRegister(fetchLatencyMetric) | ||
CabooseMetrics.MustRegister(fetchSizeMetric) | ||
|
||
CabooseMetrics.MustRegister(fetchSpeedPerBlockMetric) | ||
CabooseMetrics.MustRegister(fetchSpeedPerBlockPerPeerMetric) | ||
CabooseMetrics.MustRegister(fetchDurationPerBlockPerPeerSuccessMetric) | ||
CabooseMetrics.MustRegister(fetchDurationPerBlockPerPeerFailureMetric) | ||
CabooseMetrics.MustRegister(fetchDurationBlockSuccessMetric) | ||
CabooseMetrics.MustRegister(fetchDurationBlockFailureMetric) | ||
CabooseMetrics.MustRegister(fetchTTFBPerBlockPerPeerSuccessMetric) | ||
CabooseMetrics.MustRegister(fetchTTFBPerBlockPerPeerFailureMetric) | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
ℹ️ This gives us more useful buckets (same width), and we can plot them in Grafana using the heatmap widget