Skip to content

Commit

Permalink
Make SLO monitor publish full pod startup latency including image pull time
Browse files Browse the repository at this point in the history
  • Loading branch information
gmarek committed Jan 16, 2018
1 parent 9f6d09e commit e8c2b2d
Show file tree
Hide file tree
Showing 7 changed files with 48 additions and 9 deletions.
7 changes: 2 additions & 5 deletions slo-monitor/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -13,15 +13,12 @@
# limitations under the License.

PACKAGE = k8s.io/perf-tests/slo-monitor
TAG = 0.9.3
TAG = 0.10.0
REPOSITORY = google-containers

all: build

deps:
go get -u github.com/tools/godep

build: src/monitors/pod_monitor.go src/monitors/util.go src/monitors/store.go src/monitors/watcher.go src/main/slo-monitor.go deps
build: src/monitors/pod_monitor.go src/monitors/util.go src/monitors/store.go src/monitors/watcher.go src/main/slo-monitor.go
GOOS=linux GOARCH=amd64 CGO_ENABLED=0 godep go build -a -o build/slo-monitor src/main/slo-monitor.go

push:
Expand Down
13 changes: 13 additions & 0 deletions slo-monitor/binding.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# Binds the slo-monitor ClusterRole to the slo-monitor ServiceAccount
# that lives in the kube-system namespace.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: slo-monitor
# Who receives the permissions.
subjects:
  - kind: ServiceAccount
    name: slo-monitor
    namespace: kube-system
# Which role is granted.
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: slo-monitor

9 changes: 9 additions & 0 deletions slo-monitor/role.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# Read-only (get/watch/list) access to pods and events across the cluster.
kind: ClusterRole
apiVersion: rbac.authorization.k8s.io/v1
metadata:
  # ClusterRole is a cluster-scoped kind, so no namespace is set here.
  name: slo-monitor
rules:
  # "" selects the core API group, which serves both pods and events.
  - apiGroups: [""]
    resources: ["pods", "events"]
    verbs: ["get", "watch", "list"]
5 changes: 5 additions & 0 deletions slo-monitor/serviceaccount.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# ServiceAccount "slo-monitor" in the kube-system namespace. It is the subject of
# the slo-monitor ClusterRoleBinding and is referenced by the deployment's
# serviceAccountName.
apiVersion: v1
kind: ServiceAccount
metadata:
name: slo-monitor
namespace: kube-system
3 changes: 2 additions & 1 deletion slo-monitor/slo-monitor-deployment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ spec:
spec:
containers:
- name: slo-monitor
image: gcr.io/google-containers/slo-monitor:0.9
image: gcr.io/google-containers/slo-monitor:0.10.0
command:
- /slo-monitor
- --alsologtostderr=true
Expand All @@ -34,3 +34,4 @@ spec:
cpu: 300m
memory: 100Mi
restartPolicy: Always
serviceAccountName: slo-monitor
2 changes: 1 addition & 1 deletion slo-monitor/slo-monitor-pod.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ spec:
hostNetwork: true
containers:
- name: slo-monitor
image: gcr.io/google-containers/slo-monitor:0.9.1
image: gcr.io/google-containers/slo-monitor:0.10.0
command:
- /slo-monitor
- --alsologtostderr=true
Expand Down
18 changes: 16 additions & 2 deletions slo-monitor/src/monitors/pod_monitor.go
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,8 @@ import (
const (
// PodE2EStartupLatencyKey is a key for the pod startup latency monitoring metric, in seconds.
// NOTE(review): presumably the Name of the PodE2EStartupLatency histogram, whose
// observations exclude the image pull duration — confirm against its HistogramOpts.
PodE2EStartupLatencyKey = "slomonitor_pod_e2e_startup_latency_seconds"
// PodFullStartupLatencyKey is the Name of the PodFullStartupLatency histogram: pod startup
// latency in seconds, measured from creation to observed running, including image pull time.
PodFullStartupLatencyKey = "slomonitor_pod_full_startup_latency_seconds"
)

var (
Expand All @@ -51,6 +53,15 @@ var (
Buckets: prometheus.LinearBuckets(0.5, 0.25, 50),
},
)

// PodFullStartupLatency is a prometheus histogram of pod startup latency including image
// pull time, i.e. the span from pod creation until the pod is observed running.
// Values are observed in seconds, matching the "_seconds" suffix of the metric name.
// Buckets are linear: 100 buckets of width 0.5s with upper bounds from 0.5s to 50s.
PodFullStartupLatency = prometheus.NewHistogram(
	prometheus.HistogramOpts{
		Name: PodFullStartupLatencyKey,
		// The unit in the help text must agree with the metric name and the observed values.
		Help:    "Pod e2e startup latencies in seconds, with image pull times",
		Buckets: prometheus.LinearBuckets(0.5, 0.5, 100),
	},
)
)

var registerMetrics sync.Once
Expand All @@ -59,6 +70,7 @@ var registerMetrics sync.Once
func Register() {
registerMetrics.Do(func() {
prometheus.MustRegister(PodE2EStartupLatency)
prometheus.MustRegister(PodFullStartupLatency)
})
}

Expand Down Expand Up @@ -104,7 +116,7 @@ func NewPodStartupLatencyDataMonitor(c clientset.Interface, purgeAfter time.Dura
}

// Run starts a PodStartupLatencyDataMonitor: it creates all watches, populates PodStartupData and updates
// PodE2EStartupLatency metric
// PodE2EStartupLatency and PodFullStartupLatency metrics
func (pm *PodStartupLatencyDataMonitor) Run(stopCh chan struct{}) error {
controller := NewWatcherWithHandler(
&cache.ListWatch{
Expand Down Expand Up @@ -279,12 +291,14 @@ func (pm *PodStartupLatencyDataMonitor) updateMetric(key string, data *PodStartu
glog.V(4).Infof("Observed Pod %v creation: created %v, pulling: %v, pulled: %v, running: %v",
key, data.created, data.startedPulling, data.finishedPulling, data.observedRunning)
data.accountedFor = true
startupTime := data.observedRunning.Sub(data.created) - data.finishedPulling.Sub(data.startedPulling)
fullStartupTime := data.observedRunning.Sub(data.created)
startupTime := fullStartupTime - data.finishedPulling.Sub(data.startedPulling)
if startupTime < 0 {
glog.Warningf("Saw negative startup time for %v: %v", key, data)
startupTime = 0
}
PodE2EStartupLatency.Observe(float64(startupTime / time.Second))
PodFullStartupLatency.Observe(float64(fullStartupTime / time.Second))
}
}

Expand Down

0 comments on commit e8c2b2d

Please sign in to comment.