# File: /usr/lib/netdata/conf.d/health.d/k8s_apiserver.conf
# Kubernetes API Server health alerts
# -----------------------------------------------------------------------------
# Request Rate (for alarm thresholds)
# Helper template: it defines no warn/crit expressions, so it never raises an
# alert by itself. Its value is read as $k8s_apiserver_1m_requests by the
# error-rate and latency templates in this file.
template:  k8s_apiserver_1m_requests
on:        k8s_apiserver.requests_total
info:      number of API server requests in the last minute
class:     Workload
type:      Kubernetes
component: API Server
units:     requests
lookup:    sum -1m unaligned of requests
# Report 1 instead of 0 so the value stays usable as a divisor.
calc:      ($this == 0) ? (1) : ($this)
every:     10s
# -----------------------------------------------------------------------------
# Dropped Requests
# Sums the "dropped" dimension over the last minute. Any dropped request
# raises WARNING; more than 10 raises CRITICAL.
template:  k8s_apiserver_dropped_requests
on:        k8s_apiserver.requests_dropped
summary:   K8s API server dropped requests
info:      API server dropped requests in the last minute due to overload
class:     Errors
type:      Kubernetes
component: API Server
units:     requests
lookup:    sum -1m unaligned of dropped
every:     10s
warn:      $this > 0
crit:      $this > 10
delay:     up 1m down 5m multiplier 1.5 max 1h
to:        sysadmin
# -----------------------------------------------------------------------------
# Error Rate (5xx responses)
# Share of 500/501/502/503/504 responses among all requests of the last
# minute, expressed as a percentage.
template:  k8s_apiserver_5xx_errors
on:        k8s_apiserver.requests_by_code
summary:   K8s API server error rate
info:      Percentage of 5xx server error responses over the last minute
class:     Errors
type:      Kubernetes
component: API Server
units:     %
lookup:    sum -1m unaligned of 500,501,502,503,504
# $k8s_apiserver_1m_requests is the helper template's value; its calc never
# yields 0, so this division is safe.
calc:      $this * 100 / $k8s_apiserver_1m_requests
every:     10s
# Both expressions yield 0 unless more than 60 requests were seen in the last
# minute. Hysteresis: WARNING is raised above 2% and held while above 1%;
# CRITICAL is raised above 5% and held while above 2%.
warn:      ($k8s_apiserver_1m_requests > 60) ? ($this > (($status >= $WARNING) ? (1) : (2))) : (0)
crit:      ($k8s_apiserver_1m_requests > 60) ? ($this > (($status == $CRITICAL) ? (2) : (5))) : (0)
delay:     up 2m down 15m multiplier 1.5 max 1h
to:        sysadmin
# -----------------------------------------------------------------------------
# Client Errors (4xx responses - excluding 401/403 which are normal auth)
# Share of selected 4xx responses among all requests of the last minute, as a
# percentage. 401 and 403 are left out of the lookup on purpose. Warning only:
# this template has no crit expression.
template:  k8s_apiserver_4xx_errors
on:        k8s_apiserver.requests_by_code
summary:   K8s API server client errors
info:      Percentage of 4xx client error responses over the last minute (excluding 401/403)
class:     Errors
type:      Kubernetes
component: API Server
units:     %
lookup:    sum -1m unaligned of 400,404,405,408,409,410,422,429
# The helper template's value is never 0, so the division is safe.
calc:      $this * 100 / $k8s_apiserver_1m_requests
every:     10s
# Yields 0 unless more than 60 requests were seen in the last minute.
# Hysteresis: raised above 20%, held while above 10%.
warn:      ($k8s_apiserver_1m_requests > 60) ? ($this > (($status >= $WARNING) ? (10) : (20))) : (0)
delay:     up 2m down 15m multiplier 1.5 max 1h
to:        sysadmin
# -----------------------------------------------------------------------------
# Request Latency
# Baseline helper: 10-minute average of the p99 dimension. It has no warn/crit
# expressions; k8s_apiserver_latency_high compares against it through
# $k8s_apiserver_latency_10m.
template:  k8s_apiserver_latency_10m
on:        k8s_apiserver.request_latency
info:      average API server request latency (p99) over the last 10 minutes
class:     Latency
type:      Kubernetes
component: API Server
units:     milliseconds
lookup:    average -10m unaligned of p99
every:     30s
# Compares the 1-minute average of the p99 dimension against fixed floors
# ($green / $red) and against the 10-minute baseline.
template:  k8s_apiserver_latency_high
on:        k8s_apiserver.request_latency
summary:   K8s API server latency
info:      API server request latency (p99) over the last minute
class:     Latency
type:      Kubernetes
component: API Server
units:     milliseconds
lookup:    average -1m unaligned of p99
every:     10s
green:     500
red:       2000
# Both expressions yield 0 unless more than 60 requests were seen in the last
# minute. WARNING needs the value above $green and above twice the 10-minute
# average; CRITICAL needs it above $red and above four times that average.
warn:      ($k8s_apiserver_1m_requests > 60) ? ($this > $green && $this > ($k8s_apiserver_latency_10m * 2)) : (0)
crit:      ($k8s_apiserver_1m_requests > 60) ? ($this > $red && $this > ($k8s_apiserver_latency_10m * 4)) : (0)
delay:     down 15m multiplier 1.5 max 1h
options:   no-clear-notification
to:        sysadmin
# -----------------------------------------------------------------------------
# Inflight Requests
# Mutating requests in flight, averaged over the last minute.
# kube-apiserver defaults --max-mutating-requests-inflight to 200. A threshold
# at or above the limit can never be crossed, so the thresholds sit at 75% and
# 90% of that default. Re-scale both if the flag is overridden on the cluster.
template:  k8s_apiserver_inflight_mutating
on:        k8s_apiserver.inflight_requests
summary:   K8s API server mutating inflight
info:      Average number of mutating requests currently in flight over the last minute (default limit is 200)
class:     Utilization
type:      Kubernetes
component: API Server
units:     requests
lookup:    average -1m unaligned of mutating
every:     10s
warn:      $this > 150
crit:      $this > 180
delay:     up 1m down 5m multiplier 1.5 max 1h
to:        sysadmin
# Read-only requests in flight, averaged over the last minute.
# kube-apiserver defaults --max-requests-inflight to 400. A threshold at or
# above the limit can never be crossed, so the thresholds sit at 75% and 90%
# of that default. Re-scale both if the flag is overridden on the cluster.
template:  k8s_apiserver_inflight_readonly
on:        k8s_apiserver.inflight_requests
summary:   K8s API server read-only inflight
info:      Average number of read-only requests currently in flight over the last minute (default limit is 400)
class:     Utilization
type:      Kubernetes
component: API Server
units:     requests
lookup:    average -1m unaligned of read_only
every:     10s
warn:      $this > 300
crit:      $this > 360
delay:     up 1m down 5m multiplier 1.5 max 1h
to:        sysadmin
# -----------------------------------------------------------------------------
# Workqueue Depth
# 1-minute average of the "depth" dimension. WARNING above 100 items,
# CRITICAL above 500.
template:  k8s_apiserver_workqueue_depth
on:        k8s_apiserver.workqueue_depth
summary:   K8s API server workqueue depth
info:      Average depth of controller work queue over the last minute
class:     Utilization
type:      Kubernetes
component: API Server
units:     items
lookup:    average -1m unaligned of depth
every:     10s
warn:      $this > 100
crit:      $this > 500
delay:     up 2m down 10m multiplier 1.5 max 1h
to:        sysadmin
# -----------------------------------------------------------------------------
# Process Resources
# 5-minute average of the "goroutines" dimension, evaluated every 30 seconds.
# WARNING above 10000, CRITICAL above 20000.
template:  k8s_apiserver_goroutines
on:        k8s_apiserver.goroutines
summary:   K8s API server goroutines
info:      Average number of goroutines over the last 5 minutes
class:     Utilization
type:      Kubernetes
component: API Server
units:     goroutines
lookup:    average -5m unaligned of goroutines
every:     30s
warn:      $this > 10000
crit:      $this > 20000
delay:     up 5m down 15m multiplier 1.5 max 1h
to:        sysadmin
# File descriptor usage as a percentage of the maximum. There is no lookup:
# the value comes straight from calc.
template:  k8s_apiserver_open_fds_utilization
on:        k8s_apiserver.open_fds
summary:   K8s API server file descriptor utilization
info:      Percentage of file descriptors used by the API server
class:     Utilization
type:      Kubernetes
component: API Server
units:     %
# $open and $max are presumably dimensions of k8s_apiserver.open_fds; confirm
# against the collector. The guard yields 0 when $max is not positive.
calc:      ($max > 0) ? ($open * 100 / $max) : (0)
every:     30s
warn:      $this > 80
crit:      $this > 95
delay:     up 2m down 15m multiplier 1.5 max 1h
to:        sysadmin