# File: /usr/lib/netdata/conf.d/health.d/k8s_apiserver.conf
# Kubernetes API Server health alerts

# -----------------------------------------------------------------------------
# Request Rate (helper value: denominator and minimum-traffic gate for the alerts below)

 # Helper template: it has no warn/crit/to lines, so it only computes a value.
 # The value is the sum of the `requests` dimension over the last minute,
 # recalculated every 10 seconds.
 # The calc line replaces a sum of 0 with 1, so that other templates in this
 # file can divide by $k8s_apiserver_1m_requests without dividing by zero.
 template: k8s_apiserver_1m_requests
       on: k8s_apiserver.requests_total
    class: Workload
     type: Kubernetes
component: API Server
   lookup: sum -1m unaligned of requests
     calc: ($this == 0)?(1):($this)
    units: requests
    every: 10s
     info: number of API server requests in the last minute

# -----------------------------------------------------------------------------
# Dropped Requests

 # Sums the `dropped` dimension over the last minute, every 10 seconds.
 # Warning as soon as the sum is above 0; critical when it is above 10.
 # Unlike the error-rate templates in this file, there is no minimum-traffic
 # gate: a single dropped request raises the warning.
 # delay: notifications are held back 1m when the status rises and 5m when it
 # falls; the multiplier/max values bound how far repeated changes extend that.
 template: k8s_apiserver_dropped_requests
       on: k8s_apiserver.requests_dropped
    class: Errors
     type: Kubernetes
component: API Server
   lookup: sum -1m unaligned of dropped
    units: requests
    every: 10s
     warn: $this > 0
     crit: $this > 10
    delay: up 1m down 5m multiplier 1.5 max 1h
  summary: K8s API server dropped requests
     info: API server dropped requests in the last minute due to overload
       to: sysadmin

# -----------------------------------------------------------------------------
# Error Rate (5xx responses)

 # Percentage of responses with status 500-504 over the last minute.
 # lookup sums only the five listed dimensions; any other 5xx dimension the
 # chart may expose is not counted.
 # calc divides by $k8s_apiserver_1m_requests, which is never 0 (see its calc).
 # Both warn and crit are gated on more than 60 requests in the last minute;
 # below that the expression evaluates to 0 and the alert is not raised.
 # Hysteresis via $status:
 #   warn - raised above 2%, then kept until the value is no longer above 1%
 #   crit - raised above 5%, then kept until the value is no longer above 2%
 template: k8s_apiserver_5xx_errors
       on: k8s_apiserver.requests_by_code
    class: Errors
     type: Kubernetes
component: API Server
   lookup: sum -1m unaligned of 500,501,502,503,504
     calc: $this * 100 / $k8s_apiserver_1m_requests
    units: %
    every: 10s
     warn: ($k8s_apiserver_1m_requests > 60) ? ($this > (($status >= $WARNING)  ? ( 1 ) : ( 2 )) ) : ( 0 )
     crit: ($k8s_apiserver_1m_requests > 60) ? ($this > (($status == $CRITICAL) ? ( 2 ) : ( 5 )) ) : ( 0 )
    delay: up 2m down 15m multiplier 1.5 max 1h
  summary: K8s API server error rate
     info: Percentage of 5xx server error responses over the last minute
       to: sysadmin

# -----------------------------------------------------------------------------
# Client Errors (4xx responses, excluding 401/403, which are expected during normal authentication/authorization)

 # Percentage of selected 4xx responses over the last minute.
 # lookup sums only the listed dimensions; 401 and 403 are left out on
 # purpose, and any other 4xx code not in the list is not counted either.
 # calc divides by $k8s_apiserver_1m_requests, which is never 0 (see its calc).
 # warn is gated on more than 60 requests in the last minute; below that the
 # expression evaluates to 0 and the alert is not raised.
 # Hysteresis via $status: raised above 20%, then kept until the value is no
 # longer above 10%.
 # There is no crit line: this template never goes critical.
 template: k8s_apiserver_4xx_errors
       on: k8s_apiserver.requests_by_code
    class: Errors
     type: Kubernetes
component: API Server
   lookup: sum -1m unaligned of 400,404,405,408,409,410,422,429
     calc: $this * 100 / $k8s_apiserver_1m_requests
    units: %
    every: 10s
     warn: ($k8s_apiserver_1m_requests > 60) ? ($this > (($status >= $WARNING)  ? ( 10 ) : ( 20 )) ) : ( 0 )
    delay: up 2m down 15m multiplier 1.5 max 1h
  summary: K8s API server client errors
     info: Percentage of 4xx client error responses over the last minute (excluding 401/403)
       to: sysadmin

# -----------------------------------------------------------------------------
# Request Latency

 # Helper template: it has no warn/crit/to lines, so it only computes a value.
 # The value is the average of the `p99` dimension over the last 10 minutes,
 # recalculated every 30 seconds.
 # k8s_apiserver_latency_high reads it as $k8s_apiserver_latency_10m and uses
 # it as the baseline that the 1-minute latency is compared against.
 template: k8s_apiserver_latency_10m
       on: k8s_apiserver.request_latency
    class: Latency
     type: Kubernetes
component: API Server
   lookup: average -10m unaligned of p99
    units: milliseconds
    every: 30s
     info: average API server request latency (p99) over the last 10 minutes

 # Average of the `p99` dimension over the last minute, every 10 seconds.
 # Each level needs BOTH an absolute and a relative condition to hold:
 #   warn - above $green (500) and more than 2x the 10-minute baseline
 #   crit - above $red (2000) and more than 4x the 10-minute baseline
 # Both are gated on more than 60 requests in the last minute; below that the
 # expression evaluates to 0 and the alert is not raised.
 # delay has no `up` part, only `down 15m`.
 # options: no-clear-notification - presumably suppresses the notification sent
 # when the alert returns to clear; confirm against the Netdata health reference.
 template: k8s_apiserver_latency_high
       on: k8s_apiserver.request_latency
    class: Latency
     type: Kubernetes
component: API Server
   lookup: average -1m unaligned of p99
    units: milliseconds
    every: 10s
    green: 500
      red: 2000
     warn: ($k8s_apiserver_1m_requests > 60) ? ($this > $green && $this > ($k8s_apiserver_latency_10m * 2)) : ( 0 )
     crit: ($k8s_apiserver_1m_requests > 60) ? ($this > $red && $this > ($k8s_apiserver_latency_10m * 4)) : ( 0 )
    delay: down 15m multiplier 1.5 max 1h
  summary: K8s API server latency
     info: API server request latency (p99) over the last minute
  options: no-clear-notification
       to: sysadmin

# -----------------------------------------------------------------------------
# Inflight Requests

 # Average of the `mutating` dimension over the last minute, every 10 seconds.
 # Warning above 400, critical above 600 (fixed thresholds, no hysteresis).
 # The info text spans two lines using a trailing backslash continuation.
 # NOTE(review): the info text and the crit threshold assume a limit of 600.
 # The kube-apiserver reference documents --max-mutating-requests-inflight
 # with a default of 200 (and --max-requests-inflight with 400, 600 combined).
 # Confirm which limit applies to this cluster; if the effective limit is
 # lower than 600, the critical threshold can never be reached.
 template: k8s_apiserver_inflight_mutating
       on: k8s_apiserver.inflight_requests
    class: Utilization
     type: Kubernetes
component: API Server
   lookup: average -1m unaligned of mutating
    units: requests
    every: 10s
     warn: $this > 400
     crit: $this > 600
    delay: up 1m down 5m multiplier 1.5 max 1h
  summary: K8s API server mutating inflight
     info: Average number of mutating requests currently in flight over the last minute \
           (default limit is 600)
       to: sysadmin

 # Average of the `read_only` dimension over the last minute, every 10 seconds.
 # Warning above 800, critical above 1000 (fixed thresholds, no hysteresis).
 # The info text spans two lines using a trailing backslash continuation.
 # NOTE(review): the info text and the crit threshold assume a limit of 1000.
 # The kube-apiserver reference documents --max-requests-inflight with a
 # default of 400. Confirm which limit applies to this cluster; if the
 # effective limit is lower than 1000, the critical threshold can never be
 # reached.
 template: k8s_apiserver_inflight_readonly
       on: k8s_apiserver.inflight_requests
    class: Utilization
     type: Kubernetes
component: API Server
   lookup: average -1m unaligned of read_only
    units: requests
    every: 10s
     warn: $this > 800
     crit: $this > 1000
    delay: up 1m down 5m multiplier 1.5 max 1h
  summary: K8s API server read-only inflight
     info: Average number of read-only requests currently in flight over the last minute \
           (default limit is 1000)
       to: sysadmin

# -----------------------------------------------------------------------------
# Workqueue Depth

 # Average of the `depth` dimension over the last minute, every 10 seconds.
 # Warning above 100 items, critical above 500 (fixed thresholds, no
 # hysteresis and no minimum-traffic gate).
 # NOTE(review): as a template this is presumably evaluated once per matching
 # k8s_apiserver.workqueue_depth chart; whether there is one chart per queue
 # is not visible here - confirm against the collector.
 template: k8s_apiserver_workqueue_depth
       on: k8s_apiserver.workqueue_depth
    class: Utilization
     type: Kubernetes
component: API Server
   lookup: average -1m unaligned of depth
    units: items
    every: 10s
     warn: $this > 100
     crit: $this > 500
    delay: up 2m down 10m multiplier 1.5 max 1h
  summary: K8s API server workqueue depth
     info: Average depth of controller work queue over the last minute
       to: sysadmin

# -----------------------------------------------------------------------------
# Process Resources

 # Average of the `goroutines` dimension over the last 5 minutes, recalculated
 # every 30 seconds (a longer window and slower cadence than the request
 # alerts in this file).
 # Warning above 10000, critical above 20000 (fixed thresholds, no hysteresis).
 template: k8s_apiserver_goroutines
       on: k8s_apiserver.goroutines
    class: Utilization
     type: Kubernetes
component: API Server
   lookup: average -5m unaligned of goroutines
    units: goroutines
    every: 30s
     warn: $this > 10000
     crit: $this > 20000
    delay: up 5m down 15m multiplier 1.5 max 1h
  summary: K8s API server goroutines
     info: Average number of goroutines over the last 5 minutes
       to: sysadmin

 # File descriptor utilization as a percentage, evaluated every 30 seconds.
 # There is no lookup line: calc reads $open and $max directly - presumably
 # the most recent values of the chart's `open` and `max` dimensions; confirm
 # the dimension names against the collector.
 # The ($max > 0) guard yields 0 instead of dividing by zero (or by a
 # negative value) when no limit is reported.
 # Warning above 80%, critical above 95% (fixed thresholds, no hysteresis).
 template: k8s_apiserver_open_fds_utilization
       on: k8s_apiserver.open_fds
    class: Utilization
     type: Kubernetes
component: API Server
     calc: ($max > 0) ? ($open * 100 / $max) : (0)
    units: %
    every: 30s
     warn: $this > 80
     crit: $this > 95
    delay: up 2m down 15m multiplier 1.5 max 1h
  summary: K8s API server file descriptor utilization
     info: Percentage of file descriptors used by the API server
       to: sysadmin