# File: //usr/lib/netdata/conf.d/health.d/azure_monitor_aks.conf
# you can disable an alarm notification by setting the 'to' line to: silent
# --- API Server ---
# Alert template for AKS API server CPU utilization; applies to charts with
# context azure_monitor.aks.apiserver_cpu.
template: am_aks_apiserver_cpu
on: azure_monitor.aks.apiserver_cpu
class: Utilization
type: Kubernetes
component: AKS
# $this = average of the "average" dimension over the last 5 minutes,
# re-evaluated every minute (see "every" below).
lookup: average -5m unaligned of average
units: percentage
every: 1m
# Hysteresis driven by the current $status: WARNING is raised above 80 and
# held while the value stays above 75; CRITICAL is raised above 95 and held
# while the value stays above 85.
warn: $this > (($status >= $WARNING) ? (75) : (80))
crit: $this > (($status == $CRITICAL) ? (85) : (95))
# Delay applies only to downward (recovery) transitions: 5m base, x1.5
# multiplier, capped at 1h.
delay: down 5m multiplier 1.5 max 1h
summary: AKS API server CPU on ${label:resource_name}
# The info text spans two physical lines joined by the trailing backslash.
info: Average API server CPU utilization on AKS cluster ${label:resource_name} \
in ${label:resource_group} (${label:region})
# Notification recipient role; set to "silent" to disable notifications.
to: sysadmin
# Alert template for AKS API server memory utilization; applies to charts
# with context azure_monitor.aks.apiserver_memory.
template: am_aks_apiserver_memory
on: azure_monitor.aks.apiserver_memory
class: Utilization
type: Kubernetes
component: AKS
# $this = 5-minute average of the "average" dimension, evaluated every 1m.
lookup: average -5m unaligned of average
units: percentage
every: 1m
# Status-dependent thresholds (hysteresis): warn enters above 80 and stays
# while above 75; crit enters above 95 and stays while above 85.
warn: $this > (($status >= $WARNING) ? (75) : (80))
crit: $this > (($status == $CRITICAL) ? (85) : (95))
# Only recovery ("down") transitions are delayed: 5m, multiplier 1.5, max 1h.
delay: down 5m multiplier 1.5 max 1h
summary: AKS API server memory on ${label:resource_name}
# Two-line info string; the trailing backslash continues it on the next line.
info: Average API server memory utilization on AKS cluster ${label:resource_name} \
in ${label:resource_group} (${label:region})
# Notification recipient role; "silent" disables notifications.
to: sysadmin
# Alert template for the number of in-flight API server requests; applies to
# charts with context azure_monitor.aks.apiserver_inflight_requests.
template: am_aks_apiserver_inflight_requests
on: azure_monitor.aks.apiserver_inflight_requests
class: Workload
type: Kubernetes
component: AKS
# $this = 5-minute average of the "average" dimension, evaluated every 1m.
lookup: average -5m unaligned of average
units: requests
every: 1m
# Absolute request counts, not percentages. Hysteresis on $status:
# WARNING is raised above 600 and held while above 400;
# CRITICAL is raised above 800 and held while above 600.
warn: $this > (($status >= $WARNING) ? (400) : (600))
crit: $this > (($status == $CRITICAL) ? (600) : (800))
# Only recovery ("down") transitions are delayed: 5m, multiplier 1.5, max 1h.
delay: down 5m multiplier 1.5 max 1h
summary: AKS API server inflight requests on ${label:resource_name}
# Two-line info string; the trailing backslash continues it on the next line.
info: Average number of inflight requests to the API server on AKS cluster ${label:resource_name} \
in ${label:resource_group} (${label:region})
# Notification recipient role; "silent" disables notifications.
to: sysadmin
# --- etcd ---
# Alert template for etcd CPU utilization; applies to charts with context
# azure_monitor.aks.etcd_cpu.
template: am_aks_etcd_cpu
on: azure_monitor.aks.etcd_cpu
class: Utilization
type: Kubernetes
component: AKS
# $this = 5-minute average of the "average" dimension, evaluated every 1m.
lookup: average -5m unaligned of average
units: percentage
every: 1m
# Hysteresis on $status: warn enters above 80 and stays while above 75;
# crit enters above 95 and stays while above 85.
warn: $this > (($status >= $WARNING) ? (75) : (80))
crit: $this > (($status == $CRITICAL) ? (85) : (95))
# Only recovery ("down") transitions are delayed: 5m, multiplier 1.5, max 1h.
delay: down 5m multiplier 1.5 max 1h
summary: AKS etcd CPU on ${label:resource_name}
# Two-line info string; the trailing backslash continues it on the next line.
info: Average etcd CPU utilization on AKS cluster ${label:resource_name} \
in ${label:resource_group} (${label:region})
# Notification recipient role; "silent" disables notifications.
to: sysadmin
# Alert template for etcd memory utilization; applies to charts with context
# azure_monitor.aks.etcd_memory.
template: am_aks_etcd_memory
on: azure_monitor.aks.etcd_memory
class: Utilization
type: Kubernetes
component: AKS
# $this = 5-minute average of the "average" dimension, evaluated every 1m.
lookup: average -5m unaligned of average
units: percentage
every: 1m
# Hysteresis on $status: warn enters above 80 and stays while above 75;
# crit enters above 95 and stays while above 85.
warn: $this > (($status >= $WARNING) ? (75) : (80))
crit: $this > (($status == $CRITICAL) ? (85) : (95))
# Only recovery ("down") transitions are delayed: 5m, multiplier 1.5, max 1h.
delay: down 5m multiplier 1.5 max 1h
summary: AKS etcd memory on ${label:resource_name}
# Two-line info string; the trailing backslash continues it on the next line.
info: Average etcd memory utilization on AKS cluster ${label:resource_name} \
in ${label:resource_group} (${label:region})
# Notification recipient role; "silent" disables notifications.
to: sysadmin
# Alert template for etcd database utilization; applies to charts with
# context azure_monitor.aks.etcd_database.
template: am_aks_etcd_database
on: azure_monitor.aks.etcd_database
class: Utilization
type: Kubernetes
component: AKS
# $this = 5-minute average of the "average" dimension, evaluated every 1m.
lookup: average -5m unaligned of average
units: percentage
every: 1m
# Hysteresis on $status: warn enters above 80 and stays while above 75;
# crit enters above 90 and stays while above 85. Note the critical entry
# threshold here is 90, lower than the 95 used by the etcd CPU/memory alerts
# in this file.
warn: $this > (($status >= $WARNING) ? (75) : (80))
crit: $this > (($status == $CRITICAL) ? (85) : (90))
# Only recovery ("down") transitions are delayed: 5m, multiplier 1.5, max 1h.
delay: down 5m multiplier 1.5 max 1h
summary: AKS etcd database usage on ${label:resource_name}
# Three-line info string joined by trailing backslashes.
info: Average etcd database utilization on AKS cluster ${label:resource_name} \
in ${label:resource_group} (${label:region}). \
High etcd database usage can lead to cluster instability.
# Notification recipient role; "silent" disables notifications.
to: sysadmin
# --- Autoscaler ---
# Alert template for the cluster autoscaler "safe to autoscale" state;
# applies to charts with context azure_monitor.aks.autoscaler_health.
template: am_aks_autoscaler_safe_to_autoscale
on: azure_monitor.aks.autoscaler_health
class: Availability
type: Kubernetes
component: AKS
# $this = 5-minute average of the "safe_to_autoscale" dimension, evaluated
# every 1m.
lookup: average -5m unaligned of safe_to_autoscale
units: state
every: 1m
# Warn when the 5-minute average drops below 1. The "!= nan" guard keeps the
# condition false when the lookup yields nan (presumably when no data was
# collected in the window). This alert defines no "crit" line, so it only
# raises WARNING.
warn: $this != nan AND $this < 1
# Only recovery ("down") transitions are delayed: 5m, multiplier 1.5, max 1h.
delay: down 5m multiplier 1.5 max 1h
summary: AKS autoscaler unsafe on ${label:resource_name}
# Two-line info string; the trailing backslash continues it on the next line.
info: Cluster autoscaler reports the cluster is not safe to autoscale on AKS cluster ${label:resource_name} \
in ${label:resource_group} (${label:region})
# Notification recipient role; "silent" disables notifications.
to: sysadmin
# Alert template for pods the cluster autoscaler reports as unschedulable;
# applies to charts with context azure_monitor.aks.autoscaler_unschedulable_pods.
template: am_aks_autoscaler_unschedulable_pods
on: azure_monitor.aks.autoscaler_unschedulable_pods
class: Errors
type: Kubernetes
component: AKS
# $this = 5-minute average of the "average" dimension, evaluated every 1m.
lookup: average -5m unaligned of average
units: pods
every: 1m
# Pod counts, not percentages. Hysteresis on $status:
# WARNING is raised above 5 and held while the value stays above 0;
# CRITICAL is raised above 20 and held while the value stays above 5.
warn: $this > (($status >= $WARNING) ? (0) : (5))
crit: $this > (($status == $CRITICAL) ? (5) : (20))
# Only recovery ("down") transitions are delayed: 5m, multiplier 1.5, max 1h.
delay: down 5m multiplier 1.5 max 1h
summary: AKS unschedulable pods on ${label:resource_name}
# Three-line info string joined by trailing backslashes.
info: Number of pods that cannot be scheduled by the cluster autoscaler on AKS cluster ${label:resource_name} \
in ${label:resource_group} (${label:region}). \
Indicates insufficient cluster capacity.
# Notification recipient role; "silent" disables notifications.
to: sysadmin
# --- Node CPU ---
# Alert template for AKS node CPU utilization; applies to charts with
# context azure_monitor.aks.node_cpu_percentage.
template: am_aks_node_cpu
on: azure_monitor.aks.node_cpu_percentage
class: Utilization
type: Kubernetes
component: AKS
# $this = 5-minute average of the "average" dimension, evaluated every 1m.
lookup: average -5m unaligned of average
units: percentage
every: 1m
# Hysteresis on $status: warn enters above 90 and stays while above 80;
# crit enters above 95 and stays while above 90.
warn: $this > (($status >= $WARNING) ? (80) : (90))
crit: $this > (($status == $CRITICAL) ? (90) : (95))
# Only recovery ("down") transitions are delayed: 5m, multiplier 1.5, max 1h.
delay: down 5m multiplier 1.5 max 1h
summary: AKS node CPU on ${label:resource_name}
# Two-line info string; the trailing backslash continues it on the next line.
info: Average node CPU utilization on AKS cluster ${label:resource_name} \
in ${label:resource_group} (${label:region})
# Notification recipient role; "silent" disables notifications.
to: sysadmin
# --- Node Memory ---
# Alert template for AKS node memory working-set utilization; applies to
# charts with context azure_monitor.aks.node_memory_working_set_percentage.
template: am_aks_node_memory_working_set
on: azure_monitor.aks.node_memory_working_set_percentage
class: Utilization
type: Kubernetes
component: AKS
# $this = 5-minute average of the "average" dimension, evaluated every 1m.
lookup: average -5m unaligned of average
units: percentage
every: 1m
# Hysteresis on $status: warn enters above 85 and stays while above 80;
# crit enters above 95 and stays while above 85.
warn: $this > (($status >= $WARNING) ? (80) : (85))
crit: $this > (($status == $CRITICAL) ? (85) : (95))
# Only recovery ("down") transitions are delayed: 5m, multiplier 1.5, max 1h.
delay: down 5m multiplier 1.5 max 1h
summary: AKS node memory working set on ${label:resource_name}
# Two-line info string; the trailing backslash continues it on the next line.
info: Average node memory working set utilization on AKS cluster ${label:resource_name} \
in ${label:resource_group} (${label:region})
# Notification recipient role; "silent" disables notifications.
to: sysadmin
# Alert template for AKS node memory RSS utilization; applies to charts with
# context azure_monitor.aks.node_memory_rss_percentage.
template: am_aks_node_memory_rss
on: azure_monitor.aks.node_memory_rss_percentage
class: Utilization
type: Kubernetes
component: AKS
# $this = 5-minute average of the "average" dimension, evaluated every 1m.
lookup: average -5m unaligned of average
units: percentage
every: 1m
# Hysteresis on $status: warn enters above 85 and stays while above 80;
# crit enters above 95 and stays while above 85.
warn: $this > (($status >= $WARNING) ? (80) : (85))
crit: $this > (($status == $CRITICAL) ? (85) : (95))
# Only recovery ("down") transitions are delayed: 5m, multiplier 1.5, max 1h.
delay: down 5m multiplier 1.5 max 1h
summary: AKS node memory RSS on ${label:resource_name}
# Two-line info string; the trailing backslash continues it on the next line.
info: Average node memory RSS utilization on AKS cluster ${label:resource_name} \
in ${label:resource_group} (${label:region})
# Notification recipient role; "silent" disables notifications.
to: sysadmin
# --- Node Disk ---
# Alert template for AKS node disk utilization; applies to charts with
# context azure_monitor.aks.node_disk_percentage.
template: am_aks_node_disk
on: azure_monitor.aks.node_disk_percentage
class: Utilization
type: Kubernetes
component: AKS
# $this = 5-minute average of the "average" dimension, evaluated every 1m.
lookup: average -5m unaligned of average
units: percentage
every: 1m
# Hysteresis on $status: warn enters above 80 and stays while above 75;
# crit enters above 90 and stays while above 85.
warn: $this > (($status >= $WARNING) ? (75) : (80))
crit: $this > (($status == $CRITICAL) ? (85) : (90))
# Only recovery ("down") transitions are delayed: 5m, multiplier 1.5, max 1h.
delay: down 5m multiplier 1.5 max 1h
summary: AKS node disk usage on ${label:resource_name}
# Three-line info string joined by trailing backslashes.
info: Average node disk utilization on AKS cluster ${label:resource_name} \
in ${label:resource_group} (${label:region}). \
High disk usage can cause pod evictions.
# Notification recipient role; "silent" disables notifications.
to: sysadmin