# File: /usr/lib/netdata/conf.d/health.d/azure_monitor_log_analytics.conf
# you can disable an alarm notification by setting the 'to' line to: silent
# --- Workspace SLI: Availability ---
# Query availability (0-100%). Low is bad.
# Azure SLA: 99.9% for Log Analytics queries.
template: am_log_analytics_query_availability
    # Chart context this template attaches to, plus its classification.
    on: azure_monitor.log_analytics.query_availability
    component: Log Analytics
    class: Availability
    type: Other
    # Evaluated every minute on the 5-minute average of the `availability` dimension.
    every: 1m
    lookup: average -5m unaligned of availability
    units: percentage
    # WARNING is raised below 99 and, once raised, kept until the value is no longer below 99.9.
    warn: $this < (($status >= $WARNING) ? (99.9) : (99))
    # CRITICAL is raised below 90 and, once raised, kept until the value is no longer below 99.
    crit: $this < (($status == $CRITICAL) ? (99) : (90))
    # Notifications for status decreases are delayed 5m; the delay scales by 1.5, capped at 1h.
    delay: down 5m multiplier 1.5 max 1h
    to: sysadmin
    summary: Log Analytics query availability on ${label:resource_name}
    info: Query availability of Log Analytics workspace ${label:resource_name} \
in ${label:resource_group} (${label:region})
# --- Workspace SLI: Ingestion ---
# Ingestion latency (seconds). High is bad.
# AMBA: average ingestion latency > 300s (5 min) is concerning.
template: am_log_analytics_ingestion_latency
    # Chart context this template attaches to, plus its classification.
    on: azure_monitor.log_analytics.ingestion_latency
    component: Log Analytics
    class: Latency
    type: Other
    # Evaluated every minute on the 5-minute average of the `average` dimension.
    every: 1m
    lookup: average -5m unaligned of average
    units: seconds
    # WARNING is raised above 300 and, once raised, kept while the value stays above 180.
    warn: $this > (($status >= $WARNING) ? (180) : (300))
    # CRITICAL is raised above 600 and, once raised, kept while the value stays above 300.
    crit: $this > (($status == $CRITICAL) ? (300) : (600))
    # Notifications for status decreases are delayed 5m; the delay scales by 1.5, capped at 1h.
    delay: down 5m multiplier 1.5 max 1h
    to: sysadmin
    summary: Log Analytics ingestion latency on ${label:resource_name}
    info: Average data ingestion latency for Log Analytics workspace ${label:resource_name} \
in ${label:resource_group} (${label:region}). \
High latency means data takes longer to become queryable.
# --- User Queries: Failures ---
# Helper: total query count over 5 minutes
template: am_log_analytics_query_total
    # Value-only helper: it has no warn/crit lines. Its result is read by name in the
    # `calc` of am_log_analytics_query_failure_rate, which attaches to the same context.
    on: azure_monitor.log_analytics.queries
    component: Log Analytics
    class: Workload
    type: Other
    # Evaluated every minute as the 5-minute sum of the `total` dimension.
    every: 1m
    lookup: sum -5m unaligned of total
    units: queries
    info: Total queries on Log Analytics workspace ${label:resource_name} \
in ${label:resource_group} (${label:region})
# Query failure rate as percentage of total queries.
# Any sustained query failures indicate workspace or query problems.
template: am_log_analytics_query_failure_rate
    # Chart context this template attaches to, plus its classification.
    on: azure_monitor.log_analytics.queries
    component: Log Analytics
    class: Errors
    type: Other
    # Evaluated every minute; the lookup is the 5-minute sum of the `failed` dimension.
    every: 1m
    lookup: sum -5m unaligned of failed
    # Turn the failed count into a percentage of the am_log_analytics_query_total value.
    # When that total is not above 0 the result is 0, which avoids dividing by zero.
    calc: ($am_log_analytics_query_total > 0) ? ($this * 100 / $am_log_analytics_query_total) : (0)
    units: %
    # WARNING is raised above 5 and, once raised, kept while the value stays above 3.
    warn: $this > (($status >= $WARNING) ? (3) : (5))
    # CRITICAL is raised above 15 and, once raised, kept while the value stays above 5.
    crit: $this > (($status == $CRITICAL) ? (5) : (15))
    # Notifications for status decreases are delayed 5m; the delay scales by 1.5, capped at 1h.
    delay: down 5m multiplier 1.5 max 1h
    to: sysadmin
    summary: Log Analytics query failures on ${label:resource_name}
    info: Percentage of failed queries on Log Analytics workspace ${label:resource_name} \
in ${label:resource_group} (${label:region})
# --- Data Export ---
# Export failures (rate). Any sustained export failures need attention.
template: am_log_analytics_export_failures
    # Chart context this template attaches to, plus its classification.
    on: azure_monitor.log_analytics.export_failures
    component: Log Analytics
    class: Errors
    type: Other
    # Evaluated every minute on the 5-minute average of the `failed` dimension.
    every: 1m
    lookup: average -5m unaligned of failed
    units: exports/s
    # Both conditions require a non-nan value before comparing.
    # WARNING is raised above 1 and, once raised, kept while the value stays above 0.
    warn: $this != nan AND $this > (($status >= $WARNING) ? (0) : (1))
    # CRITICAL is raised above 5 and, once raised, kept while the value stays above 1.
    crit: $this != nan AND $this > (($status == $CRITICAL) ? (1) : (5))
    # Notifications for status decreases are delayed 5m; the delay scales by 1.5, capped at 1h.
    delay: down 5m multiplier 1.5 max 1h
    to: sysadmin
    summary: Log Analytics export failures on ${label:resource_name}
    info: Data export failure rate for Log Analytics workspace ${label:resource_name} \
in ${label:resource_group} (${label:region}). \
Failures mean exported data is not reaching the destination.
# --- Legacy Agent: CPU ---
# CPU utilization from legacy Log Analytics agents.
template: am_log_analytics_legacy_cpu
    # Chart context this template attaches to, plus its classification.
    on: azure_monitor.log_analytics.legacy_cpu_utilization
    component: Log Analytics
    class: Utilization
    type: Other
    # Evaluated every minute on the 5-minute average of the `processor` dimension.
    every: 1m
    lookup: average -5m unaligned of processor
    units: percentage
    # Both conditions require a non-nan value before comparing.
    # WARNING is raised above 90 and, once raised, kept while the value stays above 80.
    warn: $this != nan AND $this > (($status >= $WARNING) ? (80) : (90))
    # CRITICAL is raised above 95 and, once raised, kept while the value stays above 90.
    crit: $this != nan AND $this > (($status == $CRITICAL) ? (90) : (95))
    # Notifications for status decreases are delayed 5m; the delay scales by 1.5, capped at 1h.
    delay: down 5m multiplier 1.5 max 1h
    to: sysadmin
    summary: Legacy agent CPU on ${label:resource_name}
    info: Processor time reported by legacy Log Analytics agent on ${label:resource_name} \
in ${label:resource_group} (${label:region})
# --- Legacy Agent: Memory ---
# Memory utilization from legacy Log Analytics agents. High is bad.
template: am_log_analytics_legacy_memory
    # Chart context this template attaches to, plus its classification.
    on: azure_monitor.log_analytics.legacy_memory_utilization
    component: Log Analytics
    class: Utilization
    type: Other
    # Evaluated every minute on the 5-minute average of the `used` dimension.
    every: 1m
    lookup: average -5m unaligned of used
    units: percentage
    # Both conditions require a non-nan value before comparing.
    # WARNING is raised above 90 and, once raised, kept while the value stays above 80.
    warn: $this != nan AND $this > (($status >= $WARNING) ? (80) : (90))
    # CRITICAL is raised above 95 and, once raised, kept while the value stays above 90.
    crit: $this != nan AND $this > (($status == $CRITICAL) ? (90) : (95))
    # Notifications for status decreases are delayed 5m; the delay scales by 1.5, capped at 1h.
    delay: down 5m multiplier 1.5 max 1h
    to: sysadmin
    summary: Legacy agent memory on ${label:resource_name}
    info: Memory utilization reported by legacy Log Analytics agent on ${label:resource_name} \
in ${label:resource_group} (${label:region})
# --- Legacy Agent: Swap ---
# Swap utilization from legacy Log Analytics agents. High is bad.
template: am_log_analytics_legacy_swap
    # Chart context this template attaches to, plus its classification.
    on: azure_monitor.log_analytics.legacy_swap_utilization
    component: Log Analytics
    class: Utilization
    type: Other
    # Evaluated every minute on the 5-minute average of the `used` dimension.
    every: 1m
    lookup: average -5m unaligned of used
    units: percentage
    # Both conditions require a non-nan value before comparing.
    # WARNING is raised above 70 and, once raised, kept while the value stays above 50.
    warn: $this != nan AND $this > (($status >= $WARNING) ? (50) : (70))
    # CRITICAL is raised above 90 and, once raised, kept while the value stays above 70.
    crit: $this != nan AND $this > (($status == $CRITICAL) ? (70) : (90))
    # Notifications for status decreases are delayed 5m; the delay scales by 1.5, capped at 1h.
    delay: down 5m multiplier 1.5 max 1h
    to: sysadmin
    summary: Legacy agent swap usage on ${label:resource_name}
    info: Swap utilization reported by legacy Log Analytics agent on ${label:resource_name} \
in ${label:resource_group} (${label:region})
# --- Legacy Agent: Disk Space ---
# Disk space utilization from legacy Log Analytics agents. High is bad.
template: am_log_analytics_legacy_disk_space
    # Chart context this template attaches to, plus its classification.
    on: azure_monitor.log_analytics.legacy_disk_space_utilization
    component: Log Analytics
    class: Utilization
    type: Other
    # Evaluated every minute on the 5-minute average of the `used` dimension.
    every: 1m
    lookup: average -5m unaligned of used
    units: percentage
    # Both conditions require a non-nan value before comparing.
    # WARNING is raised above 85 and, once raised, kept while the value stays above 80.
    warn: $this != nan AND $this > (($status >= $WARNING) ? (80) : (85))
    # CRITICAL is raised above 95 and, once raised, kept while the value stays above 85.
    crit: $this != nan AND $this > (($status == $CRITICAL) ? (85) : (95))
    # Notifications for status decreases are delayed 5m; the delay scales by 1.5, capped at 1h.
    delay: down 5m multiplier 1.5 max 1h
    to: sysadmin
    summary: Legacy agent disk space on ${label:resource_name}
    info: Disk space utilization reported by legacy Log Analytics agent on ${label:resource_name} \
in ${label:resource_group} (${label:region})
# --- Legacy Agent: Disk Inodes ---
# Inode utilization from legacy Log Analytics agents. High is bad.
template: am_log_analytics_legacy_disk_inodes
    # Chart context this template attaches to, plus its classification.
    on: azure_monitor.log_analytics.legacy_disk_inodes
    component: Log Analytics
    class: Utilization
    type: Other
    # Evaluated every minute on the 5-minute average of the `used` dimension.
    every: 1m
    lookup: average -5m unaligned of used
    units: percentage
    # Both conditions require a non-nan value before comparing.
    # WARNING is raised above 85 and, once raised, kept while the value stays above 80.
    warn: $this != nan AND $this > (($status >= $WARNING) ? (80) : (85))
    # CRITICAL is raised above 95 and, once raised, kept while the value stays above 85.
    crit: $this != nan AND $this > (($status == $CRITICAL) ? (85) : (95))
    # Notifications for status decreases are delayed 5m; the delay scales by 1.5, capped at 1h.
    delay: down 5m multiplier 1.5 max 1h
    to: sysadmin
    summary: Legacy agent inode usage on ${label:resource_name}
    info: Inode utilization reported by legacy Log Analytics agent on ${label:resource_name} \
in ${label:resource_group} (${label:region})
# --- Legacy Agent: Disk I/O Latency ---
# Disk I/O latency from legacy Log Analytics agents. High is bad.
# Read and write latency in seconds per operation.
template: am_log_analytics_legacy_disk_read_latency
    # Chart context this template attaches to, plus its classification.
    on: azure_monitor.log_analytics.legacy_disk_io_latency
    component: Log Analytics
    class: Latency
    type: Other
    # Evaluated every minute on the 5-minute average of the `read` dimension.
    every: 1m
    lookup: average -5m unaligned of read
    units: seconds
    # Both conditions require a non-nan value before comparing.
    # WARNING is raised above 0.1 and, once raised, kept while the value stays above 0.05.
    warn: $this != nan AND $this > (($status >= $WARNING) ? (0.05) : (0.1))
    # CRITICAL is raised above 0.5 and, once raised, kept while the value stays above 0.1.
    crit: $this != nan AND $this > (($status == $CRITICAL) ? (0.1) : (0.5))
    # Notifications for status decreases are delayed 5m; the delay scales by 1.5, capped at 1h.
    delay: down 5m multiplier 1.5 max 1h
    to: sysadmin
    summary: Legacy agent disk read latency on ${label:resource_name}
    info: Average disk read latency reported by legacy Log Analytics agent on ${label:resource_name} \
in ${label:resource_group} (${label:region})
template: am_log_analytics_legacy_disk_write_latency
    # Chart context this template attaches to, plus its classification.
    on: azure_monitor.log_analytics.legacy_disk_io_latency
    component: Log Analytics
    class: Latency
    type: Other
    # Evaluated every minute on the 5-minute average of the `write` dimension.
    every: 1m
    lookup: average -5m unaligned of write
    units: seconds
    # Both conditions require a non-nan value before comparing.
    # WARNING is raised above 0.1 and, once raised, kept while the value stays above 0.05.
    warn: $this != nan AND $this > (($status >= $WARNING) ? (0.05) : (0.1))
    # CRITICAL is raised above 0.5 and, once raised, kept while the value stays above 0.1.
    crit: $this != nan AND $this > (($status == $CRITICAL) ? (0.1) : (0.5))
    # Notifications for status decreases are delayed 5m; the delay scales by 1.5, capped at 1h.
    delay: down 5m multiplier 1.5 max 1h
    to: sysadmin
    summary: Legacy agent disk write latency on ${label:resource_name}
    info: Average disk write latency reported by legacy Log Analytics agent on ${label:resource_name} \
in ${label:resource_group} (${label:region})
# --- Legacy Agent: Disk Queue ---
# Disk queue length from legacy Log Analytics agents. High means I/O saturation.
template: am_log_analytics_legacy_disk_queue
    # Chart context this template attaches to, plus its classification.
    on: azure_monitor.log_analytics.legacy_disk_queue
    component: Log Analytics
    class: Workload
    type: Other
    # Evaluated every minute on the 5-minute average of the `queue_length` dimension.
    every: 1m
    lookup: average -5m unaligned of queue_length
    units: operations
    # Both conditions require a non-nan value before comparing.
    # WARNING is raised above 5 and, once raised, kept while the value stays above 2.
    warn: $this != nan AND $this > (($status >= $WARNING) ? (2) : (5))
    # CRITICAL is raised above 10 and, once raised, kept while the value stays above 5.
    crit: $this != nan AND $this > (($status == $CRITICAL) ? (5) : (10))
    # Notifications for status decreases are delayed 5m; the delay scales by 1.5, capped at 1h.
    delay: down 5m multiplier 1.5 max 1h
    to: sysadmin
    summary: Legacy agent disk queue on ${label:resource_name}
    info: Current disk queue length reported by legacy Log Analytics agent on ${label:resource_name} \
in ${label:resource_group} (${label:region}). \
High queue length indicates I/O saturation.
# --- Legacy Agent: Network Errors ---
# Network errors from legacy Log Analytics agents.
template: am_log_analytics_legacy_network_rx_errors
    # Chart context this template attaches to, plus its classification.
    on: azure_monitor.log_analytics.legacy_network_errors
    component: Log Analytics
    class: Errors
    type: Other
    # Evaluated every minute on the 5-minute average of the `rx` dimension.
    every: 1m
    lookup: average -5m unaligned of rx
    units: errors
    # Both conditions require a non-nan value before comparing.
    # WARNING is raised above 10 and, once raised, kept while the value stays above 5.
    warn: $this != nan AND $this > (($status >= $WARNING) ? (5) : (10))
    # CRITICAL is raised above 50 and, once raised, kept while the value stays above 10.
    crit: $this != nan AND $this > (($status == $CRITICAL) ? (10) : (50))
    # Notifications for status decreases are delayed 5m; the delay scales by 1.5, capped at 1h.
    delay: down 5m multiplier 1.5 max 1h
    to: sysadmin
    summary: Legacy agent network RX errors on ${label:resource_name}
    info: Network receive errors reported by legacy Log Analytics agent on ${label:resource_name} \
in ${label:resource_group} (${label:region})
template: am_log_analytics_legacy_network_tx_errors
    # Chart context this template attaches to, plus its classification.
    on: azure_monitor.log_analytics.legacy_network_errors
    component: Log Analytics
    class: Errors
    type: Other
    # Evaluated every minute on the 5-minute average of the `tx` dimension.
    every: 1m
    lookup: average -5m unaligned of tx
    units: errors
    # Both conditions require a non-nan value before comparing.
    # WARNING is raised above 10 and, once raised, kept while the value stays above 5.
    warn: $this != nan AND $this > (($status >= $WARNING) ? (5) : (10))
    # CRITICAL is raised above 50 and, once raised, kept while the value stays above 10.
    crit: $this != nan AND $this > (($status == $CRITICAL) ? (10) : (50))
    # Notifications for status decreases are delayed 5m; the delay scales by 1.5, capped at 1h.
    delay: down 5m multiplier 1.5 max 1h
    to: sysadmin
    summary: Legacy agent network TX errors on ${label:resource_name}
    info: Network transmit errors reported by legacy Log Analytics agent on ${label:resource_name} \
in ${label:resource_group} (${label:region})