# GrenXPaRTa
#
# File: /usr/lib/netdata/conf.d/health.d/azure_monitor_log_analytics.conf
# you can disable an alarm notification by setting the 'to' line to: silent

# --- Workspace SLI: Availability ---

# Query availability (0-100%). Low is bad.
# Azure SLA: 99.9% for Log Analytics queries.
#
# Evaluates the 5-minute average of the "availability" dimension every minute.
# Thresholds use hysteresis on the current alert status ($status):
#   warning  - raised when the average is below 99, kept while it is below 99.9
#   critical - raised when the average is below 90, kept while it is below 99
# "delay: down" postpones notifications for status decreases (recoveries).

 template: am_log_analytics_query_availability
       on: azure_monitor.log_analytics.query_availability
    class: Availability
     type: Other
component: Log Analytics
   lookup: average -5m unaligned of availability
    units: percentage
    every: 1m
     warn: $this < (($status >= $WARNING)  ? (99.9) : (99))
     crit: $this < (($status == $CRITICAL) ? (99)   : (90))
    delay: down 5m multiplier 1.5 max 1h
  summary: Log Analytics query availability on ${label:resource_name}
     info: Query availability of Log Analytics workspace ${label:resource_name} \
           in ${label:resource_group} (${label:region})
       to: sysadmin

# --- Workspace SLI: Ingestion ---

# Ingestion latency (seconds). High is bad.
# AMBA: average ingestion latency > 300s (5 min) is concerning.
#
# Evaluates the 5-minute average of the "average" dimension every minute.
# Thresholds use hysteresis on the current alert status ($status):
#   warning  - raised when the average is above 300, kept while it is above 180
#   critical - raised when the average is above 600, kept while it is above 300
# "delay: down" postpones notifications for status decreases (recoveries).

 template: am_log_analytics_ingestion_latency
       on: azure_monitor.log_analytics.ingestion_latency
    class: Latency
     type: Other
component: Log Analytics
   lookup: average -5m unaligned of average
    units: seconds
    every: 1m
     warn: $this > (($status >= $WARNING)  ? (180) : (300))
     crit: $this > (($status == $CRITICAL) ? (300) : (600))
    delay: down 5m multiplier 1.5 max 1h
  summary: Log Analytics ingestion latency on ${label:resource_name}
     info: Average data ingestion latency for Log Analytics workspace ${label:resource_name} \
           in ${label:resource_group} (${label:region}). \
           High latency means data takes longer to become queryable.
       to: sysadmin

# --- User Queries: Failures ---

# Helper: total query count over 5 minutes.
# Sums the "total" dimension over the last 5 minutes, re-evaluated every minute.
# It defines no warn/crit expressions and no recipient; its value is read by
# am_log_analytics_query_failure_rate as $am_log_analytics_query_total.
 template: am_log_analytics_query_total
       on: azure_monitor.log_analytics.queries
    class: Workload
     type: Other
component: Log Analytics
   lookup: sum -5m unaligned of total
    units: queries
    every: 1m
     info: Total queries on Log Analytics workspace ${label:resource_name} \
           in ${label:resource_group} (${label:region})

# Query failure rate as percentage of total queries.
# Any sustained query failures indicate workspace or query problems.
#
# The lookup sums the "failed" dimension over the last 5 minutes. The calc then
# divides that sum by $am_log_analytics_query_total (the helper template on the
# same context) and multiplies by 100. When the total is not above zero the
# result is forced to 0, so no division by zero is attempted.
# Thresholds use hysteresis on the current alert status ($status):
#   warning  - raised when the rate is above 5, kept while it is above 3
#   critical - raised when the rate is above 15, kept while it is above 5
# "delay: down" postpones notifications for status decreases (recoveries).

 template: am_log_analytics_query_failure_rate
       on: azure_monitor.log_analytics.queries
    class: Errors
     type: Other
component: Log Analytics
   lookup: sum -5m unaligned of failed
     calc: ($am_log_analytics_query_total > 0) ? ($this * 100 / $am_log_analytics_query_total) : (0)
    units: %
    every: 1m
     warn: $this > (($status >= $WARNING)  ? (3) : (5))
     crit: $this > (($status == $CRITICAL) ? (5) : (15))
    delay: down 5m multiplier 1.5 max 1h
  summary: Log Analytics query failures on ${label:resource_name}
     info: Percentage of failed queries on Log Analytics workspace ${label:resource_name} \
           in ${label:resource_group} (${label:region})
       to: sysadmin

# --- Data Export ---

# Export failures (rate). Any sustained export failures need attention.
#
# Evaluates the 5-minute average of the "failed" dimension every minute.
# Both expressions require a usable lookup value ($this != nan) and use
# hysteresis on the current alert status ($status):
#   warning  - raised when the average is above 1, kept while it is above 0
#   critical - raised when the average is above 5, kept while it is above 1
# "delay: down" postpones notifications for status decreases (recoveries).

 template: am_log_analytics_export_failures
       on: azure_monitor.log_analytics.export_failures
    class: Errors
     type: Other
component: Log Analytics
   lookup: average -5m unaligned of failed
    units: exports/s
    every: 1m
     warn: $this != nan AND $this > (($status >= $WARNING)  ? (0) : (1))
     crit: $this != nan AND $this > (($status == $CRITICAL) ? (1) : (5))
    delay: down 5m multiplier 1.5 max 1h
  summary: Log Analytics export failures on ${label:resource_name}
     info: Data export failure rate for Log Analytics workspace ${label:resource_name} \
           in ${label:resource_group} (${label:region}). \
           Failures mean exported data is not reaching the destination.
       to: sysadmin

# --- Legacy Agent: CPU ---

# CPU utilization from legacy Log Analytics agents. High is bad.
#
# Evaluates the 5-minute average of the "processor" dimension every minute.
# Both expressions require a usable lookup value ($this != nan) and use
# hysteresis on the current alert status ($status):
#   warning  - raised when the average is above 90, kept while it is above 80
#   critical - raised when the average is above 95, kept while it is above 90
# "delay: down" postpones notifications for status decreases (recoveries).

 template: am_log_analytics_legacy_cpu
       on: azure_monitor.log_analytics.legacy_cpu_utilization
    class: Utilization
     type: Other
component: Log Analytics
   lookup: average -5m unaligned of processor
    units: percentage
    every: 1m
     warn: $this != nan AND $this > (($status >= $WARNING)  ? (80) : (90))
     crit: $this != nan AND $this > (($status == $CRITICAL) ? (90) : (95))
    delay: down 5m multiplier 1.5 max 1h
  summary: Legacy agent CPU on ${label:resource_name}
     info: Processor time reported by legacy Log Analytics agent on ${label:resource_name} \
           in ${label:resource_group} (${label:region})
       to: sysadmin

# --- Legacy Agent: Memory ---

# Memory utilization from legacy Log Analytics agents. High is bad.
#
# Evaluates the 5-minute average of the "used" dimension every minute.
# Both expressions require a usable lookup value ($this != nan) and use
# hysteresis on the current alert status ($status):
#   warning  - raised when the average is above 90, kept while it is above 80
#   critical - raised when the average is above 95, kept while it is above 90
# "delay: down" postpones notifications for status decreases (recoveries).

 template: am_log_analytics_legacy_memory
       on: azure_monitor.log_analytics.legacy_memory_utilization
    class: Utilization
     type: Other
component: Log Analytics
   lookup: average -5m unaligned of used
    units: percentage
    every: 1m
     warn: $this != nan AND $this > (($status >= $WARNING)  ? (80) : (90))
     crit: $this != nan AND $this > (($status == $CRITICAL) ? (90) : (95))
    delay: down 5m multiplier 1.5 max 1h
  summary: Legacy agent memory on ${label:resource_name}
     info: Memory utilization reported by legacy Log Analytics agent on ${label:resource_name} \
           in ${label:resource_group} (${label:region})
       to: sysadmin

# --- Legacy Agent: Swap ---

# Swap utilization from legacy Log Analytics agents. High is bad.
#
# Evaluates the 5-minute average of the "used" dimension every minute.
# Both expressions require a usable lookup value ($this != nan) and use
# hysteresis on the current alert status ($status):
#   warning  - raised when the average is above 70, kept while it is above 50
#   critical - raised when the average is above 90, kept while it is above 70
# "delay: down" postpones notifications for status decreases (recoveries).

 template: am_log_analytics_legacy_swap
       on: azure_monitor.log_analytics.legacy_swap_utilization
    class: Utilization
     type: Other
component: Log Analytics
   lookup: average -5m unaligned of used
    units: percentage
    every: 1m
     warn: $this != nan AND $this > (($status >= $WARNING)  ? (50) : (70))
     crit: $this != nan AND $this > (($status == $CRITICAL) ? (70) : (90))
    delay: down 5m multiplier 1.5 max 1h
  summary: Legacy agent swap usage on ${label:resource_name}
     info: Swap utilization reported by legacy Log Analytics agent on ${label:resource_name} \
           in ${label:resource_group} (${label:region})
       to: sysadmin

# --- Legacy Agent: Disk Space ---

# Disk space utilization from legacy Log Analytics agents. High is bad.
#
# Evaluates the 5-minute average of the "used" dimension every minute.
# Both expressions require a usable lookup value ($this != nan) and use
# hysteresis on the current alert status ($status):
#   warning  - raised when the average is above 85, kept while it is above 80
#   critical - raised when the average is above 95, kept while it is above 85
# "delay: down" postpones notifications for status decreases (recoveries).

 template: am_log_analytics_legacy_disk_space
       on: azure_monitor.log_analytics.legacy_disk_space_utilization
    class: Utilization
     type: Other
component: Log Analytics
   lookup: average -5m unaligned of used
    units: percentage
    every: 1m
     warn: $this != nan AND $this > (($status >= $WARNING)  ? (80) : (85))
     crit: $this != nan AND $this > (($status == $CRITICAL) ? (85) : (95))
    delay: down 5m multiplier 1.5 max 1h
  summary: Legacy agent disk space on ${label:resource_name}
     info: Disk space utilization reported by legacy Log Analytics agent on ${label:resource_name} \
           in ${label:resource_group} (${label:region})
       to: sysadmin

# --- Legacy Agent: Disk Inodes ---

# Inode utilization from legacy Log Analytics agents. High is bad.
#
# Evaluates the 5-minute average of the "used" dimension every minute.
# Both expressions require a usable lookup value ($this != nan) and use
# hysteresis on the current alert status ($status):
#   warning  - raised when the average is above 85, kept while it is above 80
#   critical - raised when the average is above 95, kept while it is above 85
# "delay: down" postpones notifications for status decreases (recoveries).

 template: am_log_analytics_legacy_disk_inodes
       on: azure_monitor.log_analytics.legacy_disk_inodes
    class: Utilization
     type: Other
component: Log Analytics
   lookup: average -5m unaligned of used
    units: percentage
    every: 1m
     warn: $this != nan AND $this > (($status >= $WARNING)  ? (80) : (85))
     crit: $this != nan AND $this > (($status == $CRITICAL) ? (85) : (95))
    delay: down 5m multiplier 1.5 max 1h
  summary: Legacy agent inode usage on ${label:resource_name}
     info: Inode utilization reported by legacy Log Analytics agent on ${label:resource_name} \
           in ${label:resource_group} (${label:region})
       to: sysadmin

# --- Legacy Agent: Disk I/O Latency ---

# Disk I/O latency from legacy Log Analytics agents. High is bad.
# Read and write latency in seconds per operation.
#
# Two templates share the same context, one per dimension ("read", "write").
# Each evaluates the 5-minute average of its dimension every minute.
# Both expressions require a usable lookup value ($this != nan) and use
# hysteresis on the current alert status ($status):
#   warning  - raised when the average is above 0.1, kept while it is above 0.05
#   critical - raised when the average is above 0.5, kept while it is above 0.1
# "delay: down" postpones notifications for status decreases (recoveries).

 template: am_log_analytics_legacy_disk_read_latency
       on: azure_monitor.log_analytics.legacy_disk_io_latency
    class: Latency
     type: Other
component: Log Analytics
   lookup: average -5m unaligned of read
    units: seconds
    every: 1m
     warn: $this != nan AND $this > (($status >= $WARNING)  ? (0.05) : (0.1))
     crit: $this != nan AND $this > (($status == $CRITICAL) ? (0.1)  : (0.5))
    delay: down 5m multiplier 1.5 max 1h
  summary: Legacy agent disk read latency on ${label:resource_name}
     info: Average disk read latency reported by legacy Log Analytics agent on ${label:resource_name} \
           in ${label:resource_group} (${label:region})
       to: sysadmin

# Write-latency counterpart: same thresholds, applied to the "write" dimension.
 template: am_log_analytics_legacy_disk_write_latency
       on: azure_monitor.log_analytics.legacy_disk_io_latency
    class: Latency
     type: Other
component: Log Analytics
   lookup: average -5m unaligned of write
    units: seconds
    every: 1m
     warn: $this != nan AND $this > (($status >= $WARNING)  ? (0.05) : (0.1))
     crit: $this != nan AND $this > (($status == $CRITICAL) ? (0.1)  : (0.5))
    delay: down 5m multiplier 1.5 max 1h
  summary: Legacy agent disk write latency on ${label:resource_name}
     info: Average disk write latency reported by legacy Log Analytics agent on ${label:resource_name} \
           in ${label:resource_group} (${label:region})
       to: sysadmin

# --- Legacy Agent: Disk Queue ---

# Disk queue length from legacy Log Analytics agents. High means I/O saturation.
#
# Evaluates the 5-minute average of the "queue_length" dimension every minute.
# Both expressions require a usable lookup value ($this != nan) and use
# hysteresis on the current alert status ($status):
#   warning  - raised when the average is above 5, kept while it is above 2
#   critical - raised when the average is above 10, kept while it is above 5
# "delay: down" postpones notifications for status decreases (recoveries).

 template: am_log_analytics_legacy_disk_queue
       on: azure_monitor.log_analytics.legacy_disk_queue
    class: Workload
     type: Other
component: Log Analytics
   lookup: average -5m unaligned of queue_length
    units: operations
    every: 1m
     warn: $this != nan AND $this > (($status >= $WARNING)  ? (2) : (5))
     crit: $this != nan AND $this > (($status == $CRITICAL) ? (5) : (10))
    delay: down 5m multiplier 1.5 max 1h
  summary: Legacy agent disk queue on ${label:resource_name}
     info: Current disk queue length reported by legacy Log Analytics agent on ${label:resource_name} \
           in ${label:resource_group} (${label:region}). \
           High queue length indicates I/O saturation.
       to: sysadmin

# --- Legacy Agent: Network Errors ---

# Network errors from legacy Log Analytics agents. High is bad.
#
# Two templates share the same context, one per dimension ("rx", "tx").
# Each evaluates the 5-minute average of its dimension every minute.
# Both expressions require a usable lookup value ($this != nan) and use
# hysteresis on the current alert status ($status):
#   warning  - raised when the average is above 10, kept while it is above 5
#   critical - raised when the average is above 50, kept while it is above 10
# "delay: down" postpones notifications for status decreases (recoveries).

 template: am_log_analytics_legacy_network_rx_errors
       on: azure_monitor.log_analytics.legacy_network_errors
    class: Errors
     type: Other
component: Log Analytics
   lookup: average -5m unaligned of rx
    units: errors
    every: 1m
     warn: $this != nan AND $this > (($status >= $WARNING)  ? (5)  : (10))
     crit: $this != nan AND $this > (($status == $CRITICAL) ? (10) : (50))
    delay: down 5m multiplier 1.5 max 1h
  summary: Legacy agent network RX errors on ${label:resource_name}
     info: Network receive errors reported by legacy Log Analytics agent on ${label:resource_name} \
           in ${label:resource_group} (${label:region})
       to: sysadmin

# Transmit counterpart: same thresholds, applied to the "tx" dimension.
 template: am_log_analytics_legacy_network_tx_errors
       on: azure_monitor.log_analytics.legacy_network_errors
    class: Errors
     type: Other
component: Log Analytics
   lookup: average -5m unaligned of tx
    units: errors
    every: 1m
     warn: $this != nan AND $this > (($status >= $WARNING)  ? (5)  : (10))
     crit: $this != nan AND $this > (($status == $CRITICAL) ? (10) : (50))
    delay: down 5m multiplier 1.5 max 1h
  summary: Legacy agent network TX errors on ${label:resource_name}
     info: Network transmit errors reported by legacy Log Analytics agent on ${label:resource_name} \
           in ${label:resource_group} (${label:region})
       to: sysadmin