# File: /usr/lib/netdata/conf.d/health.d/azure_monitor_service_bus.conf
# you can disable an alarm notification by setting the 'to' line to: silent
# --- Errors ---
# Server-side errors indicate Service Bus infrastructure problems.
# AMBA: ServerErrors > 0 is Sev1
#
# Evaluated every minute against the 5-minute average of the `server`
# dimension of the azure_monitor.service_bus.errors chart.
# Thresholds use hysteresis keyed on the alert's current status:
#   warning  - raised when the average exceeds 1, held while it stays above 0
#   critical - raised when the average exceeds 5, held while it stays above 1
# Notifications go to the `sysadmin` role; set `to: silent` to mute them.
template: am_service_bus_server_errors
on: azure_monitor.service_bus.errors
class: Errors
type: Messaging
component: Azure Service Bus
lookup: average -5m unaligned of server
units: errors/s
every: 1m
warn: $this > (($status >= $WARNING) ? (0) : (1))
crit: $this > (($status == $CRITICAL) ? (1) : (5))
delay: down 5m multiplier 1.5 max 1h
summary: Service Bus server errors on ${label:resource_name}
info: Server-side error rate on Service Bus namespace ${label:resource_name} \
in ${label:resource_group} (${label:region})
to: sysadmin
# Throttled requests mean the namespace has exceeded its messaging unit quotas.
# AMBA: ThrottledRequests > 0 is Sev1
#
# Evaluated every minute against the 5-minute average of the `throttled`
# dimension of the azure_monitor.service_bus.errors chart.
# Thresholds use hysteresis keyed on the alert's current status:
#   warning  - raised when the average exceeds 1, held while it stays above 0
#   critical - raised when the average exceeds 5, held while it stays above 1
template: am_service_bus_throttled_requests
on: azure_monitor.service_bus.errors
class: Errors
type: Messaging
component: Azure Service Bus
lookup: average -5m unaligned of throttled
units: errors/s
every: 1m
warn: $this > (($status >= $WARNING) ? (0) : (1))
crit: $this > (($status == $CRITICAL) ? (1) : (5))
delay: down 5m multiplier 1.5 max 1h
summary: Service Bus throttled requests on ${label:resource_name}
info: Rate of throttled requests on Service Bus namespace ${label:resource_name} \
in ${label:resource_group} (${label:region}). \
Indicates the namespace is exceeding its messaging unit quotas
to: sysadmin
# User errors (400-class) at sustained high rate may indicate
# client misconfiguration or malformed messages.
#
# Evaluated every minute against the 5-minute average of the `user`
# dimension of the azure_monitor.service_bus.errors chart.
# Warning-only alert (there is no `crit` line, so it never turns critical):
#   warning  - raised when the average exceeds 25, held while it stays above 10
template: am_service_bus_user_errors
on: azure_monitor.service_bus.errors
class: Errors
type: Messaging
component: Azure Service Bus
lookup: average -5m unaligned of user
units: errors/s
every: 1m
warn: $this > (($status >= $WARNING) ? (10) : (25))
delay: down 5m multiplier 1.5 max 1h
summary: Service Bus user errors on ${label:resource_name}
info: Rate of user (client-side) errors on Service Bus namespace ${label:resource_name} \
in ${label:resource_group} (${label:region})
to: sysadmin
# --- Utilization (Premium tier only) ---
# CPU utilization of Premium namespace messaging units.
# AMBA: NamespaceCpuUsage > 70 is Sev2
#
# Evaluated every minute against the 5-minute average of the `cpu`
# dimension of the azure_monitor.service_bus.namespace_resources chart.
# The `$this != nan` guard keeps both expressions false when the lookup
# returns no value, so the alert cannot fire without data.
# Thresholds use hysteresis keyed on the alert's current status:
#   warning  - raised above 80%, held while above 70%
#   critical - raised above 95%, held while above 80%
# Units are written as `%` to match the other percentage alert in this file.
template: am_service_bus_namespace_cpu
on: azure_monitor.service_bus.namespace_resources
class: Utilization
type: Messaging
component: Azure Service Bus
lookup: average -5m unaligned of cpu
units: %
every: 1m
warn: $this != nan AND $this > (($status >= $WARNING) ? (70) : (80))
crit: $this != nan AND $this > (($status == $CRITICAL) ? (80) : (95))
delay: down 5m multiplier 1.5 max 1h
summary: Service Bus namespace CPU on ${label:resource_name}
info: CPU utilization of Premium Service Bus namespace ${label:resource_name} \
in ${label:resource_group} (${label:region}). \
Only available on Premium tier namespaces
to: sysadmin
# Memory utilization of Premium namespace messaging units.
# AMBA: NamespaceMemoryUsage > 70 is Sev2
#
# Evaluated every minute against the 5-minute average of the `memory`
# dimension of the azure_monitor.service_bus.namespace_resources chart.
# The `$this != nan` guard keeps both expressions false when the lookup
# returns no value, so the alert cannot fire without data.
# Thresholds use hysteresis keyed on the alert's current status:
#   warning  - raised above 80%, held while above 70%
#   critical - raised above 95%, held while above 80%
# Units are written as `%` to match the other percentage alert in this file.
template: am_service_bus_namespace_memory
on: azure_monitor.service_bus.namespace_resources
class: Utilization
type: Messaging
component: Azure Service Bus
lookup: average -5m unaligned of memory
units: %
every: 1m
warn: $this != nan AND $this > (($status >= $WARNING) ? (70) : (80))
crit: $this != nan AND $this > (($status == $CRITICAL) ? (80) : (95))
delay: down 5m multiplier 1.5 max 1h
summary: Service Bus namespace memory on ${label:resource_name}
info: Memory utilization of Premium Service Bus namespace ${label:resource_name} \
in ${label:resource_group} (${label:region}). \
Only available on Premium tier namespaces
to: sysadmin
# --- Latency ---
# Server send latency measures how long Service Bus takes to complete
# send operations. High latency indicates performance degradation.
#
# Evaluated every minute against the 5-minute average of the `average`
# dimension of the azure_monitor.service_bus.send_latency chart.
# Thresholds use hysteresis keyed on the alert's current status:
#   warning  - raised above 1000 ms, held while above 500 ms
#   critical - raised above 3000 ms, held while above 1000 ms
template: am_service_bus_send_latency
on: azure_monitor.service_bus.send_latency
class: Latency
type: Messaging
component: Azure Service Bus
lookup: average -5m unaligned of average
units: milliseconds
every: 1m
warn: $this > (($status >= $WARNING) ? (500) : (1000))
crit: $this > (($status == $CRITICAL) ? (1000) : (3000))
delay: down 5m multiplier 1.5 max 1h
summary: Service Bus send latency on ${label:resource_name}
info: Average server send latency on Service Bus namespace ${label:resource_name} \
in ${label:resource_group} (${label:region})
to: sysadmin
# --- Saturation ---
# Dead-lettered messages accumulate when messages cannot be processed
# after max delivery attempts or when they expire. Growing dead letter
# queues indicate consumer failures or poison messages.
# AMBA: DeadletteredMessages > 0 is Sev2
#
# Evaluated every minute against the 5-minute average of the `dead_lettered`
# dimension of the azure_monitor.service_bus.problem_messages chart.
# Thresholds use hysteresis keyed on the alert's current status:
#   warning  - raised above 1 message, held while above 0
#   critical - raised above 100 messages, held while above 10
template: am_service_bus_dead_lettered_messages
on: azure_monitor.service_bus.problem_messages
class: Errors
type: Messaging
component: Azure Service Bus
lookup: average -5m unaligned of dead_lettered
units: messages
every: 1m
warn: $this > (($status >= $WARNING) ? (0) : (1))
crit: $this > (($status == $CRITICAL) ? (10) : (100))
delay: down 5m multiplier 1.5 max 1h
summary: Service Bus dead-lettered messages on ${label:resource_name}
info: Dead-lettered messages in Service Bus namespace ${label:resource_name} \
in ${label:resource_group} (${label:region}). \
Messages land in dead letter queue after exceeding max delivery attempts or expiring
to: sysadmin
# Active message queue depth. Sustained growth means consumers
# are not keeping up with producers.
# AMBA: ActiveMessages > 100 is Sev2
#
# Evaluated every minute against the 10-minute average of the `active`
# dimension of the azure_monitor.service_bus.queue_depth chart (the other
# alerts in this file use a 5-minute window).
# Thresholds use hysteresis keyed on the alert's current status:
#   warning  - raised above 10000 messages, held while above 5000
#   critical - raised above 50000 messages, held while above 10000
# NOTE(review): these thresholds are far above the AMBA reference value of
# 100 quoted above - confirm the difference is intended.
template: am_service_bus_active_messages
on: azure_monitor.service_bus.queue_depth
class: Workload
type: Messaging
component: Azure Service Bus
lookup: average -10m unaligned of active
units: messages
every: 1m
warn: $this > (($status >= $WARNING) ? (5000) : (10000))
crit: $this > (($status == $CRITICAL) ? (10000) : (50000))
delay: down 5m multiplier 1.5 max 1h
summary: Service Bus queue depth on ${label:resource_name}
info: Active messages queued in Service Bus namespace ${label:resource_name} \
in ${label:resource_group} (${label:region}). \
Sustained growth means consumers are not keeping up with producers
to: sysadmin
# --- Request Success ---
# Helper: total incoming requests over 5 minutes (no alarm, just a value)
#
# Sums the `incoming` dimension of the azure_monitor.service_bus.requests
# chart over the last 5 minutes, re-evaluated every minute. With no
# warn/crit/to lines it never raises or notifies; it only publishes its
# value as $am_service_bus_incoming_requests for other alerts on the chart.
# The value is a windowed total, so its unit is `requests`, not a rate.
template: am_service_bus_incoming_requests
on: azure_monitor.service_bus.requests
class: Workload
type: Messaging
component: Azure Service Bus
lookup: sum -5m unaligned of incoming
units: requests
every: 1m
info: Total incoming requests on Service Bus namespace ${label:resource_name} \
in ${label:resource_group} (${label:region})
# Request success rate. Only fires when there is meaningful traffic
# to avoid false positives during idle periods.
#
# `lookup` sums the `successful` dimension of the
# azure_monitor.service_bus.requests chart over the last 5 minutes; `calc`
# turns it into a percentage of $am_service_bus_incoming_requests (the
# 5-minute incoming total published by the helper template of that name),
# falling back to 100 when that total is not greater than 0.
# warn/crit are only evaluated when the incoming total exceeds 120;
# otherwise both resolve to 0 (not raised).
# Thresholds use hysteresis keyed on the alert's current status:
#   warning  - raised below 95%, held while below 99%
#   critical - raised below 80%, held while below 95%
template: am_service_bus_request_success_rate
on: azure_monitor.service_bus.requests
class: Errors
type: Messaging
component: Azure Service Bus
lookup: sum -5m unaligned of successful
calc: ($am_service_bus_incoming_requests > 0) ? ($this * 100 / $am_service_bus_incoming_requests) : (100)
units: %
every: 1m
warn: ($am_service_bus_incoming_requests > 120) ? ($this < (($status >= $WARNING) ? (99) : (95))) : (0)
crit: ($am_service_bus_incoming_requests > 120) ? ($this < (($status == $CRITICAL) ? (95) : (80))) : (0)
delay: down 5m multiplier 1.5 max 1h
summary: Service Bus request success rate on ${label:resource_name}
info: Percentage of successful requests on Service Bus namespace ${label:resource_name} \
in ${label:resource_group} (${label:region})
to: sysadmin
# --- Message Operations ---
# Abandoned messages indicate consumers receiving but not processing messages.
# Sustained abandonment suggests poison messages or consumer failures.
#
# Evaluated every minute against the 5-minute average of the `abandoned`
# dimension of the azure_monitor.service_bus.message_operations chart.
# Warning-only alert (there is no `crit` line, so it never turns critical):
#   warning  - raised when the average exceeds 5, held while it stays above 1
template: am_service_bus_abandoned_messages
on: azure_monitor.service_bus.message_operations
class: Errors
type: Messaging
component: Azure Service Bus
lookup: average -5m unaligned of abandoned
units: messages/s
every: 1m
warn: $this > (($status >= $WARNING) ? (1) : (5))
delay: down 5m multiplier 1.5 max 1h
summary: Service Bus abandoned messages on ${label:resource_name}
info: Rate of abandoned messages on Service Bus namespace ${label:resource_name} \
in ${label:resource_group} (${label:region}). \
Consumers are receiving but failing to process messages
to: sysadmin
# --- Replication (Geo-DR only) ---
# Replication lag count measures how many messages are pending replication
# to the secondary namespace. Only relevant with Geo-DR configured.
#
# Evaluated every minute against the 5-minute average of the `messages`
# dimension of the azure_monitor.service_bus.replication_lag chart.
# The `$this != nan` guard keeps both expressions false when the lookup
# returns no value, so the alert cannot fire without data.
# Thresholds use hysteresis keyed on the alert's current status:
#   warning  - raised above 1000 messages, held while above 100
#   critical - raised above 10000 messages, held while above 1000
template: am_service_bus_replication_lag
on: azure_monitor.service_bus.replication_lag
class: Latency
type: Messaging
component: Azure Service Bus
lookup: average -5m unaligned of messages
units: messages
every: 1m
warn: $this != nan AND $this > (($status >= $WARNING) ? (100) : (1000))
crit: $this != nan AND $this > (($status == $CRITICAL) ? (1000) : (10000))
delay: down 5m multiplier 1.5 max 1h
summary: Service Bus replication lag on ${label:resource_name}
info: Messages pending replication to secondary namespace on Service Bus ${label:resource_name} \
in ${label:resource_group} (${label:region}). \
Only relevant when Geo-DR is configured
to: sysadmin
# Replication lag duration measures time behind the primary.
#
# Evaluated every minute against the 5-minute average of the `duration`
# dimension of the azure_monitor.service_bus.replication_lag_duration chart.
# The `$this != nan` guard keeps both expressions false when the lookup
# returns no value, so the alert cannot fire without data.
# Thresholds use hysteresis keyed on the alert's current status:
#   warning  - raised above 60 seconds, held while above 30
#   critical - raised above 120 seconds, held while above 60
template: am_service_bus_replication_lag_duration
on: azure_monitor.service_bus.replication_lag_duration
class: Latency
type: Messaging
component: Azure Service Bus
lookup: average -5m unaligned of duration
units: seconds
every: 1m
warn: $this != nan AND $this > (($status >= $WARNING) ? (30) : (60))
crit: $this != nan AND $this > (($status == $CRITICAL) ? (60) : (120))
delay: down 5m multiplier 1.5 max 1h
summary: Service Bus replication lag duration on ${label:resource_name}
info: Replication lag duration to secondary namespace on Service Bus ${label:resource_name} \
in ${label:resource_group} (${label:region}). \
Only relevant when Geo-DR is configured
to: sysadmin